# extract_page.py

import re
import sys
from pprint import pprint
from typing import Any, List, Optional

import esprima as es
from lxml import etree, html
from lxml.etree import _Element
from rdflib import RDF, Graph, Literal

from common import *


class Comment:
    """One comment block extracted from an activity page.

    Instances are created empty; attributes are assigned afterwards by the
    code that walks the page, so any of them may be missing on a given
    instance.
    """

    id: str  # value of the element's "id" attribute ("" when absent)
    num: int  # numeric suffix of the id, for numbered comments
    text: str  # text content of the element
    html: Any  # serialized HTML of the element
    elem: _Element  # the underlying lxml element

    def __repr__(self):
        # Show exactly the attributes that have been set on this instance.
        return repr(vars(self))


class Page:
    """Data gathered for one activity page.

    Only the two comment lists are initialised by the constructor; the other
    attributes exist once something assigns them.
    """

    id: str
    title: str
    type: str  # course or exercise
    comment_success: Comment
    comments_sugg: List[Comment]
    comments_misc: List[Comment]

    def __init__(self) -> None:
        # Start with empty suggestion and miscellaneous comment lists.
        self.comments_sugg, self.comments_misc = [], []

    def __repr__(self):
        # Show exactly the attributes that have been set on this instance.
        return repr(vars(self))


class ParseError(Exception):
    """Raised when the JavaScript of an activity cannot be interpreted."""


class RegexParser:
    """A parser for the JS portion of an activity that works directly on the
    source text with regular expressions (no syntax tree is built)."""

    # First constructor argument when it is the string literal QCU or QCM,
    # written with either JavaScript quote style, optionally surrounded by
    # whitespace and optionally followed by more arguments.
    _QC_VARIANT_RE = re.compile(r"""\s*(["'])(QCU|QCM)\1\s*(?:,|$)""")

    def parse(self, js, output=sys.stdout):
        """Write the activity type found in `js` to `output`.

        For an ``ExerciceQC_QCU`` activity the list of correct answers is
        written after the type. Nothing is returned.

        :param js: JavaScript source code of the page
        :param output: Text stream receiving the result (no trailing newline)
        :raises ParseError: if function ``entrerDonnees`` or the activity
            constructor call cannot be found
        """
        # Find function declaration and only keep code after it
        func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
        if len(func_split) < 2:
            raise ParseError("Failed to find function 'entrerDonnees'")
        body = func_split[1]

        # The variable name is parsed as well but not needed here.
        activity_type, _ = self._parse_activity_constructor(body)
        print(activity_type, end="", file=output)
        if activity_type == "ExerciceQC_QCU":
            print(" ", self._parse_qcu_answers(body), end="", file=output)

    def _parse_activity_constructor(self, code: str) -> tuple[str, str]:
        """
        Find activity constructor call, return the activity type
        and resulting variable name.

        An ``ExerciceQC`` whose first argument is the string ``QCU`` or
        ``QCM`` (single or double quoted) is reported as ``ExerciceQC_QCU``
        or ``ExerciceQC_QCM``; any other argument list leaves the type as
        ``ExerciceQC``.

        :raises ParseError: if no constructor call is found in `code`
        """
        constructor_match = re.search(
            r"""
                (\w+)               # result variable name
                \s+=\s+new\s+       #
                (Cours|Exercice\w+) # constructor name
                \((.*?)\);          # optional arguments between parentheses
                    """,
            code,
            re.VERBOSE,
        )
        if constructor_match is None:
            raise ParseError("Failed to parse activity constructor")
        var_name, act_type, args = constructor_match.groups()
        # Handle case of QC variants
        if act_type == "ExerciceQC":
            variant = self._QC_VARIANT_RE.match(args)
            if variant is not None:
                act_type += "_" + variant.group(2)
        return act_type, var_name

    def _parse_qcu_answers(self, code: str) -> list[bool]:
        """Parse the correct answers for a QCU activity, as a list of booleans"""
        # Exact source lines that mark the current choice as wrong / right.
        verdict_lines = {
            "exo.tabStylesR[nr] = CODE_F;": False,
            "exo.tabStylesR[nr] = CODE_V;": True,
        }
        correct_choices = []
        index = 0
        for line in code.splitlines():
            line = line.strip()
            m = re.match(r"var nr = (\d+);", line)
            if m is not None:
                # "index" line: selects the choice the next verdict applies to
                index = int(m.group(1))
            elif line in verdict_lines:
                # insert_grow comes from `common`; presumably it stores the
                # value at `index`, padding the list with `fill_value` when it
                # is too short -- TODO confirm against its definition.
                insert_grow(
                    correct_choices, index, verdict_lines[line], fill_value=False
                )
        return correct_choices

    def __str__(self) -> str:
        return "RegexParser"


class XpathParser:
    """A parser for the JS portion of an activity, that uses XPath to query
    an XML representation of Esprima's abstract syntax tree (AST)"""

    # XPath requests pre-compiled as functions
    # Declaration of the function named "entrerDonnees"
    request_function = etree.XPath(
        '//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]'
    )
    # Initial values of variables named "nr", together with assignments that
    # involve the identifier CODE_V
    request_index_and_values = etree.XPath(
        '*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression[*//Identifier[@name="CODE_V"]]'
    )
    # Callee identifier of `new Cours(...)` / `new Exercice...(...)`
    request_constructor_id = etree.XPath(
        '*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]'
    )

    def __init__(self):
        pass

    def parse(self, js, output=sys.stdout):
        """Write the activity type found in `js` to `output`.

        For an ``ExerciceQC_QCU`` activity the list of correct answers is
        written after the type. Nothing is returned.

        :param js: JavaScript source code of the page
        :param output: Text stream receiving the result (no trailing newline)
        :raises ParseError: if the expected function or constructor call is
            missing, or if any other error occurs while querying the tree
        """
        jstree = es.parseScript(js, None)
        # Convert Esprima object tree to XML etree
        xml = self.to_xml(jstree.toDict(), "jstree")
        try:
            functions = self.request_function(xml)
            if not functions:
                raise ParseError("Failed to find function 'entrerDonnees'")
            # Kept on the instance: the helper methods query this subtree.
            self.fun = functions[0]
            act_type = self._parse_activity_type()
            print(act_type, end="", file=output)
            if act_type == "ExerciceQC_QCU":
                print(" ", self._parse_qcu_answers(), end="", file=output)
        except ParseError:
            # Already carries a meaningful message: do not wrap it again.
            raise
        except Exception as e:
            raise ParseError(e) from e

    def _parse_activity_type(self) -> str:
        """Return the activity type built by the constructor call found in
        the function, e.g. ``Cours`` or ``ExerciceQC_QCU``.

        :raises ParseError: if there is no constructor call, or if an
            ``ExerciceQC`` is not given ``QCU`` or ``QCM`` as a literal
        """
        constructors = self.request_constructor_id(self.fun)
        if not constructors:
            raise ParseError("Failed to parse activity constructor")
        constructor_id = constructors[0]
        name = constructor_id.get("name")
        if name != "ExerciceQC":
            return name
        # From the callee identifier, go up to the NewExpression and read the
        # values of its literal arguments.
        literal_args = constructor_id.xpath("../../arguments/Literal/@value")
        if not literal_args:
            raise ParseError("ExerciceQC: missing literal argument")
        arg = literal_args[0]
        if arg not in ["QCU", "QCM"]:
            raise ParseError(f"ExerciceQC: invalid argument '{arg}'")
        return f"ExerciceQC_{arg}"

    def _parse_qcu_answers(self) -> list[bool]:
        """Parse the correct answers for a QCU activity, as a list of booleans"""
        correct_choices = []
        index = 0
        for e in self.request_index_and_values(self.fun):
            value = e.get("value")
            if value is not None:
                # "index line": a literal carrying the current choice index
                index = int(value)
            else:
                # "true line": an assignment involving CODE_V.
                # insert_grow comes from `common`; presumably it stores the
                # value at `index`, padding with `fill_value` -- TODO confirm.
                insert_grow(correct_choices, index, True, fill_value=False)
        return correct_choices

    def to_xml(self, obj, tag_name: Optional[str] = None):
        """Recursively convert an object structure to an XML `ElementTree`.
        Structures are expected to be Python dictionaries.
        Converting a dictionary produces a tag named after the "type" attribute (if present).
        - A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
        - A list attribute becomes a tag with its contents as sub-tags.
        - A dictionary attribute becomes a tag (named like the attribute's key)
        containing a sub-tag for the dictionary itself

        :param obj: The dictionary, list or primitive value to convert
        :param tag_name: Name of the enclosing tag, when the value is known
            under a name (e.g. the key it was stored under)
        :return: The element representing `obj`
        """
        if isinstance(obj, dict):
            # Dictionary (or object):
            # - if it has a "type" key, the dict represents an object -> use its value as the tag name
            # - if a tag_name is specified as well, it's probably important (like an attribute name),
            # so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
            if "type" in obj:
                inner_tag = etree.Element(obj["type"])
                if tag_name is not None:
                    outer_tag = etree.Element(tag_name)
                    outer_tag.append(inner_tag)
                else:
                    outer_tag = inner_tag
            else:
                # Untyped dictionary: a single tag, named after tag_name when given
                inner_tag = etree.Element(
                    tag_name if tag_name is not None else "_dict"
                )
                outer_tag = inner_tag
            # Recurse on dictionary items
            for key, val in obj.items():
                if key == "type":
                    # Already used as the tag name
                    continue
                if isinstance(val, (list, dict)):
                    # Structured attributes become child tags
                    inner_tag.append(self.to_xml(val, key))
                else:
                    # Primitive attributes become tag attributes
                    inner_tag.set(key, str(val))
            return outer_tag

        elif isinstance(obj, list):
            list_tag = etree.Element(tag_name or "_list")
            for e in obj:
                list_tag.append(self.to_xml(e))
            return list_tag

        else:
            # Primitive value outside of a dictionary: stored as tag text
            leaf_tag = etree.Element(tag_name or "_literal")
            leaf_tag.text = str(obj)
            return leaf_tag

    def __str__(self) -> str:
        return "XpathParser"


class MatchParser:
    """A parser for the JS portion of an activity, that uses Python match statements
    to navigate the abstract syntax tree (AST) produced by Esprima"""

    def __init__(self, graph: Graph, act_id: str) -> None:
        self.graph = graph
        self.act_id = act_id

    def parse(self, js, output=sys.stdout):
        self.output = output
        jstree = es.parseScript(js, None)
        # Try to match our template with one of the top-level statements
        for statement in jstree.body:
            self.match_function(statement.toDict())

    def match_constructor_call(self, new_expr: dict):
        match new_expr:
            case {
                "type": "NewExpression",
                "callee": {
                    "type": "Identifier",
                    "name": typ,
                },
                "arguments": [*args],
            }:
                match typ:
                    case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD":
                        self.print(typ)
                        self.graph.add((NS[self.act_id], RDF.type, NS[typ]))
                    case "ExerciceQC":
                        match args:
                            case [{"type": "Literal", "value": "QCU"}, *_]:
                                typ += "_QCU"
                            case [{"type": "Literal", "value": "QCM"}, *_]:
                                typ += "_QCM"
                            case _:
                                raise ParseError(
                                    f"ExerciceQC: Invalid argument '{args}'"
                                )
                        self.print(typ)
                        self.graph.add((NS[self.act_id], RDF.type, NS[typ]))
                    case _:
                        raise ParseError(f"Unknown activity type '{typ}'")
            case _:
                pass

    def print(self, s: str):
        print(s, end="", file=self.output)

    def match_function(self, func: dict):
        """Checks if `func` matches a function declaration named `entrerDonnees`,
        and search its body if successful
        """

        match func:
            case {
                "type": "FunctionDeclaration",
                "id": {"name": "entrerDonnees"},
                "body": {"type": "BlockStatement", "body": body},
            }:
                # Matched a function declaration and captured its `body` attr
                for statement in body:
                    # Find constructor calls (e.g. `new Thing()`) recursively
                    recurse_prefix(statement, self.match_constructor_call)

    def __str__(self) -> str:
        return "MatchParser"


def recurse_prefix(t, f):
    """Pre-order traversal: apply `f` to `t`, then to everything nested in it.

    Lists are descended through their items and dictionaries through their
    values, in iteration order; any other object is a leaf.

    :param t: The object
    :param f: The function to call
    """
    f(t)
    if isinstance(t, dict):
        children = t.values()
    elif isinstance(t, list):
        children = t
    else:
        # Leaf: nothing to descend into
        return
    for child in children:
        recurse_prefix(child, f)


# Regex splitting a string into a leading run of non-digit characters and the
# run of digits that follows it (e.g. "divCmt12" -> ("divCmt", "12")).
# Both groups may be empty, so `match` succeeds on any string.
regex_comment = re.compile(r"(\D*)(\d*)")


def parse_page(graph: Graph, filepath: str, id: str):
    """Extract activity data from one HTML page and record it in `graph`.

    The inline JavaScript of the page head is handed to each available
    parser, and the comments found under the "zoneInvisible" element are
    added to the graph as literals attached to the activity.

    :param graph: RDF graph receiving the extracted triples
    :param filepath: Path of the HTML file to read
    :param id: Identifier of the activity, used as the subject of the triples
    """
    page = Page()
    # Parse with lxml
    root = html.parse(filepath).getroot()

    # Gather the inline scripts of the head (those without a 'src'
    # attribute) into a single block of JS code
    inline_scripts = root.xpath(
        '/html/head/script[@type="text/javascript" and not(@src)]'
    )
    js = "\n".join(script.text_content() for script in inline_scripts)

    # Run every parser; each one appends to its own file so that their
    # results can be compared
    parsers = [XpathParser(), MatchParser(graph, id), RegexParser()]
    for parser in parsers:
        with open(f"/tmp/{str(parser)}.txt", "a") as f:
            print(f"\n{id:8}", end="", file=f)
            try:
                parser.parse(js, output=f)
            except ParseError as e:
                print(f"{parser} -> {id}: Parsing error: {e}", file=sys.stderr)

    # Numbered comment kinds: id prefix -> (list on the page, graph predicate)
    numbered_kinds = {
        "divCmt": (page.comments_misc, "commentaireInfo"),
        "divSugg": (page.comments_sugg, "commentaireSugg"),
    }

    # Parse comments
    for cmt_div in root.get_element_by_id("zoneInvisible"):
        comment = Comment()
        comment.text = cmt_div.text_content()
        comment.html = html.tostring(cmt_div, encoding="unicode")
        comment.elem = cmt_div
        comment.id = cmt_div.get("id") or ""

        # Split the id into its non-digit prefix and the digits after it
        m = regex_comment.match(comment.id)
        if m is None:
            continue
        prefix, digits = m.groups()

        if prefix in numbered_kinds:
            bucket, predicate = numbered_kinds[prefix]
            # NOTE(review): `digits` is empty for an id without a number, in
            # which case int() raises ValueError -- confirm such ids cannot
            # occur in the pages.
            comment.num = int(digits)
            bucket.append(comment)
            graph.add((NS[id], NS[predicate], Literal(comment.html)))
        elif prefix == "divCmtSucces":
            page.comment_success = comment
            graph.add((NS[id], NS["commentaireSucces"], Literal(comment.html)))
        # Elements with any other id are ignored
    # pprint(page)