# extract_page.py

import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Callable

import esprima as es
from lxml import etree, html
from lxml.etree import _Element
from lxml.html import HtmlElement
from rdflib import RDF, Graph, Literal
from typing_extensions import override

from common import *

# Module-level logger used by this file for warnings and parsing errors.
# `get_logger` is presumably provided by the `common` star-import — TODO confirm.
log = get_logger("extract_page")


class Comment:
    """A comment block found in an activity page.

    The constructor only declares the attributes; no value is assigned
    until the code building the comment sets them, so reading an attribute
    that was never set raises `AttributeError`.
    """

    def __init__(self):
        # Declarations only, nothing is assigned here
        self.id: str  # identifier of the comment
        self.num: int  # numeric part of the identifier, when there is one
        self.text: str  # plain-text content
        self.html: Any  # HTML rendering of the comment
        self.elem: _Element  # element the comment was read from

    @override
    def __repr__(self):
        # Only the attributes that were actually assigned are shown
        return repr(vars(self))


class Activity:
    def __init__(self):
        self.id: str = ""
        """The ID of the page this activity is in (`pg###`)"""
        self.title: str = ""
        """Human-readable title of the activity"""
        self.description: str | None = None
        """Description of the activity's body (HTML),
        e.g. the instructions for an exercise activity"""
        self.comment_consigne: Comment | None = None
        """Another form of activity description but in a comment. May or may not
        coexist with a regular description"""
        self.comment_success: Comment | None = None
        """Comment displayed on success, if applicable"""
        self.comments_sugg: list[Comment] = []
        """Help comments displayed on failure, if applicable"""
        self.comments_misc: list[Comment] = []
        """Any other comments, if present"""
        self.ref: URIRef

    def save(self, graph: Graph):
        """Save activity data to the graph. Subclasses may override this method
        to save their specific data."""
        self.ref = NS[self.id]
        # => Type
        graph.add((self.ref, RDF.type, NS[self.get_name()]))
        # => Title
        set_title(graph, self.ref, self.title)
        # => Description
        description = self.description or ""
        if self.comment_consigne is not None:
            description += self.comment_consigne.html
        if description != "":
            graph.add((self.ref, NS["description"], Literal(description)))
        # => Comments
        if self.comment_success is not None:
            graph.add(
                (self.ref, NS["commentaireSucces"], Literal(self.comment_success.html))
            )
        for comment in self.comments_sugg:
            graph.add((self.ref, NS["commentaireSugg"], Literal(comment.html)))
        for comment in self.comments_misc:
            graph.add((self.ref, NS["commentaireInfo"], Literal(comment.html)))

    def parse_html(self, root: HtmlElement):
        """From a `lxml.html` parsing tree, extract all data relevant to this class.
        Subclasses may override this method to extract more specific data.
        """
        # => Title
        self.title = root.xpath("/html/head/title")[0].text
        # => Comments
        zi = root.get_element_by_id("zoneInvisible")
        for cmt_div in zi:
            comment = Comment()
            comment.text = cmt_div.text_content()
            comment.html = to_html(cmt_div)
            comment.elem = cmt_div
            comment.id = cmt_div.get("id") or ""
            # Split id in two parts (non-digits and digits), then match on these parts
            m = regex_comment.match(comment.id)
            if m is not None:
                match m.groups():
                    case ["divCmt", num]:
                        comment.num = int(num)
                        self.comments_misc.append(comment)
                    case ["divSugg", num]:
                        comment.num = int(num)
                        self.comments_sugg.append(comment)
                    case ["divCmtSucces", _]:
                        self.comment_success = comment
                    case ["divConsigne", _]:
                        self.comment_consigne = comment
                    case [alpha, num]:
                        log.warning(
                            f"No match for comment {alpha}[{num}] ('{comment.id}')"
                        )

    def get_name(self) -> str:
        return type(self).__name__

    @classmethod
    def from_typename(cls, name: str):
        """Convenience function to create an `Activity` subclass from a name"""
        match name:
            case "Cours":
                return Cours()
            case "ExerciceQC":
                return ExerciceQC()
            case "ExerciceQM":
                return ExerciceQM()
            case "ExerciceTAT":
                return ExerciceTAT()
            case "ExerciceGD":
                return ExerciceGD()
            case _:
                raise NameError(name=name)

    @override
    def __repr__(self):
        return self.get_name() + str(self.__dict__)


class Cours(Activity):
    """Activity whose description is the content of the `STY_texteCours` element."""

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common activity data, then use the HTML of the element
        with id `STY_texteCours` (surrounding whitespace removed) as description.
        """
        super().parse_html(root)
        # => Description
        self.description = to_html(root.get_element_by_id("STY_texteCours")).strip()


class Exercice(Activity):
    """Base class of the exercise activities: the description is the content
    of the `STY_question` element."""

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common activity data, then use the HTML of the element
        with id `STY_question` (surrounding whitespace removed) as description.
        """
        super().parse_html(root)
        # => Description
        self.description = to_html(root.get_element_by_id("STY_question")).strip()


@dataclass
class Choice:
    """A possible answer for a question, correct or not"""

    # Declared as real dataclass fields so that the generated `__eq__` and
    # `__repr__` take them into account. With a hand-written `__init__` and
    # no declared field, all instances compared equal and printed as `Choice()`.
    index: int = 0
    """Position of the choice in its question"""
    is_correct: bool = False
    """Whether picking this choice is a correct answer"""
    html: str = ""
    """HTML content of the choice"""


class ChoiceGroup:
    """A labelled group of choices."""

    def __init__(self):
        # Attributes get a real (empty) value: a bare annotation assigns
        # nothing, so the first read would raise AttributeError
        self.label: str = ""
        self.items: list[Choice] = []


class ExerciceQC(Exercice):
    """Exercise made of a list of choices, in its QCU or QCM variant
    (selected by `is_qcm`)."""

    def __init__(self, is_qcm: bool = False) -> None:
        super().__init__()
        self.is_qcm = is_qcm
        self.choices: list[Choice] = []

    @override
    def get_name(self) -> str:
        """Return the type name matching the variant of this exercise."""
        if self.is_qcm:
            return "ExerciceQC_QCM"
        return "ExerciceQC_QCU"

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common exercise data, then the HTML of every choice
        (elements of class `STY_reponseQC`)."""
        super().parse_html(root)
        for answer_elem in root.find_class("STY_reponseQC"):
            # The 'id' attribute looks like 'lienRepX', X being the 1-based
            # position of the choice
            position = int(answer_elem.attrib["id"].replace("lienRep", ""))
            answer_html = to_html(answer_elem).strip()
            self.set_html(position - 1, answer_html)

    @override
    def save(self, graph: Graph):
        """Save the common activity data, then one `Reponse` node per choice."""
        super().save(graph)
        for choice in self.choices:
            rdf_name = f"{self.id}q{choice.index}"  # ex: pg157q2
            verdict = "V" if choice.is_correct else "F"
            display_name = rdf_name + " | " + verdict
            choice_node = NS[rdf_name]
            triples = (
                (choice_node, RDF.type, NS["Reponse"]),
                (choice_node, NS["index"], Literal(choice.index)),
                (choice_node, NS["correct"], Literal(choice.is_correct)),
                (choice_node, NS["html"], Literal(choice.html)),
                (choice_node, NS["__protege_display_name"], Literal(display_name)),
                (NS[self.id], NS["aReponse"], choice_node),
                # Our fake "class hierarchy" just for easier visualization
                (choice_node, RDFS.subClassOf, NS[self.id]),
            )
            for triple in triples:
                graph.add(triple)

    def set_correct(self, choice_index: int, correct: bool):
        """Mark the choice at `choice_index` as correct or not, creating it if needed."""
        target = self._get_or_create(choice_index)
        target.is_correct = correct

    def set_html(self, choice_index: int, html: str):
        """Store `html` in the choice at `choice_index`, creating it if needed."""
        target = self._get_or_create(choice_index)
        target.html = html

    def _get_or_create(self, index: int) -> Choice:
        """Return the choice at `index`, first appending as many new choices
        as needed for that position to exist."""
        while len(self.choices) <= index:
            self.choices.append(Choice(len(self.choices)))
        return self.choices[index]


class ExerciceQM(Exercice):
    """Exercise holding a list of choice groups (`questions`).

    Only the data holder is defined here: this class adds no parsing or
    saving of its own.
    """

    def __init__(self):
        super().__init__()
        # Start empty rather than merely declared, so reading the attribute
        # never raises AttributeError
        self.questions: list[ChoiceGroup] = []


class ExerciceTAT(Exercice):
    """Exercise holding a text and a list of choice groups (`gaps`).

    Only the data holder is defined here: this class adds no parsing or
    saving of its own.
    """

    def __init__(self):
        super().__init__()
        # Start empty rather than merely declared, so reading the attributes
        # never raises AttributeError
        self.text: str = ""  # can be HTML
        self.gaps: list[ChoiceGroup] = []


class ExerciceGD(Exercice):
    """Exercise holding a list of targets and, in `draggables`, lists of choices.

    Only the data holder is defined here: this class adds no parsing or
    saving of its own.
    """

    def __init__(self):
        super().__init__()
        # Start empty rather than merely declared, so reading the attributes
        # never raises AttributeError
        self.targets: list[str] = []
        self.draggables: list[list[Choice]] = []


class JSParser(ABC):
    """Common interface of the parsers for the JavaScript portion of a page.

    Deriving from `ABC` is what makes `@abstractmethod` effective: the class
    itself, or a subclass that does not implement `parse`, cannot be
    instantiated.
    """

    @abstractmethod
    def parse(self, js: str) -> Activity:
        """Parse a string of JavaScript code and returns an instance of the
        correct `Activity` subclass, partially populated with data found in the code.

        :param js: The JavaScript source code
        :return: The activity described by the code
        """

    @override
    def __str__(self) -> str:
        # The class name identifies the parser in logs and debug file names
        return type(self).__name__


class RegexParser(JSParser):
    """A parser for the JS portion of an activity, that applies regular
    expressions directly to the source text"""

    def __init__(self, graph: Graph, act_id: str) -> None:
        self.graph = graph
        self.act_id = act_id

    @override
    def parse(self, js: str) -> Activity:
        """Find the `entrerDonnees` function in `js` and build the activity
        from the code that follows its declaration.

        :raises ParseError: if the function or the activity constructor
            cannot be found, or if the activity type is unknown
        """
        # Find function declaration and only keep code after it
        func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
        if len(func_split) < 2:
            raise ParseError("Failed to find function 'entrerDonnees'")
        body = func_split[1]

        # The name of the variable holding the activity is not needed here
        activity, _ = self._parse_activity_constructor(body)
        if isinstance(activity, ExerciceQC):
            # Parse correct answers
            self._parse_qc_answers(body, activity)

        return activity

    def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
        """
        Find activity constructor call, return the activity
        and resulting variable name.

        :raises ParseError: if no constructor call is found or if the
            constructor name is not a known activity type
        """
        constructor_match = re.search(
            r"""
                (\w+)               # result variable name
                \s+=\s+new\s+       #
                (Cours|Exercice\w+) # constructor name
                \((.*?)\);          # optional arguments between parentheses
                    """,
            code,
            re.VERBOSE,
        )
        if constructor_match is None:
            raise ParseError("Failed to parse activity constructor")

        var_name, act_type, args = constructor_match.groups()
        try:
            activity = Activity.from_typename(act_type)
        except NameError as e:
            # The pattern accepts any `Exercice*` name: report unknown ones
            # as a ParseError, the only error callers of `parse` handle
            raise ParseError(f"Unknown activity type '{act_type}'") from e
        # Handle case of QC variants (either quote style, padding ignored)
        if isinstance(activity, ExerciceQC) and args.strip() in ('"QCM"', "'QCM'"):
            activity.is_qcm = True
        return activity, var_name

    def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
        """Parse the correct answers for a QC activity (QCU or QCM).

        Reads the code line by line: a `var nr = N;` line selects the current
        choice, the following `CODE_F` / `CODE_V` assignment lines mark it as
        incorrect / correct.
        """
        index_line = re.compile(r"var nr = (\d+);")
        index = 0
        for line in code.splitlines():
            line = line.strip()
            m = index_line.match(line)
            if m is not None:
                # "index" line
                index = int(m.group(1)) - 1  # question indexes start at 1
            elif line == "exo.tabStylesR[nr] = CODE_F;":
                # "incorrect answer" line
                exo.set_correct(index, False)
            elif line == "exo.tabStylesR[nr] = CODE_V;":
                # "correct answer" line
                exo.set_correct(index, True)


class XpathParser(JSParser):
    """A parser for the JS portion of an activity, that uses XPath to query
    an XML representation of Esprima's abstract syntax tree (AST)"""

    # XPath requests pre-compiled as functions
    request_function = etree.XPath(
        '//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]'
    )
    request_index_and_values = etree.XPath(
        '*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression//Identifier[starts-with(@name,"CODE_")]'
    )
    request_constructor_id = etree.XPath(
        '*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]'
    )

    def __init__(self) -> None:
        self.fun: Any
        """AST element corresponding to the function we're interested in.
        Initialised in `self.parse()`."""

    @override
    def parse(self, js: str) -> Activity:
        jstree: Any = es.parseScript(js, None)
        # Convert Esprima object tree to XML etree
        xml = self.to_xml(jstree.toDict(), "jstree")
        try:
            self.fun = self.request_function(xml)[0]
            activity = self._parse_activity_type()
            if isinstance(activity, ExerciceQC):
                self._parse_qc_answers(activity)
            return activity
        except Exception as e:
            raise ParseError(e)

    def _parse_activity_type(self) -> Activity:
        constructor_id = self.request_constructor_id(self.fun)[0]
        match constructor_id.get("name"):
            case "ExerciceQC":
                arg = constructor_id.xpath("../../arguments/Literal/@value")[0]
                if arg == "QCM":
                    return ExerciceQC(is_qcm=True)
                elif arg == "QCU":
                    return ExerciceQC()
                else:
                    raise ParseError(f"ExerciceQC: invalid argument '{arg}'")
            case other:
                return Activity.from_typename(other)

    def _parse_qc_answers(self, activity: ExerciceQC) -> None:
        """Parse the correct answers for a QC activity"""
        indexes_and_values = self.request_index_and_values(self.fun)
        index = 0
        for e in indexes_and_values:
            value = e.xpath("@value")
            if len(value) != 0:
                # "index line"
                index = int(value[0]) - 1  # question indexes start at 1
            else:
                # "correct" or "incorrect" line
                activity.set_correct(index, e.get("name") == "CODE_V")

    def to_xml(self, obj: Any, tag_name: str | None = None):
        """Recursively convert an object structure to an XML `ElementTree`.
        Structures are expected to be Python dictionaries.
        Converting a dictionary produces a tag named after the "type" attribute (if present).
        - A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
        - A list attribute becomes a tag with its contents as sub-tags.
        - A dictionary attribute becomes a tag (named like the attribute's key)
        containing a sub-tag for the dictionary itself
        """
        if isinstance(obj, dict):
            # Dictionary (or object):
            # - if it has a "type" key, the dict represents an object -> use its value as the tag name
            # - if a tag_name is specified as well, it's probably important (like an attribute name),
            # so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
            inner_tag = None
            outer_tag = None
            has_inner = "type" in obj.keys()
            if has_inner:
                inner_tag = etree.Element(obj["type"], None, None)
            else:
                inner_tag = etree.Element("_dict", None, None)

            if tag_name is not None:
                outer_tag = etree.Element(tag_name)
                if has_inner:
                    outer_tag.append(inner_tag)
                else:
                    inner_tag = outer_tag
            else:
                outer_tag = inner_tag
            # Recurse on dictionary items
            for key, val in obj.items():
                if key != "type":  # exception for 'type', handled as attribute
                    if isinstance(val, (list, dict)):
                        # Structured attributes become child tags
                        inner_tag.append(self.to_xml(val, key))
                    else:
                        # Primitive attributes become tag attributes
                        inner_tag.set(key, str(val))
            return outer_tag

        elif isinstance(obj, list):
            tag_name = tag_name or "_list"
            list_tag = etree.Element(tag_name)
            for e in obj:
                list_tag.append(self.to_xml(e))
            return list_tag

        else:
            tag_name = tag_name or "_literal"
            leaf_tag = etree.Element(tag_name)
            leaf_tag.text = str(obj)
            return leaf_tag


class MatchParser(JSParser):
    """A parser for the JS portion of an activity, that uses Python match statements
    to navigate the abstract syntax tree (AST) produced by Esprima"""

    def __init__(self, graph: Graph, act_id: str) -> None:
        self.graph = graph
        self.act_id = act_id
        self.activity: Activity | None = None

    @override
    def parse(self, js: str) -> Activity:
        jstree = es.parseScript(js, None)
        # Try to match our template with one of the top-level statements
        for statement in jstree.body:
            self.match_function(statement.toDict())

        if self.activity is not None:
            return self.activity
        else:
            raise ParseError("No activity constructor found")

    def match_constructor_call(self, new_expr: dict[str, Any]):
        if self.activity is not None:  # Ignore anything after the first match
            return
        match new_expr:
            case {
                "type": "NewExpression",
                "callee": {
                    "type": "Identifier",
                    "name": typ,
                },
                "arguments": [*args],
            }:
                match typ:
                    case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD":
                        self.activity = Activity.from_typename(typ)
                    case "ExerciceQC":
                        match args:
                            case [{"type": "Literal", "value": "QCU"}, *_]:
                                typ += "_QCU"
                                self.activity = ExerciceQC()
                            case [{"type": "Literal", "value": "QCM"}, *_]:
                                typ += "_QCM"
                                self.activity = ExerciceQC(is_qcm=True)
                            case _:
                                raise ParseError(
                                    f"ExerciceQC: Invalid argument '{args}'"
                                )
                    case _:
                        raise ParseError(f"Unknown activity type '{typ}'")
            case _:
                pass

    def match_function(self, func: dict[str, Any]):
        """Checks if `func` matches a function declaration named `entrerDonnees`,
        and search its body if successful
        """

        match func:
            case {
                "type": "FunctionDeclaration",
                "id": {"name": "entrerDonnees"},
                "body": {"type": "BlockStatement", "body": body},
            }:
                # Matched a function declaration and captured its `body` attr
                for statement in body:
                    # Find constructor calls (e.g. `new Thing()`) recursively
                    recurse_prefix(statement, self.match_constructor_call)
            case _:
                pass


def recurse_prefix(t: Any, f: Callable[[Any], None]):
    """Pre-order, depth-first traversal: `f` is called on `t` first, then the
    traversal continues into each element of `t` (for a list) or each value
    of `t` (for a dictionary). Any other kind of object has no children.

    :param t: The object to visit
    :param f: The function called on every visited object
    """
    f(t)
    if isinstance(t, dict):
        children = t.values()
    elif isinstance(t, list):
        children = t
    else:
        # Leaf: nothing to descend into
        return
    for child in children:
        recurse_prefix(child, f)


# Regex splitting a string into a leading run of non-digit characters (group 1)
# followed by a run of digits (group 2), e.g. "divSugg2" -> ("divSugg", "2").
# Both groups may be empty, so matching from the start of a string never fails.
regex_comment = re.compile(r"(\D*)(\d*)")


def parse_page(graph: Graph, filepath: str, id: str):
    """Parse the activity page stored in `filepath` and save it to `graph`.

    Every JS parser is run in turn and appends its result to its own debug
    file. The activity returned by the last parser that succeeds is kept;
    if all of them fail, a generic `Activity` is used.

    :param graph: The graph receiving the activity data
    :param filepath: Path of the HTML file of the page
    :param id: Identifier given to the activity
    """
    # Activity data is spread across HTML and JS code, which are parsed
    # differently. Additionally, some pieces of data are specific to the
    # activity type (Cours, ExerciceQC...) and this type is in the JS portion.
    # This requires parsing the JS code first, to get the type, then proceed
    # with HTML to get the rest of the type-specific data.

    # We still need to find the inline scripts before parsing them
    tree = html.parse(filepath)
    root = tree.getroot()
    # Collect all inline scripts (no external 'src') and join them in a
    # block of JS code
    scripts: list[HtmlElement] = root.xpath(
        '/html/head/script[@type="text/javascript" and not(@src)]'
    )
    js = "\n".join(s.text_content() for s in scripts)

    activity = Activity()
    # Try different parsers, each writing to a different file to compare their results
    for parser in [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]:
        # Explicit encoding: the dumped activity contains page text, which
        # must not depend on the platform's default encoding
        with open(f"/tmp/{str(parser)}_debuglog.txt", "a", encoding="utf-8") as f:
            print(f"\n{id:8}", end="", file=f)
            try:
                activity = parser.parse(js)
            except ParseError as e:
                log.error(
                    f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity."
                )
            else:
                print(activity, end="", file=f)

    activity.id = id
    # Parse the HTML portion
    activity.parse_html(root)
    # Save everything to the graph
    activity.save(graph)