Skip to content
Snippets Groups Projects
Select Git revision
  • 8a3a78bce123228a7cac5349d3e77d05c8ba7dbe
  • mui5-annotation-on-video-stable default
  • get_setter_canvasSizeInformations
  • fix-error-div-into-p
  • annotation-on-video-v2
  • detached
  • annotation-on-video-r17
  • mui5
  • mui5-react-18
  • jacob-test
  • annotation-on-video protected
  • master
  • test-antoinev1
  • 20-fetch-thumbnail-on-annotation
  • add-research-field
  • Save
  • add-plugin
  • 14-wip-no-seek-to
  • 14-bug-on-video-time-control
  • 9_wip_videotests
  • _upgrade_material_ui
  • latest-tetras-16
  • v3.3.0
  • v3.2.0
  • v3.1.1
  • v3.1.0
  • v3.0.0
  • v3.0.0-rc.7
  • v3.0.0-rc.6
  • v3.0.0-rc.5
  • v3.0.0-rc.4
  • v3.0.0-rc.3
  • v3.0.0-rc.2
  • v3.0.0-rc.1
  • v3.0.0-beta.10
  • v3.0.0-beta.9
  • v3.0.0-beta.8
  • v3.0.0-beta.7
  • v3.0.0-beta.6
  • v3.0.0-beta.5
  • v3.0.0-beta.3
41 results

MinimalWindow.js

Blame
  • extract_page.py 20.53 KiB
    import re
    from abc import ABC, abstractmethod
    from dataclasses import dataclass
    from typing import Any, Callable
    
    import esprima as es
    from lxml import etree, html
    from lxml.etree import _Element
    from lxml.html import HtmlElement
    from rdflib import RDF, Graph, Literal
    from typing_extensions import override
    
    from common import *
    
    # Initialise the module-level logger used by the classes and functions below.
    # NOTE(review): `get_logger` is not defined in this file; presumably it is
    # provided by the `from common import *` star-import — confirm.
    log = get_logger("extract_page")
    
    
    class Comment:
        def __init__(self):
            self.id: str
            self.num: int
            self.text: str
            self.html: Any
            self.elem: _Element
    
        @override
        def __repr__(self):
            return str(self.__dict__)
    
    
    class Activity:
        def __init__(self):
            self.id: str = ""
            """The ID of the page this activity is in (`pg###`)"""
            self.title: str = ""
            """Human-readable title of the activity"""
            self.description: str | None = None
            """Description of the activity's body (HTML),
            e.g. the instructions for an exercise activity"""
            self.comment_consigne: Comment | None = None
            """Another form of activity description but in a comment. May or may not
            coexist with a regular description"""
            self.comment_success: Comment | None = None
            """Comment displayed on success, if applicable"""
            self.comments_sugg: list[Comment] = []
            """Help comments displayed on failure, if applicable"""
            self.comments_misc: list[Comment] = []
            """Any other comments, if present"""
            self.ref: URIRef
    
        def save(self, graph: Graph):
            """Save activity data to the graph. Subclasses may override this method
            to save their specific data."""
            self.ref = NS[self.id]
            # => Type
            graph.add((self.ref, RDF.type, NS[self.get_name()]))
            # => Title
            set_title(graph, self.ref, self.title)
            # => Description
            description = self.description or ""
            if self.comment_consigne is not None:
                description += self.comment_consigne.html
            if description != "":
                graph.add((self.ref, NS["description"], Literal(description)))
            # => Comments
            if self.comment_success is not None:
                graph.add(
                    (self.ref, NS["commentaireSucces"], Literal(self.comment_success.html))
                )
            for comment in self.comments_sugg:
                graph.add((self.ref, NS["commentaireSugg"], Literal(comment.html)))
            for comment in self.comments_misc:
                graph.add((self.ref, NS["commentaireInfo"], Literal(comment.html)))
    
        def parse_html(self, root: HtmlElement):
            """From a `lxml.html` parsing tree, extract all data relevant to this class.
            Subclasses may override this method to extract more specific data.
            """
            # => Title
            self.title = root.xpath("/html/head/title")[0].text
            # => Comments
            zi = root.get_element_by_id("zoneInvisible")
            for cmt_div in zi:
                comment = Comment()
                comment.text = cmt_div.text_content()
                comment.html = to_html(cmt_div)
                comment.elem = cmt_div
                comment.id = cmt_div.get("id") or ""
                # Split id in two parts (non-digits and digits), then match on these parts
                m = regex_comment.match(comment.id)
                if m is not None:
                    match m.groups():
                        case ["divCmt", num]:
                            comment.num = int(num)
                            self.comments_misc.append(comment)
                        case ["divSugg", num]:
                            comment.num = int(num)
                            self.comments_sugg.append(comment)
                        case ["divCmtSucces", _]:
                            self.comment_success = comment
                        case ["divConsigne", _]:
                            self.comment_consigne = comment
                        case [alpha, num]:
                            log.warning(
                                f"No match for comment {alpha}[{num}] ('{comment.id}')"
                            )
    
        def get_name(self) -> str:
            return type(self).__name__
    
        @classmethod
        def from_typename(cls, name: str):
            """Convenience function to create an `Activity` subclass from a name"""
            match name:
                case "Cours":
                    return Cours()
                case "ExerciceQC":
                    return ExerciceQC()
                case "ExerciceQM":
                    return ExerciceQM()
                case "ExerciceTAT":
                    return ExerciceTAT()
                case "ExerciceGD":
                    return ExerciceGD()
                case _:
                    raise NameError(name=name)
    
        @override
        def __repr__(self):
            return self.get_name() + str(self.__dict__)
    
    
    class Cours(Activity):
        """Activity whose description is the HTML of the page element with id
        `STY_texteCours`."""

        @override
        def parse_html(self, root: HtmlElement):
            """Extract the common activity data, then the course body."""
            super().parse_html(root)
            # => Description
            body_html = to_html(root.get_element_by_id("STY_texteCours"))
            self.description = body_html.strip()
    
    
    class Exercice(Activity):
        """Base class of the exercise activities. Its description is the HTML of
        the page element with id `STY_question`."""

        @override
        def parse_html(self, root: HtmlElement):
            """Extract the common activity data, then the question text."""
            super().parse_html(root)
            # => Description
            question_html = to_html(root.get_element_by_id("STY_question"))
            self.description = question_html.strip()
    
    
    @dataclass
    class Choice:
        """A possible answer for a question, correct or not.

        The attributes are declared as dataclass fields so that the generated
        `__repr__` and `__eq__` take them into account. With a hand-written
        `__init__` and no declared field, every `Choice` compared equal to any
        other and was displayed as `Choice()`.
        """

        # Position of the choice among the answers of its question
        index: int = 0
        # Whether selecting this choice is a correct answer
        is_correct: bool = False
        # HTML content of the choice
        html: str = ""
    
    
    class ChoiceGroup:
        """A label together with a list of `Choice` items."""

        def __init__(self) -> None:
            # Declarations only: nothing is assigned here, the attributes must
            # be set by the code that builds the group.
            self.label: str
            self.items: list[Choice]
    
    
    class ExerciceQC(Exercice):
        """Exercise made of a list of choices, in one of two variants selected by
        `is_qcm` (saved as type `ExerciceQC_QCM` or `ExerciceQC_QCU`)."""

        def __init__(self, is_qcm: bool = False) -> None:
            super().__init__()
            self.is_qcm = is_qcm
            self.choices: list[Choice] = []

        @override
        def get_name(self) -> str:
            if self.is_qcm:
                return "ExerciceQC_QCM"
            return "ExerciceQC_QCU"

        @override
        def parse_html(self, root: HtmlElement):
            """Extract the common exercise data, then the HTML of each choice."""
            super().parse_html(root)
            for answer_elem in root.find_class("STY_reponseQC"):
                # The 'id' attribute looks like 'lienRepX', X being the 1-based
                # position of the choice
                position = int(answer_elem.attrib["id"].replace("lienRep", ""))
                self.set_html(position - 1, to_html(answer_elem).strip())

        @override
        def save(self, graph: Graph):
            """Save the common activity data, then one node per choice."""
            super().save(graph)
            for choice in self.choices:
                rdf_name = f"{self.id}q{choice.index}"  # ex: pg157q2
                flag = "V" if choice.is_correct else "F"
                display_name = f"{rdf_name} | {flag}"
                node = NS[rdf_name]
                triples = [
                    (node, RDF.type, NS["Reponse"]),
                    (node, NS["index"], Literal(choice.index)),
                    (node, NS["correct"], Literal(choice.is_correct)),
                    (node, NS["html"], Literal(choice.html)),
                    (node, NS["__protege_display_name"], Literal(display_name)),
                    (NS[self.id], NS["aReponse"], node),
                    # Our fake "class hierarchy" just for easier visualization
                    (node, RDFS.subClassOf, NS[self.id]),
                ]
                for triple in triples:
                    graph.add(triple)

        def set_correct(self, choice_index: int, correct: bool):
            """Mark the choice at `choice_index` as correct or not, creating it if needed."""
            target = self._get_or_create(choice_index)
            target.is_correct = correct

        def set_html(self, choice_index: int, html: str):
            """Store `html` on the choice at `choice_index`, creating it if needed."""
            target = self._get_or_create(choice_index)
            target.html = html

        def _get_or_create(self, index: int) -> Choice:
            """Return the choice at `index`, first appending new choices until the
            list is long enough."""
            while len(self.choices) <= index:
                self.choices.append(Choice(len(self.choices)))
            return self.choices[index]
    
    
    class ExerciceQM(Exercice):
        """Exercise holding a list of `ChoiceGroup` questions."""

        def __init__(self) -> None:
            super().__init__()
            # Declaration only: no value is assigned here
            self.questions: list[ChoiceGroup]
    
    
    class ExerciceTAT(Exercice):
        """Exercise holding a text and a list of `ChoiceGroup` gaps."""

        def __init__(self) -> None:
            super().__init__()
            # Declarations only: no value is assigned here
            self.text: str  # can be HTML
            self.gaps: list[ChoiceGroup]
    
    
    class ExerciceGD(Exercice):
        """Exercise holding a list of targets and, as lists of `Choice`, the
        draggables."""

        def __init__(self) -> None:
            super().__init__()
            # Declarations only: no value is assigned here
            self.targets: list[str]
            self.draggables: list[list[Choice]]
    
    
    class JSParser:
        @abstractmethod
        def parse(self, js: str) -> Activity:
            """Parse a string of JavaScript code and returns an instance of the
            correct `Activity` subclass, partially populated with data found in the code.
            """
            pass
    
        @override
        def __str__(self) -> str:
            return type(self).__name__
    
    
    class RegexParser(JSParser):
        """A parser for the JS portion of an activity, that applies regular
        expressions directly to the source text."""

        # Declaration of the function holding the activity data
        _FUNCTION_RE = re.compile(r"\s*?function entrerDonnees\(\s*?\)\s*?{")
        # Activity constructor call, e.g. `exo = new ExerciceQC("QCU");`
        _CONSTRUCTOR_RE = re.compile(
            r"""
            (\w+)               # result variable name
            \s+=\s+new\s+       # assignment of a `new` expression
            (Cours|Exercice\w+) # constructor name
            \((.*?)\);          # optional arguments between parentheses
            """,
            re.VERBOSE,
        )
        # Line setting the index of the answer described by the following lines
        _INDEX_RE = re.compile(r"var nr = (\d+);")

        def __init__(self, graph: Graph, act_id: str) -> None:
            self.graph = graph
            self.act_id = act_id

        @override
        def parse(self, js: str) -> Activity:
            """Create the activity described by the function `entrerDonnees`.

            :param js: The JavaScript source code
            :raises ParseError: if the function or the activity constructor
                cannot be found, or if the activity type is unknown
            """
            # Find function declaration and only keep code after it
            func_split = self._FUNCTION_RE.split(js)
            if len(func_split) < 2:
                raise ParseError("Failed to find function 'entrerDonnees'")
            body = func_split[1]

            # The name of the JS variable is returned too, but not needed here
            activity, _ = self._parse_activity_constructor(body)
            if isinstance(activity, ExerciceQC):
                # Parse correct answers
                self._parse_qc_answers(body, activity)

            return activity

        def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
            """
            Find activity constructor call, return the activity
            and resulting variable name.

            :raises ParseError: if no constructor call is found, or if it names an
                unknown activity type
            """
            constructor_match = self._CONSTRUCTOR_RE.search(code)
            if constructor_match is None:
                raise ParseError("Failed to parse activity constructor")

            var_name, act_type, args = constructor_match.groups()
            try:
                activity = Activity.from_typename(act_type)
            except NameError as e:
                # The regex accepts any `Exercice...` name: report unknown ones
                # as a parsing error, like the other parsers do
                raise ParseError(f"Unknown activity type '{act_type}'") from e
            # Handle case of QC variants. JS string literals may use either kind
            # of quotes, so both "QCM" and 'QCM' are accepted.
            if isinstance(activity, ExerciceQC) and args.strip().strip("\"'") == "QCM":
                activity.is_qcm = True
            return activity, var_name

        def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
            """Parse the correct answers for a QC activity (QCU or QCM)"""
            index = 0
            for line in code.splitlines():
                line = line.strip()
                m = self._INDEX_RE.match(line)
                if m is not None:
                    # "index" line
                    index = int(m.group(1)) - 1  # question indexes start at 1
                elif line == "exo.tabStylesR[nr] = CODE_F;":
                    # "incorrect answer" line
                    exo.set_correct(index, False)
                elif line == "exo.tabStylesR[nr] = CODE_V;":
                    # "correct answer" line
                    exo.set_correct(index, True)
    
    
    class XpathParser(JSParser):
        """A parser for the JS portion of an activity, that uses XPath to query
        an XML representation of Esprima's abstract syntax tree (AST)"""
    
        # XPath requests pre-compiled as functions
        request_function = etree.XPath(
            '//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]'
        )
        request_index_and_values = etree.XPath(
            '*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression//Identifier[starts-with(@name,"CODE_")]'
        )
        request_constructor_id = etree.XPath(
            '*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]'
        )
    
        def __init__(self) -> None:
            self.fun: Any
            """AST element corresponding to the function we're interested in.
            Initialised in `self.parse()`."""
    
        @override
        def parse(self, js: str) -> Activity:
            jstree: Any = es.parseScript(js, None)
            # Convert Esprima object tree to XML etree
            xml = self.to_xml(jstree.toDict(), "jstree")
            try:
                self.fun = self.request_function(xml)[0]
                activity = self._parse_activity_type()
                if isinstance(activity, ExerciceQC):
                    self._parse_qc_answers(activity)
                return activity
            except Exception as e:
                raise ParseError(e)
    
        def _parse_activity_type(self) -> Activity:
            constructor_id = self.request_constructor_id(self.fun)[0]
            match constructor_id.get("name"):
                case "ExerciceQC":
                    arg = constructor_id.xpath("../../arguments/Literal/@value")[0]
                    if arg == "QCM":
                        return ExerciceQC(is_qcm=True)
                    elif arg == "QCU":
                        return ExerciceQC()
                    else:
                        raise ParseError(f"ExerciceQC: invalid argument '{arg}'")
                case other:
                    return Activity.from_typename(other)
    
        def _parse_qc_answers(self, activity: ExerciceQC) -> None:
            """Parse the correct answers for a QC activity"""
            indexes_and_values = self.request_index_and_values(self.fun)
            index = 0
            for e in indexes_and_values:
                value = e.xpath("@value")
                if len(value) != 0:
                    # "index line"
                    index = int(value[0]) - 1  # question indexes start at 1
                else:
                    # "correct" or "incorrect" line
                    activity.set_correct(index, e.get("name") == "CODE_V")
    
        def to_xml(self, obj: Any, tag_name: str | None = None):
            """Recursively convert an object structure to an XML `ElementTree`.
            Structures are expected to be Python dictionaries.
            Converting a dictionary produces a tag named after the "type" attribute (if present).
            - A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
            - A list attribute becomes a tag with its contents as sub-tags.
            - A dictionary attribute becomes a tag (named like the attribute's key)
            containing a sub-tag for the dictionary itself
            """
            if isinstance(obj, dict):
                # Dictionary (or object):
                # - if it has a "type" key, the dict represents an object -> use its value as the tag name
                # - if a tag_name is specified as well, it's probably important (like an attribute name),
                # so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
                inner_tag = None
                outer_tag = None
                has_inner = "type" in obj.keys()
                if has_inner:
                    inner_tag = etree.Element(obj["type"], None, None)
                else:
                    inner_tag = etree.Element("_dict", None, None)
    
                if tag_name is not None:
                    outer_tag = etree.Element(tag_name)
                    if has_inner:
                        outer_tag.append(inner_tag)
                    else:
                        inner_tag = outer_tag
                else:
                    outer_tag = inner_tag
                # Recurse on dictionary items
                for key, val in obj.items():
                    if key != "type":  # exception for 'type', handled as attribute
                        if isinstance(val, (list, dict)):
                            # Structured attributes become child tags
                            inner_tag.append(self.to_xml(val, key))
                        else:
                            # Primitive attributes become tag attributes
                            inner_tag.set(key, str(val))
                return outer_tag
    
            elif isinstance(obj, list):
                tag_name = tag_name or "_list"
                list_tag = etree.Element(tag_name)
                for e in obj:
                    list_tag.append(self.to_xml(e))
                return list_tag
    
            else:
                tag_name = tag_name or "_literal"
                leaf_tag = etree.Element(tag_name)
                leaf_tag.text = str(obj)
                return leaf_tag
    
    
    class MatchParser(JSParser):
        """A parser for the JS portion of an activity, that uses Python match statements
        to navigate the abstract syntax tree (AST) produced by Esprima"""
    
        def __init__(self, graph: Graph, act_id: str) -> None:
            self.graph = graph
            self.act_id = act_id
            self.activity: Activity | None = None
    
        @override
        def parse(self, js: str) -> Activity:
            jstree = es.parseScript(js, None)
            # Try to match our template with one of the top-level statements
            for statement in jstree.body:
                self.match_function(statement.toDict())
    
            if self.activity is not None:
                return self.activity
            else:
                raise ParseError("No activity constructor found")
    
        def match_constructor_call(self, new_expr: dict[str, Any]):
            if self.activity is not None:  # Ignore anything after the first match
                return
            match new_expr:
                case {
                    "type": "NewExpression",
                    "callee": {
                        "type": "Identifier",
                        "name": typ,
                    },
                    "arguments": [*args],
                }:
                    match typ:
                        case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD":
                            self.activity = Activity.from_typename(typ)
                        case "ExerciceQC":
                            match args:
                                case [{"type": "Literal", "value": "QCU"}, *_]:
                                    typ += "_QCU"
                                    self.activity = ExerciceQC()
                                case [{"type": "Literal", "value": "QCM"}, *_]:
                                    typ += "_QCM"
                                    self.activity = ExerciceQC(is_qcm=True)
                                case _:
                                    raise ParseError(
                                        f"ExerciceQC: Invalid argument '{args}'"
                                    )
                        case _:
                            raise ParseError(f"Unknown activity type '{typ}'")
                case _:
                    pass
    
        def match_function(self, func: dict[str, Any]):
            """Checks if `func` matches a function declaration named `entrerDonnees`,
            and search its body if successful
            """
    
            match func:
                case {
                    "type": "FunctionDeclaration",
                    "id": {"name": "entrerDonnees"},
                    "body": {"type": "BlockStatement", "body": body},
                }:
                    # Matched a function declaration and captured its `body` attr
                    for statement in body:
                        # Find constructor calls (e.g. `new Thing()`) recursively
                        recurse_prefix(statement, self.match_constructor_call)
                case _:
                    pass
    
    
    def recurse_prefix(t: Any, f: Callable[[Any], None]):
        """Pre-order, depth-first traversal: `f` is applied to `t` first, then
        to everything nested inside `t` when it is a list (its elements) or a
        dictionary (its values).

        :param t: The object to walk through
        :param f: The function applied to each visited object
        """
        f(t)
        if isinstance(t, dict):
            children = t.values()
        elif isinstance(t, list):
            children = t
        else:
            # Neither a list nor a dictionary: nothing to descend into
            return
        for child in children:
            recurse_prefix(child, f)
    
    
    # Regex to separate non-digits and digits: group 1 is the leading run of
    # non-digit characters, group 2 the run of digits that follows it.
    # NOTE: both groups may be empty, so `match()` succeeds on any string
    # (possibly with two empty groups) and never returns None.
    regex_comment = re.compile(r"(\D*)(\d*)")
    
    
    def parse_page(graph: Graph, filepath: str, id: str):
        """Extract the activity contained in a page and save it to the graph.

        :param graph: The graph receiving the activity data
        :param filepath: Path of the HTML file of the page
        :param id: ID given to the activity
        """
        # Activity data is spread across HTML and JS code, which are parsed
        # differently. Additionally, some pieces of data are specific to the
        # activity type (Cours, ExerciceQC...) and this type is in the JS portion.
        # This requires parsing the JS code first, to get the type, then proceed
        # with HTML to get the rest of the type-specific data.

        # We still need to find the inline scripts before parsing them
        tree = html.parse(filepath)
        root = tree.getroot()
        # Collect all inline scripts (no external 'src') and join them in a
        # block of JS code
        scripts: list[HtmlElement] = root.xpath(
            '/html/head/script[@type="text/javascript" and not(@src)]'
        )
        js = "\n".join(s.text_content() for s in scripts)

        # Generic activity, used when every parser fails
        activity = Activity()
        # Try different parsers, each writing to a different file to compare their results.
        # The activity kept is the result of the last parser that succeeded.
        for parser in [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]:
            # The logged activity contains text taken from the page, so the
            # encoding is set explicitly rather than left to the platform default
            with open(f"/tmp/{str(parser)}_debuglog.txt", "a", encoding="utf-8") as f:
                print(f"\n{id:8}", end="", file=f)
                try:
                    activity = parser.parse(js)
                    print(activity, end="", file=f)
                except ParseError as e:
                    log.error(
                        f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity."
                    )

        activity.id = id
        # Parse the HTML portion
        activity.parse_html(root)
        # Save everything to the graph
        activity.save(graph)