Merge branch '22-parse-activity' into extraction

5a7cc90d · Eliott Sammier · d09e3cfc · 90b5d8a5 · 5a7cc90d · 5a7cc90d
Commit 5a7cc90d authored Jun 19, 2024 by Eliott Sammier
--- a/tetras_extraction/macao_12/macao_schema.ttl
+++ b/tetras_extraction/macao_12/macao_schema.ttl
@@ -7,19 +7,29 @@
 @base <http://www.semanticweb.org/eliott/ontologies/2024/4/macao/> .

 <http://www.semanticweb.org/eliott/ontologies/2024/4/macao> rdf:type owl:Ontology ;
-                                                                  rdfs:label "macao-schema"@fr .
+                                                             rdfs:label "macao-schema"@fr ;
+                                                             owl:versionInfo 1.1 .

 #################################################################
-#    Annotation properties
+#    Object Properties
 #################################################################

-###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao#test
-:test rdf:type owl:AnnotationProperty .
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponse
+:aReponse rdf:type owl:ObjectProperty ;
+          rdfs:range :Reponse .


-#################################################################
-#    Object Properties
-#################################################################
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponseCorrecte
+:aReponseCorrecte rdf:type owl:ObjectProperty ;
+                  rdfs:domain :Exercice ;
+                  rdfs:range :Reponse .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponseIncorrecte
+:aReponseIncorrecte rdf:type owl:ObjectProperty ;
+                    rdfs:domain :Exercice ;
+                    rdfs:range :Reponse .
+

 ###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/contenuDans
 :contenuDans rdf:type owl:ObjectProperty ;
@@ -77,6 +87,34 @@
               rdfs:range xsd:anyURI .


+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireInfo
+:commentaireInfo rdf:type owl:DatatypeProperty ;
+                 rdfs:domain :Page ;
+                 rdfs:range rdf:XMLLiteral .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireSucces
+:commentaireSucces rdf:type owl:DatatypeProperty ;
+                   rdfs:domain :Page ;
+                   rdfs:range rdf:XMLLiteral .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireSugg
+:commentaireSugg rdf:type owl:DatatypeProperty ;
+                 rdfs:domain :Page ;
+                 rdfs:range rdf:XMLLiteral .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/correct
+:correct rdf:type owl:DatatypeProperty ;
+         rdfs:range xsd:boolean .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/html
+:html rdf:type owl:DatatypeProperty ;
+      rdfs:range rdf:XMLLiteral .
+
+
 ###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/id
 :id rdf:type owl:DatatypeProperty ;
    rdfs:subPropertyOf owl:topDataProperty ;
@@ -96,10 +134,6 @@
 #    Classes
 #################################################################

-###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao#MacaoRoot
-:MacaoRoot rdf:type owl:Class .
-
-
 ###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Cours
 :Cours rdf:type owl:Class ;
       rdfs:subClassOf :Page .
@@ -110,16 +144,41 @@
          rdfs:subClassOf :Page .


-###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/FlashObject
-:FlashObject rdf:type owl:Class ;
-             rdfs:subClassOf :MacaoRessource .
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceGD
+:ExerciceGD rdf:type owl:Class ;
+            rdfs:subClassOf :Exercice .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC
+:ExerciceQC rdf:type owl:Class ;
+            rdfs:subClassOf :Exercice .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC_QCM
+:ExerciceQC_QCM rdf:type owl:Class ;
+                rdfs:subClassOf :ExerciceQC .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC_QCU
+:ExerciceQC_QCU rdf:type owl:Class ;
+                rdfs:subClassOf :ExerciceQC .
+
+
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQM
+:ExerciceQM rdf:type owl:Class ;
+            rdfs:subClassOf :Exercice .


-###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/GD
-:GD rdf:type owl:Class ;
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceTAT
+:ExerciceTAT rdf:type owl:Class ;
             rdfs:subClassOf :Exercice .


+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/FlashObject
+:FlashObject rdf:type owl:Class ;
+             rdfs:subClassOf :MacaoRessource .
+
+
 ###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Image
 :Image rdf:type owl:Class ;
       rdfs:subClassOf :MacaoRessource .
@@ -139,6 +198,10 @@
                rdfs:subClassOf :MacaoObject .


+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/MacaoRoot
+:MacaoRoot rdf:type owl:Class .
+
+
 ###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Module
 :Module rdf:type owl:Class ;
        rdfs:subClassOf :MacaoContenu .
@@ -149,14 +212,9 @@
      rdfs:subClassOf :MacaoContenu .


-###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/QC
-:QC rdf:type owl:Class ;
-    rdfs:subClassOf :Exercice .
-
-
-###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/QM
-:QM rdf:type owl:Class ;
-    rdfs:subClassOf :Exercice .
+###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Reponse
+:Reponse rdf:type owl:Class ;
+         rdfs:subClassOf :MacaoContenu .


 ###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/SimpleFlash
@@ -169,25 +227,15 @@
            rdfs:subClassOf :MacaoContenu .


-###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/TAT
-:TAT rdf:type owl:Class ;
-     rdfs:subClassOf :Exercice .
-
-
-###  http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ValuePartition
-:ValuePartition rdf:type owl:Class .
-
-
-
 #################################################################
 #    General axioms
 #################################################################

 [ rdf:type owl:AllDisjointClasses ;
-  owl:members ( :GD
-                :QC
-                :QM
-                :TAT
+  owl:members ( :ExerciceGD
+                :ExerciceQC
+                :ExerciceQM
+                :ExerciceTAT
              )
 ] .


--- a/tetras_extraction/macao_12/result/macao_content.ttl
+++ b/tetras_extraction/macao_12/result/macao_content.ttl
--- a/tetras_extraction/macao_12/script/.vscode/extensions.json
+++ b/tetras_extraction/macao_12/script/.vscode/extensions.json
+{
+    "recommendations": [
+        "detachhead.basedpyright",
+        "ms-python.black-formatter"
+    ]
+}
\ No newline at end of file
--- a/tetras_extraction/macao_12/script/common.py
+++ b/tetras_extraction/macao_12/script/common.py
+from os import environ, path
 from sys import stderr
-from rdflib import RDFS, Graph, Literal, URIRef
+from typing import Any
+
+from lxml import html
+from rdflib import Graph, Literal, RDFS, URIRef
 from rdflib import Namespace
-from os import path, environ


 def env_path_or_rel_default(env_var: str, default: str) -> str:
@@ -34,15 +37,33 @@ NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/")
 # Utility functions ############################################################


-def eprint(*args, **kwargs):
+def eprint(*args, **kwargs):  # pyright: ignore[reportMissingParameterType]
    """Just like `print()`, but to standard error instead of standard output"""
    print(*args, file=stderr, **kwargs)


-def add_title(g: Graph, subject: URIRef, title: str):
+def to_html(elem: html.HtmlElement) -> str:
+    """Shorthand function to serialise a `HtmlElement` to a HTML string"""
+    return html.tostring(elem, encoding="unicode")
+
+
+def insert_grow(l: list[Any], index: int, value: Any, fill_value: Any | None = None):
+    """Store a value at a given position in a list, growing the list if necessary
+
+    Despite the name, an element already present at `index` is overwritten,
+    not shifted to the right.
+
+    :param l: The list, modified in place
+    :param index: The position that receives the value
+    :param value: The value to store
+    :param fill_value: The value used for elements created automatically when growing, defaults to None
+    """
+    # Number of padding elements needed so that `index` becomes a valid position
+    missing = index + 1 - len(l)
+    if missing > 0:
+        l.extend([fill_value] * missing)
+    l[index] = value
+
+
+def set_title(g: Graph, subject: URIRef, title: str):
    """Add triples to define the `subject`'s title and label"""
-    g.add((subject, RDFS.label, Literal(title)))
-    g.add((subject, NS["titre"], Literal(title)))
+    g.set((subject, RDFS.label, Literal(title)))
+    g.set((subject, NS["titre"], Literal(title)))


 def add_index(g: Graph, subject: URIRef, index: int):
@@ -59,3 +80,10 @@ def add_index(g: Graph, subject: URIRef, index: int):
                Literal(f"{index:02} | {name} | ") + title,
            )
        )
+
+
+# Exceptions ###################################################################
+
+
+class ParseError(Exception):
+    pass
--- a/tetras_extraction/macao_12/script/extract.py
+++ b/tetras_extraction/macao_12/script/extract.py
-from pprint import pprint
-from typing import Optional
+import filecmp

 from lxml import etree
 from rdflib import RDFS, Graph, Literal, URIRef
@@ -66,9 +65,11 @@ def parse_manifest(graph: Graph):
    # Parse with lxml
    root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
    org = ns_find(root, ".//organization")
+    if org is None:
+        raise ParseError("Missing node <organization> in manifest")
    # For all top-level modules
    for i, e in enumerate(ns_findall(org, "item")):
-        module = NS[e.get("identifier")]
+        module = NS[e.get("identifier", default="None")]
        parse_manifest_rec(graph, e)
        graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
        add_index(graph, module, i)
@@ -76,9 +77,9 @@ def parse_manifest(graph: Graph):

 def parse_manifest_rec(
    graph: Graph,
-    elem,
-    parentResource: Optional[URIRef] = None,
-    index: Optional[int] = None,
+    elem: etree._Element,
+    parentResource: URIRef | None = None,
+    index: int | None = None,
 ):
    """Parses a module `MosMod` from the manifest recursively, adding all its
    descendants to the `graph`
@@ -87,12 +88,13 @@ def parse_manifest_rec(
    """

    # Get title and ID
-    title: str = ns_find(elem, "title").text
-    id: str = elem.get("identifier")
+    title = ns_find(elem, "title")
+    title = title.text if title is not None else "None"  # safe default value
+    id: str = elem.get("identifier", default="None")
    # Declare RDF resource and simple properties
    subject = NS[id]
    graph.add((subject, RDF.type, OWL.NamedIndividual))
-    add_title(graph, subject, title)
+    set_title(graph, subject, str(title))
    if id.startswith("MosMod"):
        # It's a Module:
        graph.add((subject, RDF.type, NS["Module"]))
@@ -119,18 +121,32 @@ def parse_manifest_rec(
        extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id)


-import extract_page
+def compare_files(f1: str, f2: str):
+    """Print whether two files have identical contents.
+
+    :param f1: Path of the first file
+    :param f2: Path of the second file
+    """
+    # `shallow=False` forces a comparison of the actual contents. The default
+    # shallow mode considers two files equal as soon as their type, size and
+    # modification time match, without reading them, which could report two
+    # different files written at the same moment as identical.
+    same = filecmp.cmp(f1, f2, shallow=False)
+    print(f"Files {f1} and {f2} {'are identical' if same else 'differ'}.")


 def main():
    g = create_graph()
+
+    # Create or reset debug log files for all activity parsers, to compare their
+    # results afterwards
+    parsers = ("Match", "Xpath", "Regex")
+    logfiles = [f"/tmp/{p}Parser_debuglog.txt" for p in parsers]
+    for logfile in logfiles:
+        with open(logfile, "w") as f:
+            print("", file=f)
+
    parse_manifest(g)
    export_graph(g)
-    # extract_page.parse_page(
-    #     g,
-    #     f"{SOURCE_DIR}/contenu/pages/pg60.html",
-    #     "pg60",
-    # )
+
+    # Compare log files 2 by 2
+    compare_files(logfiles[0], logfiles[1])
+    compare_files(logfiles[0], logfiles[2])
+    compare_files(logfiles[1], logfiles[2])


 if __name__ == "__main__":

--- a/tetras_extraction/macao_12/script/extract_mosetp.py
+++ b/tetras_extraction/macao_12/script/extract_mosetp.py
+from os import path
 import re
 import subprocess
-from os import path

-from rdflib import OWL, RDF, RDFS, Graph, Literal
+from rdflib import Graph, Literal, OWL, RDF, RDFS

-from extract_page import parse_page
 from common import *
+from extract_page import parse_page


 def generate_triples(
@@ -23,7 +23,8 @@ def generate_triples(
    # Type and simple properties
    graph.add((page, RDF.type, OWL.NamedIndividual))
    graph.add((page, RDF.type, NS["Page"]))
-    add_title(graph, page, page_title)
+    graph.add((page, NS["id"], Literal(page_id)))
+    set_title(graph, page, page_title)
    add_index(
        graph,
        page,

--- a/tetras_extraction/macao_12/script/extract_page.py
+++ b/tetras_extraction/macao_12/script/extract_page.py
+from abc import abstractmethod
+from dataclasses import dataclass
 import re
-from pprint import pprint
-from typing import Any, List
+from typing import Any, Callable

-from lxml import html
+import esprima as es
+from lxml import etree, html
 from lxml.etree import _Element
-from rdflib import Graph, Literal
+from lxml.html import HtmlElement
+from rdflib import Graph, Literal, RDF
+from typing_extensions import override

 from common import *


 class Comment:
-    id: str
-    num: int
-    text: str
-    html: Any
-    elem: _Element
+    def __init__(self):
+        self.id: str
+        self.num: int
+        self.text: str
+        self.html: Any
+        self.elem: _Element

+    @override
    def __repr__(self):
        return str(self.__dict__)


-class Page:
-    id: str
-    title: str
-    type: str  # cours ou exercice
-    comment_success: Comment
-    comments_sugg: List[Comment]
-    comments_misc: List[Comment]
+class Activity:
+    def __init__(self):
+        self.id: str = ""
+        """The ID of the page this activity is in (`pg###`)"""
+        self.title: str = ""
+        """Human-readable title of the activity"""
+        self.description: str | None = None
+        """Description of the activity's body (HTML),
+        e.g. the instructions for an exercise activity"""
+        self.comment_consigne: Comment | None = None
+        """Another form of activity description but in a comment. May or may not
+        coexist with a regular description"""
+        self.comment_success: Comment | None = None
+        """Comment displayed on success, if applicable"""
+        self.comments_sugg: list[Comment] = []
+        """Help comments displayed on failure, if applicable"""
+        self.comments_misc: list[Comment] = []
+        """Any other comments, if present"""
+        self.ref: URIRef

-    def __init__(self) -> None:
-        self.comments_sugg = []
-        self.comments_misc = []
-
-    def __repr__(self):
-        return str(self.__dict__)
-
-
-# Regex to separate non-digits and digits
-regex_comment = re.compile(r"(\D*)(\d*)")
-
-
-def parse_page(graph: Graph, filepath: str, id: str):
-    page = Page()
-    # Parse with lxml
-    tree = html.parse(filepath)
-    root = tree.getroot()
+    def save(self, graph: Graph):
+        """Save activity data to the graph. Subclasses may override this method
+        to save their specific data."""
+        self.ref = NS[self.id]
+        # => Type
+        graph.add((self.ref, RDF.type, NS[self.get_name()]))
+        # => Title
+        set_title(graph, self.ref, self.title)
+        # => Description
+        description = self.description or ""
+        if self.comment_consigne is not None:
+            description += self.comment_consigne.html
+        if description != "":
+            graph.add((self.ref, NS["description"], Literal(description)))
+        # => Comments
+        if self.comment_success is not None:
+            graph.add(
+                (self.ref, NS["commentaireSucces"], Literal(self.comment_success.html))
+            )
+        for comment in self.comments_sugg:
+            graph.add((self.ref, NS["commentaireSugg"], Literal(comment.html)))
+        for comment in self.comments_misc:
+            graph.add((self.ref, NS["commentaireInfo"], Literal(comment.html)))

-    # Parse comments
+    def parse_html(self, root: HtmlElement):
+        """From a `lxml.html` parsing tree, extract all data relevant to this class.
+        Subclasses may override this method to extract more specific data.
+        """
+        # => Title
+        self.title = root.xpath("/html/head/title")[0].text
+        # => Comments
        zi = root.get_element_by_id("zoneInvisible")
        for cmt_div in zi:
            comment = Comment()
            comment.text = cmt_div.text_content()
-        comment.html = html.tostring(cmt_div, encoding="unicode")
+            comment.html = to_html(cmt_div)
            comment.elem = cmt_div
-        comment.id = cmt_div.get("id")
+            comment.id = cmt_div.get("id") or ""
            # Split id in two parts (non-digits and digits), then match on these parts
            m = regex_comment.match(comment.id)
            if m is not None:
                match m.groups():
                    case ["divCmt", num]:
-                    print(f"Comment, num={num}")
                        comment.num = int(num)
-                    page.comments_misc.append(comment)
-                    graph.add((NS[id], NS["commentaireInfo"], Literal(comment.html)))
+                        self.comments_misc.append(comment)
                    case ["divSugg", num]:
-                    print(f"Suggestion, num={num}")
                        comment.num = int(num)
-                    page.comments_sugg.append(comment)
-                    graph.add((NS[id], NS["commentaireSugg"], Literal(comment.html)))
+                        self.comments_sugg.append(comment)
                    case ["divCmtSucces", _]:
-                    print(f"Succès")
-                    page.comment_success = comment
-                    graph.add((NS[id], NS["commentaireSucces"], Literal(comment.html)))
-                case [other, _]:
-                    print(f"other: {other}")
-    # pprint(page)
+                        self.comment_success = comment
+                    case ["divConsigne", _]:
+                        self.comment_consigne = comment
+                    case [alpha, num]:
+                        eprint(f"No match for comment {alpha}[{num}] ('{comment.id}')")
+
+    def get_name(self) -> str:
+        return type(self).__name__
+
+    @classmethod
+    def from_typename(cls, name: str):
+        """Convenience function to create an `Activity` subclass from a name"""
+        match name:
+            case "Cours":
+                return Cours()
+            case "ExerciceQC":
+                return ExerciceQC()
+            case "ExerciceQM":
+                return ExerciceQM()
+            case "ExerciceTAT":
+                return ExerciceTAT()
+            case "ExerciceGD":
+                return ExerciceGD()
+            case _:
+                raise NameError(name=name)
+
+    @override
+    def __repr__(self):
+        return self.get_name() + str(self.__dict__)
+
+
+class Cours(Activity):
+    @override
+    def parse_html(self, root: HtmlElement):
+        super().parse_html(root)
+        # => Description
+        cours = root.get_element_by_id("STY_texteCours")
+        self.description = to_html(cours).strip()
+
+
+class Exercice(Activity):
+    @override
+    def parse_html(self, root: HtmlElement):
+        super().parse_html(root)
+        # => Description
+        question = root.get_element_by_id("STY_question")
+        self.description = to_html(question).strip()
+
+
+@dataclass
+class Choice:
+    """A possible answer for a question, correct or not
+
+    The attributes are declared as dataclass fields, so that the generated
+    `__init__`, `__repr__` and `__eq__` actually take them into account
+    (without declared fields, every instance would compare equal to any other).
+    """
+
+    # Position of the choice in its exercise's list of choices
+    index: int = 0
+    # Whether selecting this choice is a correct answer
+    is_correct: bool = False
+    # HTML content of the choice
+    html: str = ""
+
+
+class ChoiceGroup:
+    def __init__(self):
+        self.label: str
+        self.items: list[Choice]
+
+
+class ExerciceQC(Exercice):
+    def __init__(self, is_qcm: bool = False) -> None:
+        super().__init__()
+        self.is_qcm = is_qcm
+        self.choices: list[Choice] = []
+
+    @override
+    def get_name(self) -> str:
+        return "ExerciceQC_QCM" if self.is_qcm else "ExerciceQC_QCU"
+
+    @override
+    def parse_html(self, root: HtmlElement):
+        super().parse_html(root)
+        # Find question choices
+        for choice in root.find_class("STY_reponseQC"):
+            # Choices have an 'id' attribute in the form 'lienRepX'
+            # where X is their index (starting at 1)
+            index = int(choice.attrib["id"].replace("lienRep", ""))
+            self.set_html(index - 1, to_html(choice).strip())
+
+    @override
+    def save(self, graph: Graph):
+        super().save(graph)
+        for choice in self.choices:
+            rdf_name = f"{self.id}q{choice.index}"  # ex: pg157q2
+            display_name = rdf_name + " | " + ("V" if choice.is_correct else "F")
+            choice_node = NS[rdf_name]
+            graph.add((choice_node, RDF.type, NS["Reponse"]))
+            graph.add((choice_node, NS["index"], Literal(choice.index)))
+            graph.add((choice_node, NS["correct"], Literal(choice.is_correct)))
+            graph.add((choice_node, NS["html"], Literal(choice.html)))
+            graph.add(
+                (
+                    choice_node,
+                    NS["__protege_display_name"],
+                    Literal(display_name),
+                )
+            )
+            graph.add((NS[self.id], NS["aReponse"], choice_node))
+            # Our fake "class hierarchy" just for easier visualization
+            graph.add((choice_node, RDFS.subClassOf, NS[self.id]))
+
+    def set_correct(self, choice_index: int, correct: bool):
+        """Set the choice at `choice_index` as correct or not, creating it if needed."""
+        self._get_or_create(choice_index).is_correct = correct
+
+    def set_html(self, choice_index: int, html: str):
+        """Set the `html` attribute for the choice at `choice_index`, creating it if needed."""
+        self._get_or_create(choice_index).html = html
+
+    def _get_or_create(self, index: int) -> Choice:
+        """Returns the choice at `index`, creating it if needed."""
+        for i in range(len(self.choices), index + 1):
+            self.choices.append(Choice(i))
+        return self.choices[index]
+
+
+class ExerciceQM(Exercice):
+    def __init__(self):
+        super().__init__()
+        self.questions: list[ChoiceGroup]
+
+
+class ExerciceTAT(Exercice):
+    def __init__(self):
+        super().__init__()
+        self.text: str  # can be HTML
+        self.gaps: list[ChoiceGroup]
+
+
+class ExerciceGD(Exercice):
+    def __init__(self):
+        super().__init__()
+        self.targets: list[str]
+        self.draggables: list[list[Choice]]
+
+
+class JSParser:
+    """Common interface of the parsers for the JavaScript portion of an activity."""
+
+    # NOTE(review): this class does not use `abc.ABCMeta` (e.g. by inheriting
+    # from `abc.ABC`), so `@abstractmethod` is not enforced when instantiating.
+    @abstractmethod
+    def parse(self, js: str) -> Activity:
+        """Parse a string of JavaScript code and return an instance of the
+        correct `Activity` subclass, partially populated with data found in the code.
+
+        :param js: The JavaScript source code to parse
+        :return: The activity described by the code
+        """
+        pass
+
+    @override
+    def __str__(self) -> str:
+        """Return the name of the concrete parser class"""
+        return type(self).__name__
+
+
+class RegexParser(JSParser):
+    def __init__(self, graph: Graph, act_id: str) -> None:
+        self.graph = graph
+        self.act_id = act_id
+
+    @override
+    def parse(self, js: str) -> Activity:
+        # Find function declaration and only keep code after it
+        func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
+        if len(func_split) < 2:
+            raise ParseError("Failed to find function 'entrerDonnees'")
+        body = func_split[1]
+
+        activity, activity_var_name = self._parse_activity_constructor(body)
+        if isinstance(activity, ExerciceQC):
+            # Parse correct answers
+            self._parse_qc_answers(body, activity)
+
+        return activity
+
+    def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
+        """
+        Find activity constructor call, return the activity type
+        and resulting variable name.
+        """
+        constructor_match = re.search(
+            r"""
+                (\w+)               # result variable name
+                \s+=\s+new\s+       # 
+                (Cours|Exercice\w+) # constructor name
+                \((.*?)\);          # optional arguments between parentheses
+                    """,
+            code,
+            re.VERBOSE,
+        )
+        if constructor_match is None:
+            raise ParseError("Failed to parse activity constructor")
+
+        var_name, act_type, args = constructor_match.groups()
+        activity = Activity.from_typename(act_type)
+        # Handle case of QC variants
+        if isinstance(activity, ExerciceQC) and args == '"QCM"':
+            activity.is_qcm = True
+        return activity, var_name
+
+    def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
+        """Parse the correct answers for a QCU activity"""
+        index = 0
+        for line in code.splitlines():
+            line = line.strip()
+            m = re.match(r"var nr = (\d+);", line)
+            if m is not None:
+                # "index" line
+                index = int(m.group(1)) - 1  # question indexes start at 1
+            elif line == "exo.tabStylesR[nr] = CODE_F;":
+                # "incorrect answer" line
+                exo.set_correct(index, False)
+            elif line == "exo.tabStylesR[nr] = CODE_V;":
+                # "correct answer" line
+                exo.set_correct(index, True)
+
+
+class XpathParser(JSParser):
+    """A parser for the JS portion of an activity, that uses XPath to query
+    an XML representation of Esprima's abstract syntax tree (AST)"""
+
+    # XPath requests pre-compiled as functions
+    request_function = etree.XPath(
+        '//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]'
+    )
+    request_index_and_values = etree.XPath(
+        '*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression//Identifier[starts-with(@name,"CODE_")]'
+    )
+    request_constructor_id = etree.XPath(
+        '*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]'
+    )
+
+    def __init__(self) -> None:
+        self.fun: Any
+        """AST element corresponding to the function we're interested in.
+        Initialised in `self.parse()`."""
+
+    @override
+    def parse(self, js: str) -> Activity:
+        jstree: Any = es.parseScript(js, None)
+        # Convert Esprima object tree to XML etree
+        xml = self.to_xml(jstree.toDict(), "jstree")
+        try:
+            self.fun = self.request_function(xml)[0]
+            activity = self._parse_activity_type()
+            if isinstance(activity, ExerciceQC):
+                self._parse_qc_answers(activity)
+            return activity
+        except Exception as e:
+            raise ParseError(e)
+
+    def _parse_activity_type(self) -> Activity:
+        constructor_id = self.request_constructor_id(self.fun)[0]
+        match constructor_id.get("name"):
+            case "ExerciceQC":
+                arg = constructor_id.xpath("../../arguments/Literal/@value")[0]
+                if arg == "QCM":
+                    return ExerciceQC(is_qcm=True)
+                elif arg == "QCU":
+                    return ExerciceQC()
+                else:
+                    raise ParseError(f"ExerciceQC: invalid argument '{arg}'")
+            case other:
+                return Activity.from_typename(other)
+
+    def _parse_qc_answers(self, activity: ExerciceQC) -> None:
+        """Parse the correct answers for a QC activity"""
+        indexes_and_values = self.request_index_and_values(self.fun)
+        index = 0
+        for e in indexes_and_values:
+            value = e.xpath("@value")
+            if len(value) != 0:
+                # "index line"
+                index = int(value[0]) - 1  # question indexes start at 1
+            else:
+                # "correct" or "incorrect" line
+                activity.set_correct(index, e.get("name") == "CODE_V")
+
+    def to_xml(self, obj: Any, tag_name: str | None = None):
+        """Recursively convert an object structure to an XML `ElementTree`.
+        Structures are expected to be Python dictionaries.
+        Converting a dictionary produces a tag named after the "type" attribute (if present).
+        - A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
+        - A list attribute becomes a tag with its contents as sub-tags.
+        - A dictionary attribute becomes a tag (named like the attribute's key)
+        containing a sub-tag for the dictionary itself
+        """
+        if isinstance(obj, dict):
+            # Dictionary (or object):
+            # - if it has a "type" key, the dict represents an object -> use its value as the tag name
+            # - if a tag_name is specified as well, it's probably important (like an attribute name),
+            # so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
+            inner_tag = None
+            outer_tag = None
+            has_inner = "type" in obj.keys()
+            if has_inner:
+                inner_tag = etree.Element(obj["type"], None, None)
+            else:
+                inner_tag = etree.Element("_dict", None, None)
+
+            if tag_name is not None:
+                outer_tag = etree.Element(tag_name)
+                if has_inner:
+                    outer_tag.append(inner_tag)
+                else:
+                    inner_tag = outer_tag
+            else:
+                outer_tag = inner_tag
+            # Recurse on dictionary items
+            for key, val in obj.items():
+                if key != "type":  # exception for 'type', handled as attribute
+                    if isinstance(val, (list, dict)):
+                        # Structured attributes become child tags
+                        inner_tag.append(self.to_xml(val, key))
+                    else:
+                        # Primitive attributes become tag attributes
+                        inner_tag.set(key, str(val))
+            return outer_tag
+
+        elif isinstance(obj, list):
+            tag_name = tag_name or "_list"
+            list_tag = etree.Element(tag_name)
+            for e in obj:
+                list_tag.append(self.to_xml(e))
+            return list_tag
+
+        else:
+            tag_name = tag_name or "_literal"
+            leaf_tag = etree.Element(tag_name)
+            leaf_tag.text = str(obj)
+            return leaf_tag
+
+
+class MatchParser(JSParser):
+    """A parser for the JS portion of an activity, that uses Python match statements
+    to navigate the abstract syntax tree (AST) produced by Esprima"""
+
+    def __init__(self, graph: Graph, act_id: str) -> None:
+        """Create a parser.
+
+        :param graph: Graph stored on the instance (not used by this class's own methods)
+        :param act_id: Activity ID stored on the instance (not used by this class's own methods)
+        """
+        self.graph = graph
+        self.act_id = act_id
+        self.activity: Activity | None = None
+        """Activity created from the first constructor call matched by `parse()`"""
+
+    @override
+    def parse(self, js: str) -> Activity:
+        """Parse JavaScript code and return the activity constructed in its
+        `entrerDonnees` function.
+
+        :param js: The JavaScript source code to parse
+        :return: An instance of the `Activity` subclass named by the constructor call
+        :raises ParseError: if no activity constructor call is found, or if the
+            call names an unknown activity type or an invalid `ExerciceQC` argument
+        """
+        # Forget the result of any previous call, so that a reused parser can
+        # never return a stale activity
+        self.activity = None
+        jstree = es.parseScript(js, None)
+        # Try to match our template with one of the top-level statements
+        for statement in jstree.body:
+            self.match_function(statement.toDict())
+
+        if self.activity is None:
+            raise ParseError("No activity constructor found")
+        return self.activity
+
+    def match_constructor_call(self, new_expr: dict[str, Any]):
+        """Checks if `new_expr` matches a constructor call (`new Thing(...)`) with
+        an identifier as callee, and create the corresponding activity if successful.
+        Nodes of any other shape are ignored.
+
+        :raises ParseError: if the constructor is not a known activity type, or
+            if `ExerciceQC` is called with an argument other than "QCU" or "QCM"
+        """
+        if self.activity is not None:  # Ignore anything after the first match
+            return
+        match new_expr:
+            case {
+                "type": "NewExpression",
+                "callee": {
+                    "type": "Identifier",
+                    "name": typ,
+                },
+                "arguments": [*args],
+            }:
+                match typ:
+                    case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD":
+                        self.activity = Activity.from_typename(typ)
+                    case "ExerciceQC":
+                        # The QC variant is given by the first argument
+                        match args:
+                            case [{"type": "Literal", "value": "QCU"}, *_]:
+                                self.activity = ExerciceQC()
+                            case [{"type": "Literal", "value": "QCM"}, *_]:
+                                self.activity = ExerciceQC(is_qcm=True)
+                            case _:
+                                raise ParseError(
+                                    f"ExerciceQC: Invalid argument '{args}'"
+                                )
+                    case _:
+                        raise ParseError(f"Unknown activity type '{typ}'")
+            case _:
+                pass
+
+    def match_function(self, func: dict[str, Any]):
+        """Checks if `func` matches a function declaration named `entrerDonnees`,
+        and searches its body if successful
+        """
+
+        match func:
+            case {
+                "type": "FunctionDeclaration",
+                "id": {"name": "entrerDonnees"},
+                "body": {"type": "BlockStatement", "body": body},
+            }:
+                # Matched a function declaration and captured its `body` attr
+                for statement in body:
+                    # Find constructor calls (e.g. `new Thing()`) recursively
+                    recurse_prefix(statement, self.match_constructor_call)
+            case _:
+                pass
+
+
+def recurse_prefix(t: Any, f: Callable[[Any], None]):
+    """Pre-order traversal of a nested structure: `f` is applied to `t`
+    itself first, then to each of its children in turn, recursively.
+
+    Only lists (their elements) and dictionaries (their values) are
+    descended into; any other object is treated as a leaf.
+
+    :param t: The object to walk
+    :param f: The function to call on every visited object
+    """
+    f(t)
+
+    # Pick what to descend into, if anything
+    if isinstance(t, dict):
+        children = t.values()
+    elif isinstance(t, list):
+        children = t
+    else:
+        # Leaf: nothing below this object
+        return
+
+    for child in children:
+        recurse_prefix(child, f)
+
+
+# Regex to separate non-digits and digits: group 1 captures a (possibly empty)
+# run of non-digit characters, group 2 the (possibly empty) run of digits that
+# directly follows it
+regex_comment = re.compile(r"(\D*)(\d*)")
+
+
+def parse_page(graph: Graph, filepath: str, id: str):
+    """Parses one activity page and saves the resulting activity to the graph.
+
+    Activity data is split between the HTML and the inline JS of the page,
+    and part of it depends on the activity type (Cours, ExerciceQC...), which
+    is only found in the JS. The JS is therefore parsed first to get a typed
+    activity, which then parses the HTML for the rest of its data.
+
+    :param graph: The graph the activity is saved to
+    :param filepath: Path of the HTML file to parse
+    :param id: Identifier assigned to the activity
+    """
+    document = html.parse(filepath)
+    root = document.getroot()
+
+    # Gather the inline scripts (those without an external 'src') and
+    # concatenate them into a single block of JS code
+    inline_scripts: list[HtmlElement] = root.xpath(
+        '/html/head/script[@type="text/javascript" and not(@src)]'
+    )
+    js = "\n".join([script.text_content() for script in inline_scripts])
+
+    # Fallback used when every parser fails: a generic activity
+    activity = Activity()
+    # Run each parser in turn, logging to one file per parser so that their
+    # results can be compared. The last successful parser wins.
+    candidates = [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]
+    for parser in candidates:
+        log_path = f"/tmp/{str(parser)}_debuglog.txt"
+        with open(log_path, "a") as log:
+            print(f"\n{id:8}", end="", file=log)
+            try:
+                activity = parser.parse(js)
+                print(activity, end="", file=log)
+            except ParseError as err:
+                eprint(f"{parser} -> {id}: Parsing error: {err}")
+                eprint("Treating this as a generic Activity.")
+
+    activity.id = id
+    # The HTML portion is parsed by the activity itself
+    activity.parse_html(root)
+    # Finally, write everything to the graph
+    activity.save(graph)
--- a/tetras_extraction/macao_12/script/pyrightconfig.json
+++ b/tetras_extraction/macao_12/script/pyrightconfig.json
+{
+    "reportMissingTypeStubs": "information",
+    "reportUnusedCallResult": "none",
+    "reportUnusedVariable": "warning",
+    "reportUnusedImport": "warning",
+    "reportMissingParameterType": "warning",
+    "reportMissingArgumentType": "warning",
+    "reportPrivateUsage": "none" /* lxml.etree often returns _Element */,
+    "reportUnknownParameterType": "none",
+    "reportUnknownArgumentType": "none",
+    "reportUnknownVariableType": "none",
+    "reportUnknownMemberType": "none",
+    "reportAny": "none"
+}
\ No newline at end of file
--- a/tetras_extraction/macao_12/script/requirements.txt
+++ b/tetras_extraction/macao_12/script/requirements.txt
+cssselect==1.2.0
+esprima==4.0.1
 isodate==0.6.1
 lxml==5.2.2
 pyparsing==3.1.2
 rdflib==7.0.0
 six==1.16.0
+types-beautifulsoup4==4.12.0.20240511
+types-html5lib==1.1.11.20240228
+types-lxml==2024.4.14
+typing_extensions==4.12.1