diff --git a/tetras_extraction/script/src/extract.py b/tetras_extraction/script/src/extract.py index 0df3139598a60e3301d4010d0e8bff171e2b0a1e..564ccb5128496229e8f0d80a63ddd56c87dbb280 100644 --- a/tetras_extraction/script/src/extract.py +++ b/tetras_extraction/script/src/extract.py @@ -148,14 +148,6 @@ def compare_files(f1: str, f2: str): def main(): g = create_graph() - # Create or reset debug log files for all activity parsers, to compare their - # results afterwards - parsers = ("Match", "Xpath", "Regex") - logfiles = [f"/tmp/{p}Parser_debuglog.txt" for p in parsers] - for logfile in logfiles: - with open(logfile, "w") as f: - print("", file=f) - if MACAO_VERSION == "full": # Run the parser once for each version, but with the same RDF graph for Context.version in ["macao_12", "macao_3"]: @@ -163,12 +155,8 @@ def main(): parse_manifest(g) else: parse_manifest(g) - export_graph(g) - # Compare log files 2 by 2 - compare_files(logfiles[0], logfiles[1]) - compare_files(logfiles[0], logfiles[2]) - compare_files(logfiles[1], logfiles[2]) + export_graph(g) if __name__ == "__main__": diff --git a/tetras_extraction/script/src/extract_page.py b/tetras_extraction/script/src/extract_page.py index f77871a1d227d37bfcc3ce2f3746f61bad4a8a3f..1a3e90881d37d1f0a1ee93f66ace33a00939d10e 100644 --- a/tetras_extraction/script/src/extract_page.py +++ b/tetras_extraction/script/src/extract_page.py @@ -1,10 +1,9 @@ import re from abc import abstractmethod from dataclasses import dataclass -from typing import Any, Callable +from typing import Any -import esprima as es -from lxml import etree, html +from lxml import html from lxml.etree import _Element from lxml.html import HtmlElement from rdflib import RDF, Graph, Literal @@ -428,210 +427,6 @@ class RegexParser(JSParser): raise exception from e -class XpathParser(JSParser): - """A parser for the JS portion of an activity, that uses XPath to query - an XML representation of Esprima's abstract syntax tree (AST)""" - - # XPath requests pre-compiled as functions - request_function = etree.XPath( - '//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]' - ) - request_index_and_values = etree.XPath( - '*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression//Identifier[starts-with(@name,"CODE_")]' - ) - request_constructor_id = etree.XPath( - '*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]' - ) - - def __init__(self) -> None: - self.fun: Any - """AST element corresponding to the function we're interested in. - Initialised in `self.parse()`.""" - - @override - def parse(self, js: str) -> Activity: - jstree: Any = es.parseScript(js, None) - # Convert Esprima object tree to XML etree - xml = self.to_xml(jstree.toDict(), "jstree") - try: - self.fun = self.request_function(xml)[0] - activity = self._parse_activity_type() - if isinstance(activity, ExerciceQC): - self._parse_qc_answers(activity) - return activity - except Exception as e: - raise ParseError(e) - - def _parse_activity_type(self) -> Activity: - constructor_id = self.request_constructor_id(self.fun)[0] - match constructor_id.get("name"): - case "ExerciceQC": - arg = constructor_id.xpath("../../arguments/Literal/@value")[0] - if arg == "QCM": - return ExerciceQC(is_qcm=True) - elif arg == "QCU": - return ExerciceQC() - else: - raise ParseError(f"ExerciceQC: invalid argument '{arg}'") - case other: - return Activity.from_typename(other) - - def _parse_qc_answers(self, activity: ExerciceQC) -> None: - """Parse the correct answers for a QC activity""" - indexes_and_values = self.request_index_and_values(self.fun) - choice_id = "0" - for e in indexes_and_values: - value = e.xpath("@value") - if len(value) != 0: - # "index line" - choice_id = value[0] - else: - # "correct" or "incorrect" line - activity.set_correct(choice_id, e.get("name") == "CODE_V") - - def to_xml(self, obj: Any, tag_name: str | None = None): - """Recursively convert an object structure to an XML `ElementTree`. - Structures are expected to be Python dictionaries. - Converting a dictionary produces a tag named after the "type" attribute (if present). - - A primitive attribute (i.e. not list nor dict) becomes a tag attribute. - - A list attribute becomes a tag with its contents as sub-tags. - - A dictionary attribute becomes a tag (named like the attribute's key) - containing a sub-tag for the dictionary itself - """ - if isinstance(obj, dict): - # Dictionary (or object): - # - if it has a "type" key, the dict represents an object -> use its value as the tag name - # - if a tag_name is specified as well, it's probably important (like an attribute name), - # so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag) - inner_tag = None - outer_tag = None - has_inner = "type" in obj.keys() - if has_inner: - inner_tag = etree.Element(obj["type"], None, None) - else: - inner_tag = etree.Element("_dict", None, None) - - if tag_name is not None: - outer_tag = etree.Element(tag_name) - if has_inner: - outer_tag.append(inner_tag) - else: - inner_tag = outer_tag - else: - outer_tag = inner_tag - # Recurse on dictionary items - for key, val in obj.items(): - if key != "type": # exception for 'type', handled as attribute - if isinstance(val, (list, dict)): - # Structured attributes become child tags - inner_tag.append(self.to_xml(val, key)) - else: - # Primitive attributes become tag attributes - inner_tag.set(key, str(val)) - return outer_tag - - elif isinstance(obj, list): - tag_name = tag_name or "_list" - list_tag = etree.Element(tag_name) - for e in obj: - list_tag.append(self.to_xml(e)) - return list_tag - - else: - tag_name = tag_name or "_literal" - leaf_tag = etree.Element(tag_name) - leaf_tag.text = str(obj) - return leaf_tag - - -class MatchParser(JSParser): - """A parser for the JS portion of an activity, that uses Python match statements - to navigate the abstract syntax tree (AST) produced by Esprima""" - - def __init__(self, graph: Graph, act_id: str) -> None: - self.graph = graph - self.act_id = act_id - self.activity: Activity | None = None - - @override - def parse(self, js: str) -> Activity: - jstree = es.parseScript(js, None) - # Try to match our template with one of the top-level statements - for statement in jstree.body: - self.match_function(statement.toDict()) - - if self.activity is not None: - return self.activity - else: - raise ParseError("No activity constructor found") - - def match_constructor_call(self, new_expr: dict[str, Any]): - if self.activity is not None: # Ignore anything after the first match - return - match new_expr: - case { - "type": "NewExpression", - "callee": { - "type": "Identifier", - "name": typ, - }, - "arguments": [*args], - }: - match typ: - case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD": - self.activity = Activity.from_typename(typ) - case "ExerciceQC": - match args: - case [{"type": "Literal", "value": "QCU"}, *_]: - typ += "_QCU" - self.activity = ExerciceQC() - case [{"type": "Literal", "value": "QCM"}, *_]: - typ += "_QCM" - self.activity = ExerciceQC(is_qcm=True) - case _: - raise ParseError( - f"ExerciceQC: Invalid argument '{args}'" - ) - case _: - raise ParseError(f"Unknown activity type '{typ}'") - case _: - pass - - def match_function(self, func: dict[str, Any]): - """Checks if `func` matches a function declaration named `entrerDonnees`, - and search its body if successful - """ - - match func: - case { - "type": "FunctionDeclaration", - "id": {"name": "entrerDonnees"}, - "body": {"type": "BlockStatement", "body": body}, - }: - # Matched a function declaration and captured its `body` attr - for statement in body: - # Find constructor calls (e.g. `new Thing()`) recursively - recurse_prefix(statement, self.match_constructor_call) - case _: - pass - - -def recurse_prefix(t: Any, f: Callable[[Any], None]): - """Depth-first prefixed recursion: calls a function on an object, then on - all its children (if it's a list or dictionary) recursively - - :param t: The object - :param f: The function to call - """ - f(t) - if isinstance(t, list): - for e in t: - recurse_prefix(e, f) - elif isinstance(t, dict): - for e in t.values(): - recurse_prefix(e, f) - - def decode_answer_id(id: str): """ Decode an obfuscated answer ID, just like the `decodeX()` function @@ -681,17 +476,13 @@ def parse_page(graph: Graph, filepath: str, id: str): js = "\n".join((s.text_content() for s in scripts)) activity = Activity() - # Try different parsers, each writing to a different file to compare their results - for parser in [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]: - with open(f"/tmp/{str(parser)}_debuglog.txt", "a") as f: - print(f"\n{id:8}", end="", file=f) - try: - activity: Activity = parser.parse(js) - print(activity, end="", file=f) - except ParseError as e: - log.error( - f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity." - ) + parser = RegexParser(graph, id) + try: + activity: Activity = parser.parse(js) + except ParseError as e: + log.error( + f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity." + ) activity.id = id # Parse the HTML portion