Merge branch '22-parse-activity' into 23-parse-exo-qc

035d29ec · Eliott Sammier · a5778912 · 3af96c18 · 035d29ec · 035d29ec
Commit 035d29ec authored 1 year ago by Eliott Sammier
--- a/tetras_extraction/macao_12/script/common.py
+++ b/tetras_extraction/macao_12/script/common.py
@@ -73,3 +73,10 @@ def add_index(g: Graph, subject: URIRef, index: int):
                Literal(f"{index:02} | {name} | ") + title,
            )
        )
+
+
+# Exceptions ###################################################################
+
+
+class ParseError(Exception):
+    """Raised when an input file lacks the structure the parsers expect."""
--- a/tetras_extraction/macao_12/script/extract.py
+++ b/tetras_extraction/macao_12/script/extract.py
+import filecmp
 from pprint import pprint
 from typing import Optional

@@ -66,9 +67,11 @@ def parse_manifest(graph: Graph):
    # Parse with lxml
    root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
    org = ns_find(root, ".//organization")
+    if org is None:
+        raise ParseError("Missing node <organization> in manifest")
    # For all top-level modules
    for i, e in enumerate(ns_findall(org, "item")):
-        module = NS[e.get("identifier")]
+        module = NS[e.get("identifier", default="None")]
        parse_manifest_rec(graph, e)
        graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
        add_index(graph, module, i)
@@ -76,7 +79,7 @@ def parse_manifest(graph: Graph):

 def parse_manifest_rec(
    graph: Graph,
-    elem,
+    elem: etree._Element,
    parentResource: Optional[URIRef] = None,
    index: Optional[int] = None,
 ):
@@ -87,12 +90,13 @@ def parse_manifest_rec(
    """

    # Get title and ID
-    title: str = ns_find(elem, "title").text
-    id: str = elem.get("identifier")
+    title = ns_find(elem, "title")
+    title = title.text if title is not None else "None"  # safe default value
+    id: str = elem.get("identifier", default="None")
    # Declare RDF resource and simple properties
    subject = NS[id]
    graph.add((subject, RDF.type, OWL.NamedIndividual))
-    add_title(graph, subject, title)
+    add_title(graph, subject, str(title))
    if id.startswith("MosMod"):
        # It's a Module:
        graph.add((subject, RDF.type, NS["Module"]))
@@ -119,18 +123,32 @@ def parse_manifest_rec(
        extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id)


-import extract_page
+def compare_files(f1, f2):
+    """Print whether the files at paths f1 and f2 have identical contents."""
+    # shallow=False: always compare contents; the default trusts matching
+    # os.stat() signatures (type, size, mtime) without reading the files.
+    same = filecmp.cmp(f1, f2, shallow=False)
+    print(f"Files {f1} and {f2} {'are identical' if same else 'differ'}.")


 def main():
    g = create_graph()
+
+    # Create or reset debug log files for all activity parsers, to compare their
+    # results afterwards
+    parsers = ("Match", "Xpath", "Regex")
+    logfiles = [f"/tmp/{p}Parser_debuglog.txt" for p in parsers]
+    for logfile in logfiles:
+        with open(logfile, "w") as f:
+            print("", file=f)
+
    parse_manifest(g)
    export_graph(g)
-    # extract_page.parse_page(
-    #     g,
-    #     f"{SOURCE_DIR}/contenu/pages/pg60.html",
-    #     "pg60",
-    # )
+
+    # Compare the log files pairwise
+    compare_files(logfiles[0], logfiles[1])
+    compare_files(logfiles[0], logfiles[2])
+    compare_files(logfiles[1], logfiles[2])


 if __name__ == "__main__":

--- a/tetras_extraction/macao_12/script/extract_page.py
+++ b/tetras_extraction/macao_12/script/extract_page.py
@@ -38,11 +38,11 @@ class Page:
        return str(self.__dict__)


-class ParseError(Exception):
-    pass
-
-
 class RegexParser:
+    def __init__(self, graph: Graph, act_id: str) -> None:
+        self.graph = graph  # RDF graph that parse() adds triples to
+        self.act_id = act_id  # activity identifier; parse() uses NS[act_id] as subject
+
    def parse(self, js, output=sys.stdout):
        # Find function declaration and only keep code after it
        func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
@@ -51,6 +51,8 @@ class RegexParser:
        body = func_split[1]

        activity_type, activity_var_name = self._parse_activity_constructor(body)
+        # Save to graph
+        self.graph.add((NS[self.act_id], RDF.type, NS[activity_type]))
        print(activity_type, end="", file=output)
        if activity_type.startswith("ExerciceQC"):
            print(" ", self._parse_qc_answers(body), end="", file=output)
@@ -314,14 +316,14 @@ def parse_page(graph: Graph, filepath: str, id: str):
    # Collect all inline scripts (no external 'src') and join them in a
    # block of JS code
    # scripts = root.cssselect('script[type="text/javascript"]:not([src])')
-    scripts: List[_Element] = root.xpath(
+    scripts: List[html.HtmlElement] = root.xpath(
        '/html/head/script[@type="text/javascript" and not(@src)]'
    )
    js = "\n".join((s.text_content() for s in scripts))

    # Try different parsers, each writing to a different file to compare their results
-    for parser in [XpathParser(), MatchParser(graph, id), RegexParser()]:
-        with open(f"/tmp/{str(parser)}.txt", "a") as f:
+    for parser in [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]:
+        with open(f"/tmp/{str(parser)}_debuglog.txt", "a") as f:
            print(f"\n{id:8}", end="", file=f)
            try:
                parser.parse(js, output=f)