diff --git a/tetras_extraction/macao_12/script/common.py b/tetras_extraction/macao_12/script/common.py index 2a20499da028b1f67efcaf4030747ac23a14d263..6cba2d03437f342abfff6bc8da3044ee0481f024 100644 --- a/tetras_extraction/macao_12/script/common.py +++ b/tetras_extraction/macao_12/script/common.py @@ -73,3 +73,10 @@ def add_index(g: Graph, subject: URIRef, index: int): Literal(f"{index:02} | {name} | ") + title, ) ) + + +# Exceptions ################################################################### + + +class ParseError(Exception): + pass diff --git a/tetras_extraction/macao_12/script/extract.py b/tetras_extraction/macao_12/script/extract.py index dacf474e2facb4871c2028b7fccaf0b9f0bf0fc8..0461cfe25b30323801e2f126aef65a6f29975965 100644 --- a/tetras_extraction/macao_12/script/extract.py +++ b/tetras_extraction/macao_12/script/extract.py @@ -1,3 +1,4 @@ +import filecmp from pprint import pprint from typing import Optional @@ -66,9 +67,11 @@ def parse_manifest(graph: Graph): # Parse with lxml root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot() org = ns_find(root, ".//organization") + if org is None: + raise ParseError("Missing node <organization> in manifest") # For all top-level modules for i, e in enumerate(ns_findall(org, "item")): - module = NS[e.get("identifier")] + module = NS[e.get("identifier", default="None")] parse_manifest_rec(graph, e) graph.add((module, RDFS.subClassOf, NS["MacaoRoot"])) add_index(graph, module, i) @@ -76,7 +79,7 @@ def parse_manifest(graph: Graph): def parse_manifest_rec( graph: Graph, - elem, + elem: etree._Element, parentResource: Optional[URIRef] = None, index: Optional[int] = None, ): @@ -87,12 +90,13 @@ def parse_manifest_rec( """ # Get title and ID - title: str = ns_find(elem, "title").text - id: str = elem.get("identifier") + title = ns_find(elem, "title") + title = title.text if title is not None else "None" # safe default value + id: str = elem.get("identifier", default="None") # Declare RDF resource and simple properties subject = NS[id] graph.add((subject, RDF.type, OWL.NamedIndividual)) - add_title(graph, subject, title) + add_title(graph, subject, str(title)) if id.startswith("MosMod"): # It's a Module: graph.add((subject, RDF.type, NS["Module"])) @@ -119,18 +123,32 @@ def parse_manifest_rec( extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id) -import extract_page +def compare_files(f1, f2): + print( + "Files {} and {} {}.".format( + f1, f2, "are identical" if filecmp.cmp(f1, f2) else "differ" + ) + ) def main(): g = create_graph() + + # Create or reset debug log files for all activity parsers, to compare their + # results afterwards + parsers = ("Match", "Xpath", "Regex") + logfiles = [f"/tmp/{p}Parser_debuglog.txt" for p in parsers] + for logfile in logfiles: + with open(logfile, "w") as f: + print("", file=f) + parse_manifest(g) export_graph(g) - # extract_page.parse_page( - # g, - # f"{SOURCE_DIR}/contenu/pages/pg60.html", - # "pg60", - # ) + + # Compare log files 2 by 2 + compare_files(logfiles[0], logfiles[1]) + compare_files(logfiles[0], logfiles[2]) + compare_files(logfiles[1], logfiles[2]) if __name__ == "__main__": diff --git a/tetras_extraction/macao_12/script/extract_page.py b/tetras_extraction/macao_12/script/extract_page.py index e68d07cdd3d46c6b8f3794203893b9bad4ca3889..733a6e396bc230b01680e964cd2d72206a171f43 100644 --- a/tetras_extraction/macao_12/script/extract_page.py +++ b/tetras_extraction/macao_12/script/extract_page.py @@ -38,11 +38,11 @@ class Page: return str(self.__dict__) -class ParseError(Exception): - pass - - class RegexParser: + def __init__(self, graph: Graph, act_id: str) -> None: + self.graph = graph + self.act_id = act_id + def parse(self, js, output=sys.stdout): # Find function declaration and only keep code after it func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js) @@ -51,6 +51,8 @@ class RegexParser: body = func_split[1] activity_type, activity_var_name = self._parse_activity_constructor(body) + # Save to graph + self.graph.add((NS[self.act_id], RDF.type, NS[activity_type])) print(activity_type, end="", file=output) if activity_type.startswith("ExerciceQC"): print(" ", self._parse_qc_answers(body), end="", file=output) @@ -314,14 +316,14 @@ def parse_page(graph: Graph, filepath: str, id: str): # Collect all inline scripts (no external 'src') and join them in a # block of JS code # scripts = root.cssselect('script[type="text/javascript"]:not([src])') - scripts: List[_Element] = root.xpath( + scripts: List[html.HtmlElement] = root.xpath( '/html/head/script[@type="text/javascript" and not(@src)]' ) js = "\n".join((s.text_content() for s in scripts)) # Try different parsers, each writing to a different file to compare their results - for parser in [XpathParser(), MatchParser(graph, id), RegexParser()]: - with open(f"/tmp/{str(parser)}.txt", "a") as f: + for parser in [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]: + with open(f"/tmp/{str(parser)}_debuglog.txt", "a") as f: print(f"\n{id:8}", end="", file=f) try: parser.parse(js, output=f)