diff --git a/tetras_extraction/macao_12/script/common.py b/tetras_extraction/macao_12/script/common.py index 2a20499da028b1f67efcaf4030747ac23a14d263..6cba2d03437f342abfff6bc8da3044ee0481f024 100644 --- a/tetras_extraction/macao_12/script/common.py +++ b/tetras_extraction/macao_12/script/common.py @@ -73,3 +73,10 @@ def add_index(g: Graph, subject: URIRef, index: int): Literal(f"{index:02} | {name} | ") + title, ) ) + + +# Exceptions ################################################################### + + +class ParseError(Exception): + pass diff --git a/tetras_extraction/macao_12/script/extract.py b/tetras_extraction/macao_12/script/extract.py index dacf474e2facb4871c2028b7fccaf0b9f0bf0fc8..abcca88141e7da11f894780cd7b2851d25a3446c 100644 --- a/tetras_extraction/macao_12/script/extract.py +++ b/tetras_extraction/macao_12/script/extract.py @@ -66,9 +66,11 @@ def parse_manifest(graph: Graph): # Parse with lxml root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot() org = ns_find(root, ".//organization") + if org is None: + raise ParseError("Missing node <organization> in manifest") # For all top-level modules for i, e in enumerate(ns_findall(org, "item")): - module = NS[e.get("identifier")] + module = NS[e.get("identifier", default="None")] parse_manifest_rec(graph, e) graph.add((module, RDFS.subClassOf, NS["MacaoRoot"])) add_index(graph, module, i) @@ -76,7 +78,7 @@ def parse_manifest(graph: Graph): def parse_manifest_rec( graph: Graph, - elem, + elem: etree._Element, parentResource: Optional[URIRef] = None, index: Optional[int] = None, ): @@ -87,12 +89,13 @@ def parse_manifest_rec( """ # Get title and ID - title: str = ns_find(elem, "title").text - id: str = elem.get("identifier") + title = ns_find(elem, "title") + title = title.text if title is not None else "None" # safe default value + id: str = elem.get("identifier", default="None") # Declare RDF resource and simple properties subject = NS[id] graph.add((subject, RDF.type, OWL.NamedIndividual)) - add_title(graph, subject, title) + add_title(graph, subject, str(title)) if id.startswith("MosMod"): # It's a Module: graph.add((subject, RDF.type, NS["Module"])) diff --git a/tetras_extraction/macao_12/script/extract_page.py b/tetras_extraction/macao_12/script/extract_page.py index e38e2df685eca85971d4db01030f26232a69197f..86f9a782f1d5de6351b4243d7b51c3209368c643 100644 --- a/tetras_extraction/macao_12/script/extract_page.py +++ b/tetras_extraction/macao_12/script/extract_page.py @@ -38,10 +38,6 @@ class Page: return str(self.__dict__) -class ParseError(Exception): - pass - - class RegexParser: def parse(self, js, output=sys.stdout): # Find function declaration and only keep code after it @@ -293,7 +289,7 @@ def parse_page(graph: Graph, filepath: str, id: str): # Collect all inline scripts (no external 'src') and join them in a # block of JS code # scripts = root.cssselect('script[type="text/javascript"]:not([src])') - scripts: List[_Element] = root.xpath( + scripts: List[html.HtmlElement] = root.xpath( '/html/head/script[@type="text/javascript" and not(@src)]' ) js = "\n".join((s.text_content() for s in scripts))