From 27ce9e5c7f205044f6ce80b5f40a532dc24920d3 Mon Sep 17 00:00:00 2001 From: eliott <eliott.sammier@tetras-libre.fr> Date: Wed, 5 Jun 2024 17:51:12 +0200 Subject: [PATCH] Improve types & error handling --- tetras_extraction/macao_12/script/common.py | 7 +++++++ tetras_extraction/macao_12/script/extract.py | 13 ++++++++----- tetras_extraction/macao_12/script/extract_page.py | 6 +----- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tetras_extraction/macao_12/script/common.py b/tetras_extraction/macao_12/script/common.py index 2a20499d..6cba2d03 100644 --- a/tetras_extraction/macao_12/script/common.py +++ b/tetras_extraction/macao_12/script/common.py @@ -73,3 +73,10 @@ def add_index(g: Graph, subject: URIRef, index: int): Literal(f"{index:02} | {name} | ") + title, ) ) + + +# Exceptions ################################################################### + + +class ParseError(Exception): + pass diff --git a/tetras_extraction/macao_12/script/extract.py b/tetras_extraction/macao_12/script/extract.py index dacf474e..abcca881 100644 --- a/tetras_extraction/macao_12/script/extract.py +++ b/tetras_extraction/macao_12/script/extract.py @@ -66,9 +66,11 @@ def parse_manifest(graph: Graph): # Parse with lxml root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot() org = ns_find(root, ".//organization") + if org is None: + raise ParseError("Missing node <organization> in manifest") # For all top-level modules for i, e in enumerate(ns_findall(org, "item")): - module = NS[e.get("identifier")] + module = NS[e.get("identifier", default="None")] parse_manifest_rec(graph, e) graph.add((module, RDFS.subClassOf, NS["MacaoRoot"])) add_index(graph, module, i) @@ -76,7 +78,7 @@ def parse_manifest(graph: Graph): def parse_manifest_rec( graph: Graph, - elem, + elem: etree._Element, parentResource: Optional[URIRef] = None, index: Optional[int] = None, ): @@ -87,12 +89,13 @@ def parse_manifest_rec( """ # Get title and ID - title: str = ns_find(elem, "title").text - id: str = elem.get("identifier") + title = ns_find(elem, "title") + title = title.text if title is not None else "None" # safe default value + id: str = elem.get("identifier", default="None") # Declare RDF resource and simple properties subject = NS[id] graph.add((subject, RDF.type, OWL.NamedIndividual)) - add_title(graph, subject, title) + add_title(graph, subject, str(title)) if id.startswith("MosMod"): # It's a Module: graph.add((subject, RDF.type, NS["Module"])) diff --git a/tetras_extraction/macao_12/script/extract_page.py b/tetras_extraction/macao_12/script/extract_page.py index e38e2df6..86f9a782 100644 --- a/tetras_extraction/macao_12/script/extract_page.py +++ b/tetras_extraction/macao_12/script/extract_page.py @@ -38,10 +38,6 @@ class Page: return str(self.__dict__) -class ParseError(Exception): - pass - - class RegexParser: def parse(self, js, output=sys.stdout): # Find function declaration and only keep code after it @@ -293,7 +289,7 @@ def parse_page(graph: Graph, filepath: str, id: str): # Collect all inline scripts (no external 'src') and join them in a # block of JS code # scripts = root.cssselect('script[type="text/javascript"]:not([src])') - scripts: List[_Element] = root.xpath( + scripts: List[html.HtmlElement] = root.xpath( '/html/head/script[@type="text/javascript" and not(@src)]' ) js = "\n".join((s.text_content() for s in scripts)) -- GitLab