Improve types & error handling

27ce9e5c · Eliott Sammier · 25f3425a · 27ce9e5c · 27ce9e5c · 27ce9e5c
Commit 27ce9e5c authored 1 year ago by Eliott Sammier
--- a/tetras_extraction/macao_12/script/common.py
+++ b/tetras_extraction/macao_12/script/common.py
@@ -73,3 +73,10 @@ def add_index(g: Graph, subject: URIRef, index: int):
                Literal(f"{index:02} | {name} | ") + title,
            )
        )
+
+
+# Exceptions ###################################################################
+
+
+class ParseError(Exception):  # raised when parsed input lacks expected structure, e.g. no <organization> node in the manifest
+    pass
--- a/tetras_extraction/macao_12/script/extract.py
+++ b/tetras_extraction/macao_12/script/extract.py
@@ -66,9 +66,11 @@ def parse_manifest(graph: Graph):
    # Parse with lxml
    root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
    org = ns_find(root, ".//organization")
+    if org is None:
+        raise ParseError("Missing node <organization> in manifest")
    # For all top-level modules
    for i, e in enumerate(ns_findall(org, "item")):
-        module = NS[e.get("identifier")]
+        module = NS[e.get("identifier", default="None")]
        parse_manifest_rec(graph, e)
        graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
        add_index(graph, module, i)
@@ -76,7 +78,7 @@ def parse_manifest(graph: Graph):

 def parse_manifest_rec(
    graph: Graph,
-    elem,
+    elem: etree._Element,
    parentResource: Optional[URIRef] = None,
    index: Optional[int] = None,
 ):
@@ -87,12 +89,13 @@ def parse_manifest_rec(
    """

    # Get title and ID
-    title: str = ns_find(elem, "title").text
-    id: str = elem.get("identifier")
+    title = ns_find(elem, "title")
+    title = title.text if title is not None else "None"  # safe default value
+    id: str = elem.get("identifier", default="None")
    # Declare RDF resource and simple properties
    subject = NS[id]
    graph.add((subject, RDF.type, OWL.NamedIndividual))
-    add_title(graph, subject, title)
+    add_title(graph, subject, str(title))
    if id.startswith("MosMod"):
        # It's a Module:
        graph.add((subject, RDF.type, NS["Module"]))

--- a/tetras_extraction/macao_12/script/extract_page.py
+++ b/tetras_extraction/macao_12/script/extract_page.py
@@ -38,10 +38,6 @@ class Page:
        return str(self.__dict__)


-class ParseError(Exception):
-    pass
-
-
 class RegexParser:
    def parse(self, js, output=sys.stdout):
        # Find function declaration and only keep code after it
@@ -293,7 +289,7 @@ def parse_page(graph: Graph, filepath: str, id: str):
    # Collect all inline scripts (no external 'src') and join them in a
    # block of JS code
    # scripts = root.cssselect('script[type="text/javascript"]:not([src])')
-    scripts: List[_Element] = root.xpath(
+    scripts: List[html.HtmlElement] = root.xpath(
        '/html/head/script[@type="text/javascript" and not(@src)]'
    )
    js = "\n".join((s.text_content() for s in scripts))