From 27ce9e5c7f205044f6ce80b5f40a532dc24920d3 Mon Sep 17 00:00:00 2001
From: eliott <eliott.sammier@tetras-libre.fr>
Date: Wed, 5 Jun 2024 17:51:12 +0200
Subject: [PATCH] Improve types & error handling

---
 tetras_extraction/macao_12/script/common.py       |  7 +++++++
 tetras_extraction/macao_12/script/extract.py      | 13 ++++++++-----
 tetras_extraction/macao_12/script/extract_page.py |  6 +-----
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/tetras_extraction/macao_12/script/common.py b/tetras_extraction/macao_12/script/common.py
index 2a20499d..6cba2d03 100644
--- a/tetras_extraction/macao_12/script/common.py
+++ b/tetras_extraction/macao_12/script/common.py
@@ -73,3 +73,10 @@ def add_index(g: Graph, subject: URIRef, index: int):
                 Literal(f"{index:02} | {name} | ") + title,
             )
         )
+
+
+# Exceptions ###################################################################
+
+
+class ParseError(Exception):  # signals unparseable input, e.g. a manifest with no <organization> node
+    pass
diff --git a/tetras_extraction/macao_12/script/extract.py b/tetras_extraction/macao_12/script/extract.py
index dacf474e..abcca881 100644
--- a/tetras_extraction/macao_12/script/extract.py
+++ b/tetras_extraction/macao_12/script/extract.py
@@ -66,9 +66,11 @@ def parse_manifest(graph: Graph):
     # Parse with lxml
     root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
     org = ns_find(root, ".//organization")
+    if org is None:
+        raise ParseError("Missing node <organization> in manifest")
     # For all top-level modules
     for i, e in enumerate(ns_findall(org, "item")):
-        module = NS[e.get("identifier")]
+        module = NS[e.get("identifier", default="None")]
         parse_manifest_rec(graph, e)
         graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
         add_index(graph, module, i)
@@ -76,7 +78,7 @@ def parse_manifest(graph: Graph):
 
 def parse_manifest_rec(
     graph: Graph,
-    elem,
+    elem: etree._Element,
     parentResource: Optional[URIRef] = None,
     index: Optional[int] = None,
 ):
@@ -87,12 +89,13 @@ def parse_manifest_rec(
     """
 
     # Get title and ID
-    title: str = ns_find(elem, "title").text
-    id: str = elem.get("identifier")
+    title = ns_find(elem, "title")
+    title = title.text if title is not None else "None"  # safe default value
+    id: str = elem.get("identifier", default="None")
     # Declare RDF resource and simple properties
     subject = NS[id]
     graph.add((subject, RDF.type, OWL.NamedIndividual))
-    add_title(graph, subject, title)
+    add_title(graph, subject, str(title))
     if id.startswith("MosMod"):
         # It's a Module:
         graph.add((subject, RDF.type, NS["Module"]))
diff --git a/tetras_extraction/macao_12/script/extract_page.py b/tetras_extraction/macao_12/script/extract_page.py
index e38e2df6..86f9a782 100644
--- a/tetras_extraction/macao_12/script/extract_page.py
+++ b/tetras_extraction/macao_12/script/extract_page.py
@@ -38,10 +38,6 @@ class Page:
         return str(self.__dict__)
 
 
-class ParseError(Exception):
-    pass
-
-
 class RegexParser:
     def parse(self, js, output=sys.stdout):
         # Find function declaration and only keep code after it
@@ -293,7 +289,7 @@ def parse_page(graph: Graph, filepath: str, id: str):
     # Collect all inline scripts (no external 'src') and join them in a
     # block of JS code
     # scripts = root.cssselect('script[type="text/javascript"]:not([src])')
-    scripts: List[_Element] = root.xpath(
+    scripts: List[html.HtmlElement] = root.xpath(
         '/html/head/script[@type="text/javascript" and not(@src)]'
     )
     js = "\n".join((s.text_content() for s in scripts))
-- 
GitLab