From 852c7d082b22c67d4ed251ea800260388c47856c Mon Sep 17 00:00:00 2001 From: eliott <eliott.sammier@tetras-libre.fr> Date: Mon, 29 Jul 2024 17:40:43 +0200 Subject: [PATCH] Fix SOURCE_DIR issues depending on the Macao version --- tetras_extraction/script/src/common.py | 9 +++++++++ tetras_extraction/script/src/extract.py | 15 +++++---------- tetras_extraction/script/src/extract_mosetp.py | 6 ++++-- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/tetras_extraction/script/src/common.py b/tetras_extraction/script/src/common.py index cf0fff97..845650df 100644 --- a/tetras_extraction/script/src/common.py +++ b/tetras_extraction/script/src/common.py @@ -38,6 +38,15 @@ RDF_SCHEMA_FILE = env_path_or_rel_default("RDF_SCHEMA_FILE", "../../macao_schema RDF_FULL_FILE = env_path_or_rel_default("RDF_FULL_FILE", RESULT_DIR + "/macao_full.ttl") """Path to the full RDF file, including schema, extracted content and inferences""" + +class Context: + """Some global variables like paths are not constant, and may change at + runtime. This singleton holds the non-constant copies of such variables.""" + + version = MACAO_VERSION + source_dir = SOURCE_DIR + + NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/") """The rdflib base Namespace for our ontology""" diff --git a/tetras_extraction/script/src/extract.py b/tetras_extraction/script/src/extract.py index f35d98c7..e6959d8b 100644 --- a/tetras_extraction/script/src/extract.py +++ b/tetras_extraction/script/src/extract.py @@ -66,7 +66,7 @@ def parse_manifest(graph: Graph): modules hierarchy. """ # Parse with lxml - root = etree.parse(source_dir + "/imsmanifest.xml", None).getroot() + root = etree.parse(Context.source_dir + "/imsmanifest.xml", None).getroot() org = ns_find(root, ".//organization") if org is None: raise ParseError("Missing node <organization> in manifest") @@ -122,9 +122,7 @@ def parse_manifest_rec( if index is not None: add_index(graph, subject, index) # Parse list of pages - extract_mosetp.parse_mosetp( - graph, f"{source_dir}/sco/{id}.html", id, f"{source_dir}/contenu/pages" - ) + extract_mosetp.parse_mosetp(graph, f"{Context.source_dir}/sco/{id}.html", id) def compare_files(f1: str, f2: str): @@ -135,9 +133,6 @@ def compare_files(f1: str, f2: str): ) -source_dir = SOURCE_DIR - - def main(): g = create_graph() @@ -150,9 +145,9 @@ def main(): print("", file=f) if MACAO_VERSION == "full": - for version in [12, 3]: - global source_dir - source_dir = f"{SOURCE_DIR}/macao_{version}" + # Run the parser once for each version, but with the same RDF graph + for Context.version in ["12", "3"]: + Context.source_dir = f"{SOURCE_DIR}/macao_{Context.version}" parse_manifest(g) else: parse_manifest(g) diff --git a/tetras_extraction/script/src/extract_mosetp.py b/tetras_extraction/script/src/extract_mosetp.py index d64bdfa4..7ac4c9da 100644 --- a/tetras_extraction/script/src/extract_mosetp.py +++ b/tetras_extraction/script/src/extract_mosetp.py @@ -37,7 +37,7 @@ def generate_triples( graph.add((mosetp, NS["contientActivite"], page)) -def parse_mosetp(graph: Graph, filepath: str, id: str, pages_dir: str): +def parse_mosetp(graph: Graph, filepath: str, id: str): """Parse a subsection (`MosEtp###.html`) into the `graph`, creating the child pages. @@ -64,7 +64,9 @@ def parse_mosetp(graph: Graph, filepath: str, id: str, pages_dir: str): page_id = m.group(2) generate_triples(graph, id, page_id, m.group(1), index) # Call the page parser - parse_page(graph, f"{pages_dir}/{page_id}.html", page_id) + parse_page( + graph, f"{Context.source_dir}/contenu/pages/{page_id}.html", page_id + ) else: log.warning(f"skipping page: regex found no match on line '{line}'") -- GitLab