import re
import subprocess
from rdflib import OWL, RDF, RDFS, Graph, Literal
from common import *
from extract_page import parse_page

# Initialise logger
log = get_logger("extract_mosetp")
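
# Example driver call (hypothetical; the code that actually invokes
# parse_mosetp lives outside this module):
#   g = Graph()
#   parse_mosetp(g, f"{Context.source_dir}/contenu/MosEtp001.html", "MosEtp001")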


def generate_triples(
    graph: Graph, mosetp_id: str, page_id: str, page_title: str, page_index: int
):
    """Generate RDF triples for a given page and add them to the graph.

    :param graph: the rdflib graph
    :param mosetp_id: text identifier of the subsection (`MosEtp###`)
    :param page_id: text identifier of the page (`pg###`)
    :param page_title: human title of the page
    :param page_index: zero-based index of the page within the subsection
    """
    mosetp = NS[mosetp_id]
    page = NS[page_id]
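    # The additions below amount to the following, for a page "pg001" under
    # "MosEtp001" (illustrative identifiers; set_title and add_index, defined
    # in common, contribute further triples):
    #   NS:pg001     rdf:type            owl:NamedIndividual, NS:Activite ;
    #                NS:id               "pg001" ;
    #                rdfs:subClassOf     NS:MosEtp001 .
    #   NS:MosEtp001 NS:contientActivite NS:pg001 .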
    # Type and simple properties
    graph.add((page, RDF.type, OWL.NamedIndividual))
    graph.add((page, RDF.type, NS["Activite"]))
    graph.add((page, NS["id"], Literal(page_id)))
    set_title(graph, page, page_title)
    add_index(graph, page, page_index)
    # Link with parent subsection
    graph.add((page, RDFS.subClassOf, mosetp))
    graph.add((mosetp, NS["contientActivite"], page))


def parse_mosetp(graph: Graph, filepath: str, id: str):
    """Parse a subsection (`MosEtp###.html`) into the `graph`, creating
    the child pages.

    :param graph: the RDF graph
    :param filepath: path to the MosEtp file
    :param id: text identifier of the subsection
    """
    # Prepare regex with capturing groups to match lines
    regex = re.compile(r'.*new PageContenu\("(.*)", "(.*)", "(.*)", ""\);')
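    # Illustrative line this matches (shape inferred from the regex; the
    # real arguments come from the source HTML):
    #   new PageContenu("Page title", "pg001", "extra", "");
    # group(1) is the page title, group(2) the page id.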
    # The lines we need are simple enough that grep filters them much
    # faster than a Python HTML parser would
    cmd_array = ["grep", "new PageContenu(", filepath]
    try:
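        # check=True turns any non-zero grep exit status into a
        # CalledProcessError, handled below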
        cmd = subprocess.run(
            cmd_array,
            check=True,
            capture_output=True,
            encoding="utf-8",
        )
        # Match regex on each line
        for index, line in enumerate(cmd.stdout.splitlines()):
            m = regex.match(line)
            if m is not None:  # should always match, but just in case
                page_id = m.group(2)
                generate_triples(graph, id, page_id, m.group(1), index)
                # Call the page parser
                parse_page(
                    graph, f"{Context.source_dir}/contenu/pages/{page_id}.html", page_id
                )
            else:
                log.warning(f"skipping page: regex found no match on line '{line}'")
    except FileNotFoundError as e:
        e.add_note(f"Failed to parse '{filepath}' with subprocess")
        raise e
    except subprocess.CalledProcessError as e:
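        # grep exits with status 1 when no lines matched (2 on real errors)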
        if e.returncode == 1:
            log.error(f"{filepath}: grep found no match, skipping.")
        else:
            log.error(e.stderr)