Skip to content
Snippets Groups Projects
Select Git revision
  • 0e491e5afb8b803cc49d80bb123d4cc46b367c63
  • mui5-annotation-on-video-stable default
  • get_setter_canvasSizeInformations
  • fix-error-div-into-p
  • annotation-on-video-v2
  • detached
  • annotation-on-video-r17
  • mui5
  • mui5-react-18
  • jacob-test
  • annotation-on-video protected
  • master
  • test-antoinev1
  • 20-fetch-thumbnail-on-annotation
  • add-research-field
  • Save
  • add-plugin
  • 14-wip-no-seek-to
  • 14-bug-on-video-time-control
  • 9_wip_videotests
  • _upgrade_material_ui
  • latest-tetras-16
  • v3.3.0
  • v3.2.0
  • v3.1.1
  • v3.1.0
  • v3.0.0
  • v3.0.0-rc.7
  • v3.0.0-rc.6
  • v3.0.0-rc.5
  • v3.0.0-rc.4
  • v3.0.0-rc.3
  • v3.0.0-rc.2
  • v3.0.0-rc.1
  • v3.0.0-beta.10
  • v3.0.0-beta.9
  • v3.0.0-beta.8
  • v3.0.0-beta.7
  • v3.0.0-beta.6
  • v3.0.0-beta.5
  • v3.0.0-beta.3
41 results

webpack.config.js

Blame
  • extract_mosetp.py 2.60 KiB
    import re
    import subprocess
    
    from rdflib import OWL, RDF, RDFS, Graph, Literal
    
    from common import *
    from extract_page import parse_page
    
    # Initialise logger
    log = get_logger("extract_mosetp")
    
    
    def generate_triples(
        graph: Graph, mosetp_id: str, page_id: str, page_title: str, page_index: int
    ) -> None:
        """Generate RDF triples for a given page and add them to the graph.

        Declares the page as a named individual of type ``Activite``, sets its
        ``id``, title and index, and links it to its parent subsection.

        :param graph: the rdflib graph the triples are added to
        :param mosetp_id: text identifier of the subsection (`MosEtp###`)
        :param page_id: text identifier of the page (`pg###`)
        :param page_title: human title of the page
        :param page_index: zero-based position of the page within the subsection
        """
        mosetp = NS[mosetp_id]
        page = NS[page_id]
        # Type and simple properties
        graph.add((page, RDF.type, OWL.NamedIndividual))
        graph.add((page, RDF.type, NS["Activite"]))
        graph.add((page, NS["id"], Literal(page_id)))
        set_title(graph, page, page_title)
        add_index(graph, page, page_index)
        # Link with parent subsection
        # NOTE(review): rdfs:subClassOf between two individuals is unusual
        # (it normally relates classes) — confirm the intended property.
        graph.add((page, RDFS.subClassOf, mosetp))
        graph.add((mosetp, NS["contientActivite"], page))
    
    
    def parse_mosetp(graph: Graph, filepath: str, id: str) -> None:
        """Parse a subsection (`MosEtp###.html`) into the `graph`, creating
        the child pages.

        Each ``new PageContenu("title", "pgID", ...)`` line in the file yields
        one page: triples are generated and the page's own HTML file is parsed.

        :param graph: the RDF graph the triples are added to
        :param filepath: path to the MosEtp file
        :param id: text identifier of the subsection
        :raises FileNotFoundError: if grep itself cannot be run on `filepath`
        """
        # Prepare regex with capturing groups to match lines
        regex = re.compile(r'.*new PageContenu\("(.*)", "(.*)", "(.*)", ""\);')
        # The lines we need are fairly basic, grep is much faster
        # than a Python HTML parser to filter them
        cmd_array = ["grep", "new PageContenu(", filepath]
        try:
            cmd = subprocess.run(
                cmd_array,
                check=True,
                capture_output=True,
                encoding="utf-8",
            )
            # Match regex on each line; enumerate gives the page order/index
            for index, line in enumerate(cmd.stdout.splitlines()):
                m = regex.match(line)
                if m is not None:  # should always match but just in case
                    # group(1) = title, group(2) = page id
                    page_id = m.group(2)
                    generate_triples(graph, id, page_id, m.group(1), index)
                    # Call the page parser
                    parse_page(graph, f"{SOURCE_DIR}/contenu/pages/{page_id}.html", page_id)
                else:
                    log.warning(f"skipping page: regex found no match on line '{line}'")

        except FileNotFoundError as e:
            e.add_note(f"Failed to parse '{filepath}' with subprocess")
            # Bare raise preserves the original traceback
            raise
        except subprocess.CalledProcessError as e:
            # grep exits 1 on "no match" — a recoverable, file-specific case
            if e.returncode == 1:
                log.error(f"{filepath}: grep found no match, skipping.")
            else:
                log.error(e.stderr)