extract_mosetp.py

    import re
    import subprocess
    
    from rdflib import OWL, RDF, RDFS, Graph, Literal
    
    from common import NS, Context, add_index, get_logger, set_title
    from extract_page import parse_page
    
    # Initialise logger
    log = get_logger("extract_mosetp")
    
    
    def generate_triples(
        graph: Graph, mosetp_id: str, page_id: str, page_title: str, page_index: int
    ):
        """Generate RDF triples for a given page and add them to the graph.
    
        :param graph: the rdflib graph
        :param mosetp_id: text identifier of the subsection (`MosEtp###`)
        :param page_id: text identifier of the page (`pg###`)
        :param page_title: human title of the page
        :param page_index: position of the page within its parent subsection
        """
        mosetp = NS[mosetp_id]
        page = NS[page_id]
        # Type and simple properties
        graph.add((page, RDF.type, OWL.NamedIndividual))
        graph.add((page, RDF.type, NS["Activite"]))
        graph.add((page, NS["id"], Literal(page_id)))
        set_title(graph, page, page_title)
        add_index(graph, page, page_index)
        # Link with parent subsection
        graph.add((page, RDFS.subClassOf, mosetp))
        graph.add((mosetp, NS["contientActivite"], page))
    
    
    def parse_mosetp(graph: Graph, filepath: str, id: str):
        """Parse a subsection (`MosEtp###.html`) into the `graph`, creating
        the child pages.
    
        :param graph: the RDF graph
        :param filepath: path to the MosEtp file
        :param id: text identifier of the subsection
        """
        # Prepare regex with capturing groups to match lines
        regex = re.compile(r'.*new PageContenu\("(.*)", "(.*)", "(.*)", ""\);')
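        # Lines of interest look roughly like this (arguments are illustrative):
        #   ... new PageContenu("<page title>", "<page id>", "<unused>", "");
        # Group 1 is the title and group 2 the page identifier; group 3 is not used.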
        # The lines we need are simple enough that grep is much faster than
        # a Python HTML parser for filtering them
        cmd_array = ["grep", "new PageContenu(", filepath]
        try:
            cmd = subprocess.run(
                cmd_array,
                check=True,
                capture_output=True,
                encoding="utf-8",
            )
            # Match regex on each line
            for index, line in enumerate(cmd.stdout.splitlines()):
                m = regex.match(line)
                if m is not None:  # should always match but just in case
                    page_id = m.group(2)
                    generate_triples(graph, id, page_id, m.group(1), index)
                    # Call the page parser
                    parse_page(
                        graph, f"{Context.source_dir}/contenu/pages/{page_id}.html", page_id
                    )
                else:
                    log.warning(f"skipping page: regex found no match on line '{line}'")
    
        except FileNotFoundError as e:
            e.add_note(f"Failed to parse '{filepath}' with subprocess")
            raise e
        except subprocess.CalledProcessError as e:
            if e.returncode == 1:
                log.error(f"{filepath}: grep found no match, skipping.'")
            else:
                log.error(e.stderr)
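

    # Hypothetical usage sketch, not part of the original module: it assumes that
    # `Context.source_dir` is a writable class attribute defined in `common` and
    # that the MosEtp###.html files sit directly under the source directory.
    if __name__ == "__main__":
        g = Graph()
        Context.source_dir = "sources"  # hypothetical source directory
        parse_mosetp(g, f"{Context.source_dir}/MosEtp001.html", "MosEtp001")
        print(g.serialize(format="turtle"))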