Skip to content
Snippets Groups Projects
Select Git revision
  • 0e491e5afb8b803cc49d80bb123d4cc46b367c63
  • mui5-annotation-on-video-stable default
  • get_setter_canvasSizeInformations
  • fix-error-div-into-p
  • annotation-on-video-v2
  • detached
  • annotation-on-video-r17
  • mui5
  • mui5-react-18
  • jacob-test
  • annotation-on-video protected
  • master
  • test-antoinev1
  • 20-fetch-thumbnail-on-annotation
  • add-research-field
  • Save
  • add-plugin
  • 14-wip-no-seek-to
  • 14-bug-on-video-time-control
  • 9_wip_videotests
  • _upgrade_material_ui
  • latest-tetras-16
  • v3.3.0
  • v3.2.0
  • v3.1.1
  • v3.1.0
  • v3.0.0
  • v3.0.0-rc.7
  • v3.0.0-rc.6
  • v3.0.0-rc.5
  • v3.0.0-rc.4
  • v3.0.0-rc.3
  • v3.0.0-rc.2
  • v3.0.0-rc.1
  • v3.0.0-beta.10
  • v3.0.0-beta.9
  • v3.0.0-beta.8
  • v3.0.0-beta.7
  • v3.0.0-beta.6
  • v3.0.0-beta.5
  • v3.0.0-beta.3
41 results

webpack.config.js

Blame
  • extract_mosetp.py 2.60 KiB
    import re
    import subprocess
    
    from rdflib import OWL, RDF, RDFS, Graph, Literal
    
    from common import *
    from extract_page import parse_page
    
    # Initialise logger
    log = get_logger("extract_mosetp")
    
    
    def generate_triples(
        graph: Graph, mosetp_id: str, page_id: str, page_title: str, page_index: int
    ) -> None:
        """Generate RDF triples for a given page and add them to the graph.

        Declares the page as a named individual of type ``Activite``, sets its
        ``id``, title and index, and links it to its parent subsection.

        :param graph: the rdflib graph the triples are added to
        :param mosetp_id: text identifier of the subsection (`MosEtp###`)
        :param page_id: text identifier of the page (`pg###`)
        :param page_title: human title of the page
        :param page_index: zero-based position of the page within the subsection
        """
        mosetp = NS[mosetp_id]
        page = NS[page_id]
        # Type and simple properties
        graph.add((page, RDF.type, OWL.NamedIndividual))
        graph.add((page, RDF.type, NS["Activite"]))
        graph.add((page, NS["id"], Literal(page_id)))
        set_title(graph, page, page_title)
        add_index(graph, page, page_index)
        # Link with parent subsection
        # NOTE(review): rdfs:subClassOf between two individuals is unusual
        # (it normally relates classes) — confirm the intended property.
        graph.add((page, RDFS.subClassOf, mosetp))
        graph.add((mosetp, NS["contientActivite"], page))
    
    
    def parse_mosetp(graph: Graph, filepath: str, id: str) -> None:
        """Parse a subsection (`MosEtp###.html`) into the `graph`, creating
        the child pages.

        Each ``new PageContenu("title", "pgID", ...)`` line in the file yields
        one page: triples are generated and the page's own HTML file is parsed.

        :param graph: the RDF graph the triples are added to
        :param filepath: path to the MosEtp file
        :param id: text identifier of the subsection
        :raises FileNotFoundError: if grep itself cannot be run on `filepath`
        """
        # Prepare regex with capturing groups to match lines
        regex = re.compile(r'.*new PageContenu\("(.*)", "(.*)", "(.*)", ""\);')
        # The lines we need are fairly basic, grep is much faster
        # than a Python HTML parser to filter them
        cmd_array = ["grep", "new PageContenu(", filepath]
        try:
            cmd = subprocess.run(
                cmd_array,
                check=True,
                capture_output=True,
                encoding="utf-8",
            )
            # Match regex on each line; enumerate gives the page order/index
            for index, line in enumerate(cmd.stdout.splitlines()):
                m = regex.match(line)
                if m is not None:  # should always match but just in case
                    # group(1) = title, group(2) = page id
                    page_id = m.group(2)
                    generate_triples(graph, id, page_id, m.group(1), index)
                    # Call the page parser
                    parse_page(graph, f"{SOURCE_DIR}/contenu/pages/{page_id}.html", page_id)
                else:
                    log.warning(f"skipping page: regex found no match on line '{line}'")

        except FileNotFoundError as e:
            e.add_note(f"Failed to parse '{filepath}' with subprocess")
            # Bare raise preserves the original traceback
            raise
        except subprocess.CalledProcessError as e:
            # grep exits 1 on "no match" — a recoverable, file-specific case
            if e.returncode == 1:
                log.error(f"{filepath}: grep found no match, skipping.")
            else:
                log.error(e.stderr)