Skip to content
Snippets Groups Projects
Select Git revision
  • 26650f6fe4019020c4b78ff92a7745bbe23d64f5
  • master default protected
2 results

__init__.py

Blame
  • extract.py 4.15 KiB
    from pprint import pprint
    from typing import Optional
    
    from lxml import etree
    from rdflib import RDFS, Graph, Literal, URIRef
    from rdflib.namespace import OWL, RDF
    
    import extract_mosetp
    from common import *
    
    # All common constants are in a dedicated module
    from constants import *
    
    # IRI of the schema ontology; `export_graph` declares it as an OWL import
    # of the exported content ontology.
    schema_ontology_uri = URIRef(
        "http://www.semanticweb.org/eliott/ontologies/2024/4/macao"
    )
    # IRI under which the exported graph is declared as an `owl:Ontology`
    # (see `export_graph`).
    content_ontology_uri = URIRef(
        "http://www.semanticweb.org/eliott/ontologies/2024/4/macao-content"
    )
    
    
    def dump_graph(g: Graph):
        """Write every triple of `g` to standard output, one triple per line.

        Each line holds the subject, predicate and object separated by spaces,
        in whatever order iterating over `g` yields them.
        """
        for triple in g:
            s, p, o = triple
            print(s, p, o)
    
    
    def create_graph() -> Graph:
        """Return a new, empty graph with the empty prefix bound to `NS`."""
        graph = Graph()
        # Binding "" makes `NS` the default (unprefixed) namespace of the graph.
        graph.bind("", NS)
        return graph
    
    
    def export_graph(g: Graph):
        """Declare `g` as the content ontology and serialize it to `OUT_FILE`.

        Three header triples are added to `g` before writing: the content
        ontology IRI is typed as an `owl:Ontology`, labelled "macao-content",
        and made to import the schema ontology, so that loading the output as
        an ontology also pulls in the schema. The graph is then written in
        Turtle format with `NS` as base, and a short summary is printed.
        """
        ontology = content_ontology_uri
        header_triples = (
            (ontology, RDF.type, OWL.Ontology),
            (ontology, RDFS.label, Literal("macao-content")),
            (ontology, OWL.imports, schema_ontology_uri),
        )
        for triple in header_triples:
            g.add(triple)
        g.serialize(OUT_FILE, "turtle", base=NS)
        # The count includes the header triples added above.
        print(f"Exported {len(g)} triples to {OUT_FILE}.")
    
    
    def ns_find(elem: etree.ElementBase, query: str):
        """Wrapper for lxml's `find()` that resolves unprefixed tag names in
        `query` against the default namespace of `elem`.

        :param elem: element to search from
        :param query: ElementPath expression with unprefixed tag names
        :return: whatever `elem.find()` returns for the query (the first match,
            or `None` when nothing matches)

        If `elem` declares no default namespace, the query is run without any
        namespace mapping instead of failing with a `KeyError`.
        """
        default_ns = elem.nsmap.get(None)
        if default_ns is None:
            # No default namespace in scope: unprefixed names need no mapping.
            return elem.find(query)
        return elem.find(query, namespaces={"": default_ns})
    
    
    def ns_findall(elem: etree.ElementBase, query: str):
        """Wrapper for lxml's `findall()` that resolves unprefixed tag names in
        `query` against the default namespace of `elem`.

        :param elem: element to search from
        :param query: ElementPath expression with unprefixed tag names
        :return: whatever `elem.findall()` returns for the query (the list of
            matching elements)

        If `elem` declares no default namespace, the query is run without any
        namespace mapping instead of failing with a `KeyError`.
        """
        default_ns = elem.nsmap.get(None)
        if default_ns is None:
            # No default namespace in scope: unprefixed names need no mapping.
            return elem.findall(query)
        return elem.findall(query, namespaces={"": default_ns})
    
    
    def ns_localname(elem: etree.ElementBase) -> str:
        """Return the tag name of `elem` without its namespace part."""
        qualified_name = etree.QName(elem)
        return qualified_name.localname
    
    
    def parse_manifest(graph: Graph):
        """Read `imsmanifest.xml` from `SOURCE_DIR` and add the module hierarchy
        it describes to `graph`.

        Every `item` directly under the first `organization` element is treated
        as a top-level module: it is parsed recursively, declared a subclass of
        `MacaoRoot`, and given its position among its siblings as index.
        """
        manifest_path = SOURCE_DIR + "/imsmanifest.xml"
        manifest_root = etree.parse(manifest_path, None).getroot()
        organization = ns_find(manifest_root, ".//organization")
        top_level_items = ns_findall(organization, "item")
        for position, item in enumerate(top_level_items):
            module = NS[item.get("identifier")]
            # Top-level modules have no parent resource; their root link and
            # index are added here rather than by the recursive call.
            parse_manifest_rec(graph, item)
            graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
            add_index(graph, module, position)
    
    
    def parse_manifest_rec(
        graph: Graph,
        elem,
        parentResource: Optional[URIRef] = None,
        index: Optional[int] = None,
    ):
        """Add one manifest `item` and all of its descendants to `graph`.

        An item whose identifier starts with "MosMod" is recorded as a `Module`
        and its child `item` elements are processed recursively. Any other item
        is recorded as a `SousPartie` (subsection) and its list of pages is
        read from the matching HTML file under `SOURCE_DIR/sco/`.

        :param graph: graph receiving the triples
        :param elem: the manifest `item` element to process
        :param parentResource: parent element in the tree, as a `rdflib`
            resource; when `None`, no parent link and no index are added
        :param index: index (order) among sibling elements
        """
        # Title and identifier come straight from the manifest element.
        label: str = ns_find(elem, "title").text
        identifier: str = elem.get("identifier")

        # Declare the RDF resource together with its simple properties.
        subject = NS[identifier]
        graph.add((subject, RDF.type, OWL.NamedIndividual))
        add_title(graph, subject, label)

        # Modules and subsections differ only in their class, in the property
        # linking them to their parent, and in how their content is expanded.
        is_module = identifier.startswith("MosMod")
        if is_module:
            class_name, containment_name = "Module", "contientModule"
        else:
            class_name, containment_name = "SousPartie", "contientSousPartie"

        graph.add((subject, RDF.type, NS[class_name]))

        # Parent properties are only added when there is a parent.
        if parentResource is not None:
            graph.add((parentResource, NS[containment_name], subject))
            graph.add((subject, RDFS.subClassOf, parentResource))
            if index is not None:
                add_index(graph, subject, index)

        if is_module:
            # Recurse on child items, numbering them in document order.
            children = ns_findall(elem, "item")
            for child_index, child in enumerate(children):
                parse_manifest_rec(graph, child, subject, child_index)
            return

        # Subsection: parse its list of pages.
        page_file = f"{SOURCE_DIR}/sco/{identifier}.html"
        extract_mosetp.parse_mosetp(graph, page_file, identifier)
    
    
    def main():
        """Build the graph from the manifest and export it."""
        graph = create_graph()
        parse_manifest(graph)
        export_graph(graph)
    
    
    # Run the extraction only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == "__main__":
        main()