# extract.py

from pprint import pprint
from typing import Optional

from lxml import etree
from rdflib import RDFS, Graph, Literal, URIRef
from rdflib.namespace import OWL, RDF

import extract_mosetp
from common import *

# All common constants are in a dedicated module
from constants import *

# URI of the schema ontology. `export_graph` references it as the target of
# the `owl:imports` triple added to the exported graph.
schema_ontology_uri = URIRef(
    "http://www.semanticweb.org/eliott/ontologies/2024/4/macao"
)
# URI of the exported content ontology. `export_graph` declares it as an
# `owl:Ontology` labelled "macao-content".
content_ontology_uri = URIRef(
    "http://www.semanticweb.org/eliott/ontologies/2024/4/macao-content"
)


def dump_graph(g: Graph):
    """Write every triple of `g` to standard output, one triple per line.

    Subject, predicate and object are printed separated by single spaces.
    """
    for triple in g:
        subject, predicate, value = triple
        print(subject, predicate, value)


def create_graph() -> Graph:
    """Build an empty graph whose empty prefix is bound to `NS`."""
    graph = Graph()
    # Binding `NS` to the empty prefix makes it the default namespace
    graph.bind("", NS)
    return graph


def export_graph(g: Graph):
    """Declare the content ontology in `g`, then serialize `g` to `OUT_FILE`.

    Three triples about `content_ontology_uri` are added to `g` before
    writing: its type (`owl:Ontology`), its label, and an `owl:imports` link
    to `schema_ontology_uri`, so that the schema is included when the output
    is loaded as an ontology. The graph is written as Turtle with `NS` as
    base, and the number of exported triples is printed.
    """
    ontology = content_ontology_uri
    # (predicate, object) pairs describing the ontology itself
    header = (
        (RDF.type, OWL.Ontology),
        (RDFS.label, Literal("macao-content")),
        (OWL.imports, schema_ontology_uri),
    )
    for predicate, value in header:
        g.add((ontology, predicate, value))
    g.serialize(OUT_FILE, "turtle", base=NS)
    print(f"Exported {len(g)} triples to {OUT_FILE}.")


def ns_find(elem: etree.ElementBase, query: str):
    """Wrapper for lxml's `find()` function that automatically uses the default
    namespace for all unprefixed tag names.

    When `elem` has no default namespace in scope, the query is run without
    any namespace mapping, so unprefixed names match un-namespaced tags.

    :param elem: element to search from
    :param query: path expression passed to `elem.find()`
    :return: the result of `elem.find()` (per lxml's documentation, the first
        matching element, or `None` when nothing matches)
    """
    default_ns = elem.nsmap.get(None)
    if default_ns is None:
        # No default namespace declared: indexing `nsmap[None]` would raise
        # a KeyError, so fall back to a plain, namespace-less search.
        return elem.find(query)
    return elem.find(query, namespaces={"": default_ns})


def ns_findall(elem: etree.ElementBase, query: str):
    """Wrapper for lxml's `findall()` function that automatically uses the default
    namespace for all unprefixed tag names.

    When `elem` has no default namespace in scope, the query is run without
    any namespace mapping, so unprefixed names match un-namespaced tags.

    :param elem: element to search from
    :param query: path expression passed to `elem.findall()`
    :return: the result of `elem.findall()` (per lxml's documentation, a list
        of all matching elements, empty when nothing matches)
    """
    default_ns = elem.nsmap.get(None)
    if default_ns is None:
        # No default namespace declared: indexing `nsmap[None]` would raise
        # a KeyError, so fall back to a plain, namespace-less search.
        return elem.findall(query)
    return elem.findall(query, namespaces={"": default_ns})


def ns_localname(elem: etree.ElementBase) -> str:
    """Return the tag name of `elem` without its namespace part."""
    qualified_name = etree.QName(elem)
    return qualified_name.localname


def parse_manifest(graph: Graph):
    """Read `imsmanifest.xml` from `SOURCE_DIR` and fill `graph` with the
    modules hierarchy it describes.

    Each `item` directly under the first `organization` element is processed
    recursively by `parse_manifest_rec`, then declared a subclass of
    `NS["MacaoRoot"]` and given its position among the top-level items.
    """
    manifest_path = SOURCE_DIR + "/imsmanifest.xml"
    document = etree.parse(manifest_path, None)
    organization = ns_find(document.getroot(), ".//organization")
    top_level_items = ns_findall(organization, "item")
    for position, item in enumerate(top_level_items):
        module = NS[item.get("identifier")]
        parse_manifest_rec(graph, item)
        # Top-level modules hang directly under the root
        graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
        add_index(graph, module, position)


def parse_manifest_rec(
    graph: Graph,
    elem,
    parentResource: Optional[URIRef] = None,
    index: Optional[int] = None,
):
    """Add one manifest item and, recursively, everything below it to `graph`.

    An item whose identifier starts with ``MosMod`` is typed as `Module` and
    its child `item` elements are processed recursively. Any other item is
    typed as `SousPartie` and its list of pages is read from
    ``<SOURCE_DIR>/sco/<identifier>.html`` by `extract_mosetp.parse_mosetp`.

    :param graph: graph receiving the triples
    :param elem: manifest `item` element to process
    :param parentResource: parent element in the tree, as a `rdflib` resource
    :param index: index (order) among sibling elements
    """
    # Title and identifier of the item
    title: str = ns_find(elem, "title").text
    identifier: str = elem.get("identifier")

    # Declare the RDF resource and its simple properties
    subject = NS[identifier]
    graph.add((subject, RDF.type, OWL.NamedIndividual))
    add_title(graph, subject, title)

    # Modules and subsections differ only in their class and in the property
    # linking them to their parent
    is_module = identifier.startswith("MosMod")
    if is_module:
        class_name, containment = "Module", "contientModule"
    else:
        class_name, containment = "SousPartie", "contientSousPartie"
    graph.add((subject, RDF.type, NS[class_name]))

    # Link to the parent, when there is one
    if parentResource is not None:
        graph.add((parentResource, NS[containment], subject))
        graph.add((subject, RDFS.subClassOf, parentResource))
        if index is not None:
            add_index(graph, subject, index)

    if not is_module:
        # Subsection: parse its list of pages
        extract_mosetp.parse_mosetp(
            graph, f"{SOURCE_DIR}/sco/{identifier}.html", identifier
        )
        return

    # Module: recurse on child items
    for position, child in enumerate(ns_findall(elem, "item")):
        parse_manifest_rec(graph, child, subject, position)


def main():
    """Create the graph, populate it from the manifest and export it."""
    graph = create_graph()
    parse_manifest(graph)
    export_graph(graph)


# Run the extraction only when this file is executed as a script
if __name__ == "__main__":
    main()