Select Git revision
extract_all_media_from_folder.sh
extract.py 4.15 KiB
from pprint import pprint
from typing import Optional
from lxml import etree
from rdflib import RDFS, Graph, Literal, URIRef
from rdflib.namespace import OWL, RDF
import extract_mosetp
from common import *
# All common constants are in a dedicated module
from constants import *
# IRI of the schema ontology; `export_graph` adds it as the `owl:imports`
# target of the exported content ontology.
schema_ontology_uri = URIRef(
"http://www.semanticweb.org/eliott/ontologies/2024/4/macao"
)
# IRI under which `export_graph` declares the exported graph as an ontology.
content_ontology_uri = URIRef(
"http://www.semanticweb.org/eliott/ontologies/2024/4/macao-content"
)
def dump_graph(g: Graph):
    """Write every triple of `g` to standard output, one triple per line.

    :param g: graph to dump; iterating it must yield 3-item triples
    """
    for triple in g:
        subject, predicate, value = triple
        print(subject, predicate, value)
def create_graph() -> Graph:
    """Build a new, empty graph with `NS` bound to the empty prefix.

    :return: the freshly created graph
    """
    graph = Graph()
    # Binding to "" makes `NS` the default (prefix-less) namespace
    graph.bind("", NS)
    return graph
def export_graph(g: Graph):
    """Serialize `g` as Turtle into `OUT_FILE`.

    Before writing, the graph is declared as an OWL ontology identified by
    `content_ontology_uri`, labelled "macao-content", with an `owl:imports`
    link to `schema_ontology_uri` so that the schema file is included when
    the output is loaded as an ontology.

    :param g: graph to export; the ontology header triples are added to it
    """
    ontology_header = (
        (RDF.type, OWL.Ontology),
        (RDFS.label, Literal("macao-content")),
        (OWL.imports, schema_ontology_uri),
    )
    for predicate, value in ontology_header:
        g.add((content_ontology_uri, predicate, value))
    g.serialize(OUT_FILE, "turtle", base=NS)
    # The reported count includes the header triples added above
    print(f"Exported {len(g)} triples to {OUT_FILE}.")
def ns_find(elem: etree.ElementBase, query: str):
    """Wrapper for lxml's `find()` function that automatically uses the
    element's default namespace for all unprefixed tag names.

    If `elem` has no default namespace in scope, the query is run without
    any namespace mapping instead of failing with a `KeyError`.

    :param elem: element to search from
    :param query: ElementPath expression with unprefixed tag names
    :return: whatever `elem.find()` returns (first match or `None`)
    """
    # `nsmap` stores the default namespace under the `None` key
    default_ns = elem.nsmap.get(None)
    namespaces = {"": default_ns} if default_ns is not None else None
    return elem.find(query, namespaces=namespaces)
def ns_findall(elem: etree.ElementBase, query: str):
    """Wrapper for lxml's `findall()` function that automatically uses the
    element's default namespace for all unprefixed tag names.

    If `elem` has no default namespace in scope, the query is run without
    any namespace mapping instead of failing with a `KeyError`.

    :param elem: element to search from
    :param query: ElementPath expression with unprefixed tag names
    :return: whatever `elem.findall()` returns (the matching elements)
    """
    # `nsmap` stores the default namespace under the `None` key
    default_ns = elem.nsmap.get(None)
    namespaces = {"": default_ns} if default_ns is not None else None
    return elem.findall(query, namespaces=namespaces)
def ns_localname(elem: etree.ElementBase) -> str:
    """Return the tag name of `elem` without its namespace part.

    :param elem: element whose tag is inspected
    :return: the local part of the element's qualified name
    """
    qualified_name = etree.QName(elem)
    return qualified_name.localname
def parse_manifest(graph: Graph):
    """Parses the `imsmanifest.xml` and populates the `graph` with the
    modules hierarchy.

    Every `item` directly under the first `organization` element is treated
    as a top-level module: it is parsed recursively, attached to `MacaoRoot`
    and given its position among the top-level items as index.

    :param graph: graph that receives the generated triples
    :raises ValueError: if the manifest contains no `organization` element
    """
    manifest_path = SOURCE_DIR + "/imsmanifest.xml"
    # Parse with lxml
    root = etree.parse(manifest_path, None).getroot()
    org = ns_find(root, ".//organization")
    if org is None:
        # Fail with a clear message rather than an AttributeError on None
        raise ValueError(
            f"No <organization> element found in {manifest_path}"
        )
    # For all top-level modules
    for i, e in enumerate(ns_findall(org, "item")):
        module = NS[e.get("identifier")]
        parse_manifest_rec(graph, e)
        graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
        add_index(graph, module, i)
def parse_manifest_rec(
    graph: Graph,
    elem,
    parentResource: Optional[URIRef] = None,
    index: Optional[int] = None,
):
    """Parses a manifest `item` recursively, adding it and all its
    descendants to the `graph`.

    Items whose identifier starts with `MosMod` are modules: they are typed
    `Module` and their child `item` elements are parsed recursively. Any
    other item is a subsection: it is typed `SousPartie` and its pages are
    read from `{SOURCE_DIR}/sco/{identifier}.html` by
    `extract_mosetp.parse_mosetp`.

    :param graph: graph that receives the generated triples
    :param elem: the manifest `item` element to parse
    :param parentResource: parent element in the tree, as a `rdflib` resource
    :param index: index (order) among sibling elements
    :raises ValueError: if the item has no `identifier` attribute or no
        `title` child element
    """
    # Validate up front so malformed items fail with a clear message
    identifier = elem.get("identifier")
    if identifier is None:
        raise ValueError("Manifest <item> has no 'identifier' attribute")
    title_elem = ns_find(elem, "title")
    if title_elem is None:
        raise ValueError(f"Manifest item {identifier!r} has no <title> child")

    # Declare RDF resource and simple properties
    subject = NS[identifier]
    graph.add((subject, RDF.type, OWL.NamedIndividual))
    add_title(graph, subject, title_elem.text)

    # The identifier prefix decides between Module and Subsection
    is_module = identifier.startswith("MosMod")
    graph.add((subject, RDF.type, NS["Module" if is_module else "SousPartie"]))

    # Add parent properties if necessary
    if parentResource is not None:
        containment = NS[
            "contientModule" if is_module else "contientSousPartie"
        ]
        graph.add((parentResource, containment, subject))
        graph.add((subject, RDFS.subClassOf, parentResource))
    if index is not None:
        add_index(graph, subject, index)

    if is_module:
        # Recurse on child items
        for child_index, child in enumerate(ns_findall(elem, "item")):
            parse_manifest_rec(graph, child, subject, child_index)
    else:
        # Parse list of pages
        extract_mosetp.parse_mosetp(
            graph, f"{SOURCE_DIR}/sco/{identifier}.html", identifier
        )
def main():
    """Run the extraction: create the graph, fill it from the manifest,
    then export it.
    """
    graph = create_graph()
    parse_manifest(graph)
    export_graph(graph)


# Only run the extraction when executed as a script, not on import
if __name__ == "__main__":
    main()