Select Git revision
AnnotationCreation.js
Forked from
IIIF / Mirador / Mirador annotations
Source project has a limited visibility.
extract.py 4.15 KiB
from pprint import pprint
from typing import Optional
from lxml import etree
from rdflib import RDFS, Graph, Literal, URIRef
from rdflib.namespace import OWL, RDF
import extract_mosetp
from common import *
# All common constants are in a dedicated module
from constants import *
# URI of the schema ontology; the exported graph declares an OWL import of it
schema_ontology_uri = URIRef(
    "http://www.semanticweb.org/eliott/ontologies/2024/4/macao"
)

# URI under which the exported graph itself is declared as an OWL ontology
content_ontology_uri = URIRef(
    "http://www.semanticweb.org/eliott/ontologies/2024/4/macao-content"
)
def dump_graph(g: Graph):
    """Write every triple of `g` to standard output, one triple per line.

    Each line holds the subject, predicate and object separated by spaces.
    """
    for triple in g:
        subject, predicate, value = triple
        print(subject, predicate, value)
def create_graph() -> Graph:
    """Build a new, empty graph in which `NS` is bound to the empty prefix."""
    graph = Graph()
    # The empty prefix makes `NS` the graph's default namespace
    graph.bind("", NS)
    return graph
def export_graph(g: Graph):
    """Serialize the graph to `OUT_FILE` in Turtle format.

    Before writing, the graph is declared as an OWL ontology (identified by
    `content_ontology_uri`) that imports the schema ontology, so the schema
    file is included when the output is loaded as an ontology.
    """
    ontology = content_ontology_uri
    # Ontology header: (predicate, object) pairs attached to the ontology URI
    header = (
        (RDF.type, OWL.Ontology),
        (RDFS.label, Literal("macao-content")),
        (OWL.imports, schema_ontology_uri),
    )
    for predicate, value in header:
        g.add((ontology, predicate, value))

    g.serialize(destination=OUT_FILE, format="turtle", base=NS)
    print(f"Exported {len(g)} triples to {OUT_FILE}.")
def ns_find(elem: "etree._Element", query: str):
    """Wrapper for lxml's `find()` function that automatically uses the default
    namespace for all unprefixed tag names.

    If `elem` has no default namespace in scope, the query is run as-is,
    without any namespace mapping, instead of failing with a `KeyError`.

    :param elem: element to search from
    :param query: ElementPath expression, with unprefixed tag names
    :return: whatever `elem.find()` returns (`None` when nothing matches)
    """
    # `nsmap` maps the key `None` to the default namespace, when there is one
    default_ns = elem.nsmap.get(None)
    if default_ns is None:
        return elem.find(query)
    return elem.find(query, namespaces={"": default_ns})
def ns_findall(elem: "etree._Element", query: str):
    """Wrapper for lxml's `findall()` function that automatically uses the default
    namespace for all unprefixed tag names.

    If `elem` has no default namespace in scope, the query is run as-is,
    without any namespace mapping, instead of failing with a `KeyError`.

    :param elem: element to search from
    :param query: ElementPath expression, with unprefixed tag names
    :return: whatever `elem.findall()` returns
    """
    # `nsmap` maps the key `None` to the default namespace, when there is one
    default_ns = elem.nsmap.get(None)
    if default_ns is None:
        return elem.findall(query)
    return elem.findall(query, namespaces={"": default_ns})
def ns_localname(elem: etree.ElementBase) -> str:
    """Return the tag name of `elem` without its namespace part."""
    qualified_name = etree.QName(elem)
    return qualified_name.localname
def parse_manifest(graph: Graph):
    """Parses the `imsmanifest.xml` and populates the `graph` with the
    modules hierarchy.

    Every `<item>` directly under the first `<organization>` element is parsed
    recursively, declared as a subclass of `MacaoRoot` and given its position
    among the top-level items.

    :param graph: graph to populate
    :raises ValueError: if the manifest contains no `<organization>` element
    """
    # Parse with lxml
    root = etree.parse(f"{SOURCE_DIR}/imsmanifest.xml", None).getroot()
    org = ns_find(root, ".//organization")
    if org is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ValueError("No <organization> element found in imsmanifest.xml")

    # For all top-level modules
    for position, item in enumerate(ns_findall(org, "item")):
        module = NS[item.get("identifier")]
        parse_manifest_rec(graph, item)
        graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
        add_index(graph, module, position)
def parse_manifest_rec(
    graph: Graph,
    elem,
    parentResource: Optional[URIRef] = None,
    index: Optional[int] = None,
):
    """Parses one manifest `<item>` recursively, adding it and all its
    descendants to the `graph`.

    An item whose identifier starts with ``MosMod`` is typed as a `Module` and
    its child `<item>` elements are parsed recursively. Any other item is typed
    as a `SousPartie` and its list of pages is read from
    ``{SOURCE_DIR}/sco/{identifier}.html`` by `extract_mosetp.parse_mosetp`.

    :param graph: graph to populate
    :param elem: the `<item>` element to parse
    :param parentResource: parent element in the tree, as a `rdflib` resource
    :param index: index (order) among sibling elements
    :raises ValueError: if the item has no `identifier` attribute or no
        `<title>` child
    """
    # Get ID and title, failing explicitly on a malformed item
    identifier = elem.get("identifier")
    if identifier is None:
        raise ValueError("Manifest <item> element has no 'identifier' attribute")
    title_elem = ns_find(elem, "title")
    if title_elem is None:
        raise ValueError(f"Manifest item {identifier!r} has no <title> element")
    title = title_elem.text

    # Declare RDF resource and simple properties
    subject = NS[identifier]
    graph.add((subject, RDF.type, OWL.NamedIndividual))
    add_title(graph, subject, title)

    # Modules and subsections differ only in their RDF type, in the property
    # linking them to their parent, and in how their content is parsed
    is_module = identifier.startswith("MosMod")
    if is_module:
        type_name, containment = "Module", "contientModule"
    else:
        type_name, containment = "SousPartie", "contientSousPartie"
    graph.add((subject, RDF.type, NS[type_name]))

    # Add parent properties if necessary
    if parentResource is not None:
        graph.add((parentResource, NS[containment], subject))
        graph.add((subject, RDFS.subClassOf, parentResource))
    if index is not None:
        add_index(graph, subject, index)

    if is_module:
        # Recurse on child items
        for child_index, child in enumerate(ns_findall(elem, "item")):
            parse_manifest_rec(graph, child, subject, child_index)
    else:
        # Parse list of pages
        extract_mosetp.parse_mosetp(
            graph, f"{SOURCE_DIR}/sco/{identifier}.html", identifier
        )
def main():
    """Build the graph from the manifest, then export it."""
    graph = create_graph()
    parse_manifest(graph)
    export_graph(graph)


# Run the extraction only when executed as a script
if __name__ == "__main__":
    main()