Skip to content
Snippets Groups Projects
Commit e0e81c02 authored by Eliott Sammier
Browse files

Implement parsing of the manifest to generate RDF triples

parent 07c026de
Branches
No related tags found
No related merge requests found
from rdflib import Graph, Namespace, BNode, URIRef
from pprint import pprint
from rdflib import RDFS, Graph, Namespace, BNode, URIRef, Literal
from rdflib.namespace import OWL, RDF
from lxml import etree
# Directory that contains `imsmanifest.xml` (read by `parse_manifest`).
SOURCE_DIR = ".."
# Directory that receives the generated files.
OUTPUT_DIR = "out"
# Turtle file the graph is serialized to. It is assigned exactly once: a
# second, hard-coded assignment used to follow and silently overrode this
# value, which also left OUTPUT_DIR unused.
OUT_FILE = OUTPUT_DIR + "/macao_content.ttl"
SCHEMA_FILE = "macao_schema.ttl"

# Base namespace under which every resource IRI of this script is built.
NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/")
# Alias kept so that code still referring to the longer name keeps working
# without maintaining a second copy of the IRI.
NAMESPACE = NS
def dump_graph(g: Graph):
    """Write every triple of `g` to standard output, one triple per line."""
    for triple in g:
        # A triple unpacks to subject, predicate, object, in that order
        print(*triple)
def create_graph() -> Graph:
    """Create an empty graph whose empty prefix is bound to the `NS` namespace.

    The graph is returned without any triple in it; populating it is left to
    the caller. The prefix is bound once only.
    """
    g = Graph()
    g.bind("", NS)  # Bind default namespace to empty prefix
    return g
......@@ -28,20 +29,78 @@ def export_graph(g: Graph):
"""
imports = BNode()
g.add((imports, RDF.type, OWL.Ontology))
g.add((imports, OWL.imports, URIRef(NAMESPACE)))
g.serialize(OUT_FILE, base=NAMESPACE)
g.add((imports, OWL.imports, URIRef(NS)))
g.serialize(OUT_FILE, 'turtle', base=NS)
print(f"Exported {len(g)} triples to {OUT_FILE}.")
def main():
    """Load `macao_contents.ttl` into a fresh graph, export it, and print
    the number of statements it holds.

    NOTE(review): `main` is defined a second time further down in this file;
    when the module is loaded, that later definition replaces this one.
    """
    g = create_graph()
    g.parse("macao_contents.ttl")
    export_graph(g)
    # Report the size of the graph once it has been exported
    print(f"Graph g has {len(g)} statements.")
def ns_find(elem: etree.ElementBase, query: str):
    """Wrapper for lxml's `find()` function that automatically uses the default
    namespace for all unprefixed tag names.

    The default namespace is the one mapped to `None` in `elem.nsmap`. When
    `elem` has no default namespace, the query is run without any namespace
    mapping instead of failing with a `KeyError`.

    Returns whatever `elem.find()` returns for `query`.
    """
    default_ns = elem.nsmap.get(None)
    # The empty-string key is how the default namespace is passed to find()
    namespaces = {"": default_ns} if default_ns is not None else None
    return elem.find(query, namespaces=namespaces)
def ns_findall(elem: etree.ElementBase, query: str):
    """Wrapper for lxml's `findall()` function that automatically uses the default
    namespace for all unprefixed tag names.

    The default namespace is the one mapped to `None` in `elem.nsmap`. When
    `elem` has no default namespace, the query is run without any namespace
    mapping instead of failing with a `KeyError`.

    Returns whatever `elem.findall()` returns for `query`.
    """
    default_ns = elem.nsmap.get(None)
    # The empty-string key is how the default namespace is passed to findall()
    namespaces = {"": default_ns} if default_ns is not None else None
    return elem.findall(query, namespaces=namespaces)
def ns_localname(elem: etree.ElementBase) -> str:
    """Return the tag name of `elem` without its namespace part."""
    qualified_name = etree.QName(elem)
    return qualified_name.localname
def parse_manifest(graph: Graph):
    """Parses the `imsmanifest.xml` and populates the `graph` with the
    modules hierarchy.

    The manifest is read from `SOURCE_DIR`. Every `item` that is a direct
    child of the first `organization` element found is handed to
    `parse_manifest_rec`, then linked to `NS["MacaoRoot"]` through
    `rdfs:subClassOf`.

    Raises:
        ValueError: if the manifest contains no `organization` element.
    """
    # Parse with lxml
    root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
    org = ns_find(root, ".//organization")
    if org is None:
        # Fail with an explicit message rather than an AttributeError on None
        raise ValueError("imsmanifest.xml contains no <organization> element")

    for item in ns_findall(org, "item"):
        identifier = item.get("identifier")
        print(ns_localname(item), identifier)
        parse_manifest_rec(graph, item)
        graph.add((NS[identifier], RDFS.subClassOf, NS["MacaoRoot"]))
def parse_manifest_rec(graph: Graph, elem, parentResource=None):
    """Parses a module `MosMod` from the manifest recursively, adding all its
    descendants to the `graph`

    `elem` is a manifest `item` element. It becomes a named individual whose
    IRI is `NS[identifier]`. An identifier starting with "MosMod" marks a
    `Module`, whose child `item` elements are processed recursively; any
    other identifier marks a `SousPartie`, which is not descended into.

    When `parentResource` is given, the parent is linked to the new resource
    with `contientModule` / `contientSousPartie`, and the new resource is
    linked back to the parent with `rdfs:subClassOf`.

    An item without a `<title>` (or with an empty one) gets no `titre` /
    `rdfs:label` triples instead of aborting the whole parse.

    Raises:
        ValueError: if `elem` has no `identifier` attribute.
    """
    identifier = elem.get("identifier")
    if identifier is None:
        raise ValueError("manifest <item> has no 'identifier' attribute")

    # Declare RDF resource and simple properties
    subject = NS[identifier]
    graph.add((subject, RDF.type, OWL.NamedIndividual))

    title_elem = ns_find(elem, "title")
    if title_elem is not None and title_elem.text is not None:
        title = Literal(title_elem.text)
        graph.add((subject, NS["titre"], title))
        graph.add((subject, RDFS.label, title))

    # The identifier prefix decides between a Module and a Subpart
    is_module = identifier.startswith("MosMod")
    if is_module:
        resource_class, containment = NS["Module"], NS["contientModule"]
    else:
        resource_class, containment = NS["SousPartie"], NS["contientSousPartie"]
    graph.add((subject, RDF.type, resource_class))

    # Add parent properties if necessary
    if parentResource is not None:
        graph.add((parentResource, containment, subject))
        # NOTE(review): rdfs:subClassOf is asserted between two individuals;
        # kept as in the original, confirm this is the intended modelling.
        graph.add((subject, RDFS.subClassOf, parentResource))

    # Only modules are descended into
    if is_module:
        for child in ns_findall(elem, "item"):
            parse_manifest_rec(graph, child, subject)
def main():
    """Build a graph from the manifest, then export it."""
    graph = create_graph()
    parse_manifest(graph)
    export_graph(graph)
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment