Skip to content
Snippets Groups Projects
Select Git revision
  • 48baab94e9958b94d44dcb116a748554655f139c
  • demo_ci_gitlab_pages default
  • demo_gitlab_ci
  • 5-images-in-annotations
  • 5-final-images
  • 5-chpk-images-in-annot
  • tetras-main protected
  • 5-rebase-images-in-annot
  • 5-wip-images-in-annot
  • tmp
  • 1-edit-annotations-on-videos
  • 5-old-images-in-annotations
  • old_demo_ci_gitlab_pages
  • images_annotations
  • wip
  • devsetup
  • wip-annot-video-ui
  • wip-annotations-on-videos
  • master
  • v0.4.0_react16
  • wip-debugging-annotations
21 results

AnnotationExportDialog.test.js

Blame
  • Forked from IIIF / Mirador / Mirador annotations
    Source project has a limited visibility.
    extract.py 4.15 KiB
    from pprint import pprint
    from typing import Optional
    
    from lxml import etree
    from rdflib import RDFS, Graph, Literal, URIRef
    from rdflib.namespace import OWL, RDF
    
    import extract_mosetp
    from common import *
    
    # All common constants are in a dedicated module
    from constants import *
    
    # IRI of the schema ontology; `export_graph` lists it as the `owl:imports`
    # target of the exported content ontology.
    schema_ontology_uri = URIRef(
        "http://www.semanticweb.org/eliott/ontologies/2024/4/macao"
    )
    # IRI under which `export_graph` declares the exported graph as an
    # `owl:Ontology`.
    content_ontology_uri = URIRef(
        "http://www.semanticweb.org/eliott/ontologies/2024/4/macao-content"
    )
    
    
    def dump_graph(g: Graph):
        """Write every triple of `g` to standard output, one triple per line.

        Each line holds the subject, predicate and object separated by spaces.
        """
        for triple in g:
            subject, predicate, obj = triple
            print(subject, predicate, obj)
    
    
    def create_graph() -> Graph:
        """Build an empty graph in which the empty prefix is bound to `NS`."""
        graph = Graph()
        # An empty prefix makes `NS` the default namespace of the graph
        graph.bind(prefix="", namespace=NS)
        return graph
    
    
    def export_graph(g: Graph):
        """Serialize `g` as Turtle into `OUT_FILE`.

        Before writing, the graph is declared as the ``macao-content`` OWL
        ontology importing the schema ontology, so that loading the output as an
        ontology also pulls in the schema file. These header triples are added
        to `g` itself.
        """
        ontology_header = (
            (content_ontology_uri, RDF.type, OWL.Ontology),
            (content_ontology_uri, RDFS.label, Literal("macao-content")),
            (content_ontology_uri, OWL.imports, schema_ontology_uri),
        )
        for header_triple in ontology_header:
            g.add(header_triple)
        g.serialize(destination=OUT_FILE, format="turtle", base=NS)
        print(f"Exported {len(g)} triples to {OUT_FILE}.")
    
    
    def ns_find(elem: etree.ElementBase, query: str):
        """Wrapper for lxml's `find()` function that automatically uses the default
        namespace for all unprefixed tag names.

        :param elem: element to search from; its `nsmap` supplies the default
            namespace
        :param query: ElementPath expression handed to `elem.find()`
        :return: the result of `elem.find()` for that query
        """
        # `nsmap` only holds a `None` key when a default namespace is in scope.
        # Without one, unprefixed names have to stay un-namespaced, so fall back
        # to a plain search instead of failing with a KeyError.
        default_ns = elem.nsmap.get(None)
        namespaces = {"": default_ns} if default_ns is not None else None
        return elem.find(query, namespaces=namespaces)
    
    
    def ns_findall(elem: etree.ElementBase, query: str):
        """Wrapper for lxml's `findall()` function that automatically uses the default
        namespace for all unprefixed tag names.

        :param elem: element to search from; its `nsmap` supplies the default
            namespace
        :param query: ElementPath expression handed to `elem.findall()`
        :return: the result of `elem.findall()` for that query
        """
        # `nsmap` only holds a `None` key when a default namespace is in scope.
        # Without one, unprefixed names have to stay un-namespaced, so fall back
        # to a plain search instead of failing with a KeyError.
        default_ns = elem.nsmap.get(None)
        namespaces = {"": default_ns} if default_ns is not None else None
        return elem.findall(query, namespaces=namespaces)
    
    
    def ns_localname(elem: etree.ElementBase) -> str:
        """Return the tag name of `elem` without its namespace part."""
        qualified_name = etree.QName(elem)
        return qualified_name.localname
    
    
    def parse_manifest(graph: Graph):
        """Read `imsmanifest.xml` from `SOURCE_DIR` and fill `graph` with the
        modules hierarchy it describes.

        Every `item` directly under the first `organization` element found is
        handed to `parse_manifest_rec`, then attached below `MacaoRoot` and given
        its position among the top-level items.
        """
        manifest_path = f"{SOURCE_DIR}/imsmanifest.xml"
        manifest_root = etree.parse(manifest_path, None).getroot()
        organization = ns_find(manifest_root, ".//organization")
        top_level_items = ns_findall(organization, "item")
        for position, item in enumerate(top_level_items):
            top_module = NS[item.get("identifier")]
            # Describe the module and all its descendants...
            parse_manifest_rec(graph, item)
            # ...then hook it under the root and record its order
            graph.add((top_module, RDFS.subClassOf, NS["MacaoRoot"]))
            add_index(graph, top_module, position)
    
    
    def parse_manifest_rec(
        graph: Graph,
        elem,
        parentResource: Optional[URIRef] = None,
        index: Optional[int] = None,
    ):
        """Recursively register one manifest `item` and its descendants in `graph`.

        An item whose identifier starts with ``MosMod`` is recorded as a
        `Module` and its child items are visited in turn. Any other item is
        recorded as a `SousPartie` (subsection) and its HTML file is handed to
        `extract_mosetp.parse_mosetp`.

        :param graph: graph receiving the triples
        :param elem: manifest `item` element to process
        :param parentResource: parent element in the tree, as a `rdflib` resource
        :param index: index (order) among sibling elements; only recorded when
            `parentResource` is given
        """
        # Title and identifier of the item
        item_title: str = ns_find(elem, "title").text
        item_id: str = elem.get("identifier")

        # Properties shared by modules and subsections
        node = NS[item_id]
        graph.add((node, RDF.type, OWL.NamedIndividual))
        add_title(graph, node, item_title)

        # The identifier prefix decides the class and the containment property
        is_module = item_id.startswith("MosMod")
        if is_module:
            class_name, containment = "Module", "contientModule"
        else:
            class_name, containment = "SousPartie", "contientSousPartie"
        graph.add((node, RDF.type, NS[class_name]))

        # Link to the parent, if there is one
        if parentResource is not None:
            graph.add((parentResource, NS[containment], node))
            graph.add((node, RDFS.subClassOf, parentResource))
            if index is not None:
                add_index(graph, node, index)

        if not is_module:
            # Subsection: parse its list of pages
            extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{item_id}.html", item_id)
            return

        # Module: recurse on child items
        for position, child_item in enumerate(ns_findall(elem, "item")):
            parse_manifest_rec(graph, child_item, node, position)
    
    
    def main():
        """Run the extraction: create the graph, fill it from the manifest,
        then export it.
        """
        graph = create_graph()
        parse_manifest(graph)
        export_graph(graph)
    
    
    # Run the extraction only when this file is executed as a script, not when
    # it is imported as a module.
    if __name__ == "__main__":
        main()