from lxml import etree
from rdflib import RDFS, Graph, Literal, URIRef
from rdflib.namespace import OWL, RDF

import extract_mosetp
from common import *

# Module-level logger, obtained from `get_logger` under the name "extract"
log = get_logger("extract")

# IRI of the schema ontology; `export_graph` makes the exported content
# ontology import it through `owl:imports`
schema_ontology_uri = URIRef(
    "http://www.semanticweb.org/eliott/ontologies/2024/4/macao"
)
# IRI under which `export_graph` declares the exported graph as an
# `owl:Ontology`
content_ontology_uri = URIRef(
    "http://www.semanticweb.org/eliott/ontologies/2024/4/macao-content"
)


def dump_graph(g: Graph):
    """Write every triple of the graph to standard output, one per line.

    Each line holds the subject, predicate and object separated by spaces.
    """
    for triple in g:
        subject, predicate, value = triple
        print(subject, predicate, value)


def create_graph() -> Graph:
    """Create an empty graph in which `NS` is bound to the empty prefix."""
    graph = Graph()
    # An empty prefix makes `NS` the default namespace of the graph
    graph.bind("", NS)
    return graph


def export_graph(g: Graph):
    """Serialise the graph as Turtle into `RDF_CONTENT_FILE`.

    Before writing, the graph is declared as an OWL ontology labelled
    "macao-content" that imports the schema ontology. These header triples
    are added to `g` itself, so they are part of the triple count that is
    logged once the export is done.
    """
    ontology = content_ontology_uri
    header_triples = (
        (ontology, RDF.type, OWL.Ontology),
        (ontology, RDFS.label, Literal("macao-content")),
        (ontology, OWL.imports, schema_ontology_uri),
    )
    for triple in header_triples:
        g.add(triple)

    g.serialize(RDF_CONTENT_FILE, "turtle", base=NS)
    log.info(f"Exported {len(g)} triples to {RDF_CONTENT_FILE}.")


def ns_find(elem: etree._Element, query: str):
    """Wrapper for lxml's `find()` function that automatically uses the default
    namespace for all unprefixed tag names.

    :param elem: element to search from
    :param query: ElementPath expression, written with unprefixed tag names
    :return: the first matching element, or `None` if nothing matches

    If `elem` has no default namespace in scope, the query is run as-is
    (unprefixed names then match un-namespaced tags) instead of failing
    with a `KeyError`.
    """
    default_ns = elem.nsmap.get(None)
    if default_ns is None:
        # No default namespace declared: nothing to map the empty prefix to
        return elem.find(query)
    return elem.find(query, namespaces={"": default_ns})


def ns_findall(elem: etree._Element, query: str):
    """Wrapper for lxml's `findall()` function that automatically uses the default
    namespace for all unprefixed tag names.

    :param elem: element to search from
    :param query: ElementPath expression, written with unprefixed tag names
    :return: list of all matching elements (empty if nothing matches)

    If `elem` has no default namespace in scope, the query is run as-is
    (unprefixed names then match un-namespaced tags) instead of failing
    with a `KeyError`.
    """
    default_ns = elem.nsmap.get(None)
    if default_ns is None:
        # No default namespace declared: nothing to map the empty prefix to
        return elem.findall(query)
    return elem.findall(query, namespaces={"": default_ns})


def ns_localname(elem: etree._Element) -> str:
    """Return the tag name of `elem` without its namespace part."""
    qualified_name = etree.QName(elem)
    return qualified_name.localname


def parse_manifest(graph: Graph):
    """Read `imsmanifest.xml` from `Context.source_dir` and fill `graph` with
    the hierarchy of modules it describes.

    The first <organization> element found becomes the root module: it is
    declared as a subclass of `MacaoRoot` and receives a display name and an
    index that depend on `Context.version`. Its items are then handled by
    `parse_manifest_rec`.

    :raises ParseError: if the manifest contains no <organization> element
    """
    manifest_path = Context.source_dir + "/imsmanifest.xml"
    manifest_root = etree.parse(manifest_path, None).getroot()
    organization = ns_find(manifest_root, ".//organization")
    if organization is None:
        raise ParseError("Missing node <organization> in manifest")

    # The <organization> element itself is the top-level module
    root_module = NS[organization.get("identifier", default="None")]

    # Display name and index of the root depend on the version being parsed
    if Context.version == "macao_3":
        display_name, root_index = "MACAO 3", 3
    else:
        display_name, root_index = "MACAO", 1

    graph.add((root_module, RDFS.subClassOf, NS["MacaoRoot"]))
    graph.add((root_module, NS["__protege_display_name"], Literal(display_name)))
    graph.add((root_module, NS["index"], Literal(root_index)))

    # Walk down the tree, starting from the organization
    parse_manifest_rec(graph, organization)


def parse_manifest_rec(
    graph: Graph,
    elem: etree._Element,
    parentResource: URIRef | None = None,
    index: int | None = None,
):
    """Add one manifest element (`MosOrg`, `MosMod` or `MosEtp`) and all of its
    descendants to the `graph`.

    Every element is declared as a named individual carrying its identifier
    and title. Modules then recurse on their <item> children; subsections
    hand their HTML page over to `extract_mosetp.parse_mosetp`. An element
    that is neither gets no further triples.

    :param parentResource: parent element in the tree, as a `rdflib` resource
    :param index: index (order) among sibling elements
    """
    title_node = ns_find(elem, "title")
    # Fall back to the string "None" when the element has no <title> child
    title_text = "None" if title_node is None else title_node.text
    item_id: str = elem.get("identifier", default="None")

    # Triples shared by every kind of element
    subject = NS[item_id]
    graph.add((subject, RDF.type, OWL.NamedIndividual))
    graph.add((subject, NS["id"], Literal(item_id)))
    set_title(graph, subject, str(title_text))

    # Pick the class and the containment property matching the element kind
    if is_module(item_id):
        kind, containment = "Module", "contientModule"
    elif is_subsection(item_id):
        kind, containment = "SousPartie", "contientSousPartie"
    else:
        return

    graph.add((subject, RDF.type, NS[kind]))

    # Link to the parent, when there is one
    if parentResource is not None:
        graph.add((parentResource, NS[containment], subject))
        graph.add((subject, RDFS.subClassOf, parentResource))
        if index is not None:
            add_index(graph, subject, index)

    if kind == "Module":
        # Modules contain further items: recurse, keeping the sibling order
        for position, item in enumerate(ns_findall(elem, "item")):
            parse_manifest_rec(graph, item, subject, position)
    else:
        # Subsections are leaves of the manifest: parse their list of pages
        page_path = f"{Context.source_dir}/sco/{item_id}.html"
        extract_mosetp.parse_mosetp(graph, page_path, item_id)


def is_module(id: str):
    """Tell whether the identifier `id` designates a module.

    Identifiers starting with "MosOrg" always count as modules (the
    organization is just the top-level module). Otherwise the expected
    prefix depends on `Context.version`: "MosMod" for "macao_12" and "seq"
    for "macao_3".
    """
    version = Context.version
    if id.startswith("MosOrg"):
        return True
    if version == "macao_12":
        return id.startswith("MosMod")
    if version == "macao_3":
        return id.startswith("seq")
    return False


def is_subsection(id: str):
    """Tell whether the identifier `id` designates a subsection.

    The expected prefix depends on `Context.version`: "MosEtp" for
    "macao_12" and "act" for "macao_3". Any other version yields `False`.
    """
    version = Context.version
    if version == "macao_12":
        return id.startswith("MosEtp")
    if version == "macao_3":
        return id.startswith("act")
    return False


def main():
    """Build the graph from the manifest(s) and export it.

    When `MACAO_VERSION` is "full", the manifests of both "macao_12" and
    "macao_3" are parsed into the same graph, with `Context.version` and
    `Context.source_dir` updated before each pass. Otherwise a single pass
    is made with `Context` left as it is.
    """
    graph = create_graph()

    if MACAO_VERSION != "full":
        parse_manifest(graph)
    else:
        # One pass per version, all feeding the same RDF graph
        for version in ["macao_12", "macao_3"]:
            Context.version = version
            Context.source_dir = f"{SOURCE_DIR}/{Context.version}"
            parse_manifest(graph)

    export_graph(graph)


# Run the extraction only when this file is executed as a script
if __name__ == "__main__":
    main()
