# transform.py

from rdflib import OWL, RDF, Graph, Literal
from lxml import html
#import pandoc
from markitdown import MarkItDown
import tempfile
import re
from glob import glob

from common import *

# Module-level logger used by the functions below. `get_logger` is not defined
# in this file; presumably it is provided by `from common import *` — verify.
log = get_logger("transform")


def construct(g: Graph, query: str):
    """Run a SPARQL `CONSTRUCT` query and merge its result into `g` in-place.

    Only triples that are not already in `g` are added.

    :param g: Graph that is both queried and extended.
    :param query: Text of the SPARQL `CONSTRUCT` query.
    :return: Number of *new* triples (i.e. not present in the initial graph);
        0 when the query result carries no graph.
    """
    produced = g.query(query).graph
    if produced is None:
        return 0

    # Graph difference keeps only the triples `g` does not hold yet.
    fresh = produced - g
    count = len(fresh)
    g += fresh
    if count:
        log.info(f"\tConstructed {count} triples")
    return count


def construct_while(g: Graph, query: str):
    """Apply a CONSTRUCT query to `g` repeatedly until a pass adds nothing.

    Each pass goes through `construct`, which extends `g` in-place and
    reports how many new triples it added; the loop ends on the first pass
    that reports none.
    """
    while True:
        added = construct(g, query)
        if added <= 0:
            break

# Matches a whole line that contains a `PF_clipAV('<first arg>', '<name>.swf', ...`
# call and captures <name>. The dot before "swf" is escaped so that only a
# literal ".swf" extension is accepted.
_AUDIO_CLIP_RE = re.compile(r".*?PF_clipAV\('.*?', '(.*?)\.swf',.*")


def prepareHTMLforMD(str):
    """Replace audio-clip calls in HTML with plain-text audio markers.

    Every line containing a `PF_clipAV('...', '<name>.swf', ...` call is
    replaced *entirely* (`.` does not match newlines, so the match is bounded
    by the line) with `@AUDIOSTART<name>@AUDIOEND`. The marker is plain text,
    so it can be located again in the generated Markdown.

    :param str: HTML source text. The parameter name shadows the builtin; it
        is kept unchanged for backward compatibility with existing callers.
    :return: The HTML with audio-clip lines replaced by markers; text without
        any such call is returned unchanged.
    """
    return _AUDIO_CLIP_RE.sub(r"@AUDIOSTART\1@AUDIOEND", str)

# Audio markers inserted before the Markdown conversion (non-greedy: one
# marker per match).
_AUDIO_MARKER_RE = re.compile(r"@AUDIOSTART.*?@AUDIOEND")
# Markdown images with an empty alt text that point into `../media/`.
_MEDIA_IMAGE_RE = re.compile(r"!\[\]\(\.\./media/(.*?)\)")


def _audio_player_html(audio_folder, audio_file):
    """Return the HTML snippet (an <audio> element plus a Play button) for one MP3."""
    return (
        f'<div><audio id="{audio_file}" >'
        f'<source src="/macao-hugo/media/{audio_folder}/{audio_file}" type="audio/mpeg"></audio>'
        f"<button onclick=\"document.getElementById('{audio_file}').play()\">Play</button></div>"
    )


def postEditMD(str):
    """Post-process generated Markdown: expand audio markers and fix images.

    1. Each `@AUDIOSTART<folder>@AUDIOEND` marker is replaced by one audio
       player per `*.mp3` file found in `HUGO_MEDIA_DIR/<folder>/`. If the
       folder holds no MP3 file, the marker is replaced by an empty string.
    2. Each `![](../media/<path>)` image is rewritten as an `<img>` tag whose
       source is `/macao-hugo/media/<path>`.

    :param str: Markdown text. The parameter name shadows the builtin; it is
        kept unchanged for backward compatibility with existing callers.
    :return: The post-processed Markdown text.
    """
    # Local import: `os` is not imported at the top of this file.
    from os.path import basename

    for marker in _AUDIO_MARKER_RE.findall(str):
        # Underscores arrive backslash-escaped in the Markdown (presumably
        # added by the HTML-to-Markdown conversion); undo that to recover the
        # real folder name.
        audio_folder = (
            marker.replace("@AUDIOSTART", "")
            .replace("@AUDIOEND", "")
            .replace(r"\_", "_")
        )
        # glob() returns matches in arbitrary order; sort so that the players
        # are emitted in a stable order from one run to the next.
        mp3_paths = sorted(glob(HUGO_MEDIA_DIR + "/" + audio_folder + "/*.mp3"))
        # basename() honours the platform's path separator, unlike splitting
        # on "/" by hand.
        players = "".join(
            _audio_player_html(audio_folder, basename(path)) for path in mp3_paths
        )
        str = str.replace(marker, players)

    # Add html for images and fix media paths
    return _MEDIA_IMAGE_RE.sub(r"<img src='/macao-hugo/media/\1'>", str)

def _html_to_markdown(converter, html_text, html_path):
    """Convert one HTML fragment to post-processed Markdown.

    The fragment is pre-processed with `prepareHTMLforMD`, written to
    `html_path` (overwriting any previous content) so the converter can be
    run on a file path, and the converter's text output is passed through
    `postEditMD`.
    """
    # Explicit UTF-8 so the written file does not depend on the platform's
    # default locale encoding.
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(prepareHTMLforMD(html_text))
    return postEditMD(converter.convert(html_path).text_content)


def transform_html(graph: Graph):
    """Add a Markdown counterpart for every HTML-valued property in `graph`.

    For each triple `(s, NS[prop], html)` with `prop` in the list below, a
    triple `(s, NS[prop + "_md"], value)` is added in-place, where `value` is:

    * the object of `(s, NS[prop + "_md_manual_edition"], ?)` when the graph
      contains such a triple (manually edited content takes precedence; the
      HTML is then not converted at all), or
    * otherwise a `Literal` holding the HTML converted with MarkItDown.

    Earlier approaches (lxml-based <script> clean-up, Pandoc conversion) were
    superseded by MarkItDown.
    NOTE(review): an unfinished stub in a previous revision suggested that
    manually edited `surveyjs_json` content should be merged here as well —
    confirm whether that is still needed.

    :param graph: Graph to read from and extend.
    """
    # Local import: `os` is not imported at the top of this file.
    import os

    html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description']

    # One converter is enough for all fragments.
    converter = MarkItDown()

    # A single scratch file inside a temporary directory is reused for every
    # conversion; the directory and the file are removed when the block exits,
    # even if a conversion raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, "fragment.html")

        for prop in html_properties:
            md_prop = NS[prop + '_md']
            manual_prop = NS[prop + "_md_manual_edition"]

            # Snapshot the matches first: the graph is modified inside the loop.
            for subj, _, html_value in list(graph.triples((None, NS[prop], None))):
                # Manually edited content, when present, replaces the
                # automatic conversion.
                md_value = next(graph.objects(subj, manual_prop), None)
                if md_value is None:
                    md_value = Literal(
                        _html_to_markdown(converter, html_value, html_path)
                    )
                graph.add((subj, md_prop, md_value))


def main():
    """Build the full graph and write it to `RDF_FULL_FILE` as Turtle.

    Steps, in order:

    1. Parse `RDF_SCHEMA_FILE`, `RDF_MANUAL_EDITION_FILE` and
       `RDF_CONTENT_FILE` into a single graph.
    2. Add Markdown versions of the HTML literals (`transform_html`).
    3. Materialize, until no new triple appears: transitive
       `rdfs:subClassOf`, inherited `rdf:type`, and inherited properties
       through `rdfs:subPropertyOf`.
    4. Replace every existing `owl:Ontology` declaration with a single
       "macao-full" ontology and serialize the graph.

    NOTE(review): `URIRef`, `RDFS`, `NS` and the `RDF_*_FILE` constants are
    not imported explicitly in this file; presumably they come from
    `from common import *` — verify.
    """
    # Load graph
    graph = Graph()
    graph.bind("", NS)
    graph.parse(RDF_SCHEMA_FILE)
    graph.parse(RDF_MANUAL_EDITION_FILE)
    graph.parse(RDF_CONTENT_FILE)


    transform_html(graph)

    # Apply property 'subClassOf' transitively, except on the "fake" class
    # hierarchy based on MacaoRoot
    log.info("Adding transitive subclasses...")
    q_transitive_subclass = """
        CONSTRUCT {
            ?a rdfs:subClassOf ?c
        } WHERE {
            ?a rdfs:subClassOf ?b .
            ?b rdfs:subClassOf ?c .
            ?b rdfs:subClassOf :MacaoObject
            FILTER(?a != owl:Nothing && ?c != owl:Thing)
        }
    """
    construct_while(graph, q_transitive_subclass)

    # Apply type inheritance (rdfs:subClassOf)
    log.info("Adding supertypes...")
    construct_while(
        graph,
        """
        CONSTRUCT {
            ?subj rdf:type ?supertype
        } WHERE {
            ?subj a ?type .
            ?type rdfs:subClassOf ?supertype
        }
        """,
    )

    # Apply property inheritance (rdfs:subPropertyOf)
    log.info("Adding super-properties...")
    construct_while(
        graph,
        """
        CONSTRUCT {
            ?subj ?superprop ?obj
        } WHERE {
            ?subj ?prop ?obj .
            ?prop rdfs:subPropertyOf ?superprop .
            FILTER(?superprop != owl:topDataProperty)
        }
        """,
    )

    # ==> Save
    # Remove dependency on previous ontologies.
    # The subjects are snapshotted into a list first so that triples are not
    # removed from the graph while it is still being iterated.
    for ontology in list(graph.subjects(RDF.type, OWL.Ontology)):
        graph.remove((ontology, None, None))
    # Declare new ontology
    onto_uri = URIRef("http://www.semanticweb.org/eliott/ontologies/2024/4/macao-full")
    graph.add((onto_uri, RDF.type, OWL.Ontology))
    graph.add((onto_uri, RDFS.label, Literal("macao-full")))
    graph.serialize(RDF_FULL_FILE, "turtle", base=NS)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()