Skip to content
Snippets Groups Projects
Select Git revision
  • 84345ca75b21e133f07fe49b16eb88958d400e4e
  • main default protected
  • export
  • 28-conversion-tests
  • extraction
  • exploration
  • exploration-old
  • 2-encoding-fix
  • main-old
9 results

transform.py

Blame
  • transform.py 6.66 KiB
    from rdflib import OWL, RDF, Graph, Literal
    from lxml import html
    #import pandoc
    from markitdown import MarkItDown
    import tempfile
    import re
    from glob import glob
    
    from common import *
    
    # Module-level logger for this script. `get_logger` is not defined in this
    # file; presumably it comes from `from common import *` — TODO confirm.
    log = get_logger("transform")
    
    
    def construct(g: Graph, query: str):
        """Run a SPARQL `CONSTRUCT` query on `g` and merge the result into `g`.

        Only the triples that `g` does not already contain are added; the
        graph is modified in-place.

        :param g: Graph that is both queried and extended
        :param query: SPARQL `CONSTRUCT` query text
        :return: Number of *new* triples added (0 when the query result
            carries no graph)
        """
        constructed = g.query(query).graph
        if constructed is None:
            return 0

        # Keep only what the graph does not know yet, so that the returned
        # count reflects the actual growth of the graph
        fresh = constructed - g
        n = len(fresh)
        g += fresh
        if n:
            log.info(f"\tConstructed {n} triples")
        return n
    
    
    def construct_while(g: Graph, query: str):
        """Apply a CONSTRUCT query to the graph over and over, in-place, until
        a pass adds no new triple.

        :param g: Graph that is queried and extended
        :param query: SPARQL `CONSTRUCT` query text
        """
        added = construct(g, query)
        while added > 0:
            added = construct(g, query)
    
    def prepareHTMLforMD(str):
        """Mark audio clips in an HTML fragment before it is converted to Markdown.

        Every line containing a `PF_clipAV('...', '<path>.swf', ...)` call is
        replaced, as a whole, by the marker `@AUDIOSTART<path>@AUDIOEND`, so
        anything else on that line is dropped. `postEditMD` later turns these
        markers into audio players. Lines without such a call are left
        untouched.

        :param str: HTML text (the name shadows the builtin `str`; it is kept
            because it is part of the function's signature)
        :return: The text with the audio markers inserted
        """
        # The dot of the `.swf` extension is escaped so that it only matches a
        # literal dot. `.` does not match newlines, hence the leading `.*?` and
        # the trailing `.*` swallow the rest of the line around the call.
        regexAV = re.compile(r".*?PF_clipAV\('.*?', '(.*?)\.swf',.*")
        return regexAV.sub(r"@AUDIOSTART\1@AUDIOEND", str)
    
    def postEditMD(str):
        """Post-process the Markdown obtained from a prepared HTML fragment.

        Two rewrites are applied, in this order:

        1. Every `@AUDIOSTART<folder>@AUDIOEND` marker is replaced by one
           `<audio>` element plus a "Play" button for each `*.mp3` file found
           in `HUGO_MEDIA_DIR/<folder>`, in sorted file-name order. A marker
           whose folder holds no MP3 file is replaced by an empty string.
        2. Markdown images with an empty alt text that point to
           `../media/...` become `<img>` tags rooted at `/macao-hugo/media/`.

        :param str: Markdown text (the name shadows the builtin `str`; it is
            kept because it is part of the function's signature)
        :return: The rewritten text
        """
        import os  # local import: `os` is not imported at module level

        md_text = str

        ###################################################
        # Add audio players for audio extracted from SWF
        ###################################################
        regexAV = re.compile(r"@AUDIOSTART.*?@AUDIOEND")
        for audioElt in regexAV.findall(md_text):
            # Strip the marker delimiters; `\_` is turned back into `_`,
            # presumably because the Markdown conversion escapes underscores
            audioFolder = (
                audioElt.replace("@AUDIOSTART", "")
                .replace("@AUDIOEND", "")
                .replace(r"\_", "_")
            )
            # glob() returns paths in arbitrary order: sort them so that the
            # generated HTML is the same on every run
            audioPaths = sorted(glob(HUGO_MEDIA_DIR + "/" + audioFolder + "/*.mp3"))
            # basename() copes with whatever separator the platform uses
            audioFiles = [os.path.basename(audioPath) for audioPath in audioPaths]
            audioStr = "".join(
                f'<div><audio id="{audioFile}" >'
                f'<source src="/macao-hugo/media/{audioFolder}/{audioFile}" type="audio/mpeg"></audio>'
                f'<button onclick="document.getElementById(\'{audioFile}\').play()">Play</button></div>'
                for audioFile in audioFiles
            )
            md_text = md_text.replace(audioElt, audioStr)
        ###################################################

        # Add html for images and fix media paths
        regexIMG = re.compile(r"!\[\]\(\.\./media/(.*?)\)")
        return regexIMG.sub(r"<img src='/macao-hugo/media/\1'>", md_text)
    
    def transform_html(graph: Graph):
        """Add a Markdown version of every HTML-valued property, in-place.

        For each triple `(s, :<prop>, html)` where `<prop>` is one of the HTML
        properties listed below, a triple `(s, :<prop>_md, markdown)` is added
        to the graph. The Markdown value is, in order of precedence:

        1. the object of an existing `(s, :<prop>_md_manual_edition, ?)`
           triple, when there is one (the first one found is used);
        2. otherwise the HTML literal run through `prepareHTMLforMD`,
           MarkItDown and `postEditMD`.

        :param graph: Graph the HTML literals are read from and the `_md`
            triples are added to
        """
        import os  # local import: `os` is not imported at module level

        html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description']

        # NOTE: two earlier approaches were abandoned in favour of MarkItDown:
        # cleaning up the audio <script> tags with lxml (relying on
        # `HtmlElement.drop_tree()` keeping the tail text), and converting the
        # HTML with Pandoc.
        # NOTE(review): the original code also had a disabled stub for adding
        # manually edited `surveyjs_json` to the graph; it is not implemented.

        # A single converter is enough for all the literals
        converter = MarkItDown()

        # MarkItDown is given a file path here, so each fragment goes through
        # a scratch file. The directory (and the file in it) is removed when
        # the `with` block exits, even if a conversion fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            html_path = os.path.join(tmp_dir, "fragment.html")

            # Process all html content through Markitdown
            for prop in html_properties:
                md_prop = NS[prop + '_md']
                manual_prop = NS[prop + "_md_manual_edition"]

                # Snapshot the matching triples: the graph is modified inside
                # the loop and must not be iterated lazily at the same time
                for subject, _, html_literal in list(graph.triples((None, NS[prop], None))):
                    # Manually edited content wins over the automatic
                    # conversion, so the conversion is skipped altogether
                    manual_md = next(graph.objects(subject, manual_prop), None)
                    if manual_md is not None:
                        graph.add((subject, md_prop, manual_md))
                        continue

                    # Explicit encoding: do not depend on the locale default
                    with open(html_path, 'w', encoding="utf-8") as f:
                        f.write(prepareHTMLforMD(html_literal))
                    desc_md = postEditMD(converter.convert(html_path).text_content)
                    graph.add((subject, md_prop, Literal(desc_md)))
    
    
    
    def main():
        """Build the full Macao graph and serialize it as Turtle.

        Steps:

        1. Parse `RDF_SCHEMA_FILE`, `RDF_MANUAL_EDITION_FILE` and
           `RDF_CONTENT_FILE` into a single graph.
        2. Add Markdown versions of the HTML properties (`transform_html`).
        3. Materialize inferences with repeated CONSTRUCT queries: transitive
           `rdfs:subClassOf`, inherited `rdf:type`, and properties inherited
           through `rdfs:subPropertyOf`.
        4. Replace the existing `owl:Ontology` declarations with a single new
           one and write the result to `RDF_FULL_FILE`.
        """
        # Imported explicitly: the module header only imports OWL, RDF, Graph
        # and Literal from rdflib, so these two names would otherwise have to
        # leak in through `from common import *` (presumably they do — TODO
        # confirm).
        from rdflib import RDFS, URIRef

        # Load graph
        graph = Graph()
        graph.bind("", NS)
        graph.parse(RDF_SCHEMA_FILE)
        graph.parse(RDF_MANUAL_EDITION_FILE)
        graph.parse(RDF_CONTENT_FILE)

        transform_html(graph)

        # Apply property 'subClassOf' transitively, except on the "fake" class
        # hierarchy based on MacaoRoot
        log.info("Adding transitive subclasses...")
        q_transitive_subclass = """
            CONSTRUCT {
                ?a rdfs:subClassOf ?c
            } WHERE {
                ?a rdfs:subClassOf ?b .
                ?b rdfs:subClassOf ?c .
                ?b rdfs:subClassOf :MacaoObject
                FILTER(?a != owl:Nothing && ?c != owl:Thing)
            }
        """
        construct_while(graph, q_transitive_subclass)

        # Apply type inheritance (rdfs:subClassOf)
        log.info("Adding supertypes...")
        construct_while(
            graph,
            """
            CONSTRUCT {
                ?subj rdf:type ?supertype
            } WHERE {
                ?subj a ?type .
                ?type rdfs:subClassOf ?supertype
            }
            """,
        )

        # Apply property inheritance (rdfs:subPropertyOf)
        log.info("Adding super-properties...")
        construct_while(
            graph,
            """
            CONSTRUCT {
                ?subj ?superprop ?obj
            } WHERE {
                ?subj ?prop ?obj .
                ?prop rdfs:subPropertyOf ?superprop .
                FILTER(?superprop != owl:topDataProperty)
            }
            """,
        )

        # ==> Save
        # Remove dependency on previous ontologies.
        # The subjects are collected first: triples are removed inside the
        # loop, which must not happen while the graph is still being iterated.
        for ontology in set(graph.subjects(RDF.type, OWL.Ontology)):
            graph.remove((ontology, None, None))
        # Declare new ontology
        onto_uri = URIRef("http://www.semanticweb.org/eliott/ontologies/2024/4/macao-full")
        graph.add((onto_uri, RDF.type, OWL.Ontology))
        graph.add((onto_uri, RDFS.label, Literal("macao-full")))
        graph.serialize(RDF_FULL_FILE, "turtle", base=NS)
    
    
    # Run the transformation only when the file is executed as a script
    if __name__ == "__main__":
        main()