import os
import re
import tempfile
from glob import glob

from lxml import html
# import pandoc
from markitdown import MarkItDown
from rdflib import OWL, RDF, RDFS, Graph, Literal, URIRef

from common import *

log = get_logger("transform")

# Matches a whole line containing a `PF_clipAV('...', '<name>.swf', ...)` call
# and captures `<name>`.
_AUDIO_CLIP_RE = re.compile(r".*?PF_clipAV\('.*?', '(.*?)\.swf',.*")
# Matches an <img> tag whose source lives under `../media/` and captures the
# path relative to that folder.
_IMG_TAG_RE = re.compile(r"<img.*?src=\"\.\./media/(.*?)\".*?>")
# Placeholders inserted by `prepareHTMLforMD` and resolved by `postEditMD`.
_AUDIO_MARKER_RE = re.compile(r"@AUDIOSTART(.*?)@AUDIOEND")
_IMAGE_MARKER_RE = re.compile(r"@IMAGESTART(.*?)@IMAGEEND")
# Markdown image with empty alt text pointing into `../media/`.
_MD_IMAGE_RE = re.compile(r"!\[\]\(\.\./media/(.*?)\)")

_AUDIO_PLAYER_TEMPLATE = (
    '<div><audio id="{name}" ><source src="/macao-hugo/media/{folder}/{name}"'
    ' type="audio/mpeg"></audio>'
    "<button onclick=\"document.getElementById('{name}').play()\">Play</button></div>"
)
_IMAGE_TEMPLATE = '<img src="/macao-hugo/media/{name}" id="{name}"/>'


def construct(g: Graph, query: str) -> int:
    """Run a SPARQL `CONSTRUCT` query and add the resulting triples to the
    graph, in-place.

    :param g: Graph that is both queried and extended.
    :param query: SPARQL `CONSTRUCT` query text.
    :return: Number of *new* triples (i.e. not present in the initial graph)
    """
    res = g.query(query)
    if res.graph is None:
        return 0

    # Keep only the triples the graph does not already contain, so that the
    # returned count reflects actual growth.
    new_triples = res.graph - g
    n = len(new_triples)
    g += new_triples
    if n > 0:
        log.info(f"\tConstructed {n} triples")
    return n


def construct_while(g: Graph, query: str) -> None:
    """Repeat a CONSTRUCT query, adding triples to the graph, until the query
    stops generating new triples.

    :param g: Graph that is both queried and extended.
    :param query: SPARQL `CONSTRUCT` query text.
    """
    while construct(g, query) > 0:
        pass


def prepareHTMLforMD(str):  # noqa: A002 - parameter name kept for callers
    """Replace audio and image references in an HTML string with text markers.

    The markers are plain text and are turned back into HTML by `postEditMD`
    after the Markdown conversion.

    :param str: HTML source text.
    :return: The text with every line containing a `PF_clipAV(...)` call
        replaced by `@AUDIOSTART<name>@AUDIOEND`, and every `<img>` tag whose
        source is under `../media/` replaced by `@IMAGESTART<path>@IMAGEEND`.
    """
    # Identify audio content and add markers to identify it in the MD
    text = _AUDIO_CLIP_RE.sub(r"@AUDIOSTART\1@AUDIOEND", str)
    # Identify image content and add markers to identify it in the MD
    return _IMG_TAG_RE.sub(r"@IMAGESTART\1@IMAGEEND", text)


def _unescape_underscores(text):
    """Undo the `\\_` escaping found in marker contents after conversion."""
    return text.replace(r"\_", "_")


def _audio_players_html(match):
    """Build one HTML audio player per MP3 file of the folder named by an
    audio marker; returns an empty string when the folder has no MP3 file."""
    folder = _unescape_underscores(match.group(1))
    # Sorted so that the order of the players is reproducible between runs
    # (glob returns files in arbitrary order).
    mp3_paths = sorted(glob(f"{HUGO_MEDIA_DIR}/{folder}/*.mp3"))
    return "".join(
        # basename() instead of splitting on "/" so that paths returned with
        # the platform separator are handled too.
        _AUDIO_PLAYER_TEMPLATE.format(name=os.path.basename(path), folder=folder)
        for path in mp3_paths
    )


def _image_html(match):
    """Build the <img> tag for the file named by an image marker."""
    return _IMAGE_TEMPLATE.format(name=_unescape_underscores(match.group(1)))


def postEditMD(str):  # noqa: A002 - parameter name kept for callers
    """Turn the markers left by `prepareHTMLforMD` into HTML and fix media
    paths in converted Markdown.

    :param str: Markdown text.
    :return: The text where each audio marker is replaced by audio players
        (one per `*.mp3` file found in `HUGO_MEDIA_DIR/<folder>`), each image
        marker by an `<img>` tag, and each `![](../media/<path>)` Markdown
        image by an `<img>` tag, all pointing under `/macao-hugo/media/`.
    """
    # Add audio players for audio extracted from SWF
    text = _AUDIO_MARKER_RE.sub(_audio_players_html, str)
    # Add html code to MD for images
    text = _IMAGE_MARKER_RE.sub(_image_html, text)
    # Add html for images and fix media paths
    return _MD_IMAGE_RE.sub(r"<img src='/macao-hugo/media/\1'>", text)


def _html_to_markdown(converter, html_text):
    """Convert an HTML string to Markdown by passing it to `converter` through
    a temporary file, which is removed afterwards."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, "fragment.html")
        # Explicit encoding so the result does not depend on the locale.
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html_text)
        return converter.convert(html_path).text_content


def transform_html(graph: Graph) -> None:
    """Add a Markdown version of every HTML property value to the graph.

    For each triple `(s, NS[prop], html)` with `prop` in the list below, a
    triple `(s, NS[prop + "_md"], value)` is added in-place. `value` is the
    object of `(s, NS[prop + "_md_manual_edition"], ?)` when such a triple
    exists, otherwise the HTML converted to Markdown with MarkItDown.

    :param graph: Graph to read from and extend.
    """
    html_properties = [
        'commentaireInfo',
        'commentaireSucces',
        'commentaireSugg',
        'html',
        'description',
    ]

    # 1st attempt : clean up audio <script> tags
    #
    # for t in graph.triples((None, NS['description'], None)):
    #     desc_str = t[2]
    #     tree = html.fragment_fromstring(desc_str)
    #     for script in tree.findall(".//script"):
    #         # `HtmlElement.drop_tree()` removes an element along with its
    #         # children and text, however it has an interesting feature :
    #         # the tail text is not removed, but instead joined to the previous
    #         # sibling or parent automatically.
    #         # This means that when we want to replace an element with string,
    #         # we only need to prepend the string to the tail, and it will be
    #         # inserted in the right place
    #         script.tail = script.text + script.tail
    #         script.drop_tree()
    #     pass

    # Process all html content through Pandoc -> We use Markitdown instead at the moment
    # for prop in html_properties:
    #     for t in graph.triples((None, NS[prop], None)) :
    #         desc_str = t[2]
    #         desc_doc = pandoc.read(desc_str, format="html")
    #         desc_md = pandoc.write(desc_doc, format="markdown")
    #         l = list(t)
    #         l[2] = Literal(desc_md)
    #         l[1] = NS[prop+'_md']
    #         graph.add(tuple(l))

    # Add manually edited surveyjs_json to the main graph
    # for t in graph.triples((None, surveyjs_json, None)) :

    # Process all html content through Markitdown
    converter = MarkItDown()
    for prop in html_properties:
        md_predicate = NS[prop + '_md']
        manual_predicate = NS[prop + "_md_manual_edition"]

        # Snapshot the matches first: the loop body adds triples to the graph
        # that is being iterated.
        for subject, _, html_value in list(graph.triples((None, NS[prop], None))):
            # Manually edited content takes precedence over the conversion
            manual_value = next(graph.objects(subject, manual_predicate), None)
            if manual_value is not None:
                graph.add((subject, md_predicate, manual_value))
                continue

            markdown = postEditMD(
                _html_to_markdown(converter, prepareHTMLforMD(html_value))
            )
            graph.add((subject, md_predicate, Literal(markdown)))


def main():
    """Build the full graph and write it to `RDF_FULL_FILE` as Turtle.

    Loads the schema, manual-edition and content files, adds the Markdown
    properties, materializes subclass, type and property inheritance, then
    replaces the ontology declarations with a single "macao-full" one.
    """
    # Load graph
    graph = Graph()
    graph.bind("", NS)
    graph.parse(RDF_SCHEMA_FILE)
    graph.parse(RDF_MANUAL_EDITION_FILE)
    graph.parse(RDF_CONTENT_FILE)

    transform_html(graph)

    # Apply property 'subClassOf' transitively, except on the "fake" class
    # hierarchy based on MacaoRoot
    log.info("Adding transitive subclasses...")
    q_transitive_subclass = """
    CONSTRUCT {
        ?a rdfs:subClassOf ?c
    } WHERE {
        ?a rdfs:subClassOf ?b .
        ?b rdfs:subClassOf ?c .
        ?b rdfs:subClassOf :MacaoObject
        FILTER(?a != owl:Nothing && ?c != owl:Thing)
    }
    """
    construct_while(graph, q_transitive_subclass)

    # Apply type inheritance (rdfs:subClassOf)
    log.info("Adding supertypes...")
    construct_while(
        graph,
        """
    CONSTRUCT {
        ?subj rdf:type ?supertype
    } WHERE {
        ?subj a ?type .
        ?type rdfs:subClassOf ?supertype
    }
    """,
    )

    # Apply property inheritance (rdfs:subPropertyOf)
    log.info("Adding super-properties...")
    construct_while(
        graph,
        """
    CONSTRUCT {
        ?subj ?superprop ?obj
    } WHERE {
        ?subj ?prop ?obj .
        ?prop rdfs:subPropertyOf ?superprop .
        FILTER(?superprop != owl:topDataProperty)
    }
    """,
    )

    # ==> Save
    # Remove dependency on previous ontologies.
    # The subjects are collected first because triples are removed from the
    # graph while handling them.
    for ontology in list(graph.subjects(RDF.type, OWL.Ontology)):
        graph.remove((ontology, None, None))

    # Declare new ontology
    onto_uri = URIRef("http://www.semanticweb.org/eliott/ontologies/2024/4/macao-full")
    graph.add((onto_uri, RDF.type, OWL.Ontology))
    graph.add((onto_uri, RDFS.label, Literal("macao-full")))
    graph.serialize(RDF_FULL_FILE, "turtle", base=NS)


if __name__ == "__main__":
    main()