"""Post-process the MACAO RDF graph.

The script loads the schema, manual-edition and content graphs, converts the
HTML literals to Markdown (stored under ``<property>_md`` predicates),
materialises the triples implied by ``rdfs:subClassOf`` and
``rdfs:subPropertyOf``, and serialises the result as Turtle.

``NS``, ``get_logger``, ``HUGO_MEDIA_DIR`` and the ``RDF_*_FILE`` constants are
not defined in this file; they are expected to come from ``common``.
"""
import functools
import os
import re
import tempfile
from glob import glob

from bs4 import BeautifulSoup
from lxml import html
from markdown import markdown
from markitdown import MarkItDown
from rdflib import OWL, RDF, RDFS, Graph, Literal, URIRef

from common import *

log = get_logger("transform")

# Directory holding the legacy pages referenced by ``ouvrirDoc(...)`` links.
# The default is the historical hard-coded location; set the environment
# variable MACAO_COMMENT_PAGES_DIR to run the script on another machine.
COMMENT_PAGES_DIR = os.environ.get(
    "MACAO_COMMENT_PAGES_DIR",
    "/home/daxid/DEV/MACAO/macao-legacy/Basilisk/MACAO/macao_12/contenu/pages/",
)

# --- Patterns used by prepareHTMLforMD (legacy HTML -> placeholder markers) ---
_AUDIO_CLIP_RE = re.compile(r".*?PF_clipAV\('.*?', '(.*?).swf',.*")
_MEDIA_IMG_RE = re.compile(r"<img.*?src=\"\.\./media/(.*?)\".*?>")
_COMMENT_LINK_RE = re.compile(
    r"<a class=\"STY_lienComt\" href=\"javascript:CRS_afficherDetail\('(.)'\)\" onclick=\"enrPosClic\(event,'.'\)\">(.*?)</a>"
)
_COMMENT_DIV_RE = re.compile(
    r"<div id=\"divCmt(.)\" onclick=\"SPE_clicDansBulle\(event,'.*?'\)\">"
)
_DOC_LINK_RE = re.compile(
    r"<a.*?ouvrirDoc\('(.*?)','(.*?)'\)\">(.*?\n?.*?)</a>", re.MULTILINE
)
_SPLIT_ANCHOR_RE = re.compile(r"@(.*)\n(.*)@ANCHOREND")

# --- Patterns used by postEditMD (placeholder markers -> final HTML) ---
_AUDIO_MARK_RE = re.compile(r"@AUDIOSTART.*?@AUDIOEND")
_IMAGE_MARK_RE = re.compile(r"@IMAGESTART.*?@IMAGEEND")
_COMMENT_FILE_RE = re.compile(r"@ANCHORSTART@(commentfile_.*?)@")
_ANCHOR_MARK_RE = re.compile(r"@ANCHORSTART@(.*?)@(.*?)@ANCHOREND")
_MD_IMAGE_RE = re.compile(r"!\[\]\(\.\./media/(.*?)\)")

# Recording instructions that are dropped from the output.  The sentence is
# matched word by word so that it is found whatever whitespace or line breaks
# separate the words in the converted Markdown.
_RECORDING_HINT_RE = re.compile(
    r"\s+".join(
        re.escape(word)
        for word in (
            "Pour vous enregistrer ou interrompre l'enregistrement, cliquez "
            "sur le bouton rouge. Pour vous réécouter, cliquez sur la flèche "
            "noire."
        ).split()
    )
)

# Literal replacements applied first by postEditMD.  They are applied in this
# exact order, which matters (e.g. "Enoncés" before "Enoncé", "A l'oral"
# before "A l").
_TEXT_FIXES = (
    (r"\_", "_"),
    (" - ", "\n- "),
    ("| --- |", ""),
    ("|", ""),
    ("Ecoutez", "Écoutez"),
    ("Ecouter", "Écouter"),
    ("Enoncés", "Énoncés"),
    ("A l'oral", "À l'oral"),
    ("Enoncé", "Énoncé"),
    ("A un bout", "À un bout"),
    ("A l", "À l"),
    ("Vous vous êtes trompé", "Vous vous trompez"),
    ("Pas d'accord", "Vous vous trompez"),
)


def construct(g: Graph, query: str):
    """Run a SPARQL ``CONSTRUCT`` query and add its triples to ``g`` in place.

    :return: Number of *new* triples (i.e. not present in the initial graph);
        0 when the query result carries no graph.
    """
    result = g.query(query)
    if result.graph is None:
        return 0
    new_triples = result.graph - g
    n = len(new_triples)
    g += new_triples
    if n > 0:
        log.info("\tConstructed %s triples", n)
    return n


def construct_while(g: Graph, query: str):
    """Repeat a CONSTRUCT query, adding triples to the graph, until the query
    stops generating new triples."""
    while construct(g, query) > 0:
        pass


@functools.lru_cache(maxsize=1)
def _markitdown():
    """Return the shared MarkItDown converter, created on first use."""
    return MarkItDown()


def _html_to_markdown(html_text):
    """Convert an HTML string to Markdown with MarkItDown.

    MarkItDown is given a file path, so the text is written to a temporary
    ``.html`` file first.  The file is written as UTF-8 and is removed when
    the ``with`` block exits.
    """
    with tempfile.NamedTemporaryFile(
        "w", suffix=".html", encoding="utf-8"
    ) as tmp:
        tmp.write(html_text)
        # Make sure the content is on disk before the converter reads it.
        tmp.flush()
        return _markitdown().convert(tmp.name).text_content


def markFileDown(filepath):
    """Convert the HTML file at ``filepath`` to Markdown.

    The file is read as ISO 8859-1.  Each ``</p>`` is tagged with ``@BR@``
    before conversion and the tag is turned into a blank line afterwards, so
    paragraph breaks survive the conversion.

    :return: The Markdown text.
    :raises FileNotFoundError: if ``filepath`` does not exist.
    """
    with open(filepath, "r", encoding="ISO 8859-1") as source:
        page = source.read().replace("</p>", "</p>@BR@")
    body_content_md = _html_to_markdown(page)
    return body_content_md.replace("| Imprimer ", "|").replace("@BR@", "\n\n")


def prepareHTMLforMD(str):
    """Replace legacy HTML constructs with ``@...@`` placeholder markers.

    The markers are plain text, so they can be found again in the Markdown
    produced from the returned HTML and expanded by ``postEditMD``.

    The parameter keeps its historical name ``str`` (which shadows the
    builtin) so existing callers are unaffected; it is bound to a local name
    immediately.

    :param str: HTML text to prepare.
    :return: The HTML text with placeholders inserted.
    """
    text = str.replace("\r", " ")
    # Audio clips (``PF_clipAV`` calls): keep only the clip name.
    text = _AUDIO_CLIP_RE.sub(r"@AUDIOSTART\1@AUDIOEND", text)
    # Images located under ``../media/``.
    text = _MEDIA_IMG_RE.sub(r"@IMAGESTART\1@IMAGEEND", text)
    # Words that carry an inline comment.
    text = _COMMENT_LINK_RE.sub(r"@ANCHORSTART@\1@\2@ANCHOREND", text)
    # Opening tags of the comment blocks themselves.
    text = _COMMENT_DIV_RE.sub(r"@COMMENTIDSTART@\1@COMMENTIDEND", text)
    # Words linking to a separate document (``ouvrirDoc`` calls).
    text = _DOC_LINK_RE.sub(r"@ANCHORSTART@commentfile_\1.\2@\3@ANCHOREND", text)
    # Join an anchor marker that ended up split over two lines.
    text = _SPLIT_ANCHOR_RE.sub(r"@\1 \2@ANCHOREND", text)
    return text.replace(r"\_", "_").replace("PAGE", "html")


def _expand_audio_marks(text):
    """Replace each audio marker with players for the folder's ``.mp3`` files."""
    for marker in _AUDIO_MARK_RE.findall(text):
        audio_folder = (
            marker.replace("@AUDIOSTART", "")
            .replace("@AUDIOEND", "")
            .replace(r"\_", "_")
        )
        players = []
        # Sorted so that the generated HTML does not depend on directory order.
        for audio_path in sorted(glob(f"{HUGO_MEDIA_DIR}/{audio_folder}/*.mp3")):
            audio_file = os.path.basename(audio_path)
            players.append(
                f'<div><audio id="{audio_file}" >'
                f'<source src="/macao-hugo/media/{audio_folder}/{audio_file}"'
                ' type="audio/mpeg"></audio>'
                f'<button onclick="document.getElementById(\'{audio_file}\')'
                '.play()">Play</button></div>'
            )
        text = text.replace(marker, "".join(players))
    return text


def _expand_image_marks(text):
    """Replace each image marker with an inline ``<img>`` tag."""
    for marker in _IMAGE_MARK_RE.findall(text):
        img_filename = (
            marker.replace("@IMAGESTART", "")
            .replace("@IMAGEEND", "")
            .replace(r"\_", "_")
        )
        img_html = (
            f'<img class="inlineImage" src="/macao-hugo/media/{img_filename}"'
            f' id="{img_filename}"/>'
        )
        text = text.replace(marker, img_html)
    return text


def _inline_comment_files(text):
    """Load the documents referenced by anchors and number the references.

    Returns ``(text, comment_html)``: ``text`` with every resolved
    ``commentfile_<name>`` reference replaced by its number, and
    ``comment_html`` with one ``commentaireInfo`` div per resolved document.
    """
    comment_divs = []
    # ``dict.fromkeys`` drops repeated references while keeping their order,
    # so a document referenced twice gets a single id and a single div.
    references = dict.fromkeys(_COMMENT_FILE_RE.findall(text))
    for comment_id, reference in enumerate(references, start=1):
        page_name = reference.replace("commentfile_", "")
        filepath = f"{COMMENT_PAGES_DIR.rstrip('/')}/{page_name}"
        try:
            simple_html = markdown(markFileDown(filepath))
        except FileNotFoundError:
            # Best effort: the reference is left as is when the page is missing.
            log.warning("File %s not found.", filepath)
            continue
        comment_divs.append(
            f'<div class="commentaireInfo" commentaireId="{comment_id}">'
            f"{simple_html}</div>"
        )
        text = text.replace(reference, f"{comment_id}")
    return text, "".join(comment_divs)


def postEditMD(str1):
    """Clean up converted Markdown and expand the placeholder markers.

    :param str1: Markdown text produced from HTML prepared by
        ``prepareHTMLforMD``.
    :return: ``(markdown_text, commentaireInfo_md)`` where
        ``commentaireInfo_md`` is the HTML of the comment documents referenced
        by the text (one div per document, in order of first reference), or
        ``''`` when there is none.
    """
    # Various string cleaning and replacements.
    for old, new in _TEXT_FIXES:
        str1 = str1.replace(old, new)

    # Audio players for audio extracted from SWF, then inline images.
    str1 = _expand_audio_marks(str1)
    str1 = _expand_image_marks(str1)

    # Comment anchors pointing to separate documents.
    str1, commentaireInfo_md = _inline_comment_files(str1)

    # Words supporting the comments.
    str1 = _ANCHOR_MARK_RE.sub(r'<span spanId="\1">\2</span>', str1)
    # The comments themselves: a single closing tag is appended at the end of
    # the text, whatever the number of opening markers.
    if "@COMMENTIDSTART" in str1:
        str1 = (
            str1.replace(
                "@COMMENTIDSTART@",
                '<div class="commentaireInfo" commentaireId="',
            ).replace("@COMMENTIDEND", '">')
            + "</div>"
        )

    # Markdown images: emit HTML and fix the media path.
    str1 = _MD_IMAGE_RE.sub(r"<img src='/macao-hugo/media/\1'>", str1)

    # Some global string replacements.
    str1 = _RECORDING_HINT_RE.sub("", str1)
    str1 = str1.replace("hauts-parleurs", "flèches")
    str1 = str1.replace('\n<img class="inlineImage"', '<img class="inlineImage"')
    return (str1, commentaireInfo_md)


def transform_html(graph: Graph):
    """Add a Markdown version of every HTML literal to ``graph``, in place.

    For each triple ``(s, NS[prop], html)`` with ``prop`` in the list below, a
    triple ``(s, NS[prop + "_md"], markdown)`` is added.  When ``s`` has a
    ``NS[prop + "_md_manual_edition"]`` value, the first such value is
    post-processed and used instead of the converted text.  Comment documents
    referenced by the HTML are added as ``NS["commentaireInfo_md"]``.
    """
    html_properties = [
        "commentaireInfo",
        "commentaireSucces",
        "commentaireSugg",
        "html",
        "description",
    ]
    # NOTE: earlier experiments, no longer active: stripping the audio
    # <script> tags with lxml (``HtmlElement.drop_tree()`` keeps the tail text,
    # so prepending the replacement text to the tail puts it in the right
    # place), and converting with Pandoc instead of MarkItDown.
    # TODO: add the manually edited ``surveyjs_json`` to the main graph.
    for prop in html_properties:
        md_predicate = NS[prop + "_md"]
        manual_predicate = NS[prop + "_md_manual_edition"]
        # Snapshot the matches: the loop body adds triples to the graph.
        for subj, _pred, obj in list(graph.triples((None, NS[prop], None))):
            desc_str = prepareHTMLforMD(obj)
            desc_md = ""
            if desc_str != "":
                desc_md, commentaireInfo_md = postEditMD(
                    _html_to_markdown(desc_str)
                )
                if commentaireInfo_md != "":
                    graph.add(
                        (
                            subj,
                            NS["commentaireInfo_md"],
                            Literal(commentaireInfo_md),
                        )
                    )

            # Manually edited content takes precedence over the conversion.
            manual_edition = next(
                graph.triples((subj, manual_predicate, None)), None
            )
            if manual_edition is not None:
                desc_md = postEditMD(prepareHTMLforMD(manual_edition[2]))[0]
            graph.add((subj, md_predicate, Literal(desc_md)))


def main():
    """Build the full graph and write it to ``RDF_FULL_FILE`` as Turtle."""
    # Load graph
    graph = Graph()
    graph.bind("", NS)
    graph.parse(RDF_SCHEMA_FILE)
    graph.parse(RDF_MANUAL_EDITION_FILE)
    graph.parse(RDF_CONTENT_FILE)

    transform_html(graph)

    # Apply property 'subClassOf' transitively, except on the "fake" class
    # hierarchy based on MacaoRoot
    log.info("Adding transitive subclasses...")
    q_transitive_subclass = """
    CONSTRUCT {
        ?a rdfs:subClassOf ?c
    } WHERE {
        ?a rdfs:subClassOf ?b .
        ?b rdfs:subClassOf ?c .
        ?b rdfs:subClassOf :MacaoObject
        FILTER(?a != owl:Nothing && ?c != owl:Thing)
    }
    """
    construct_while(graph, q_transitive_subclass)

    # Apply type inheritance (rdfs:subClassOf)
    log.info("Adding supertypes...")
    construct_while(
        graph,
        """
        CONSTRUCT {
            ?subj rdf:type ?supertype
        } WHERE {
            ?subj a ?type .
            ?type rdfs:subClassOf ?supertype
        }
        """,
    )

    # Apply property inheritance (rdfs:subPropertyOf)
    log.info("Adding super-properties...")
    construct_while(
        graph,
        """
        CONSTRUCT {
            ?subj ?superprop ?obj
        } WHERE {
            ?subj ?prop ?obj .
            ?prop rdfs:subPropertyOf ?superprop .
            FILTER(?superprop != owl:topDataProperty)
        }
        """,
    )

    # ==> Save
    # Remove dependency on previous ontologies.  The subjects are collected
    # first because triples are removed inside the loop.
    for ontology in list(graph.subjects(RDF.type, OWL.Ontology)):
        graph.remove((ontology, None, None))
    # Declare new ontology
    onto_uri = URIRef(
        "http://www.semanticweb.org/eliott/ontologies/2024/4/macao-full"
    )
    graph.add((onto_uri, RDF.type, OWL.Ontology))
    graph.add((onto_uri, RDFS.label, Literal("macao-full")))
    graph.serialize(RDF_FULL_FILE, "turtle", base=NS)


if __name__ == "__main__":
    main()