diff --git a/tetras_extraction/script/src/transform.py b/tetras_extraction/script/src/transform.py index 74ad2ee6fda68f250147fb4beb5a0080bcbe6d50..8f028de47f6225e87392ec5833d3f306d0bac285 100644 --- a/tetras_extraction/script/src/transform.py +++ b/tetras_extraction/script/src/transform.py @@ -5,6 +5,7 @@ from markitdown import MarkItDown import tempfile import re from glob import glob +from bs4 import BeautifulSoup from common import * @@ -34,6 +35,18 @@ def construct_while(g: Graph, query: str): while construct(g, query) > 0: pass +def markFileDown(filepath): + with open(filepath, 'r', encoding='ISO 8859-1') as f0: + str0 = f0.read().replace("</p>", "</p>@BR@") + tmp = tempfile.NamedTemporaryFile(suffix=".html") + with open(tmp.name, 'w') as f: + f.write(str0) + # Convert the body to MD + mid = MarkItDown() + body_content_md = mid.convert(tmp.name).text_content.replace('| Imprimer ','|').replace('@BR@','\n\n') + return(body_content_md) + + def prepareHTMLforMD(str): # Identify audio content and add markups to identify them in the MD regexAV = re.compile(r".*?PF_clipAV\('.*?', '(.*?).swf',.*") @@ -44,22 +57,23 @@ def prepareHTMLforMD(str): # Identify words supporting comments and add markups to identify them in the MD regexANCHOR= re.compile(r"<a class=\"STY_lienComt\" href=\"javascript:CRS_afficherDetail\('(.)'\)\" onclick=\"enrPosClic\(event,'.'\)\">(.*?)</a>") str = regexANCHOR.sub(r"@ANCHORSTART@\1@\2@ANCHOREND", str) - # Identify anchor words with related comment id and add markups to identify them in the MD - regexANCHOR= re.compile(r"<a class=\"STY_lienComt\" href=\"javascript:CRS_afficherDetail\('(.)'\)\" onclick=\"enrPosClic\(event,'.'\)\">(.*?)</a>") - str = regexANCHOR.sub(r"@ANCHORSTART@\1@\2@ANCHOREND", str) # Identify comments ids and add markups to identify them in the MD regexCOMMENTID= re.compile(r"<div id=\"divCmt(.)\" onclick=\"SPE_clicDansBulle\(event,'.*?'\)\">")#(.*)</div>", re.MULTILINE) str = regexCOMMENTID.sub(r"@COMMENTIDSTART@\1@COMMENTIDEND", str) - + # Identify anchor words that support Doc + regexANCHOR2= re.compile(r"<a.*?ouvrirDoc\('(.*?)','(.*?)'\)\">(.*?\n?.*?)</a>", re.MULTILINE) + str = regexANCHOR2.sub(r"@ANCHORSTART@commentfile_\1.\2@\3@ANCHOREND", str) + str = str.replace(r"\_","_") return(str) -def postEditMD(str): +def postEditMD(str1): + str1 = str1.replace(r"\_","_") ################################################### # Add audio players for audio extracted from SWF ################################################### regexAV = re.compile(r"@AUDIOSTART.*?@AUDIOEND") - for audioElt in regexAV.findall(str): + for audioElt in regexAV.findall(str1): audioFolder = audioElt.replace("@AUDIOSTART","").replace("@AUDIOEND","").replace(r"\_","_") audioPaths = glob(HUGO_MEDIA_DIR+"/"+audioFolder+"/*.mp3") audioStr = '' @@ -67,47 +81,62 @@ def postEditMD(str): audioFile = audioPath.split('/')[-1] #audioStr += r'{{< audio id="'+audioFile+r'" src="media/'+audioFolder+r'/'+audioFile+r'" >}}' audioStr += '<div><audio id="'+audioFile+'" ><source src="/macao-hugo/media/'+audioFolder+r'/'+audioFile+'" type="audio/mpeg"></audio><button onclick="document.getElementById(\''+audioFile+'\').play()">Play</button></div>' - str = str.replace(audioElt, audioStr) + str1 = str1.replace(audioElt, audioStr) ################################################### ################################################### # Add html code to MD for images ################################################### regexIMG = re.compile(r"@IMAGESTART.*?@IMAGEEND") - for imageElt in regexIMG.findall(str): + for imageElt in regexIMG.findall(str1): imgFilename = imageElt.replace("@IMAGESTART","").replace("@IMAGEEND","").replace(r"\_","_") imgHtml = '<img class="inlineImage" src="/macao-hugo/media/'+imgFilename+'" id="'+imgFilename+'"/>' - str = str.replace(imageElt, imgHtml) + str1 = str1.replace(imageElt, imgHtml) ################################################### ################################################### # Add html code to MD for comment anchors ################################################### + commentaireInfo_md = '' + i=0 + for match in re.findall(r'@ANCHORSTART@(commentfile_.*?)@', str1): + i += 1 + filepath="/home/daxid/DEV/MACAO/macao-legacy/Basilisk/MACAO/macao_12/contenu/pages/"+match.replace("commentfile_","") + try: + file = open(filepath, 'r', encoding='utf-8') + file.close() + md = markFileDown(filepath) + commentaireInfo_md = '<div class="commentaireInfo" commentaireId="'+str(i)+'">'+md+'</div>' + str1 = str1.replace(match,str(i)) + except FileNotFoundError: + pass + #print(f"File {filepath} not found.") + # First for the words supporting the comments - regexANCHOR = re.compile(r"@ANCHORSTART@(.)@(.*?)@ANCHOREND") - str = regexANCHOR.sub(r"<span spanId='\1'>\2</span>", str) + regexANCHOR = re.compile(r"@ANCHORSTART@(.*?)@(.*?)@ANCHOREND") + str1 = regexANCHOR.sub(r'<span spanId="\1">\2</span>', str1) # For the comments themselves - if '@COMMENTIDSTART' in str : - str = str.replace('@COMMENTIDSTART@','<div class="commentaireInfo" commentaireId="').replace('@COMMENTIDEND','">') + '</div>' + if '@COMMENTIDSTART' in str1 : + str1 = str1.replace('@COMMENTIDSTART@','<div class="commentaireInfo" commentaireId="').replace('@COMMENTIDEND','">') + '</div>' ################################################### # Add html for images and fix media paths ################################################### regexIMG = re.compile(r"!\[\]\(\.\./media/(.*?)\)") - str = regexIMG.sub(r"<img src='/macao-hugo/media/\1'>", str) + str1 = regexIMG.sub(r"<img src='/macao-hugo/media/\1'>", str1) ################################################### # Some global string replacement ################################################### - str = str.replace("""Pour vous enregistrer ou interrompre + str1 = str1.replace("""Pour vous enregistrer ou interrompre l'enregistrement, cliquez sur le bouton rouge. Pour vous réécouter, cliquez sur la flèche noire.""","") - str = str.replace("hauts-parleurs","flèches") - str = str.replace('\n<img class="inlineImage"','<img class="inlineImage"') + str1 = str1.replace("hauts-parleurs","flèches") + str1 = str1.replace('\n<img class="inlineImage"','<img class="inlineImage"') - return(str) + return(str1, commentaireInfo_md) def transform_html(graph: Graph): @@ -159,7 +188,14 @@ def transform_html(graph: Graph): f.write(desc_str) mid = MarkItDown() tmp_md = mid.convert(tmp.name).text_content - desc_md = postEditMD(tmp_md) + (desc_md, commentaireInfo_md) = postEditMD(tmp_md) + + if commentaireInfo_md != "": + l0 = list(t) + l0[2] = Literal(commentaireInfo_md) + l0[1] = NS["commentaireInfo_md"] + graph.add(tuple(l0)) + l = list(t) l[2] = Literal(desc_md) l[1] = NS[prop+'_md']