Skip to content
Snippets Groups Projects
Commit 82676c5c authored by David Rouquet
Browse files

Commentaires de type Doc

parent 84345ca7
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,7 @@ from markitdown import MarkItDown
import tempfile
import re
from glob import glob
from bs4 import BeautifulSoup
from common import *
......@@ -34,6 +35,18 @@ def construct_while(g: Graph, query: str):
while construct(g, query) > 0:
pass
def markFileDown(filepath):
    """Convert the HTML file at *filepath* to Markdown text.

    The file is read as ISO 8859-1 and every ``</p>`` is followed by an
    ``@BR@`` marker before conversion. After MarkItDown has produced the
    Markdown, each ``@BR@`` marker is replaced by a blank line and the
    ``'| Imprimer '`` fragment is reduced to ``'|'``.

    Args:
        filepath: Path of the HTML file to convert.

    Returns:
        The post-processed Markdown text.

    Raises:
        FileNotFoundError: If *filepath* does not exist (raised by ``open``).
    """
    with open(filepath, 'r', encoding='ISO 8859-1') as f0:
        str0 = f0.read().replace("</p>", "</p>@BR@")

    # Write through the temporary file's own handle, with an explicit
    # encoding, and keep it inside a ``with`` block so the file is closed
    # and removed deterministically instead of whenever the object happens
    # to be garbage-collected.
    with tempfile.NamedTemporaryFile('w', suffix=".html", encoding='utf-8') as tmp:
        tmp.write(str0)
        # Make sure the content is on disk before MarkItDown reads the path.
        tmp.flush()
        # Convert the body to MD
        body_content_md = MarkItDown().convert(tmp.name).text_content

    return body_content_md.replace('| Imprimer ', '|').replace('@BR@', '\n\n')
def prepareHTMLforMD(str):
# Identify audio content and add markups to identify them in the MD
regexAV = re.compile(r".*?PF_clipAV\('.*?', '(.*?).swf',.*")
......@@ -44,22 +57,23 @@ def prepareHTMLforMD(str):
# Identify words supporting comments and add markups to identify them in the MD
regexANCHOR= re.compile(r"<a class=\"STY_lienComt\" href=\"javascript:CRS_afficherDetail\('(.)'\)\" onclick=\"enrPosClic\(event,'.'\)\">(.*?)</a>")
str = regexANCHOR.sub(r"@ANCHORSTART@\1@\2@ANCHOREND", str)
# Identify anchor words with related comment id and add markups to identify them in the MD
regexANCHOR= re.compile(r"<a class=\"STY_lienComt\" href=\"javascript:CRS_afficherDetail\('(.)'\)\" onclick=\"enrPosClic\(event,'.'\)\">(.*?)</a>")
str = regexANCHOR.sub(r"@ANCHORSTART@\1@\2@ANCHOREND", str)
# Identify comments ids and add markups to identify them in the MD
regexCOMMENTID= re.compile(r"<div id=\"divCmt(.)\" onclick=\"SPE_clicDansBulle\(event,'.*?'\)\">")#(.*)</div>", re.MULTILINE)
str = regexCOMMENTID.sub(r"@COMMENTIDSTART@\1@COMMENTIDEND", str)
# Identify anchor words that support Doc
regexANCHOR2= re.compile(r"<a.*?ouvrirDoc\('(.*?)','(.*?)'\)\">(.*?\n?.*?)</a>", re.MULTILINE)
str = regexANCHOR2.sub(r"@ANCHORSTART@commentfile_\1.\2@\3@ANCHOREND", str)
str = str.replace(r"\_","_")
return(str)
def postEditMD(str):
def postEditMD(str1):
str1 = str1.replace(r"\_","_")
###################################################
# Add audio players for audio extracted from SWF
###################################################
regexAV = re.compile(r"@AUDIOSTART.*?@AUDIOEND")
for audioElt in regexAV.findall(str):
for audioElt in regexAV.findall(str1):
audioFolder = audioElt.replace("@AUDIOSTART","").replace("@AUDIOEND","").replace(r"\_","_")
audioPaths = glob(HUGO_MEDIA_DIR+"/"+audioFolder+"/*.mp3")
audioStr = ''
......@@ -67,47 +81,62 @@ def postEditMD(str):
audioFile = audioPath.split('/')[-1]
#audioStr += r'{{< audio id="'+audioFile+r'" src="media/'+audioFolder+r'/'+audioFile+r'" >}}'
audioStr += '<div><audio id="'+audioFile+'" ><source src="/macao-hugo/media/'+audioFolder+r'/'+audioFile+'" type="audio/mpeg"></audio><button onclick="document.getElementById(\''+audioFile+'\').play()">Play</button></div>'
str = str.replace(audioElt, audioStr)
str1 = str1.replace(audioElt, audioStr)
###################################################
###################################################
# Add html code to MD for images
###################################################
regexIMG = re.compile(r"@IMAGESTART.*?@IMAGEEND")
for imageElt in regexIMG.findall(str):
for imageElt in regexIMG.findall(str1):
imgFilename = imageElt.replace("@IMAGESTART","").replace("@IMAGEEND","").replace(r"\_","_")
imgHtml = '<img class="inlineImage" src="/macao-hugo/media/'+imgFilename+'" id="'+imgFilename+'"/>'
str = str.replace(imageElt, imgHtml)
str1 = str1.replace(imageElt, imgHtml)
###################################################
###################################################
# Add html code to MD for comment anchors
###################################################
commentaireInfo_md = ''
i=0
for match in re.findall(r'@ANCHORSTART@(commentfile_.*?)@', str1):
i += 1
filepath="/home/daxid/DEV/MACAO/macao-legacy/Basilisk/MACAO/macao_12/contenu/pages/"+match.replace("commentfile_","")
try:
file = open(filepath, 'r', encoding='utf-8')
file.close()
md = markFileDown(filepath)
commentaireInfo_md = '<div class="commentaireInfo" commentaireId="'+str(i)+'">'+md+'</div>'
str1 = str1.replace(match,str(i))
except FileNotFoundError:
pass
#print(f"File {filepath} not found.")
# First for the words supporting the comments
regexANCHOR = re.compile(r"@ANCHORSTART@(.)@(.*?)@ANCHOREND")
str = regexANCHOR.sub(r"<span spanId='\1'>\2</span>", str)
regexANCHOR = re.compile(r"@ANCHORSTART@(.*?)@(.*?)@ANCHOREND")
str1 = regexANCHOR.sub(r'<span spanId="\1">\2</span>', str1)
# For the comments themselves
if '@COMMENTIDSTART' in str :
str = str.replace('@COMMENTIDSTART@','<div class="commentaireInfo" commentaireId="').replace('@COMMENTIDEND','">') + '</div>'
if '@COMMENTIDSTART' in str1 :
str1 = str1.replace('@COMMENTIDSTART@','<div class="commentaireInfo" commentaireId="').replace('@COMMENTIDEND','">') + '</div>'
###################################################
# Add html for images and fix media paths
###################################################
regexIMG = re.compile(r"!\[\]\(\.\./media/(.*?)\)")
str = regexIMG.sub(r"<img src='/macao-hugo/media/\1'>", str)
str1 = regexIMG.sub(r"<img src='/macao-hugo/media/\1'>", str1)
###################################################
# Some global string replacement
###################################################
str = str.replace("""Pour vous enregistrer ou interrompre
str1 = str1.replace("""Pour vous enregistrer ou interrompre
l'enregistrement, cliquez sur le bouton rouge.
Pour vous réécouter, cliquez sur la flèche
noire.""","")
str = str.replace("hauts-parleurs","flèches")
str = str.replace('\n<img class="inlineImage"','<img class="inlineImage"')
str1 = str1.replace("hauts-parleurs","flèches")
str1 = str1.replace('\n<img class="inlineImage"','<img class="inlineImage"')
return(str)
return(str1, commentaireInfo_md)
def transform_html(graph: Graph):
......@@ -159,7 +188,14 @@ def transform_html(graph: Graph):
f.write(desc_str)
mid = MarkItDown()
tmp_md = mid.convert(tmp.name).text_content
desc_md = postEditMD(tmp_md)
(desc_md, commentaireInfo_md) = postEditMD(tmp_md)
if commentaireInfo_md != "":
l0 = list(t)
l0[2] = Literal(commentaireInfo_md)
l0[1] = NS["commentaireInfo_md"]
graph.add(tuple(l0))
l = list(t)
l[2] = Literal(desc_md)
l[1] = NS[prop+'_md']
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment