Commentaires de type Doc

82676c5c · David Rouquet · 84345ca7 · 82676c5c
Commit 82676c5c authored 3 months ago by David Rouquet
--- a/tetras_extraction/script/src/transform.py
+++ b/tetras_extraction/script/src/transform.py
@@ -5,6 +5,7 @@ from markitdown import MarkItDown
 import tempfile
 import re
 from glob import glob
+from bs4 import BeautifulSoup

 from common import *

@@ -34,6 +35,18 @@ def construct_while(g: Graph, query: str):
    while construct(g, query) > 0:
        pass

+def markFileDown(filepath):  # Convert the HTML file at `filepath` to Markdown text using MarkItDown and return it.
+    with open(filepath, 'r', encoding='ISO 8859-1') as f0:  # the source file is decoded as ISO 8859-1
+        str0 = f0.read().replace("</p>", "</p>@BR@")  # tag every paragraph end with an @BR@ placeholder
+    tmp = tempfile.NamedTemporaryFile(suffix=".html")  # NOTE(review): this handle is never closed explicitly in this function
+    with open(tmp.name, 'w') as f:  # NOTE(review): no encoding is passed here, so the platform default applies - confirm this is intended
+        f.write(str0)
+    # Convert the tagged HTML to Markdown, replace the '| Imprimer  ' text with a bare '|', and turn each @BR@ placeholder into a blank line
+    mid = MarkItDown()
+    body_content_md = mid.convert(tmp.name).text_content.replace('| Imprimer  ','|').replace('@BR@','\n\n')
+    return(body_content_md)
+
+
 def prepareHTMLforMD(str):
    # Identify audio content and add markups to identify them in the MD
    regexAV = re.compile(r".*?PF_clipAV\('.*?', '(.*?).swf',.*")
@@ -44,22 +57,23 @@ def prepareHTMLforMD(str):
    # Identify words supporting comments and add markups to identify them in the MD
    regexANCHOR= re.compile(r"<a class=\"STY_lienComt\" href=\"javascript:CRS_afficherDetail\('(.)'\)\" onclick=\"enrPosClic\(event,'.'\)\">(.*?)</a>")
    str = regexANCHOR.sub(r"@ANCHORSTART@\1@\2@ANCHOREND", str)
-    # Identify anchor words with related comment id and add markups to identify them in the MD
-    regexANCHOR= re.compile(r"<a class=\"STY_lienComt\" href=\"javascript:CRS_afficherDetail\('(.)'\)\" onclick=\"enrPosClic\(event,'.'\)\">(.*?)</a>")
-    str = regexANCHOR.sub(r"@ANCHORSTART@\1@\2@ANCHOREND", str)
    # Identify comments ids  and add markups to identify them in the MD
    regexCOMMENTID= re.compile(r"<div id=\"divCmt(.)\" onclick=\"SPE_clicDansBulle\(event,'.*?'\)\">")#(.*)</div>", re.MULTILINE)
    str = regexCOMMENTID.sub(r"@COMMENTIDSTART@\1@COMMENTIDEND", str)
-
+    # Identify anchor words that support Doc
+    regexANCHOR2= re.compile(r"<a.*?ouvrirDoc\('(.*?)','(.*?)'\)\">(.*?\n?.*?)</a>", re.MULTILINE)
+    str = regexANCHOR2.sub(r"@ANCHORSTART@commentfile_\1.\2@\3@ANCHOREND", str)
+    str = str.replace(r"\_","_")
    return(str)


-def postEditMD(str):
+def postEditMD(str1):
+    str1 = str1.replace(r"\_","_")
    ###################################################
    # Add audio players for audio extracted from SWF
    ###################################################
    regexAV = re.compile(r"@AUDIOSTART.*?@AUDIOEND")
-    for audioElt in regexAV.findall(str):
+    for audioElt in regexAV.findall(str1):
        audioFolder = audioElt.replace("@AUDIOSTART","").replace("@AUDIOEND","").replace(r"\_","_")
        audioPaths = glob(HUGO_MEDIA_DIR+"/"+audioFolder+"/*.mp3")
        audioStr = ''
@@ -67,47 +81,62 @@ def postEditMD(str):
            audioFile = audioPath.split('/')[-1]
            #audioStr += r'{{< audio id="'+audioFile+r'" src="media/'+audioFolder+r'/'+audioFile+r'" >}}'
            audioStr += '<div><audio id="'+audioFile+'" ><source src="/macao-hugo/media/'+audioFolder+r'/'+audioFile+'" type="audio/mpeg"></audio><button onclick="document.getElementById(\''+audioFile+'\').play()">Play</button></div>'
-        str = str.replace(audioElt, audioStr)
+        str1 = str1.replace(audioElt, audioStr)
    ###################################################

    ###################################################
    # Add html code to MD for images
    ###################################################
    regexIMG = re.compile(r"@IMAGESTART.*?@IMAGEEND")
-    for imageElt in regexIMG.findall(str):
+    for imageElt in regexIMG.findall(str1):
        imgFilename = imageElt.replace("@IMAGESTART","").replace("@IMAGEEND","").replace(r"\_","_")
        imgHtml = '<img class="inlineImage" src="/macao-hugo/media/'+imgFilename+'" id="'+imgFilename+'"/>'
-        str = str.replace(imageElt, imgHtml)
+        str1 = str1.replace(imageElt, imgHtml)
    ###################################################

    ###################################################
    # Add html code to MD for comment anchors
    ###################################################
+    commentaireInfo_md = ''
+    i=0
+    for match in re.findall(r'@ANCHORSTART@(commentfile_.*?)@', str1):
+        i += 1
+        filepath="/home/daxid/DEV/MACAO/macao-legacy/Basilisk/MACAO/macao_12/contenu/pages/"+match.replace("commentfile_","")
+        try:
+            file = open(filepath, 'r', encoding='utf-8')
+            file.close()
+            md = markFileDown(filepath)
+            commentaireInfo_md = '<div class="commentaireInfo" commentaireId="'+str(i)+'">'+md+'</div>'
+            str1 = str1.replace(match,str(i))
+        except FileNotFoundError:
+            pass
+            #print(f"File {filepath} not found.")
+
    # First for the words supporting the comments
-    regexANCHOR = re.compile(r"@ANCHORSTART@(.)@(.*?)@ANCHOREND")
-    str = regexANCHOR.sub(r"<span spanId='\1'>\2</span>", str)
+    regexANCHOR = re.compile(r"@ANCHORSTART@(.*?)@(.*?)@ANCHOREND")
+    str1 = regexANCHOR.sub(r'<span spanId="\1">\2</span>', str1)

    # For the comments themselves
-    if '@COMMENTIDSTART' in str :
-        str = str.replace('@COMMENTIDSTART@','<div class="commentaireInfo" commentaireId="').replace('@COMMENTIDEND','">') + '</div>'
+    if '@COMMENTIDSTART' in str1 :
+        str1 = str1.replace('@COMMENTIDSTART@','<div class="commentaireInfo" commentaireId="').replace('@COMMENTIDEND','">') + '</div>'

    ###################################################
    # Add html for images and fix media paths
    ###################################################
    regexIMG = re.compile(r"!\[\]\(\.\./media/(.*?)\)")
-    str = regexIMG.sub(r"<img src='/macao-hugo/media/\1'>", str)
+    str1 = regexIMG.sub(r"<img src='/macao-hugo/media/\1'>", str1)

    ###################################################
    # Some global string replacement
    ###################################################
-    str = str.replace("""Pour vous enregistrer ou interrompre
+    str1 = str1.replace("""Pour vous enregistrer ou interrompre
 l'enregistrement, cliquez sur le bouton rouge.
 Pour vous réécouter, cliquez sur la flèche
 noire.""","")
-    str = str.replace("hauts-parleurs","flèches")
-    str = str.replace('\n<img class="inlineImage"','<img class="inlineImage"')
+    str1 = str1.replace("hauts-parleurs","flèches")
+    str1 = str1.replace('\n<img class="inlineImage"','<img class="inlineImage"')

-    return(str)
+    return(str1, commentaireInfo_md)


 def transform_html(graph: Graph):
@@ -159,7 +188,14 @@ def transform_html(graph: Graph):
                    f.write(desc_str)
                mid = MarkItDown()
                tmp_md = mid.convert(tmp.name).text_content
-                desc_md = postEditMD(tmp_md)
+                (desc_md, commentaireInfo_md) = postEditMD(tmp_md)
+
+            if commentaireInfo_md != "":
+                l0 = list(t)
+                l0[2] = Literal(commentaireInfo_md)
+                l0[1] = NS["commentaireInfo_md"]
+                graph.add(tuple(l0))
+
            l = list(t)
            l[2] = Literal(desc_md)
            l[1] = NS[prop+'_md']