diff --git a/tetras_extraction/script/src/common.py b/tetras_extraction/script/src/common.py
index 99e499fd55fb579c0cf2db91deeb502103722eaa..a2af1dd004cf84bb699f2bc3c5e8ecec1102d358 100644
--- a/tetras_extraction/script/src/common.py
+++ b/tetras_extraction/script/src/common.py
@@ -25,6 +25,9 @@ MACAO_ROOT = env_path_or_rel_default("MACAO_ROOT", "../../..")
 SOURCE_DIR = env_path_or_rel_default("SOURCES_DIR", "../../../Basilisk/MACAO")
 """Path to the Macao source directory (i.e. the one with the manifest)"""
 RESULT_DIR = env_path_or_rel_default("RESULTS_DIR", f"../../result/{MACAO_VERSION}")
 """Path to the directory containing various results (RDF, content, media...)"""
+# TODO: developer-specific default; presumably overridden through the HUGO_MEDIA_DIR environment variable
+HUGO_MEDIA_DIR = env_path_or_rel_default("HUGO_MEDIA_DIR", "/home/daxid/DEV/MACAO/macao-hugo/macao/static/media/")
+"""Path to the Hugo static media directory"""
 NEW_CONTENT_ROOT = env_path_or_rel_default(
     "NEW_CONTENT_ROOT", RESULT_DIR + "/activities"
diff --git a/tetras_extraction/script/src/transform.py b/tetras_extraction/script/src/transform.py
index b94b6def6a208eef79d72c869d460057d24ea356..c7593791639ca8979d86cc9d38696e38ff35a3d7 100644
--- a/tetras_extraction/script/src/transform.py
+++ b/tetras_extraction/script/src/transform.py
@@ -4,6 +4,8 @@ from lxml import html
 from markitdown import MarkItDown
 import tempfile
 import re
+import os
+from glob import glob
 
 from common import *
 
@@ -36,12 +38,30 @@ def construct_while(g: Graph, query: str):
 def prepareHTMLforMD(str):
     #regexJS = re.compile(r'<script type="text/javascript">(.*)</script>')
     regexAV = re.compile(r".*?PF_clipAV\('.*?', '(.*?).swf',.*")
-    str = regexAV.sub(r"AUDIOSTART\1@AUDIOEND", str)
+    str = regexAV.sub(r"@AUDIOSTART\1@AUDIOEND", str)
     return(str)
 
 def postEditMD(str):
-    regexAV = re.compile(r"AUDIOSTART(.*?)@AUDIOEND")
-    str = regexAV.sub(r'{{< audio id="\1" src="media/\1/\1_01.mp3" class="something" >}}', str).replace(r'\_',r'_')
+    """Replace the audio markers left in the Markdown by Hugo audio shortcodes.
+
+    Every ``@AUDIOSTART<folder>@AUDIOEND`` marker becomes one ``audio``
+    shortcode per ``*.mp3`` file found in ``HUGO_MEDIA_DIR/<folder>``; a
+    marker whose folder holds no mp3 file is replaced by an empty string.
+    """
+    regexAV = re.compile(r"@AUDIOSTART(.*?)@AUDIOEND")
+
+    def audioShortcodes(match):
+        # The folder name may carry escaped underscores (``\_``) at this point
+        audioFolder = match.group(1).replace(r"\_", "_")
+        # glob() yields files in arbitrary order: sort for a reproducible output
+        audioPaths = sorted(glob(os.path.join(HUGO_MEDIA_DIR, audioFolder, "*.mp3")))
+        return "".join(
+            '{{< audio id="%s" src="media/%s/%s" >}}' % (audioFile, audioFolder, audioFile)
+            for audioFile in map(os.path.basename, audioPaths)
+        )
+
+    # Single pass over the text: each marker is swapped for its shortcodes
+    str = regexAV.sub(audioShortcodes, str)
     return(str)
 
 def transform_html(graph: Graph):
@@ -80,20 +100,17 @@ def transform_html(graph: Graph):
     # Process all html content through Markitdown
     for prop in html_properties:
         for t in graph.triples((None, NS[prop], None)) :
-
             desc_str = prepareHTMLforMD(t[2])
             tmp = tempfile.NamedTemporaryFile(suffix=".html")
             with open(tmp.name, 'w') as f:
                 f.write(desc_str)
             mid = MarkItDown()
-            #desc_md = postEditMD(mid.convert(tmp.name).text_content)
-            desc_md = mid.convert(tmp.name).text_content
+            desc_md = postEditMD(mid.convert(tmp.name).text_content)
             l = list(t)
             l[2] = Literal(desc_md)
             l[1] = NS[prop+'_md']
             manual_edition_list = list(graph.triples((l[0], NS[prop+"_md_manual_edition"], None)))
             if len(manual_edition_list)>=1:
-                print(manual_edition_list)
                 t_manual_edition = manual_edition_list[0]
                 l[2] = t_manual_edition[2]
             graph.add(tuple(l))