diff --git a/tetras_extraction/script/requirements.txt b/tetras_extraction/script/requirements.txt
index f8f98c309a84e3274a3ac920658ed90f254bff3b..de914f259d597ac101e43cb059071b863735f5bc 100644
--- a/tetras_extraction/script/requirements.txt
+++ b/tetras_extraction/script/requirements.txt
@@ -10,4 +10,6 @@ types-beautifulsoup4==4.12.0.20240511
 types-html5lib==1.1.11.20240228
 types-lxml==2024.4.14
 typing_extensions==4.12.1
-pandoc
+#pandoc
+#python_on_whales
+markitdown
diff --git a/tetras_extraction/script/src/transform.py b/tetras_extraction/script/src/transform.py
index 672a4262c7b041a8bb81b6f42a1370e32c6c5b4e..bd22da8551722020a0d42fc095d22f44287fd1a8 100644
--- a/tetras_extraction/script/src/transform.py
+++ b/tetras_extraction/script/src/transform.py
@@ -1,6 +1,9 @@
 from rdflib import OWL, RDF, Graph, Literal
 from lxml import html
-import pandoc
+#import pandoc
+from markitdown import MarkItDown
+import tempfile
+import re
 
 from common import *
 
@@ -30,6 +33,25 @@ def construct_while(g: Graph, query: str):
     while construct(g, query) > 0:
         pass
 
+
+#regexJS = re.compile(r'<script type="text/javascript">(.*)</script>')
+# Matches a whole line holding a PF_clipAV('...', '...', call ('.' never
+# crosses a newline). Compiled once at import instead of on every call.
+_PF_CLIP_AV_RE = re.compile(r".*?PF_clipAV\('.*?', '(.*?)',.*")
+
+
+def prepareHTMLforMD(str):
+    """Blank out every line that contains a PF_clipAV(...) call.
+
+    Used to clean an HTML fragment before it is converted to Markdown.
+    The second argument of the call is captured by the pattern but is
+    not used; the replacement is always the empty string.
+
+    The parameter name shadows the ``str`` builtin; it is kept as-is so
+    that existing callers keep working.
+    """
+    return _PF_CLIP_AV_RE.sub("", str)
+
 
 def transform_html(graph: Graph):
     html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description']
@@ -53,19 +75,36 @@ def transform_html(graph: Graph):
         pass
     """
 
-    # Process all html content through Pandoc
+    # Process all html content through Pandoc -> We use Markitdown instead at the moment
+    #for prop in html_properties:
+    #    for t in graph.triples((None, NS[prop], None)) :
+    #        desc_str = t[2]
+    #        desc_doc = pandoc.read(desc_str, format="html")
+    #        desc_md = pandoc.write(desc_doc, format="markdown")
+    #        l = list(t)
+    #        l[2] = Literal(desc_md)
+    #        l[1] = NS[prop+'_md']
+    #        graph.add(tuple(l))
+
+    # Process all html content through Markitdown
+    # A single converter is reused for every literal.
+    mid = MarkItDown()
     for prop in html_properties:
         for t in graph.triples((None, NS[prop], None)) :
-            desc_str = t[2]
-            desc_doc = pandoc.read(desc_str, format="html")
-            desc_md = pandoc.write(desc_doc, format="markdown")
+            desc_str = prepareHTMLforMD(t[2])
+            # MarkItDown is given a path, so the HTML goes through a temporary
+            # file; the context manager closes and deletes it afterwards.
+            with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8") as tmp:
+                tmp.write(desc_str)
+                # Flush so the converter sees the content when it opens the path.
+                tmp.flush()
+                desc_md = mid.convert(tmp.name).text_content
             l = list(t)
             l[2] = Literal(desc_md)
             l[1] = NS[prop+'_md']
             graph.add(tuple(l))
 
 
-
 def main():
     # Load graph
     graph = Graph()