From 49950a5ef0e27f63859cab86dd98be9e9bb23b7c Mon Sep 17 00:00:00 2001 From: daxid <david.rouquet@tetras-libre.fr> Date: Mon, 23 Dec 2024 18:59:41 +0100 Subject: [PATCH] use markitdown instead of pandoc and prepare HTML before MD conversion --- tetras_extraction/script/requirements.txt | 4 ++- tetras_extraction/script/src/transform.py | 34 +++++++++++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/tetras_extraction/script/requirements.txt b/tetras_extraction/script/requirements.txt index f8f98c30..de914f25 100644 --- a/tetras_extraction/script/requirements.txt +++ b/tetras_extraction/script/requirements.txt @@ -10,4 +10,6 @@ types-beautifulsoup4==4.12.0.20240511 types-html5lib==1.1.11.20240228 types-lxml==2024.4.14 typing_extensions==4.12.1 -pandoc +#pandoc +#python_on_whales +markitdown diff --git a/tetras_extraction/script/src/transform.py b/tetras_extraction/script/src/transform.py index 672a4262..bd22da85 100644 --- a/tetras_extraction/script/src/transform.py +++ b/tetras_extraction/script/src/transform.py @@ -1,6 +1,9 @@ from rdflib import OWL, RDF, Graph, Literal from lxml import html -import pandoc +#import pandoc +from markitdown import MarkItDown +import tempfile +import re from common import * @@ -30,6 +33,12 @@ def construct_while(g: Graph, query: str): while construct(g, query) > 0: pass +def prepareHTMLforMD(str): + audio = "" + #regexJS = re.compile(r'<script type="text/javascript">(.*)</script>') + regexAV = re.compile(r".*?PF_clipAV\('.*?', '(.*?)',.*") + str = regexAV.sub(audio, str) + return(str) def transform_html(graph: Graph): html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description'] @@ -53,19 +62,32 @@ def transform_html(graph: Graph): pass """ - # Process all html content through Pandoc + # Process all html content through Pandoc -> We use Markitdown instead at the moment + #for prop in html_properties: + # for t in graph.triples((None, NS[prop], None)) : + # desc_str = t[2] + # desc_doc = pandoc.read(desc_str, format="html") + # desc_md = pandoc.write(desc_doc, format="markdown") + # l = list(t) + # l[2] = Literal(desc_md) + # l[1] = NS[prop+'_md'] + # graph.add(tuple(l)) + + # Process all html content through Markitdown for prop in html_properties: for t in graph.triples((None, NS[prop], None)) : - desc_str = t[2] - desc_doc = pandoc.read(desc_str, format="html") - desc_md = pandoc.write(desc_doc, format="markdown") + desc_str = prepareHTMLforMD(t[2]) + tmp = tempfile.NamedTemporaryFile(suffix=".html") + with open(tmp.name, 'w') as f: + f.write(desc_str) + mid = MarkItDown() + desc_md = mid.convert(tmp.name).text_content l = list(t) l[2] = Literal(desc_md) l[1] = NS[prop+'_md'] graph.add(tuple(l)) - def main(): # Load graph graph = Graph() -- GitLab