Skip to content
Snippets Groups Projects
Commit 49950a5e authored by David Rouquet
Browse files

use markitdown instead of pandoc and prepare HTML before MD conversion

parent e5030174
No related branches found
No related tags found
1 merge request: !4 "Main"
...@@ -10,4 +10,6 @@ types-beautifulsoup4==4.12.0.20240511 ...@@ -10,4 +10,6 @@ types-beautifulsoup4==4.12.0.20240511
types-html5lib==1.1.11.20240228 types-html5lib==1.1.11.20240228
types-lxml==2024.4.14 types-lxml==2024.4.14
typing_extensions==4.12.1 typing_extensions==4.12.1
pandoc #pandoc
#python_on_whales
markitdown
from rdflib import OWL, RDF, Graph, Literal from rdflib import OWL, RDF, Graph, Literal
from lxml import html from lxml import html
import pandoc #import pandoc
from markitdown import MarkItDown
import tempfile
import re
from common import * from common import *
...@@ -30,6 +33,12 @@ def construct_while(g: Graph, query: str): ...@@ -30,6 +33,12 @@ def construct_while(g: Graph, query: str):
while construct(g, query) > 0: while construct(g, query) > 0:
pass pass
# Markdown embed substituted for every line that invokes the PF_clipAV player.
# NOTE(review): this is a machine-specific absolute path to one single clip;
# the clip name captured by the pattern below is not used to pick the file.
# Presumably a placeholder -- confirm before relying on the generated links.
_DEFAULT_AUDIO_MD = "![The Sound File](/home/daxid/DEV/MACAO/macao-legacy/tetras_extraction/result/full/audio/e29_macao1_2b12.swf.0.mp3?controlsList=nodownload-nofullscreen-noremoteplayback)"

# Compiled once at import time instead of on every call. Because "." does not
# match a newline, each match spans at most the single line that contains the
# PF_clipAV(...) call, from the start of that line to its end.
_PF_CLIP_AV_LINE = re.compile(r".*?PF_clipAV\('.*?', '(.*?)',.*")


def prepareHTMLforMD(str, *, audio_md=_DEFAULT_AUDIO_MD):
    """Replace every line calling ``PF_clipAV(...)`` with a Markdown audio embed.

    Args:
        str: HTML source to pre-process. The parameter name shadows the
            builtin; it is kept unchanged so existing callers keep working.
        audio_md: Markdown inserted in place of each matching line. Defaults
            to the embed that was previously hard-coded in this function.

    Returns:
        The text with each matching line replaced by ``audio_md``; text that
        contains no ``PF_clipAV`` call comes back unchanged.
    """
    # A callable replacement inserts audio_md verbatim: re.sub would otherwise
    # interpret backslashes and group references found in a replacement string.
    return _PF_CLIP_AV_LINE.sub(lambda _match: audio_md, str)
def transform_html(graph: Graph): def transform_html(graph: Graph):
html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description'] html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description']
...@@ -53,19 +62,32 @@ def transform_html(graph: Graph): ...@@ -53,19 +62,32 @@ def transform_html(graph: Graph):
pass pass
""" """
# Process all html content through Pandoc # Process all html content through Pandoc -> We use Markitdown instead at the moment
#for prop in html_properties:
# for t in graph.triples((None, NS[prop], None)) :
# desc_str = t[2]
# desc_doc = pandoc.read(desc_str, format="html")
# desc_md = pandoc.write(desc_doc, format="markdown")
# l = list(t)
# l[2] = Literal(desc_md)
# l[1] = NS[prop+'_md']
# graph.add(tuple(l))
# Process all html content through Markitdown
for prop in html_properties: for prop in html_properties:
for t in graph.triples((None, NS[prop], None)) : for t in graph.triples((None, NS[prop], None)) :
desc_str = t[2] desc_str = prepareHTMLforMD(t[2])
desc_doc = pandoc.read(desc_str, format="html") tmp = tempfile.NamedTemporaryFile(suffix=".html")
desc_md = pandoc.write(desc_doc, format="markdown") with open(tmp.name, 'w') as f:
f.write(desc_str)
mid = MarkItDown()
desc_md = mid.convert(tmp.name).text_content
l = list(t) l = list(t)
l[2] = Literal(desc_md) l[2] = Literal(desc_md)
l[1] = NS[prop+'_md'] l[1] = NS[prop+'_md']
graph.add(tuple(l)) graph.add(tuple(l))
def main(): def main():
# Load graph # Load graph
graph = Graph() graph = Graph()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment