Skip to content
Snippets Groups Projects
Commit 49950a5e authored by David Rouquet's avatar David Rouquet
Browse files

use markitdown instead of pandoc and prepare HTML before MD conversion

parent e5030174
Branches
No related tags found
1 merge request: !4 Main
......@@ -10,4 +10,6 @@ types-beautifulsoup4==4.12.0.20240511
types-html5lib==1.1.11.20240228
types-lxml==2024.4.14
typing_extensions==4.12.1
pandoc
#pandoc
#python_on_whales
markitdown
from rdflib import OWL, RDF, Graph, Literal
from lxml import html
import pandoc
#import pandoc
from markitdown import MarkItDown
import tempfile
import re
from common import *
......@@ -30,6 +33,12 @@ def construct_while(g: Graph, query: str):
while construct(g, query) > 0:
pass
# Matches one whole line containing a PF_clipAV('<arg1>', '<arg2>', ...) call.
# '.' does not cross newlines, so a match never spans more than one line.
# NOTE(review): the captured second argument is currently unused; presumably
# it identifies the media file -- confirm before deriving the link from it.
_PF_CLIP_AV_LINE = re.compile(r".*?PF_clipAV\('.*?', '(.*?)',.*")

# Markdown snippet substituted for every matching line unless the caller
# passes another one.
_DEFAULT_AUDIO_MD = "![The Sound File](/home/daxid/DEV/MACAO/macao-legacy/tetras_extraction/result/full/audio/e29_macao1_2b12.swf.0.mp3?controlsList=nodownload-nofullscreen-noremoteplayback)"


def prepareHTMLforMD(str, audio=_DEFAULT_AUDIO_MD):
    """Replace each line holding a ``PF_clipAV(...)`` call with a Markdown snippet.

    Args:
        str: HTML text to prepare. The name shadows the builtin, but it is
            kept so existing keyword callers keep working.
        audio: Markdown inserted in place of every matching line. Defaults
            to the sound-file link that was previously hard-coded here.

    Returns:
        The text with every matching line replaced by ``audio``; text
        without a ``PF_clipAV`` call is returned unchanged.
    """
    # A callable replacement inserts `audio` verbatim; a plain string would
    # be parsed as a regex template, so backslashes or "\g<...>" in a custom
    # snippet would be misread.
    return _PF_CLIP_AV_LINE.sub(lambda _match: audio, str)
def transform_html(graph: Graph):
html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description']
......@@ -53,19 +62,32 @@ def transform_html(graph: Graph):
pass
"""
# Process all html content through Pandoc
# Process all html content through Pandoc -> We use Markitdown instead at the moment
#for prop in html_properties:
# for t in graph.triples((None, NS[prop], None)) :
# desc_str = t[2]
# desc_doc = pandoc.read(desc_str, format="html")
# desc_md = pandoc.write(desc_doc, format="markdown")
# l = list(t)
# l[2] = Literal(desc_md)
# l[1] = NS[prop+'_md']
# graph.add(tuple(l))
# Process all html content through Markitdown
for prop in html_properties:
for t in graph.triples((None, NS[prop], None)) :
desc_str = t[2]
desc_doc = pandoc.read(desc_str, format="html")
desc_md = pandoc.write(desc_doc, format="markdown")
desc_str = prepareHTMLforMD(t[2])
tmp = tempfile.NamedTemporaryFile(suffix=".html")
with open(tmp.name, 'w') as f:
f.write(desc_str)
mid = MarkItDown()
desc_md = mid.convert(tmp.name).text_content
l = list(t)
l[2] = Literal(desc_md)
l[1] = NS[prop+'_md']
graph.add(tuple(l))
def main():
# Load graph
graph = Graph()
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment