diff --git a/tetras_extraction/script/requirements.txt b/tetras_extraction/script/requirements.txt index 7fbd5744524cc77f264ebd200d6ddb9518d811f3..f8f98c309a84e3274a3ac920658ed90f254bff3b 100644 --- a/tetras_extraction/script/requirements.txt +++ b/tetras_extraction/script/requirements.txt @@ -10,3 +10,4 @@ types-beautifulsoup4==4.12.0.20240511 types-html5lib==1.1.11.20240228 types-lxml==2024.4.14 typing_extensions==4.12.1 +pandoc diff --git a/tetras_extraction/script/src/transform.py b/tetras_extraction/script/src/transform.py index 790e8ed6479577c7492f4342900e1cfa1dbc7cf6..672a4262c7b041a8bb81b6f42a1370e32c6c5b4e 100644 --- a/tetras_extraction/script/src/transform.py +++ b/tetras_extraction/script/src/transform.py @@ -1,5 +1,6 @@ -from rdflib import OWL, RDF, Graph +from rdflib import OWL, RDF, Graph, Literal from lxml import html +import pandoc from common import * @@ -30,12 +31,15 @@ def construct_while(g: Graph, query: str): pass -# 1st attempt : clean up audio <script> tags def transform_html(graph: Graph): + html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description'] + + + # 1st attempt : clean up audio <script> tags + """ for t in graph.triples((None, NS['description'], None)): desc_str = t[2] tree = html.fragment_fromstring(desc_str) - for script in tree.findall(".//script"): # `HtmlElement.drop_tree()` removes an element along with its # children and text, however it has an interesting feature : @@ -47,6 +51,18 @@ def transform_html(graph: Graph): script.tail = script.text + script.tail script.drop_tree() pass + """ + + # Process all html content through Pandoc + for prop in html_properties: + for t in graph.triples((None, NS[prop], None)) : + desc_str = t[2] + desc_doc = pandoc.read(desc_str, format="html") + desc_md = pandoc.write(desc_doc, format="markdown") + l = list(t) + l[2] = Literal(desc_md) + l[1] = NS[prop+'_md'] + graph.add(tuple(l))