From 94cdb0859b326c9606722e3bad4454f3efe75b0a Mon Sep 17 00:00:00 2001
From: daxid <david.rouquet@tetras-libre.fr>
Date: Mon, 18 Nov 2024 23:24:09 +0100
Subject: [PATCH] transformation html -> MD

---
 tetras_extraction/script/requirements.txt |  1 +
 tetras_extraction/script/src/transform.py | 22 +++++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tetras_extraction/script/requirements.txt b/tetras_extraction/script/requirements.txt
index 7fbd5744..f8f98c30 100644
--- a/tetras_extraction/script/requirements.txt
+++ b/tetras_extraction/script/requirements.txt
@@ -10,3 +10,4 @@ types-beautifulsoup4==4.12.0.20240511
 types-html5lib==1.1.11.20240228
 types-lxml==2024.4.14
 typing_extensions==4.12.1
+pandoc
diff --git a/tetras_extraction/script/src/transform.py b/tetras_extraction/script/src/transform.py
index 790e8ed6..672a4262 100644
--- a/tetras_extraction/script/src/transform.py
+++ b/tetras_extraction/script/src/transform.py
@@ -1,5 +1,6 @@
-from rdflib import OWL, RDF, Graph
+from rdflib import OWL, RDF, Graph, Literal
 from lxml import html
+import pandoc
 
 from common import *
 
@@ -30,12 +31,15 @@ def construct_while(g: Graph, query: str):
         pass
 
 
-# 1st attempt : clean up audio <script> tags
 def transform_html(graph: Graph):
+    html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description']
+
+
+    # 1st attempt: clean up audio <script> tags (disabled; kept below as a string literal)
+    """
     for t in graph.triples((None, NS['description'], None)):
         desc_str = t[2]
         tree = html.fragment_fromstring(desc_str)
-
         for script in tree.findall(".//script"):
             # `HtmlElement.drop_tree()` removes an element along with its
             # children and text, however it has an interesting feature :
@@ -47,6 +51,18 @@ def transform_html(graph: Graph):
             script.tail = script.text + script.tail
             script.drop_tree()
         pass
+    """
+
+    # Convert each listed HTML property to Markdown via Pandoc; add it as a new '<prop>_md' triple
+    for prop in html_properties:
+        for t in graph.triples((None, NS[prop], None))  :
+            desc_str = t[2]
+            desc_doc = pandoc.read(desc_str, format="html")
+            desc_md = pandoc.write(desc_doc, format="markdown")
+            l = list(t)
+            l[2] = Literal(desc_md)
+            l[1] = NS[prop+'_md']
+            graph.add(tuple(l))
 
 
 
-- 
GitLab