From 5f723d6c3375d7d5525653282d59e9c44daf5dcc Mon Sep 17 00:00:00 2001 From: daxid <david.rouquet@tetras-libre.fr> Date: Mon, 13 Jan 2025 22:45:34 +0100 Subject: [PATCH] allow TAT choice text html->md --- tetras_extraction/script/src/extract_page.py | 2 +- tetras_extraction/script/src/transform.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tetras_extraction/script/src/extract_page.py b/tetras_extraction/script/src/extract_page.py index f119ea1..23f85f1 100644 --- a/tetras_extraction/script/src/extract_page.py +++ b/tetras_extraction/script/src/extract_page.py @@ -335,7 +335,7 @@ class ExerciceTAT(Exercice): graph.add((segment_uri, RDF.type, NS["Segment"])) graph.add((segment_uri, NS["index"], Literal(index))) if isinstance(segment, str): - graph.add((segment_uri, NS["text"], Literal(segment))) + graph.add((segment_uri, NS["html"], Literal(segment))) else: graph.add((segment_uri, RDF.type, NS["Champ"])) segment.save(graph, rdf_name) diff --git a/tetras_extraction/script/src/transform.py b/tetras_extraction/script/src/transform.py index b3285bb..10ae8ff 100644 --- a/tetras_extraction/script/src/transform.py +++ b/tetras_extraction/script/src/transform.py @@ -118,11 +118,15 @@ def transform_html(graph: Graph): for prop in html_properties: for t in graph.triples((None, NS[prop], None)) : desc_str = prepareHTMLforMD(t[2]) - tmp = tempfile.NamedTemporaryFile(suffix=".html") - with open(tmp.name, 'w') as f: - f.write(desc_str) - mid = MarkItDown() - desc_md = postEditMD(mid.convert(tmp.name).text_content) + if desc_str == '': + desc_md = '' + else: + tmp = tempfile.NamedTemporaryFile(suffix=".html") + with open(tmp.name, 'w') as f: + f.write(desc_str) + mid = MarkItDown() + tmp_md = mid.convert(tmp.name).text_content + desc_md = postEditMD(tmp_md) l = list(t) l[2] = Literal(desc_md) l[1] = NS[prop+'_md'] -- GitLab