# Select Git revision
# Daniel Rudolf authored
# transform.py 7.30 KiB
from rdflib import OWL, RDF, Graph, Literal
from lxml import html
#import pandoc
from markitdown import MarkItDown
import tempfile
import re
from glob import glob
from common import *
# Module-level logger used by the functions below. `get_logger` is not defined
# in this file (presumably provided by `from common import *` — confirm).
log = get_logger("transform")
def construct(g: Graph, query: str):
    """Run a SPARQL `CONSTRUCT` query on the graph and merge the triples it
    produces into that same graph, in-place.

    :param g: Graph that is queried, then extended with the result
    :param query: Text of the SPARQL `CONSTRUCT` query
    :return: Number of *new* triples (i.e. not present in the initial graph),
        or 0 when the query result carries no graph
    """
    built = g.query(query).graph
    if built is None:
        return 0
    # Keep only the triples that the graph does not hold yet
    fresh = built - g
    n = len(fresh)
    g += fresh
    if n > 0:
        log.info(f"\tConstructed {n} triples")
    return n
def construct_while(g: Graph, query: str):
    """Apply a CONSTRUCT query to the graph again and again, merging its
    output each time, and stop at the first pass that adds no new triple."""
    produced = construct(g, query)
    while produced > 0:
        produced = construct(g, query)
def prepareHTMLforMD(str):
    """Replace audio and image references in an HTML string with text markers.

    The markers are plain text, so they can be located again in the Markdown
    produced from this HTML:

    * every line containing a ``PF_clipAV('...', '<name>.swf', ...)`` call is
      replaced, as a whole, by ``@AUDIOSTART<name>@AUDIOEND``;
    * every ``<img ... src="../media/<file>" ...>`` tag is replaced by
      ``@IMAGESTART<file>@IMAGEEND``.

    :param str: HTML text to rewrite. The name shadows the builtin; it is
        kept so that existing keyword callers keep working.
    :return: The rewritten HTML text
    """
    # Identify audio content and add markups to identify them in the MD.
    # The leading `.*?` and trailing `.*` swallow the rest of the line (`.`
    # does not match a newline). The dot of the ".swf" extension is escaped
    # so that only a real extension matches, not any character before "swf".
    audio_call = re.compile(r".*?PF_clipAV\('.*?', '(.*?)\.swf',.*")
    marked = audio_call.sub(r"@AUDIOSTART\1@AUDIOEND", str)

    # Identify image content and add markups to identify them in the MD
    image_tag = re.compile(r"<img.*?src=\"\.\./media/(.*?)\".*?>")
    return image_tag.sub(r"@IMAGESTART\1@IMAGEEND", marked)
def postEditMD(str):
    """Replace the media markers found in converted Markdown with HTML.

    Three rewrites are applied, in this order:

    1. ``@AUDIOSTART<folder>@AUDIOEND`` becomes one ``<audio>`` element plus a
       "Play" button for each ``.mp3`` file found in
       ``HUGO_MEDIA_DIR/<folder>``, in sorted path order. When no ``.mp3``
       file is found there, the marker is simply removed.
    2. ``@IMAGESTART<file>@IMAGEEND`` becomes an ``<img>`` tag.
    3. Markdown images written ``![](../media/<file>)`` become ``<img>`` tags.

    Inside the two kinds of markers, underscores escaped with a backslash are
    turned back into plain underscores before the name is used.

    :param str: Markdown text to rewrite. The name shadows the builtin; it is
        kept so that existing keyword callers keep working.
    :return: The rewritten text
    """
    import os.path  # local import: `os` is not among the file's top-level imports

    def unescape_underscores(payload):
        # Undo the backslash-escaping of underscores present in the Markdown
        return payload.replace(r"\_", "_")

    def audio_players(match):
        # Build the players for audio extracted from SWF
        audioFolder = unescape_underscores(match.group(1))
        # glob() returns its matches in arbitrary order: sort them so that the
        # generated page is the same from one run to the next
        audioPaths = sorted(glob(HUGO_MEDIA_DIR + "/" + audioFolder + "/*.mp3"))
        players = []
        for audioPath in audioPaths:
            # basename() instead of split('/'), which keeps part of the
            # directory when the path uses Windows separators
            audioFile = os.path.basename(audioPath)
            players.append(
                f'<div><audio id="{audioFile}" >'
                f'<source src="/macao-hugo/media/{audioFolder}/{audioFile}" type="audio/mpeg"></audio>'
                f'<button onclick="document.getElementById(\'{audioFile}\').play()">Play</button></div>'
            )
        return "".join(players)

    def image_tag(match):
        imgFilename = unescape_underscores(match.group(1))
        return f'<img src="/macao-hugo/media/{imgFilename}" id="{imgFilename}"/>'

    # Add audio players for audio extracted from SWF
    text = re.sub(r"@AUDIOSTART(.*?)@AUDIOEND", audio_players, str)
    # Add html code to MD for images
    text = re.sub(r"@IMAGESTART(.*?)@IMAGEEND", image_tag, text)
    # Add html for images and fix media paths
    return re.sub(r"!\[\]\(\.\./media/(.*?)\)", r"<img src='/macao-hugo/media/\1'>", text)
def transform_html(graph: Graph) -> None:
    """Add a Markdown version of every HTML-valued property to the graph.

    For each property ``p`` listed in ``html_properties`` and each triple
    ``(s, :p, html)`` of the graph, one triple ``(s, :p_md, md)`` is added,
    where ``md`` is:

    * the object of a ``(s, :p_md_manual_edition, ...)`` triple when the graph
      holds one (the first one found is used), or otherwise
    * a literal holding the HTML passed through `prepareHTMLforMD`,
      MarkItDown and `postEditMD`.

    The graph is modified in-place; the HTML triples themselves are kept.

    :param graph: Graph holding the HTML content and, optionally, the
        manually edited Markdown
    """
    html_properties = ('commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description')

    # NOTE: two earlier approaches were abandoned and their code removed from
    # here: cleaning up the audio <script> tags with lxml, and converting the
    # HTML with Pandoc. MarkItDown is used instead at the moment.
    # TODO (unfinished in the original code): add the manually edited
    # surveyjs_json to the main graph.

    # A single converter serves the whole run
    mid = MarkItDown()

    # Process all html content through Markitdown
    for prop in html_properties:
        md_predicate = NS[prop + '_md']
        manual_predicate = NS[prop + "_md_manual_edition"]
        # Snapshot the matching triples: the graph is extended inside the loop
        for subject, _, html_content in list(graph.triples((None, NS[prop], None))):
            # Manually edited content takes precedence over the automatic
            # conversion, so the conversion is skipped when it exists
            manual_md = next(graph.objects(subject, manual_predicate), None)
            if manual_md is not None:
                graph.add((subject, md_predicate, manual_md))
                continue

            desc_str = prepareHTMLforMD(html_content)
            # MarkItDown is handed a file path, hence the temporary .html
            # file. The context manager closes (and deletes) it right after
            # the conversion; UTF-8 is forced so that writing does not depend
            # on the locale.
            with tempfile.NamedTemporaryFile("w", suffix=".html", encoding="utf-8") as tmp:
                tmp.write(desc_str)
                tmp.flush()  # make the content visible to the reader of tmp.name
                desc_md = postEditMD(mid.convert(tmp.name).text_content)
            graph.add((subject, md_predicate, Literal(desc_md)))
def main() -> None:
    """Build the full graph and write it to ``RDF_FULL_FILE`` as Turtle.

    Steps, in order:

    1. parse ``RDF_SCHEMA_FILE``, ``RDF_MANUAL_EDITION_FILE`` and
       ``RDF_CONTENT_FILE`` into one graph;
    2. add the Markdown versions of the HTML properties (`transform_html`);
    3. run three CONSTRUCT queries, each until it yields nothing new:
       transitive ``rdfs:subClassOf``, supertypes, super-properties;
    4. remove every triple whose subject is an ``owl:Ontology`` and declare
       a single new ontology, "macao-full";
    5. serialize the result.
    """
    # Load graph
    graph = Graph()
    graph.bind("", NS)
    graph.parse(RDF_SCHEMA_FILE)
    graph.parse(RDF_MANUAL_EDITION_FILE)
    graph.parse(RDF_CONTENT_FILE)

    transform_html(graph)

    # Apply property 'subClassOf' transitively, except on the "fake" class
    # hierarchy based on MacaoRoot
    log.info("Adding transitive subclasses...")
    q_transitive_subclass = """
CONSTRUCT {
?a rdfs:subClassOf ?c
} WHERE {
?a rdfs:subClassOf ?b .
?b rdfs:subClassOf ?c .
?b rdfs:subClassOf :MacaoObject
FILTER(?a != owl:Nothing && ?c != owl:Thing)
}
"""
    construct_while(graph, q_transitive_subclass)

    # Apply type inheritance (rdfs:subClassOf)
    log.info("Adding supertypes...")
    construct_while(
        graph,
        """
CONSTRUCT {
?subj rdf:type ?supertype
} WHERE {
?subj a ?type .
?type rdfs:subClassOf ?supertype
}
""",
    )

    # Apply property inheritance (rdfs:subPropertyOf)
    log.info("Adding super-properties...")
    construct_while(
        graph,
        """
CONSTRUCT {
?subj ?superprop ?obj
} WHERE {
?subj ?prop ?obj .
?prop rdfs:subPropertyOf ?superprop .
FILTER(?superprop != owl:topDataProperty)
}
""",
    )

    # ==> Save
    # Remove dependency on previous ontologies.
    # The subjects are collected first, because triples are removed from the
    # graph inside the loop and the iterator returned by `subjects` is lazy.
    for ontology in list(graph.subjects(RDF.type, OWL.Ontology)):
        graph.remove((ontology, None, None))

    # Declare new ontology
    # NOTE(review): `URIRef` and `RDFS` are not among the names imported from
    # rdflib at the top of this file; presumably they come in through
    # `from common import *` — confirm.
    onto_uri = URIRef("http://www.semanticweb.org/eliott/ontologies/2024/4/macao-full")
    graph.add((onto_uri, RDF.type, OWL.Ontology))
    graph.add((onto_uri, RDFS.label, Literal("macao-full")))
    graph.serialize(RDF_FULL_FILE, "turtle", base=NS)
# Run the whole pipeline only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()