Skip to content
Snippets Groups Projects
Commit edd4441a authored by David Rouquet's avatar David Rouquet
Browse files

1st attempt : clean up audio <script> tags

parent c0b2b8df
No related branches found
No related tags found
2 merge requests!4Main,!3Daxid html2md
from rdflib import OWL, RDF, Graph
from lxml import html
from common import *
......@@ -29,6 +30,26 @@ def construct_while(g: Graph, query: str):
pass
# 1st attempt : clean up audio <script> tags
def transform_html(graph: Graph):
    """Replace ``<script>`` elements in HTML descriptions by their text.

    For every triple in *graph* whose predicate is ``NS['description']``,
    the object is parsed as an HTML fragment and each descendant
    ``<script>`` element is removed, leaving its text content in its place.

    NOTE(review): the cleaned tree is only built in memory; nothing is
    written back to *graph* here -- confirm that this is intended for a
    first attempt.
    """
    for _, _, desc_str in graph.triples((None, NS['description'], None)):
        tree = html.fragment_fromstring(desc_str)
        # `findall` returns a list, so dropping elements while looping is safe.
        for script in tree.findall(".//script"):
            # `HtmlElement.drop_tree()` removes an element along with its
            # children and text, however it has an interesting feature:
            # the tail text is not removed, but instead joined to the
            # previous sibling or parent automatically.
            # This means that when we want to replace an element with a
            # string, we only need to prepend the string to the tail, and it
            # will be inserted in the right place.
            #
            # lxml represents missing text/tail as None (e.g. an empty
            # `<script src=...>` or a script that is the last node of its
            # parent), so normalise both to "" before concatenating.
            script.tail = (script.text or "") + (script.tail or "")
            script.drop_tree()
def main():
# Load graph
graph = Graph()
......@@ -36,6 +57,8 @@ def main():
graph.parse(RDF_SCHEMA_FILE)
graph.parse(RDF_CONTENT_FILE)
transform_html(graph)
# Apply property 'subClassOf' transitively, except on the "fake" class
# hierarchy based on MacaoRoot
log.info("Adding transitive subclasses...")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment