Skip to content
Snippets Groups Projects
Commit 035d29ec authored by Eliott Sammier's avatar Eliott Sammier
Browse files

Merge branch '22-parse-activity' into 23-parse-exo-qc

parents a5778912 3af96c18
Branches
No related tags found
No related merge requests found
......@@ -73,3 +73,10 @@ def add_index(g: Graph, subject: URIRef, index: int):
Literal(f"{index:02} | {name} | ") + title,
)
)
# Exceptions ###################################################################
class ParseError(Exception):
    """Signal that a source document lacks something the parser requires.

    Carries no extra state: the message given to the constructor is the
    whole payload.
    """
import filecmp
from pprint import pprint
from typing import Optional
......@@ -66,9 +67,11 @@ def parse_manifest(graph: Graph):
# Parse with lxml
root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
org = ns_find(root, ".//organization")
if org is None:
raise ParseError("Missing node <organization> in manifest")
# For all top-level modules
for i, e in enumerate(ns_findall(org, "item")):
module = NS[e.get("identifier")]
module = NS[e.get("identifier", default="None")]
parse_manifest_rec(graph, e)
graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
add_index(graph, module, i)
......@@ -76,7 +79,7 @@ def parse_manifest(graph: Graph):
def parse_manifest_rec(
graph: Graph,
elem,
elem: etree._Element,
parentResource: Optional[URIRef] = None,
index: Optional[int] = None,
):
......@@ -87,12 +90,13 @@ def parse_manifest_rec(
"""
# Get title and ID
title: str = ns_find(elem, "title").text
id: str = elem.get("identifier")
title = ns_find(elem, "title")
title = title.text if title is not None else "None" # safe default value
id: str = elem.get("identifier", default="None")
# Declare RDF resource and simple properties
subject = NS[id]
graph.add((subject, RDF.type, OWL.NamedIndividual))
add_title(graph, subject, title)
add_title(graph, subject, str(title))
if id.startswith("MosMod"):
# It's a Module:
graph.add((subject, RDF.type, NS["Module"]))
......@@ -119,18 +123,32 @@ def parse_manifest_rec(
extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id)
import extract_page
def compare_files(f1, f2):
    """Print whether the files at paths ``f1`` and ``f2`` have the same content.

    Writes one line to standard output of the form
    ``Files <f1> and <f2> are identical.`` or ``Files <f1> and <f2> differ.``

    The comparison uses ``shallow=False`` so the file contents are always
    read. The default shallow mode treats two files with the same type, size
    and modification time as equal without reading them, which can misreport
    freshly written log files of equal length as identical.

    Raises:
        OSError: propagated from ``filecmp.cmp`` if either file cannot be
            accessed.
    """
    same = filecmp.cmp(f1, f2, shallow=False)
    verdict = "are identical" if same else "differ"
    print(f"Files {f1} and {f2} {verdict}.")
def main():
    """Create the graph, fill it from the manifest, export it, then compare
    the debug logs of the activity parsers.
    """
    g = create_graph()

    # Start every activity parser's debug log from a clean state, so that the
    # comparison at the end only reflects what this run wrote
    log_paths = []
    for parser_name in ("Match", "Xpath", "Regex"):
        path = f"/tmp/{parser_name}Parser_debuglog.txt"
        with open(path, "w") as log:
            log.write("\n")
        log_paths.append(path)

    parse_manifest(g)
    export_graph(g)
    # extract_page.parse_page(
    #     g,
    #     f"{SOURCE_DIR}/contenu/pages/pg60.html",
    #     "pg60",
    # )

    # Pairwise comparison of the three log files
    for first, second in ((0, 1), (0, 2), (1, 2)):
        compare_files(log_paths[first], log_paths[second])
if __name__ == "__main__":
......
......@@ -38,11 +38,11 @@ class Page:
return str(self.__dict__)
class ParseError(Exception):
    """Exception type for parsing failures; adds nothing to ``Exception``."""
class RegexParser:
def __init__(self, graph: Graph, act_id: str) -> None:
self.graph = graph
self.act_id = act_id
def parse(self, js, output=sys.stdout):
# Find function declaration and only keep code after it
func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
......@@ -51,6 +51,8 @@ class RegexParser:
body = func_split[1]
activity_type, activity_var_name = self._parse_activity_constructor(body)
# Save to graph
self.graph.add((NS[self.act_id], RDF.type, NS[activity_type]))
print(activity_type, end="", file=output)
if activity_type.startswith("ExerciceQC"):
print(" ", self._parse_qc_answers(body), end="", file=output)
......@@ -314,14 +316,14 @@ def parse_page(graph: Graph, filepath: str, id: str):
# Collect all inline scripts (no external 'src') and join them in a
# block of JS code
# scripts = root.cssselect('script[type="text/javascript"]:not([src])')
scripts: List[_Element] = root.xpath(
scripts: List[html.HtmlElement] = root.xpath(
'/html/head/script[@type="text/javascript" and not(@src)]'
)
js = "\n".join((s.text_content() for s in scripts))
# Try different parsers, each writing to a different file to compare their results
for parser in [XpathParser(), MatchParser(graph, id), RegexParser()]:
with open(f"/tmp/{str(parser)}.txt", "a") as f:
for parser in [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]:
with open(f"/tmp/{str(parser)}_debuglog.txt", "a") as f:
print(f"\n{id:8}", end="", file=f)
try:
parser.parse(js, output=f)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment