Skip to content
Snippets Groups Projects
Commit 5a7cc90d authored by Eliott Sammier's avatar Eliott Sammier
Browse files

Merge branch '22-parse-activity' into extraction

parents d09e3cfc 90b5d8a5
Branches
No related tags found
No related merge requests found
......@@ -7,19 +7,29 @@
@base <http://www.semanticweb.org/eliott/ontologies/2024/4/macao/> .
<http://www.semanticweb.org/eliott/ontologies/2024/4/macao> rdf:type owl:Ontology ;
rdfs:label "macao-schema"@fr .
rdfs:label "macao-schema"@fr ;
owl:versionInfo 1.1 .
#################################################################
# Annotation properties
# Object Properties
#################################################################
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao#test
:test rdf:type owl:AnnotationProperty .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponse
:aReponse rdf:type owl:ObjectProperty ;
rdfs:range :Reponse .
#################################################################
# Object Properties
#################################################################
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponseCorrecte
:aReponseCorrecte rdf:type owl:ObjectProperty ;
rdfs:domain :Exercice ;
rdfs:range :Reponse .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponseIncorrecte
:aReponseIncorrecte rdf:type owl:ObjectProperty ;
rdfs:domain :Exercice ;
rdfs:range :Reponse .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/contenuDans
:contenuDans rdf:type owl:ObjectProperty ;
......@@ -77,6 +87,34 @@
rdfs:range xsd:anyURI .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireInfo
:commentaireInfo rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireSucces
:commentaireSucces rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireSugg
:commentaireSugg rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/correct
:correct rdf:type owl:DatatypeProperty ;
rdfs:range xsd:boolean .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/html
:html rdf:type owl:DatatypeProperty ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/id
:id rdf:type owl:DatatypeProperty ;
rdfs:subPropertyOf owl:topDataProperty ;
......@@ -96,10 +134,6 @@
# Classes
#################################################################
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao#MacaoRoot
:MacaoRoot rdf:type owl:Class .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Cours
:Cours rdf:type owl:Class ;
rdfs:subClassOf :Page .
......@@ -110,16 +144,41 @@
rdfs:subClassOf :Page .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/FlashObject
:FlashObject rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceGD
:ExerciceGD rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC
:ExerciceQC rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC_QCM
:ExerciceQC_QCM rdf:type owl:Class ;
rdfs:subClassOf :ExerciceQC .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC_QCU
:ExerciceQC_QCU rdf:type owl:Class ;
rdfs:subClassOf :ExerciceQC .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQM
:ExerciceQM rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/GD
:GD rdf:type owl:Class ;
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceTAT
:ExerciceTAT rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/FlashObject
:FlashObject rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Image
:Image rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource .
......@@ -139,6 +198,10 @@
rdfs:subClassOf :MacaoObject .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/MacaoRoot
:MacaoRoot rdf:type owl:Class .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Module
:Module rdf:type owl:Class ;
rdfs:subClassOf :MacaoContenu .
......@@ -149,14 +212,9 @@
rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/QC
:QC rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/QM
:QM rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Reponse
:Reponse rdf:type owl:Class ;
rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/SimpleFlash
......@@ -169,25 +227,15 @@
rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/TAT
:TAT rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ValuePartition
:ValuePartition rdf:type owl:Class .
#################################################################
# General axioms
#################################################################
[ rdf:type owl:AllDisjointClasses ;
owl:members ( :GD
:QC
:QM
:TAT
owl:members ( :ExerciceGD
:ExerciceQC
:ExerciceQM
:ExerciceTAT
)
] .
......
Source diff could not be displayed: it is too large. Options to address this: view the blob.
{
"recommendations": [
"detachhead.basedpyright",
"ms-python.black-formatter"
]
}
\ No newline at end of file
from os import environ, path
from sys import stderr
from rdflib import RDFS, Graph, Literal, URIRef
from typing import Any
from lxml import html
from rdflib import Graph, Literal, RDFS, URIRef
from rdflib import Namespace
from os import path, environ
def env_path_or_rel_default(env_var: str, default: str) -> str:
......@@ -34,15 +37,33 @@ NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/")
# Utility functions ############################################################
def eprint(*args, **kwargs):  # pyright: ignore[reportMissingParameterType]
    """Just like `print()`, but to standard error instead of standard output.

    The stream is resolved as `sys.stderr` at call time, the same way
    `print()` resolves `sys.stdout`, so the output follows any redirection of
    `sys.stderr` made after this module was imported.

    :param args: Positional arguments, forwarded to `print()`
    :param kwargs: Keyword arguments, forwarded to `print()`. `file` must not
        be passed, since it is already set by this function.
    """
    # Local import: the module itself only imports the `stderr` name, which is
    # bound once at import time and would ignore later redirections.
    import sys

    print(*args, file=sys.stderr, **kwargs)
def add_title(g: Graph, subject: URIRef, title: str):
def to_html(elem: html.HtmlElement) -> str:
    """Serialise a `HtmlElement` to a HTML string.

    :param elem: The element to serialise
    :return: The result of `html.tostring()` with the `"unicode"` encoding
    """
    serialised = html.tostring(elem, encoding="unicode")
    return serialised
def insert_grow(l: list[Any], index: int, value: Any, fill_value: Any | None = None):
"""Insert at a given position in a list, growing it if necessary
:param l: list
:param index: The position where the value is inserted
:param value: The value to insert
:param fill_value: The value used for elements created automatically when growing, defaults to None
"""
for _ in range(len(l), index + 1):
l.append(fill_value)
l[index] = value
def set_title(g: Graph, subject: URIRef, title: str):
    """Define the `subject`'s title and label.

    Both the `RDFS.label` and the `titre` properties of `subject` are written
    with `Graph.set()`.

    :param g: The graph to write to
    :param subject: The resource being titled
    :param title: The title text
    """
    for predicate in (RDFS.label, NS["titre"]):
        g.set((subject, predicate, Literal(title)))
def add_index(g: Graph, subject: URIRef, index: int):
......@@ -59,3 +80,10 @@ def add_index(g: Graph, subject: URIRef, index: int):
Literal(f"{index:02} | {name} | ") + title,
)
)
# Exceptions ###################################################################
class ParseError(Exception):
    """Raised when a piece of source material cannot be parsed as expected."""
from pprint import pprint
from typing import Optional
import filecmp
from lxml import etree
from rdflib import RDFS, Graph, Literal, URIRef
......@@ -66,9 +65,11 @@ def parse_manifest(graph: Graph):
# Parse with lxml
root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
org = ns_find(root, ".//organization")
if org is None:
raise ParseError("Missing node <organization> in manifest")
# For all top-level modules
for i, e in enumerate(ns_findall(org, "item")):
module = NS[e.get("identifier")]
module = NS[e.get("identifier", default="None")]
parse_manifest_rec(graph, e)
graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
add_index(graph, module, i)
......@@ -76,9 +77,9 @@ def parse_manifest(graph: Graph):
def parse_manifest_rec(
graph: Graph,
elem,
parentResource: Optional[URIRef] = None,
index: Optional[int] = None,
elem: etree._Element,
parentResource: URIRef | None = None,
index: int | None = None,
):
"""Parses a module `MosMod` from the manifest recursively, adding all its
descendants to the `graph`
......@@ -87,12 +88,13 @@ def parse_manifest_rec(
"""
# Get title and ID
title: str = ns_find(elem, "title").text
id: str = elem.get("identifier")
title = ns_find(elem, "title")
title = title.text if title is not None else "None" # safe default value
id: str = elem.get("identifier", default="None")
# Declare RDF resource and simple properties
subject = NS[id]
graph.add((subject, RDF.type, OWL.NamedIndividual))
add_title(graph, subject, title)
set_title(graph, subject, str(title))
if id.startswith("MosMod"):
# It's a Module:
graph.add((subject, RDF.type, NS["Module"]))
......@@ -119,18 +121,32 @@ def parse_manifest_rec(
extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id)
import extract_page
def compare_files(f1: str, f2: str):
    """Print whether two files have identical contents.

    The comparison is done on the actual bytes (`shallow=False`). The default
    shallow mode of `filecmp.cmp()` reports two files as equal as soon as
    their type, size and modification time match, without reading them.

    :param f1: Path of the first file
    :param f2: Path of the second file
    """
    same = filecmp.cmp(f1, f2, shallow=False)
    verdict = "are identical" if same else "differ"
    print("Files {} and {} {}.".format(f1, f2, verdict))
def main():
    """Build the graph from the manifest, export it, then compare the debug
    logs written by the activity parsers."""
    g = create_graph()

    # Create or reset debug log files for all activity parsers, to compare
    # their results afterwards
    logfiles = [
        f"/tmp/{p}Parser_debuglog.txt" for p in ("Match", "Xpath", "Regex")
    ]
    for logfile in logfiles:
        with open(logfile, "w") as f:
            f.write("\n")

    parse_manifest(g)
    export_graph(g)

    # Compare log files 2 by 2, each pair once: (0, 1), (0, 2), (1, 2)
    for position, first in enumerate(logfiles):
        for second in logfiles[position + 1 :]:
            compare_files(first, second)
if __name__ == "__main__":
......
from os import path
import re
import subprocess
from os import path
from rdflib import OWL, RDF, RDFS, Graph, Literal
from rdflib import Graph, Literal, OWL, RDF, RDFS
from extract_page import parse_page
from common import *
from extract_page import parse_page
def generate_triples(
......@@ -23,7 +23,8 @@ def generate_triples(
# Type and simple properties
graph.add((page, RDF.type, OWL.NamedIndividual))
graph.add((page, RDF.type, NS["Page"]))
add_title(graph, page, page_title)
graph.add((page, NS["id"], Literal(page_id)))
set_title(graph, page, page_title)
add_index(
graph,
page,
......
import re
from abc import ABC
from abc import abstractmethod
from dataclasses import dataclass
from pprint import pprint
from typing import Any, List
from typing import Any, Callable

import esprima as es
from lxml import html
from lxml import etree, html
from lxml.etree import _Element
from lxml.html import HtmlElement
from rdflib import Graph, Literal
from rdflib import Graph, Literal, RDF
from typing_extensions import override

from common import *
class Comment:
    """A comment block extracted from an activity page.

    Instances are created empty and their fields are filled in one by one by
    the code that reads the page.
    """

    def __init__(self):
        # Every attribute gets a real default value: a bare annotation such as
        # `self.id: str` creates no attribute at runtime, so reading a field
        # that was never assigned would raise AttributeError and `__repr__`
        # would silently omit it.
        self.id: str = ""
        """Value of the element's `id` attribute"""
        self.num: int | None = None
        """Number found in the ID, or None when no number was assigned"""
        self.text: str = ""
        """Text content of the comment"""
        self.html: Any = ""
        """Serialised HTML of the comment"""
        self.elem: _Element | None = None
        """The element this comment was read from"""

    @override
    def __repr__(self):
        return str(self.__dict__)
class Page:
id: str
title: str
type: str # cours ou exercice
comment_success: Comment
comments_sugg: List[Comment]
comments_misc: List[Comment]
class Activity:
def __init__(self):
self.id: str = ""
"""The ID of the page this activity is in (`pg###`)"""
self.title: str = ""
"""Human-readable title of the activity"""
self.description: str | None = None
"""Description of the activity's body (HTML),
e.g. the instructions for an exercise activity"""
self.comment_consigne: Comment | None = None
"""Another form of activity description but in a comment. May or may not
coexist with a regular description"""
self.comment_success: Comment | None = None
"""Comment displayed on success, if applicable"""
self.comments_sugg: list[Comment] = []
"""Help comments displayed on failure, if applicable"""
self.comments_misc: list[Comment] = []
"""Any other comments, if present"""
self.ref: URIRef
def __init__(self) -> None:
self.comments_sugg = []
self.comments_misc = []
def __repr__(self):
return str(self.__dict__)
# Regex to separate non-digits and digits
regex_comment = re.compile(r"(\D*)(\d*)")
def parse_page(graph: Graph, filepath: str, id: str):
page = Page()
# Parse with lxml
tree = html.parse(filepath)
root = tree.getroot()
def save(self, graph: Graph):
"""Save activity data to the graph. Subclasses may override this method
to save their specific data."""
self.ref = NS[self.id]
# => Type
graph.add((self.ref, RDF.type, NS[self.get_name()]))
# => Title
set_title(graph, self.ref, self.title)
# => Description
description = self.description or ""
if self.comment_consigne is not None:
description += self.comment_consigne.html
if description != "":
graph.add((self.ref, NS["description"], Literal(description)))
# => Comments
if self.comment_success is not None:
graph.add(
(self.ref, NS["commentaireSucces"], Literal(self.comment_success.html))
)
for comment in self.comments_sugg:
graph.add((self.ref, NS["commentaireSugg"], Literal(comment.html)))
for comment in self.comments_misc:
graph.add((self.ref, NS["commentaireInfo"], Literal(comment.html)))
# Parse comments
def parse_html(self, root: HtmlElement):
"""From a `lxml.html` parsing tree, extract all data relevant to this class.
Subclasses may override this method to extract more specific data.
"""
# => Title
self.title = root.xpath("/html/head/title")[0].text
# => Comments
zi = root.get_element_by_id("zoneInvisible")
for cmt_div in zi:
comment = Comment()
comment.text = cmt_div.text_content()
comment.html = html.tostring(cmt_div, encoding="unicode")
comment.html = to_html(cmt_div)
comment.elem = cmt_div
comment.id = cmt_div.get("id")
comment.id = cmt_div.get("id") or ""
# Split id in two parts (non-digits and digits), then match on these parts
m = regex_comment.match(comment.id)
if m is not None:
match m.groups():
case ["divCmt", num]:
print(f"Comment, num={num}")
comment.num = int(num)
page.comments_misc.append(comment)
graph.add((NS[id], NS["commentaireInfo"], Literal(comment.html)))
self.comments_misc.append(comment)
case ["divSugg", num]:
print(f"Suggestion, num={num}")
comment.num = int(num)
page.comments_sugg.append(comment)
graph.add((NS[id], NS["commentaireSugg"], Literal(comment.html)))
self.comments_sugg.append(comment)
case ["divCmtSucces", _]:
print(f"Succès")
page.comment_success = comment
graph.add((NS[id], NS["commentaireSucces"], Literal(comment.html)))
case [other, _]:
print(f"other: {other}")
# pprint(page)
self.comment_success = comment
case ["divConsigne", _]:
self.comment_consigne = comment
case [alpha, num]:
eprint(f"No match for comment {alpha}[{num}] ('{comment.id}')")
def get_name(self) -> str:
return type(self).__name__
@classmethod
def from_typename(cls, name: str):
"""Convenience function to create an `Activity` subclass from a name"""
match name:
case "Cours":
return Cours()
case "ExerciceQC":
return ExerciceQC()
case "ExerciceQM":
return ExerciceQM()
case "ExerciceTAT":
return ExerciceTAT()
case "ExerciceGD":
return ExerciceGD()
case _:
raise NameError(name=name)
@override
def __repr__(self):
return self.get_name() + str(self.__dict__)
class Cours(Activity):
    """Activity whose description is the element with ID `STY_texteCours`."""

    @override
    def parse_html(self, root: HtmlElement):
        super().parse_html(root)
        # => Description
        self.description = to_html(root.get_element_by_id("STY_texteCours")).strip()
class Exercice(Activity):
    """Activity whose description is the element with ID `STY_question`."""

    @override
    def parse_html(self, root: HtmlElement):
        super().parse_html(root)
        # => Description
        self.description = to_html(root.get_element_by_id("STY_question")).strip()
@dataclass
class Choice:
    """A possible answer for a question, correct or not"""

    # Declared as real dataclass fields. With a hand-written `__init__` and no
    # field declarations, `@dataclass` generates `__eq__` and `__repr__` over
    # zero fields: every instance compares equal and prints as `Choice()`.
    index: int = 0  # position of the choice
    is_correct: bool = False  # whether this choice is a correct answer
    html: str = ""  # HTML markup of the choice
class ChoiceGroup:
    """A label together with a list of choices."""

    def __init__(self):
        # Assign actual values: a bare annotation (`self.label: str`) creates
        # no attribute, so reading it would raise AttributeError
        self.label: str = ""
        self.items: list[Choice] = []
class ExerciceQC(Exercice):
    """Exercise made of a list of choices, in one of two variants selected by
    `is_qcm` (named `ExerciceQC_QCM` and `ExerciceQC_QCU`)."""

    def __init__(self, is_qcm: bool = False) -> None:
        super().__init__()
        self.is_qcm = is_qcm
        self.choices: list[Choice] = []

    @override
    def get_name(self) -> str:
        if self.is_qcm:
            return "ExerciceQC_QCM"
        return "ExerciceQC_QCU"

    @override
    def parse_html(self, root: HtmlElement):
        super().parse_html(root)
        # Find question choices
        for elem in root.find_class("STY_reponseQC"):
            # Choices have an 'id' attribute in the form 'lienRepX'
            # where X is their index (starting at 1)
            number = int(elem.attrib["id"].replace("lienRep", ""))
            markup = to_html(elem).strip()
            self.set_html(number - 1, markup)

    @override
    def save(self, graph: Graph):
        super().save(graph)
        activity_node = NS[self.id]
        for choice in self.choices:
            rdf_name = f"{self.id}q{choice.index}"  # ex: pg157q2
            verdict = "V" if choice.is_correct else "F"
            node = NS[rdf_name]
            triples = [
                (node, RDF.type, NS["Reponse"]),
                (node, NS["index"], Literal(choice.index)),
                (node, NS["correct"], Literal(choice.is_correct)),
                (node, NS["html"], Literal(choice.html)),
                (
                    node,
                    NS["__protege_display_name"],
                    Literal(f"{rdf_name} | {verdict}"),
                ),
                (activity_node, NS["aReponse"], node),
                # Our fake "class hierarchy" just for easier visualization
                (node, RDFS.subClassOf, activity_node),
            ]
            for triple in triples:
                graph.add(triple)

    def set_correct(self, choice_index: int, correct: bool):
        """Mark the choice at `choice_index` as correct or not, creating it if needed."""
        target = self._get_or_create(choice_index)
        target.is_correct = correct

    def set_html(self, choice_index: int, html: str):
        """Store `html` on the choice at `choice_index`, creating it if needed."""
        target = self._get_or_create(choice_index)
        target.html = html

    def _get_or_create(self, index: int) -> Choice:
        """Return the choice at `index`, first appending new choices until
        that position exists."""
        while len(self.choices) <= index:
            self.choices.append(Choice(len(self.choices)))
        return self.choices[index]
class ExerciceQM(Exercice):
    """Exercise holding a list of questions, each one a `ChoiceGroup`."""

    def __init__(self):
        super().__init__()
        # Start with an empty list: a bare `self.questions: list[...]`
        # annotation would not create the attribute at all
        self.questions: list[ChoiceGroup] = []
class ExerciceTAT(Exercice):
    """Exercise holding a text and a list of gaps, each one a `ChoiceGroup`."""

    def __init__(self):
        super().__init__()
        # Assign actual values: bare annotations would not create the
        # attributes at all
        self.text: str = ""  # can be HTML
        self.gaps: list[ChoiceGroup] = []
class ExerciceGD(Exercice):
    """Exercise holding a list of targets and, in `draggables`, lists of
    choices."""

    def __init__(self):
        super().__init__()
        # Start with empty lists: bare annotations would not create the
        # attributes at all
        self.targets: list[str] = []
        self.draggables: list[list[Choice]] = []
class JSParser(ABC):
    """Common interface of the parsers for the JavaScript portion of an
    activity.

    Deriving from `ABC` is what makes `@abstractmethod` effective: the base
    class cannot be instantiated, nor can a subclass that does not implement
    `parse()`.
    """

    @abstractmethod
    def parse(self, js: str) -> Activity:
        """Parse a string of JavaScript code and returns an instance of the
        correct `Activity` subclass, partially populated with data found in the code.
        """

    @override
    def __str__(self) -> str:
        # The class name identifies the parser
        return type(self).__name__
class RegexParser(JSParser):
    """A parser for the JS portion of an activity, that uses regular
    expressions and line-by-line string comparisons on the source text"""

    def __init__(self, graph: Graph, act_id: str) -> None:
        self.graph = graph
        self.act_id = act_id

    @override
    def parse(self, js: str) -> Activity:
        """Find the activity declared in function `entrerDonnees` and, for an
        `ExerciceQC`, which of its choices are correct.

        :raises ParseError: if the function or the activity constructor is not
            found, or if the activity type is unknown
        """
        # Find function declaration and only keep code after it
        func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
        if len(func_split) < 2:
            raise ParseError("Failed to find function 'entrerDonnees'")
        body = func_split[1]

        # The variable name is not needed here
        activity, _ = self._parse_activity_constructor(body)
        if isinstance(activity, ExerciceQC):
            # Parse correct answers
            self._parse_qc_answers(body, activity)
        return activity

    def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
        """
        Find activity constructor call, return the activity
        and resulting variable name.

        :raises ParseError: if no constructor call is found, or if it names an
            unknown activity type
        """
        constructor_match = re.search(
            r"""
            (\w+)                   # result variable name
            \s+=\s+new\s+           #
            (Cours|Exercice\w+)     # constructor name
            \((.*?)\);              # optional arguments between parentheses
            """,
            code,
            re.VERBOSE,
        )
        if constructor_match is None:
            raise ParseError("Failed to parse activity constructor")

        var_name, act_type, args = constructor_match.groups()
        try:
            activity = Activity.from_typename(act_type)
        except NameError as e:
            # `Exercice\w+` also matches unknown names: report them as a
            # parsing failure like every other one
            raise ParseError(f"Unknown activity type '{act_type}'") from e
        # Handle case of QC variants
        if isinstance(activity, ExerciceQC) and args == '"QCM"':
            activity.is_qcm = True
        return activity, var_name

    def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
        """Parse the correct and incorrect answers for a QC activity"""
        index = 0
        for line in code.splitlines():
            line = line.strip()
            m = re.match(r"var nr = (\d+);", line)
            if m is not None:
                # "index" line
                index = int(m.group(1)) - 1  # question indexes start at 1
            elif line == "exo.tabStylesR[nr] = CODE_F;":
                # "incorrect answer" line
                exo.set_correct(index, False)
            elif line == "exo.tabStylesR[nr] = CODE_V;":
                # "correct answer" line
                exo.set_correct(index, True)
class XpathParser(JSParser):
"""A parser for the JS portion of an activity, that uses XPath to query
an XML representation of Esprima's abstract syntax tree (AST)"""
# XPath requests pre-compiled as functions
request_function = etree.XPath(
'//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]'
)
request_index_and_values = etree.XPath(
'*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression//Identifier[starts-with(@name,"CODE_")]'
)
request_constructor_id = etree.XPath(
'*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]'
)
def __init__(self) -> None:
self.fun: Any
"""AST element corresponding to the function we're interested in.
Initialised in `self.parse()`."""
@override
def parse(self, js: str) -> Activity:
jstree: Any = es.parseScript(js, None)
# Convert Esprima object tree to XML etree
xml = self.to_xml(jstree.toDict(), "jstree")
try:
self.fun = self.request_function(xml)[0]
activity = self._parse_activity_type()
if isinstance(activity, ExerciceQC):
self._parse_qc_answers(activity)
return activity
except Exception as e:
raise ParseError(e)
def _parse_activity_type(self) -> Activity:
constructor_id = self.request_constructor_id(self.fun)[0]
match constructor_id.get("name"):
case "ExerciceQC":
arg = constructor_id.xpath("../../arguments/Literal/@value")[0]
if arg == "QCM":
return ExerciceQC(is_qcm=True)
elif arg == "QCU":
return ExerciceQC()
else:
raise ParseError(f"ExerciceQC: invalid argument '{arg}'")
case other:
return Activity.from_typename(other)
def _parse_qc_answers(self, activity: ExerciceQC) -> None:
"""Parse the correct answers for a QC activity"""
indexes_and_values = self.request_index_and_values(self.fun)
index = 0
for e in indexes_and_values:
value = e.xpath("@value")
if len(value) != 0:
# "index line"
index = int(value[0]) - 1 # question indexes start at 1
else:
# "correct" or "incorrect" line
activity.set_correct(index, e.get("name") == "CODE_V")
def to_xml(self, obj: Any, tag_name: str | None = None):
"""Recursively convert an object structure to an XML `ElementTree`.
Structures are expected to be Python dictionaries.
Converting a dictionary produces a tag named after the "type" attribute (if present).
- A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
- A list attribute becomes a tag with its contents as sub-tags.
- A dictionary attribute becomes a tag (named like the attribute's key)
containing a sub-tag for the dictionary itself
"""
if isinstance(obj, dict):
# Dictionary (or object):
# - if it has a "type" key, the dict represents an object -> use its value as the tag name
# - if a tag_name is specified as well, it's probably important (like an attribute name),
# so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
inner_tag = None
outer_tag = None
has_inner = "type" in obj.keys()
if has_inner:
inner_tag = etree.Element(obj["type"], None, None)
else:
inner_tag = etree.Element("_dict", None, None)
if tag_name is not None:
outer_tag = etree.Element(tag_name)
if has_inner:
outer_tag.append(inner_tag)
else:
inner_tag = outer_tag
else:
outer_tag = inner_tag
# Recurse on dictionary items
for key, val in obj.items():
if key != "type": # exception for 'type', handled as attribute
if isinstance(val, (list, dict)):
# Structured attributes become child tags
inner_tag.append(self.to_xml(val, key))
else:
# Primitive attributes become tag attributes
inner_tag.set(key, str(val))
return outer_tag
elif isinstance(obj, list):
tag_name = tag_name or "_list"
list_tag = etree.Element(tag_name)
for e in obj:
list_tag.append(self.to_xml(e))
return list_tag
else:
tag_name = tag_name or "_literal"
leaf_tag = etree.Element(tag_name)
leaf_tag.text = str(obj)
return leaf_tag
class MatchParser(JSParser):
"""A parser for the JS portion of an activity, that uses Python match statements
to navigate the abstract syntax tree (AST) produced by Esprima"""
def __init__(self, graph: Graph, act_id: str) -> None:
self.graph = graph
self.act_id = act_id
self.activity: Activity | None = None
@override
def parse(self, js: str) -> Activity:
jstree = es.parseScript(js, None)
# Try to match our template with one of the top-level statements
for statement in jstree.body:
self.match_function(statement.toDict())
if self.activity is not None:
return self.activity
else:
raise ParseError("No activity constructor found")
def match_constructor_call(self, new_expr: dict[str, Any]):
if self.activity is not None: # Ignore anything after the first match
return
match new_expr:
case {
"type": "NewExpression",
"callee": {
"type": "Identifier",
"name": typ,
},
"arguments": [*args],
}:
match typ:
case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD":
self.activity = Activity.from_typename(typ)
case "ExerciceQC":
match args:
case [{"type": "Literal", "value": "QCU"}, *_]:
typ += "_QCU"
self.activity = ExerciceQC()
case [{"type": "Literal", "value": "QCM"}, *_]:
typ += "_QCM"
self.activity = ExerciceQC(is_qcm=True)
case _:
raise ParseError(
f"ExerciceQC: Invalid argument '{args}'"
)
case _:
raise ParseError(f"Unknown activity type '{typ}'")
case _:
pass
def match_function(self, func: dict[str, Any]):
"""Checks if `func` matches a function declaration named `entrerDonnees`,
and search its body if successful
"""
match func:
case {
"type": "FunctionDeclaration",
"id": {"name": "entrerDonnees"},
"body": {"type": "BlockStatement", "body": body},
}:
# Matched a function declaration and captured its `body` attr
for statement in body:
# Find constructor calls (e.g. `new Thing()`) recursively
recurse_prefix(statement, self.match_constructor_call)
case _:
pass
def recurse_prefix(t: Any, f: Callable[[Any], None]):
    """Depth-first prefixed traversal: `f` is called on the object first, then
    on each of its children recursively. The children of a list are its
    elements, the children of a dictionary are its values; any other object
    has no children.

    :param t: The object
    :param f: The function to call
    """
    f(t)
    if isinstance(t, list):
        children = t
    elif isinstance(t, dict):
        children = t.values()
    else:
        return
    for child in children:
        recurse_prefix(child, f)
# Regex splitting a string into a leading run of non-digits and the run of
# digits that follows it (either part may be empty): for example "divCmt3"
# gives the groups ("divCmt", "3")
regex_comment = re.compile(r"(\D*)(\d*)")
def parse_page(graph: Graph, filepath: str, id: str):
    """Parse the page stored in `filepath` and save its activity to the graph.

    :param graph: The graph the activity is saved to
    :param filepath: Path of the HTML file of the page
    :param id: ID given to the activity
    """
    # Activity data is spread across HTML and JS code, which are parsed
    # differently. Additionally, some pieces of data are specific to the
    # activity type (Cours, ExerciceQC...) and this type is in the JS portion.
    # This requires parsing the JS code first, to get the type, then proceed
    # with HTML to get the rest of the type-specific data.

    # We still need to find the inline scripts before parsing them
    root = html.parse(filepath).getroot()
    # Collect all inline scripts (no external 'src') and join them in a
    # block of JS code
    scripts: list[HtmlElement] = root.xpath(
        '/html/head/script[@type="text/javascript" and not(@src)]'
    )
    js = "\n".join(script.text_content() for script in scripts)

    # Generic fallback, kept if every parser fails
    activity = Activity()
    # Try different parsers, each writing to a different file to compare their results
    candidates = [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]
    for parser in candidates:
        with open(f"/tmp/{str(parser)}_debuglog.txt", "a") as log:
            log.write(f"\n{id:8}")
            try:
                activity = parser.parse(js)
                log.write(str(activity))
            except ParseError as e:
                eprint(f"{parser} -> {id}: Parsing error: {e}")
                eprint("Treating this as a generic Activity.")

    activity.id = id
    # Parse the HTML portion
    activity.parse_html(root)
    # Save everything to the graph
    activity.save(graph)
{
"reportMissingTypeStubs": "information",
"reportUnusedCallResult": "none",
"reportUnusedVariable": "warning",
"reportUnusedImport": "warning",
"reportMissingParameterType": "warning",
"reportMissingArgumentType": "warning",
"reportPrivateUsage": "none" /* lxml.etree often returns _Element */,
"reportUnknownParameterType": "none",
"reportUnknownArgumentType": "none",
"reportUnknownVariableType": "none",
"reportUnknownMemberType": "none",
"reportAny": "none"
}
\ No newline at end of file
cssselect==1.2.0
esprima==4.0.1
isodate==0.6.1
lxml==5.2.2
pyparsing==3.1.2
rdflib==7.0.0
six==1.16.0
types-beautifulsoup4==4.12.0.20240511
types-html5lib==1.1.11.20240228
types-lxml==2024.4.14
typing_extensions==4.12.1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment