Select Git revision
jest.config.js
extract_page.py 20.53 KiB
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Callable

import esprima as es
from lxml import etree, html
from lxml.etree import _Element
from lxml.html import HtmlElement
from rdflib import RDF, Graph, Literal
from typing_extensions import override

from common import *
# Module-level logger, named after this module.
# NOTE(review): `get_logger` is not defined in this file; presumably it comes
# from the `common` star import — confirm.
log = get_logger("extract_page")
class Comment:
    """A comment block found in the hidden zone of an activity page.

    The constructor only *declares* the attributes (bare annotations); none of
    them exists on the instance until the code building the comment assigns it.
    """

    def __init__(self) -> None:
        # Declarations only, no values are assigned here
        self.id: str  # value of the element's `id` attribute
        self.num: int  # numeric suffix of the id, when there is one
        self.text: str  # text content of the element
        self.html: Any  # serialised form of the element
        self.elem: _Element  # the source element itself

    @override
    def __repr__(self) -> str:
        # Show every attribute assigned so far
        return repr(vars(self))
class Activity:
def __init__(self):
self.id: str = ""
"""The ID of the page this activity is in (`pg###`)"""
self.title: str = ""
"""Human-readable title of the activity"""
self.description: str | None = None
"""Description of the activity's body (HTML),
e.g. the instructions for an exercise activity"""
self.comment_consigne: Comment | None = None
"""Another form of activity description but in a comment. May or may not
coexist with a regular description"""
self.comment_success: Comment | None = None
"""Comment displayed on success, if applicable"""
self.comments_sugg: list[Comment] = []
"""Help comments displayed on failure, if applicable"""
self.comments_misc: list[Comment] = []
"""Any other comments, if present"""
self.ref: URIRef
def save(self, graph: Graph):
"""Save activity data to the graph. Subclasses may override this method
to save their specific data."""
self.ref = NS[self.id]
# => Type
graph.add((self.ref, RDF.type, NS[self.get_name()]))
# => Title
set_title(graph, self.ref, self.title)
# => Description
description = self.description or ""
if self.comment_consigne is not None:
description += self.comment_consigne.html
if description != "":
graph.add((self.ref, NS["description"], Literal(description)))
# => Comments
if self.comment_success is not None:
graph.add(
(self.ref, NS["commentaireSucces"], Literal(self.comment_success.html))
)
for comment in self.comments_sugg:
graph.add((self.ref, NS["commentaireSugg"], Literal(comment.html)))
for comment in self.comments_misc:
graph.add((self.ref, NS["commentaireInfo"], Literal(comment.html)))
def parse_html(self, root: HtmlElement):
"""From a `lxml.html` parsing tree, extract all data relevant to this class.
Subclasses may override this method to extract more specific data.
"""
# => Title
self.title = root.xpath("/html/head/title")[0].text
# => Comments
zi = root.get_element_by_id("zoneInvisible")
for cmt_div in zi:
comment = Comment()
comment.text = cmt_div.text_content()
comment.html = to_html(cmt_div)
comment.elem = cmt_div
comment.id = cmt_div.get("id") or ""
# Split id in two parts (non-digits and digits), then match on these parts
m = regex_comment.match(comment.id)
if m is not None:
match m.groups():
case ["divCmt", num]:
comment.num = int(num)
self.comments_misc.append(comment)
case ["divSugg", num]:
comment.num = int(num)
self.comments_sugg.append(comment)
case ["divCmtSucces", _]:
self.comment_success = comment
case ["divConsigne", _]:
self.comment_consigne = comment
case [alpha, num]:
log.warning(
f"No match for comment {alpha}[{num}] ('{comment.id}')"
)
def get_name(self) -> str:
return type(self).__name__
@classmethod
def from_typename(cls, name: str):
"""Convenience function to create an `Activity` subclass from a name"""
match name:
case "Cours":
return Cours()
case "ExerciceQC":
return ExerciceQC()
case "ExerciceQM":
return ExerciceQM()
case "ExerciceTAT":
return ExerciceTAT()
case "ExerciceGD":
return ExerciceGD()
case _:
raise NameError(name=name)
@override
def __repr__(self):
return self.get_name() + str(self.__dict__)
class Cours(Activity):
    """Lesson activity: its description is the `STY_texteCours` element."""

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common activity data, then the lesson body."""
        super().parse_html(root)
        # => Description
        lesson_html = to_html(root.get_element_by_id("STY_texteCours"))
        self.description = lesson_html.strip()
class Exercice(Activity):
    """Common base of exercise activities: the description is the
    `STY_question` element."""

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common activity data, then the question statement."""
        super().parse_html(root)
        # => Description
        question_html = to_html(root.get_element_by_id("STY_question"))
        self.description = question_html.strip()
@dataclass
class Choice:
    """A possible answer for a question, correct or not

    Declared as real dataclass fields so that the generated `__repr__` shows
    the values and the generated `__eq__` compares them. (With a hand-written
    `__init__` and no fields, every instance printed as `Choice()` and
    compared equal to every other.)
    """

    index: int = 0
    """Position of the choice within its question"""
    is_correct: bool = False
    """Whether picking this choice is a correct answer"""
    html: str = ""
    """Content of the choice, as HTML"""
class ChoiceGroup:
    """A labelled group of choices.

    Both attributes are only declared: nothing is assigned at construction.
    """

    def __init__(self) -> None:
        # Declarations only, no values are assigned here
        self.label: str
        self.items: list[Choice]
class ExerciceQC(Exercice):
    """Choice-question exercise, in its QCU or QCM variant (`is_qcm`)."""

    def __init__(self, is_qcm: bool = False) -> None:
        super().__init__()
        self.is_qcm = is_qcm
        self.choices: list[Choice] = []

    @override
    def get_name(self) -> str:
        """Return the RDF type name, which depends on the variant."""
        if self.is_qcm:
            return "ExerciceQC_QCM"
        return "ExerciceQC_QCU"

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the exercise data, then the HTML of each possible answer."""
        super().parse_html(root)
        # Find question choices
        for answer in root.find_class("STY_reponseQC"):
            # The 'id' attribute has the form 'lienRepX', X being the
            # 1-based position of the choice
            position = int(answer.attrib["id"].replace("lienRep", ""))
            self.set_html(position - 1, to_html(answer).strip())

    @override
    def save(self, graph: Graph):
        """Save the exercise, then one `Reponse` node per choice."""
        super().save(graph)
        for choice in self.choices:
            rdf_name = f"{self.id}q{choice.index}"  # ex: pg157q2
            display_name = f"{rdf_name} | {'V' if choice.is_correct else 'F'}"
            node = NS[rdf_name]
            graph.add((node, RDF.type, NS["Reponse"]))
            graph.add((node, NS["index"], Literal(choice.index)))
            graph.add((node, NS["correct"], Literal(choice.is_correct)))
            graph.add((node, NS["html"], Literal(choice.html)))
            graph.add((node, NS["__protege_display_name"], Literal(display_name)))
            graph.add((NS[self.id], NS["aReponse"], node))
            # Our fake "class hierarchy" just for easier visualization
            graph.add((node, RDFS.subClassOf, NS[self.id]))

    def set_correct(self, choice_index: int, correct: bool):
        """Mark the choice at `choice_index` as correct or not, creating it if needed."""
        target = self._get_or_create(choice_index)
        target.is_correct = correct

    def set_html(self, choice_index: int, html: str):
        """Store `html` on the choice at `choice_index`, creating it if needed."""
        target = self._get_or_create(choice_index)
        target.html = html

    def _get_or_create(self, index: int) -> Choice:
        """Return the choice at `index`, first appending as many new choices
        as needed for that position to exist."""
        while len(self.choices) <= index:
            # A new choice is numbered after the position it takes in the list
            self.choices.append(Choice(len(self.choices)))
        return self.choices[index]
class ExerciceQM(Exercice):
    """Exercise holding a list of choice groups (`questions`)."""

    def __init__(self) -> None:
        super().__init__()
        # Declaration only: no value is assigned here
        self.questions: list[ChoiceGroup]
class ExerciceTAT(Exercice):
    """Exercise made of a text and a list of choice groups (`gaps`)."""

    def __init__(self) -> None:
        super().__init__()
        # Declarations only: no value is assigned here
        self.text: str  # can be HTML
        self.gaps: list[ChoiceGroup]
class ExerciceGD(Exercice):
    """Exercise made of `targets` and lists of `draggables` choices."""

    def __init__(self) -> None:
        super().__init__()
        # Declarations only: no value is assigned here
        self.targets: list[str]
        self.draggables: list[list[Choice]]
class JSParser(ABC):
    """Interface of the parsers handling the JS portion of an activity.

    Deriving from `ABC` is what makes `@abstractmethod` effective: without it
    the decorator is ignored and the interface itself could be instantiated.
    """

    @abstractmethod
    def parse(self, js: str) -> Activity:
        """Parse a string of JavaScript code and returns an instance of the
        correct `Activity` subclass, partially populated with data found in the code.

        :param js: The JavaScript source code
        """

    @override
    def __str__(self) -> str:
        # A parser is displayed as its class name
        return type(self).__name__
class RegexParser(JSParser):
    """A parser for the JS portion of an activity, that works on the source
    text with regular expressions."""

    def __init__(self, graph: Graph, act_id: str) -> None:
        self.graph = graph
        self.act_id = act_id

    @override
    def parse(self, js: str) -> Activity:
        """Build the activity described in the body of `entrerDonnees`.

        :raises ParseError: If the function or the constructor call is not found
        """
        # Cut at the function declaration, the code after it is the body
        parts = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
        if len(parts) < 2:
            raise ParseError("Failed to find function 'entrerDonnees'")
        body = parts[1]
        # The variable name of the activity is not needed
        activity, _ = self._parse_activity_constructor(body)
        if isinstance(activity, ExerciceQC):
            # Parse correct answers
            self._parse_qc_answers(body, activity)
        return activity

    def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
        """
        Find the activity constructor call, return the created activity
        and the name of the variable it is assigned to.
        """
        # Verbose pattern: layout whitespace and comments are not significant
        found = re.search(
            r"""
            (\w+) # result variable name
            \s+=\s+new\s+ #
            (Cours|Exercice\w+) # constructor name
            \((.*?)\); # optional arguments between parentheses
            """,
            code,
            re.VERBOSE,
        )
        if found is None:
            raise ParseError("Failed to parse activity constructor")
        var_name = found.group(1)
        activity = Activity.from_typename(found.group(2))
        # Handle case of QC variants
        if isinstance(activity, ExerciceQC) and found.group(3) == '"QCM"':
            activity.is_qcm = True
        return activity, var_name

    def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
        """Parse the correct answers for a QC activity, line by line."""
        # Exact statements marking the current choice as incorrect / correct
        verdicts = {
            "exo.tabStylesR[nr] = CODE_F;": False,
            "exo.tabStylesR[nr] = CODE_V;": True,
        }
        current = 0
        for raw_line in code.splitlines():
            line = raw_line.strip()
            index_match = re.match(r"var nr = (\d+);", line)
            if index_match is not None:
                # "index" line (question indexes start at 1)
                current = int(index_match.group(1)) - 1
                continue
            verdict = verdicts.get(line)
            if verdict is not None:
                exo.set_correct(current, verdict)
class XpathParser(JSParser):
"""A parser for the JS portion of an activity, that uses XPath to query
an XML representation of Esprima's abstract syntax tree (AST)"""
# XPath requests pre-compiled as functions
request_function = etree.XPath(
'//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]'
)
request_index_and_values = etree.XPath(
'*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression//Identifier[starts-with(@name,"CODE_")]'
)
request_constructor_id = etree.XPath(
'*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]'
)
def __init__(self) -> None:
self.fun: Any
"""AST element corresponding to the function we're interested in.
Initialised in `self.parse()`."""
@override
def parse(self, js: str) -> Activity:
jstree: Any = es.parseScript(js, None)
# Convert Esprima object tree to XML etree
xml = self.to_xml(jstree.toDict(), "jstree")
try:
self.fun = self.request_function(xml)[0]
activity = self._parse_activity_type()
if isinstance(activity, ExerciceQC):
self._parse_qc_answers(activity)
return activity
except Exception as e:
raise ParseError(e)
def _parse_activity_type(self) -> Activity:
constructor_id = self.request_constructor_id(self.fun)[0]
match constructor_id.get("name"):
case "ExerciceQC":
arg = constructor_id.xpath("../../arguments/Literal/@value")[0]
if arg == "QCM":
return ExerciceQC(is_qcm=True)
elif arg == "QCU":
return ExerciceQC()
else:
raise ParseError(f"ExerciceQC: invalid argument '{arg}'")
case other:
return Activity.from_typename(other)
def _parse_qc_answers(self, activity: ExerciceQC) -> None:
"""Parse the correct answers for a QC activity"""
indexes_and_values = self.request_index_and_values(self.fun)
index = 0
for e in indexes_and_values:
value = e.xpath("@value")
if len(value) != 0:
# "index line"
index = int(value[0]) - 1 # question indexes start at 1
else:
# "correct" or "incorrect" line
activity.set_correct(index, e.get("name") == "CODE_V")
def to_xml(self, obj: Any, tag_name: str | None = None):
"""Recursively convert an object structure to an XML `ElementTree`.
Structures are expected to be Python dictionaries.
Converting a dictionary produces a tag named after the "type" attribute (if present).
- A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
- A list attribute becomes a tag with its contents as sub-tags.
- A dictionary attribute becomes a tag (named like the attribute's key)
containing a sub-tag for the dictionary itself
"""
if isinstance(obj, dict):
# Dictionary (or object):
# - if it has a "type" key, the dict represents an object -> use its value as the tag name
# - if a tag_name is specified as well, it's probably important (like an attribute name),
# so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
inner_tag = None
outer_tag = None
has_inner = "type" in obj.keys()
if has_inner:
inner_tag = etree.Element(obj["type"], None, None)
else:
inner_tag = etree.Element("_dict", None, None)
if tag_name is not None:
outer_tag = etree.Element(tag_name)
if has_inner:
outer_tag.append(inner_tag)
else:
inner_tag = outer_tag
else:
outer_tag = inner_tag
# Recurse on dictionary items
for key, val in obj.items():
if key != "type": # exception for 'type', handled as attribute
if isinstance(val, (list, dict)):
# Structured attributes become child tags
inner_tag.append(self.to_xml(val, key))
else:
# Primitive attributes become tag attributes
inner_tag.set(key, str(val))
return outer_tag
elif isinstance(obj, list):
tag_name = tag_name or "_list"
list_tag = etree.Element(tag_name)
for e in obj:
list_tag.append(self.to_xml(e))
return list_tag
else:
tag_name = tag_name or "_literal"
leaf_tag = etree.Element(tag_name)
leaf_tag.text = str(obj)
return leaf_tag
class MatchParser(JSParser):
"""A parser for the JS portion of an activity, that uses Python match statements
to navigate the abstract syntax tree (AST) produced by Esprima"""
def __init__(self, graph: Graph, act_id: str) -> None:
self.graph = graph
self.act_id = act_id
self.activity: Activity | None = None
@override
def parse(self, js: str) -> Activity:
jstree = es.parseScript(js, None)
# Try to match our template with one of the top-level statements
for statement in jstree.body:
self.match_function(statement.toDict())
if self.activity is not None:
return self.activity
else:
raise ParseError("No activity constructor found")
def match_constructor_call(self, new_expr: dict[str, Any]):
if self.activity is not None: # Ignore anything after the first match
return
match new_expr:
case {
"type": "NewExpression",
"callee": {
"type": "Identifier",
"name": typ,
},
"arguments": [*args],
}:
match typ:
case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD":
self.activity = Activity.from_typename(typ)
case "ExerciceQC":
match args:
case [{"type": "Literal", "value": "QCU"}, *_]:
typ += "_QCU"
self.activity = ExerciceQC()
case [{"type": "Literal", "value": "QCM"}, *_]:
typ += "_QCM"
self.activity = ExerciceQC(is_qcm=True)
case _:
raise ParseError(
f"ExerciceQC: Invalid argument '{args}'"
)
case _:
raise ParseError(f"Unknown activity type '{typ}'")
case _:
pass
def match_function(self, func: dict[str, Any]):
"""Checks if `func` matches a function declaration named `entrerDonnees`,
and search its body if successful
"""
match func:
case {
"type": "FunctionDeclaration",
"id": {"name": "entrerDonnees"},
"body": {"type": "BlockStatement", "body": body},
}:
# Matched a function declaration and captured its `body` attr
for statement in body:
# Find constructor calls (e.g. `new Thing()`) recursively
recurse_prefix(statement, self.match_constructor_call)
case _:
pass
def recurse_prefix(t: Any, f: Callable[[Any], None]):
    """Pre-order, depth-first traversal: apply `f` to `t` itself, then to each
    element of `t` (for a list) or each value of `t` (for a dictionary),
    recursively. Any other object has no children.

    :param t: The object
    :param f: The function to call
    """
    f(t)
    if isinstance(t, list):
        children = t
    elif isinstance(t, dict):
        children = t.values()
    else:
        # Leaf: nothing to descend into
        return
    for child in children:
        recurse_prefix(child, f)
# Regex splitting a string into a non-digit prefix (group 1) and the digits
# that follow it (group 2), e.g. "divSugg2" -> ("divSugg", "2").
# Both groups may be empty, so `match()` succeeds on any string (even "").
regex_comment = re.compile(r"(\D*)(\d*)")
def parse_page(graph: Graph, filepath: str, id: str):
    """Extract the activity of a page and save it to the graph.

    Activity data is spread across HTML and JS code, which are parsed
    differently. Additionally, some pieces of data are specific to the
    activity type (Cours, ExerciceQC...) and this type is in the JS portion.
    This requires parsing the JS code first, to get the type, then proceed
    with HTML to get the rest of the type-specific data.

    :param graph: The graph receiving the activity
    :param filepath: Path of the HTML file of the page
    :param id: ID given to the activity
    """
    # We still need to find the inline scripts before parsing them
    tree = html.parse(filepath)
    root = tree.getroot()
    # Collect all inline scripts (no external 'src') and join them in a
    # block of JS code
    scripts: list[HtmlElement] = root.xpath(
        '/html/head/script[@type="text/javascript" and not(@src)]'
    )
    js = "\n".join(s.text_content() for s in scripts)
    # Fallback used if every parser fails
    activity: Activity = Activity()
    # Try different parsers, each writing to a different file to compare their results.
    # The activity kept in the end is the one of the last parser that succeeded.
    for parser in [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]:
        # Explicit encoding: the printed activity contains text taken from the
        # page, which the platform default encoding may not be able to encode
        with open(f"/tmp/{str(parser)}_debuglog.txt", "a", encoding="utf-8") as f:
            print(f"\n{id:8}", end="", file=f)
            try:
                activity = parser.parse(js)
                print(activity, end="", file=f)
            except ParseError as e:
                log.error(
                    f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity."
                )
    activity.id = id
    # Parse the HTML portion
    activity.parse_html(root)
    # Save everything to the graph
    activity.save(graph)