Select Git revision
wrap_parameters.rb
extract_page.py 18.19 KiB
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any

from lxml import html
from lxml.etree import _Element
from lxml.html import HtmlElement
from rdflib import RDF, Graph, Literal
from typing_extensions import override

from common import *
# Module-level logger named "extract_page". `get_logger` is not defined in this
# file; presumably it is provided by `from common import *` (to be confirmed).
log = get_logger("extract_page")
class Comment:
    """A comment block attached to an activity.

    Only `id` is assigned at construction time. The other attributes are
    merely declared (type hints without a value) and must be set by the
    caller before they are read.
    """

    def __init__(self, id: str = ""):
        # Identifier of the comment (may be empty)
        self.id = id
        # Declared only: these attributes do not exist until assigned
        self.num: int
        self.text: str
        self.html: Any
        self.elem: _Element

    @override
    def __repr__(self):
        # Dump whatever attributes have actually been set on the instance
        return str(vars(self))
class Activity:
def __init__(self):
self.id: str = ""
"""The ID of the page this activity is in (`pg###`)"""
self.title: str = ""
"""Human-readable title of the activity"""
self.description: str | None = None
"""Description of the activity's body (HTML),
e.g. the instructions for an exercise activity"""
self.comment_consigne: Comment | None = None
"""Another form of activity description but in a comment. May or may not
coexist with a regular description"""
self.comment_success: Comment | None = None
"""Comment displayed on success, if applicable"""
self.comments_sugg: dict[str, Comment] = {}
"""Help comments displayed on failure, if applicable (keyed by ID)"""
self.comments_misc: list[Comment] = []
"""Any other comments, if present"""
self.ref: URIRef
def save(self, graph: Graph):
"""Save activity data to the graph. Subclasses may override this method
to save their specific data."""
self.ref = NS[self.id]
# => Type
graph.add((self.ref, RDF.type, NS[self.get_name()]))
# => Title
set_title(graph, self.ref, self.title)
# => Description
description = self.description or ""
if self.comment_consigne is not None:
description += self.comment_consigne.html
if description != "":
graph.add((self.ref, NS["description"], Literal(description)))
# => Comments
if self.comment_success is not None:
graph.add(
(self.ref, NS["commentaireSucces"], Literal(self.comment_success.html))
)
for comment in self.comments_sugg.values():
graph.add((self.ref, NS["commentaireSugg"], Literal(comment.html)))
for comment in self.comments_misc:
graph.add((self.ref, NS["commentaireInfo"], Literal(comment.html)))
def parse_html(self, root: HtmlElement):
"""From a `lxml.html` parsing tree, extract all data relevant to this class.
Subclasses may override this method to extract more specific data.
"""
# => Title
self.title = root.xpath("/html/head/title")[0].text
# => Comments
zi = root.get_element_by_id("zoneInvisible")
for cmt_div in zi:
comment = Comment()
comment.text = cmt_div.text_content()
comment.html = to_html(cmt_div)
comment.elem = cmt_div
comment.id = cmt_div.get("id") or ""
# Split id in two parts (non-digits and digits), then match on these parts
m = regex_comment.match(comment.id)
if m is not None:
match m.groups():
case ["divCmt", num]:
comment.num = int(num)
self.comments_misc.append(comment)
case ["divSugg", num]:
comment.num = int(num)
self.comments_sugg[comment.id] = comment
case ["divCmtSucces", _]:
self.comment_success = comment
case ["divConsigne", _]:
self.comment_consigne = comment
case alpha, num:
log.warning(
f"No match for comment {alpha}[{num}] ('{comment.id}')"
)
case something:
log.warning(f"No match for comment '{something}'")
def get_name(self) -> str:
return type(self).__name__
@classmethod
def from_typename(cls, name: str):
"""Convenience function to create an `Activity` subclass from a name"""
match name:
case "Cours":
return Cours()
case "ExerciceQC":
return ExerciceQC()
case "ExerciceQM":
return ExerciceQM()
case "ExerciceTAT":
return ExerciceTAT()
case "ExerciceGD":
return ExerciceGD()
case _:
raise NameError(name=name)
@override
def __repr__(self):
return self.get_name() + str(self.__dict__)
class Cours(Activity):
    """Activity whose description is the content of the `STY_texteCours` element."""

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common activity data, then the course text."""
        super().parse_html(root)

        # => Description
        course_html = to_html(root.get_element_by_id("STY_texteCours"))
        self.description = course_html.strip()
class Exercice(Activity):
    """Activity whose description is the content of the `STY_question` element."""

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common activity data, then the question text."""
        super().parse_html(root)

        # => Description
        question_html = to_html(root.get_element_by_id("STY_question"))
        self.description = question_html.strip()
@dataclass
class Choice:
    """A possible answer for a question, correct or not.

    Declared as real dataclass fields so that the generated `__eq__` and
    `__repr__` take the values into account (with a hand-written `__init__`
    and no fields, every instance compared equal and printed as `Choice()`).
    The constructor keeps the same parameters, order and defaults.
    """

    id: str = ""
    """A string identifier for the choice"""
    index: int = -1
    """The order the choice appears in"""
    is_correct: bool = False
    """Whether this choice is a correct answer"""
    html: str = ""
    """HTML content of the choice"""
    comment: "Comment | None" = None
    """A `Comment` associated with this choice, displayed when the exercise
    is incorrect and this choice is selected"""

    @override
    def __str__(self) -> str:
        # Only show the first 10 characters of the HTML to keep the output short
        return f"Choice(id='{self.id}', index={self.index}, is_correct={self.is_correct}, html='{self.html[:10]}')"
class ChoiceGroup:
    """A labelled group of `Choice` objects.

    Both attributes are only declared in the constructor (no value is
    assigned), so they must be set before being read.
    """

    def __init__(self):
        # Label of the group
        self.label: str
        # Choices belonging to the group
        self.items: list[Choice]
class ExerciceQC(Exercice):
    """Exercise made of a set of choices, each correct or not.

    Saved under the type name `ExerciceQC_QCM` or `ExerciceQC_QCU`
    depending on `is_qcm`.
    """

    def __init__(self, is_qcm: bool = False) -> None:
        super().__init__()
        self.is_qcm = is_qcm
        # Choices of the exercise, keyed by choice ID
        self.choices: dict[str, Choice] = {}

    @override
    def get_name(self) -> str:
        return "ExerciceQC_QCM" if self.is_qcm else "ExerciceQC_QCU"

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common exercise data, then the choices' position and
        HTML, and finally attach to each choice the comment it requested."""
        super().parse_html(root)

        # Find question choices
        for index, choice_node in enumerate(root.find_class("STY_reponseQC")):
            if Context.version == "macao_12":
                # Choices have an 'id' attribute in the form 'lienRepX'
                # where X is their index (starting at 1)
                choice_id = choice_node.attrib["id"].replace("lienRep", "")
            else:
                # Choices have an 'id' attribute in the form 'lienrepX' (lowercase)
                # where X is a number. The actual ID we're keeping is 'repX'.
                choice_id = choice_node.attrib["id"].replace("lien", "")
            choice = self.get_or_create_choice(choice_id)
            choice.index = index
            choice.html = to_html(choice_node).strip()

        # The activity's comments have already been extracted in Activity.parse_html(),
        # but some of them may be associated with a specific choice (this is
        # detected by the JS parser earlier).
        # Move these comments from the activity to their choice object.
        for choice in self.choices.values():
            if choice.comment is None:
                continue
            requested_id = choice.comment.id
            try:
                choice.comment = self.comments_sugg.pop(requested_id)
            except KeyError:
                log.warning(
                    f"Choice '{choice.id}' requested comment '{choice.comment.id}', which was not found in HTML."
                )
                # Drop the ID-only placeholder: it has no `html` attribute,
                # so keeping it would make save() fail with AttributeError.
                choice.comment = None

    @override
    def save(self, graph: Graph):
        """Save the common exercise data, then one `Reponse` node per choice."""
        super().save(graph)
        for choice in self.choices.values():
            # Node name is "<activity id>_<choice index>_<choice id>",
            # e.g. pg173_0_rep21
            rdf_name = f"{self.id}_{choice.index}_{choice.id}"
            display_name = rdf_name + " | " + ("V" if choice.is_correct else "F")
            choice_node = NS[rdf_name]

            graph.add((choice_node, RDF.type, NS["Reponse"]))
            graph.add((choice_node, NS["id"], Literal(choice.id)))
            graph.add((choice_node, NS["index"], Literal(choice.index)))
            graph.add((choice_node, NS["correct"], Literal(choice.is_correct)))
            graph.add((choice_node, NS["html"], Literal(choice.html)))

            # Save optional comment
            if choice.comment is not None:
                graph.add(
                    (choice_node, NS["commentaireSugg"], Literal(choice.comment.html))
                )

            graph.add(
                (
                    choice_node,
                    NS["__protege_display_name"],
                    Literal(display_name),
                )
            )
            graph.add((NS[self.id], NS["aReponse"], choice_node))
            # Our fake "class hierarchy" just for easier visualization
            graph.add((choice_node, RDFS.subClassOf, NS[self.id]))

    def set_correct(self, choice_id: str, correct: bool):
        """Set the choice with ID `choice_id` as correct or not, creating it if needed."""
        self.get_or_create_choice(choice_id).is_correct = correct

    def set_html(self, choice_id: str, html: str):
        """Set the `html` attribute for the choice with ID `choice_id`, creating it if needed."""
        self.get_or_create_choice(choice_id).html = html

    def get_or_create_choice(self, id: str) -> Choice:
        """Returns the choice with the `id`, creating it if needed."""
        if id not in self.choices:
            self.choices[id] = Choice(id)
        return self.choices[id]
class ExerciceQM(Exercice):
    """Exercise holding a list of `ChoiceGroup` questions.

    `questions` is only declared in the constructor (no value is assigned).
    """

    def __init__(self):
        super().__init__()
        # Declared only: must be assigned before being read
        self.questions: list[ChoiceGroup]
class ExerciceTAT(Exercice):
    """Exercise holding a text and a list of `ChoiceGroup` gaps.

    Both attributes are only declared in the constructor (no value is assigned).
    """

    def __init__(self):
        super().__init__()
        # Text of the exercise (can be HTML)
        self.text: str
        # Declared only: must be assigned before being read
        self.gaps: list[ChoiceGroup]
class ExerciceGD(Exercice):
    """Exercise holding a list of targets and lists of draggable `Choice` objects.

    Both attributes are only declared in the constructor (no value is assigned).
    """

    def __init__(self):
        super().__init__()
        # Declared only: must be assigned before being read
        self.targets: list[str]
        self.draggables: list[list[Choice]]
class JSParser(ABC):
    """Abstract interface of the parsers that read a page's JavaScript code.

    Deriving from `ABC` is what makes `@abstractmethod` effective: the class
    itself cannot be instantiated, and subclasses must implement `parse()`.
    """

    @abstractmethod
    def parse(self, js: str) -> "Activity":
        """Parse a string of JavaScript code and returns an instance of the
        correct `Activity` subclass, partially populated with data found in the code.
        """

    @override
    def __str__(self) -> str:
        # Parsers are identified by their class name (used in log messages)
        return type(self).__name__
class RegexParser(JSParser):
    """`JSParser` implementation that extracts data with regular expressions.

    Every failure to understand the code is reported as a `ParseError`,
    including an activity constructor whose type name is unknown.
    """

    # Activity constructor call: `<var> = new <Cours|Exercice...>(<args>);`
    _CONSTRUCTOR_RE = re.compile(
        r"""
        (\w+)                   # result variable name
        \s+=\s+new\s+           #
        (Cours|Exercice\w+)     # constructor name
        \((.*?)\);              # optional arguments between parentheses
        """,
        re.VERBOSE,
    )

    # Choices IDs and correctness
    # ( tinker with this regex: https://regex101.com/r/qAkdDD/2 )
    _ANSWERS_RE = re.compile(
        r"""
        var[ ](?P<varname>\w+)      # Capture variable name, referenced in 2nd line
        [ ]=[ ]new[ ]ItemReponse\(
        '(?P<id>\w+)'               # Constructor parameter : answer ID (obfuscated)
        \);\n\s*                    # New line and any indent
        (?P=varname)                # Back-reference to the variable name captured earlier
        \.init\(
        \"\d*?(?P<correct>\d)\"     # First parameter of "init" : correctness
                                    # (capture last digit only)
        (?:,\s*\"\w*\"){3}\);       # Skip 3 params""",
        re.VERBOSE,
    )

    # Choice-comment associations
    # ( tinker with this regex: https://regex101.com/r/qEzZ5R/1 )
    _COMMENTS_RE = re.compile(
        r"""
        var[ ](?P<varname>\w+)      # Capture variable name, referenced in 2nd line
        [ ]=[ ]
        '(?P<comment_id>\w+)'       # Constructor param : comment ID
        ;\n\s*                      # New line and any indent
        EXO_ajouterCommentaire\(
        (?P=varname)                # Back-reference to the variable name captured earlier
        (?:,\s*\"\w*\"){6}          # Skip 6 parameters
        ,[ ]\"(?P<choice_id>\w+)\"  # 8th parameter : choice ID
        (?:,\s*\"\w*\"){10}         # Skip 10 parameters
        \);""",
        re.VERBOSE,
    )

    def __init__(self, graph: Graph, act_id: str) -> None:
        # Kept on the instance; not used by the methods of this class
        self.graph = graph
        # ID of the activity being parsed, used to prefix log messages
        self.act_id = act_id

    @override
    def parse(self, js: str) -> Activity:
        """Parse the body of the `entrerDonnees` function found in `js`.

        Raises:
            ParseError: if the function, the activity constructor or (for
                obfuscated QC exercises) the total score cannot be found, or
                if the activity type is unknown.
        """
        # Find function declaration and only keep code after it
        func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
        if len(func_split) < 2:
            raise ParseError("Failed to find function 'entrerDonnees'")
        body = func_split[1]

        activity, _ = self._parse_activity_constructor(body)
        if isinstance(activity, ExerciceQC):
            # Parse correct answers
            self._parse_qc_answers(body, activity)
        return activity

    def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
        """
        Find activity constructor call, return the activity type
        and resulting variable name.

        Raises:
            ParseError: if no constructor call is found, or if the
                constructor name is not a known activity type.
        """
        constructor_match = self._CONSTRUCTOR_RE.search(code)
        if constructor_match is None:
            raise ParseError("Failed to parse activity constructor")
        var_name, act_type, args = constructor_match.groups()

        try:
            activity = Activity.from_typename(act_type)
        except NameError as e:
            # The regex accepts any `Exercice\w+` name, but only some of them
            # are implemented. Report the others as a parsing failure so that
            # callers handling ParseError can fall back gracefully.
            raise ParseError(f"Unknown activity type '{act_type}'") from e

        # Handle case of QC variants
        if isinstance(activity, ExerciceQC) and args == '"QCM"':
            activity.is_qcm = True
        return activity, var_name

    def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
        """Parse the correct answers for a QCU activity"""
        if Context.version == "macao_12":
            self._parse_qc_answers_macao12(code, exo)
        else:
            self._parse_qc_answers_obfuscated(code, exo)
        # NOTE(review): the source this was restored from had lost its
        # indentation, so it is unclear whether this step originally ran for
        # every version or only in the `else` branch above. Confirm, and move
        # this call into the branch if needed.
        self._parse_qc_comments(code, exo)

    def _parse_qc_answers_macao12(self, code: str, exo: ExerciceQC) -> None:
        """Read correctness line by line: a `var nr = N;` line selects the
        choice, the following `CODE_F` / `CODE_V` lines set its correctness."""
        choice_id = "0"
        for line in code.splitlines():
            line = line.strip()
            m = re.match(r"var nr = (\d+);", line)
            if m is not None:
                # "index" line
                choice_id = m.group(1)
            elif line == "exo.tabStylesR[nr] = CODE_F;":
                # "incorrect answer" line
                exo.set_correct(choice_id, False)
            elif line == "exo.tabStylesR[nr] = CODE_V;":
                # "correct answer" line
                exo.set_correct(choice_id, True)

    def _parse_qc_answers_obfuscated(self, code: str, exo: ExerciceQC) -> None:
        """Read choices IDs and correctness from `ItemReponse` declarations."""
        answers = list(self._ANSWERS_RE.finditer(code))

        # Yet another layer of obfuscation: correct/incorrect are inverted
        # depending on the total score and number of answers
        # (see ClasseExerciceQC.js:86)
        score = self._parse_score(code)
        is_inverted = ((len(answers) + score) % 2) == 1

        for answer in answers:
            # Answer ID is obfuscated by changing some digits
            choice_id = decode_answer_id(answer.group("id"))
            choice = exo.get_or_create_choice(choice_id)
            choice.is_correct = (answer.group("correct") == "1") != is_inverted

    def _parse_qc_comments(self, code: str, exo: ExerciceQC) -> None:
        """Attach to each choice the ID of the comment associated with it."""
        for association in self._COMMENTS_RE.finditer(code):
            choice_id = association.group("choice_id")
            comment_id = association.group("comment_id")
            try:
                choice = exo.choices[choice_id]
            except KeyError:
                log.warning(
                    f"{self.act_id}: '{comment_id}' requested choice ID '{choice_id}', which doesn't exist"
                )
                continue
            # Save a Comment object with just the ID, other fields will be
            # filled at the HTML parsing stage
            choice.comment = Comment(comment_id)

    def _parse_score(self, code: str) -> int:
        """Parse the activity's 'total score' variable

        Raises:
            ParseError: if the variable is not found or is not a valid integer.
        """
        m = re.search(r"exo\.scoreTotal ?= ?(\d+);", code)
        if m is None:
            raise ParseError("Failed to parse total score for this activity")
        try:
            return int(m.group(1))
        except ValueError as e:
            raise ParseError("Failed to parse total score for this activity") from e
# Digit substitution table used by `decode_answer_id`:
# 3->0, 8->1, 7->2, 9->3, 1->7, 0->8, 2->9 (every other character is kept)
_ANSWER_ID_DECODE_TABLE = str.maketrans("3879102", "0123789")


def decode_answer_id(id: str) -> str:
    """
    Decode an obfuscated answer ID, just like the `decodeX()` function
    in `ClasseExerciceQC.js`.

    Some digits are swapped according to a fixed table. Digits 4, 5, 6 and
    all non-digit characters are returned unchanged.
    """
    # Single pass over the string instead of repeated concatenation
    return id.translate(_ANSWER_ID_DECODE_TABLE)
# Regex to separate non-digits and digits: group 1 is the leading run of
# non-digit characters, group 2 the run of digits that follows.
# Both groups may be empty, so `match()` succeeds on any string.
regex_comment = re.compile(r"(\D*)(\d*)")
def parse_page(graph: Graph, filepath: str, id: str):
    """Extract the activity contained in the page `filepath` and save it to `graph`.

    Activity data is spread across HTML and JS code, which are parsed
    differently. Some pieces of data are specific to the activity type
    (Cours, ExerciceQC...) and this type is in the JS portion, so the JS code
    is parsed first to get the type, then the HTML to get the rest.

    If the JS code cannot be parsed, the error is logged and the page is
    handled as a generic `Activity`.
    """
    # We still need to find the inline scripts before parsing them
    root = html.parse(filepath).getroot()

    # Collect all inline scripts (no external 'src') and join them in a
    # block of JS code
    inline_scripts: list[HtmlElement] = root.xpath(
        '/html/head/script[@type="text/javascript" and not(@src)]'
    )
    js = "\n".join(script.text_content() for script in inline_scripts)

    parser = RegexParser(graph, id)
    try:
        activity: Activity = parser.parse(js)
    except ParseError as e:
        log.error(
            f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity."
        )
        activity = Activity()
    activity.id = id

    # Parse the HTML portion
    activity.parse_html(root)

    # Save everything to the graph
    activity.save(graph)