# Select Git revision
# extract_page.py 12.93 KiB
import re
import sys
from pprint import pprint
from typing import Any, List, Optional
import esprima as es
from lxml import etree, html
from lxml.etree import _Element
from rdflib import RDF, Graph, Literal
from common import *
class Comment:
    """A comment block found in a page.

    Instances start out empty: every attribute below is assigned after
    construction by the code that extracts the comment.
    """

    id: str  # "id" attribute of the source element
    num: int  # number taken from the digits of the id
    text: str  # text content of the element
    html: Any  # serialized markup of the element
    elem: _Element  # the source element itself

    def __repr__(self):
        # Show whichever attributes have been assigned so far
        return repr(vars(self))
class Page:
    """Data gathered about one page.

    Only the two comment lists are initialized by the constructor; the other
    attributes exist once something assigns them.
    """

    id: str
    title: str
    type: str  # course or exercise
    comment_success: Comment
    comments_sugg: List[Comment]
    comments_misc: List[Comment]

    def __init__(self) -> None:
        # Two distinct empty lists, filled as comments are discovered
        self.comments_sugg, self.comments_misc = [], []

    def __repr__(self):
        # Show whichever attributes have been assigned so far
        return repr(vars(self))
class ParseError(Exception):
    """Raised by the parsers when the JS code of an activity cannot be
    interpreted."""
class RegexParser:
    """A parser for the JS portion of an activity, that applies regular
    expressions directly to the source text"""

    # First constructor argument of `ExerciceQC`: a quoted QCU/QCM string
    # literal (either quote style), optionally followed by more arguments
    _QC_VARIANT = re.compile(r"""\s*(["'])(QCU|QCM)\1\s*(?:,|$)""")

    # Line assigning the index of the answer described by the next lines
    _ANSWER_INDEX = re.compile(r"var nr = (\d+);")

    def parse(self, js, output=sys.stdout):
        """Find the activity type in `js` and print it to `output`.

        For a QCU exercise, the list of correct answers is printed after it.

        :param js: JavaScript source code of the activity
        :param output: Text stream receiving the result
        :raises ParseError: if the function `entrerDonnees` or the activity
            constructor call cannot be found
        """
        # Find function declaration and only keep code after it
        func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
        if len(func_split) < 2:
            raise ParseError("Failed to find function 'entrerDonnees'")
        body = func_split[1]
        activity_type, _ = self._parse_activity_constructor(body)
        print(activity_type, end="", file=output)
        if activity_type == "ExerciceQC_QCU":
            print(" ", self._parse_qcu_answers(body), end="", file=output)

    def _parse_activity_constructor(self, code: str) -> tuple[str, str]:
        """
        Find activity constructor call, return the activity type
        and resulting variable name.

        :raises ParseError: if no constructor call is found
        """
        constructor_match = re.search(
            r"""
            (\w+)                   # result variable name
            \s+=\s+new\s+           #
            (Cours|Exercice\w+)     # constructor name
            \((.*?)\);              # optional arguments between parentheses
            """,
            code,
            re.VERBOSE,
        )
        if constructor_match is None:
            raise ParseError("Failed to parse activity constructor")
        var_name, act_type, args = constructor_match.groups()
        # Handle case of QC variants. The literal may be written with single
        # or double quotes, padded with whitespace, or followed by further
        # arguments: all of these denote the same JS value.
        if act_type == "ExerciceQC":
            variant = self._QC_VARIANT.match(args)
            if variant is not None:
                act_type += "_" + variant.group(2)
        return act_type, var_name

    def _parse_qcu_answers(self, code: str) -> list[bool]:
        """Parse the correct answers for a QCU activity, as a list of booleans"""
        correct_choices = []
        index = 0
        for line in code.splitlines():
            line = line.strip()
            m = self._ANSWER_INDEX.match(line)
            if m is not None:
                # "index" line: remember which answer the next lines refer to
                index = int(m.group(1))
            elif line == "exo.tabStylesR[nr] = CODE_F;":
                # "incorrect answer" line
                insert_grow(correct_choices, index, False, fill_value=False)
            elif line == "exo.tabStylesR[nr] = CODE_V;":
                # "correct answer" line
                insert_grow(correct_choices, index, True, fill_value=False)
        return correct_choices

    def __str__(self) -> str:
        return "RegexParser"
class XpathParser:
    """A parser for the JS portion of an activity, that uses XPath to query
    an XML representation of Esprima's abstract syntax tree (AST)"""

    # XPath requests pre-compiled as functions
    # Declaration of the function named "entrerDonnees"
    request_function = etree.XPath(
        '//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]'
    )
    # Literal initializers of variables named "nr", and assignments that
    # involve the identifier "CODE_V"
    request_index_and_values = etree.XPath(
        '*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression[*//Identifier[@name="CODE_V"]]'
    )
    # Identifier of a constructor call to "Cours" or "Exercice..."
    request_constructor_id = etree.XPath(
        '*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]'
    )

    def parse(self, js, output=sys.stdout):
        """Find the activity type in `js` and print it to `output`.

        For a QCU exercise, the list of correct answers is printed after it.

        :param js: JavaScript source code of the activity
        :param output: Text stream receiving the result
        :raises ParseError: if the syntax tree does not have the expected
            structure
        """
        jstree = es.parseScript(js, None)
        # Convert Esprima object tree to XML etree
        xml = self.to_xml(jstree.toDict(), "jstree")
        try:
            functions = self.request_function(xml)
            if not functions:
                raise ParseError("Failed to find function 'entrerDonnees'")
            self.fun = functions[0]
            act_type = self._parse_activity_type()
            print(act_type, end="", file=output)
            if act_type == "ExerciceQC_QCU":
                print(" ", self._parse_qcu_answers(), end="", file=output)
        except ParseError:
            # Already the right type: do not wrap it a second time
            raise
        except Exception as e:
            # Any other failure is reported as a parsing error, keeping the
            # original exception as its explicit cause
            raise ParseError(e) from e

    def _parse_activity_type(self) -> str:
        """Return the activity type found in the function stored in `self.fun`.

        :raises ParseError: if no constructor call is found, or if an
            `ExerciceQC` call has no valid QCU/QCM argument
        """
        constructors = self.request_constructor_id(self.fun)
        if not constructors:
            raise ParseError("Failed to find activity constructor")
        constructor_id = constructors[0]
        name = constructor_id.get("name")
        if name != "ExerciceQC":
            return name
        # QC variants: look at the literal arguments of the constructor call
        args = constructor_id.xpath("../../arguments/Literal/@value")
        if not args:
            raise ParseError("ExerciceQC: missing literal argument")
        arg = args[0]
        if arg not in ("QCU", "QCM"):
            raise ParseError(f"ExerciceQC: invalid argument '{arg}'")
        return f"ExerciceQC_{arg}"

    def _parse_qcu_answers(self) -> list[bool]:
        """Parse the correct answers for a QCU activity, as a list of booleans"""
        indexes_and_values = self.request_index_and_values(self.fun)
        correct_choices = []
        index = 0
        for e in indexes_and_values:
            value = e.xpath("@value")
            if value:
                # "index line": a literal carrying the answer index
                index = int(value[0])
            else:
                # "true line": an assignment marking the current answer as correct
                insert_grow(correct_choices, index, True, fill_value=False)
        return correct_choices

    def to_xml(self, obj, tag_name: Optional[str] = None):
        """Recursively convert an object structure to an XML `ElementTree`.
        Structures are expected to be Python dictionaries.
        Converting a dictionary produces a tag named after the "type" attribute (if present).
        - A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
        - A list attribute becomes a tag with its contents as sub-tags.
        - A dictionary attribute becomes a tag (named like the attribute's key)
          containing a sub-tag for the dictionary itself

        :param obj: The object to convert
        :param tag_name: Name of the enclosing tag, if any
        :return: The root element of the converted structure
        """
        if isinstance(obj, dict):
            # Dictionary (or object):
            # - if it has a "type" key, the dict represents an object -> use its value as the tag name
            # - if a tag_name is specified as well, it's probably important (like an attribute name),
            #   so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
            has_inner = "type" in obj
            inner_tag = etree.Element(obj["type"] if has_inner else "_dict")
            if tag_name is None:
                outer_tag = inner_tag
            else:
                outer_tag = etree.Element(tag_name)
                if has_inner:
                    outer_tag.append(inner_tag)
                else:
                    # No type: the named tag receives the items directly
                    inner_tag = outer_tag
            # Recurse on dictionary items
            for key, val in obj.items():
                if key == "type":
                    # Already used as the tag name
                    continue
                if isinstance(val, (list, dict)):
                    # Structured attributes become child tags
                    inner_tag.append(self.to_xml(val, key))
                else:
                    # Primitive attributes become tag attributes
                    inner_tag.set(key, str(val))
            return outer_tag
        if isinstance(obj, list):
            list_tag = etree.Element(tag_name or "_list")
            for e in obj:
                list_tag.append(self.to_xml(e))
            return list_tag
        # Anything else is a leaf, stored as the text of its own tag
        leaf_tag = etree.Element(tag_name or "_literal")
        leaf_tag.text = str(obj)
        return leaf_tag

    def __str__(self) -> str:
        return "XpathParser"
class MatchParser:
"""A parser for the JS portion of an activity, that uses Python match statements
to navigate the abstract syntax tree (AST) produced by Esprima"""
def __init__(self, graph: Graph, act_id: str) -> None:
self.graph = graph
self.act_id = act_id
def parse(self, js, output=sys.stdout):
self.output = output
jstree = es.parseScript(js, None)
# Try to match our template with one of the top-level statements
for statement in jstree.body:
self.match_function(statement.toDict())
def match_constructor_call(self, new_expr: dict):
match new_expr:
case {
"type": "NewExpression",
"callee": {
"type": "Identifier",
"name": typ,
},
"arguments": [*args],
}:
match typ:
case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD":
self.print(typ)
self.graph.add((NS[self.act_id], RDF.type, NS[typ]))
case "ExerciceQC":
match args:
case [{"type": "Literal", "value": "QCU"}, *_]:
typ += "_QCU"
case [{"type": "Literal", "value": "QCM"}, *_]:
typ += "_QCM"
case _:
raise ParseError(
f"ExerciceQC: Invalid argument '{args}'"
)
self.print(typ)
self.graph.add((NS[self.act_id], RDF.type, NS[typ]))
case _:
raise ParseError(f"Unknown activity type '{typ}'")
case _:
pass
def print(self, s: str):
print(s, end="", file=self.output)
def match_function(self, func: dict):
"""Checks if `func` matches a function declaration named `entrerDonnees`,
and search its body if successful
"""
match func:
case {
"type": "FunctionDeclaration",
"id": {"name": "entrerDonnees"},
"body": {"type": "BlockStatement", "body": body},
}:
# Matched a function declaration and captured its `body` attr
for statement in body:
# Find constructor calls (e.g. `new Thing()`) recursively
recurse_prefix(statement, self.match_constructor_call)
def __str__(self) -> str:
return "MatchParser"
def recurse_prefix(t, f):
    """Pre-order depth-first traversal: apply a function to an object, then
    to each of its children in turn (elements of a list, values of a
    dictionary), recursively

    :param t: The object
    :param f: The function to call
    """
    f(t)
    if isinstance(t, dict):
        children = t.values()
    elif isinstance(t, list):
        children = t
    else:
        # Leaf: nothing to descend into
        return
    for child in children:
        recurse_prefix(child, f)
# Regex splitting a string into a leading run of non-digits and the run of
# digits that follows it (either group may be empty, so it always matches)
regex_comment = re.compile(r"(\D*)(\d*)")
def parse_page(graph: Graph, filepath: str, id: str):
page = Page()
# Parse with lxml
tree = html.parse(filepath)
root = tree.getroot()
# Collect all inline scripts (no external 'src') and join them in a
# block of JS code
# scripts = root.cssselect('script[type="text/javascript"]:not([src])')
scripts: List[_Element] = root.xpath(
'/html/head/script[@type="text/javascript" and not(@src)]'
)
js = "\n".join((s.text_content() for s in scripts))
# Try different parsers, each writing to a different file to compare their results
for parser in [XpathParser(), MatchParser(graph, id), RegexParser()]:
with open(f"/tmp/{str(parser)}.txt", "a") as f:
print(f"\n{id:8}", end="", file=f)
try:
parser.parse(js, output=f)
except ParseError as e:
print(f"{parser} -> {id}: Parsing error: {e}", file=sys.stderr)
# Parse comments
zi = root.get_element_by_id("zoneInvisible")
for cmt_div in zi:
comment = Comment()
comment.text = cmt_div.text_content()
comment.html = html.tostring(cmt_div, encoding="unicode")
comment.elem = cmt_div
comment.id = cmt_div.get("id") or ""
# Split id in two parts (non-digits and digits), then match on these parts
m = regex_comment.match(comment.id)
if m is not None:
match m.groups():
case ["divCmt", num]:
comment.num = int(num)
page.comments_misc.append(comment)
graph.add((NS[id], NS["commentaireInfo"], Literal(comment.html)))
case ["divSugg", num]:
comment.num = int(num)
page.comments_sugg.append(comment)
graph.add((NS[id], NS["commentaireSugg"], Literal(comment.html)))
case ["divCmtSucces", _]:
page.comment_success = comment
graph.add((NS[id], NS["commentaireSucces"], Literal(comment.html)))
case [other, _]:
pass
# pprint(page)