diff --git a/tetras_extraction/macao_12/script/.vscode/extensions.json b/tetras_extraction/macao_12/script/.vscode/extensions.json new file mode 100644 index 0000000000000000000000000000000000000000..9e75ef5179cf4e5c6a85ca0508198fe49c76f4ed --- /dev/null +++ b/tetras_extraction/macao_12/script/.vscode/extensions.json @@ -0,0 +1,6 @@ +{ + "recommendations": [ + "detachhead.basedpyright", + "ms-python.black-formatter" + ] +} \ No newline at end of file diff --git a/tetras_extraction/macao_12/script/common.py b/tetras_extraction/macao_12/script/common.py index 83512e85181b9933c1f7660f4ede92804f320032..e9aa55a848fde8c39ba901a525540c5455a4d496 100644 --- a/tetras_extraction/macao_12/script/common.py +++ b/tetras_extraction/macao_12/script/common.py @@ -1,9 +1,10 @@ +from os import environ, path from sys import stderr -from typing import Any, Optional -from rdflib import RDFS, Graph, Literal, URIRef -from rdflib import Namespace -from os import path, environ +from typing import Any + from lxml import html +from rdflib import Graph, Literal, RDFS, URIRef +from rdflib import Namespace def env_path_or_rel_default(env_var: str, default: str) -> str: @@ -36,19 +37,17 @@ NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/") # Utility functions ############################################################ -def eprint(*args, **kwargs): +def eprint(*args, **kwargs): # pyright: ignore[reportMissingParameterType] """Just like `print()`, but to standard error instead of standard output""" print(*args, file=stderr, **kwargs) def to_html(elem: html.HtmlElement) -> str: """Shorthand function to serialise a `HtmlElement` to a HTML string""" - return html.tostring(elem, encoding="unicode") # type: ignore - # Type checkers complain because `tostring()` formally returns `bytes`, - # but with encoding="unicode" it actually returns a `str`. + return html.tostring(elem, encoding="unicode") -def insert_grow(l: list, index: int, value: Any, fill_value: Optional[Any] = None): +def insert_grow(l: list[Any], index: int, value: Any, fill_value: Any | None = None): """Insert at a given position in a list, growing it if necessary :param l: list diff --git a/tetras_extraction/macao_12/script/extract.py b/tetras_extraction/macao_12/script/extract.py index 34ff07826ab8e0c4e48b8e8833292e425713a5b9..196848dd2e287857096adc9a7ae08c025f9b22ab 100644 --- a/tetras_extraction/macao_12/script/extract.py +++ b/tetras_extraction/macao_12/script/extract.py @@ -1,6 +1,4 @@ import filecmp -from pprint import pprint -from typing import Optional from lxml import etree from rdflib import RDFS, Graph, Literal, URIRef @@ -80,8 +78,8 @@ def parse_manifest(graph: Graph): def parse_manifest_rec( graph: Graph, elem: etree._Element, - parentResource: Optional[URIRef] = None, - index: Optional[int] = None, + parentResource: URIRef | None = None, + index: int | None = None, ): """Parses a module `MosMod` from the manifest recursively, adding all its descendants to the `graph` @@ -123,7 +121,7 @@ def parse_manifest_rec( extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id) -def compare_files(f1, f2): +def compare_files(f1: str, f2: str): print( "Files {} and {} {}.".format( f1, f2, "are identical" if filecmp.cmp(f1, f2) else "differ" diff --git a/tetras_extraction/macao_12/script/extract_mosetp.py b/tetras_extraction/macao_12/script/extract_mosetp.py index a68e45f9cc245962fa47b68f05deaca6e54546b1..426a5c9701f28bf2f87ecffecaacf50e77c62693 100644 --- a/tetras_extraction/macao_12/script/extract_mosetp.py +++ b/tetras_extraction/macao_12/script/extract_mosetp.py @@ -1,11 +1,11 @@ +from os import path import re import subprocess -from os import path -from rdflib import OWL, RDF, RDFS, Graph, Literal +from rdflib import Graph, Literal, OWL, RDF, RDFS -from extract_page import parse_page from common import * +from extract_page import parse_page def generate_triples( diff --git a/tetras_extraction/macao_12/script/extract_page.py b/tetras_extraction/macao_12/script/extract_page.py index e9b2691dd66c4bc14c2d26872dedd7a205501e0d..d69f0e4a3a9eefe80fe128396336f7086d3c1829 100644 --- a/tetras_extraction/macao_12/script/extract_page.py +++ b/tetras_extraction/macao_12/script/extract_page.py @@ -1,15 +1,14 @@ -import re -import sys from abc import abstractmethod from dataclasses import dataclass -from pprint import pprint -from typing import Any, List, Optional +import re +from typing import Any, Callable import esprima as es from lxml import etree, html from lxml.etree import _Element from lxml.html import HtmlElement -from rdflib import RDF, Graph, Literal +from rdflib import Graph, Literal, RDF +from typing_extensions import override from common import * @@ -22,6 +21,7 @@ class Comment: self.html: Any self.elem: _Element + @override def __repr__(self): return str(self.__dict__) @@ -32,18 +32,19 @@ class Activity: """The ID of the page this activity is in (`pg###`)""" self.title: str = "" """Human-readable title of the activity""" - self.description: Optional[str] = None + self.description: str | None = None """Description of the activity's body (HTML), e.g. the instructions for an exercise activity""" - self.comment_consigne: Optional[Comment] = None + self.comment_consigne: Comment | None = None """Another form of activity description but in a comment. May or may not coexist with a regular description""" - self.comment_success: Optional[Comment] = None + self.comment_success: Comment | None = None """Comment displayed on success, if applicable""" - self.comments_sugg: List[Comment] = [] + self.comments_sugg: list[Comment] = [] """Help comments displayed on failure, if applicable""" - self.comments_misc: List[Comment] = [] + self.comments_misc: list[Comment] = [] """Any other comments, if present""" + self.ref: URIRef def save(self, graph: Graph): """Save activity data to the graph. Subclasses may override this method @@ -120,11 +121,13 @@ class Activity: case _: raise NameError(name=name) + @override def __repr__(self): return self.get_name() + str(self.__dict__) class Cours(Activity): + @override def parse_html(self, root: HtmlElement): super().parse_html(root) # => Description @@ -133,6 +136,7 @@ class Cours(Activity): class Exercice(Activity): + @override def parse_html(self, root: HtmlElement): super().parse_html(root) # => Description @@ -144,8 +148,8 @@ class Exercice(Activity): class Choice: """A possible answer for a question, correct or not""" - def __init__(self, index=0): - self.index: int = index + def __init__(self, index: int = 0): + self.index = index self.is_correct: bool = False self.html: str = "" @@ -153,18 +157,20 @@ class Choice: class ChoiceGroup: def __init__(self): self.label: str - self.items: List[Choice] + self.items: list[Choice] class ExerciceQC(Exercice): - def __init__(self, is_qcm=False) -> None: + def __init__(self, is_qcm: bool = False) -> None: super().__init__() - self.is_qcm: bool = is_qcm - self.choices: List[Choice] = [] + self.is_qcm = is_qcm + self.choices: list[Choice] = [] + @override def get_name(self) -> str: return "ExerciceQC_QCM" if self.is_qcm else "ExerciceQC_QCU" + @override def parse_html(self, root: HtmlElement): super().parse_html(root) # Find question choices @@ -174,6 +180,7 @@ class ExerciceQC(Exercice): index = int(choice.attrib["id"].replace("lienRep", "")) self.set_html(index - 1, to_html(choice).strip()) + @override def save(self, graph: Graph): super().save(graph) for choice in self.choices: @@ -213,31 +220,32 @@ class ExerciceQC(Exercice): class ExerciceQM(Exercice): def __init__(self): super().__init__() - self.questions: List[ChoiceGroup] + self.questions: list[ChoiceGroup] class ExerciceTAT(Exercice): def __init__(self): super().__init__() self.text: str # can be HTML - self.gaps: List[ChoiceGroup] + self.gaps: list[ChoiceGroup] class ExerciceGD(Exercice): def __init__(self): super().__init__() - self.targets: List[str] - self.draggables: List[List[Choice]] + self.targets: list[str] + self.draggables: list[list[Choice]] class JSParser: @abstractmethod - def parse(self, js, output=sys.stdout) -> Activity: + def parse(self, js: str) -> Activity: """Parse a string of JavaScript code and returns an instance of the correct `Activity` subclass, partially populated with data found in the code. """ pass + @override def __str__(self) -> str: return type(self).__name__ @@ -247,7 +255,8 @@ class RegexParser(JSParser): self.graph = graph self.act_id = act_id - def parse(self, js) -> Activity: + @override + def parse(self, js: str) -> Activity: # Find function declaration and only keep code after it func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js) if len(func_split) < 2: @@ -318,8 +327,14 @@ class XpathParser(JSParser): '*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]' ) - def parse(self, js) -> Activity: - jstree = es.parseScript(js, None) + def __init__(self) -> None: + self.fun: Any + """AST element corresponding to the function we're interested in. + Initialised in `self.parse()`.""" + + @override + def parse(self, js: str) -> Activity: + jstree: Any = es.parseScript(js, None) # Convert Esprima object tree to XML etree xml = self.to_xml(jstree.toDict(), "jstree") try: @@ -358,7 +373,7 @@ class XpathParser(JSParser): # "correct" or "incorrect" line activity.set_correct(index, e.get("name") == "CODE_V") - def to_xml(self, obj, tag_name: Optional[str] = None): + def to_xml(self, obj: Any, tag_name: str | None = None): """Recursively convert an object structure to an XML `ElementTree`. Structures are expected to be Python dictionaries. Converting a dictionary produces a tag named after the "type" attribute (if present). @@ -420,10 +435,10 @@ class MatchParser(JSParser): def __init__(self, graph: Graph, act_id: str) -> None: self.graph = graph self.act_id = act_id - self.activity: Optional[Activity] = None + self.activity: Activity | None = None - def parse(self, js, output=sys.stdout) -> Activity: - self.output = output + @override + def parse(self, js: str) -> Activity: jstree = es.parseScript(js, None) # Try to match our template with one of the top-level statements for statement in jstree.body: @@ -434,7 +449,7 @@ class MatchParser(JSParser): else: raise ParseError("No activity constructor found") - def match_constructor_call(self, new_expr: dict): + def match_constructor_call(self, new_expr: dict[str, Any]): if self.activity is not None: # Ignore anything after the first match return match new_expr: @@ -466,7 +481,7 @@ class MatchParser(JSParser): case _: pass - def match_function(self, func: dict): + def match_function(self, func: dict[str, Any]): """Checks if `func` matches a function declaration named `entrerDonnees`, and search its body if successful """ @@ -481,9 +496,11 @@ class MatchParser(JSParser): for statement in body: # Find constructor calls (e.g. `new Thing()`) recursively recurse_prefix(statement, self.match_constructor_call) + case _: + pass -def recurse_prefix(t, f): +def recurse_prefix(t: Any, f: Callable[[Any], None]): """Depth-first prefixed recursion: calls a function on an object, then on all its children (if it's a list or dictionary) recursively @@ -515,7 +532,7 @@ def parse_page(graph: Graph, filepath: str, id: str): root = tree.getroot() # Collect all inline scripts (no external 'src') and join them in a # block of JS code - scripts: List[HtmlElement] = root.xpath( + scripts: list[HtmlElement] = root.xpath( '/html/head/script[@type="text/javascript" and not(@src)]' ) js = "\n".join((s.text_content() for s in scripts)) diff --git a/tetras_extraction/macao_12/script/pyrightconfig.json b/tetras_extraction/macao_12/script/pyrightconfig.json new file mode 100644 index 0000000000000000000000000000000000000000..5f26bcacd410c0a839a66723813248d458722e6d --- /dev/null +++ b/tetras_extraction/macao_12/script/pyrightconfig.json @@ -0,0 +1,14 @@ +{ + "reportMissingTypeStubs": "information", + "reportUnusedCallResult": "none", + "reportUnusedVariable": "warning", + "reportUnusedImport": "warning", + "reportMissingParameterType": "warning", + "reportMissingArgumentType": "warning", + "reportPrivateUsage": "none" /* lxml.etree often returns _Element */, + "reportUnknownParameterType": "none", + "reportUnknownArgumentType": "none", + "reportUnknownVariableType": "none", + "reportUnknownMemberType": "none", + "reportAny": "none" +} \ No newline at end of file