Skip to content
Snippets Groups Projects
Select Git revision
  • 593b40ba66e8bb5104a187659bb25f2361c545d5
  • mui5-annotation-on-video-stable default
  • get_setter_canvasSizeInformations
  • fix-error-div-into-p
  • annotation-on-video-v2
  • detached
  • annotation-on-video-r17
  • mui5
  • mui5-react-18
  • jacob-test
  • annotation-on-video protected
  • master
  • test-antoinev1
  • 20-fetch-thumbnail-on-annotation
  • add-research-field
  • Save
  • add-plugin
  • 14-wip-no-seek-to
  • 14-bug-on-video-time-control
  • 9_wip_videotests
  • _upgrade_material_ui
  • latest-tetras-16
  • v3.3.0
  • v3.2.0
  • v3.1.1
  • v3.1.0
  • v3.0.0
  • v3.0.0-rc.7
  • v3.0.0-rc.6
  • v3.0.0-rc.5
  • v3.0.0-rc.4
  • v3.0.0-rc.3
  • v3.0.0-rc.2
  • v3.0.0-rc.1
  • v3.0.0-beta.10
  • v3.0.0-beta.9
  • v3.0.0-beta.8
  • v3.0.0-beta.7
  • v3.0.0-beta.6
  • v3.0.0-beta.5
  • v3.0.0-beta.3
41 results

WindowAuthenticationControl.js

Blame
  • extract_page.py 18.19 KiB
    import re
    from abc import ABC, abstractmethod
    from dataclasses import dataclass
    from typing import Any

    from lxml import html
    from lxml.etree import _Element
    from lxml.html import HtmlElement
    from rdflib import RDF, Graph, Literal
    from typing_extensions import override

    from common import *
    
    # Module-level logger named "extract_page", used for the warnings and errors
    # reported while parsing pages (`get_logger` is presumably provided by the
    # star import from `common` -- it is not defined in this file).
    log = get_logger("extract_page")
    
    
    class Comment:
        def __init__(self, id: str = ""):
            self.id = id
            self.num: int
            self.text: str
            self.html: Any
            self.elem: _Element
    
        @override
        def __repr__(self):
            return str(self.__dict__)
    
    
    class Activity:
        def __init__(self):
            self.id: str = ""
            """The ID of the page this activity is in (`pg###`)"""
            self.title: str = ""
            """Human-readable title of the activity"""
            self.description: str | None = None
            """Description of the activity's body (HTML),
            e.g. the instructions for an exercise activity"""
            self.comment_consigne: Comment | None = None
            """Another form of activity description but in a comment. May or may not
            coexist with a regular description"""
            self.comment_success: Comment | None = None
            """Comment displayed on success, if applicable"""
            self.comments_sugg: dict[str, Comment] = {}
            """Help comments displayed on failure, if applicable (keyed by ID)"""
            self.comments_misc: list[Comment] = []
            """Any other comments, if present"""
            self.ref: URIRef
    
        def save(self, graph: Graph):
            """Save activity data to the graph. Subclasses may override this method
            to save their specific data."""
            self.ref = NS[self.id]
            # => Type
            graph.add((self.ref, RDF.type, NS[self.get_name()]))
            # => Title
            set_title(graph, self.ref, self.title)
            # => Description
            description = self.description or ""
            if self.comment_consigne is not None:
                description += self.comment_consigne.html
            if description != "":
                graph.add((self.ref, NS["description"], Literal(description)))
            # => Comments
            if self.comment_success is not None:
                graph.add(
                    (self.ref, NS["commentaireSucces"], Literal(self.comment_success.html))
                )
            for comment in self.comments_sugg.values():
                graph.add((self.ref, NS["commentaireSugg"], Literal(comment.html)))
            for comment in self.comments_misc:
                graph.add((self.ref, NS["commentaireInfo"], Literal(comment.html)))
    
        def parse_html(self, root: HtmlElement):
            """From a `lxml.html` parsing tree, extract all data relevant to this class.
            Subclasses may override this method to extract more specific data.
            """
            # => Title
            self.title = root.xpath("/html/head/title")[0].text
            # => Comments
            zi = root.get_element_by_id("zoneInvisible")
            for cmt_div in zi:
                comment = Comment()
                comment.text = cmt_div.text_content()
                comment.html = to_html(cmt_div)
                comment.elem = cmt_div
                comment.id = cmt_div.get("id") or ""
                # Split id in two parts (non-digits and digits), then match on these parts
                m = regex_comment.match(comment.id)
                if m is not None:
                    match m.groups():
                        case ["divCmt", num]:
                            comment.num = int(num)
                            self.comments_misc.append(comment)
                        case ["divSugg", num]:
                            comment.num = int(num)
                            self.comments_sugg[comment.id] = comment
                        case ["divCmtSucces", _]:
                            self.comment_success = comment
                        case ["divConsigne", _]:
                            self.comment_consigne = comment
                        case alpha, num:
                            log.warning(
                                f"No match for comment {alpha}[{num}] ('{comment.id}')"
                            )
                        case something:
                            log.warning(f"No match for comment '{something}'")
    
        def get_name(self) -> str:
            return type(self).__name__
    
        @classmethod
        def from_typename(cls, name: str):
            """Convenience function to create an `Activity` subclass from a name"""
            match name:
                case "Cours":
                    return Cours()
                case "ExerciceQC":
                    return ExerciceQC()
                case "ExerciceQM":
                    return ExerciceQM()
                case "ExerciceTAT":
                    return ExerciceTAT()
                case "ExerciceGD":
                    return ExerciceGD()
                case _:
                    raise NameError(name=name)
    
        @override
        def __repr__(self):
            return self.get_name() + str(self.__dict__)
    
    
    class Cours(Activity):
        """Activity whose description is the page's `STY_texteCours` element."""

        @override
        def parse_html(self, root: HtmlElement):
            """Parse the common activity data, then the course text."""
            super().parse_html(root)
            # => Description: HTML of the course text, without surrounding whitespace
            course_markup = to_html(root.get_element_by_id("STY_texteCours"))
            self.description = course_markup.strip()
    
    
    class Exercice(Activity):
        """Activity whose description is the page's `STY_question` element."""

        @override
        def parse_html(self, root: HtmlElement):
            """Parse the common activity data, then the question text."""
            super().parse_html(root)
            # => Description: HTML of the question, without surrounding whitespace
            question_markup = to_html(root.get_element_by_id("STY_question"))
            self.description = question_markup.strip()
    
    
    @dataclass
    class Choice:
        """A possible answer for a question, correct or not"""
    
        def __init__(
            self,
            id: str = "",
            index: int = -1,
            is_correct: bool = False,
            html: str = "",
            comment: Comment | None = None,
        ):
            self.id = id
            """A string identifier for the choice"""
            self.index = index
            """The order the choice appears in"""
            self.is_correct = is_correct
            self.html = html
            self.comment = comment
            """A `Comment` associated with this choice, displayed when the exercise
            is incorrect and this choice is selected"""
    
        @override
        def __str__(self) -> str:
            return f"Choice(id='{self.id}', index={self.index}, is_correct={self.is_correct}, html='{self.html[0::10]}')"
    
    
    class ChoiceGroup:
        """A label together with a list of `Choice` items.

        The constructor only declares the attribute types: neither `label` nor
        `items` exists on a new instance until a value is assigned to it.
        """

        def __init__(self):
            # Annotation-only statements, nothing is stored on the instance here.
            self.items: list[Choice]
            self.label: str
    
    
    class ExerciceQC(Exercice):
        """Exercise made of a set of choices, in its QCM or QCU variant.

        The choices are kept in `choices`, keyed by choice ID. They are created
        on demand by whichever parsing stage mentions them first (see
        `get_or_create_choice()`), so that the JS stage and the HTML stage can
        each fill in their part of the same `Choice` object.
        """

        def __init__(self, is_qcm: bool = False) -> None:
            super().__init__()
            self.is_qcm = is_qcm
            self.choices: dict[str, Choice] = {}

        @override
        def get_name(self) -> str:
            """Return the type name of the variant selected by `is_qcm`."""
            return "ExerciceQC_QCM" if self.is_qcm else "ExerciceQC_QCU"

        @override
        def parse_html(self, root: HtmlElement):
            """Parse the common exercise data, then the choices and their comments."""
            super().parse_html(root)
            # Find question choices
            for index, choice_node in enumerate(root.find_class("STY_reponseQC")):
                if Context.version == "macao_12":
                    # Choices have an 'id' attribute in the form 'lienRepX'
                    # where X is their index (starting at 1)
                    choice_id = choice_node.attrib["id"].replace("lienRep", "")
                else:
                    # Choices have an 'id' attribute in the form 'lienrepX' (lowercase)
                    # where X is a number. The actual ID we're keeping is 'repX'.
                    choice_id = choice_node.attrib["id"].replace("lien", "")
                choice = self.get_or_create_choice(choice_id)
                choice.index = index
                choice.html = to_html(choice_node).strip()

            # The activity's comments have already been extracted in
            # Activity.parse_html(), but some of them may be associated with a
            # specific choice (this is detected by the JS parser earlier, which
            # leaves a placeholder `Comment` holding only the comment ID).
            # Move these comments from the activity to their choice object.
            for choice in self.choices.values():
                if choice.comment is None:
                    continue
                comment_id = choice.comment.id
                try:
                    choice.comment = self.comments_sugg.pop(comment_id)
                except KeyError:
                    log.warning(
                        f"Choice '{choice.id}' requested comment '{comment_id}', which was not found in HTML."
                    )
                    # Drop the ID-only placeholder: it has no `html` attribute,
                    # so keeping it would make save() fail with AttributeError.
                    choice.comment = None

        @override
        def save(self, graph: Graph):
            """Save the common activity data, then one `Reponse` node per choice."""
            super().save(graph)
            for choice in self.choices.values():
                rdf_name = f"{self.id}_{choice.index}_{choice.id}"  # ex: pg173_0_rep21
                display_name = rdf_name + " | " + ("V" if choice.is_correct else "F")
                choice_node = NS[rdf_name]
                graph.add((choice_node, RDF.type, NS["Reponse"]))
                graph.add((choice_node, NS["id"], Literal(choice.id)))
                graph.add((choice_node, NS["index"], Literal(choice.index)))
                graph.add((choice_node, NS["correct"], Literal(choice.is_correct)))
                graph.add((choice_node, NS["html"], Literal(choice.html)))
                # Save optional comment
                if choice.comment is not None:
                    graph.add(
                        (choice_node, NS["commentaireSugg"], Literal(choice.comment.html))
                    )
                graph.add(
                    (
                        choice_node,
                        NS["__protege_display_name"],
                        Literal(display_name),
                    )
                )
                graph.add((NS[self.id], NS["aReponse"], choice_node))
                # Our fake "class hierarchy" just for easier visualization
                graph.add((choice_node, RDFS.subClassOf, NS[self.id]))

        def set_correct(self, choice_id: str, correct: bool):
            """Set the choice with ID `choice_id` as correct or not, creating it if needed."""
            self.get_or_create_choice(choice_id).is_correct = correct

        def set_html(self, choice_id: str, html: str):
            """Set the `html` attribute for the choice with ID `choice_id`, creating it if needed."""
            self.get_or_create_choice(choice_id).html = html

        def get_or_create_choice(self, id: str) -> Choice:
            """Returns the choice with the `id`, creating it if needed."""
            if id not in self.choices:
                self.choices[id] = Choice(id)
            return self.choices[id]
    
    
    class ExerciceQM(Exercice):
        """Exercise type `ExerciceQM`, with no type-specific parsing or saving.

        `questions` is declared by the constructor but not assigned.
        """

        def __init__(self):
            # Annotation only: the attribute appears once a value is assigned.
            self.questions: list[ChoiceGroup]
            super().__init__()
    
    
    class ExerciceTAT(Exercice):
        """Exercise type `ExerciceTAT`, with no type-specific parsing or saving.

        `text` and `gaps` are declared by the constructor but not assigned.
        """

        def __init__(self):
            # Annotations only: the attributes appear once values are assigned.
            self.gaps: list[ChoiceGroup]
            self.text: str  # can be HTML
            super().__init__()
    
    
    class ExerciceGD(Exercice):
        """Exercise type `ExerciceGD`, with no type-specific parsing or saving.

        `targets` and `draggables` are declared by the constructor but not
        assigned.
        """

        def __init__(self):
            # Annotations only: the attributes appear once values are assigned.
            self.draggables: list[list[Choice]]
            self.targets: list[str]
            super().__init__()
    
    
    class JSParser:
        @abstractmethod
        def parse(self, js: str) -> Activity:
            """Parse a string of JavaScript code and returns an instance of the
            correct `Activity` subclass, partially populated with data found in the code.
            """
            pass
    
        @override
        def __str__(self) -> str:
            return type(self).__name__
    
    
    class RegexParser(JSParser):
        """`JSParser` implementation that extracts activity data from the
        JavaScript code with regular expressions.

        Every failure to recognise the code is reported as a `ParseError`.
        """

        # Activity constructor call: `<var> = new <Type>(<args>);`
        _CONSTRUCTOR_REGEX = re.compile(
            r"""
            (\w+)               # result variable name
            \s+=\s+new\s+       # assignment of a "new" expression
            (Cours|Exercice\w+) # constructor name
            \((.*?)\);          # optional arguments between parentheses
            """,
            re.VERBOSE,
        )

        # "macao_12" format: line giving the number of the current answer
        _MACAO12_INDEX_REGEX = re.compile(r"var nr = (\d+);")

        # Answer declaration followed by its `init()` call
        # ( tinker with this regex: https://regex101.com/r/qAkdDD/2 )
        _ANSWERS_REGEX = re.compile(
            r"""
            var[ ](?P<varname>\w+)      # Capture variable name, referenced in 2nd line
            [ ]=[ ]new[ ]ItemReponse\(
            '(?P<id>\w+)'               # Constructor parameter : answer ID (obfuscated)
            \);\n\s*                    # New line and any indent
            (?P=varname)                # Back-reference to the variable name captured earlier
            \.init\(
            \"\d*?(?P<correct>\d)\"     # First parameter of "init" : correctness
                                        # (capture last digit only)
            (?:,\s*\"\w*\"){3}\);       # Skip 3 params""",
            re.VERBOSE,
        )

        # Comment ID declaration followed by the call attaching it to a choice
        # ( tinker with this regex: https://regex101.com/r/qEzZ5R/1 )
        _COMMENTS_REGEX = re.compile(
            r"""
            var[ ](?P<varname>\w+)      # Capture variable name, referenced in 2nd line
            [ ]=[ ]
            '(?P<comment_id>\w+)'       # Constructor param : comment ID
            ;\n\s*                      # New line and any indent
            EXO_ajouterCommentaire\(
            (?P=varname)                # Back-reference to the variable name captured earlier
            (?:,\s*\"\w*\"){6}          # Skip 6 parameters
            ,[ ]\"(?P<choice_id>\w+)\"  # 8th parameter : choice ID
            (?:,\s*\"\w*\"){10}         # Skip 10 parameters
            \);""",
            re.VERBOSE,
        )

        def __init__(self, graph: Graph, act_id: str) -> None:
            self.graph = graph
            # Only used to identify the activity in log messages
            self.act_id = act_id

        @override
        def parse(self, js: str) -> Activity:
            """Parse the body of the `entrerDonnees()` function found in `js`.

            Returns an instance of the `Activity` subclass named by the
            constructor call. For an `ExerciceQC`, the choices found in the code
            are filled in as well.

            Raises `ParseError` when the function, the activity constructor or
            (for an `ExerciceQC`) the total score cannot be found, or when the
            constructor names an unknown activity type.
            """
            # Find function declaration and only keep code after it
            func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
            if len(func_split) < 2:
                raise ParseError("Failed to find function 'entrerDonnees'")
            body = func_split[1]

            activity, _ = self._parse_activity_constructor(body)
            if isinstance(activity, ExerciceQC):
                # Parse correct answers
                self._parse_qc_answers(body, activity)

            return activity

        def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
            """
            Find activity constructor call, return the activity (created from
            the constructor name) and resulting variable name.

            Raises `ParseError` when there is no constructor call, or when it
            names a type that `Activity.from_typename()` does not know.
            """
            constructor_match = self._CONSTRUCTOR_REGEX.search(code)
            if constructor_match is None:
                raise ParseError("Failed to parse activity constructor")

            var_name, act_type, args = constructor_match.groups()
            try:
                activity = Activity.from_typename(act_type)
            except NameError as e:
                # The regex accepts any "Exercice..." name. Report an unknown
                # one as a parsing failure, so that the caller's ParseError
                # handling applies instead of the NameError escaping it.
                raise ParseError(f"Unknown activity type '{act_type}'") from e
            # Handle case of QC variants
            if isinstance(activity, ExerciceQC) and args == '"QCM"':
                activity.is_qcm = True
            return activity, var_name

        def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
            """Parse the choices of an `ExerciceQC` activity (QCU or QCM):
            which ones are correct and, except in the "macao_12" format,
            which comment is associated with which choice."""
            if Context.version == "macao_12":
                self._parse_macao12_answers(code, exo)
            else:
                self._parse_obfuscated_answers(code, exo)
                self._parse_choice_comments(code, exo)

        def _parse_macao12_answers(self, code: str, exo: ExerciceQC) -> None:
            """Parse the correctness of each choice in the "macao_12" format."""
            choice_id = "0"
            for line in code.splitlines():
                line = line.strip()
                m = self._MACAO12_INDEX_REGEX.match(line)
                if m is not None:
                    # "index" line: the following lines are about this choice
                    choice_id = m.group(1)
                elif line == "exo.tabStylesR[nr] = CODE_F;":
                    # "incorrect answer" line
                    exo.set_correct(choice_id, False)
                elif line == "exo.tabStylesR[nr] = CODE_V;":
                    # "correct answer" line
                    exo.set_correct(choice_id, True)

        def _parse_obfuscated_answers(self, code: str, exo: ExerciceQC) -> None:
            """Parse the choice IDs and their correctness, undoing the obfuscation."""
            answers = list(self._ANSWERS_REGEX.finditer(code))
            # Yet another layer of obfuscation: correct/incorrect are inverted
            # depending on the total score and number of answers
            # (see ClasseExerciceQC.js:86)
            score = self._parse_score(code)
            is_inverted = ((len(answers) + score) % 2) == 1
            for match in answers:
                # Answer ID is obfuscated by changing some digits
                choice_id = decode_answer_id(match.group("id"))
                choice = exo.get_or_create_choice(choice_id)
                choice.is_correct = (match.group("correct") == "1") != is_inverted

        def _parse_choice_comments(self, code: str, exo: ExerciceQC) -> None:
            """Parse the choice-comment associations and record them on the choices."""
            for match in self._COMMENTS_REGEX.finditer(code):
                # NOTE(review): unlike answer IDs, this choice ID is used as
                # found, without decode_answer_id() -- presumably it is not
                # obfuscated here; confirm against the JS sources.
                choice_id = match.group("choice_id")
                comment_id = match.group("comment_id")
                choice = exo.choices.get(choice_id)
                if choice is None:
                    log.warning(
                        f"{self.act_id}: '{comment_id}' requested choice ID '{choice_id}', which doesn't exist"
                    )
                    continue
                # Save a Comment object with just the ID, other fields will be
                # filled at the HTML parsing stage
                choice.comment = Comment(comment_id)

        def _parse_score(self, code: str) -> int:
            """Parse the activity's 'total score' variable.

            Raises `ParseError` when the score assignment is missing or its
            value cannot be converted to an integer.
            """
            m = re.search(r"exo\.scoreTotal ?= ?(\d+);", code)
            if m is None:
                raise ParseError("Failed to parse total score for this activity")
            try:
                return int(m.group(1))
            except ValueError as e:
                raise ParseError("Failed to parse total score for this activity") from e
    
    
    # Digit substitution undone by `decode_answer_id`, as a `str.translate` table:
    # 3->0, 8->1, 7->2, 9->3, 1->7, 0->8, 2->9. Every other character
    # (including the digits 4, 5 and 6) is absent from the table and kept as is.
    _ANSWER_ID_DECODING = str.maketrans("3879102", "0123789")


    def decode_answer_id(id: str) -> str:
        """
        Decode an obfuscated answer ID, just like the `decodeX()` function
        in `ClasseExerciceQC.js`.

        Each obfuscated digit is replaced by the digit it stands for; all other
        characters are returned unchanged. An empty string decodes to itself.
        """
        return id.translate(_ANSWER_ID_DECODING)
    
    
    # Regex splitting a string into a leading run of non-digits (group 1)
    # followed by a run of digits (group 2). Both groups may be empty, so
    # `match()` succeeds on any string; characters after the digits are ignored.
    regex_comment = re.compile(r"(\D*)(\d*)")
    
    
    def parse_page(graph: Graph, filepath: str, id: str):
        """Extract the activity of the page at `filepath` and save it to `graph`.

        `id` is the page ID given to the resulting activity. When the JS code
        cannot be parsed, the error is logged and the page is processed as a
        generic `Activity`.
        """
        # Activity data is split between the HTML and the inline JS code. The
        # activity type (Cours, ExerciceQC...) is only found in the JS code and
        # decides which type-specific data is read from the HTML, so the JS is
        # parsed first -- but the inline scripts have to be located in the HTML
        # tree before that.
        document_root = html.parse(filepath).getroot()
        # Inline scripts only (no external 'src'), joined in one block of JS code
        inline_scripts: list[HtmlElement] = document_root.xpath(
            '/html/head/script[@type="text/javascript" and not(@src)]'
        )
        js_code = "\n".join([script.text_content() for script in inline_scripts])

        parser = RegexParser(graph, id)
        try:
            activity: Activity = parser.parse(js_code)
        except ParseError as e:
            log.error(
                f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity."
            )
            activity = Activity()

        activity.id = id
        # HTML portion, then everything goes to the graph
        activity.parse_html(document_root)
        activity.save(graph)