Skip to content
Snippets Groups Projects
Select Git revision
  • 229bc46aaa94e7cf7bf45b341da4387b5bc27d1d
  • main default protected
  • export
  • 28-conversion-tests
  • extraction
  • exploration
  • exploration-old
  • 2-encoding-fix
  • main-old
9 results

extract_page.py

Blame
  • extract_page.py 18.91 KiB
    import re
    from abc import ABC, abstractmethod
    from dataclasses import dataclass, field

    from lxml import html
    from lxml.html import HtmlElement
    from rdflib import RDF, Graph, Literal
    from typing_extensions import override

    from common import *
    
    # Module-level logger used by every class and function below.
    # NOTE(review): `get_logger` is not defined in this file; presumably it is
    # provided by the `from common import *` star import - confirm.
    log = get_logger("extract_page")
    
    
    class Base:
        """Convenience base class giving its subclasses a generic `repr`."""

        @override
        def __repr__(self) -> str:
            # Class name immediately followed by the instance's attribute dict,
            # e.g. `RegexParser{'act_id': 'pg12'}`
            return f"{type(self).__name__}{self.__dict__}"
    
    
    @dataclass
    class Comment:
        """A comment block belonging to an activity page."""

        id: str
        """Identifier of the comment; unique within the activity that owns it"""
        num: int = -1
        """Position (order) of the comment inside its activity; `-1` by default"""
        html: str = ""
        """Raw HTML markup of the comment"""
        text: str = ""
        """Plain-text rendering of the comment, all formatting removed"""
    
    
    @dataclass
    class Activity:
        id: str = ""
        """The ID of the page this activity is in (`pg###`)"""
        title: str = ""
        """Human-readable title of the activity"""
        description: str | None = None
        """Description of the activity's body (HTML),
        e.g. the instructions for an exercise activity"""
        comment_consigne: Comment | None = None
        """Another form of activity description but in a comment. May or may not
        coexist with a regular description"""
        comment_success: Comment | None = None
        """Comment displayed on success, if applicable"""
        comments_sugg: dict[str, Comment] = field(default_factory=dict)
        """Help comments displayed on failure, if applicable (keyed by ID)"""
        comments_misc: list[Comment] = field(default_factory=list)
        """Any other comments, if present"""
    
        def save(self, graph: Graph):
            """Save activity data to the graph. Subclasses may override this method
            to save their specific data."""
            ref: URIRef = NS[self.id]
            # => Type
            graph.add((ref, RDF.type, NS[self.get_name()]))
            # => Title
            set_title(graph, ref, self.title)
            # => Description
            description = self.description or ""
            if self.comment_consigne is not None:
                description += self.comment_consigne.html
            if description != "":
                graph.add((ref, NS["description"], Literal(description)))
            # => Comments
            if self.comment_success is not None:
                graph.add(
                    (ref, NS["commentaireSucces"], Literal(self.comment_success.html))
                )
            for comment in self.comments_sugg.values():
                graph.add((ref, NS["commentaireSugg"], Literal(comment.html)))
            for comment in self.comments_misc:
                graph.add((ref, NS["commentaireInfo"], Literal(comment.html)))
    
        def parse_html(self, root: HtmlElement):
            """From a `lxml.html` parsing tree, extract all data relevant to this class.
            Subclasses may override this method to extract more specific data.
            """
            # => Title
            self.title = root.xpath("/html/head/title")[0].text
            # => Comments
            zi = root.get_element_by_id("zoneInvisible")
            for cmt_div in zi:
                comment = Comment(cmt_div.get("id") or "")
                comment.text = cmt_div.text_content()
                comment.html = to_html(cmt_div)
                # Split id in two parts (non-digits and digits), then match on these parts
                m = regex_comment.match(comment.id)
                if m is not None:
                    match m.groups():
                        case ["divCmt", num]:
                            comment.num = int(num)
                            self.comments_misc.append(comment)
                        case ["divSugg", num]:
                            comment.num = int(num)
                            self.comments_sugg[comment.id] = comment
                        case ["divCmtSucces", _]:
                            self.comment_success = comment
                        case ["divConsigne", _]:
                            self.comment_consigne = comment
                        case alpha, num:
                            log.warning(
                                f"{self.id}: No match for comment {alpha}[{num}] ('{comment.id}')"
                            )
                        case something:
                            log.warning(f"{self.id}: No match for comment '{something}'")
    
        def get_name(self) -> str:
            return type(self).__name__
    
        @classmethod
        def from_typename(cls, name: str):
            """Convenience function to create an `Activity` subclass from a name"""
            match name:
                case "Cours":
                    return Cours()
                case "ExerciceQC":
                    return ExerciceQC()
                case "ExerciceQM":
                    return ExerciceQM()
                case "ExerciceTAT":
                    return ExerciceTAT()
                case "ExerciceGD":
                    return ExerciceGD()
                case _:
                    raise NameError(name=name)
    
    
    class Cours(Activity):
        """Course ("cours") activity: its description is the course text itself."""

        @override
        def parse_html(self, root: HtmlElement):
            """Extract the common activity data, then the course text."""
            super().parse_html(root)
            # => Description: trimmed HTML of the `STY_texteCours` element
            self.description = to_html(root.get_element_by_id("STY_texteCours")).strip()
    
    
    class Exercice(Activity):
        """Base class of exercise activities: its description is the question."""

        @override
        def parse_html(self, root: HtmlElement):
            """Extract the common activity data, then the question text."""
            super().parse_html(root)
            # => Description: trimmed HTML of the `STY_question` element
            self.description = to_html(root.get_element_by_id("STY_question")).strip()
    
    
    @dataclass
    class Choice:
        """One possible answer to a question, whether it is correct or not"""

        id: str = ""
        """String identifier of the choice"""
        index: int = -1
        """Order in which the choice appears; `-1` by default"""
        is_correct: bool = False
        """Whether this choice is a correct answer"""
        html: str = ""
        """The choice as HTML"""
        comment: Comment | None = None
        """Optional `Comment` tied to this choice, shown when the exercise is
        answered incorrectly with this choice selected"""
    
    
    @dataclass
    class ChoiceGroup:
        """A labelled list of choices"""

        label: str
        """Label of the group"""
        items: list[Choice] = field(default_factory=list)
        """The choices of the group (a fresh empty list by default)"""
    
    
    @dataclass
    class Gap:
        """A blank to fill in, inside a gap-fill text exercise"""

        id: str
        """Identifier of the gap"""
        choices: list[Choice] = field(default_factory=list)
        """Choices attached to the gap (a fresh empty list by default)"""
    
    
    @dataclass
    class ExerciceQC(Exercice):
        """Choice-question exercise, in its QCM or QCU variant.

        Choices are created lazily (see `get_or_create_choice`): both the JS
        parsing stage and the HTML parsing stage contribute data to the same
        `Choice` objects, matched by ID.
        """

        is_qcm: bool = False
        """`True` for the QCM variant, `False` for the QCU one (see `get_name`)"""
        choices: dict[str, Choice] = field(default_factory=dict)
        """The possible answers, keyed by choice ID"""

        @override
        def get_name(self) -> str:
            """Return the variant-specific type name, used as RDF type."""
            return "ExerciceQC_QCM" if self.is_qcm else "ExerciceQC_QCU"

        @override
        def parse_html(self, root: HtmlElement):
            """Extract the exercise data, then the HTML and order of each choice,
            and finally attach choice-specific comments to their choice.
            """
            super().parse_html(root)
            # Find question choices
            for index, choice_node in enumerate(root.find_class("STY_reponseQC")):
                if Context.version == "macao_12":
                    # Choices have an 'id' attribute in the form 'lienRepX'
                    # where X is their index (starting at 1)
                    choice_id = choice_node.attrib["id"].replace("lienRep", "")
                else:
                    # Choices have an 'id' attribute in the form 'lienrepX' (lowercase)
                    # where X is a number. The actual ID we're keeping is 'repX'.
                    choice_id = choice_node.attrib["id"].replace("lien", "")
                choice = self.get_or_create_choice(choice_id)
                choice.index = index
                choice.html = to_html(choice_node).strip()

            # The activity's comments have already been extracted in Activity.parse_html(),
            # but some of them may be associated with a specific choice (this is
            # detected by the JS parser earlier).
            # Move these comments from the activity to their choice object.
            for choice in self.choices.values():
                if choice.comment is None:
                    continue
                try:
                    choice.comment = self.comments_sugg.pop(choice.comment.id)
                except KeyError:
                    log.warning(
                        f"{self.id}: Choice '{choice.id}' requested comment '{choice.comment.id}', which was not found in HTML."
                    )
                    # Drop the ID-only placeholder: keeping it would make save()
                    # write an empty 'commentaireSugg' literal for this choice
                    choice.comment = None

        @override
        def save(self, graph: Graph):
            """Save the exercise, then one `Reponse` node per choice, linked to the
            exercise with `aReponse`."""
            super().save(graph)
            for choice in self.choices.values():
                rdf_name = f"{self.id}_{choice.id}"  # ex: pg157_2, pg173_rep21
                display_name = rdf_name + " | " + ("V" if choice.is_correct else "F")
                choice_node = NS[rdf_name]
                graph.add((choice_node, RDF.type, NS["Reponse"]))
                graph.add((choice_node, NS["id"], Literal(choice.id)))
                graph.add((choice_node, NS["index"], Literal(choice.index)))
                graph.add((choice_node, NS["correct"], Literal(choice.is_correct)))
                graph.add((choice_node, NS["html"], Literal(choice.html)))
                # Save optional comment
                if choice.comment is not None:
                    graph.add(
                        (choice_node, NS["commentaireSugg"], Literal(choice.comment.html))
                    )
                graph.add(
                    (
                        choice_node,
                        NS["__protege_display_name"],
                        Literal(display_name),
                    )
                )
                graph.add((NS[self.id], NS["aReponse"], choice_node))
                # Our fake "class hierarchy" just for easier visualization
                graph.add((choice_node, RDFS.subClassOf, NS[self.id]))

        def set_correct(self, choice_id: str, correct: bool):
            """Set the choice with ID `choice_id` as correct or not, creating it if needed."""
            self.get_or_create_choice(choice_id).is_correct = correct

        def set_html(self, choice_id: str, html: str):
            """Set the `html` attribute for the choice with ID `choice_id`, creating it if needed."""
            self.get_or_create_choice(choice_id).html = html

        def get_or_create_choice(self, id: str) -> Choice:
            """Returns the choice with the `id`, creating it if needed."""
            if id not in self.choices:
                self.choices[id] = Choice(id)
            return self.choices[id]
    
    
    @dataclass
    class ExerciceQM(Exercice):
        """Exercise made of several groups of choices (data holder only)"""

        questions: list[ChoiceGroup] = field(default_factory=list)
        """The choice groups of the exercise (a fresh empty list by default)"""
    
    
    @dataclass
    class ExerciceTAT(Exercice):
        """Gap-fill text exercise: a text interleaved with gaps to fill in"""

        segments: list[str | Gap] = field(default_factory=list)
        """Ordered pieces of the exercise text, each either plain text or a `Gap`"""

        @override
        def parse_html(self, root: HtmlElement):
            """Extract the exercise data, then cut the text container into segments.

            Raises `ParseError` when no element has the `STY_texteTAT` class.
            Only direct children of the container are recognised as gaps; a
            warning is logged when the container holds more gap elements than
            were found that way.
            """
            super().parse_html(root)
            # Locate the element holding the exercise text
            try:
                container = root.find_class("STY_texteTAT")[0]
            except IndexError as e:
                raise ParseError("ExerciceTAT: text container not found") from e

            # Text accumulated since the last gap (or since the container start)
            pending_text = container.text or ""
            for child in container:
                is_gap = child.tag == "select" and "STY_selectTAT" in child.classes
                if not is_gap:
                    # Regular markup: part of the current text segment
                    pending_text += to_html(child)
                    continue
                # A gap ends the current text segment...
                self.segments.append(pending_text)
                # ...is stored itself...
                self.segments.append(Gap(child.attrib["id"].replace("champTrou", "")))
                # ...and the text right after it opens the next segment
                pending_text = child.tail or ""

            # Whatever text is left after the last gap
            self.segments.append(pending_text)

            # Sanity check: every gap element of the container should have been found
            nb_total_gaps = len(container.find_class("STY_selectTAT"))
            nb_found_gaps = sum(1 for seg in self.segments if isinstance(seg, Gap))
            if nb_found_gaps != nb_total_gaps:
                log.warning(
                    f"{self.id}: Text has {nb_total_gaps} gaps in total, but found {nb_found_gaps} gap elements, some might be missing"
                )
    
    @dataclass
    class ExerciceGD(Exercice):
        """Exercise with targets and draggable choices (data holder only).

        NOTE(review): presumably a drag-and-drop exercise - confirm.
        """

        targets: list[str] = field(default_factory=list)
        """The targets (a fresh empty list by default)"""
        draggables: list[list[Choice]] = field(default_factory=list)
        """Lists of draggable choices (a fresh empty list by default)"""
    
    
    class JSParser(Base, ABC):
        """Interface of the parsers that extract activity data from JavaScript code.

        Inherits from `ABC` so that `@abstractmethod` is actually enforced:
        a subclass must implement `parse` to be instantiable.
        """

        @abstractmethod
        def parse(self, js: str) -> Activity:
            """Parse a string of JavaScript code and returns an instance of the
            correct `Activity` subclass, partially populated with data found in the code.
            """

        @override
        def __str__(self) -> str:
            # The parser's class name, convenient in log messages
            return type(self).__name__
    
    
    class RegexParser(JSParser):
        """`JSParser` implementation relying on regular expressions.

        It only looks at the body of the `entrerDonnees` function, where it finds
        the activity constructor call and, for `ExerciceQC` activities, the
        choices (correctness and associated comments).
        """

        def __init__(self, act_id: str) -> None:
            # ID of the activity (page) this parser is used for
            self.act_id = act_id

        @override
        def parse(self, js: str) -> Activity:
            """Parse `js` and return the matching, partially populated activity.

            Raises `ParseError` if the `entrerDonnees` function, the activity
            constructor or (for non-`macao_12` QC exercises) the total score
            cannot be found, or if the activity type is unknown.
            """
            # Find function declaration and only keep code after it
            func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
            if len(func_split) < 2:
                raise ParseError("Failed to find function 'entrerDonnees'")
            body = func_split[1]

            activity, _ = self._parse_activity_constructor(body)
            if isinstance(activity, ExerciceQC):
                # Parse correct answers
                self._parse_qc_answers(body, activity)

            return activity

        def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
            """
            Find activity constructor call, return the activity (an instance of
            the matching `Activity` subclass) and resulting variable name.

            Raises `ParseError` if no constructor call is found or if the
            constructor name is not a known activity type.
            """
            constructor_match = re.search(
                r"""
                    (\w+)               # result variable name
                    \s+=\s+new\s+       #
                    (Cours|Exercice\w+) # constructor name
                    \((.*?)\);          # optional arguments between parentheses
                        """,
                code,
                re.VERBOSE,
            )
            if constructor_match is None:
                raise ParseError("Failed to parse activity constructor")

            var_name, act_type, args = constructor_match.groups()
            try:
                activity = Activity.from_typename(act_type)
            except NameError as e:
                # The regex accepts any 'Exercice...' name: report unknown ones as
                # a parsing error, so that callers handling ParseError can
                # recover instead of crashing on an uncaught NameError
                raise ParseError(f"Unknown activity type '{act_type}'") from e
            # Handle case of QC variants
            if isinstance(activity, ExerciceQC) and args == '"QCM"':
                activity.is_qcm = True
            return activity, var_name

        def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
            """Parse the choices of a QC activity (QCU or QCM): their correctness
            and, outside of `macao_12`, the comment associated with each choice.

            Raises `ParseError` if the total score is needed and cannot be parsed.
            """
            if Context.version == "macao_12":
                index_regex = re.compile(r"var nr = (\d+);")
                choice_id = "0"
                for line in code.splitlines():
                    line = line.strip()
                    m = index_regex.match(line)
                    if m is not None:
                        # "index" line
                        choice_id = m.group(1)
                    elif line == "exo.tabStylesR[nr] = CODE_F;":
                        # "incorrect answer" line
                        exo.set_correct(choice_id, False)
                    elif line == "exo.tabStylesR[nr] = CODE_V;":
                        # "correct answer" line
                        exo.set_correct(choice_id, True)
            else:

                # Parse choices IDs and correctness

                # ( tinker with this regex: https://regex101.com/r/qAkdDD/2 )
                answers_regex = re.compile(
                    r"""
                    var[ ](?P<varname>\w+)      # Capture variable name, referenced in 2nd line
                    [ ]=[ ]new[ ]ItemReponse\(
                    '(?P<id>\w+)'               # Constructor parameter : answer ID (obfuscated)
                    \);\n\s*                    # New line and any indent
                    (?P=varname)                # Back-reference to the variable name captured earlier
                    \.init\(
                    \"\d*?(?P<correct>\d)\"     # First parameter of "init" : correctness
                                                # (capture last digit only)
                    (?:,\s*\"\w*\"){3}\);       # Skip 3 params""",
                    re.VERBOSE,
                )
                answers = list(answers_regex.finditer(code))
                # Yet another layer of obfuscation: correct/incorrect are inverted
                # depending on the total score and number of answers
                # (see ClasseExerciceQC.js:86)
                score = self._parse_score(code)
                is_inverted = ((len(answers) + score) % 2) == 1
                for match in answers:
                    # Answer ID is obfuscated by changing some digits
                    choice_id = decode_answer_id(match.group("id"))
                    choice = exo.get_or_create_choice(choice_id)
                    choice.is_correct = (match.group("correct") == "1") != is_inverted

                # Parse choice-comment associations

                # ( tinker with this regex: https://regex101.com/r/qEzZ5R/1 )
                comments_regex = re.compile(
                    r"""
                    var[ ](?P<varname>\w+)      # Capture variable name, referenced in 2nd line
                    [ ]=[ ]
                    '(?P<comment_id>\w+)'       # Constructor param : comment ID
                    ;\n\s*                      # New line and any indent
                    EXO_ajouterCommentaire\(
                    (?P=varname)                # Back-reference to the variable name captured earlier
                    (?:,\s*\"\w*\"){6}          # Skip 6 parameters
                    ,[ ]\"(?P<choice_id>\w+)\"  # 8th parameter : choice ID
                    (?:,\s*\"\w*\"){10}         # Skip 10 parameters
                    \);""",
                    re.VERBOSE,
                )
                for match in comments_regex.finditer(code):
                    choice = exo.get_or_create_choice(match.group("choice_id"))
                    # Save a Comment object with just the ID, other fields will be
                    # filled at the HTML parsing stage
                    choice.comment = Comment(match.group("comment_id"))

        def _parse_score(self, code: str) -> int:
            """Parse the activity's 'total score' variable.

            Raises `ParseError` if the `exo.scoreTotal` assignment is not found
            or if its value cannot be converted to an integer.
            """
            m = re.search(r"exo\.scoreTotal ?= ?(\d+);", code)
            if m is None:
                raise ParseError("Failed to parse total score for this activity")
            try:
                return int(m.group(1))
            except ValueError as e:
                raise ParseError("Failed to parse total score for this activity") from e
    
    
    def decode_answer_id(id: str):
        """
        Decode an obfuscated answer ID, just like the `decodeX()` function
        in `ClasseExerciceQC.js`.

        Each character is replaced through a fixed substitution table; characters
        absent from the table (the digits 4, 5, 6 and anything that is not a digit)
        are kept unchanged.
        """
        # obfuscated character -> decoded character
        substitutions = {
            "3": "0",
            "8": "1",
            "7": "2",
            "9": "3",
            "1": "7",
            "0": "8",
            "2": "9",
        }
        return "".join(substitutions.get(c, c) for c in id)
    
    
    # Regex splitting a comment element ID into its leading non-digit part and
    # the digits that follow, e.g. "divCmt12" -> ("divCmt", "12").
    # Both groups may be empty, so `match()` succeeds on any string.
    regex_comment = re.compile(r"(\D*)(\d*)")
    
    
    def parse_page(graph: Graph, filepath: str, id: str):
        """Parse the activity page stored at `filepath` and save it to `graph`.

        `id` is the identifier given to the resulting activity. When the JS
        portion of the page cannot be parsed, the error is logged and the page
        is processed as a generic `Activity`.
        """
        # An activity is described partly in HTML and partly in inline JS. Its
        # type (Cours, ExerciceQC...) is only known from the JS, and decides which
        # type-specific data is read from the HTML: so the JS goes first, then
        # the HTML.

        # The inline scripts live in the HTML document, which has to be loaded
        # before they can be parsed
        root = html.parse(filepath).getroot()
        # Inline scripts only (no external 'src'), concatenated in a single
        # block of JS code
        scripts: list[HtmlElement] = root.xpath(
            '/html/head/script[@type="text/javascript" and not(@src)]'
        )
        js = "\n".join(script.text_content() for script in scripts)

        parser = RegexParser(id)
        try:
            activity = parser.parse(js)
        except ParseError as e:
            log.error(
                f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity."
            )
            # Fall back to a plain activity, holding the common data only
            activity = Activity()

        activity.id = id
        # HTML portion
        activity.parse_html(root)
        # Everything goes to the graph
        activity.save(graph)