Parse & decode TAT choices for macao3

90495a98 · Eliott Sammier · 229bc46a · 90495a98
Commit 90495a98 authored Aug 2, 2024 by Eliott Sammier
--- a/tetras_extraction/script/src/extract_page.py
+++ b/tetras_extraction/script/src/extract_page.py
@@ -84,12 +84,13 @@ class Activity:
        self.title = root.xpath("/html/head/title")[0].text
        # => Comments
        zi = root.get_element_by_id("zoneInvisible")
+        # Build one Comment per child element of the invisible zone
        for cmt_div in zi:
            comment = Comment(cmt_div.get("id") or "")
            comment.text = cmt_div.text_content()
            comment.html = to_html(cmt_div)
            # Split id in two parts (non-digits and digits), then match on these parts
-            m = regex_comment.match(comment.id)
+            m = re.match(r"(\D*)(\d*)", comment.id)
            if m is not None:
                match m.groups():
                    case ["divCmt", num]:
@@ -268,6 +269,8 @@ class ExerciceQM(Exercice):
 class ExerciceTAT(Exercice):
    segments: list[str | Gap] = field(default_factory=list)
    """The segments (text or gap) that make up the exercise text, in order"""
+    gaps: dict[str, Gap] = field(default_factory=dict)
+    """Only the gaps, keyed by ID, useful during parsing"""

    @override
    def parse_html(self, root: HtmlElement):
@@ -287,7 +290,7 @@ class ExerciceTAT(Exercice):
                self.segments.append(text_segment_buf)
                # Add the gap
                gap_id = elem.attrib["id"].replace("champTrou", "")
-                self.segments.append(Gap(gap_id))
+                self.segments.append(self.get_or_create_gap(gap_id))
                # New text segment starts with the tail text of this element
                text_segment_buf = elem.tail or ""
            else:
@@ -303,6 +306,13 @@ class ExerciceTAT(Exercice):
            )
        pass

+    def get_or_create_gap(self, gap_id: str) -> Gap:
+        """
+        Return the gap registered under `gap_id` in `self.gaps`.
+
+        If no gap is registered under that ID yet, a new `Gap(gap_id)` is
+        stored first, so repeated calls with the same ID yield the same object.
+        """
+        gap = self.gaps.get(gap_id)
+        if gap is not None:
+            return gap
+        # First time this ID is seen: register a fresh gap
+        gap = Gap(gap_id)
+        self.gaps[gap_id] = gap
+        return gap
+
+
 @dataclass
 class ExerciceGD(Exercice):
    targets: list[str] = field(default_factory=list)
@@ -335,9 +345,14 @@ class RegexParser(JSParser):
        body = func_split[1]

        activity, _ = self._parse_activity_constructor(body)
-        if isinstance(activity, ExerciceQC):
+        match activity:
+            case ExerciceQC():
                # Parse correct answers
                self._parse_qc_answers(body, activity)
+            case ExerciceTAT():
+                self._parse_tat_choices(body, activity)
+            case _:
+                pass

        return activity

@@ -448,6 +463,42 @@ class RegexParser(JSParser):
        except ValueError as e:
            raise exception from e

+    def _parse_tat_choices(self, code: str, exo: ExerciceTAT) -> None:
+        """
+        Parse the answer choices of a TAT exercise from its JS code.
+
+        Every `exo.ajouterReponse('<choice_id>', '<gap_id>', '<correct_code>',
+        "<text>");` call found in `code` becomes a `Choice` appended to the
+        `choices` of the matching gap of `exo` (the gap is created if `exo`
+        does not know it yet). The correctness flag and the text of each
+        choice are de-obfuscated on the way.
+
+        :param code: JS source code to scan for `ajouterReponse` calls
+        :param exo: the exercise whose gaps receive the parsed choices
+        """
+        # NOTE(review): `.+` is greedy, so several `ajouterReponse` calls on a
+        # single line would be captured as one match -- confirm that the JS
+        # always puts one call per line.
+        choices_regex = re.compile(
+            r"""
+            exo\.ajouterReponse\(
+            '(?P<choice_id>\w+)'
+            ,\s'(?P<gap_id>\d+)'
+            ,\s'(?P<correct_code>\d+)'
+            ,\s\"(?P<text>.+)\"
+            \);""",
+            re.VERBOSE,
+        )
+        # Scan the code once and reuse the matches for both passes below
+        choices = list(choices_regex.finditer(code))
+        # Correctness obfuscation
+        # Each choice is correct if correct_code == 2*gap_num + (nb_gaps + score) % 2
+        # (see the wiki for more info)
+        # nb_gaps is taken as the highest gap ID seen (0 when nothing matched)
+        nb_gaps = max((int(found.group("gap_id")) for found in choices), default=0)
+        score = self._parse_score(code)
+        correction_offset = (nb_gaps + score) % 2
+
+        # Process matches
+        for found in choices:
+            gap = exo.get_or_create_gap(found.group("gap_id"))
+            choice = Choice(found.group("choice_id"))
+            correct_code = int(found.group("correct_code"))
+            choice.is_correct = (2 * int(gap.id) + correction_offset) == correct_code
+            # Decode obfuscated text
+            choice.html = decode_answer_text(found.group("text"))
+            # Add choice
+            gap.choices.append(choice)
+

 def decode_answer_id(id: str):
    """
@@ -476,8 +527,20 @@ def decode_answer_id(id: str):
    return res


-# Regex to separate non-digits and digits
-regex_comment = re.compile(r"(\D*)(\d*)")
+def decode_answer_text(text: str) -> str:
+    """
+    Reverse the obfuscation applied to an answer text, just like the
+    `decodeX()` function in `ClasseExerciceTAT.js`.
+
+    Decoding takes two steps: the last two characters are moved back to the
+    front, then every character goes through a fixed 1-to-1 substitution
+    (characters outside the substitution alphabet are kept unchanged).
+    """
+    obfuscated = "bHOi4ph5sWlr1c2nI7LBuzgaUNv0FDXtm8SodePVqRfwGKkJMxAQjTC"
+    clear = "ABCDFGHJKLNOPQTUVWXabcdfghjklnopqtuvwx0124578ierRImMsSz"
+    # Move the 2-char suffix to the front; for texts of 2 chars or fewer the
+    # two slices recombine into the original text, so no length guard is needed
+    rotated = text[-2:] + text[:-2]
+    return rotated.translate(str.maketrans(obfuscated, clear))


 def parse_page(graph: Graph, filepath: str, id: str):