# extract_page.py

import re
from abc import ABC, abstractmethod
from dataclasses import dataclass, field

from lxml import html
from lxml.html import HtmlElement
from rdflib import RDF, Graph, Literal
from typing_extensions import override

from common import *

# Initialise the module-level logger used for all warnings/errors below.
# NOTE(review): `get_logger` is not defined in this file; presumably it is
# provided by the `common` star-import above.
log = get_logger("extract_page")


class Base:
    """Minimal mixin providing a generic, debug-friendly `repr()`."""

    @override
    def __repr__(self) -> str:
        """Return the class name immediately followed by the instance `__dict__`."""
        return f"{type(self).__name__}{self.__dict__}"


@dataclass
class Comment:
    """A comment block attached to an activity (or to one of its choices).

    In this file, instances are built from the child elements of the page's
    `zoneInvisible` element (see `Activity.parse_html`), or as ID-only
    placeholders by `RegexParser` before the HTML has been parsed.
    """

    id: str
    """The comment's identifier, unique in its parent activity"""
    num: int = -1
    """The comment's index in the activity, its order"""
    html: str = ""
    """The comment as raw HTML"""
    text: str = ""
    """The comment as plain text, with formatting stripped """


@dataclass
class Activity:
    id: str = ""
    """The ID of the page this activity is in (`pg###`)"""
    title: str = ""
    """Human-readable title of the activity"""
    description: str | None = None
    """Description of the activity's body (HTML),
    e.g. the instructions for an exercise activity"""
    comment_consigne: Comment | None = None
    """Another form of activity description but in a comment. May or may not
    coexist with a regular description"""
    comment_success: Comment | None = None
    """Comment displayed on success, if applicable"""
    comments_sugg: dict[str, Comment] = field(default_factory=dict)
    """Help comments displayed on failure, if applicable (keyed by ID)"""
    comments_misc: list[Comment] = field(default_factory=list)
    """Any other comments, if present"""

    def save(self, graph: Graph):
        """Save activity data to the graph. Subclasses may override this method
        to save their specific data."""
        ref: URIRef = NS[self.id]
        # => Type
        graph.add((ref, RDF.type, NS[self.get_name()]))
        # => Title
        set_title(graph, ref, self.title)
        # => Description
        description = self.description or ""
        if self.comment_consigne is not None:
            description += self.comment_consigne.html
        if description != "":
            graph.add((ref, NS["description"], Literal(description)))
        # => Comments
        if self.comment_success is not None:
            graph.add(
                (ref, NS["commentaireSucces"], Literal(self.comment_success.html))
            )
        for comment in self.comments_sugg.values():
            graph.add((ref, NS["commentaireSugg"], Literal(comment.html)))
        for comment in self.comments_misc:
            graph.add((ref, NS["commentaireInfo"], Literal(comment.html)))

    def parse_html(self, root: HtmlElement):
        """From a `lxml.html` parsing tree, extract all data relevant to this class.
        Subclasses may override this method to extract more specific data.
        """
        # => Title
        self.title = root.xpath("/html/head/title")[0].text
        # => Comments
        zi = root.get_element_by_id("zoneInvisible")
        for cmt_div in zi:
            comment = Comment(cmt_div.get("id") or "")
            comment.text = cmt_div.text_content()
            comment.html = to_html(cmt_div)
            # Split id in two parts (non-digits and digits), then match on these parts
            m = regex_comment.match(comment.id)
            if m is not None:
                match m.groups():
                    case ["divCmt", num]:
                        comment.num = int(num)
                        self.comments_misc.append(comment)
                    case ["divSugg", num]:
                        comment.num = int(num)
                        self.comments_sugg[comment.id] = comment
                    case ["divCmtSucces", _]:
                        self.comment_success = comment
                    case ["divConsigne", _]:
                        self.comment_consigne = comment
                    case alpha, num:
                        log.warning(
                            f"{self.id}: No match for comment {alpha}[{num}] ('{comment.id}')"
                        )
                    case something:
                        log.warning(f"{self.id}: No match for comment '{something}'")

    def get_name(self) -> str:
        return type(self).__name__

    @classmethod
    def from_typename(cls, name: str):
        """Convenience function to create an `Activity` subclass from a name"""
        match name:
            case "Cours":
                return Cours()
            case "ExerciceQC":
                return ExerciceQC()
            case "ExerciceQM":
                return ExerciceQM()
            case "ExerciceTAT":
                return ExerciceTAT()
            case "ExerciceGD":
                return ExerciceGD()
            case _:
                raise NameError(name=name)


class Cours(Activity):
    """Course activity: its description is the content of `STY_texteCours`."""

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common activity data, then the course text as HTML."""
        super().parse_html(root)
        # => Description
        course_html = to_html(root.get_element_by_id("STY_texteCours"))
        self.description = course_html.strip()


class Exercice(Activity):
    """Exercise activity: its description is the content of `STY_question`."""

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the common activity data, then the question as HTML."""
        super().parse_html(root)
        # => Description
        question_html = to_html(root.get_element_by_id("STY_question"))
        self.description = question_html.strip()


@dataclass
class Choice:
    """A possible answer for a question, correct or not"""

    id: str = ""
    """A string identifier for the choice"""
    index: int = -1
    """The order the choice appears in"""
    # Whether selecting this choice is a correct answer (False until set)
    is_correct: bool = False
    # The choice's content as HTML (empty until set)
    html: str = ""
    comment: Comment | None = None
    """A `Comment` associated with this choice, displayed when the exercise
    is incorrect and this choice is selected"""


@dataclass
class ChoiceGroup:
    """A labelled group of choices.

    Used in this file as the element type of `ExerciceQM.questions`.
    """

    # Label of the group (required)
    label: str
    # The choices belonging to this group (empty by default)
    items: list[Choice] = field(default_factory=list)


@dataclass
class Gap:
    """A gap in a gap-fill text exercise"""

    # Identifier of the gap; `ExerciceTAT.parse_html` derives it from the
    # element's `id` attribute with the "champTrou" part removed
    id: str
    # Possible choices for this gap (empty by default)
    # NOTE(review): nothing visible in this file populates this list — confirm
    choices: list[Choice] = field(default_factory=list)


@dataclass
class ExerciceQC(Exercice):
    """Choice-question exercise (`ExerciceQC_QCM` or `ExerciceQC_QCU` variant).

    Choices are keyed by their ID. They may be created by the JS parsing
    stage (correctness, associated comment) before `parse_html()` fills in
    their index and HTML content.
    """

    is_qcm: bool = False
    """Selects the `ExerciceQC_QCM` variant when true, `ExerciceQC_QCU` otherwise"""
    choices: dict[str, Choice] = field(default_factory=dict)
    """The choices of this exercise, keyed by choice ID"""

    @override
    def get_name(self) -> str:
        """Return the RDF type name matching the exercise variant."""
        return "ExerciceQC_QCM" if self.is_qcm else "ExerciceQC_QCU"

    @override
    def parse_html(self, root: HtmlElement):
        """Extract the choices from the HTML tree and attach their comments.

        Args:
            root: Root element of the parsed page.

        Raises:
            KeyError: If a `STY_reponseQC` element has no `id` attribute.
        """
        super().parse_html(root)
        # Find question choices
        for index, choice_node in enumerate(root.find_class("STY_reponseQC")):
            if Context.version == "macao_12":
                # Choices have an 'id' attribute in the form 'lienRepX'
                # where X is their index (starting at 1)
                choice_id = choice_node.attrib["id"].replace("lienRep", "")
            else:
                # Choices have an 'id' attribute in the form 'lienrepX' (lowercase)
                # where X is a number. The actual ID we're keeping is 'repX'.
                choice_id = choice_node.attrib["id"].replace("lien", "")
            choice = self.get_or_create_choice(choice_id)
            choice.index = index
            choice.html = to_html(choice_node).strip()

        # The activity's comments have already been extracted in Activity.parse_html(),
        # but some of them may be associated with a specific choice (this is
        # detected by the JS parser earlier).
        # Move these comments from the activity to their choice object.
        # Comments are only removed from the activity once every choice has
        # been resolved, so that a comment shared by several choices is
        # found for each of them.
        claimed_ids: set[str] = set()
        for choice in self.choices.values():
            placeholder = choice.comment
            if placeholder is None:
                continue
            found = self.comments_sugg.get(placeholder.id)
            if found is None:
                log.warning(
                    f"{self.id}: Choice '{choice.id}' requested comment '{placeholder.id}', which was not found in HTML."
                )
                # Drop the ID-only placeholder, otherwise `save()` would
                # write an empty comment for this choice
                choice.comment = None
            else:
                choice.comment = found
                claimed_ids.add(placeholder.id)
        for comment_id in claimed_ids:
            del self.comments_sugg[comment_id]

    @override
    def save(self, graph: Graph):
        """Save the activity, then one `Reponse` node per choice.

        Args:
            graph: The RDF graph the triples are added to.
        """
        super().save(graph)
        for choice in self.choices.values():
            rdf_name = f"{self.id}_{choice.id}"  # ex: pg157_2, pg173_rep21
            display_name = rdf_name + " | " + ("V" if choice.is_correct else "F")
            choice_node = NS[rdf_name]
            graph.add((choice_node, RDF.type, NS["Reponse"]))
            graph.add((choice_node, NS["id"], Literal(choice.id)))
            graph.add((choice_node, NS["index"], Literal(choice.index)))
            graph.add((choice_node, NS["correct"], Literal(choice.is_correct)))
            graph.add((choice_node, NS["html"], Literal(choice.html)))
            # Save optional comment
            if choice.comment is not None:
                graph.add(
                    (choice_node, NS["commentaireSugg"], Literal(choice.comment.html))
                )
            graph.add(
                (
                    choice_node,
                    NS["__protege_display_name"],
                    Literal(display_name),
                )
            )
            graph.add((NS[self.id], NS["aReponse"], choice_node))
            # Our fake "class hierarchy" just for easier visualization
            graph.add((choice_node, RDFS.subClassOf, NS[self.id]))

    def set_correct(self, choice_id: str, correct: bool):
        """Set the choice with ID `choice_id` as correct or not, creating it if needed."""
        self.get_or_create_choice(choice_id).is_correct = correct

    def set_html(self, choice_id: str, html: str):
        """Set the `html` attribute for the choice with ID `choice_id`, creating it if needed."""
        self.get_or_create_choice(choice_id).html = html

    def get_or_create_choice(self, id: str) -> Choice:
        """Returns the choice with the `id`, creating it if needed."""
        if id not in self.choices:
            self.choices[id] = Choice(id)
        return self.choices[id]


@dataclass
class ExerciceQM(Exercice):
    """Exercise made of a list of choice groups.

    This class only declares its data; parsing and saving are inherited
    unchanged from `Exercice`.
    """

    # The groups of choices of this exercise (empty by default)
    questions: list[ChoiceGroup] = field(default_factory=list)


@dataclass
class ExerciceTAT(Exercice):
    """Gap-fill text exercise, stored as an ordered list of text/gap segments."""

    segments: list[str | Gap] = field(default_factory=list)
    """Ordered pieces of the exercise text: plain HTML strings and `Gap` objects"""

    @override
    def parse_html(self, root: HtmlElement):
        """Split the content of the `STY_texteTAT` container into segments.

        Only direct children of the container are inspected: a
        `select.STY_selectTAT` child becomes a `Gap`, everything else is
        accumulated as HTML text. A warning is logged when the container
        holds more `STY_selectTAT` elements than gaps were found.

        Raises:
            ParseError: If the page has no `STY_texteTAT` element.
        """
        super().parse_html(root)
        # Locate the element holding the exercise text
        try:
            container = root.find_class("STY_texteTAT")[0]
        except IndexError as e:
            raise ParseError("ExerciceTAT: text container not found") from e

        # Text met since the last gap (or since the start of the container)
        pending_text = container.text or ""
        for child in container:
            is_gap = child.tag == "select" and "STY_selectTAT" in child.classes
            if not is_gap:
                pending_text += to_html(child)
                continue
            # A gap ends the current text segment...
            self.segments.append(pending_text)
            # ...is stored itself...
            self.segments.append(Gap(child.attrib["id"].replace("champTrou", "")))
            # ...and the text following it opens the next segment
            pending_text = child.tail or ""

        self.segments.append(pending_text)

        # Sanity check: compare with every gap element below the container
        expected_count = len(container.find_class("STY_selectTAT"))
        found_count = sum(isinstance(segment, Gap) for segment in self.segments)
        if found_count != expected_count:
            log.warning(
                f"{self.id}: Text has {expected_count} gaps in total, but found {found_count} gap elements, some might be missing"
            )

@dataclass
class ExerciceGD(Exercice):
    """Exercise declaring a list of targets and lists of draggable choices.

    This class only declares its data; parsing and saving are inherited
    unchanged from `Exercice`.
    """

    # The targets, as strings (empty by default)
    targets: list[str] = field(default_factory=list)
    # Lists of draggable choices (empty by default)
    # NOTE(review): presumably one inner list per target — confirm
    draggables: list[list[Choice]] = field(default_factory=list)


class JSParser(Base, ABC):
    """Abstract interface of the parsers extracting activity data from JS code.

    Inherits from `ABC` so that `@abstractmethod` is actually enforced:
    only subclasses implementing `parse()` can be instantiated.
    """

    @abstractmethod
    def parse(self, js: str) -> Activity:
        """Parse a string of JavaScript code and returns an instance of the
        correct `Activity` subclass, partially populated with data found in the code.

        Args:
            js: The JavaScript source code to parse.
        """

    @override
    def __str__(self) -> str:
        """Return the parser's class name (used in log messages)."""
        return type(self).__name__


class RegexParser(JSParser):
    """`JSParser` implementation based on regular expressions.

    Works on the body of the `entrerDonnees()` JS function: finds the
    activity constructor call, then (for `ExerciceQC` activities) the
    correctness of each choice and the choice-comment associations.
    """

    # Choices IDs and correctness
    # ( tinker with this regex: https://regex101.com/r/qAkdDD/2 )
    _ANSWERS_REGEX = re.compile(
        r"""
        var[ ](?P<varname>\w+)      # Capture variable name, referenced in 2nd line
        [ ]=[ ]new[ ]ItemReponse\(
        '(?P<id>\w+)'               # Constructor parameter : answer ID (obfuscated)
        \);\n\s*                    # New line and any indent
        (?P=varname)                # Back-reference to the variable name captured earlier
        \.init\(
        \"\d*?(?P<correct>\d)\"     # First parameter of "init" : correctness
                                    # (capture last digit only)
        (?:,\s*\"\w*\"){3}\);       # Skip 3 params""",
        re.VERBOSE,
    )

    # Choice-comment associations
    # ( tinker with this regex: https://regex101.com/r/qEzZ5R/1 )
    _COMMENTS_REGEX = re.compile(
        r"""
        var[ ](?P<varname>\w+)      # Capture variable name, referenced in 2nd line
        [ ]=[ ]
        '(?P<comment_id>\w+)'       # Constructor param : comment ID
        ;\n\s*                      # New line and any indent
        EXO_ajouterCommentaire\(
        (?P=varname)                # Back-reference to the variable name captured earlier
        (?:,\s*\"\w*\"){6}          # Skip 6 parameters
        ,[ ]\"(?P<choice_id>\w+)\"  # 8th parameter : choice ID
        (?:,\s*\"\w*\"){10}         # Skip 10 parameters
        \);""",
        re.VERBOSE,
    )

    def __init__(self, act_id: str) -> None:
        """Create a parser for the activity whose ID is `act_id`."""
        self.act_id = act_id

    @override
    def parse(self, js: str) -> Activity:
        """Parse JS code into a partially populated `Activity` subclass.

        Args:
            js: JavaScript code containing the `entrerDonnees()` function.

        Raises:
            ParseError: If the function, the activity constructor or (for
                some QC activities) the total score cannot be found, or if
                the activity type is unknown.
        """
        # Find function declaration and only keep code after it
        func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
        if len(func_split) < 2:
            raise ParseError("Failed to find function 'entrerDonnees'")
        body = func_split[1]

        activity, _ = self._parse_activity_constructor(body)
        if isinstance(activity, ExerciceQC):
            # Parse correct answers
            self._parse_qc_answers(body, activity)

        return activity

    def _parse_activity_constructor(self, code: str) -> tuple[Activity, str]:
        """
        Find activity constructor call, return the created activity
        and the name of the JS variable it is assigned to.

        Raises:
            ParseError: If no constructor call is found, or if it names an
                activity type unknown to `Activity.from_typename`.
        """
        constructor_match = re.search(
            r"""
                (\w+)               # result variable name
                \s+=\s+new\s+       #
                (Cours|Exercice\w+) # constructor name
                \((.*?)\);          # optional arguments between parentheses
                    """,
            code,
            re.VERBOSE,
        )
        if constructor_match is None:
            raise ParseError("Failed to parse activity constructor")

        var_name, act_type, args = constructor_match.groups()
        try:
            activity = Activity.from_typename(act_type)
        except NameError as e:
            # The regex accepts any 'Exercice...' name: report unknown types
            # as a parsing failure, like every other failure of this parser
            raise ParseError(f"Unknown activity type '{act_type}'") from e
        # Handle case of QC variants
        if isinstance(activity, ExerciceQC) and args == '"QCM"':
            activity.is_qcm = True
        return activity, var_name

    def _parse_qc_answers(self, code: str, exo: ExerciceQC) -> None:
        """Parse the correctness of each choice of a QC activity and, for
        versions other than `macao_12`, the choice-comment associations.

        Args:
            code: Body of the `entrerDonnees()` JS function.
            exo: The activity to populate (choices are created as needed).

        Raises:
            ParseError: If the total score cannot be parsed (versions other
                than `macao_12` only).
        """
        if Context.version == "macao_12":
            choice_id = "0"
            for line in code.splitlines():
                line = line.strip()
                m = re.match(r"var nr = (\d+);", line)
                if m is not None:
                    # "index" line
                    choice_id = m.group(1)
                elif line == "exo.tabStylesR[nr] = CODE_F;":
                    # "incorrect answer" line
                    exo.set_correct(choice_id, False)
                elif line == "exo.tabStylesR[nr] = CODE_V;":
                    # "correct answer" line
                    exo.set_correct(choice_id, True)
            return

        # Parse choices IDs and correctness
        answers = list(self._ANSWERS_REGEX.finditer(code))
        # Yet another layer of obfuscation: correct/incorrect are inverted
        # depending on the total score and number of answers
        # (see ClasseExerciceQC.js:86)
        score = self._parse_score(code)
        is_inverted = ((len(answers) + score) % 2) == 1
        for match in answers:
            # Answer ID is obfuscated by changing some digits
            choice_id = decode_answer_id(match.group("id"))
            choice = exo.get_or_create_choice(choice_id)
            choice.is_correct = (match.group("correct") == "1") != is_inverted

        # Parse choice-comment associations
        for match in self._COMMENTS_REGEX.finditer(code):
            choice = exo.get_or_create_choice(match.group("choice_id"))
            # Save a Comment object with just the ID, other fields will be
            # filled at the HTML parsing stage
            choice.comment = Comment(match.group("comment_id"))

    def _parse_score(self, code: str) -> int:
        """Parse the activity's 'total score' variable

        Raises:
            ParseError: If the score assignment is not found or its value
                cannot be converted to an integer.
        """
        m = re.search(r"exo\.scoreTotal ?= ?(\d+);", code)
        if m is None:
            raise ParseError("Failed to parse total score for this activity")
        try:
            return int(m.group(1))
        except ValueError as e:
            raise ParseError("Failed to parse total score for this activity") from e


# Digit substitution table: 3->0, 8->1, 7->2, 9->3, 1->7, 0->8, 2->9
_ANSWER_ID_DECODING = str.maketrans("3879102", "0123789")


def decode_answer_id(id: str) -> str:
    """
    Decode an obfuscated answer ID, just like the `decodeX()` function
    in `ClasseExerciceQC.js`.

    Some digits are swapped for others (see the table above); every other
    character (including the digits 4, 5 and 6) is kept as is.
    """
    return id.translate(_ANSWER_ID_DECODING)


# Regex to separate non-digits and digits:
# group 1 is the leading run of non-digit characters, group 2 the run of
# digits right after it. Both groups use `*`, so `match()` always succeeds
# and either group may be the empty string.
regex_comment = re.compile(r"(\D*)(\d*)")


def parse_page(graph: Graph, filepath: str, id: str):
    """Extract the activity of one HTML page and save it to the graph.

    Activity data is split between the page's HTML and its inline JS code.
    The activity type (Cours, ExerciceQC...) is only found in the JS, so the
    JS is parsed first to create the right `Activity` subclass, which then
    extracts the rest of its data from the HTML.

    Args:
        graph: The RDF graph the activity is saved to.
        filepath: Path of the HTML file to parse.
        id: The ID given to the activity.
    """
    root = html.parse(filepath).getroot()

    # Gather the inline scripts (those without an external 'src') of the
    # page's head into one block of JS code
    inline_scripts: list[HtmlElement] = root.xpath(
        '/html/head/script[@type="text/javascript" and not(@src)]'
    )
    js = "\n".join(script.text_content() for script in inline_scripts)

    # JS portion: gives the activity type and part of its data
    parser = RegexParser(id)
    try:
        activity: Activity = parser.parse(js)
    except ParseError as e:
        log.error(
            f"{parser} -> {id}: Parsing error: {e}. Treating this as a generic Activity."
        )
        activity = Activity()

    activity.id = id
    # HTML portion: the rest of the (type-specific) data
    activity.parse_html(root)
    # Write the result to the graph
    activity.save(graph)