Skip to content
Snippets Groups Projects
Select Git revision
  • bd345caa4aa936e740be48896fff9809ec0b6148
  • annotation-on-video default protected
  • demo_ci
  • 3-upstream-01022023
  • master
  • gh3538-captions
  • 16-adapt-for-images-annot
  • 15-api-for-annotations-on-video
  • 15-annotations-on-videos
  • video_for_annotations
  • wip-1-annotations-on-videos
  • 9-videoviewer-tests
  • 9_wip_videotests
  • 6-fix-tests-and-ci
  • _fix_ci
  • wip-webpack-from-git
16 results

AnnotationFactory.test.js

Blame
  • extract_page.py 12.93 KiB
    import re
    import sys
    from pprint import pprint
    from typing import Any, List, Optional
    
    import esprima as es
    from lxml import etree, html
    from lxml.etree import _Element
    from rdflib import RDF, Graph, Literal
    
    from common import *
    
    
    class Comment:
        """Plain attribute holder describing one comment element of a page.

        Attributes are not initialised by the class; they are expected to be
        assigned by the code that builds the comment.
        """

        id: str  # identifier of the comment
        num: int  # numeric part of the identifier, when there is one
        text: str  # textual content of the comment
        html: Any  # serialized markup of the comment
        elem: _Element  # element the comment was built from

        def __repr__(self):
            # Show whichever attributes have been assigned so far
            return repr(vars(self))
    
    
    class Page:
        """Plain attribute holder for a page and the comments attached to it.

        Only the two comment lists are initialised on construction; the other
        attributes are expected to be assigned later.
        """

        id: str
        title: str
        type: str  # course or exercise
        comment_success: Comment
        comments_sugg: List[Comment]
        comments_misc: List[Comment]

        def __init__(self) -> None:
            # Each instance gets its own, initially empty, lists
            self.comments_sugg, self.comments_misc = [], []

        def __repr__(self):
            # Show whichever attributes have been assigned so far
            return repr(vars(self))
    
    
    class ParseError(Exception):
        """Error raised when the JS portion of an activity cannot be parsed."""
    
    
    class RegexParser:
        """A parser for the JS portion of an activity, that applies regular
        expressions directly to the source text (no syntax tree is built)."""

        # Line that sets the index of the choice being described
        _index_line = re.compile(r"var nr = (\d+);")
        # Accepted spellings of the `ExerciceQC` variant argument. JavaScript string
        # literals may use either quote style, so both are recognised.
        _qc_variants = {
            '"QCU"': "QCU",
            "'QCU'": "QCU",
            '"QCM"': "QCM",
            "'QCM'": "QCM",
        }

        def parse(self, js, output=None):
            """Write the activity type found in `js` (and, for a QCU exercise,
            its list of correct answers) to `output`.

            :param js: JavaScript source code of the activity
            :param output: File-like object to write to. `None` (the default) means
                the standard output, looked up when printing rather than when this
                class is defined, so that redirections of `sys.stdout` are honoured.
            :raises ParseError: If the function `entrerDonnees` or the activity
                constructor call cannot be found
            """
            # Find function declaration and only keep code after it
            func_split = re.split(r"\s*?function entrerDonnees\(\s*?\)\s*?{", js)
            if len(func_split) < 2:
                raise ParseError("Failed to find function 'entrerDonnees'")
            body = func_split[1]

            activity_type, _ = self._parse_activity_constructor(body)
            print(activity_type, end="", file=output)
            if activity_type == "ExerciceQC_QCU":
                print(" ", self._parse_qcu_answers(body), end="", file=output)

        def _parse_activity_constructor(self, code: str) -> tuple[str, str]:
            """
            Find activity constructor call, return the activity type
            and resulting variable name.

            For `ExerciceQC`, the type is suffixed with `_QCU` or `_QCM` when the
            constructor arguments are exactly that string literal (single or double
            quoted, surrounding whitespace ignored).

            :raises ParseError: If no constructor call is found
            """
            constructor_match = re.search(
                r"""
                    (\w+)               # result variable name
                    \s+=\s+new\s+       # 
                    (Cours|Exercice\w+) # constructor name
                    \((.*?)\);          # optional arguments between parentheses
                        """,
                code,
                re.VERBOSE,
            )
            if constructor_match is None:
                raise ParseError("Failed to parse activity constructor")
            var_name, act_type, args = constructor_match.groups()
            # Handle case of QC variants
            if act_type == "ExerciceQC":
                variant = self._qc_variants.get(args.strip())
                if variant is not None:
                    act_type += "_" + variant
            return act_type, var_name

        def _parse_qcu_answers(self, code: str) -> list[bool]:
            """Parse the correct answers for a QCU activity, as a list of booleans.

            The code is scanned line by line: a `var nr = N;` line selects the
            current index, and the following `exo.tabStylesR[nr] = CODE_F;` or
            `exo.tabStylesR[nr] = CODE_V;` line records an incorrect or correct
            answer at that index.
            """
            correct_choices = []
            index = 0
            for line in code.splitlines():
                line = line.strip()
                m = self._index_line.match(line)
                if m is not None:
                    # "index" line
                    index = int(m.group(1))
                elif line == "exo.tabStylesR[nr] = CODE_F;":
                    # "incorrect answer" line
                    insert_grow(correct_choices, index, False, fill_value=False)
                elif line == "exo.tabStylesR[nr] = CODE_V;":
                    # "correct answer" line
                    insert_grow(correct_choices, index, True, fill_value=False)
            return correct_choices

        def __str__(self) -> str:
            return "RegexParser"
    
    
    class XpathParser:
        """A parser for the JS portion of an activity, that uses XPath to query
        an XML representation of Esprima's abstract syntax tree (AST)"""

        # XPath requests pre-compiled as functions
        request_function = etree.XPath(
            '//FunctionDeclaration[id/Identifier[@name="entrerDonnees"]]'
        )
        request_index_and_values = etree.XPath(
            '*//VariableDeclarator[id//*[@name="nr"]]/init/Literal | *//AssignmentExpression[*//Identifier[@name="CODE_V"]]'
        )
        request_constructor_id = etree.XPath(
            '*//NewExpression/callee/Identifier[@name="Cours" or starts-with(@name, "Exercice")]'
        )

        def __init__(self):
            pass

        def parse(self, js, output=None):
            """Write the activity type found in `js` (and, for a QCU exercise,
            its list of correct answers) to `output`.

            :param js: JavaScript source code of the activity
            :param output: File-like object to write to. `None` (the default) means
                the standard output, looked up when printing rather than when this
                class is defined, so that redirections of `sys.stdout` are honoured.
            :raises ParseError: If the function `entrerDonnees` or the activity
                constructor is not found, or if anything else fails while querying
                the tree. Errors raised by `es.parseScript` itself are not wrapped.
            """
            jstree = es.parseScript(js, None)
            # Convert Esprima object tree to XML etree
            xml = self.to_xml(jstree.toDict(), "jstree")
            try:
                functions = self.request_function(xml)
                if not functions:
                    raise ParseError("Failed to find function 'entrerDonnees'")
                self.fun = functions[0]
                act_type = self._parse_activity_type()
                print(act_type, end="", file=output)
                if act_type == "ExerciceQC_QCU":
                    print(" ", self._parse_qcu_answers(), end="", file=output)
            except ParseError:
                # Already the right type: do not wrap it a second time
                raise
            except Exception as e:
                raise ParseError(e) from e

        def _parse_activity_type(self) -> str:
            """Return the name of the activity constructor called in `self.fun`.

            For `ExerciceQC`, the first literal argument (`QCU` or `QCM`) is
            appended to the name, separated by an underscore.

            :raises ParseError: If no constructor call is found, or if the
                `ExerciceQC` argument is missing or not one of the known variants
            """
            constructors = self.request_constructor_id(self.fun)
            if not constructors:
                raise ParseError("Failed to parse activity constructor")
            constructor_id = constructors[0]
            name = constructor_id.get("name")
            if name != "ExerciceQC":
                return name
            # From the Identifier, go up to the NewExpression then down to its arguments
            literals = constructor_id.xpath("../../arguments/Literal/@value")
            if not literals:
                raise ParseError("ExerciceQC: missing literal argument")
            arg = literals[0]
            if arg not in ("QCU", "QCM"):
                raise ParseError(f"ExerciceQC: invalid argument '{arg}'")
            return f"ExerciceQC_{arg}"

        def _parse_qcu_answers(self) -> list[bool]:
            """Parse the correct answers for a QCU activity, as a list of booleans.

            The XPath request yields two kinds of nodes: literals initialising
            the `nr` variable (which carry a `value` attribute and select the
            current index), and assignments involving `CODE_V` (which mark the
            current index as a correct answer).
            """
            indexes_and_values = self.request_index_and_values(self.fun)
            correct_choices = []
            index = 0
            for e in indexes_and_values:
                value = e.xpath("@value")
                if value:
                    # "index line"
                    index = int(value[0])
                else:
                    # "true line"
                    insert_grow(correct_choices, index, True, fill_value=False)
            return correct_choices

        def to_xml(self, obj, tag_name: Optional[str] = None):
            """Recursively convert an object structure to an XML `ElementTree`.
            Structures are expected to be Python dictionaries.
            Converting a dictionary produces a tag named after the "type" attribute (if present).
            - A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
            - A list attribute becomes a tag with its contents as sub-tags.
            - A dictionary attribute becomes a tag (named like the attribute's key)
            containing a sub-tag for the dictionary itself

            :param obj: The structure to convert
            :param tag_name: Name of the enclosing tag, if any
            :return: The root element of the converted structure
            """
            if isinstance(obj, dict):
                # Dictionary (or object):
                # - if it has a "type" key, the dict represents an object -> use its value as the tag name
                # - if a tag_name is specified as well, it's probably important (like an attribute name),
                # so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
                if "type" in obj:
                    inner_tag = etree.Element(obj["type"], None, None)
                    if tag_name is not None:
                        outer_tag = etree.Element(tag_name)
                        outer_tag.append(inner_tag)
                    else:
                        outer_tag = inner_tag
                elif tag_name is not None:
                    # Untyped dictionary: a single tag, named after tag_name
                    inner_tag = outer_tag = etree.Element(tag_name)
                else:
                    inner_tag = outer_tag = etree.Element("_dict", None, None)
                # Recurse on dictionary items
                for key, val in obj.items():
                    if key == "type":  # exception for 'type', already used as tag name
                        continue
                    if isinstance(val, (list, dict)):
                        # Structured attributes become child tags
                        inner_tag.append(self.to_xml(val, key))
                    else:
                        # Primitive attributes become tag attributes
                        inner_tag.set(key, str(val))
                return outer_tag

            elif isinstance(obj, list):
                tag_name = tag_name or "_list"
                list_tag = etree.Element(tag_name)
                for e in obj:
                    list_tag.append(self.to_xml(e))
                return list_tag

            else:
                tag_name = tag_name or "_literal"
                leaf_tag = etree.Element(tag_name)
                leaf_tag.text = str(obj)
                return leaf_tag

        def __str__(self) -> str:
            return "XpathParser"
    
    
    class MatchParser:
        """A parser for the JS portion of an activity, that uses Python match statements
        to navigate the abstract syntax tree (AST) produced by Esprima"""
    
        def __init__(self, graph: Graph, act_id: str) -> None:
            self.graph = graph
            self.act_id = act_id
    
        def parse(self, js, output=sys.stdout):
            self.output = output
            jstree = es.parseScript(js, None)
            # Try to match our template with one of the top-level statements
            for statement in jstree.body:
                self.match_function(statement.toDict())
    
        def match_constructor_call(self, new_expr: dict):
            match new_expr:
                case {
                    "type": "NewExpression",
                    "callee": {
                        "type": "Identifier",
                        "name": typ,
                    },
                    "arguments": [*args],
                }:
                    match typ:
                        case "Cours" | "ExerciceQM" | "ExerciceTAT" | "ExerciceGD":
                            self.print(typ)
                            self.graph.add((NS[self.act_id], RDF.type, NS[typ]))
                        case "ExerciceQC":
                            match args:
                                case [{"type": "Literal", "value": "QCU"}, *_]:
                                    typ += "_QCU"
                                case [{"type": "Literal", "value": "QCM"}, *_]:
                                    typ += "_QCM"
                                case _:
                                    raise ParseError(
                                        f"ExerciceQC: Invalid argument '{args}'"
                                    )
                            self.print(typ)
                            self.graph.add((NS[self.act_id], RDF.type, NS[typ]))
                        case _:
                            raise ParseError(f"Unknown activity type '{typ}'")
                case _:
                    pass
    
        def print(self, s: str):
            print(s, end="", file=self.output)
    
        def match_function(self, func: dict):
            """Checks if `func` matches a function declaration named `entrerDonnees`,
            and search its body if successful
            """
    
            match func:
                case {
                    "type": "FunctionDeclaration",
                    "id": {"name": "entrerDonnees"},
                    "body": {"type": "BlockStatement", "body": body},
                }:
                    # Matched a function declaration and captured its `body` attr
                    for statement in body:
                        # Find constructor calls (e.g. `new Thing()`) recursively
                        recurse_prefix(statement, self.match_constructor_call)
    
        def __str__(self) -> str:
            return "MatchParser"
    
    
    def recurse_prefix(t, f):
        """Visit an object tree depth-first, parent before children: `f` is
        applied to `t`, then to each of its items (for a list) or each of its
        values (for a dictionary), recursively. Other objects have no children.

        :param t: The object
        :param f: The function to call
        """
        f(t)
        if isinstance(t, dict):
            children = t.values()
        elif isinstance(t, list):
            children = t
        else:
            # Leaf: nothing to descend into
            return
        for child in children:
            recurse_prefix(child, f)
    
    
    # Regex to separate non-digits and digits: group 1 is the leading run of
    # non-digit characters, group 2 the run of digits that follows it.
    # Both groups may be empty, so matching from the start of a string always succeeds.
    regex_comment = re.compile(r"(\D*)(\d*)")
    
    
    def parse_page(graph: Graph, filepath: str, id: str):
        """Parse the HTML page of an activity: run every JS parser on its inline
        scripts, then record its comments in the graph.

        :param graph: Graph receiving the triples extracted from the page
        :param filepath: Path of the HTML file to parse
        :param id: Identifier of the activity, used as subject of the triples
        """
        page = Page()
        # Parse with lxml
        root = html.parse(filepath).getroot()

        # Gather the inline scripts of the header (those without a 'src'
        # attribute) and concatenate them into one block of JS code
        inline_scripts: List[_Element] = root.xpath(
            '/html/head/script[@type="text/javascript" and not(@src)]'
        )
        js = "\n".join([script.text_content() for script in inline_scripts])

        # Run each parser in turn; each one appends to its own file so that
        # their results can be compared
        parsers = [XpathParser(), MatchParser(graph, id), RegexParser()]
        for parser in parsers:
            with open(f"/tmp/{str(parser)}.txt", "a") as report:
                print(f"\n{id:8}", end="", file=report)
                try:
                    parser.parse(js, output=report)
                except ParseError as err:
                    print(f"{parser} -> {id}: Parsing error: {err}", file=sys.stderr)

        # Parse comments: every child of the "zoneInvisible" element is one comment
        for div in root.get_element_by_id("zoneInvisible"):
            comment = Comment()
            comment.text = div.text_content()
            comment.html = html.tostring(div, encoding="unicode")
            comment.elem = div
            comment.id = div.get("id") or ""

            # Split the id into its non-digit prefix and its digits, then
            # dispatch on the prefix
            id_match = regex_comment.match(comment.id)
            if id_match is None:
                continue
            prefix, digits = id_match.groups()
            if prefix == "divCmt":
                comment.num = int(digits)
                page.comments_misc.append(comment)
                predicate = "commentaireInfo"
            elif prefix == "divSugg":
                comment.num = int(digits)
                page.comments_sugg.append(comment)
                predicate = "commentaireSugg"
            elif prefix == "divCmtSucces":
                page.comment_success = comment
                predicate = "commentaireSucces"
            else:
                # Not a comment we know about
                continue
            graph.add((NS[id], NS[predicate], Literal(comment.html)))
        # NOTE: `page` is filled in but neither returned nor printed