Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
Macao Legacy
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
MACAO
Macao Legacy
Commits
7364be88
Commit
7364be88
authored
11 months ago
by
Eliott Sammier
Browse files
Options
Downloads
Patches
Plain Diff
Remove unused JS parsers in favor of RegexParser
parent
55c63dcb
Branches
Branches containing commit
No related tags found
1 merge request
!1
Main
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
tetras_extraction/script/src/extract.py
+1
-13
1 addition, 13 deletions
tetras_extraction/script/src/extract.py
tetras_extraction/script/src/extract_page.py
+9
-218
9 additions, 218 deletions
tetras_extraction/script/src/extract_page.py
with
10 additions
and
231 deletions
tetras_extraction/script/src/extract.py
+
1
−
13
View file @
7364be88
...
...
@@ -148,14 +148,6 @@ def compare_files(f1: str, f2: str):
def
main
():
g
=
create_graph
()
# Create or reset debug log files for all activity parsers, to compare their
# results afterwards
parsers
=
(
"
Match
"
,
"
Xpath
"
,
"
Regex
"
)
logfiles
=
[
f
"
/tmp/
{
p
}
Parser_debuglog.txt
"
for
p
in
parsers
]
for
logfile
in
logfiles
:
with
open
(
logfile
,
"
w
"
)
as
f
:
print
(
""
,
file
=
f
)
if
MACAO_VERSION
==
"
full
"
:
# Run the parser once for each version, but with the same RDF graph
for
Context
.
version
in
[
"
macao_12
"
,
"
macao_3
"
]:
...
...
@@ -163,12 +155,8 @@ def main():
parse_manifest
(
g
)
else
:
parse_manifest
(
g
)
export_graph
(
g
)
# Compare log files 2 by 2
compare_files
(
logfiles
[
0
],
logfiles
[
1
])
compare_files
(
logfiles
[
0
],
logfiles
[
2
])
compare_files
(
logfiles
[
1
],
logfiles
[
2
])
export_graph
(
g
)
if
__name__
==
"
__main__
"
:
...
...
This diff is collapsed.
Click to expand it.
tetras_extraction/script/src/extract_page.py
+
9
−
218
View file @
7364be88
import
re
from
abc
import
abstractmethod
from
dataclasses
import
dataclass
from
typing
import
Any
,
Callable
from
typing
import
Any
import
esprima
as
es
from
lxml
import
etree
,
html
from
lxml
import
html
from
lxml.etree
import
_Element
from
lxml.html
import
HtmlElement
from
rdflib
import
RDF
,
Graph
,
Literal
...
...
@@ -428,210 +427,6 @@ class RegexParser(JSParser):
raise
exception
from
e
class
XpathParser
(
JSParser
):
"""
A parser for the JS portion of an activity, that uses XPath to query
an XML representation of Esprima
'
s abstract syntax tree (AST)
"""
# XPath requests pre-compiled as functions
request_function
=
etree
.
XPath
(
'
//FunctionDeclaration[id/Identifier[@name=
"
entrerDonnees
"
]]
'
)
request_index_and_values
=
etree
.
XPath
(
'
*//VariableDeclarator[id//*[@name=
"
nr
"
]]/init/Literal | *//AssignmentExpression//Identifier[starts-with(@name,
"
CODE_
"
)]
'
)
request_constructor_id
=
etree
.
XPath
(
'
*//NewExpression/callee/Identifier[@name=
"
Cours
"
or starts-with(@name,
"
Exercice
"
)]
'
)
def
__init__
(
self
)
->
None
:
self
.
fun
:
Any
"""
AST element corresponding to the function we
'
re interested in.
Initialised in `self.parse()`.
"""
@override
def
parse
(
self
,
js
:
str
)
->
Activity
:
jstree
:
Any
=
es
.
parseScript
(
js
,
None
)
# Convert Esprima object tree to XML etree
xml
=
self
.
to_xml
(
jstree
.
toDict
(),
"
jstree
"
)
try
:
self
.
fun
=
self
.
request_function
(
xml
)[
0
]
activity
=
self
.
_parse_activity_type
()
if
isinstance
(
activity
,
ExerciceQC
):
self
.
_parse_qc_answers
(
activity
)
return
activity
except
Exception
as
e
:
raise
ParseError
(
e
)
def
_parse_activity_type
(
self
)
->
Activity
:
constructor_id
=
self
.
request_constructor_id
(
self
.
fun
)[
0
]
match
constructor_id
.
get
(
"
name
"
):
case
"
ExerciceQC
"
:
arg
=
constructor_id
.
xpath
(
"
../../arguments/Literal/@value
"
)[
0
]
if
arg
==
"
QCM
"
:
return
ExerciceQC
(
is_qcm
=
True
)
elif
arg
==
"
QCU
"
:
return
ExerciceQC
()
else
:
raise
ParseError
(
f
"
ExerciceQC: invalid argument
'
{
arg
}
'"
)
case
other
:
return
Activity
.
from_typename
(
other
)
def
_parse_qc_answers
(
self
,
activity
:
ExerciceQC
)
->
None
:
"""
Parse the correct answers for a QC activity
"""
indexes_and_values
=
self
.
request_index_and_values
(
self
.
fun
)
choice_id
=
"
0
"
for
e
in
indexes_and_values
:
value
=
e
.
xpath
(
"
@value
"
)
if
len
(
value
)
!=
0
:
# "index line"
choice_id
=
value
[
0
]
else
:
# "correct" or "incorrect" line
activity
.
set_correct
(
choice_id
,
e
.
get
(
"
name
"
)
==
"
CODE_V
"
)
def
to_xml
(
self
,
obj
:
Any
,
tag_name
:
str
|
None
=
None
):
"""
Recursively convert an object structure to an XML `ElementTree`.
Structures are expected to be Python dictionaries.
Converting a dictionary produces a tag named after the
"
type
"
attribute (if present).
- A primitive attribute (i.e. not list nor dict) becomes a tag attribute.
- A list attribute becomes a tag with its contents as sub-tags.
- A dictionary attribute becomes a tag (named like the attribute
'
s key)
containing a sub-tag for the dictionary itself
"""
if
isinstance
(
obj
,
dict
):
# Dictionary (or object):
# - if it has a "type" key, the dict represents an object -> use its value as the tag name
# - if a tag_name is specified as well, it's probably important (like an attribute name),
# so we keep both, as 2 nested tags (tag_name for the outer tag, type for the inner tag)
inner_tag
=
None
outer_tag
=
None
has_inner
=
"
type
"
in
obj
.
keys
()
if
has_inner
:
inner_tag
=
etree
.
Element
(
obj
[
"
type
"
],
None
,
None
)
else
:
inner_tag
=
etree
.
Element
(
"
_dict
"
,
None
,
None
)
if
tag_name
is
not
None
:
outer_tag
=
etree
.
Element
(
tag_name
)
if
has_inner
:
outer_tag
.
append
(
inner_tag
)
else
:
inner_tag
=
outer_tag
else
:
outer_tag
=
inner_tag
# Recurse on dictionary items
for
key
,
val
in
obj
.
items
():
if
key
!=
"
type
"
:
# exception for 'type', handled as attribute
if
isinstance
(
val
,
(
list
,
dict
)):
# Structured attributes become child tags
inner_tag
.
append
(
self
.
to_xml
(
val
,
key
))
else
:
# Primitive attributes become tag attributes
inner_tag
.
set
(
key
,
str
(
val
))
return
outer_tag
elif
isinstance
(
obj
,
list
):
tag_name
=
tag_name
or
"
_list
"
list_tag
=
etree
.
Element
(
tag_name
)
for
e
in
obj
:
list_tag
.
append
(
self
.
to_xml
(
e
))
return
list_tag
else
:
tag_name
=
tag_name
or
"
_literal
"
leaf_tag
=
etree
.
Element
(
tag_name
)
leaf_tag
.
text
=
str
(
obj
)
return
leaf_tag
class
MatchParser
(
JSParser
):
"""
A parser for the JS portion of an activity, that uses Python match statements
to navigate the abstract syntax tree (AST) produced by Esprima
"""
def
__init__
(
self
,
graph
:
Graph
,
act_id
:
str
)
->
None
:
self
.
graph
=
graph
self
.
act_id
=
act_id
self
.
activity
:
Activity
|
None
=
None
@override
def
parse
(
self
,
js
:
str
)
->
Activity
:
jstree
=
es
.
parseScript
(
js
,
None
)
# Try to match our template with one of the top-level statements
for
statement
in
jstree
.
body
:
self
.
match_function
(
statement
.
toDict
())
if
self
.
activity
is
not
None
:
return
self
.
activity
else
:
raise
ParseError
(
"
No activity constructor found
"
)
def
match_constructor_call
(
self
,
new_expr
:
dict
[
str
,
Any
]):
if
self
.
activity
is
not
None
:
# Ignore anything after the first match
return
match
new_expr
:
case
{
"
type
"
:
"
NewExpression
"
,
"
callee
"
:
{
"
type
"
:
"
Identifier
"
,
"
name
"
:
typ
,
},
"
arguments
"
:
[
*
args
],
}:
match
typ
:
case
"
Cours
"
|
"
ExerciceQM
"
|
"
ExerciceTAT
"
|
"
ExerciceGD
"
:
self
.
activity
=
Activity
.
from_typename
(
typ
)
case
"
ExerciceQC
"
:
match
args
:
case
[{
"
type
"
:
"
Literal
"
,
"
value
"
:
"
QCU
"
},
*
_
]:
typ
+=
"
_QCU
"
self
.
activity
=
ExerciceQC
()
case
[{
"
type
"
:
"
Literal
"
,
"
value
"
:
"
QCM
"
},
*
_
]:
typ
+=
"
_QCM
"
self
.
activity
=
ExerciceQC
(
is_qcm
=
True
)
case
_
:
raise
ParseError
(
f
"
ExerciceQC: Invalid argument
'
{
args
}
'"
)
case
_
:
raise
ParseError
(
f
"
Unknown activity type
'
{
typ
}
'"
)
case
_
:
pass
def
match_function
(
self
,
func
:
dict
[
str
,
Any
]):
"""
Checks if `func` matches a function declaration named `entrerDonnees`,
and search its body if successful
"""
match
func
:
case
{
"
type
"
:
"
FunctionDeclaration
"
,
"
id
"
:
{
"
name
"
:
"
entrerDonnees
"
},
"
body
"
:
{
"
type
"
:
"
BlockStatement
"
,
"
body
"
:
body
},
}:
# Matched a function declaration and captured its `body` attr
for
statement
in
body
:
# Find constructor calls (e.g. `new Thing()`) recursively
recurse_prefix
(
statement
,
self
.
match_constructor_call
)
case
_
:
pass
def recurse_prefix(t: Any, f: Callable[[Any], None]):
    """
    Pre-order, depth-first traversal: applies `f` to `t` itself first, then
    descends into each item of `t` (for a list) or each value of `t` (for a
    dictionary), applying the same treatment recursively. Any other kind of
    object is a leaf.

    :param t: The object
    :param f: The function to call
    """
    # Visit the node itself before any of its children
    f(t)

    if isinstance(t, dict):
        children = t.values()
    elif isinstance(t, list):
        children = t
    else:
        # Leaf: nothing to descend into
        return

    for child in children:
        recurse_prefix(child, f)
def
decode_answer_id
(
id
:
str
):
"""
Decode an obfuscated answer ID, just like the `decodeX()` function
...
...
@@ -681,13 +476,9 @@ def parse_page(graph: Graph, filepath: str, id: str):
js
=
"
\n
"
.
join
((
s
.
text_content
()
for
s
in
scripts
))
activity
=
Activity
()
# Try different parsers, each writing to a different file to compare their results
for
parser
in
[
XpathParser
(),
MatchParser
(
graph
,
id
),
RegexParser
(
graph
,
id
)]:
with
open
(
f
"
/tmp/
{
str
(
parser
)
}
_debuglog.txt
"
,
"
a
"
)
as
f
:
print
(
f
"
\n
{
id
:
8
}
"
,
end
=
""
,
file
=
f
)
parser
=
RegexParser
(
graph
,
id
)
try
:
activity
:
Activity
=
parser
.
parse
(
js
)
print
(
activity
,
end
=
""
,
file
=
f
)
except
ParseError
as
e
:
log
.
error
(
f
"
{
parser
}
->
{
id
}
: Parsing error:
{
e
}
. Treating this as a generic Activity.
"
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment