Skip to content
Snippets Groups Projects
Commit 5a7cc90d authored by Eliott Sammier's avatar Eliott Sammier
Browse files

Merge branch '22-parse-activity' into extraction

parents d09e3cfc 90b5d8a5
No related branches found
No related tags found
No related merge requests found
...@@ -7,19 +7,29 @@ ...@@ -7,19 +7,29 @@
@base <http://www.semanticweb.org/eliott/ontologies/2024/4/macao/> . @base <http://www.semanticweb.org/eliott/ontologies/2024/4/macao/> .
<http://www.semanticweb.org/eliott/ontologies/2024/4/macao> rdf:type owl:Ontology ; <http://www.semanticweb.org/eliott/ontologies/2024/4/macao> rdf:type owl:Ontology ;
rdfs:label "macao-schema"@fr . rdfs:label "macao-schema"@fr ;
owl:versionInfo 1.1 .
################################################################# #################################################################
# Annotation properties # Object Properties
################################################################# #################################################################
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao#test ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponse
:test rdf:type owl:AnnotationProperty . :aReponse rdf:type owl:ObjectProperty ;
rdfs:range :Reponse .
################################################################# ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponseCorrecte
# Object Properties :aReponseCorrecte rdf:type owl:ObjectProperty ;
################################################################# rdfs:domain :Exercice ;
rdfs:range :Reponse .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponseIncorrecte
:aReponseIncorrecte rdf:type owl:ObjectProperty ;
rdfs:domain :Exercice ;
rdfs:range :Reponse .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/contenuDans ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/contenuDans
:contenuDans rdf:type owl:ObjectProperty ; :contenuDans rdf:type owl:ObjectProperty ;
...@@ -77,6 +87,34 @@ ...@@ -77,6 +87,34 @@
rdfs:range xsd:anyURI . rdfs:range xsd:anyURI .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireInfo
:commentaireInfo rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireSucces
:commentaireSucces rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireSugg
:commentaireSugg rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/correct
:correct rdf:type owl:DatatypeProperty ;
rdfs:range xsd:boolean .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/html
:html rdf:type owl:DatatypeProperty ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/id ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/id
:id rdf:type owl:DatatypeProperty ; :id rdf:type owl:DatatypeProperty ;
rdfs:subPropertyOf owl:topDataProperty ; rdfs:subPropertyOf owl:topDataProperty ;
...@@ -96,10 +134,6 @@ ...@@ -96,10 +134,6 @@
# Classes # Classes
################################################################# #################################################################
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao#MacaoRoot
:MacaoRoot rdf:type owl:Class .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Cours ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Cours
:Cours rdf:type owl:Class ; :Cours rdf:type owl:Class ;
rdfs:subClassOf :Page . rdfs:subClassOf :Page .
...@@ -110,16 +144,41 @@ ...@@ -110,16 +144,41 @@
rdfs:subClassOf :Page . rdfs:subClassOf :Page .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/FlashObject ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceGD
:FlashObject rdf:type owl:Class ; :ExerciceGD rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource . rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC
:ExerciceQC rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC_QCM
:ExerciceQC_QCM rdf:type owl:Class ;
rdfs:subClassOf :ExerciceQC .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC_QCU
:ExerciceQC_QCU rdf:type owl:Class ;
rdfs:subClassOf :ExerciceQC .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQM
:ExerciceQM rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/GD ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceTAT
:GD rdf:type owl:Class ; :ExerciceTAT rdf:type owl:Class ;
rdfs:subClassOf :Exercice . rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/FlashObject
:FlashObject rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Image ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Image
:Image rdf:type owl:Class ; :Image rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource . rdfs:subClassOf :MacaoRessource .
...@@ -139,6 +198,10 @@ ...@@ -139,6 +198,10 @@
rdfs:subClassOf :MacaoObject . rdfs:subClassOf :MacaoObject .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/MacaoRoot
:MacaoRoot rdf:type owl:Class .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Module ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Module
:Module rdf:type owl:Class ; :Module rdf:type owl:Class ;
rdfs:subClassOf :MacaoContenu . rdfs:subClassOf :MacaoContenu .
...@@ -149,14 +212,9 @@ ...@@ -149,14 +212,9 @@
rdfs:subClassOf :MacaoContenu . rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/QC ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Reponse
:QC rdf:type owl:Class ; :Reponse rdf:type owl:Class ;
rdfs:subClassOf :Exercice . rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/QM
:QM rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/SimpleFlash ### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/SimpleFlash
...@@ -169,25 +227,15 @@ ...@@ -169,25 +227,15 @@
rdfs:subClassOf :MacaoContenu . rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/TAT
:TAT rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ValuePartition
:ValuePartition rdf:type owl:Class .
################################################################# #################################################################
# General axioms # General axioms
################################################################# #################################################################
[ rdf:type owl:AllDisjointClasses ; [ rdf:type owl:AllDisjointClasses ;
owl:members ( :GD owl:members ( :ExerciceGD
:QC :ExerciceQC
:QM :ExerciceQM
:TAT :ExerciceTAT
) )
] . ] .
......
This diff is collapsed.
{
"recommendations": [
"detachhead.basedpyright",
"ms-python.black-formatter"
]
}
\ No newline at end of file
from os import environ, path
from sys import stderr from sys import stderr
from rdflib import RDFS, Graph, Literal, URIRef from typing import Any
from lxml import html
from rdflib import Graph, Literal, RDFS, URIRef
from rdflib import Namespace from rdflib import Namespace
from os import path, environ
def env_path_or_rel_default(env_var: str, default: str) -> str: def env_path_or_rel_default(env_var: str, default: str) -> str:
...@@ -34,15 +37,33 @@ NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/") ...@@ -34,15 +37,33 @@ NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/")
# Utility functions ############################################################ # Utility functions ############################################################
def eprint(*args, **kwargs): def eprint(*args, **kwargs): # pyright: ignore[reportMissingParameterType]
"""Just like `print()`, but to standard error instead of standard output""" """Just like `print()`, but to standard error instead of standard output"""
print(*args, file=stderr, **kwargs) print(*args, file=stderr, **kwargs)
def add_title(g: Graph, subject: URIRef, title: str): def to_html(elem: html.HtmlElement) -> str:
"""Shorthand function to serialise a `HtmlElement` to a HTML string"""
return html.tostring(elem, encoding="unicode")
def insert_grow(l: list[Any], index: int, value: Any, fill_value: Any | None = None):
"""Insert at a given position in a list, growing it if necessary
:param l: list
:param index: The position where the value is inserted
:param value: The value to insert
:param fill_value: The value used for elements created automatically when growing, defaults to None
"""
for _ in range(len(l), index + 1):
l.append(fill_value)
l[index] = value
def set_title(g: Graph, subject: URIRef, title: str):
"""Add triples to define the `subject`'s title and label""" """Add triples to define the `subject`'s title and label"""
g.add((subject, RDFS.label, Literal(title))) g.set((subject, RDFS.label, Literal(title)))
g.add((subject, NS["titre"], Literal(title))) g.set((subject, NS["titre"], Literal(title)))
def add_index(g: Graph, subject: URIRef, index: int): def add_index(g: Graph, subject: URIRef, index: int):
...@@ -59,3 +80,10 @@ def add_index(g: Graph, subject: URIRef, index: int): ...@@ -59,3 +80,10 @@ def add_index(g: Graph, subject: URIRef, index: int):
Literal(f"{index:02} | {name} | ") + title, Literal(f"{index:02} | {name} | ") + title,
) )
) )
# Exceptions ###################################################################
class ParseError(Exception):
    """Error raised when parsed input lacks an expected structure

    For example, a required node that is missing from the manifest.
    """
from pprint import pprint import filecmp
from typing import Optional
from lxml import etree from lxml import etree
from rdflib import RDFS, Graph, Literal, URIRef from rdflib import RDFS, Graph, Literal, URIRef
...@@ -66,9 +65,11 @@ def parse_manifest(graph: Graph): ...@@ -66,9 +65,11 @@ def parse_manifest(graph: Graph):
# Parse with lxml # Parse with lxml
root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot() root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
org = ns_find(root, ".//organization") org = ns_find(root, ".//organization")
if org is None:
raise ParseError("Missing node <organization> in manifest")
# For all top-level modules # For all top-level modules
for i, e in enumerate(ns_findall(org, "item")): for i, e in enumerate(ns_findall(org, "item")):
module = NS[e.get("identifier")] module = NS[e.get("identifier", default="None")]
parse_manifest_rec(graph, e) parse_manifest_rec(graph, e)
graph.add((module, RDFS.subClassOf, NS["MacaoRoot"])) graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
add_index(graph, module, i) add_index(graph, module, i)
...@@ -76,9 +77,9 @@ def parse_manifest(graph: Graph): ...@@ -76,9 +77,9 @@ def parse_manifest(graph: Graph):
def parse_manifest_rec( def parse_manifest_rec(
graph: Graph, graph: Graph,
elem, elem: etree._Element,
parentResource: Optional[URIRef] = None, parentResource: URIRef | None = None,
index: Optional[int] = None, index: int | None = None,
): ):
"""Parses a module `MosMod` from the manifest recursively, adding all its """Parses a module `MosMod` from the manifest recursively, adding all its
descendants to the `graph` descendants to the `graph`
...@@ -87,12 +88,13 @@ def parse_manifest_rec( ...@@ -87,12 +88,13 @@ def parse_manifest_rec(
""" """
# Get title and ID # Get title and ID
title: str = ns_find(elem, "title").text title = ns_find(elem, "title")
id: str = elem.get("identifier") title = title.text if title is not None else "None" # safe default value
id: str = elem.get("identifier", default="None")
# Declare RDF resource and simple properties # Declare RDF resource and simple properties
subject = NS[id] subject = NS[id]
graph.add((subject, RDF.type, OWL.NamedIndividual)) graph.add((subject, RDF.type, OWL.NamedIndividual))
add_title(graph, subject, title) set_title(graph, subject, str(title))
if id.startswith("MosMod"): if id.startswith("MosMod"):
# It's a Module: # It's a Module:
graph.add((subject, RDF.type, NS["Module"])) graph.add((subject, RDF.type, NS["Module"]))
...@@ -119,18 +121,32 @@ def parse_manifest_rec( ...@@ -119,18 +121,32 @@ def parse_manifest_rec(
extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id) extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id)
import extract_page def compare_files(f1: str, f2: str):
print(
"Files {} and {} {}.".format(
f1, f2, "are identical" if filecmp.cmp(f1, f2) else "differ"
)
)
def main(): def main():
g = create_graph() g = create_graph()
# Create or reset debug log files for all activity parsers, to compare their
# results afterwards
parsers = ("Match", "Xpath", "Regex")
logfiles = [f"/tmp/{p}Parser_debuglog.txt" for p in parsers]
for logfile in logfiles:
with open(logfile, "w") as f:
print("", file=f)
parse_manifest(g) parse_manifest(g)
export_graph(g) export_graph(g)
# extract_page.parse_page(
# g, # Compare log files 2 by 2
# f"{SOURCE_DIR}/contenu/pages/pg60.html", compare_files(logfiles[0], logfiles[1])
# "pg60", compare_files(logfiles[0], logfiles[2])
# ) compare_files(logfiles[1], logfiles[2])
if __name__ == "__main__": if __name__ == "__main__":
......
from os import path
import re import re
import subprocess import subprocess
from os import path
from rdflib import OWL, RDF, RDFS, Graph, Literal from rdflib import Graph, Literal, OWL, RDF, RDFS
from extract_page import parse_page
from common import * from common import *
from extract_page import parse_page
def generate_triples( def generate_triples(
...@@ -23,7 +23,8 @@ def generate_triples( ...@@ -23,7 +23,8 @@ def generate_triples(
# Type and simple properties # Type and simple properties
graph.add((page, RDF.type, OWL.NamedIndividual)) graph.add((page, RDF.type, OWL.NamedIndividual))
graph.add((page, RDF.type, NS["Page"])) graph.add((page, RDF.type, NS["Page"]))
add_title(graph, page, page_title) graph.add((page, NS["id"], Literal(page_id)))
set_title(graph, page, page_title)
add_index( add_index(
graph, graph,
page, page,
......
This diff is collapsed.
{
"reportMissingTypeStubs": "information",
"reportUnusedCallResult": "none",
"reportUnusedVariable": "warning",
"reportUnusedImport": "warning",
"reportMissingParameterType": "warning",
"reportMissingArgumentType": "warning",
"reportPrivateUsage": "none" /* lxml.etree often returns _Element */,
"reportUnknownParameterType": "none",
"reportUnknownArgumentType": "none",
"reportUnknownVariableType": "none",
"reportUnknownMemberType": "none",
"reportAny": "none"
}
\ No newline at end of file
cssselect==1.2.0
esprima==4.0.1
isodate==0.6.1 isodate==0.6.1
lxml==5.2.2 lxml==5.2.2
pyparsing==3.1.2 pyparsing==3.1.2
rdflib==7.0.0 rdflib==7.0.0
six==1.16.0 six==1.16.0
types-beautifulsoup4==4.12.0.20240511
types-html5lib==1.1.11.20240228
types-lxml==2024.4.14
typing_extensions==4.12.1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment