Skip to content
Snippets Groups Projects
Commit 5a7cc90d authored by Eliott Sammier
Browse files

Merge branch '22-parse-activity' into extraction

parents d09e3cfc 90b5d8a5
No related branches found
No related tags found
No related merge requests found
......@@ -7,19 +7,29 @@
@base <http://www.semanticweb.org/eliott/ontologies/2024/4/macao/> .
<http://www.semanticweb.org/eliott/ontologies/2024/4/macao> rdf:type owl:Ontology ;
rdfs:label "macao-schema"@fr .
rdfs:label "macao-schema"@fr ;
owl:versionInfo 1.1 .
#################################################################
# Annotation properties
# Object Properties
#################################################################
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao#test
:test rdf:type owl:AnnotationProperty .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponse
:aReponse rdf:type owl:ObjectProperty ;
rdfs:range :Reponse .
#################################################################
# Object Properties
#################################################################
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponseCorrecte
:aReponseCorrecte rdf:type owl:ObjectProperty ;
rdfs:domain :Exercice ;
rdfs:range :Reponse .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/aReponseIncorrecte
:aReponseIncorrecte rdf:type owl:ObjectProperty ;
rdfs:domain :Exercice ;
rdfs:range :Reponse .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/contenuDans
:contenuDans rdf:type owl:ObjectProperty ;
......@@ -77,6 +87,34 @@
rdfs:range xsd:anyURI .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireInfo
:commentaireInfo rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireSucces
:commentaireSucces rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/commentaireSugg
:commentaireSugg rdf:type owl:DatatypeProperty ;
rdfs:domain :Page ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/correct
:correct rdf:type owl:DatatypeProperty ;
rdfs:range xsd:boolean .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/html
:html rdf:type owl:DatatypeProperty ;
rdfs:range rdf:XMLLiteral .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/id
:id rdf:type owl:DatatypeProperty ;
rdfs:subPropertyOf owl:topDataProperty ;
......@@ -96,10 +134,6 @@
# Classes
#################################################################
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao#MacaoRoot
:MacaoRoot rdf:type owl:Class .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Cours
:Cours rdf:type owl:Class ;
rdfs:subClassOf :Page .
......@@ -110,16 +144,41 @@
rdfs:subClassOf :Page .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/FlashObject
:FlashObject rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceGD
:ExerciceGD rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC
:ExerciceQC rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC_QCM
:ExerciceQC_QCM rdf:type owl:Class ;
rdfs:subClassOf :ExerciceQC .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQC_QCU
:ExerciceQC_QCU rdf:type owl:Class ;
rdfs:subClassOf :ExerciceQC .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceQM
:ExerciceQM rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/GD
:GD rdf:type owl:Class ;
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ExerciceTAT
:ExerciceTAT rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/FlashObject
:FlashObject rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Image
:Image rdf:type owl:Class ;
rdfs:subClassOf :MacaoRessource .
......@@ -139,6 +198,10 @@
rdfs:subClassOf :MacaoObject .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/MacaoRoot
:MacaoRoot rdf:type owl:Class .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Module
:Module rdf:type owl:Class ;
rdfs:subClassOf :MacaoContenu .
......@@ -149,14 +212,9 @@
rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/QC
:QC rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/QM
:QM rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/Reponse
:Reponse rdf:type owl:Class ;
rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/SimpleFlash
......@@ -169,25 +227,15 @@
rdfs:subClassOf :MacaoContenu .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/TAT
:TAT rdf:type owl:Class ;
rdfs:subClassOf :Exercice .
### http://www.semanticweb.org/eliott/ontologies/2024/4/macao/ValuePartition
:ValuePartition rdf:type owl:Class .
#################################################################
# General axioms
#################################################################
[ rdf:type owl:AllDisjointClasses ;
owl:members ( :GD
:QC
:QM
:TAT
owl:members ( :ExerciceGD
:ExerciceQC
:ExerciceQM
:ExerciceTAT
)
] .
......
This diff is collapsed.
{
"recommendations": [
"detachhead.basedpyright",
"ms-python.black-formatter"
]
}
\ No newline at end of file
from os import environ, path
from sys import stderr
from rdflib import RDFS, Graph, Literal, URIRef
from typing import Any
from lxml import html
from rdflib import Graph, Literal, RDFS, URIRef
from rdflib import Namespace
from os import path, environ
def env_path_or_rel_default(env_var: str, default: str) -> str:
......@@ -34,15 +37,33 @@ NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/")
# Utility functions ############################################################
def eprint(*args, **kwargs):
def eprint(*args, **kwargs): # pyright: ignore[reportMissingParameterType]
    """Just like `print()`, but to standard error instead of standard output.

    All positional and keyword arguments are forwarded unchanged to `print()`.
    `file` is always set to `stderr` here, so callers must not pass their own
    `file` keyword argument.
    """
    print(*args, file=stderr, **kwargs)
def add_title(g: Graph, subject: URIRef, title: str):
def to_html(elem: html.HtmlElement) -> str:
    """Shorthand function to serialise a `HtmlElement` to a HTML string.

    :param elem: The element to serialise
    :return: The output of `html.tostring()` called with `encoding="unicode"`,
        which is requested so that the result is a `str`
    """
    return html.tostring(elem, encoding="unicode")
def insert_grow(l: list[Any], index: int, value: Any, fill_value: Any | None = None):
"""Insert at a given position in a list, growing it if necessary
:param l: list
:param index: The position where the value is inserted
:param value: The value to insert
:param fill_value: The value used for elements created automatically when growing, defaults to None
"""
for _ in range(len(l), index + 1):
l.append(fill_value)
l[index] = value
def set_title(g: Graph, subject: URIRef, title: str):
"""Add triples to define the `subject`'s title and label"""
g.add((subject, RDFS.label, Literal(title)))
g.add((subject, NS["titre"], Literal(title)))
g.set((subject, RDFS.label, Literal(title)))
g.set((subject, NS["titre"], Literal(title)))
def add_index(g: Graph, subject: URIRef, index: int):
......@@ -59,3 +80,10 @@ def add_index(g: Graph, subject: URIRef, index: int):
Literal(f"{index:02} | {name} | ") + title,
)
)
# Exceptions ###################################################################
class ParseError(Exception):
    """Raised when parsed input lacks an expected element (e.g. a missing node
    in the manifest)."""
from pprint import pprint
from typing import Optional
import filecmp
from lxml import etree
from rdflib import RDFS, Graph, Literal, URIRef
......@@ -66,9 +65,11 @@ def parse_manifest(graph: Graph):
# Parse with lxml
root = etree.parse(SOURCE_DIR + "/imsmanifest.xml", None).getroot()
org = ns_find(root, ".//organization")
if org is None:
raise ParseError("Missing node <organization> in manifest")
# For all top-level modules
for i, e in enumerate(ns_findall(org, "item")):
module = NS[e.get("identifier")]
module = NS[e.get("identifier", default="None")]
parse_manifest_rec(graph, e)
graph.add((module, RDFS.subClassOf, NS["MacaoRoot"]))
add_index(graph, module, i)
......@@ -76,9 +77,9 @@ def parse_manifest(graph: Graph):
def parse_manifest_rec(
graph: Graph,
elem,
parentResource: Optional[URIRef] = None,
index: Optional[int] = None,
elem: etree._Element,
parentResource: URIRef | None = None,
index: int | None = None,
):
"""Parses a module `MosMod` from the manifest recursively, adding all its
descendants to the `graph`
......@@ -87,12 +88,13 @@ def parse_manifest_rec(
"""
# Get title and ID
title: str = ns_find(elem, "title").text
id: str = elem.get("identifier")
title = ns_find(elem, "title")
title = title.text if title is not None else "None" # safe default value
id: str = elem.get("identifier", default="None")
# Declare RDF resource and simple properties
subject = NS[id]
graph.add((subject, RDF.type, OWL.NamedIndividual))
add_title(graph, subject, title)
set_title(graph, subject, str(title))
if id.startswith("MosMod"):
# It's a Module:
graph.add((subject, RDF.type, NS["Module"]))
......@@ -119,18 +121,32 @@ def parse_manifest_rec(
extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id)
import extract_page
def compare_files(f1: str, f2: str):
    """Print whether two files have identical contents.

    :param f1: Path of the first file
    :param f2: Path of the second file
    """
    # `filecmp.cmp()` defaults to a shallow comparison, which declares two files
    # equal as soon as their type, size and modification time match, without
    # reading them. Log files written during the same run can easily share
    # those, so force a real content comparison.
    identical = filecmp.cmp(f1, f2, shallow=False)
    verdict = "are identical" if identical else "differ"
    print(f"Files {f1} and {f2} {verdict}.")
def main():
    """Build the graph from the manifest, export it, then compare parser logs.

    One debug log file per activity parser is created (or reset) before
    parsing, and the log files are compared pairwise once the graph has been
    exported.
    """
    graph = create_graph()

    # Create or reset debug log files for all activity parsers, to compare their
    # results afterwards
    log_paths = []
    for parser_name in ("Match", "Xpath", "Regex"):
        log_path = f"/tmp/{parser_name}Parser_debuglog.txt"
        with open(log_path, "w") as log:
            print("", file=log)
        log_paths.append(log_path)

    parse_manifest(graph)
    export_graph(graph)
    # extract_page.parse_page(
    # g,
    # f"{SOURCE_DIR}/contenu/pages/pg60.html",
    # "pg60",
    # )

    # Compare log files 2 by 2
    for pos, left in enumerate(log_paths):
        for right in log_paths[pos + 1 :]:
            compare_files(left, right)
if __name__ == "__main__":
......
from os import path
import re
import subprocess
from os import path
from rdflib import OWL, RDF, RDFS, Graph, Literal
from rdflib import Graph, Literal, OWL, RDF, RDFS
from extract_page import parse_page
from common import *
from extract_page import parse_page
def generate_triples(
......@@ -23,7 +23,8 @@ def generate_triples(
# Type and simple properties
graph.add((page, RDF.type, OWL.NamedIndividual))
graph.add((page, RDF.type, NS["Page"]))
add_title(graph, page, page_title)
graph.add((page, NS["id"], Literal(page_id)))
set_title(graph, page, page_title)
add_index(
graph,
page,
......
This diff is collapsed.
{
"reportMissingTypeStubs": "information",
"reportUnusedCallResult": "none",
"reportUnusedVariable": "warning",
"reportUnusedImport": "warning",
"reportMissingParameterType": "warning",
"reportMissingArgumentType": "warning",
"reportPrivateUsage": "none" /* lxml.etree often returns _Element */,
"reportUnknownParameterType": "none",
"reportUnknownArgumentType": "none",
"reportUnknownVariableType": "none",
"reportUnknownMemberType": "none",
"reportAny": "none"
}
\ No newline at end of file
cssselect==1.2.0
esprima==4.0.1
isodate==0.6.1
lxml==5.2.2
pyparsing==3.1.2
rdflib==7.0.0
six==1.16.0
types-beautifulsoup4==4.12.0.20240511
types-html5lib==1.1.11.20240228
types-lxml==2024.4.14
typing_extensions==4.12.1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment