Skip to content
Snippets Groups Projects
Commit 997e1cbc authored by Eliott Sammier
Browse files

Reorganise code files and tooling

- Move all source files to `src/` dir to separate code and meta
- Add explicit linter settings, auto-format and recommended extensions
- Update README and CLI tool with new entrypoint
parent 5a7cc90d
No related branches found
No related tags found
No related merge requests found
......@@ -51,6 +51,8 @@ extract-rdf
Run the extractor to generate RDF from text sources
extract-mp3
Extract audio streams from all Flash SWF files
convert
Run the full conversion process (extract -> transform -> export)
help
Print this help and exit
EOF
......@@ -70,6 +72,13 @@ check_file() {
return 1
}
# Activate the Python virtual environment located under $SCRIPTS_DIR/venv
# in the current shell. If the activation script cannot be sourced, print a
# hint on stderr and terminate the whole script with exit status 1.
activate_venv() {
    # Success path first: nothing else to do once the venv is active.
    source "$SCRIPTS_DIR/venv/bin/activate" && return
    echo "Python venv not found, did you run setup first?" >&2
    exit 1
}
# List MP3 streams in a file
list_streams() {
ffprobe -i "$1" 2>&1 | grep -E 'Stream.*Audio: mp3'
......@@ -206,13 +215,13 @@ count-all)
setup)
"$SCRIPTS_DIR/setup.sh"
;;
convert)
activate_venv
python "$SCRIPTS_DIR/src/main.py"
;;
extract-rdf)
if source "$SCRIPTS_DIR/venv/bin/activate"; then
python "$SCRIPTS_DIR/extract.py"
else
echo "Python venv not found, did you run setup first?" >&2
exit 1
fi
activate_venv
python "$SCRIPTS_DIR/src/extract.py"
;;
extract-mp3)
for audio_file in "$SOURCES_DIR/contenu/media/"*.swf; do
......@@ -221,10 +230,7 @@ extract-mp3)
;;
shell)
if [[ "$1" = "-p" || "$1" = "--pyenv" ]]; then
if ! source "$SCRIPTS_DIR/venv/bin/activate"; then
echo "Python venv not found, did you run setup first?" >&2
exit 1
fi
activate_venv
fi
export PATH="$PATH:$MACAO_ROOT"
cd "$MACAO_ROOT"
......
{
"recommendations": [
"detachhead.basedpyright",
"ms-python.black-formatter"
"ms-python.black-formatter",
"ms-python.isort"
]
}
\ No newline at end of file
......@@ -8,7 +8,7 @@
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "extract.py",
"program": "src/extract.py",
"console": "integratedTerminal"
}
]
......
{
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoImportCompletions": true
"basedpyright.analysis.diagnosticMode": "workspace",
"basedpyright.analysis.exclude": [
"venv/"
],
"basedpyright.disableOrganizeImports": true,
"editor.codeActionsOnSave": {
"source.organizeImports": "always"
},
"editor.formatOnSave": true,
"python.analysis.autoImportCompletions": true,
"python.analysis.typeCheckingMode": "basic"
}
\ No newline at end of file
......@@ -9,7 +9,7 @@ Ensuite pour chaque shell, il est nécessaire de `source venv/bin/activate` avan
de pouvoir lancer Python.
```sh
python extract.py
python src/extract.py
```
`extract.py` est le point d'entrée de l'extracteur, qui produit une représentation
RDF des contenus textuels extraits de Macao12.
......
......@@ -3,8 +3,7 @@ from sys import stderr
from typing import Any
from lxml import html
from rdflib import Graph, Literal, RDFS, URIRef
from rdflib import Namespace
from rdflib import RDFS, Graph, Literal, Namespace, URIRef
def env_path_or_rel_default(env_var: str, default: str) -> str:
......@@ -20,15 +19,17 @@ def env_path_or_rel_default(env_var: str, default: str) -> str:
MODULE_DIR = path.dirname(path.realpath(__file__))
"""Absolute path of this module's directory"""
MACAO_ROOT = env_path_or_rel_default("MACAO_ROOT", "../../..")
MACAO_ROOT = env_path_or_rel_default("MACAO_ROOT", "../../../..")
"""Path to the Macao root directory"""
SOURCE_DIR = env_path_or_rel_default("SOURCES_DIR", "../../../Basilisk/MACAO/macao_12")
SOURCE_DIR = env_path_or_rel_default(
"SOURCES_DIR", "../../../../Basilisk/MACAO/macao_12"
)
"""Path to the Macao source directory (i.e. the one with the manifest)"""
RESULT_DIR = env_path_or_rel_default("RESULTS_DIR", "../result")
RESULT_DIR = env_path_or_rel_default("RESULTS_DIR", "../../result")
"""Path to the output directory"""
RESULT_FILE = env_path_or_rel_default("RESULT_FILE", "../result/macao_content.ttl")
RESULT_FILE = env_path_or_rel_default("RESULT_FILE", RESULT_DIR + "/macao_content.ttl")
"""Path to the Turtle output file"""
SCHEMA_FILE = env_path_or_rel_default("SCHEMA_FILE", "../macao_schema.ttl")
SCHEMA_FILE = env_path_or_rel_default("SCHEMA_FILE", "../../macao_schema.ttl")
"""Path to the schema file"""
NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/")
......
......@@ -5,12 +5,16 @@ isomorphic, and their differences otherwise.
Implemented from https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#module-rdflib.compare
"""
from sys import argv, stderr
from rdflib import Graph
from rdflib.compare import to_isomorphic, graph_diff
from rdflib.compare import graph_diff, to_isomorphic
def dump_nt_sorted(g: Graph):
    """Print the content of `g` as sorted N-Triples lines.

    The graph is serialized with ``format="nt"``, the resulting lines are
    sorted, and each non-empty line is printed on standard output prefixed
    with a tab character.
    """
    for line in sorted(g.serialize(format="nt").splitlines()):
        # Skip empty lines so that only actual statements are printed.
        if line:
            print("\t" + line)
def main():
if len(argv) < 3:
......@@ -33,5 +37,6 @@ def main():
print("In second:")
dump_nt_sorted(in_second)
if __name__ == "__main__":
main()
from os import path
import re
import subprocess
from os import path
from rdflib import Graph, Literal, OWL, RDF, RDFS
from rdflib import OWL, RDF, RDFS, Graph, Literal
from common import *
from extract_page import parse_page
......
import re
from abc import abstractmethod
from dataclasses import dataclass
import re
from typing import Any, Callable
import esprima as es
from lxml import etree, html
from lxml.etree import _Element
from lxml.html import HtmlElement
from rdflib import Graph, Literal, RDF
from rdflib import RDF, Graph, Literal
from typing_extensions import override
from common import *
......
import extract
def main():
    """Run the conversion process by delegating to `extract.main()`.

    Only the extraction step is invoked for now.
    """
    # TODO: call transform.main() and export.main() here once those steps
    # are ready to be enabled.
    extract.main()
if __name__ == "__main__":
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment