From 997e1cbc0b30783935acd75e7e69f11468c8ca5e Mon Sep 17 00:00:00 2001
From: eliott <eliott.sammier@tetras-libre.fr>
Date: Wed, 19 Jun 2024 12:37:19 +0200
Subject: [PATCH] Reorganise code files and tooling

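- Move all source files to the `src/` directory to separate code from project metadata
- Add explicit linter settings, format-on-save and recommended editor extensions
- Update README and the `mcli` CLI tool for the new paths, and add a `convert` command that runs the new `src/main.py` entrypoint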
---
 mcli                                          | 26 ++++++++++++-------
 .../macao_12/script/.vscode/extensions.json   |  3 ++-
 .../macao_12/script/.vscode/launch.json       |  2 +-
 .../macao_12/script/.vscode/settings.json     | 13 ++++++++--
 tetras_extraction/macao_12/script/README.md   |  2 +-
 .../macao_12/script/{ => src}/common.py       | 15 ++++++-----
 .../macao_12/script/{ => src}/compare_rdf.py  | 15 +++++++----
 .../macao_12/script/{ => src}/extract.py      |  0
 .../script/{ => src}/extract_mosetp.py        |  4 +--
 .../macao_12/script/{ => src}/extract_page.py |  4 +--
 tetras_extraction/macao_12/script/src/main.py | 11 ++++++++
 11 files changed, 64 insertions(+), 31 deletions(-)
 rename tetras_extraction/macao_12/script/{ => src}/common.py (86%)
 rename tetras_extraction/macao_12/script/{ => src}/compare_rdf.py (84%)
 rename tetras_extraction/macao_12/script/{ => src}/extract.py (100%)
 rename tetras_extraction/macao_12/script/{ => src}/extract_mosetp.py (98%)
 rename tetras_extraction/macao_12/script/{ => src}/extract_page.py (99%)
 create mode 100644 tetras_extraction/macao_12/script/src/main.py

diff --git a/mcli b/mcli
index 83b7ac09..e7bd32cf 100755
--- a/mcli
+++ b/mcli
@@ -51,6 +51,8 @@ extract-rdf
     Run the extractor to generate RDF from text sources
 extract-mp3
     Extract audio streams from all Flash SWF files
+convert
+    Run the full conversion process (extract -> transform -> export)
 help
     Print this help and exit
 EOF
@@ -70,6 +72,13 @@ check_file() {
     return 1
 }
 
+activate_venv() { # Activate the Python venv in the current shell, or abort
+    if ! source "$SCRIPTS_DIR/venv/bin/activate"; then
+        echo "Python venv not found, did you run setup first?" >&2
+        exit 1 # `exit` inside a function terminates the whole script
+    fi
+}
+
 # List MP3 streams in a file
 list_streams() {
     ffprobe -i "$1" 2>&1 | grep -E 'Stream.*Audio: mp3'
@@ -206,13 +215,13 @@ count-all)
 setup)
     "$SCRIPTS_DIR/setup.sh"
     ;;
+convert)
+    activate_venv
+    python "$SCRIPTS_DIR/src/main.py"
+    ;;
 extract-rdf)
-    if source "$SCRIPTS_DIR/venv/bin/activate"; then
-        python "$SCRIPTS_DIR/extract.py"
-    else
-        echo "Python venv not found, did you run setup first?" >&2
-        exit 1
-    fi
+    activate_venv
+    python "$SCRIPTS_DIR/src/extract.py"
     ;;
 extract-mp3)
     for audio_file in "$SOURCES_DIR/contenu/media/"*.swf; do
@@ -221,10 +230,7 @@ extract-mp3)
     ;;
 shell)
     if [[ "$1" = "-p" || "$1" = "--pyenv" ]]; then
-        if ! source "$SCRIPTS_DIR/venv/bin/activate"; then
-            echo "Python venv not found, did you run setup first?" >&2
-            exit 1
-        fi
+        activate_venv
     fi
     export PATH="$PATH:$MACAO_ROOT"
     cd "$MACAO_ROOT"
diff --git a/tetras_extraction/macao_12/script/.vscode/extensions.json b/tetras_extraction/macao_12/script/.vscode/extensions.json
index 9e75ef51..85395aa1 100644
--- a/tetras_extraction/macao_12/script/.vscode/extensions.json
+++ b/tetras_extraction/macao_12/script/.vscode/extensions.json
@@ -1,6 +1,7 @@
 {
     "recommendations": [
         "detachhead.basedpyright",
-        "ms-python.black-formatter"
+        "ms-python.black-formatter",
+        "ms-python.isort"
     ]
 }
\ No newline at end of file
diff --git a/tetras_extraction/macao_12/script/.vscode/launch.json b/tetras_extraction/macao_12/script/.vscode/launch.json
index b7b64a14..f228b954 100644
--- a/tetras_extraction/macao_12/script/.vscode/launch.json
+++ b/tetras_extraction/macao_12/script/.vscode/launch.json
@@ -8,7 +8,7 @@
             "name": "Python Debugger: Current File",
             "type": "debugpy",
             "request": "launch",
-            "program": "extract.py",
+            "program": "src/extract.py",
             "console": "integratedTerminal"
         }
     ]
diff --git a/tetras_extraction/macao_12/script/.vscode/settings.json b/tetras_extraction/macao_12/script/.vscode/settings.json
index dcb1530c..c3434325 100644
--- a/tetras_extraction/macao_12/script/.vscode/settings.json
+++ b/tetras_extraction/macao_12/script/.vscode/settings.json
@@ -1,4 +1,13 @@
 {
-    "python.analysis.typeCheckingMode": "basic",
-    "python.analysis.autoImportCompletions": true
+    "basedpyright.analysis.diagnosticMode": "workspace",
+    "basedpyright.analysis.exclude": [
+        "venv/"
+    ],
+    "basedpyright.disableOrganizeImports": true,
+    "editor.codeActionsOnSave": {
+        "source.organizeImports": "always"
+    },
+    "editor.formatOnSave": true,
+    "python.analysis.autoImportCompletions": true,
+    "python.analysis.typeCheckingMode": "basic"
 }
\ No newline at end of file
diff --git a/tetras_extraction/macao_12/script/README.md b/tetras_extraction/macao_12/script/README.md
index cd15a089..9cfa2903 100644
--- a/tetras_extraction/macao_12/script/README.md
+++ b/tetras_extraction/macao_12/script/README.md
@@ -9,7 +9,7 @@ Ensuite pour chaque shell, il est nécessaire de `source venv/bin/activate` avan
 de pouvoir lancer Python.
 
 ```sh
-python extract.py
+python src/extract.py
 ```
 `extract.py` est le point d'entrée de l'extracteur, qui produit une représentation
 RDF des contenus textuels extraits de Macao12.  
diff --git a/tetras_extraction/macao_12/script/common.py b/tetras_extraction/macao_12/script/src/common.py
similarity index 86%
rename from tetras_extraction/macao_12/script/common.py
rename to tetras_extraction/macao_12/script/src/common.py
index e9aa55a8..31fcc7d4 100644
--- a/tetras_extraction/macao_12/script/common.py
+++ b/tetras_extraction/macao_12/script/src/common.py
@@ -3,8 +3,7 @@ from sys import stderr
 from typing import Any
 
 from lxml import html
-from rdflib import Graph, Literal, RDFS, URIRef
-from rdflib import Namespace
+from rdflib import RDFS, Graph, Literal, Namespace, URIRef
 
 
 def env_path_or_rel_default(env_var: str, default: str) -> str:
@@ -20,15 +19,17 @@ def env_path_or_rel_default(env_var: str, default: str) -> str:
 
 MODULE_DIR = path.dirname(path.realpath(__file__))
 """Absolute path of this module's directory"""
-MACAO_ROOT = env_path_or_rel_default("MACAO_ROOT", "../../..")
+MACAO_ROOT = env_path_or_rel_default("MACAO_ROOT", "../../../..")
 """Path to the Macao root directory"""
-SOURCE_DIR = env_path_or_rel_default("SOURCES_DIR", "../../../Basilisk/MACAO/macao_12")
+SOURCE_DIR = env_path_or_rel_default(
+    "SOURCES_DIR", "../../../../Basilisk/MACAO/macao_12"
+)
 """Path to the Macao source directory (i.e. the one with the manifest)"""
-RESULT_DIR = env_path_or_rel_default("RESULTS_DIR", "../result")
+RESULT_DIR = env_path_or_rel_default("RESULTS_DIR", "../../result")
 """Path to the output directory"""
-RESULT_FILE = env_path_or_rel_default("RESULT_FILE", "../result/macao_content.ttl")
+RESULT_FILE = env_path_or_rel_default("RESULT_FILE", RESULT_DIR + "/macao_content.ttl")
 """Path to the Turtle output file"""
-SCHEMA_FILE = env_path_or_rel_default("SCHEMA_FILE", "../macao_schema.ttl")
+SCHEMA_FILE = env_path_or_rel_default("SCHEMA_FILE", "../../macao_schema.ttl")
 """Path to the schema file"""
 
 NS = Namespace("http://www.semanticweb.org/eliott/ontologies/2024/4/macao/")
diff --git a/tetras_extraction/macao_12/script/compare_rdf.py b/tetras_extraction/macao_12/script/src/compare_rdf.py
similarity index 84%
rename from tetras_extraction/macao_12/script/compare_rdf.py
rename to tetras_extraction/macao_12/script/src/compare_rdf.py
index 3b7775e8..1676c466 100755
--- a/tetras_extraction/macao_12/script/compare_rdf.py
+++ b/tetras_extraction/macao_12/script/src/compare_rdf.py
@@ -5,18 +5,22 @@ isomorphic, and their differences otherwise.
 Implemented from https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#module-rdflib.compare
 """
 from sys import argv, stderr
+
 from rdflib import Graph
-from rdflib.compare import to_isomorphic, graph_diff
+from rdflib.compare import graph_diff, to_isomorphic
+
 
 def dump_nt_sorted(g: Graph):
-    for l in sorted(g.serialize(format='nt').splitlines()):
-        if l: print("\t"+l)
+    for l in sorted(g.serialize(format="nt").splitlines()):
+        if l:
+            print("\t" + l)
+
 
 def main():
     if len(argv) < 3:
         print(f"Usage: {argv[0]} <first_graph> <second_graph>", file=stderr)
         exit()
-        
+
     g1 = Graph().parse(argv[1])
     g2 = Graph().parse(argv[2])
     iso1 = to_isomorphic(g1)
@@ -33,5 +37,6 @@ def main():
         print("In second:")
         dump_nt_sorted(in_second)
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tetras_extraction/macao_12/script/extract.py b/tetras_extraction/macao_12/script/src/extract.py
similarity index 100%
rename from tetras_extraction/macao_12/script/extract.py
rename to tetras_extraction/macao_12/script/src/extract.py
diff --git a/tetras_extraction/macao_12/script/extract_mosetp.py b/tetras_extraction/macao_12/script/src/extract_mosetp.py
similarity index 98%
rename from tetras_extraction/macao_12/script/extract_mosetp.py
rename to tetras_extraction/macao_12/script/src/extract_mosetp.py
index 426a5c97..3bb26f15 100644
--- a/tetras_extraction/macao_12/script/extract_mosetp.py
+++ b/tetras_extraction/macao_12/script/src/extract_mosetp.py
@@ -1,8 +1,8 @@
-from os import path
 import re
 import subprocess
+from os import path
 
-from rdflib import Graph, Literal, OWL, RDF, RDFS
+from rdflib import OWL, RDF, RDFS, Graph, Literal
 
 from common import *
 from extract_page import parse_page
diff --git a/tetras_extraction/macao_12/script/extract_page.py b/tetras_extraction/macao_12/script/src/extract_page.py
similarity index 99%
rename from tetras_extraction/macao_12/script/extract_page.py
rename to tetras_extraction/macao_12/script/src/extract_page.py
index d69f0e4a..038f698f 100644
--- a/tetras_extraction/macao_12/script/extract_page.py
+++ b/tetras_extraction/macao_12/script/src/extract_page.py
@@ -1,13 +1,13 @@
+import re
 from abc import abstractmethod
 from dataclasses import dataclass
-import re
 from typing import Any, Callable
 
 import esprima as es
 from lxml import etree, html
 from lxml.etree import _Element
 from lxml.html import HtmlElement
-from rdflib import Graph, Literal, RDF
+from rdflib import RDF, Graph, Literal
 from typing_extensions import override
 
 from common import *
diff --git a/tetras_extraction/macao_12/script/src/main.py b/tetras_extraction/macao_12/script/src/main.py
new file mode 100644
index 00000000..c276a20a
--- /dev/null
+++ b/tetras_extraction/macao_12/script/src/main.py
@@ -0,0 +1,11 @@
+import extract
+
+
+def main():
+    """Run the conversion process; only the extraction step is enabled so far."""
+    extract.main()
+    # TODO: enable transform.main() and export.main() (not imported yet)
+
+
+if __name__ == "__main__":
+    main()
-- 
GitLab