#!/usr/bin/env bash
#
# Command-line helper for the MACAO extraction tools (its own help text and
# the "status" command refer to it as "mcli").
#
# It resolves which part of the MACAO sources to work on (VERSION), exports
# the directory variables consumed by the helper scripts, then dispatches to
# a sub-command. Run it with "help" for the list of sub-commands.
#
# Environment:
#   VERSION     macao_12 | macao_3 | each | full   (default: full)
#   MACAO_ROOT  repository root (default: directory containing this script)

# Default value for version env var
# (alternative default kept for reference: VERSION="macao_12")
VERSION="${VERSION:-full}"

# First parameter overrides version if set
version_regex='^(macao_12|macao_3|each|full)$'
if [[ "$1" == "12" || "$1" == "3" ]]; then
  VERSION="macao_$1"
  shift
elif [[ "$1" =~ $version_regex ]]; then
  VERSION="$1"
  shift
fi
export VERSION

if [[ ! "$VERSION" =~ $version_regex ]]; then
  echo "Invalid version '$VERSION'" >&2
  exit 1
fi

# Special version value "each" is processed by calling the script itself once
# for each actual version (the leading "12"/"3" argument overrides the
# exported VERSION in the child invocation).
if [[ "$VERSION" = "each" && "$1" != "shell" && "$1" != "status" && "$1" != "setup" ]]; then
  echo "macao_12:" && "$0" 12 "$@"
  echo "macao_3:" && "$0" 3 "$@"
  exit
fi

export MACAO_ROOT="${MACAO_ROOT:-$(dirname "$(realpath "$0")")}"
export SCRIPTS_DIR="$MACAO_ROOT/tetras_extraction/script"

# Sources live in a per-version sub-directory, except for "each"/"full" which
# point at the directory containing both versions.
SOURCES_DIR="$MACAO_ROOT/Basilisk/MACAO"
if [[ "$VERSION" != "each" && "$VERSION" != "full" ]]; then
  SOURCES_DIR="$SOURCES_DIR/$VERSION"
fi
export SOURCES_DIR

# Results are separated per version (including "full"), except for "each".
RESULTS_DIR="$MACAO_ROOT/tetras_extraction/result"
if [[ "$VERSION" != "each" ]]; then
  RESULTS_DIR="$RESULTS_DIR/$VERSION"
fi
export RESULTS_DIR

# Print the help text on stdout.
print_usage() {
  cat <<EOF
Usage: $(basename "$0") [version] <command> [args]

VERSION
  Specifies which part of the MACAO repository to use: "macao_12", "macao_3"
  (or simply "12" and "3"), "each" for both in separate result dirs and
  "full" for both merged in a single result dir.
  If not specified, uses the value of the "VERSION" environment variable,
  or "full" by default.
  Some commands do not support every version.

COMMANDS
  status               Print useful info about the current environment.
  shell [-p|--pyenv]   Open a shell with mcli's environment variables set,
                       including PATH. If -p or --pyenv is specified, also
                       enter the Python virtual env.
  list-streams <file>  List audio streams in <file>
  count-streams [<file>]
                       Count audio streams in <file>, or from all SWF files
                       if none is given
  index-extensions     Index all files by extension
  count-all [-f|--force]
                       Count many types of Macao objects. If -f or --force is
                       given, refresh indexes before counting (equivalent to
                       count-streams and index-extensions)
  setup                Initialize Python environment required by extractors
  setup-debug          (Re)create .env file used by the Python debugger
                       launch config
  extract              Run the extract stage, to generate RDF from text sources
  transform            Run the transform stage, to complete and clean-up the
                       RDF data
  export               Run the export stage, to generate Macao-Hugo content
                       pages
  convert              Run the full conversion process
                       (extract -> transform -> export)
  test                 Run simple tests on the extracted RDF data
  extract-mp3 [-y|--yes-overwrite]
                       Extract audio streams from all Flash SWF files
  copy-images          Copy all GIF, JPG and PNG media files to the result
                       "img" directory
  help                 Print this help and exit
EOF
}

# Check if $1 is a regular file, otherwise if it's a filename (and not a path)
# look for this filename in fallback directory $2.
# Outputs: the resolved path on stdout.
# Returns: 0 when a file was found, 1 (with a message on stderr) otherwise.
check_file() {
  local candidate="$1"
  local fallback_dir="$2"

  if [[ -f "$candidate" ]]; then
    echo "$candidate"
    return 0
  fi
  # Only bare filenames (no directory component) are searched in the fallback
  if [[ -d "$fallback_dir" && "$(basename "$candidate")" = "$candidate" ]]; then
    candidate="$fallback_dir/$candidate"
    if [[ -f "$candidate" ]]; then
      echo "$candidate"
      return 0
    fi
  fi
  echo "No such file '$1'" >&2
  return 1
}

# Enter the Python virtual env of the scripts directory, or exit with an
# error when its activation script cannot be sourced.
activate_venv() {
  # shellcheck disable=SC1091
  if ! source "$SCRIPTS_DIR/venv/bin/activate"; then
    echo "Python venv not found, did you run setup first?" >&2
    exit 1
  fi
}

# List MP3 streams in a file
list_streams() {
  ffprobe -i "$1" 2>&1 | grep -E 'Stream.*Audio: mp3'
}

# Print the number of MP3 streams in file $1.
# Uses "grep -c" rather than "wc -l": some wc implementations pad the number
# with leading spaces, which would break the '^0 ' / '^1 ' lookups that
# count_all performs on the stream index.
count_file_streams() {
  list_streams "$1" | grep -c ''
}

# Count MP3 streams, printing "<count> <name>" lines.
# Arguments: $1 - optional file (path, or filename inside the media dir).
# Without argument, every SWF of the selected version(s) is indexed into
# $RESULTS_DIR/indexes/swf_streams_count.txt, which is then printed.
# Returns: 1 when the given file cannot be found.
count_streams() {
  local file out_file version

  if [[ -n "$1" ]]; then
    file="$(check_file "$1" "$SOURCES_DIR/contenu/media")" || return 1
    echo "Indexing streams from $file ..." >&2
    echo "$(count_file_streams "$file") $(basename "$file")"
    return 0
  fi

  mkdir -p "$RESULTS_DIR/indexes"
  echo "Indexing streams from all SWFs (this may take some time) ..." >&2
  out_file="$RESULTS_DIR/indexes/swf_streams_count.txt"
  if [[ "$VERSION" = "full" ]]; then
    for version in macao_12 macao_3; do
      for file in "$SOURCES_DIR/$version/contenu/media/"*.swf; do
        [[ -e "$file" ]] || continue # glob did not match anything
        echo "$(count_file_streams "$file") $version/$(basename "$file")"
      done
    done | sort -rn >"$out_file"
  else
    for file in "$SOURCES_DIR/contenu/media/"*.swf; do
      [[ -e "$file" ]] || continue # glob did not match anything
      echo "$(count_file_streams "$file") $(basename "$file")"
    done | sort -rn >"$out_file"
  fi
  cat "$out_file"
}

# Index every file under $SOURCES_DIR, grouped by extension, into
# $RESULTS_DIR/indexes/index_per_extension.txt, then print that index.
index_extensions() {
  local out_file all_files ext

  mkdir -p "$RESULTS_DIR/indexes"
  out_file="$RESULTS_DIR/indexes/index_per_extension.txt"
  : >"$out_file" # Clear out file

  # Index all files. The cd happens inside the command substitution's
  # subshell: paths come out relative, the caller's working directory is
  # untouched, and nothing but the file list reaches stdout.
  if ! all_files="$(cd "$SOURCES_DIR" && find . -path '**/.idea' -prune -o -type f -print)"; then
    echo "Could not index files in '$SOURCES_DIR'" >&2
    exit 1
  fi

  # List all extensions, then for each one, filter the index for files with this extension
  # ( Perl expression courtesy of https://stackoverflow.com/a/1842270 )
  perl -ne 'print $1 if m/\.([^.\/]+)$/' <<<"$all_files" | sort -u |
    while IFS= read -r ext; do
      {
        echo "[$ext]"
        grep -E ".*\.$ext\$" <<<"$all_files" | sort
        echo ""
      } >>"$out_file"
    done

  cat "$out_file"
  echo "Indexed all files by extension to $out_file" >&2
}

# Count many kinds of Macao objects from the two index files and from the
# content directory, write the report to $RESULTS_DIR/indexes/count-all.txt
# and print it. Exits with status 1 when an index file is missing.
count_all() {
  local indexes_dir index swf_index content_dir out_file
  local nb_mod nb_subs nb_pages_all nb_pages nb_pages_special
  local nb_courses nb_exo nb_qm nb_qcu nb_qcm nb_tat nb_gd
  local nb_exo_total nb_exo_other nb_act
  local nb_flash nb_flash_0 nb_flash_1 nb_flash_mult
  local nb_png nb_jpg nb_gif nb_img nb_media nb_media_total nb_media_other

  indexes_dir="$RESULTS_DIR/indexes"
  mkdir -p "$indexes_dir"
  index="$indexes_dir/index_per_extension.txt"
  swf_index="$indexes_dir/swf_streams_count.txt"
  content_dir="$SOURCES_DIR/contenu"
  out_file="$indexes_dir/count-all.txt"

  if [[ ! -f "$index" ]]; then
    echo "$index not found, use --force or run index-extensions before" >&2
    exit 1
  fi
  if [[ ! -f "$swf_index" ]]; then
    echo "$swf_index not found, use --force or run count-streams before" >&2
    exit 1
  fi

  # Modules and sub-parts are identified differently in macao_3
  if [[ "$VERSION" = "macao_3" ]]; then
    nb_mod="$(grep -c '<item identifier="seq' "$SOURCES_DIR/imsmanifest.xml")"
    nb_subs="$(grep -Ec 'act[0-9]+.html' "$index")"
  else
    nb_mod="$(grep -c '<item identifier="MosMod' "$SOURCES_DIR/imsmanifest.xml")"
    nb_subs="$(grep -Ec 'MosEtp[0-9]+.html' "$index")"
  fi

  # Pages
  nb_pages_all="$(grep -Ec '/contenu/.*\.htm' "$index")"
  nb_pages="$(grep -Ec '/contenu/pages/pg[0-9]+\.html$' "$index")"
  nb_pages_special="$((nb_pages_all - nb_pages))"

  # Activities: constructor calls found in the content sources
  nb_courses="$(grep -rI 'new Cours(' "$content_dir" | wc -l)"
  nb_exo="$(grep -rIE "new Exercice[[:alpha:]]*\(" "$content_dir" | wc -l)"
  nb_qm="$(grep -rI 'new ExerciceQM(' "$content_dir" | wc -l)"
  nb_qcu="$(grep -rIE "new ExerciceQC\(['|\"]QCU['|\"]" "$content_dir" | wc -l)"
  nb_qcm="$(grep -rIE "new ExerciceQC\(['|\"]QCM['|\"]" "$content_dir" | wc -l)"
  nb_tat="$(grep -rI 'new ExerciceTAT(' "$content_dir" | wc -l)"
  nb_gd="$(grep -rI 'new ExerciceGD(' "$content_dir" | wc -l)"
  nb_exo_total="$((nb_qm + nb_qcu + nb_qcm + nb_tat + nb_gd))"
  nb_exo_other="$((nb_exo - nb_exo_total))"
  nb_act="$((nb_courses + nb_exo_total))"

  # Flash files, split by number of audio streams (first column of swf_index)
  nb_flash="$(grep -Ec '/contenu/media/.*\.swf$' "$index")"
  nb_flash_0="$(grep -Ec '^0 ' "$swf_index")"
  nb_flash_1="$(grep -Ec '^1 ' "$swf_index")"
  nb_flash_mult="$((nb_flash - nb_flash_0 - nb_flash_1))"

  # Images and remaining media
  nb_png="$(grep -Ec '/contenu/media/.*\.png$' "$index")"
  nb_jpg="$(grep -Ec '/contenu/media/.*\.jpg$' "$index")"
  nb_gif="$(grep -Ec '/contenu/media/.*\.gif$' "$index")"
  nb_img="$((nb_png + nb_jpg + nb_gif))"
  nb_media="$(find "$content_dir/media/" -maxdepth 1 -type f | wc -l)"
  nb_media_total="$((nb_flash + nb_img))"
  nb_media_other="$((nb_media - nb_media_total))"

  cat >"$out_file" <<EOF
modules: $nb_mod
sous-parties: $nb_subs
pages: $nb_pages_all
  normales: $nb_pages
  spéciales: $nb_pages_special
activités: $nb_act
  cours: $nb_courses
  exercices: $nb_exo_total
    QCU: $nb_qcu
    QCM: $nb_qcm
    QM: $nb_qm
    TAT: $nb_tat
    GD: $nb_gd
  other: $nb_exo_other
media: $nb_media_total
  images: $nb_img
    png: $nb_png
    jpg: $nb_jpg
    gif: $nb_gif
  flash: $nb_flash
    no audio: $nb_flash_0
    1 audio: $nb_flash_1
    2+ audio: $nb_flash_mult
  other: $nb_media_other
EOF
  cat "$out_file"
}

# Command dispatch: first remaining argument is the sub-command, the rest are
# its arguments.
action="$1"
shift

case "$action" in
  list-streams)
    if [[ -z "$1" ]]; then
      echo "Usage: list-streams <file>"
      exit 1
    fi
    file="$(check_file "$1" "$SOURCES_DIR/contenu/media")" || exit 1
    list_streams "$file"
    ;;
  count-streams)
    count_streams "$@" || exit 1
    ;;
  index-extensions)
    index_extensions
    ;;
  count-all)
    if [[ "$1" = "-f" || "$1" = "--force" ]]; then
      # Refresh both indexes silently before counting
      index_extensions >/dev/null
      count_streams >/dev/null
    fi
    count_all
    ;;
  setup)
    "$SCRIPTS_DIR/setup.sh"
    ;;
  setup-debug)
    envfile="$SCRIPTS_DIR/.env"
    {
      echo "VERSION='$VERSION'"
      echo "MACAO_ROOT='$MACAO_ROOT'"
      echo "SOURCES_DIR='$SOURCES_DIR'"
      echo "SCRIPTS_DIR='$SCRIPTS_DIR'"
      echo "RESULTS_DIR='$RESULTS_DIR'"
    } >"$envfile"
    ;;
  convert)
    #activate_venv
    python "$SCRIPTS_DIR/src/main.py"
    ;;
  extract)
    #activate_venv
    python "$SCRIPTS_DIR/src/extract.py"
    ;;
  transform)
    #activate_venv
    python "$SCRIPTS_DIR/src/transform.py"
    ;;
  export)
    #activate_venv
    python "$SCRIPTS_DIR/src/export.py"
    ;;
  test)
    #activate_venv
    python "$SCRIPTS_DIR/src/test.py"
    ;;
  extract-mp3)
    # NOTE(review): "$SOURCES_DIR/$version" only looks right when SOURCES_DIR
    # is the directory holding both versions (VERSION "full" or "each");
    # confirm whether single-version runs are meant to be supported here.
    for version in macao_12 macao_3; do
      for audio_file in "$SOURCES_DIR/$version/contenu/media/"*.swf; do
        [[ -e "$audio_file" ]] || continue # glob did not match anything
        "$SCRIPTS_DIR/extract_mp3.sh" "$@" --output-dir "$RESULTS_DIR/audio" "$audio_file"
      done
    done
    ;;
  copy-images)
    # NOTE(review): same assumption on SOURCES_DIR as extract-mp3 above.
    mkdir -p "$RESULTS_DIR/img/"
    for version in macao_12 macao_3; do
      for type in gif jpg png; do
        for image_file in "$SOURCES_DIR/$version/contenu/media/"*".$type"; do
          [[ -e "$image_file" ]] || continue # glob did not match anything
          cp "$image_file" "$RESULTS_DIR/img/"
        done
      done
    done
    ;;
  shell)
    if [[ "$VERSION" = "each" ]]; then
      echo "Subcommand 'shell' not supported for version '$VERSION'" >&2
      exit 1
    fi
    if [[ "$1" = "-p" || "$1" = "--pyenv" ]]; then
      activate_venv
    fi
    export PATH="$PATH:$MACAO_ROOT"
    cd "$MACAO_ROOT" || exit 1
    "${SHELL:-bash}"
    ;;
  status)
    echo "VERSION=$VERSION"
    echo "MACAO_ROOT=$MACAO_ROOT"
    echo "SOURCES_DIR=$SOURCES_DIR"
    echo "SCRIPTS_DIR=$SCRIPTS_DIR"
    echo "RESULTS_DIR=$RESULTS_DIR"
    echo ""
    echo "Python virtual env: ${VIRTUAL_ENV:-"not set"}"
    echo -n "mcli: "
    if command -v mcli >/dev/null; then
      echo "available"
    else
      echo "not in PATH"
    fi
    echo ""
    cd "$MACAO_ROOT" && git status
    ;;
  help)
    print_usage
    ;;
  *)
    echo "Unknown command '$action'" >&2
    print_usage
    exit 1
    ;;
esac