diff --git a/tetras_extraction/macao_12/script/extract.py b/tetras_extraction/macao_12/script/extract.py index abcca88141e7da11f894780cd7b2851d25a3446c..0461cfe25b30323801e2f126aef65a6f29975965 100644 --- a/tetras_extraction/macao_12/script/extract.py +++ b/tetras_extraction/macao_12/script/extract.py @@ -1,3 +1,4 @@ +import filecmp from pprint import pprint from typing import Optional @@ -122,18 +123,32 @@ def parse_manifest_rec( extract_mosetp.parse_mosetp(graph, f"{SOURCE_DIR}/sco/{id}.html", id) -import extract_page +def compare_files(f1, f2): + print( + "Files {} and {} {}.".format( + f1, f2, "are identical" if filecmp.cmp(f1, f2) else "differ" + ) + ) def main(): g = create_graph() + + # Create or reset debug log files for all activity parsers, to compare their + # results afterwards + parsers = ("Match", "Xpath", "Regex") + logfiles = [f"/tmp/{p}Parser_debuglog.txt" for p in parsers] + for logfile in logfiles: + with open(logfile, "w") as f: + print("", file=f) + parse_manifest(g) export_graph(g) - # extract_page.parse_page( - # g, - # f"{SOURCE_DIR}/contenu/pages/pg60.html", - # "pg60", - # ) + + # Compare log files 2 by 2 + compare_files(logfiles[0], logfiles[1]) + compare_files(logfiles[0], logfiles[2]) + compare_files(logfiles[1], logfiles[2]) if __name__ == "__main__": diff --git a/tetras_extraction/macao_12/script/extract_page.py b/tetras_extraction/macao_12/script/extract_page.py index dc3d171265fc03cceb52556e5b84879b7619560d..a36b71f13091c25f83a065fff9bca71e97da78b2 100644 --- a/tetras_extraction/macao_12/script/extract_page.py +++ b/tetras_extraction/macao_12/script/extract_page.py @@ -301,7 +301,7 @@ def parse_page(graph: Graph, filepath: str, id: str): # Try different parsers, each writing to a different file to compare their results for parser in [XpathParser(), MatchParser(graph, id), RegexParser(graph, id)]: - with open(f"/tmp/{str(parser)}.txt", "a") as f: + with open(f"/tmp/{str(parser)}_debuglog.txt", "a") as f: print(f"{id:8}", end="", file=f) try: parser.parse(js, output=f)