From 06e7a5c660313d4edb4c8857198dc9fe554e7992 Mon Sep 17 00:00:00 2001
From: daxid <david.rouquet@tetras-libre.fr>
Date: Fri, 23 Jun 2023 10:25:44 +0000
Subject: [PATCH] Fix some bugs and clean up code

Factor the per-sentence extraction into a shared run_extraction()
helper used by both the single-process and multiprocessing paths,
re-enable the @timed decorator, and remove debug prints, dead code
and obsolete commented-out blocks.
---
 tenet/__init__.py |   1 -
 tenet/main.py     | 153 ++++++++++++++--------------------------------
 2 files changed, 46 insertions(+), 108 deletions(-)

diff --git a/tenet/__init__.py b/tenet/__init__.py
index bd883646..7a930616 100644
--- a/tenet/__init__.py
+++ b/tenet/__init__.py
@@ -1,7 +1,6 @@
 # -- Update System Path
 import os, sys
 LIB_PATH = os.path.dirname(os.path.abspath(__file__)) + '/'
-print('Running in ' + LIB_PATH)
 os.chdir(LIB_PATH)
 sys.path.insert(0, os.path.abspath(LIB_PATH))
 
diff --git a/tenet/main.py b/tenet/main.py
index 8a61a134..92774bea 100644
--- a/tenet/main.py
+++ b/tenet/main.py
@@ -36,38 +36,10 @@ logger = logging.getLogger('root')
 
 def __set_context():
     # LIB_PATH = os.path.dirname(os.path.abspath(__file__)) + '/'
-    print(f'Running in {LIB_PATH}')
+    print(f'Tenet Running in {LIB_PATH}')
     os.chdir(LIB_PATH)
 
-# def __set_config(
-#         config_file_path,
-#         source_type, source_corpus, onto_prefix,
-#         base_output_dir, technical_dir_path):
-
-#     logger.info("-- Process Setting ")
-#     logger.info(f'----- Corpus source: {source_corpus} ({source_type})')
-#     logger.info(f'----- Base output dir: {base_output_dir}')
-#     logger.info(f'----- technical dir path: {technical_dir_path}')
-#     logger.info(f'----- Ontology target (id): {onto_prefix}')
-#     logger.info(f'----- Current path: {os.getcwd()}')
-#     logger.debug(f'----- Config file: {config_file_path}')
-
-#     process_config = config.Config(config_file_path,
-#                                    onto_prefix,
-#                                    source_corpus,
-#                                    base_output_dir = base_output_dir,
-#                                    technical_dir_path = technical_dir_path,
-#                                    source_type = source_type
-#                                    )
-#     #process_config.source_type = source_type
-#     # config.output_ontology_namespace = target_ontology_namespace
-
-#     logger.debug(process_config.get_full_config())
-
-#     return process_config
-
-
 def __set_config(config_dict):
     config_file_path = config_dict['config_file_path']
     source_type = config_dict['source_type']
@@ -84,14 +56,6 @@ def __set_config(config_dict):
     logger.info(f'----- Current path: {os.getcwd()}')
     logger.debug(f'----- Config file: {config_file_path}')
 
-    # process_config = config.Config(
-    #     config_file_path,
-    #     onto_prefix,
-    #     source_corpus,
-    #     base_output_dir=base_output_dir,
-    #     technical_dir_path=technical_dir_path,
-    #     source_type=source_type
-    # )
     base_config = config.Config(config_dict)
 
     logger.debug(base_config.get_full_config())
@@ -110,9 +74,8 @@ def __count_number_of_graph(config):
 
 def __apply_extraction(config, sentence_file):
     if config.technical_dir_path is not None:
-        os.makedirs(config.sentence_output_dir, exist_ok=True)
-    work_graph = structure.prepare_sentence_work(config, sentence_file)
-
+        os.makedirs(config.sentence_output_dir, exist_ok=True)
+    work_graph = structure.prepare_sentence_work(config, sentence_file)
     _, new_triple_list = process.apply(config, work_graph)
     return new_triple_list
 
@@ -142,6 +105,21 @@ def __serialize_factoid_graph(config, factoid_graph, out_file_path=None):
     return ontology_turtle_string
 
+
+#==============================================================================
+# Extraction Run
+#==============================================================================
+
+def run_extraction(arg_dict):
+    process_config = config.Config(arg_dict)
+    sentence_indice = arg_dict['sentence_list_indice']
+    sentence_file = arg_dict['sentence_file']
+    logger.info(f' *** sentence {sentence_indice} *** ')
+    process_config.sentence_output_dir = f'-{sentence_indice}'
+    new_triple_list = __apply_extraction(process_config, sentence_file)
+    return new_triple_list
+
+
 #==============================================================================
 # AMR Main Methods (to create an ontology) - with one processing
 #==============================================================================
@@ -188,10 +166,6 @@ def create_ontology_from_amrld_file(amrld_file_path,
     }
 
     config = __set_config(config_dict)
-
-    # config = __set_config(OWL_CONFIG_FILE_PATH,
-#                           'amr', amrld_file_path, onto_prefix,
-#                           base_output_dir, technical_dir_path)
 
     assert os.path.exists(amrld_file_path), f'input file does not exists ({amrld_file_path})'
 
@@ -259,23 +233,31 @@ def create_ontology_from_amrld_dir(amrld_dir_path,
 
     config = __set_config(config_dict)
 
-    # config = __set_config(OWL_CONFIG_FILE_PATH,
-#                           'amr', amrld_dir_path, onto_prefix,
-#                           base_output_dir, technical_dir_path)
-
     assert os.path.exists(amrld_dir_path), f'input directory does not exists ({amrld_dir_path})'
     __count_number_of_graph(config)
 
     # -- Extraction Processing
     logger.info('\n === Extraction Processing === ')
-    sentence_dir = config.source_sentence_file
-    sentence_count = 0
+
+    # ----- Sentence File List
+    sentence_dir = config.source_sentence_file
+    sentence_file_list = glob.glob(sentence_dir, recursive = True)
+
+    # ----- Single Processing Extraction Run
     result_triple_list = []
-    for sentence_file in glob.glob(sentence_dir, recursive = True):
-        sentence_count += 1
-        logger.info(f' *** sentence {sentence_count} *** ')
-        config.sentence_output_dir = f'-{sentence_count}'
-        new_triple_list = __apply_extraction(config, sentence_file)
+    for i, sentence_file in enumerate(sentence_file_list):
+        # ----- Computing Extraction Argument (config_dict update)
+        config_dict['sentence_list_indice'] = i
+        config_dict['sentence_file'] = sentence_file
+        new_triple_list = run_extraction(config_dict)
         result_triple_list.extend(new_triple_list)
 
     # -- Final Ontology Generation (factoid_graph)
@@ -297,35 +279,11 @@ def create_ontology_from_amrld_dir(amrld_dir_path,
 # AMR Main Methods (to create an ontology) - Multiprocessing
 #==============================================================================
 
-#global result_triple_queue
-#global sentence_file_list
-
-def dump_queue(q):
-    q.put(None)
-    return list(iter(q.get, None))
-
-def pool_function(arg_dict):
-    #global result_triple_queue
-    #global sentence_file_list
-
-    #process_config = config.Config(OWL_CONFIG_FILE_PATH, 'default', 'default')
-    #process_config.update_from_dict(arg_dict)
-    process_config = config.Config(arg_dict)
-
-    sentence_indice = arg_dict['sentence_list_indice']
-    sentence_file = sentence_file_list[sentence_indice]
-
-    logger.info(f' *** sentence {sentence_indice} *** ')
-    process_config.sentence_output_dir = f'-{sentence_indice}\n'
-    new_triple_list = __apply_extraction(process_config, sentence_file)
-    # The following must handled via a global queue
-    #result_triple_queue.extend(new_triple_list)
-
-    return(new_triple_list)
 
-#@timed
+@timed
 def create_ontology_from_amrld_dir_multi_cpu(amrld_dir_path,
                                              base_ontology_path=None,
                                              onto_prefix=None,
@@ -352,8 +310,6 @@ def create_ontology_from_amrld_dir_multi_cpu(amrld_dir_path,
         Complete Ontology Turtle String (synthesis of all ontology)
 
     """
-    global result_triple_queue
-    global sentence_file_list
 
     logger.info('[TENET] Extraction Processing')
 
     # -- Process Initialization
@@ -372,38 +328,30 @@ def create_ontology_from_amrld_dir_multi_cpu(amrld_dir_path,
     }
 
     config = __set_config(config_dict)
-
-    # config = __set_config(OWL_CONFIG_FILE_PATH,
-#                           'amr', amrld_dir_path, onto_prefix,
-#                           base_output_dir, technical_dir_path)
 
     assert os.path.exists(amrld_dir_path), f'input directory does not exists ({amrld_dir_path})'
     __count_number_of_graph(config)
 
     # -- Extraction Processing
     logger.info('\n === Extraction Processing === ')
-    sentence_dir = config.source_sentence_file
-    sentence_count = 0
-    result_triple_list = []
-
-    #result_triple_queue = multiprocessing.Queue()
 
+    # ----- Sentence File List
+    sentence_dir = config.source_sentence_file
     sentence_file_list = glob.glob(sentence_dir, recursive = True)
 
     # The following is for multiprocessing logging (must be exec before the pool is created
     multiprocessing_logging.install_mp_handler()
 
-    # config_dict = config.to_dict()
-    #star_iterable = [(i, config) for i in range(len(sentence_file_list))]
-
-    mapIterable = []
-
+    # ----- Computing Extraction Argument
+    mapIterable = []
     for i in range(len(sentence_file_list)):
         config_dict['sentence_list_indice'] = i
+        config_dict['sentence_file'] = sentence_file_list[i]
         mapIterable = mapIterable + [config_dict.copy()]
-
+
+    # ----- (Multiprocessing) Extraction Run
     with multiprocessing.Pool(processes) as p:
-        triplesLists = p.map(pool_function, mapIterable)
+        triplesLists = p.map(run_extraction, mapIterable)
 
     result_triple_list = []
     for tripleList in triplesLists :
@@ -411,7 +359,6 @@ def create_ontology_from_amrld_dir_multi_cpu(amrld_dir_path,
 
     # -- Final Ontology Generation (factoid_graph)
     logger.info('\n === Final Ontology Generation === ')
-    #result_triple_list = dump_queue(result_triple_queue)
     factoid_graph = __generate_final_ontology(result_triple_list)
     ontology_turtle_string = __serialize_factoid_graph(config, factoid_graph, out_file_path)
 
@@ -470,10 +417,6 @@ def generate_odrl_from_amrld_file(
 
     config = __set_config(config_dict)
 
-    # config = __set_config(ODRL_CONFIG_FILE_PATH,
-#                           'amr', amrld_file_path, onto_prefix,
-#                           base_output_dir, technical_dir_path)
-
     assert os.path.exists(amrld_file_path), f'input file does not exists ({amrld_file_path})'
 
     # -- Extraction Processing
@@ -536,10 +479,6 @@ def generate_odrl_from_amrld_dir(
     }
     config = __set_config(config_dict)
-
-    # config = __set_config(ODRL_CONFIG_FILE_PATH,
-#                           'amr', amrld_dir_path, onto_prefix,
-#                           base_output_dir, technical_dir_path)
 
     assert os.path.exists(amrld_dir_path), f'input directory does not exists ({amrld_dir_path})'
     __count_number_of_graph(config)
--
GitLab
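
Review note: the sketch below shows how the two directory entry points touched by this commit fit together once both paths share run_extraction(). It is a minimal usage sketch, not the project's documented API: the import path assumes the functions are used from tenet.main, the keyword parameters out_file_path and processes are inferred from the function bodies in the hunks above, and the corpus and output paths are hypothetical.

    # Minimal usage sketch. Assumptions (not confirmed by this patch):
    # the functions are importable from tenet.main, and out_file_path /
    # processes are keyword parameters as the hunks above suggest;
    # the corpus directory and output paths are hypothetical.
    from tenet import main as tenet_main

    AMRLD_DIR = 'corpus/amrld/'  # hypothetical directory of AMR-LD files

    # Single-process path: updates config_dict for each sentence file
    # and calls the shared run_extraction() helper once per sentence.
    turtle = tenet_main.create_ontology_from_amrld_dir(
        AMRLD_DIR,
        onto_prefix='demo',
        out_file_path='demo_factoid.ttl')

    # Multiprocessing path: builds one config_dict.copy() per sentence
    # file, then maps run_extraction over a pool of 4 worker processes.
    turtle_mp = tenet_main.create_ontology_from_amrld_dir_multi_cpu(
        AMRLD_DIR,
        onto_prefix='demo',
        out_file_path='demo_factoid_mp.ttl',
        processes=4)

The per-sentence config_dict.copy() in the multiprocessing path is what makes sharing run_extraction safe: Pool.map pickles each element of mapIterable, so every worker receives an independent argument dict rather than a reference to one shared, mutated dict.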