diff --git a/tenet/extraction/config.py b/tenet/extraction/config.py
index 851484c84d3e537a5cf4f297b568cac5e736850c..0dc39065226d26fe5b42a8d29d78d11d78a87326 100644
--- a/tenet/extraction/config.py
+++ b/tenet/extraction/config.py
@@ -54,6 +54,7 @@ class Config:
         base_output_dir = config_dict['base_output_dir'] if 'base_output_dir' in config_dict.keys() else None
         technical_dir_path = config_dict['technical_dir_path'] if 'technical_dir_path' in config_dict.keys() else None
         source_type = config_dict['source_type'] if 'source_type' in config_dict.keys() else None
+        sentence_id = config_dict['sentence_id'] if 'sentence_id' in config_dict.keys() else 0
 
         # -- Config XML Tree
@@ -73,6 +74,8 @@ class Config:
         else:
             self.source_type = source_type
         self.extraction_scheme = c_base.get("extraction_scheme")
+        self.sentence_id = sentence_id
+        self.process_ref = f'P-{self.sentence_id}'
 
         # # -- CTS Reference
         # self.cts_ref = ""
diff --git a/tenet/extraction/process.py b/tenet/extraction/process.py
index 6b0b255a4c0e994b740a3ec439949fd1280abd89..ffc6e74bf96425703df3d9ff6c9350a7614bb67c 100644
--- a/tenet/extraction/process.py
+++ b/tenet/extraction/process.py
@@ -41,17 +41,17 @@ def load_cts(config):
 
     except FileNotFoundError:
         logger.error(' *** Error while loading scheme (load_cts) ***')
-        logger.debug(f' ----- current work directory: {os.getcwd()}')
-        logger.debug(f' ----- cts_file unknown: {config.cts_file}')
+        logger.debug(f' [{config.process_ref}] ----- current work directory: {os.getcwd()}')
+        logger.debug(f' [{config.process_ref}] ----- cts_file unknown: {config.cts_file}')
 
     except Exception:
         logger.error(' *** Error while loading scheme (load_cts) ***')
-        logger.debug(f' ----- current work directory: {os.getcwd()}')
-        logger.debug(f' ----- extraction_scheme: {config.extraction_scheme}')
-        logger.debug(f' ----- cts_file: {config.cts_file}')
+        logger.debug(f' [{config.process_ref}] ----- current work directory: {os.getcwd()}')
+        logger.debug(f' [{config.process_ref}] ----- extraction_scheme: {config.extraction_scheme}')
+        logger.debug(f' [{config.process_ref}] ----- cts_file: {config.cts_file}')
 
 
-def get_new_rule_set(rule_def_set, prefix_list):
+def get_new_rule_set(config, rule_def_set, prefix_list):
     """ Get a set of new rules from <rule_def_set> (rule definition dictionary) """
 
@@ -67,11 +67,11 @@ def get_new_rule_set(rule_def_set, prefix_list):
         return rule_set
 
     except:
-        logger.error(' *** Error while loading rule set (get_new_rule_set) *** ')
-        logger.debug(f' ----- len(rule_def_set): {len(rule_def_set)}')
-        logger.debug(f' ----- rule_key: {rule_key}')
-        logger.debug(f' ----- rule_def: {rule_def}')
-        logger.debug(f' ----- len(rule_set): {rule_set}')
+        logger.error(f' [{config.process_ref}] *** Error while loading rule set (get_new_rule_set) *** ')
+        logger.debug(f' [{config.process_ref}] ----- len(rule_def_set): {len(rule_def_set)}')
+        logger.debug(f' [{config.process_ref}] ----- rule_key: {rule_key}')
+        logger.debug(f' [{config.process_ref}] ----- rule_def: {rule_def}')
+        logger.debug(f' [{config.process_ref}] ----- len(rule_set): {rule_set}')
 
 
 def load_rule_set(config, rule_dir, prefix_list):
@@ -108,22 +108,22 @@ def load_rule_set(config, rule_dir, prefix_list):
             # -- Update rule set
             if hasattr(rule_module, 'rule_set'):
                 rule_def_set = rule_module.rule_set
-                new_rule_set = get_new_rule_set(rule_def_set, prefix_list)
+                new_rule_set = get_new_rule_set(config, rule_def_set, prefix_list)
                 rule_set.update(new_rule_set)
 
         return rule_set
 
     except:
-        logger.error(' *** Error while loading rule set (load_rule_set) *** ')
-        logger.debug(f' ----- path_glob_pattern: {path_glob_pattern}')
-        logger.debug(f' ----- sys.path: {sys.path}')
-        logger.debug(f' ----- file_glob_pattern: {file_glob_pattern}')
-        logger.debug(f' ----- rule_filename: {rule_filename}')
-        logger.debug(f' ----- rule_module_name: {rule_module_name}')
-        logger.debug(f' ----- rule_module: {rule_module}')
-        logger.debug(f' ----- len(rule_def_set): {len(rule_def_set)}')
-        logger.debug(f' ----- len(new_rule_set): {len(new_rule_set)}')
-        logger.debug(f' ----- len(rule_set): {len(rule_set)}')
+        logger.error(f' [{config.process_ref}] *** Error while loading rule set (load_rule_set) *** ')
+        logger.debug(f' [{config.process_ref}] ----- path_glob_pattern: {path_glob_pattern}')
+        logger.debug(f' [{config.process_ref}] ----- sys.path: {sys.path}')
+        logger.debug(f' [{config.process_ref}] ----- file_glob_pattern: {file_glob_pattern}')
+        logger.debug(f' [{config.process_ref}] ----- rule_filename: {rule_filename}')
+        logger.debug(f' [{config.process_ref}] ----- rule_module_name: {rule_module_name}')
+        logger.debug(f' [{config.process_ref}] ----- rule_module: {rule_module}')
+        logger.debug(f' [{config.process_ref}] ----- len(rule_def_set): {len(rule_def_set)}')
+        logger.debug(f' [{config.process_ref}] ----- len(new_rule_set): {len(new_rule_set)}')
+        logger.debug(f' [{config.process_ref}] ----- len(rule_set): {len(rule_set)}')
 
 
@@ -147,14 +147,14 @@ def _apply_rule(graph, rule):
     return rule_label, graph, extracted_triple_set
 
 
-def _apply_sequence(graph, sequence_list):
+def _apply_sequence(config, graph, sequence_list):
     """ Apply the rule on the working graph <graph> """
 
     try:
         assert len(sequence_list) > 0, f'Houston, we have a problem: it is an empty sequence ({sequence_list})!'
 
         sequence_label = sequence_list[0]
-        logger.info(f"--- Sequence: {sequence_label}")
+        logger.info(f" [{config.process_ref}] --- Sequence: {sequence_label}")
 
         all_new_triple_set = []
         for rule in sequence_list[1:]:
@@ -166,7 +166,7 @@ def _apply_sequence(graph, sequence_list):
             all_new_triple_set.extend(extracted_triple_set)
             new_triple_count = len(graph) - graph_length_before
 
-            str = f"----- {rule_label}: "
+            str = f" [{config.process_ref}] ----- {rule_label}: "
             str += f"{new_triple_count}/{len(extracted_triple_set)} new triple"
             if new_triple_count > 1: str += f"s"
             str += f" ({len(graph)}, {exec_time_date})"
@@ -178,16 +178,16 @@ def _apply_sequence(graph, sequence_list):
 
         return graph, all_new_triple_set
 
     except AssertionError as ae:
-        logger.error(f' *** *** **** Assertion Error *** *** *** \n {ae}')
+        logger.error(f' [{config.process_ref}] *** *** **** Assertion Error *** *** *** \n {ae}')
 
     except:
-        logger.error(f" *** Error while processing extraction (_apply_new_rule_sequence) ***")
-        logger.debug(f" ----- len(sequence): {len(rule.query_list)} ")
-        logger.debug(f" ----- last rule: {query_label} ")
-        logger.debug(f" ----- last SPARQL query: \n{sparql_query} ")
-        logger.debug(f" ----- len(extracted_triple_set): {len(extracted_triple_set)} ")
-        logger.debug(f" ----- new_triple_count: {new_triple_count} ")
-        logger.debug(f" ----- exec_time_date: {exec_time_date} ")
+        logger.error(f" [{config.process_ref}] *** Error while processing extraction (_apply_new_rule_sequence) ***")
+        logger.debug(f" [{config.process_ref}] ----- len(sequence): {len(rule.query_list)} ")
+        logger.debug(f" [{config.process_ref}] ----- last rule: {query_label} ")
+        logger.debug(f" [{config.process_ref}] ----- last SPARQL query: \n{sparql_query} ")
+        logger.debug(f" [{config.process_ref}] ----- len(extracted_triple_set): {len(extracted_triple_set)} ")
+        logger.debug(f" [{config.process_ref}] ----- new_triple_count: {new_triple_count} ")
+        logger.debug(f" [{config.process_ref}] ----- exec_time_date: {exec_time_date} ")
 
 
@@ -197,28 +197,27 @@ def _serialize_graph(config, graph, step_name):
     try:
         uuid_str = config.uuid_str
         work_file = config.output_file.replace('.ttl', '_' + step_name + '.ttl')
-        base_ref = "http://{0}/{1}".format(uuid_str, step_name)
+        base_ref = f"http://{uuid_str}/{step_name}"
 
-        message = "--- Serializing graph to {0} "
-        message = message.format(Path(work_file).stem)
+        message = f" [{config.process_ref}] --- Serializing graph to {Path(work_file).stem} "
         logger.debug(message)
-        logger.debug("----- step: {0}".format(step_name))
-        logger.debug("----- id: {0}".format(uuid_str))
-        logger.debug("----- work_file: {0}".format(work_file))
-        logger.debug("----- base: {0}".format(base_ref))
+        logger.debug(f" [{config.process_ref}] ----- step: {step_name}")
+        logger.debug(f" [{config.process_ref}] ----- id: {uuid_str}")
+        logger.debug(f" [{config.process_ref}] ----- work_file: {work_file}")
+        logger.debug(f" [{config.process_ref}] ----- base: {base_ref}")
 
         graph.serialize(destination=work_file, base=base_ref, format='turtle')
 
     except:
-        logger.error(" *** Error while serializing graph (serialize_graph) ***")
-        logger.debug(" ----- work_file: {0} ".format(work_file))
+        logger.error(f" [{config.process_ref}] *** Error while serializing graph (serialize_graph) ***")
+        logger.debug(f" [{config.process_ref}] ----- work_file: {work_file} ")
 
 
 def apply_step(config, graph, rule_set, step_number, step_name, step_sequence_def):
     """ Apply extraction step on the working graph """
 
     try:
-        logger.info(f"-- Step {step_number}: {step_name}")
+        logger.info(f" [{config.process_ref}] -- Step {step_number}: {step_name}")
 
         # -- Initialize
         step_triple_list = []
@@ -226,7 +225,7 @@ def apply_step(config, graph, rule_set, step_number, step_name, step_sequence_de
 
         # -- Apply the sequences of the step
         for sequence_def in step_sequence_def:
-            graph, triple_list = _apply_sequence(graph, sequence_def)
+            graph, triple_list = _apply_sequence(config, graph, sequence_def)
             step_triple_list.extend(triple_list)
 
         # -- Serialize the working graph updated during the step
@@ -235,21 +234,20 @@ def apply_step(config, graph, rule_set, step_number, step_name, step_sequence_de
             _serialize_graph(config, graph, step_name)
 
         # -- Log extracted triple number
-        str = "----- {0} triples extracted during {1} step"
         new_triple_count = len(graph) - graph_length_before_step
-        logger.info(str.format(new_triple_count, step_name))
+        logger.info(f" [{config.process_ref}] ----- {new_triple_count} triples extracted during {step_name} step")
 
         return graph, step_triple_list
 
     except AssertionError:
-        logger.error(f' *** *** **** Assertion Error *** *** *** \n')
+        logger.error(f' [{config.process_ref}] *** *** **** Assertion Error *** *** *** \n')
 
     except:
-        logger.error(" *** Error while processing extraction (apply_step) ***")
-        logger.debug(f' ----- step_name = {step_name}')
-        logger.debug(f' ----- len(step_sequence_def) = {len(step_sequence_def)}')
-        logger.debug(f' ----- last sequence def = {sequence_def}')
-        logger.debug(f' ----- last sequence label = {sequence.label}')
+        logger.error(f" [{config.process_ref}] *** Error while processing extraction (apply_step) ***")
+        logger.debug(f' [{config.process_ref}] ----- step_name = {step_name}')
+        logger.debug(f' [{config.process_ref}] ----- len(step_sequence_def) = {len(step_sequence_def)}')
+        logger.debug(f' [{config.process_ref}] ----- last sequence def = {sequence_def}')
+        logger.debug(f' [{config.process_ref}] ----- last sequence label = {sequence.label}')
 
 
 #==============================================================================
@@ -262,14 +260,14 @@ def apply(config, graph):
 
     try:
         # -- Loading Extraction Scheme
-        logger.info(f"-- Loading Extraction Scheme ({config.extraction_scheme})")
+        logger.info(f" [{config.process_ref}] -- Loading Extraction Scheme ({config.extraction_scheme})")
         rule_dir, prefix_list, scheme = load_cts(config)
-        logger.debug("----- Step number: {0}".format(len(scheme)))
+        logger.debug(f" [{config.process_ref}] ----- Step number: {len(scheme)}")
 
         # -- Loading Extraction Rules
-        logger.info("-- Loading Extraction Rules ({0}*)".format(rule_dir))
+        logger.info(f" [{config.process_ref}] -- Loading Extraction Rules ({rule_dir}*)")
         rule_set = load_rule_set(config, rule_dir, prefix_list)
-        logger.debug("----- Total rule number: {0}".format(len(rule_set)))
+        logger.debug(f" [{config.process_ref}] ----- Total rule number: {len(rule_set)}")
 
         # -- Apply each step of the scheme
         new_triple_list = []
@@ -283,13 +281,13 @@ def apply(config, graph):
         if config.technical_dir_path is not None:
             os.makedirs(config.sentence_output_dir, exist_ok=True)
             factoid_file = config.output_file.replace('.ttl', '_factoid.ttl')
-            logger.debug("--- Serializing graph to factoid file (" + factoid_file + ")")
+            logger.debug(f" [{config.process_ref}] --- Serializing graph to factoid file (" + factoid_file + ")")
             factoid_graph = Graph()
             for new_triple in new_triple_list:
                 factoid_graph.add(new_triple)
-            logger.debug("----- Number of factoids: " + str(len(new_triple_list)))
+            logger.debug(f" [{config.process_ref}] ----- Number of factoids: " + str(len(new_triple_list)))
             base_ref = f'http://{config.uuid_str}/factoid'
-            logger.debug("----- Graph base: " + base_ref)
+            logger.debug(f" [{config.process_ref}] ----- Graph base: " + base_ref)
             factoid_graph.serialize(destination=factoid_file, base=base_ref, format='turtle')
 
@@ -297,8 +295,8 @@ def apply(config, graph):
         return graph, new_triple_list
 
     except:
-        logger.error(' *** Error while processing extraction (apply) ***')
-        logger.debug(f' ----- config.extraction_scheme = {config.extraction_scheme}')
-        logger.debug(f' ----- rule_dir = {rule_dir}')
-        logger.debug(f' ----- scheme = {scheme}')
-        logger.debug(f' ----- step = {step_name}, {step_sequence_def}')
\ No newline at end of file
+        logger.error(f' [{config.process_ref}] *** Error while processing extraction (apply) ***')
+        logger.debug(f' [{config.process_ref}] ----- config.extraction_scheme = {config.extraction_scheme}')
+        logger.debug(f' [{config.process_ref}] ----- rule_dir = {rule_dir}')
+        logger.debug(f' [{config.process_ref}] ----- scheme = {scheme}')
+        logger.debug(f' [{config.process_ref}] ----- step = {step_name}, {step_sequence_def}')
\ No newline at end of file
diff --git a/tenet/extraction/structure.py b/tenet/extraction/structure.py
index d50c86370c976691b94f88deeaccc887cf7111e8..c809eb29a151eaa8de56315a25916aea748a3fff 100644
--- a/tenet/extraction/structure.py
+++ b/tenet/extraction/structure.py
@@ -31,47 +31,46 @@ logger = logging.getLogger(__name__)
 #==============================================================================
 
 def load_config(config, work_graph):
-    logger.debug("----- Configuration Loading")
+    logger.debug(f" [{config.process_ref}] ----- Configuration Loading")
     work_graph.parse(config.schema_file)
-    logger.debug("-------- RDF Schema ({0})".format(len(work_graph)))
+    logger.debug(f" [{config.process_ref}] -------- RDF Schema ({len(work_graph)})")
     work_graph.parse(config.semantic_net_file)
-    logger.debug("-------- Semantic Net Definition ({0})".format(len(work_graph)))
+    logger.debug(f" [{config.process_ref}] -------- Semantic Net Definition ({len(work_graph)})")
     work_graph.parse(config.config_param_file)
-    logger.debug("-------- Config Parameter Definition ({0})".format(len(work_graph)))
+    logger.debug(f" [{config.process_ref}] -------- Config Parameter Definition ({len(work_graph)})")
 
 
 def load_frame(config, work_graph):
-    logger.debug("----- Frame Ontology Loading")
+    logger.debug(f" [{config.process_ref}] ----- Frame Ontology Loading")
     if config.is_base_ontology_produced_as_output():
         work_graph.parse(config.base_ontology_file)
-        logger.debug("-------- Base Ontology produced as output ({0})".format(len(work_graph)))
+        logger.debug(f" [{config.process_ref}] -------- Base Ontology produced as output ({len(work_graph)})")
     else:
         work_graph.parse(config.frame_ontology_file)
-        logger.debug("-------- System Frame Ontology (" + str(len(work_graph)) + ")")
+        logger.debug(f" [{config.process_ref}] -------- System Frame Ontology ({len(work_graph)})")
         work_graph.parse(config.frame_ontology_seed_file)
-        logger.debug("-------- System Frame Ontology Seed (" + str(len(work_graph)) + ")")
+        logger.debug(f" [{config.process_ref}] -------- System Frame Ontology Seed ({len(work_graph)})")
 
 
 def load_sentence(config, work_graph, sentence_file):
-    logger.debug("----- Sentence Loading")
+    logger.debug(f" [{config.process_ref}] ----- Sentence Loading")
     work_graph.parse(sentence_file)
-    logger.debug("-------- {0} ({1})".format(sentence_file, len(work_graph)))
+    logger.debug(f" [{config.process_ref}] -------- {sentence_file} ({len(work_graph)})")
 
 
 def load_all_sentences(config, work_graph):
-    logger.debug("----- Sentences Loading")
+    logger.debug(f" [{config.process_ref}] ----- Sentences Loading")
     sentence_count = 0
     for file_ref in glob.glob(config.source_sentence_file, recursive = True):
         sentence_count += 1
         work_graph.parse(file_ref)
-        # logger.debug("----- " + file_ref + " (" + str(len(work_graph)) + ")")
-    logger.debug("-------- Loaded sentence number: " + str(sentence_count))
+    logger.debug(f" [{config.process_ref}] -------- Loaded sentence number: {sentence_count}")
 
 
 #==============================================================================
@@ -159,16 +158,16 @@ def prepare_work_graph(config, sentence_file):
     """ create working structure as RDF graph"""
 
     try:
-        logger.info("-- Work Structure Preparation")
+        logger.info(f" [{config.process_ref}] -- Work Structure Preparation")
 
         # -- Graph Initialization
-        logger.debug("--- Graph Initialization")
+        logger.debug(f" [{config.process_ref}] --- Graph Initialization")
         work_graph = Graph()
         load_config(config, work_graph)
         load_frame(config, work_graph)
 
         # -- Source Data Import
-        logger.debug("--- Source Data Import")
+        logger.debug(f" [{config.process_ref}] --- Source Data Import")
         if config.process_level == 'sentence':
             load_sentence(config, work_graph, sentence_file)
         else: # process_level == 'document'
@@ -177,21 +176,21 @@ def prepare_work_graph(config, sentence_file):
         # -- Result
         if config.technical_dir_path is not None:
             os.makedirs(config.sentence_output_dir, exist_ok=True)
-            logger.debug('--- Export work graph as turtle')
+            logger.debug(f' [{config.process_ref}] --- Export work graph as turtle')
             export_file = config.output_file
-            logger.debug('----- Work graph file: {0} '.format(export_file))
+            logger.debug(f' [{config.process_ref}] ----- Work graph file: {export_file} ')
             export_result(config, work_graph, export_file)
             work_graph = finalize_export_file(config, export_file)
 
         # -- Information logging about structure
         graphId, graphSentence = obtain_graph_reference(work_graph)
-        logger.info(f"----- Sentence (id): {graphId}")
-        logger.info(f"----- Sentence (text): {graphSentence}")
+        logger.info(f" [{config.process_ref}] ----- Sentence (id): {graphId}")
+        logger.info(f" [{config.process_ref}] ----- Sentence (text): {graphSentence}")
 
         return work_graph
 
     except:
-        logger.error("!!! An exception occurred !!!")
+        logger.error(f" [{config.process_ref}] !!! An exception occurred !!!")
 
 
 def prepare_sentence_work(config, sentence_file):
diff --git a/tenet/main.py b/tenet/main.py
index 9fcc43b4c5710578ac6999b00b37d30f4c1d7a99..3d6c877b678080846796b9c988c7504281dc6bb3 100644
--- a/tenet/main.py
+++ b/tenet/main.py
@@ -111,13 +111,19 @@ def __serialize_factoid_graph(config, factoid_graph, out_file_path=None):
 
 def run_extraction(arg_dict):
     process_config = config.Config(arg_dict)
-    sentence_indice = arg_dict['sentence_list_indice']
     sentence_file = arg_dict['sentence_file']
-    logger.info(f' *** sentence {sentence_indice} *** ')
-    process_config.sentence_output_dir = f'-{sentence_indice}'
-    new_triple_list = __apply_extraction(process_config, sentence_file)
-    return(new_triple_list)
+    logger.info(f'\n [{process_config.process_ref}] *** extraction from sentence {process_config.sentence_id} *** ')
+    process_config.sentence_output_dir = f'-{process_config.sentence_id}'
+    try:
+        new_triple_list = __apply_extraction(process_config, sentence_file)
+        logger.info(f' [{process_config.process_ref}] Success ({len(new_triple_list)} extracted triple(s))')
+        return(new_triple_list)
+
+    except:
+        logger.info(f' [{process_config.process_ref}] Failure')
+        return []
+
 
 
 #==============================================================================
 # AMR Main Methods (to create an ontology)
@@ -249,7 +255,7 @@ def create_ontology_from_amrld_dir(amrld_dir_path,
 
     # ----- Computing Extraction Argument
     mapIterable = []
    for i in range(len(sentence_file_list)):
-        config_dict['sentence_list_indice'] = i
+        config_dict['sentence_id'] = i
        config_dict['sentence_file'] = sentence_file_list[i]
         mapIterable = mapIterable + [config_dict.copy()]
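Illustrative sketch (not part of the patch): the snippet below shows, under stated assumptions, how the new 'sentence_id' entry added to each per-sentence argument dict becomes a 'P-<n>' process reference that prefixes log lines, mirroring the Config and run_extraction changes above. MiniConfig, run_extraction_sketch and the sentence file names are hypothetical stand-ins for the real tenet objects, not the project's API.

import logging

logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)


class MiniConfig:
    """Hypothetical stand-in for tenet.extraction.config.Config."""

    def __init__(self, config_dict):
        # Mirrors the patched Config.__init__: sentence_id defaults to 0,
        # and process_ref is derived from it.
        self.sentence_id = config_dict['sentence_id'] if 'sentence_id' in config_dict else 0
        self.process_ref = f'P-{self.sentence_id}'
        self.sentence_file = config_dict['sentence_file']


def run_extraction_sketch(arg_dict):
    """Hypothetical stand-in for tenet.main.run_extraction."""
    cfg = MiniConfig(arg_dict)
    logger.info(f' [{cfg.process_ref}] *** extraction from sentence {cfg.sentence_id} *** ')
    try:
        # The real code would call __apply_extraction(cfg, cfg.sentence_file) here.
        new_triple_list = []
        logger.info(f' [{cfg.process_ref}] Success ({len(new_triple_list)} extracted triple(s))')
        return new_triple_list
    except Exception:
        logger.info(f' [{cfg.process_ref}] Failure')
        return []


if __name__ == '__main__':
    sentence_file_list = ['sentence_0.ttl', 'sentence_1.ttl']  # hypothetical inputs
    map_iterable = [{'sentence_id': i, 'sentence_file': f}
                    for i, f in enumerate(sentence_file_list)]
    for arg_dict in map_iterable:
        run_extraction_sketch(arg_dict)

With one extraction per sentence, the 'P-<n>' prefix makes interleaved log lines attributable to their sentence, which is presumably the point of feeding mapIterable (with its per-sentence config dicts) to a map-style dispatcher in create_ontology_from_amrld_dir.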