#!/usr/bin/python3.10
# -*- coding: utf-8 -*-

#==============================================================================
# TENET: extract
#------------------------------------------------------------------------------
# Command to run the main extraction process
#==============================================================================

#==============================================================================
# Importing required modules
#==============================================================================

import argparse
import glob
import logging.config
import os

from rdflib import Graph

from lib import config, structure
from lib import shacl_extraction, tenet_extraction

#==============================================================================
# Parameters
#==============================================================================

# Logging
logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
logger = logging.getLogger('root')

# Configuration
CONFIG_FILE = "config.xml"

# Default values
DEFAULT_SOURCE_TYPE = 'amr'
DEFAULT_SOURCE_CORPUS = "samples/s1/"  # name of the directory, with trailing slash
DEFAULT_TARGET_ID = 'DefaultTargetId'
DEFAULT_ENGINE = 'tenet'

#==============================================================================
# Utilities
#==============================================================================

def control_arguments():
    """Parse and return the command-line arguments."""
    arg_parser = argparse.ArgumentParser(
        description=("TENET - Tool for Extraction using Net Extension "
                     "by (semantic) Transduction"))
    arg_parser.add_argument("--source_type", nargs='?',
                            default=DEFAULT_SOURCE_TYPE,
                            help="source_type: amr or unl")
    arg_parser.add_argument("--source_corpus",
                            default=DEFAULT_SOURCE_CORPUS,
                            help="source_corpus: name of the source corpus directory, with trailing slash")
    arg_parser.add_argument("--target_id",
                            default=DEFAULT_TARGET_ID,
                            help="target_id: id for the target ontology")
    arg_parser.add_argument("--engine",
                            default=DEFAULT_ENGINE,
                            help="engine: shacl, tenet or new")
    args = arg_parser.parse_args()
    return args

#==============================================================================
# Steps
#==============================================================================

def set_config(args):
    """Build the process configuration from the command-line arguments."""
    logger.info("-- Process Setting")
    logger.info("----- Corpus source: {0} ({1})".format(args.source_corpus, args.source_type))
    logger.info("----- Ontology target (id): {0}".format(args.target_id))
    logger.debug("----- Current path: {0}".format(os.getcwd()))
    logger.debug("----- Config file: {0}".format(CONFIG_FILE))
    process_config = config.Config(CONFIG_FILE,
                                   args.target_id,
                                   args.source_corpus
                                   # , target_ontology
                                   )
    process_config.source_type = args.source_type
    # config.output_ontology_namespace = target_ontology_namespace
    process_config.engine = args.engine
    logger.debug(process_config.get_full_config())
    return process_config


def init_process(config):
    """Create the output directory and count the source sentence graphs."""
    logger.info("-- Creating output target directory: " + config.output_dir)
    os.makedirs(config.output_dir, exist_ok=True)
    logger.debug("-- Counting number of graph files (sentences)")
    sentence_count = len(glob.glob(config.source_sentence_file, recursive=True))
    logger.debug("----- Graph count: {0}".format(sentence_count))


def run_shacl_extraction(config):
    """Run the extraction process with the SHACL engine (document level)."""
    logger.debug("-- Process level: document")
    work_graph = structure.prepare_work_graph_at_document_level(config)
    shacl_extraction.apply(config, work_graph)


def run_tenet_extraction(config):
    """Run the extraction process with the TENET engine."""
    if config.process_level == 'sentence':
        logger.debug("-- Process level: sentence")
        sentence_dir = config.source_sentence_file
        sentence_count = 0
        result_triple_list = []
        for sentence_file in glob.glob(sentence_dir, recursive=True):
            sentence_count += 1
            # Per-sentence output directory, suffixed with the sentence number
            config.sentence_output_dir = '-' + str(sentence_count)
            logger.info(" *** sentence {0} *** ".format(sentence_count))
            os.makedirs(config.sentence_output_dir, exist_ok=True)
            work_graph = structure.prepare_sentence_work(config, sentence_file)
            # New extraction engine running
            _, new_triple_list = tenet_extraction.apply(config, work_graph)
            result_triple_list.extend(new_triple_list)

        logger.info(' === Final Ontology Generation === ')
        config.sentence_output_dir = ''
        logger.info("-- Making complete factoid graph by merging sentence factoid graphs")
        factoid_graph = Graph()
        for new_triple in result_triple_list:
            factoid_graph.add(new_triple)
        logger.info("----- Total factoid number: " + str(len(result_triple_list)))
        uuid_str = config.uuid_str
        base_ref = "http://" + uuid_str + '/' + 'factoid'
        logger.info("----- Graph base: {0}".format(base_ref))
        factoid_file = config.output_file.replace('.ttl', '_factoid.ttl')
        logger.info("-- Serializing graph to factoid file ({0})".format(factoid_file))
        factoid_graph.serialize(destination=factoid_file, base=base_ref, format='turtle')

    else:  # config.process_level == 'document'
        logger.debug("-- Process level: document")
        work_graph = structure.prepare_document_work(config)
        # Document-level processing delegates to the SHACL extraction engine
        shacl_extraction.apply(config, work_graph)

#==============================================================================
# Main processing
#==============================================================================

def run(args):
    logger.info('[TENET] Extraction Processing')

    # -- Process Initialization
    logger.info(' === Process Initialization === ')
    config = set_config(args)
    init_process(config)

    # -- Extraction Processing (SHACL or TENET engine)
    if config.engine == "shacl":
        logger.info(' === Extraction Processing using SHACL Engine === ')
        run_shacl_extraction(config)
    else:  # config.engine == "tenet"
        logger.info(' === Extraction Processing using New TENET Engine === ')
        run_tenet_extraction(config)

    logger.info(' === Done === ')


if __name__ == '__main__':
    args = control_arguments()
    run(args)
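
#==============================================================================
# Usage example (a sketch, assuming this script is saved as extract.py;
# the corpus path is the default sample directory, and "MyOntology" is an
# illustrative target id, not a name defined by the tool):
#
#   python3 extract.py --source_type amr \
#                      --source_corpus samples/s1/ \
#                      --target_id MyOntology \
#                      --engine tenet
#
# Running with no arguments uses the defaults declared above (amr source,
# samples/s1/ corpus, tenet engine).
#==============================================================================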