#!/usr/bin/python3.10
# -*- coding: utf-8 -*-
#==============================================================================
# TENET: extract
#------------------------------------------------------------------------------
# Command to run the main extraction process
#==============================================================================
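#
# Example invocation (illustrative values; paths depend on the local
# corpus layout):
#   python3 extract.py --source_type amr --source_corpus samples/s1/ \
#                      --target_id MyOntology --engine tenet
#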
#==============================================================================
# Importing required modules
#==============================================================================
import argparse, os, glob
import logging.config
from lib import config, structure
from lib import shacl_extraction, tenet_extraction
from rdflib import Graph
#==============================================================================
# Parameters
#==============================================================================
# Logging
logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
logger = logging.getLogger('root')
# Configuration
CONFIG_FILE = "config.xml"
# Default values
DEFAULT_SOURCE_TYPE = 'amr'
DEFAULT_SOURCE_CORPUS = "samples/s1/" # name of the directory with slash
DEFAULT_TARGET_ID = 'DefaultTargetId'
DEFAULT_ENGINE = 'tenet'
#==============================================================================
# Utilities
#==============================================================================
def control_arguments():
    arg_parser = argparse.ArgumentParser(
        description=("TENET - Tool for Extraction using Net Extension "
                     "by (semantic) Transduction"))
    arg_parser.add_argument("--source_type", nargs='?',
                            default=DEFAULT_SOURCE_TYPE,
                            help="source_type: amr or unl")
    arg_parser.add_argument("--source_corpus",
                            default=DEFAULT_SOURCE_CORPUS,
                            help="source_corpus: name of the source corpus directory with slash")
    arg_parser.add_argument("--target_id",
                            default=DEFAULT_TARGET_ID,
                            help="target_id: id for the target ontology")
    arg_parser.add_argument("--engine",
                            default=DEFAULT_ENGINE,
                            help="engine: shacl, tenet or new")
    args = arg_parser.parse_args()
    return args
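# Running the script with no flags yields the defaults defined above, e.g.:
#   Namespace(source_type='amr', source_corpus='samples/s1/',
#             target_id='DefaultTargetId', engine='tenet')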
#==============================================================================
# Steps
#==============================================================================
def set_config(args):
    logger.info("-- Process Setting ")
    logger.info("----- Corpus source: {0} ({1})".format(args.source_corpus,
                                                        args.source_type))
    logger.info("----- Ontology target (id): {0}".format(args.target_id))
    logger.debug("----- Current path: {0}".format(os.getcwd()))
    logger.debug("----- Config file: {0}".format(CONFIG_FILE))
    process_config = config.Config(CONFIG_FILE,
                                   args.target_id,
                                   args.source_corpus  # , target_ontology
                                   )
    process_config.source_type = args.source_type
    # config.output_ontology_namespace = target_ontology_namespace
    process_config.engine = args.engine
    logger.debug(process_config.get_full_config())
    return process_config
def init_process(config):
    logger.info("-- Creating output target directory: " + config.output_dir)
    os.makedirs(config.output_dir, exist_ok=True)
    logger.debug("-- Counting number of graph files (sentences) ")
    sentence_count = len(glob.glob(config.source_sentence_file, recursive=True))
    logger.debug("----- Graph count: {0}".format(sentence_count))
def run_shacl_extraction(config):
    logger.debug("-- Process level: document")
    work_graph = structure.prepare_work_graph_at_document_level(config)
    shacl_extraction.apply(config, work_graph)
def run_tenet_extraction(config):
    if config.process_level == 'sentence':
        logger.debug("-- Process level: sentence")
        sentence_dir = config.source_sentence_file
        sentence_count = 0
        result_triple_list = []
        for sentence_file in glob.glob(sentence_dir, recursive=True):
            sentence_count += 1
            config.sentence_output_dir = '-' + str(sentence_count)
            logger.info(" *** sentence {0} *** ".format(sentence_count))
            os.makedirs(config.sentence_output_dir, exist_ok=True)
            work_graph = structure.prepare_sentence_work(config, sentence_file)
            # Run the extraction engine on the sentence work graph
            _, new_triple_list = tenet_extraction.apply(config, work_graph)
            result_triple_list.extend(new_triple_list)
        logger.info(' === Final Ontology Generation === ')
        config.sentence_output_dir = ''
        logger.info("-- Making complete factoid graph by merging sentence factoid graphs")
        factoid_graph = Graph()
        for new_triple in result_triple_list:
            factoid_graph.add(new_triple)
        logger.info("----- Total factoid number: " + str(len(result_triple_list)))
        uuid_str = config.uuid_str
        base_ref = "http://" + uuid_str + '/' + 'factoid'
        logger.info("----- Graph base: {0}".format(base_ref))
        factoid_file = config.output_file.replace('.ttl', '_factoid.ttl')
        logger.info("-- Serializing graph to factoid file ({0})".format(factoid_file))
        factoid_graph.serialize(destination=factoid_file,
                                base=base_ref,
                                format='turtle')
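        # For a quick sanity check, the serialized file can be reloaded
        # with rdflib, e.g.: Graph().parse(factoid_file, format='turtle')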
    else:  # config.process_level == 'document'
        logger.debug("-- Process level: document")
        work_graph = structure.prepare_document_work(config)
        tenet_extraction.apply(config, work_graph)
#==============================================================================
# Main processing
#==============================================================================
def run(args):
    logger.info('[TENET] Extraction Processing')

    # -- Process Initialization
    logger.info(' === Process Initialization === ')
    config = set_config(args)
    init_process(config)

    # -- Extraction Processing using the selected engine
    if config.engine == "shacl":
        logger.info(' === Extraction Processing using SHACL Engine === ')
        run_shacl_extraction(config)
    else:  # config.engine == "tenet" or "new"
        logger.info(' === Extraction Processing using New TENET Engine === ')
        run_tenet_extraction(config)

    logger.info(' === Done === ')
if __name__ == '__main__':
    args = control_arguments()
    run(args)