#!/usr/bin/python3.10
# -*-coding:Utf-8 -*
#==============================================================================
# AMR Batch: main
#------------------------------------------------------------------------------
# Module providing the main method(s) of the amrBatch library.
#==============================================================================
import sys, os, glob
import shutil, re
import subprocess
import amrlib
from rdflib import Graph
import traceback
import logging.config
from amrlib.graph_processing.amr_plot import AMRPlot
from filepath_manager import FilepathManager
# -- Config File Path
LIB_PATH = os.path.dirname(os.path.abspath(__file__)) + '/'
LOGGING_CONF_FILE_PATH = f'{LIB_PATH}logging.conf'
CONFIG_FILE_PATH = f'{LIB_PATH}config.xml'
# AMRLD Parameters
AMRLD_DIR = f'{LIB_PATH}amrld/'
AMRLD_WORKDIR = f'{AMRLD_DIR}wk/'
# -- Logging
logging.config.fileConfig(LOGGING_CONF_FILE_PATH, disable_existing_loggers=True)
logger = logging.getLogger('root')
#==============================================================================
# Functions to manage in-process data
#==============================================================================
def is_valid_sentence(sentence):
    """ True if the sentence is valid (neither empty nor a language mark). """
    is_empty = (sentence == "") or (sentence == "\n")
    lang_mark_re = re.compile(r"\([a-z]+\)(.)*")
    is_language_mark = lang_mark_re.match(sentence) is not None
    return not (is_empty or is_language_mark)
def clean_sentence(sentence):
    """ Sentence cleanup as needed (drop trailing period(s) and newline). """
    sentence = re.sub(r"(\.)*\n", "", sentence)
    return sentence
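# Illustrative behavior of the two helpers above (derived from their
# implementation, given here only as documentation):
#   is_valid_sentence("(en) Some text")              -> False (language mark line)
#   is_valid_sentence("The system shall log events") -> True
#   clean_sentence("The system shall log events.\n") -> "The system shall log events"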
def define_new_data(base_ref, number, sentence):
    """ Build the work data dict (reference, output directory, id line,
        cleaned sentence, empty graph) for one sentence. """
    number_str = str(number).rjust(2, "0")
    data_ref = base_ref + "-" + number_str
    output_data_dir = data_ref + "/"
    id_line_str = "# ::id " + data_ref + "\n"
    sentence = clean_sentence(sentence)
    sentence += "."
    new_data = {
        "base_ref": data_ref,
        "number": number,
        "output_data_dir": output_data_dir,
        "data_ref": data_ref,
        "id_line_str": id_line_str,
        "sentence": sentence,
        "graph": ""
    }
    return new_data
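# Illustrative example (hypothetical values): define_new_data("doc", 3, "The cat sits.\n")
# returns a dict with data_ref "doc-03", output_data_dir "doc-03/",
# id_line_str "# ::id doc-03\n" and sentence "The cat sits."; the "graph"
# entry stays empty until the conversion step fills it in.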
def get_amr_graph_list(workdata_list):
    amr_graph_list = []
    for workdata in workdata_list:
        amr_graph_list.append(workdata['graph'])
    return amr_graph_list
#==============================================================================
# Preparation Steps
#==============================================================================
def __prepare_workdata(filepath_manager):
    """ Read the input file and build one work data dict per valid sentence.
        Lines are split on '. '; empty lines and language mark lines such as
        '(en) ...' are skipped. """
    logger.info('-- Reading input files to recover a list of sentences')
    input_filepath = filepath_manager.input_filepath
    base_reference = filepath_manager.base_reference
    workdata_list = list()
    sentence_number = 0
    with open(input_filepath, "r") as reading_file:  # r = read
        for line in reading_file.readlines():
            sentences = line.split(". ")
            for sentence in sentences:
                if is_valid_sentence(sentence):
                    sentence_number += 1
                    new_data = define_new_data(base_reference, sentence_number, sentence)
                    workdata_list.append(new_data)
                    logger.debug(f' *** sentence {sentence_number} *** \n{new_data}')
    logger.info(f'----- number of sentences: {len(workdata_list)}')
    return workdata_list
def __build_output_dir_tree(filepath_manager, workdata_list):
    base_output_dirpath = filepath_manager.output_dirpath
    logger.debug(f'-- Making output directory tree ({base_output_dirpath})')
    os.makedirs(base_output_dirpath, exist_ok=True)
    for data in workdata_list:
        output_dir = filepath_manager.get_workdata_output_dirpath(data["output_data_dir"])
        os.makedirs(output_dir, exist_ok=True)
def __generate_sentence_file(filepath_manager, workdata_list):
    sentence_filepath = filepath_manager.get_sentence_output_filepath()
    logger.debug("-- Generating sentence file")
    with open(sentence_filepath, "w") as writing_file:  # w = write
        first = True
        for workdata in workdata_list:
            if not first: writing_file.write("\n")
            writing_file.write(workdata["sentence"])
            first = False
#==============================================================================
# Conversion Steps
#==============================================================================
def __generate_penman_amr_graph(filepath_manager, data):
    """ AMR graph generation in PENMAN format """
    graph = data["graph"]
    output_filepath = filepath_manager.get_penman_amr_graph_output_filepath(data)
    logger.debug(f"----- AMR Graph file (penman): {os.path.basename(output_filepath)}")
    with open(output_filepath, "w") as writing_file:  # w = write
        writing_file.write(data["id_line_str"])
        writing_file.write(graph)
def __generate_dot_amr_graph(filepath_manager, data):
    """ AMR graph generation in dot and png formats """
    graph = data["graph"]
    try:
        # -- generating dot/png files using AMRLib and GraphViz
        dot_filename = filepath_manager.get_dot_amr_graph_output_filepath(data)
        render_format = 'png'
        logger.debug(f'----- AMR Graph file (dot): {os.path.basename(dot_filename)}')
        plot = AMRPlot(dot_filename, render_format)
        plot.build_from_graph(graph)
        plot.graph.render()
        render_fn = dot_filename + '.' + render_format  # -- renaming PNG file
        good_png_fn = filepath_manager.get_png_amr_graph_output_filepath(data)
        logger.debug(f'----- AMR Graph file (png): {os.path.basename(good_png_fn)}')
        os.rename(render_fn, good_png_fn)
    except Exception:
        logger.warning('Exception when trying to plot')
        traceback.print_exc()
def __convert_sentences_to_graphs(amr_model, workdata_list):
    """ Converting text sentences to AMR graphs """
    logger.info("-- Loading AMR model")
    stog = amrlib.load_stog_model(model_dir=amr_model)
    logger.info("-- Converting sentences to AMR graphs")
    wd_number = 0
    for data in workdata_list:
        wd_number += 1
        stog_result = stog.parse_sents([data["sentence"]])
        logger.info(f'----- Sentence {wd_number} successfully processed')
        logger.debug(stog_result)
        data["graph"] = stog_result[0]
    logger.info(f'----- Total processed graph number: {wd_number}')
    return workdata_list
def __generate_amr_graph_files(filepath_manager, workdata_list):
    logger.info("-- Generating AMR graph files")
    for data in workdata_list:
        __generate_penman_amr_graph(filepath_manager, data)
        __generate_dot_amr_graph(filepath_manager, data)
#==============================================================================
# Serialization Steps
#==============================================================================
def __serialize_amr_graph_to_rdf_triple(filepath_manager, data):
    """ Serialize AMR graph to AMR-RDF triples """
    # -- Filepaths
    input_file = filepath_manager.get_penman_amr_graph_output_filepath(data)
    input_amrld_file = filepath_manager.get_amr_graph_amrld_filepath(data)
    output_amrld_file = filepath_manager.get_amr_rdf_amrld_filepath(data)
    input_wk_file = filepath_manager.get_amr_graph_wk_filepath(data)
    output_wk_file = filepath_manager.get_amr_rdf_wk_filepath(data)
    amr_triple_file = filepath_manager.get_amr_rdf_triple_output_filepath(data)
    # -- AMR-LD processing
    amrld_process = ["python3", "amr_to_rdf.py",
                     "-i", input_wk_file,
                     "-o", output_wk_file]
    if os.path.isfile(input_file):
        logger.info("-- Serialize AMR graphs to RDF using amr-ld library")
        logger.debug(f'----- penman filepath: {input_file}')
        logger.debug(f'----- AMRLD filepath: {input_amrld_file}')
        shutil.copyfile(input_file, input_amrld_file)
        current_dirpath = os.getcwd()
        os.chdir(AMRLD_DIR)
        subprocess.run(amrld_process)
        os.chdir(current_dirpath)
    # -- Copy result
    if os.path.isfile(output_amrld_file):
        logger.info(f'-- Generating AMR RDF file (triple): {os.path.basename(amr_triple_file)}')
        shutil.copyfile(output_amrld_file, amr_triple_file)
def __convert_rdf_triple_to_rdf_turtle(filepath_manager, data):
    """ Converting AMR-RDF triples to AMR-RDF Turtle """
    # -- Filepaths
    amr_triple_file = filepath_manager.get_amr_rdf_triple_output_filepath(data)
    amr_turtle_file = filepath_manager.get_amr_rdf_turtle_output_filepath(data)
    # -- Conversion
    if os.path.isfile(amr_triple_file):
        logger.info(f'-- Generating AMR RDF file (turtle): {os.path.basename(amr_turtle_file)}')
        g = Graph()
        g.parse(amr_triple_file)
        g.serialize(destination=amr_turtle_file, format='turtle')
def __convert_amr_graphs_to_rdf(filepath_manager, data_list):
    """ Converting AMR graphs to AMR-RDF """
    for data in data_list:
        __serialize_amr_graph_to_rdf_triple(filepath_manager, data)
        __convert_rdf_triple_to_rdf_turtle(filepath_manager, data)
#==============================================================================
# Main Method(s)
#==============================================================================
def parse_sentences_from_file(input_filepath,
                              amr_model_path,
                              output_dirpath=None,
                              amrld_serialization=False):
    """
    Parse an input file containing natural language sentences and construct
    the corresponding AMR graphs (and their RDF serializations if required).
    The method returns the AMR graphs as strings in PENMAN format. The graphs
    are also serialized to RDF Turtle using the AMR-LD library when
    amrld_serialization is set to True.

    Parameters
    ----------
    input_filepath: path to a text file containing the sentences to parse.
    amr_model_path: path to the amrlib sentence-to-graph (stog) model directory.
    output_dirpath: directory path where the output data are written if defined
        (the function still returns the graph strings).
    amrld_serialization: if True, also serialize the AMR graphs to RDF
        (triple and Turtle output files) using the AMR-LD library.

    Returns
    -------
    List of AMR graph strings (in PENMAN format).
    """
    logger.info('[AMR Batch] NL Document Parsing')
    # -- Prepare the sentences to be converted
    logger.info('\n === Preparation === ')
    filepath_manager = FilepathManager(input_filepath, output_dirpath)
    logger.info(f'-- base reference: {filepath_manager.base_reference}')
    logger.info(f'-- input filepath: {filepath_manager.input_filepath}')
    logger.info(f'-- output dirpath: {filepath_manager.output_dirpath}')
    assert os.path.exists(input_filepath), f'input file does not exist ({input_filepath})'
    workdata_list = __prepare_workdata(filepath_manager)
    __build_output_dir_tree(filepath_manager, workdata_list)
    __generate_sentence_file(filepath_manager, workdata_list)
    # -- Convert sentences to graphs
    logger.info('\n === Text Conversion to AMR Graphs === ')
    logger.info('-- library: amrlib')
    logger.debug(f'   ({AMRLD_DIR})')
    logger.info(f'-- model: {os.path.basename(amr_model_path)}')
    logger.debug(f'   ({amr_model_path})')
    logger.debug(f'-- working directory: {AMRLD_WORKDIR}')
    workdata_list = __convert_sentences_to_graphs(amr_model_path, workdata_list)
    __generate_amr_graph_files(filepath_manager, workdata_list)
    if amrld_serialization:
        # -- Convert graphs to RDF
        logger.info('\n === AMR Graphs Serialization to AMR-RDF Representation === ')
        logger.info("-- library: amr-ld")
        __convert_amr_graphs_to_rdf(filepath_manager, workdata_list)
    return get_amr_graph_list(workdata_list)
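#==============================================================================
# Usage sketch (illustrative, not part of the library API): a minimal entry
# point showing how parse_sentences_from_file might be called. The input file
# and model paths below are hypothetical placeholders; adapt them to your setup.
#==============================================================================
if __name__ == '__main__':
    # Hypothetical example paths (assumptions, not shipped with this repository)
    example_input_filepath = 'input/sentences.txt'
    example_amr_model_path = 'models/model_stog'
    amr_graphs = parse_sentences_from_file(example_input_filepath,
                                           example_amr_model_path,
                                           output_dirpath='output/',
                                           amrld_serialization=False)
    for amr_graph in amr_graphs:
        print(amr_graph)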