Skip to content
Snippets Groups Projects
Select Git revision
  • 42db5151e1a5134487ceb0ddefecbf44a53db6dd
  • main default protected
  • multiprocessing
3 results

main.py

Blame
  • main.py 11.88 KiB
    #!/usr/bin/python3.10
    # -*-coding:Utf-8 -*
    
    #==============================================================================
    # AMR Batch: main
    #------------------------------------------------------------------------------
    # Module providing the main method(s) of the amrBatch library.
    #==============================================================================
    
    import sys, os, glob
    import shutil, re
    import subprocess
    import amrlib
    from rdflib import Graph
    import traceback
    import logging.config
    
    from amrlib.graph_processing.amr_plot import AMRPlot
    from filepath_manager import FilepathManager
    
    # -- Config File Path
    # Directory containing this module, with a trailing '/' so that file names
    # can be appended directly in the f-strings below.
    LIB_PATH = os.path.dirname(os.path.abspath(__file__)) + '/'
    LOGGING_CONF_FILE_PATH = f'{LIB_PATH}logging.conf'  # read by fileConfig() below
    CONFIG_FILE_PATH = f'{LIB_PATH}config.xml'
    
    # AMRLD Parameters
    # AMRLD_DIR is the directory in which the amr_to_rdf.py subprocess is run
    # (see the serialization step); AMRLD_WORKDIR is its 'wk/' sub-directory.
    AMRLD_DIR = f'{LIB_PATH}amrld/'
    AMRLD_WORKDIR = f'{AMRLD_DIR}wk/'
    
    # -- Logging
    # Logging is configured as an import-time side effect of this module.
    # disable_existing_loggers=True disables loggers created before this call
    # unless they (or an ancestor) are named in the configuration file.
    logging.config.fileConfig(LOGGING_CONF_FILE_PATH, disable_existing_loggers=True)
    logger = logging.getLogger('root')  # module-wide logger used by the functions below
    
    
    
    #==============================================================================
    # Functions to manage in-process data
    #==============================================================================
    
    def is_valid_sentence(sentence):
        """ True if the sentence should be processed.

        A sentence is rejected when it is empty or made only of whitespace
        (for instance a blank line), or when it starts with a language mark,
        i.e. lowercase letters between parentheses such as "(en)".

        Parameters
        ----------
        sentence: the candidate sentence (str), possibly ending with a line break.

        Returns
        -------
        bool: True if the sentence is valid, False otherwise.
        """
        # Whitespace-only fragments carry no text to parse
        if not sentence.strip():
            return False
        # Raw string: "\(" in a plain literal is an invalid escape sequence.
        # re.match anchors at the start of the string, so nothing needs to be
        # matched after the closing parenthesis.
        return re.match(r"\([a-z]+\)", sentence) is None
    
    
    def clean_sentence(sentence):
        """ Sentence cleanup as needed.

        Removes every line break together with the periods that directly
        precede it, and also removes periods left at the very end of the
        string (last line of a file without a final line break). The caller
        can therefore append exactly one final period.

        Parameters
        ----------
        sentence: the raw sentence (str).

        Returns
        -------
        str: the cleaned sentence.
        """
        # Raw string: "\." in a plain literal is an invalid escape sequence.
        return re.sub(r"\.*\n|\.+\Z", "", sentence)
    
    
    def define_new_data(base_ref, number, sentence):
        """ Build the work-data record associated with one sentence.

        Parameters
        ----------
        base_ref: reference prefix (str) shared by all the sentences of a document.
        number: sentence number, left-padded with zeros to two characters.
        sentence: raw sentence; it is cleaned and terminated by a period.

        Returns
        -------
        dict: the record, whose "graph" entry is initialised to an empty string.
        """
        data_ref = "-".join((base_ref, str(number).rjust(2, "0")))
        
        return {
            # NOTE: "base_ref" holds the same value as "data_ref"
            "base_ref" : data_ref,
            "number" : number,
            "output_data_dir" : data_ref + "/",
            "data_ref" : data_ref,
            "id_line_str" : "# ::id " + data_ref + "\n",
            "sentence" : clean_sentence(sentence) + ".",
            "graph" : "",
        }
    
        
    def get_amr_graph_list(workdata_list):
        """ Return the "graph" entry of every work-data record, in order. """
        return [record['graph'] for record in workdata_list]
            
       
    #==============================================================================
    # Preparation Steps
    #==============================================================================
    
    def __prepare_workdata(filepath_manager):
        """ Read the input file and build one work-data record per valid sentence.

        Each line is split on ". "; fragments rejected by is_valid_sentence are
        skipped, the others are numbered from 1 in reading order.

        Returns
        -------
        list: the work-data records (see define_new_data).
        """
        logger.info('-- Reading input files to recover a list of sentences')
        source_path = filepath_manager.input_filepath
        reference = filepath_manager.base_reference
        
        records = []
        with open(source_path, "r") as source_file:
            for text_line in source_file:
                for fragment in text_line.split(". "):
                    if not is_valid_sentence(fragment):
                        continue
                    # Numbering follows the count of records kept so far
                    position = len(records) + 1
                    record = define_new_data(reference, position, fragment)
                    records.append(record)
                    logger.debug(f' *** sentence {position} *** \n{record}')
        
        logger.info(f'----- number of sentences: {len(records)}')
        return records
        
    
    def __build_output_dir_tree(filepath_manager, workdata_list):
        """ Create the base output directory and one sub-directory per record.

        Directories that already exist are left untouched.
        """
        root_dirpath = filepath_manager.output_dirpath
        logger.debug(f'-- Making output directory tree ({root_dirpath})')
        os.makedirs(root_dirpath, exist_ok=True)
        for record in workdata_list:
            record_dirname = record["output_data_dir"]
            os.makedirs(filepath_manager.get_workdata_output_dirpath(record_dirname),
                        exist_ok=True)
           
    
    def __generate_sentence_file(filepath_manager, workdata_list):
        """ Write all the sentences to the sentence output file.

        One sentence per line, without a line break after the last one.
        """
        target_path = filepath_manager.get_sentence_output_filepath()
        logger.debug("-- Generating sentence file ")
        with open(target_path, "w") as sentence_file:
            sentence_file.write("\n".join(record["sentence"] for record in workdata_list))
        
    
    
       
    #==============================================================================
    # Conversion Steps
    #==============================================================================
    
    def __generate_penman_amr_graph(filepath_manager, data):
        """ Write the AMR graph of a record to its penman output file.

        The file starts with the record id line, followed by the graph.
        """
        penman_text = data["graph"]
        target_path = filepath_manager.get_penman_amr_graph_output_filepath(data)
        logger.debug(f"----- AMR Graph file (penman): {os.path.basename(target_path)}")
        with open(target_path, "w") as penman_file:
            penman_file.writelines((data["id_line_str"], penman_text))
    
    
    
    def __generate_dot_amr_graph(filepath_manager, data):
        """ AMR graph generation in dot and png format.

        Best-effort step: any error raised while plotting is logged as a
        warning (with its traceback) and the function returns normally, so a
        plotting failure does not stop the batch.

        Parameters
        ----------
        filepath_manager: object providing the dot and png output file paths.
        data: work-data record whose "graph" entry holds the AMR graph.
        """
        
        graph = data["graph"]
        image_format = 'png'  # renamed from 'format' to avoid shadowing the builtin
        
        try:
            # -- generating dot/png files using AMRLib and GraphViz
            dot_filename = filepath_manager.get_dot_amr_graph_output_filepath(data)
            logger.debug(f'----- AMR Graph file (dot): {os.path.basename(dot_filename)}')
            plot = AMRPlot(dot_filename, image_format)
            plot.build_from_graph(graph)
            plot.graph.render()
            
            # -- renaming PNG file: the rendered image is expected at
            #    "<dot file>.<format>" and is moved to the wanted png path
            render_fn = dot_filename + '.' + image_format
            good_png_fn = filepath_manager.get_png_amr_graph_output_filepath(data)
            # Single braces: the doubled braces logged the expression text itself
            logger.debug(f'----- AMR Graph file (png): {os.path.basename(good_png_fn)}')
            os.rename(render_fn, good_png_fn)
            
        except Exception:
            # 'except Exception' lets KeyboardInterrupt / SystemExit propagate;
            # exc_info routes the traceback through the configured log handlers.
            logger.warning('Exception when trying to plot', exc_info=True)
            
        
    
    
    def __convert_sentences_to_graphs(amr_model, workdata_list):
        """ Parse the sentence of every record and store the resulting AMR graph.

        The model is loaded once, then the sentences are parsed one at a time;
        the first parsing result is stored in the "graph" entry of the record.

        Returns
        -------
        list: the same workdata_list, updated in place.
        """
        
        logger.info("-- Loading AMR model")
        parser = amrlib.load_stog_model(model_dir=amr_model)
        
        logger.info("-- Converting sentences to AMR graphs")
        for position, record in enumerate(workdata_list, start=1):
            parsed = parser.parse_sents([record["sentence"]])
            logger.info(f'----- Sentence {position} successfully processed')
            logger.debug(parsed)
            record["graph"] = parsed[0]
        
        logger.info(f'----- Total processed graph number: {len(workdata_list)}')
        return workdata_list
        
    
            
    def __generate_amr_graph_files(filepath_manager, workdata_list):
        """ Write the penman file, then the dot/png files, of every record. """
        logger.info("-- Generating AMR graph files")
        file_writers = (__generate_penman_amr_graph, __generate_dot_amr_graph)
        for record in workdata_list:
            for write_files in file_writers:
                write_files(filepath_manager, record)
       
        
       
    #==============================================================================
    # Serialization Steps
    #==============================================================================
    
    def __serialize_amr_graph_to_rdf_triple(filepath_manager, data):
        """ Serialize AMR graph to AMR-RDF triple.

        When the penman file of the record exists, it is copied to the AMR-LD
        input location and amr_to_rdf.py is run from the AMR-LD directory.
        When the AMR-LD output file exists afterwards, it is copied to the
        triple output path of the record.

        Parameters
        ----------
        filepath_manager: object providing all the file paths used here.
        data: work-data record to serialize.
        """
        
        # -- Filepath
        input_file = filepath_manager.get_penman_amr_graph_output_filepath(data)
        input_amrld_file = filepath_manager.get_amr_graph_amrld_filepath(data)
        output_amrld_file = filepath_manager.get_amr_rdf_amrld_filepath(data)
        input_wk_file = filepath_manager.get_amr_graph_wk_filepath(data)
        output_wk_file = filepath_manager.get_amr_rdf_wk_filepath(data)
        amr_triple_file = filepath_manager.get_amr_rdf_triple_output_filepath(data)
        
        # -- AMR-LD processing
        amrld_process = ["python3", "amr_to_rdf.py", 
                         "-i", input_wk_file, 
                         "-o", output_wk_file]
        if os.path.isfile(input_file):
            logger.info("-- Serialize AMR graphs to RDF using amr-ld library")
            logger.debug(f'----- penman filepath: {input_file}')
            logger.debug(f'----- AMRLD filepath: {input_amrld_file}')
            shutil.copyfile(input_file, input_amrld_file)
            # Run the child in AMRLD_DIR through 'cwd' instead of os.chdir():
            # the working directory of this process is never modified, so it
            # cannot be left changed when the subprocess call raises.
            completed = subprocess.run(amrld_process, cwd=AMRLD_DIR)
            if completed.returncode != 0:
                # Best effort: report the failure, the copy step below is
                # still attempted if an output file is present.
                logger.warning(f'----- amr_to_rdf.py exited with return code {completed.returncode}')
        
        # -- Copy result
        if os.path.isfile(output_amrld_file):
            logger.info(f'-- Generating AMR RDF file (triple): {os.path.basename(amr_triple_file)}')
            shutil.copyfile(output_amrld_file, amr_triple_file)
            
            
    def __convert_rdf_triple_to_rdf_turtle(filepath_manager, data):
        """ Convert the AMR-RDF triple file of a record to a turtle file.

        Nothing is done when the triple file does not exist.
        """
        
        # -- Filepath
        triple_path = filepath_manager.get_amr_rdf_triple_output_filepath(data)
        turtle_path = filepath_manager.get_amr_rdf_turtle_output_filepath(data)
        
        # -- Conversion (skipped without a triple file)
        if not os.path.isfile(triple_path):
            return
        
        logger.info(f'-- Generating AMR RDF file (turtle): {os.path.basename(turtle_path)}')
        rdf_graph = Graph()
        rdf_graph.parse(triple_path)
        rdf_graph.serialize(destination=turtle_path, format='turtle')
    
    
    
    def __convert_amr_graphs_to_rdf(filepath_manager, data_list):
        """ Serialize every record to AMR-RDF triple, then convert it to turtle. """
        
        rdf_steps = (__serialize_amr_graph_to_rdf_triple,
                     __convert_rdf_triple_to_rdf_turtle)
        for record in data_list:
            for run_step in rdf_steps:
                run_step(filepath_manager, record)
            
            
    
    #==============================================================================
    # Main Method(s)
    #==============================================================================
    
    def parse_sentences_from_file(input_filepath, 
                                  amr_model_path, 
                                  output_dirpath=None,
                                  amrld_serialization=False):
        """
        Method to parse an input file containing natural language sentences and 
        construct the corresponding AMR graphs (and their RDF serializations if required).
        
        The sentences are read from the input file, converted to AMR graphs with
        amrlib, and written to the output directory tree (sentence file, penman, 
        dot and png files). If amrld_serialization is set, the graphs are also 
        serialized to AMR-RDF (triple and turtle files).
        
    
        Parameters
        ----------
        input_filepath: a path to a text file containing the sentences.
        amr_model_path: directory of the amrlib sentence-to-graph model to load.
        output_dirpath: a directory path passed to FilepathManager, which defines 
            where the output data are written (behaviour for None is decided by 
            FilepathManager -- TODO confirm the default location). 
        amrld_serialization: if true, also serialize the AMR graphs to AMR-RDF.
    
        Returns
        -------
        List of the AMR graphs produced by the parser, one per valid sentence, 
        in reading order.
        
        Raises
        ------
        AssertionError if the input file does not exist.
    
        """
        
        logger.info('[AMR Batch] NL Document Parsing')    
      
        # -- Prepare the sentences to be converted
        logger.info('\n === Preparation === ')
        filepath_manager = FilepathManager(input_filepath, output_dirpath)
        logger.info(f'-- base reference: {filepath_manager.base_reference}')
        logger.info(f'-- input filepath: {filepath_manager.input_filepath}')
        logger.info(f'-- output dirpath: {filepath_manager.output_dirpath}')
        # NOTE: kept as an assert so that callers still get an AssertionError;
        # be aware that this check is skipped when Python runs with -O.
        assert os.path.exists(input_filepath), f'input file does not exists ({input_filepath})'
        workdata_list = __prepare_workdata(filepath_manager)
        __build_output_dir_tree(filepath_manager, workdata_list)
        __generate_sentence_file(filepath_manager, workdata_list)
                
        # -- Convert sentences to graphs
        logger.info('\n === Text Convert to AMR Graphs === ')
        logger.info('-- library: amrlib') 
        logger.debug(f'  ({AMRLD_DIR})') 
        logger.info(f'-- model: {os.path.basename(amr_model_path)}')
        logger.debug(f'  ({amr_model_path})') 
        logger.debug(f'-- working directory: {AMRLD_WORKDIR}')
        workdata_list = __convert_sentences_to_graphs(amr_model_path, workdata_list)
        __generate_amr_graph_files(filepath_manager, workdata_list)
    
        if amrld_serialization:
            # -- Convert graphs to RDF
            logger.info('\n === AMR Graphs Serialization to AMR-RDF Representation  === ')
            logger.info("-- library: amr-ld") 
            __convert_amr_graphs_to_rdf(filepath_manager, workdata_list)
            
        
        return get_amr_graph_list(workdata_list)