#!/usr/bin/python3.10
# -*-coding:Utf-8 -*

#==============================================================================
# C.M. Tool: cut a file into several files
#------------------------------------------------------------------------------
# Script to cut a file containing many sentences into several files
# containing fewer sentences each.
#
# Usage: cut_file.py <base_ref> <split_number>
#==============================================================================

#==============================================================================
# Importing required modules
#==============================================================================

import amrlib
import re
import os
import sys
import subprocess
import shutil
from rdflib import Graph


#==============================================================================
# Parameters
#==============================================================================

# Input/Output Directories
INPUT_DIR = "inputData/"
OUTPUT_DIR = "outputData/"

# Reference Suffix
TEXT_SUFFIX = ".txt"
SENTENCE_SUFFIX = ".sentence.txt"

# Separator used to split one line of text into sentences
SENTENCE_SEPARATOR = ". "

# Patterns compiled once; raw strings avoid invalid-escape warnings.
# A language mark is a run of lowercase letters in parentheses, e.g. "(fr)".
_LANG_MARK_RE = re.compile(r"\([a-z]+\)(.)*")
# Any run of periods immediately followed by a newline.
_LINE_END_RE = re.compile(r"(\.)*\n")


#==============================================================================
# Functions to define filepath
#==============================================================================

def get_text_input_filepath(data_ref):
    """Return the path of the input text file for reference `data_ref`."""
    return INPUT_DIR + data_ref + TEXT_SUFFIX


def get_sentence_output_filepath(data_ref, file_number, digits):
    """Return the path of the sentence file number `file_number`.

    The number is left-padded with zeros to `digits` characters so that the
    generated file names sort in numeric order.
    """
    # NOTE(review): the path is built under INPUT_DIR (not OUTPUT_DIR), as in
    # the original code -- confirm that this is the intended location.
    file_number_str = str(file_number).rjust(digits, "0")
    data_file_name = data_ref + "-" + file_number_str + SENTENCE_SUFFIX
    return INPUT_DIR + data_file_name


#==============================================================================
# Functions to manage in-process data
#==============================================================================

def is_valid_sentence(sentence):
    """Return True if the sentence should be kept.

    A sentence is rejected when it is empty, when it is a bare newline, or
    when it starts with a language mark such as "(fr)".
    """
    if sentence in ("", "\n"):
        return False
    return _LANG_MARK_RE.match(sentence) is None


def clean_sentence(sentence):
    """Return `sentence` without its end-of-line periods and newline."""
    return _LINE_END_RE.sub("", sentence)


def _read_raw_sentences(filepath):
    """Return every piece obtained by splitting each line of `filepath`."""
    raw_sentences = []
    with open(filepath, "r") as reading_file:
        for line in reading_file:
            raw_sentences.extend(line.split(SENTENCE_SEPARATOR))
    return raw_sentences


def _write_sentence_file(base_ref, file_number, digits, sentences):
    """Write `sentences`, one per line, to the numbered sentence file."""
    print("----- number of sentences: " + str(len(sentences)))
    output_file = get_sentence_output_filepath(base_ref, file_number, digits)
    print("-- Generating new file: " + output_file)
    with open(output_file, "w") as writing_file:
        # No trailing newline after the last sentence.
        writing_file.write("\n".join(sentences))


#==============================================================================
# Main function
#==============================================================================

def main(base_ref, split_number):
    """Cut the input text of `base_ref` into files of `split_number` sentences.

    The input file is split into sentences on ". "; each sentence is cleaned,
    invalid ones are skipped, and the remaining ones are written in groups of
    `split_number` to consecutively numbered files. The last file holds the
    remaining sentences when the total is not a multiple of `split_number`.
    """

    print("\n" + "[CMT] Cut a file into several files")

    print("-- base_ref:", base_ref)
    print("-- split_number:", split_number)

    # A group size below 1 would never fill up; use 1 as the command line does.
    split_number = max(1, split_number)

    print("-- Calculating the total numbre of sentences")
    input_file = get_text_input_filepath(base_ref)
    raw_sentences = _read_raw_sentences(input_file)
    # The count includes pieces later rejected as invalid: it only serves as
    # an upper bound to choose the zero-padding width of the file numbers.
    sentence_total_number = len(raw_sentences)
    digits_number = len(str(sentence_total_number))
    print("----- total number of sentences: " + str(sentence_total_number))
    print("----- digits number: " + str(digits_number))

    print("-- Cut input file to several files")
    data_list = []
    f_number = 0
    for raw_sentence in raw_sentences:
        # Clean before validating so that a piece reduced to nothing
        # (e.g. ".\n") is rejected instead of written as an empty line.
        sentence = clean_sentence(raw_sentence)
        if not is_valid_sentence(sentence):
            continue
        data_list.append(sentence)
        print(sentence)
        if len(data_list) >= split_number:
            f_number += 1
            _write_sentence_file(base_ref, f_number, digits_number, data_list)
            data_list.clear()

    # Write the sentences left over after the last complete group.
    if data_list:
        f_number += 1
        _write_sentence_file(base_ref, f_number, digits_number, data_list)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("Usage: " + sys.argv[0] + " <base_ref> <split_number>")
    base_ref = sys.argv[1]
    split_number = max(1, int(sys.argv[2]))
    main(base_ref, split_number)