Skip to content
Snippets Groups Projects
Commit 9c5e7bf3 authored by Aurélien Lamercerie's avatar Aurélien Lamercerie
Browse files

Add new script for cut big file of sentence

parent 60f9235b
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/python3.10
# -*-coding:Utf-8 -*
#==============================================================================
# C.M. Tool: cut a file into several files
#------------------------------------------------------------------------------
# Script to split a file containing many sentences into several files with
# fewer sentences each
#==============================================================================
#==============================================================================
# Importing required modules
#==============================================================================
import amrlib
import re
import os
import sys
import subprocess
import shutil
from rdflib import Graph
#==============================================================================
# Parameters
#==============================================================================
# Input/Output Directories
INPUT_DIR = "inputData/"
OUTPUT_DIR = "outputData/"
# Reference Suffix
TEXT_SUFFIX = ".txt"
SENTENCE_SUFFIX = ".sentence.txt"
#==============================================================================
# Functions to define filepath
#==============================================================================
def get_text_input_filepath(data_ref):
    """Build the path of the input text file for the given reference.

    The result is INPUT_DIR, followed by data_ref, followed by TEXT_SUFFIX.
    """
    path_parts = (INPUT_DIR, data_ref, TEXT_SUFFIX)
    return "".join(path_parts)
def get_sentence_output_filepath(data_ref, file_number, digits):
    """Build the path of one generated sentence file.

    The file number is left-padded with zeros to `digits` characters; the
    file name is data_ref, a dash, the padded number, then SENTENCE_SUFFIX.

    NOTE(review): the path is rooted at INPUT_DIR rather than OUTPUT_DIR,
    despite the function name -- confirm this is intended.
    """
    padded_number = str(file_number).rjust(digits, "0")
    name_parts = (data_ref, "-", padded_number, SENTENCE_SUFFIX)
    return INPUT_DIR + "".join(name_parts)
#==============================================================================
# Functions to manage in-process data
#==============================================================================
# Language mark at the start of a fragment: lowercase letters in parentheses,
# e.g. "(en)". Raw string avoids invalid-escape warnings; compiled only once.
_LANG_MARK_RE = re.compile(r"\([a-z]+\)")


def is_valid_sentence(sentence):
    """Return True if the sentence should be kept, False otherwise.

    A sentence is rejected when:
      - it is empty or contains only whitespace (this covers the empty and
        newline-only fragments left over after splitting a line), or
      - it starts with a language mark such as "(en)".
    """
    if not sentence.strip():
        return False
    # match() anchors at the start, so a parenthesised word later in the
    # sentence does not disqualify it.
    return _LANG_MARK_RE.match(sentence) is None
# Any run of dots (possibly empty) immediately followed by a newline.
# Raw string avoids invalid-escape warnings; compiled only once.
_DOTS_NEWLINE_RE = re.compile(r"\.*\n")


def clean_sentence(sentence):
    """Return the sentence with every "dots + newline" run removed.

    Each newline is dropped together with the dots directly preceding it.
    Dots that are not followed by a newline are left untouched.
    """
    return _DOTS_NEWLINE_RE.sub("", sentence)
#==============================================================================
# Main function
#==============================================================================
def _write_sentence_file(base_ref, file_number, digits, sentences):
    """Write `sentences` one per line (no trailing newline) to a numbered file."""
    print("----- number of sentences: " + str(len(sentences)))
    output_file = get_sentence_output_filepath(base_ref, file_number, digits)
    print("-- Generating new file: " + output_file)
    with open(output_file, "w") as writing_file:  # w = write
        writing_file.write("\n".join(sentences))


def main(base_ref, split_number):
    """Cut the input text file for `base_ref` into several sentence files.

    Every line of the input file is split on ". ". Each fragment is passed
    through clean_sentence() and kept only if is_valid_sentence() accepts it.
    Kept sentences are written `split_number` per file, to consecutively
    numbered files (starting at 1) named by get_sentence_output_filepath().
    A final, shorter file is written for any remaining sentences.

    Args:
        base_ref: reference of the input file, resolved to a path by
            get_text_input_filepath().
        split_number: maximum number of sentences per generated file.
            Values below 1 behave like 1.
    """
    print("\n" + "[CMT] Cut a file into several files")
    print("-- base_ref:", base_ref)
    print("-- split_number:", split_number)

    print("-- Calculating the total numbre of sentences")
    input_file = get_text_input_filepath(base_ref)
    # Read the file once; the lines serve both the counting and the cutting.
    with open(input_file, "r") as reading_file:  # r = read
        lines = reading_file.readlines()
    # Every fragment is counted, valid or not: the total is only used to
    # choose the zero-padding width of the file numbers.
    sentence_total_number = sum(len(line.split(". ")) for line in lines)
    digits_number = len(str(sentence_total_number))
    print("----- total number of sentences: " + str(sentence_total_number))
    print("----- digits number: " + str(digits_number))

    print("-- Cut input file to several files")
    data_list = []
    f_number = 0
    for line in lines:
        for fragment in line.split(". "):
            # Strip the line terminator (and the dots before it) first, so
            # it is not embedded in the one-sentence-per-line output.
            sentence = clean_sentence(fragment)
            if not is_valid_sentence(sentence):
                continue
            data_list.append(sentence)
            print(sentence)
            if len(data_list) >= split_number:
                f_number += 1
                _write_sentence_file(base_ref, f_number, digits_number,
                                     data_list)
                data_list.clear()

    # Flush the last, incomplete group so no sentence is silently dropped.
    if data_list:
        f_number += 1
        _write_sentence_file(base_ref, f_number, digits_number, data_list)
        data_list.clear()
if __name__ == "__main__":
    # Expected call: <script> <base_ref> <split_number>
    # (any additional arguments are ignored).
    if len(sys.argv) < 3:
        sys.exit("Usage: " + sys.argv[0] + " <base_ref> <split_number>")
    base_ref = sys.argv[1]
    try:
        split_number = int(sys.argv[2])
    except ValueError:
        sys.exit("split_number must be an integer, got: " + sys.argv[2])
    # Each generated file must hold at least one sentence.
    split_number = max(1, split_number)
    main(base_ref, split_number)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment