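# NOTE: the original first lines ("Spaces: / Runtime error / Runtime error") were
# status-banner text captured from the hosting web page, not part of this script.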
import argparse
import json
import os
import random
import re
import string
from glob import glob
# Pin the global RNG so the random.sample() call that picks the valid/test
# lines selects the same indices on every run over the same input.
random.seed(a=42)
def replace_extra_chars(line):
    """Return ``line`` with all parentheses deleted and outer whitespace trimmed.

    Only "(" and ")" are removed. Stripping of invisible formatting characters
    (U+200D, U+FEFF, U+200C, U+200E) and of colons existed as commented-out code
    in the original and is intentionally NOT performed here.
    """
    # One C-level pass that drops both parenthesis characters.
    drop_parens = str.maketrans("", "", "()")
    cleaned = line.translate(drop_parens)
    return cleaned.strip()
def write_txt(content, filename):
    """Write the string ``content`` to ``filename`` as UTF-8.

    The file is created if missing and truncated if it already exists.
    """
    with open(filename, "w+", encoding="utf-8") as handle:
        handle.write(content)
def save_train_test_valid_split(annotations_txt, num_samples_valid, num_samples_test):
    """Split an annotations file into ``train.txt``, ``valid.txt`` and ``test.txt``.

    The three files are written to the directory that contains
    ``annotations_txt``. Validation and test lines are chosen with
    ``random.sample`` (so the result depends on the global RNG state); every
    remaining line goes to the training set. Within each output file the lines
    keep the order they had in ``annotations_txt``.

    Args:
        annotations_txt: Path to a UTF-8 text file with one sample per line.
        num_samples_valid: Number of lines to place in ``valid.txt``.
        num_samples_test: Number of lines to place in ``test.txt``.

    Raises:
        ValueError: If either count is negative, or if together they exceed
            the number of lines available.
    """
    if num_samples_valid < 0 or num_samples_test < 0:
        raise ValueError(
            "num_samples_valid and num_samples_test must be non-negative, got "
            f"{num_samples_valid} and {num_samples_test}"
        )

    with open(annotations_txt, encoding="utf-8") as f:
        all_lines = [line.strip() for line in f]

    num_held_out = num_samples_valid + num_samples_test
    if num_held_out > len(all_lines):
        raise ValueError(
            f"Requested {num_samples_valid} valid + {num_samples_test} test samples "
            f"but {annotations_txt} only has {len(all_lines)} lines"
        )

    # Same sampling call as before, so a given seed yields the same split.
    held_out_indices = random.sample(range(len(all_lines)), num_held_out)
    # Sets give O(1) membership tests; the lists made the split quadratic.
    valid_ix = set(held_out_indices[:num_samples_valid])
    test_ix = set(held_out_indices[num_samples_valid:])

    train, valid, test = [], [], []
    for i, line in enumerate(all_lines):
        if i in valid_ix:
            valid.append(line)
        elif i in test_ix:
            test.append(line)
        else:
            train.append(line)

    print(f"Num samples in train: {len(train)}")
    print(f"Num samples in valid: {len(valid)}")
    print(f"Num samples in test: {len(test)}")

    # dirname() handles root-level files and platform separators, unlike
    # manually splitting the path on "/".
    out_dir_path = os.path.dirname(annotations_txt)
    for split_name, split_lines in (("train", train), ("valid", valid), ("test", test)):
        split_path = os.path.join(out_dir_path, f"{split_name}.txt")
        with open(split_path, "w+", encoding="utf-8") as f:
            for line in split_lines:
                print(line, file=f)
    print(f"train, test and valid txts saved in {out_dir_path}")
def save_txts_from_txt_done_data(
    text_path,
    wav_path_for_annotations_txt,
    out_path_for_txts,
    num_samples_valid,
    num_samples_test,
    config_dir="../../config/glow",
):
    """Build annotation, vocabulary and config files from a transcript file.

    Each non-blank line of ``text_path`` must hold a file id followed by a
    double-quoted transcript; parentheses anywhere on the line are removed
    first. Transcripts are lower-cased and curly single quotes are replaced by
    a plain apostrophe.

    Files written:
        * ``<out_path_for_txts>/annotations.txt`` - ``<wav path>.wav|<text>`` lines
        * ``<out_path_for_txts>/chars.txt`` and ``punc.txt`` - the vocabulary
        * ``<out_path_for_txts>/train.txt``, ``valid.txt``, ``test.txt``
        * ``<config_dir>/<name>.json`` - a copy of ``base_blank.json`` with the
          ``data`` section's ``chars``, ``punc``, ``training_files`` and
          ``validation_files`` filled in; ``<name>`` is the last component of
          ``out_path_for_txts``.

    Args:
        text_path: Path to the transcript file (read as UTF-8).
        wav_path_for_annotations_txt: Directory prefix joined to each file id
            when writing ``annotations.txt``.
        out_path_for_txts: Output directory; created if it does not exist.
        num_samples_valid: Number of samples for the validation split.
        num_samples_test: Number of samples for the test split.
        config_dir: Directory holding ``base_blank.json`` and receiving the
            generated config. Defaults to the previously hard-coded location,
            which is relative to the current working directory.

    Raises:
        ValueError: If a non-blank line contains no double-quoted transcript.
    """
    os.makedirs(out_path_for_txts, exist_ok=True)
    outfile = os.path.join(out_path_for_txts, "annotations.txt")

    fnames, ftexts = [], []
    # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which would
    # otherwise end up inside the first file id.
    with open(text_path, encoding="utf-8-sig") as file:
        for line_no, raw_line in enumerate(file, start=1):
            line = replace_extra_chars(raw_line)
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            elems = line.split('"')
            if len(elems) < 2:
                raise ValueError(
                    f"{text_path}:{line_no}: expected a double-quoted transcript, "
                    f"got {line!r}"
                )
            fnames.append(elems[0].strip())
            # NOTE(review): the scraped source showed two identical replace('β', ...)
            # calls, which is how U+2018/U+2019 look when their UTF-8 bytes are
            # mis-decoded; restored here as explicit escapes - confirm upstream.
            ftexts.append(
                elems[1].strip().lower().replace("\u2018", "'").replace("\u2019", "'")
            )

    # Sort so the vocabulary order is identical on every run; plain set
    # iteration order for strings changes between interpreter invocations.
    all_chars = sorted(set("".join(ftexts)))
    ascii_punctuation = set(string.punctuation)
    punct_with_space = "".join(c for c in all_chars if c in ascii_punctuation) + " "
    # Vocabulary = everything that is neither punctuation nor whitespace.
    chars = "".join(c for c in all_chars if c not in punct_with_space and c.strip())

    with open(os.path.join(config_dir, "base_blank.json"), "r", encoding="utf-8") as jfile:
        json_config = json.load(jfile)

    json_config["data"]["chars"] = chars
    json_config["data"]["punc"] = punct_with_space
    json_config["data"]["training_files"] = os.path.join(out_path_for_txts, "train.txt")
    json_config["data"]["validation_files"] = os.path.join(out_path_for_txts, "valid.txt")

    # normpath() drops a trailing separator so "out/dir/" still yields "dir".
    new_config_name = os.path.basename(os.path.normpath(out_path_for_txts))
    new_config_path = os.path.join(config_dir, f"{new_config_name}.json")
    with open(new_config_path, "w+", encoding="utf-8") as jfile:
        json.dump(json_config, jfile)

    print(f"Characters: {chars}")
    print(f"Len of vocab: {len(chars)}")
    print(f"Punctuation: {punct_with_space}")
    print(f"Config file is stored at {new_config_path}")

    with open(outfile, "w+", encoding="utf-8") as outfile_f:
        for f, t in zip(fnames, ftexts):
            print(
                os.path.join(wav_path_for_annotations_txt, f) + ".wav",
                t,
                sep="|",
                file=outfile_f,
            )

    write_txt(punct_with_space, os.path.join(out_path_for_txts, "punc.txt"))
    write_txt(chars, os.path.join(out_path_for_txts, "chars.txt"))
    save_train_test_valid_split(
        annotations_txt=outfile,
        num_samples_valid=num_samples_valid,
        num_samples_test=num_samples_test,
    )
if __name__ == "__main__":
    # Command-line entry point: parse the paths and split sizes, then run the
    # full data-preparation routine.
    cli = argparse.ArgumentParser()
    option_specs = (
        ("-i", "--text-path", {"type": str, "required": True}),
        ("-o", "--output-path", {"type": str, "required": True}),
        ("-w", "--wav-path", {"type": str, "required": True}),
        ("-v", "--valid-samples", {"type": int, "default": 100}),
        ("-t", "--test-samples", {"type": int, "default": 10}),
    )
    for short_flag, long_flag, settings in option_specs:
        cli.add_argument(short_flag, long_flag, **settings)
    opts = cli.parse_args()

    save_txts_from_txt_done_data(
        text_path=opts.text_path,
        wav_path_for_annotations_txt=opts.wav_path,
        out_path_for_txts=opts.output_path,
        num_samples_valid=opts.valid_samples,
        num_samples_test=opts.test_samples,
    )