# otto_daoli / preprocess_text.py
# XzJosh's picture
# Upload 47 files
# bf0a127
import json
from random import shuffle
import tqdm
from text.cleaner import clean_text
from collections import defaultdict
# Pipeline stages to execute when this script runs:
#   1 - clean the raw transcription and write "<transcription_path>.cleaned"
#   2 - build the speaker-id map and the train/validation file lists
#   3 - store the speaker-id map in the JSON config (requires stage 2)
stage = [1, 2, 3]

# Raw transcription; each line is read as "utt|spk|language|text".
transcription_path = 'filelists/genshin.list'
# Output file lists produced by stage 2.
train_path = 'filelists/train.list'
val_path = 'filelists/val.list'
# JSON config whose ["data"]["spk2id"] entry is overwritten by stage 3.
config_path = 'configs/config.json'

# Number of utterances held out for validation from each speaker.
val_per_spk = 4
# Upper bound on the validation list; anything beyond it goes back to training.
max_val_total = 8
# Stage 1: run clean_text() over every transcript line and write the result,
# one "utt|spk|language|norm_text|phones|tones|word2ph" record per line, to
# "<transcription_path>.cleaned".
if 1 in stage:
    # Read the transcript inside a context manager so the input handle is
    # closed deterministically; keeping the full list lets tqdm show a total.
    with open(transcription_path, encoding='utf-8') as src:
        raw_lines = src.readlines()

    with open(transcription_path + '.cleaned', 'w', encoding='utf-8') as f:
        for line in tqdm.tqdm(raw_lines):
            entry = line.strip()
            if not entry:
                # A blank line carries no utterance; skip it without noise.
                continue
            # Until the split succeeds, label errors with the raw line itself
            # so the handler never reports a stale or undefined utterance id.
            utt = entry
            try:
                utt, spk, language, text = entry.split('|')
                norm_text, phones, tones, word2ph = clean_text(text, language)
                f.write('{}|{}|{}|{}|{}|{}|{}\n'.format(
                    utt, spk, language, norm_text,
                    ' '.join(phones),
                    ' '.join(str(i) for i in tones),
                    ' '.join(str(i) for i in word2ph)))
            except Exception as error:
                # Best-effort by design: a line that cannot be parsed or
                # cleaned is reported and dropped; processing continues.
                print("err!", utt, error)
# Stage 2: group the cleaned lines by speaker, number the speakers in order of
# first appearance, and write shuffled train / validation file lists.
if 2 in stage:
    spk_utt_map = defaultdict(list)
    spk_id_map = {}
    with open(transcription_path + '.cleaned', encoding='utf-8') as cleaned:
        for line in cleaned:
            # Only the speaker field is used; unpacking into exactly seven
            # names still rejects any line with a different field count.
            _utt, spk, _language, _text, _phones, _tones, _word2ph = \
                line.strip().split('|')
            # Store the untouched line (newline included) for writing later.
            spk_utt_map[spk].append(line)
            # A new speaker receives the next free id: 0, 1, 2, ...
            spk_id_map.setdefault(spk, len(spk_id_map))
    current_sid = len(spk_id_map)

    train_list = []
    val_list = []
    for utts in spk_utt_map.values():
        # In-place shuffle, then hold out the first val_per_spk lines.
        shuffle(utts)
        val_list.extend(utts[:val_per_spk])
        train_list.extend(utts[val_per_spk:])

    # NOTE(review): the cap keeps the first max_val_total held-out lines in
    # speaker order, so later speakers may have no validation lines at all.
    if len(val_list) > max_val_total:
        train_list.extend(val_list[max_val_total:])
        val_list = val_list[:max_val_total]

    with open(train_path, "w", encoding='utf-8') as out:
        out.writelines(train_list)

    with open(val_path, "w", encoding='utf-8') as out:
        out.writelines(val_list)
# Stage 3: write the speaker-id map built by stage 2 into the JSON config
# under config["data"]["spk2id"], rewriting the config file in place.
if 3 in stage:
    # spk_id_map exists only if stage 2 ran in this same invocation. Raise
    # explicitly (same exception type as the former assert) so the check is
    # not stripped when Python runs with -O.
    if 2 not in stage:
        raise AssertionError(
            "stage 3 needs the speaker map built by stage 2; "
            "add 2 to `stage`")

    # Load via a context manager so the read handle is closed before the
    # same path is reopened for writing.
    with open(config_path, encoding='utf-8') as f:
        config = json.load(f)

    config["data"]['spk2id'] = spk_id_map

    # ensure_ascii=False keeps non-ASCII speaker names readable in the file.
    with open(config_path, 'w', encoding='utf-8') as f:
        json.dump(config, f, indent=2, ensure_ascii=False)