otto_daoli

Sleeping

App Files Files Community

otto_daoli / preprocess_text.py

XzJosh

Upload 47 files

bf0a127 over 1 year ago

raw

history blame contribute delete

2.28 kB

	import json
	from random import shuffle

	import tqdm
	from text.cleaner import clean_text
	from collections import defaultdict
	# Pipeline stages to run: 1 = clean the transcriptions, 2 = build the
	# train/validation lists, 3 = store the speaker-id table in the config.
	stage = [1, 2, 3]

	# Input transcription list and the files derived from it.
	transcription_path = "filelists/genshin.list"
	train_path = "filelists/train.list"
	val_path = "filelists/val.list"
	config_path = "configs/config.json"

	# Validation sizing: utterances taken from each speaker, and the overall cap.
	val_per_spk = 4
	max_val_total = 8

	if 1 in stage:
	    # Stage 1: run clean_text() on every "utt|spk|language|text" line and
	    # write the result, extended with phones / tones / word2ph columns, to
	    # "<transcription_path>.cleaned".
	    with open(transcription_path + '.cleaned', 'w', encoding='utf-8') as cleaned_file:
	        with open(transcription_path, encoding='utf-8') as raw_file:
	            # Read everything up front so tqdm knows the total line count.
	            raw_lines = raw_file.readlines()
	        for line in tqdm.tqdm(raw_lines):
	            try:
	                utt, spk, language, text = line.strip().split('|')
	                norm_text, phones, tones, word2ph = clean_text(text, language)
	                cleaned_file.write('|'.join([
	                    utt,
	                    spk,
	                    language,
	                    norm_text,
	                    ' '.join(phones),
	                    ' '.join(str(i) for i in tones),
	                    ' '.join(str(i) for i in word2ph),
	                ]) + '\n')
	            except Exception as error:
	                # Best effort: a malformed or uncleanable line is reported and
	                # skipped. The raw line is printed because `utt` may be
	                # unbound (or left over from the previous line) when the
	                # split itself is what failed.
	                print("err!", line.strip(), error)

	if 2 in stage:
	    # Stage 2: group cleaned lines by speaker, assign each speaker a
	    # sequential integer id in order of first appearance, then split the
	    # lines into training and validation lists.
	    spk_utt_map = defaultdict(list)
	    spk_id_map = {}
	    current_sid = 0

	    with open(transcription_path + '.cleaned', encoding='utf-8') as f:
	        for line in f:
	            # The 7-way unpack doubles as a field-count check; only `spk`
	            # is used below.
	            utt, spk, language, text, phones, tones, word2ph = line.strip().split('|')
	            # Keep the original line (with its newline) so it can be
	            # written back verbatim.
	            spk_utt_map[spk].append(line)
	            if spk not in spk_id_map:
	                spk_id_map[spk] = current_sid
	                current_sid += 1

	    train_list = []
	    val_list = []

	    # Take the first `val_per_spk` shuffled utterances of every speaker for
	    # validation; everything else goes to training.
	    for utts in spk_utt_map.values():
	        shuffle(utts)
	        val_list += utts[:val_per_spk]
	        train_list += utts[val_per_spk:]

	    # Cap the validation set; the overflow is returned to the training set.
	    if len(val_list) > max_val_total:
	        train_list += val_list[max_val_total:]
	        val_list = val_list[:max_val_total]

	    with open(train_path, "w", encoding='utf-8') as f:
	        f.writelines(train_list)

	    with open(val_path, "w", encoding='utf-8') as f:
	        f.writelines(val_list)

	if 3 in stage:
	    # Stage 3: store the speaker -> id table under config["data"]["spk2id"]
	    # and rewrite the config file in place.
	    # spk_id_map is only built by stage 2, so both must run together. An
	    # explicit raise is used because `assert` is stripped under `python -O`.
	    if 2 not in stage:
	        raise RuntimeError("stage 3 requires stage 2 to run in the same invocation")
	    with open(config_path, encoding='utf-8') as f:
	        config = json.load(f)
	    config["data"]['spk2id'] = spk_id_map
	    with open(config_path, 'w', encoding='utf-8') as f:
	        json.dump(config, f, indent=2, ensure_ascii=False)