Spaces:
Sleeping
Sleeping
# Generate the audio-text map for WenetSpeech4TTS
# and evaluate the resulting vocab size
import sys | |
import os | |
sys.path.append(os.getcwd()) | |
import json | |
from tqdm import tqdm | |
from concurrent.futures import ProcessPoolExecutor | |
import torchaudio | |
from datasets import Dataset | |
from model.utils import convert_char_to_pinyin | |
def deal_with_sub_path_files(dataset_path, sub_path):
    """Collect audio paths, texts and durations for one dataset sub-folder.

    Every file in ``<dataset_path>/<sub_path>/txts`` is expected to start with
    a tab-separated line whose first field is the audio file stem and whose
    second field is the transcript. The matching audio is looked up as
    ``<dataset_path>/<sub_path>/wavs/<stem>.wav`` and loaded to measure its
    duration in seconds (sample count divided by sample rate).

    The module-level settings ``tokenizer`` and ``polyphone`` are read here.
    NOTE(review): they are assigned under the ``__main__`` guard, so worker
    processes only see them if module globals are inherited (e.g. fork start
    method) -- confirm on the target platform.

    Args:
        dataset_path: Root directory of one dataset tier.
        sub_path: Name of the sub-directory (inside ``dataset_path``) to scan.

    Returns:
        A tuple ``(audio_paths, texts, durations)`` of lists.

    Raises:
        IndexError: If the first line of a text file contains no tab.
    """
    print(f"Dealing with: {sub_path}")

    text_dir = os.path.join(dataset_path, sub_path, "txts")
    audio_dir = os.path.join(dataset_path, sub_path, "wavs")

    audio_paths = []
    texts = []
    durations = []

    for entry in tqdm(os.listdir(text_dir)):
        # Only the first line of each transcript file is used.
        with open(os.path.join(text_dir, entry), "r", encoding="utf-8") as handle:
            fields = handle.readline().split("\t")

        wav_path = os.path.join(audio_dir, fields[0] + ".wav")
        transcript = fields[1].strip()
        audio_paths.append(wav_path)

        if tokenizer == "char":
            texts.append(transcript)
        elif tokenizer == "pinyin":
            # presumably yields one converted entry per input text -- verify
            # against model.utils.convert_char_to_pinyin
            texts.extend(convert_char_to_pinyin([transcript], polyphone=polyphone))

        waveform, sample_rate = torchaudio.load(wav_path)
        durations.append(waveform.shape[-1] / sample_rate)

    return audio_paths, texts, durations
def main():
    """Build the audio/text/duration dataset and its vocabulary file.

    Uses the module-level settings ``tokenizer``, ``max_workers``,
    ``dataset_paths`` and ``dataset_name``. Every sub-directory of every
    dataset path is submitted to ``deal_with_sub_path_files`` on a process
    pool; the results are merged in submission order and written to
    ``data/{dataset_name}_{tokenizer}/``:

    * ``raw/``          -- the ``Dataset`` saved to disk (arrow format)
    * ``duration.json`` -- the durations alone, as ``{"duration": [...]}``
    * ``vocab.txt``     -- one vocabulary entry per line, sorted

    Raises:
        AssertionError: If ``tokenizer`` is neither ``"pinyin"`` nor ``"char"``.
    """
    assert tokenizer in ["pinyin", "char"]

    audio_path_list, text_list, duration_list = [], [], []

    # The context manager shuts the pool down even if a worker raises.
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for dataset_path in dataset_paths:
            for item in os.listdir(dataset_path):
                if os.path.isdir(os.path.join(dataset_path, item)):
                    futures.append(executor.submit(deal_with_sub_path_files, dataset_path, item))

        for future in tqdm(futures, total=len(futures)):
            audio_paths, texts, durations = future.result()
            audio_path_list.extend(audio_paths)
            text_list.extend(texts)
            duration_list.extend(durations)

    # Create the output directory up front instead of relying on
    # save_to_disk() to create it as a side effect.
    save_dir = f"data/{dataset_name}_{tokenizer}"
    os.makedirs(save_dir, exist_ok=True)

    print(f"\nSaving to {save_dir} ...")
    dataset = Dataset.from_dict({"audio_path": audio_path_list, "text": text_list, "duration": duration_list})
    dataset.save_to_disk(f"{save_dir}/raw", max_shard_size="2GB")  # arrow format

    # Durations are duplicated in a separate json for DynamicBatchSampler ease.
    with open(f"{save_dir}/duration.json", "w", encoding="utf-8") as f:
        json.dump({"duration": duration_list}, f, ensure_ascii=False)

    print("\nEvaluating vocab size (all characters and symbols / all phonemes) ...")
    text_vocab_set = set()
    for text in tqdm(text_list):
        # set.update() accepts any iterable, so no intermediate list is needed.
        text_vocab_set.update(text)

    # add alphabets and symbols (optional, if plan to ft on de/fr etc.)
    if tokenizer == "pinyin":
        text_vocab_set.update([chr(i) for i in range(32, 127)] + [chr(i) for i in range(192, 256)])

    # Explicit UTF-8: the vocab holds CJK / accented characters, and the
    # platform default encoding may be unable to represent them.
    with open(f"{save_dir}/vocab.txt", "w", encoding="utf-8") as f:
        f.writelines(vocab + "\n" for vocab in sorted(text_vocab_set))

    print(f"\nFor {dataset_name}, sample count: {len(text_list)}")
    print(f"For {dataset_name}, vocab size is: {len(text_vocab_set)}\n")
if __name__ == "__main__":
    # Size of the process pool created in main().
    max_workers = 32

    # Text representation written to the dataset: "pinyin" | "char"
    tokenizer = "pinyin"
    # Forwarded to convert_char_to_pinyin() when tokenizer == "pinyin".
    polyphone = True

    # 1: Premium, 2: Standard, 3: Basic
    dataset_choice = 1

    dataset_name = (
        "WenetSpeech4TTS_Premium",
        "WenetSpeech4TTS_Standard",
        "WenetSpeech4TTS_Basic",
    )[dataset_choice - 1]

    # The slice keeps the last `dataset_choice` entries, so choice 2 scans
    # Standard and Premium, and choice 3 scans all three directories.
    dataset_paths = [
        "<SOME_PATH>/WenetSpeech4TTS/Basic",
        "<SOME_PATH>/WenetSpeech4TTS/Standard",
        "<SOME_PATH>/WenetSpeech4TTS/Premium",
    ][-dataset_choice:]

    print(f"\nChoose Dataset: {dataset_name}\n")

    main()
# Results (if adding alphabets with accents and symbols):
#   WenetSpeech4TTS       Basic      Standard   Premium
#   samples count         3932473    1941220    407494
#   pinyin vocab size     1349       1348       1344      (no polyphone)
#                         -          -          1459      (polyphone)
#   char vocab size       5264       5219       5042
# The vocab size may differ slightly due to the jieba tokenizer and pypinyin (e.g. how polyphones are handled).
# Please be careful when using a pretrained model: make sure the vocab.txt is the same.