"""Recipe: train a ResNet speaker encoder on VCTK with Coqui TTS.

Builds a ``SpeakerEncoderConfig`` (dataset, batch composition, model, audio and
augmentation settings), serializes it to JSON, then launches
``TTS/bin/train_encoder.py`` on that config, optionally restoring from a
checkpoint for transfer learning.
"""
import os

from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig

CURRENT_PATH = os.getcwd()

# change the root path to the TTS root path (this script lives three levels deep in recipes/)
os.chdir("../../../")

### Definitions ###
# dataset
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/"  # download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/"  # download: https://www.openslr.org/28/
MUSAN_PATH = "/raid/datasets/DA/musan/"  # download: https://www.openslr.org/17/

# training
OUTPUT_PATH = os.path.join(
    CURRENT_PATH, "resnet_speaker_encoder_training_output/"
)  # path to save the train logs and checkpoints
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
RESTORE_PATH = None  # checkpoint to use for transfer learning; if None, ignored

# instantiate the config
# for the speaker encoder
config = SpeakerEncoderConfig()
# for the emotion encoder
# config = EmotionEncoderConfig()

#### DATASET CONFIG ####
# The formatter needs to return the key "speaker_name" for the speaker encoder
# and "emotion_name" for the emotion encoder
dataset_config = BaseDatasetConfig(formatter="vctk", meta_file_train="", language="en-us", path=VCTK_PATH)

# add the dataset to the config
config.datasets = [dataset_config]

#### TRAINING CONFIG ####
# The encoder data loader balances the dataset items equally to guarantee better
# training and to meet the loss requirements. It has two parameters controlling
# the final batch size: the total number of speakers used in each batch and the
# number of samples per speaker.
# total number of speakers in a training batch
config.num_classes_in_batch = 100
# number of utterances per class/speaker in a training batch
config.num_utter_per_class = 4
# final batch size = config.num_classes_in_batch * config.num_utter_per_class
# total number of speakers in an evaluation batch
config.eval_num_classes_in_batch = 100
# number of utterances per class/speaker in an evaluation batch
config.eval_num_utter_per_class = 4
# number of data loader workers
config.num_loader_workers = 8
config.num_val_loader_workers = 8
# number of epochs
config.epochs = 10000
# loss to be used in training
config.loss = "softmaxproto"
# run eval
config.run_eval = False
# output path for the checkpoints
config.output_path = OUTPUT_PATH
# save a local checkpoint every save_step steps
config.save_step = 2000

### Model Config ###
config.model_params = {
    "model_name": "resnet",  # supported: "lstm" and "resnet"
    "input_dim": 64,
    "use_torch_spec": True,
    "log_input": True,
    "proj_dim": 512,  # embedding dim
}

### Audio Config ###
# To speed up training the model splits the audio into small parts; this
# parameter defines the length in seconds of these "parts"
config.voice_len = 2.0

# all other audio configs
config.audio = {
    "fft_size": 512,
    "win_length": 400,
    "hop_length": 160,
    "frame_shift_ms": None,
    "frame_length_ms": None,
    "stft_pad_mode": "reflect",
    "sample_rate": 16000,
    "resample": False,
    "preemphasis": 0.97,
    "ref_level_db": 20,
    "do_sound_norm": False,
    "do_trim_silence": False,
    "trim_db": 60,
    "power": 1.5,
    "griffin_lim_iters": 60,
    "num_mels": 64,
    "mel_fmin": 0.0,
    "mel_fmax": 8000.0,
    "spec_gain": 20,
    "signal_norm": False,
    "min_level_db": -100,
    "symmetric_norm": False,
    "max_norm": 4.0,
    "clip_norm": False,
    "stats_path": None,
    "do_rms_norm": True,
    "db_level": -27.0,
}

### Augmentation Config ###
config.audio_augmentation = {
    # additive noise and room impulse response (RIR) simulation similar to: https://arxiv.org/pdf/2009.14153.pdf
    "p": 0.5,  # probability of applying one of the augmentations - 0 means disabled
    "rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"},  # download: https://www.openslr.org/28/
    "additive": {
        "sounds_path": MUSAN_PATH,
        "speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
        "music": {"min_snr_in_db": 5, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 1},
    },
    "gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
}

# write the config to disk and launch the training script on it
config.save_json(CONFIG_OUT_PATH)
print(CONFIG_OUT_PATH)

command = f"python TTS/bin/train_encoder.py --config_path {CONFIG_OUT_PATH}"
if RESTORE_PATH is not None:
    # resume/transfer-learn from an existing checkpoint
    command += f" --restore_path {RESTORE_PATH}"
os.system(command)