File size: 4,787 Bytes
45ee559
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import os
import subprocess
import sys

from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

# from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig

# Remember the directory the script was launched from; the output folder is created below it.
CURRENT_PATH = os.getcwd()
# Switch the working directory to the TTS root path, which is taken to be three levels
# above the launch directory (the path is relative to the cwd, not to this file).
os.chdir("../../../")

### Definitions ###
# Datasets
# download: https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip
VCTK_PATH = "/raid/datasets/VCTK_NEW_16khz_removed_silence_silero_vad/"
# download: https://www.openslr.org/17/
# NOTE(review): openslr.org/17 is the MUSAN page; the RIRS_NOISES corpus appears to be
# published at https://www.openslr.org/28/ - confirm and update the link.
RIR_SIMULATED_PATH = "/raid/datasets/DA/RIRS_NOISES/simulated_rirs/"
# download: https://www.openslr.org/17/
MUSAN_PATH = "/raid/datasets/DA/musan/"

# Training
# Folder that receives the training logs and checkpoints.
OUTPUT_PATH = os.path.join(CURRENT_PATH, "resnet_speaker_encoder_training_output/")
# File the generated encoder config is saved to.
CONFIG_OUT_PATH = os.path.join(OUTPUT_PATH, "config_se.json")
# Checkpoint to use for transfer learning; ignored when None.
RESTORE_PATH = None

# Instantiate the config object that the rest of this script fills in.
# Speaker encoder:
config = SpeakerEncoderConfig()
# Emotion encoder (use this line instead to train an emotion encoder):
# config = EmotionEncoderConfig()


#### DATASET CONFIG ####
# The formatter needs to return the key "speaker_name" for the speaker encoder
# and the key "emotion_name" for the emotion encoder.
dataset_config = BaseDatasetConfig(
    path=VCTK_PATH,
    formatter="vctk",
    language="en-us",
    meta_file_train="",
)

# Register the dataset with the config.
config.datasets = [dataset_config]


#### TRAINING CONFIG ####
# The encoder data loader balances the dataset items equally to guarantee better training
# and to satisfy the requirements of the losses. Two parameters control the final batch
# size: the total number of speakers used in each batch and the number of samples per speaker.
#   final batch size = config.num_classes_in_batch * config.num_utter_per_class

# Training batches: total number of speakers (classes) per batch,
# then number of utterances per class/speaker.
config.num_classes_in_batch = 100
config.num_utter_per_class = 4

# Evaluation batches: the same two settings.
config.eval_num_classes_in_batch = 100
config.eval_num_utter_per_class = 4

# Number of data loader workers (training and validation loaders).
config.num_loader_workers = config.num_val_loader_workers = 8

# Loss to be used in training, and the number of epochs.
config.loss = "softmaxproto"
config.epochs = 10000

# Evaluation runs are switched off.
config.run_eval = False

# Checkpoints are written to the output path; a local checkpoint is saved every save_step steps.
config.output_path = OUTPUT_PATH
config.save_step = 2000

### Model Config ###
# Encoder model settings; "model_name" supports "lstm" and "resnet".
config.model_params = dict(
    model_name="resnet",
    input_dim=64,
    use_torch_spec=True,
    log_input=True,
    proj_dim=512,  # embedding dim
)

### Audio Config ###
# To train faster, the model splits the audio into small parts; this parameter
# defines the length, in seconds, of these parts.
config.voice_len = 2.0
# All remaining audio settings (key order kept as-is so the saved config is unchanged).
config.audio = dict(
    fft_size=512,
    win_length=400,
    hop_length=160,
    frame_shift_ms=None,
    frame_length_ms=None,
    stft_pad_mode="reflect",
    sample_rate=16000,
    resample=False,
    preemphasis=0.97,
    ref_level_db=20,
    do_sound_norm=False,
    do_trim_silence=False,
    trim_db=60,
    power=1.5,
    griffin_lim_iters=60,
    num_mels=64,
    mel_fmin=0.0,
    mel_fmax=8000.0,
    spec_gain=20,
    signal_norm=False,
    min_level_db=-100,
    symmetric_norm=False,
    max_norm=4.0,
    clip_norm=False,
    stats_path=None,
    do_rms_norm=True,
    db_level=-27.0,
)


### Augmentation Config ###
def _snr_band(min_snr_in_db, max_snr_in_db):
    """Return one additive-noise entry with the given SNR bounds (in dB).

    "min_num_noises" and "max_num_noises" are both fixed to 1, as in every
    category configured below.
    """
    return {
        "min_snr_in_db": min_snr_in_db,
        "max_snr_in_db": max_snr_in_db,
        "min_num_noises": 1,
        "max_num_noises": 1,
    }


# Additive noise and room impulse response (RIR) simulation similar to:
# https://arxiv.org/pdf/2009.14153.pdf
config.audio_augmentation = {
    # Probability of using one of the augmentations; 0 means disabled.
    "p": 0.5,
    # download: https://www.openslr.org/17/
    # NOTE(review): that link is the MUSAN page; the RIRS_NOISES corpus appears to be
    # at https://www.openslr.org/28/ - confirm.
    "rir": {"rir_path": RIR_SIMULATED_PATH, "conv_mode": "full"},
    "additive": {
        "sounds_path": MUSAN_PATH,
        "speech": _snr_band(13, 20),
        "noise": _snr_band(0, 15),
        "music": _snr_band(5, 15),
    },
    "gaussian": {"p": 0.7, "min_amplitude": 0.0, "max_amplitude": 1e-05},
}

# Write the config to disk so the training script can load it. The output folder is
# created first because nothing else in this script creates it.
# NOTE(review): assumes config.save_json does not create parent directories itself;
# makedirs with exist_ok=True is harmless either way.
os.makedirs(os.path.dirname(CONFIG_OUT_PATH), exist_ok=True)
config.save_json(CONFIG_OUT_PATH)

print(CONFIG_OUT_PATH)

# Build the command as an argument list rather than a shell string, so paths that contain
# spaces or shell metacharacters reach the training script intact. sys.executable keeps the
# training run on the same interpreter (and environment) that is running this script.
command = [sys.executable, "TTS/bin/train_encoder.py", "--config_path", CONFIG_OUT_PATH]
if RESTORE_PATH is not None:
    # Only pass a checkpoint when transfer learning was requested.
    command += ["--restore_path", str(RESTORE_PATH)]

# Run the training and propagate a failing exit status instead of silently discarding it.
completed = subprocess.run(command, check=False)
if completed.returncode != 0:
    sys.exit(completed.returncode)