ccoreilly commited on
Commit
7f0efc6
·
1 Parent(s): 58b3ffd

Afegeix models

Browse files
Dockerfile CHANGED
@@ -12,6 +12,7 @@ RUN cd espeak-ng && \
12
 
13
  COPY requirements.txt .
14
  COPY app.py .
 
15
 
16
  RUN pip install -r requirements.txt
17
 
@@ -20,7 +21,6 @@ RUN mkdir -p cache && chmod 777 cache
20
  ENV NUMBA_CACHE_DIR=./cache
21
  ENV MPLCONFIGDIR=./cache
22
 
23
-
24
  EXPOSE 7860
25
 
26
- CMD python app.py
 
12
 
13
  COPY requirements.txt .
14
  COPY app.py .
15
+ COPY models .
16
 
17
  RUN pip install -r requirements.txt
18
 
 
21
  ENV NUMBA_CACHE_DIR=./cache
22
  ENV MPLCONFIGDIR=./cache
23
 
 
24
  EXPOSE 7860
25
 
26
+ CMD ["python", "app.py"]
models/bsc/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
3
+ size 1038659133
models/bsc/config.json ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
3
+ "logger_uri": null,
4
+ "run_name": "multispeaker_vits_ca_1e4_1e4_32",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": 1000,
13
+ "save_step": 1000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": true,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": true,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": false,
26
+ "epochs": 1000,
27
+ "batch_size": 16,
28
+ "eval_batch_size": 8,
29
+ "grad_clip": [
30
+ 1000.0,
31
+ 1000.0
32
+ ],
33
+ "scheduler_after_epoch": true,
34
+ "lr": 0.001,
35
+ "optimizer": "AdamW",
36
+ "optimizer_params": {
37
+ "betas": [
38
+ 0.8,
39
+ 0.99
40
+ ],
41
+ "eps": 1e-09,
42
+ "weight_decay": 0.01
43
+ },
44
+ "lr_scheduler": "",
45
+ "lr_scheduler_params": null,
46
+ "use_grad_scaler": false,
47
+ "cudnn_enable": true,
48
+ "cudnn_deterministic": false,
49
+ "cudnn_benchmark": false,
50
+ "training_seed": 54321,
51
+ "model": "vits",
52
+ "num_loader_workers": 4,
53
+ "num_eval_loader_workers": 4,
54
+ "use_noise_augment": false,
55
+ "audio": {
56
+ "fft_size": 1024,
57
+ "sample_rate": 22050,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "num_mels": 80,
61
+ "mel_fmin": 0,
62
+ "mel_fmax": null
63
+ },
64
+ "use_phonemes": true,
65
+ "phonemizer": "espeak",
66
+ "phoneme_language": "ca",
67
+ "compute_input_seq_cache": true,
68
+ "text_cleaner": "multilingual_cleaners",
69
+ "enable_eos_bos_chars": false,
70
+ "test_sentences_file": "",
71
+ "phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
72
+ "characters": {
73
+ "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
74
+ "vocab_dict": null,
75
+ "pad": "<PAD>",
76
+ "eos": "<EOS>",
77
+ "bos": "<BOS>",
78
+ "blank": "<BLNK>",
79
+ "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
80
+ "punctuations": "!'(),-.:;? ",
81
+ "phonemes": null,
82
+ "is_unique": false,
83
+ "is_sorted": true
84
+ },
85
+ "add_blank": true,
86
+ "batch_group_size": 5,
87
+ "loss_masking": null,
88
+ "min_audio_len": 1,
89
+ "max_audio_len": Infinity,
90
+ "min_text_len": 1,
91
+ "max_text_len": 325,
92
+ "compute_f0": false,
93
+ "compute_linear_spec": true,
94
+ "precompute_num_workers": 0,
95
+ "start_by_longest": false,
96
+ "datasets": [
97
+ {
98
+ "formatter": "vctk_old",
99
+ "dataset_name": "vctk_old",
100
+ "path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
101
+ "meta_file_train": "",
102
+ "ignored_speakers": [
103
+ "uri",
104
+ "09796",
105
+ "05450"
106
+ ],
107
+ "language": "ca",
108
+ "meta_file_val": "",
109
+ "meta_file_attn_mask": ""
110
+ }
111
+ ],
112
+ "test_sentences": [
113
+ [
114
+ "Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
115
+ ],
116
+ [
117
+ "Preguntin-se si aix\u00f2 era necessari."
118
+ ],
119
+ [
120
+ "La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
121
+ ],
122
+ [
123
+ "\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
124
+ ]
125
+ ],
126
+ "eval_split_max_size": null,
127
+ "eval_split_size": 0.01,
128
+ "use_speaker_weighted_sampler": false,
129
+ "speaker_weighted_sampler_alpha": 1.0,
130
+ "use_language_weighted_sampler": false,
131
+ "language_weighted_sampler_alpha": 1.0,
132
+ "use_length_weighted_sampler": false,
133
+ "length_weighted_sampler_alpha": 1.0,
134
+ "model_args": {
135
+ "num_chars": 131,
136
+ "out_channels": 513,
137
+ "spec_segment_size": 32,
138
+ "hidden_channels": 192,
139
+ "hidden_channels_ffn_text_encoder": 768,
140
+ "num_heads_text_encoder": 2,
141
+ "num_layers_text_encoder": 6,
142
+ "kernel_size_text_encoder": 3,
143
+ "dropout_p_text_encoder": 0.1,
144
+ "dropout_p_duration_predictor": 0.5,
145
+ "kernel_size_posterior_encoder": 5,
146
+ "dilation_rate_posterior_encoder": 1,
147
+ "num_layers_posterior_encoder": 16,
148
+ "kernel_size_flow": 5,
149
+ "dilation_rate_flow": 1,
150
+ "num_layers_flow": 4,
151
+ "resblock_type_decoder": "1",
152
+ "resblock_kernel_sizes_decoder": [
153
+ 3,
154
+ 7,
155
+ 11
156
+ ],
157
+ "resblock_dilation_sizes_decoder": [
158
+ [
159
+ 1,
160
+ 3,
161
+ 5
162
+ ],
163
+ [
164
+ 1,
165
+ 3,
166
+ 5
167
+ ],
168
+ [
169
+ 1,
170
+ 3,
171
+ 5
172
+ ]
173
+ ],
174
+ "upsample_rates_decoder": [
175
+ 8,
176
+ 8,
177
+ 2,
178
+ 2
179
+ ],
180
+ "upsample_initial_channel_decoder": 512,
181
+ "upsample_kernel_sizes_decoder": [
182
+ 16,
183
+ 16,
184
+ 4,
185
+ 4
186
+ ],
187
+ "periods_multi_period_discriminator": [
188
+ 2,
189
+ 3,
190
+ 5,
191
+ 7,
192
+ 11
193
+ ],
194
+ "use_sdp": true,
195
+ "noise_scale": 1.0,
196
+ "inference_noise_scale": 0.667,
197
+ "length_scale": 1.0,
198
+ "noise_scale_dp": 1.0,
199
+ "inference_noise_scale_dp": 1.0,
200
+ "max_inference_len": null,
201
+ "init_discriminator": true,
202
+ "use_spectral_norm_disriminator": false,
203
+ "use_speaker_embedding": true,
204
+ "num_speakers": 257,
205
+ "speakers_file": "/home/user/app/speakers.pth",
206
+ "d_vector_file": null,
207
+ "speaker_embedding_channels": 256,
208
+ "use_d_vector_file": false,
209
+ "d_vector_dim": 0,
210
+ "detach_dp_input": true,
211
+ "use_language_embedding": false,
212
+ "embedded_language_dim": 4,
213
+ "num_languages": 0,
214
+ "language_ids_file": null,
215
+ "use_speaker_encoder_as_loss": false,
216
+ "speaker_encoder_config_path": "",
217
+ "speaker_encoder_model_path": "",
218
+ "condition_dp_on_speaker": true,
219
+ "freeze_encoder": false,
220
+ "freeze_DP": false,
221
+ "freeze_PE": false,
222
+ "freeze_flow_decoder": false,
223
+ "freeze_waveform_decoder": false,
224
+ "encoder_sample_rate": null,
225
+ "interpolate_z": true,
226
+ "reinit_DP": false,
227
+ "reinit_text_encoder": false
228
+ },
229
+ "lr_gen": 0.0001,
230
+ "lr_disc": 0.0001,
231
+ "lr_scheduler_gen": "ExponentialLR",
232
+ "lr_scheduler_gen_params": {
233
+ "gamma": 0.999875,
234
+ "last_epoch": -1
235
+ },
236
+ "lr_scheduler_disc": "ExponentialLR",
237
+ "lr_scheduler_disc_params": {
238
+ "gamma": 0.999875,
239
+ "last_epoch": -1
240
+ },
241
+ "kl_loss_alpha": 1.0,
242
+ "disc_loss_alpha": 1.0,
243
+ "gen_loss_alpha": 1.0,
244
+ "feat_loss_alpha": 1.0,
245
+ "mel_loss_alpha": 45.0,
246
+ "dur_loss_alpha": 1.0,
247
+ "speaker_encoder_loss_alpha": 1.0,
248
+ "return_wav": true,
249
+ "use_weighted_sampler": false,
250
+ "weighted_sampler_attrs": null,
251
+ "weighted_sampler_multipliers": null,
252
+ "r": 1,
253
+ "num_speakers": 257,
254
+ "use_speaker_embedding": true,
255
+ "speakers_file": "/home/user/app/speakers.pth",
256
+ "speaker_embedding_channels": 256,
257
+ "language_ids_file": null,
258
+ "use_language_embedding": false,
259
+ "use_d_vector_file": false,
260
+ "d_vector_file": null,
261
+ "d_vector_dim": 0
262
+ }
models/bsc/speaker_map.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "f_cen_05": "05739",
3
+ "f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
4
+ "f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
5
+ "f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
6
+ "f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
7
+ "m_cen_08": "08935",
8
+ "m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
9
+ "m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
10
+ }
models/bsc/speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
3
+ size 30191
models/collectivat/catotron-ona-TTS-API-entry.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "voice": "ona-fast-hifigan",
3
+ "lang": "ca",
4
+ "model_type": "coqui",
5
+ "tts_config_path": "fast-speech_config.json",
6
+ "tts_model_path": "fast-speech_best_model.pth",
7
+ "vocoder_config_path": "ljspeech--hifigan_v2_config.json",
8
+ "vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
9
+ "load": true
10
+ }
models/collectivat/fast-speech_best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
3
+ size 457921637
models/collectivat/fast-speech_config.json ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
3
+ "logger_uri": null,
4
+ "run_name": "fast_pitch_ljspeech",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": null,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 1000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": false,
26
+ "epochs": 1000,
27
+ "batch_size": 16,
28
+ "eval_batch_size": 16,
29
+ "grad_clip": 5.0,
30
+ "scheduler_after_epoch": true,
31
+ "lr": 0.0001,
32
+ "optimizer": "Adam",
33
+ "optimizer_params": {
34
+ "betas": [
35
+ 0.9,
36
+ 0.998
37
+ ],
38
+ "weight_decay": 1e-06
39
+ },
40
+ "lr_scheduler": "NoamLR",
41
+ "lr_scheduler_params": {
42
+ "warmup_steps": 4000
43
+ },
44
+ "use_grad_scaler": false,
45
+ "cudnn_enable": true,
46
+ "cudnn_deterministic": false,
47
+ "cudnn_benchmark": false,
48
+ "training_seed": 54321,
49
+ "model": "fast_pitch",
50
+ "num_loader_workers": 8,
51
+ "num_eval_loader_workers": 4,
52
+ "use_noise_augment": false,
53
+ "audio": {
54
+ "fft_size": 1024,
55
+ "win_length": 1024,
56
+ "hop_length": 256,
57
+ "frame_shift_ms": null,
58
+ "frame_length_ms": null,
59
+ "stft_pad_mode": "reflect",
60
+ "sample_rate": 22050,
61
+ "resample": false,
62
+ "preemphasis": 0.0,
63
+ "ref_level_db": 20,
64
+ "do_sound_norm": false,
65
+ "log_func": "np.log",
66
+ "do_trim_silence": true,
67
+ "trim_db": 60.0,
68
+ "do_rms_norm": false,
69
+ "db_level": null,
70
+ "power": 1.5,
71
+ "griffin_lim_iters": 60,
72
+ "num_mels": 80,
73
+ "mel_fmin": 0.0,
74
+ "mel_fmax": 8000,
75
+ "spec_gain": 1.0,
76
+ "do_amp_to_db_linear": true,
77
+ "do_amp_to_db_mel": true,
78
+ "pitch_fmax": 640.0,
79
+ "pitch_fmin": 0.0,
80
+ "signal_norm": false,
81
+ "min_level_db": -100,
82
+ "symmetric_norm": true,
83
+ "max_norm": 4.0,
84
+ "clip_norm": true,
85
+ "stats_path": null
86
+ },
87
+ "use_phonemes": false,
88
+ "phonemizer": null,
89
+ "phoneme_language": "ca-es",
90
+ "compute_input_seq_cache": true,
91
+ "text_cleaner": "multilingual_cleaners",
92
+ "enable_eos_bos_chars": false,
93
+ "test_sentences_file": "",
94
+ "phoneme_cache_path": null,
95
+ "characters": {
96
+ "characters_class": "TTS.tts.utils.text.characters.Graphemes",
97
+ "vocab_dict": null,
98
+ "pad": "_",
99
+ "eos": "*",
100
+ "bos": "^",
101
+ "blank": null,
102
+ "characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
103
+ "punctuations": "!'(),-.:;?\u00b7 ",
104
+ "phonemes": "",
105
+ "is_unique": true,
106
+ "is_sorted": true
107
+ },
108
+ "add_blank": false,
109
+ "batch_group_size": 0,
110
+ "loss_masking": null,
111
+ "min_audio_len": 1,
112
+ "max_audio_len": Infinity,
113
+ "min_text_len": 1,
114
+ "max_text_len": Infinity,
115
+ "compute_f0": true,
116
+ "compute_linear_spec": false,
117
+ "precompute_num_workers": 4,
118
+ "start_by_longest": false,
119
+ "datasets": [
120
+ {
121
+ "name": "custom_turkish",
122
+ "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
123
+ "meta_file_train": "upc_ona_train.txt",
124
+ "ignored_speakers": null,
125
+ "language": "",
126
+ "meta_file_val": "",
127
+ "meta_file_attn_mask": ""
128
+ },
129
+ {
130
+ "name": "custom_turkish",
131
+ "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
132
+ "meta_file_train": "upc_ona_val.txt",
133
+ "ignored_speakers": null,
134
+ "language": "",
135
+ "meta_file_val": "",
136
+ "meta_file_attn_mask": ""
137
+ }
138
+ ],
139
+ "test_sentences": [
140
+ "Hola Barcelona!",
141
+ "Escriviu al text."
142
+ ],
143
+ "eval_split_max_size": null,
144
+ "eval_split_size": 0.01,
145
+ "use_speaker_weighted_sampler": false,
146
+ "speaker_weighted_sampler_alpha": 1.0,
147
+ "use_language_weighted_sampler": false,
148
+ "language_weighted_sampler_alpha": 1.0,
149
+ "use_length_weighted_sampler": false,
150
+ "length_weighted_sampler_alpha": 1.0,
151
+ "base_model": "forward_tts",
152
+ "model_args": {
153
+ "num_chars": 89,
154
+ "out_channels": 80,
155
+ "hidden_channels": 384,
156
+ "use_aligner": true,
157
+ "use_pitch": true,
158
+ "pitch_predictor_hidden_channels": 256,
159
+ "pitch_predictor_kernel_size": 3,
160
+ "pitch_predictor_dropout_p": 0.1,
161
+ "pitch_embedding_kernel_size": 3,
162
+ "duration_predictor_hidden_channels": 256,
163
+ "duration_predictor_kernel_size": 3,
164
+ "duration_predictor_dropout_p": 0.1,
165
+ "positional_encoding": true,
166
+ "poisitonal_encoding_use_scale": true,
167
+ "length_scale": 1,
168
+ "encoder_type": "fftransformer",
169
+ "encoder_params": {
170
+ "hidden_channels_ffn": 1024,
171
+ "num_heads": 1,
172
+ "num_layers": 6,
173
+ "dropout_p": 0.1
174
+ },
175
+ "decoder_type": "fftransformer",
176
+ "decoder_params": {
177
+ "hidden_channels_ffn": 1024,
178
+ "num_heads": 1,
179
+ "num_layers": 6,
180
+ "dropout_p": 0.1
181
+ },
182
+ "detach_duration_predictor": false,
183
+ "max_duration": 75,
184
+ "num_speakers": 1,
185
+ "use_speaker_embedding": false,
186
+ "speakers_file": null,
187
+ "use_d_vector_file": false,
188
+ "d_vector_dim": null,
189
+ "d_vector_file": null
190
+ },
191
+ "num_speakers": 0,
192
+ "speakers_file": null,
193
+ "use_speaker_embedding": false,
194
+ "use_d_vector_file": false,
195
+ "d_vector_file": false,
196
+ "d_vector_dim": 0,
197
+ "spec_loss_type": "mse",
198
+ "duration_loss_type": "mse",
199
+ "use_ssim_loss": true,
200
+ "ssim_loss_alpha": 1.0,
201
+ "spec_loss_alpha": 1.0,
202
+ "aligner_loss_alpha": 1.0,
203
+ "pitch_loss_alpha": 0.1,
204
+ "dur_loss_alpha": 0.1,
205
+ "binary_align_loss_alpha": 0.1,
206
+ "binary_loss_warmup_epochs": 150,
207
+ "min_seq_len": 13,
208
+ "max_seq_len": 500000,
209
+ "r": 1,
210
+ "f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
211
+ "restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
212
+ "github_branch": "* dev"
213
+ }
models/collectivat/ljspeech--hifigan_v2_config.json ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_name": "hifigan",
3
+ "run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
4
+
5
+
6
+ // AUDIO PARAMETERS
7
+ "audio":{
8
+ "fft_size": 1024, // number of stft frequency levels. Size of the linear spectrogram frame.
9
+ "win_length": 1024, // stft window length in samples.
10
+ "hop_length": 256, // stft window hop-length in samples.
11
+ "frame_length_ms": null, // stft window length in ms. If null, 'win_length' is used.
12
+ "frame_shift_ms": null, // stft window hop-length in ms. If null, 'hop_length' is used.
13
+
14
+ // Audio processing parameters
15
+ "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
16
+ "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.
17
+ "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
18
+ "log_func": "np.log",
19
+
20
+ // Silence trimming
21
+ "do_trim_silence": false,// enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
22
+ "trim_db": 60, // threshold for trimming silence. Set this according to your dataset.
23
+
24
+ // MelSpectrogram parameters
25
+ "num_mels": 80, // size of the mel spec frame.
26
+ "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
27
+ "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
28
+ "spec_gain": 1.0, // scalar value applied after log transform of spectrogram.
29
+
30
+ // Normalization parameters
31
+ "signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
32
+ "min_level_db": -100, // lower bound for normalization
33
+ "symmetric_norm": true, // move normalization to range [-1, 1]
34
+ "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
35
+ "clip_norm": true, // clip normalized values into the range.
36
+ "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based normalization is used and other normalization params are ignored
37
+ },
38
+
39
+ // DISTRIBUTED TRAINING
40
+ "distributed":{
41
+ "backend": "nccl",
42
+ "url": "tcp:\/\/localhost:54324"
43
+ },
44
+
45
+ // MODEL PARAMETERS
46
+ "use_pqmf": false,
47
+
48
+ // LOSS PARAMETERS
49
+ "use_stft_loss": false,
50
+ "use_subband_stft_loss": false,
51
+ "use_mse_gan_loss": true,
52
+ "use_hinge_gan_loss": false,
53
+ "use_feat_match_loss": true, // use only with melgan discriminators
54
+ "use_l1_spec_loss": true,
55
+
56
+ // loss weights
57
+ "stft_loss_weight": 0,
58
+ "subband_stft_loss_weight": 0,
59
+ "mse_G_loss_weight": 1,
60
+ "hinge_G_loss_weight": 0,
61
+ "feat_match_loss_weight": 10,
62
+ "l1_spec_loss_weight": 45,
63
+
64
+ // multiscale stft loss parameters
65
+ // "stft_loss_params": {
66
+ // "n_ffts": [1024, 2048, 512],
67
+ // "hop_lengths": [120, 240, 50],
68
+ // "win_lengths": [600, 1200, 240]
69
+ // },
70
+
71
+ "l1_spec_loss_params": {
72
+ "use_mel": true,
73
+ "sample_rate": 16000,
74
+ "n_fft": 1024,
75
+ "hop_length": 256,
76
+ "win_length": 1024,
77
+ "n_mels": 80,
78
+ "mel_fmin": 0.0,
79
+ "mel_fmax": null
80
+ },
81
+
82
+ "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
83
+
84
+ // DISCRIMINATOR
85
+ "discriminator_model": "hifigan_discriminator",
86
+ //"discriminator_model_params":{
87
+ // "peroids": [2, 3, 5, 7, 11],
88
+ // "base_channels": 16,
89
+ // "max_channels":512,
90
+ // "downsample_factors":[4, 4, 4]
91
+ //},
92
+ "steps_to_start_discriminator": 0, // steps required to start GAN training.
93
+
94
+ // GENERATOR
95
+ "generator_model": "hifigan_generator",
96
+ "generator_model_params": {
97
+ "resblock_type": "1",
98
+ "upsample_factors": [8,8,2,2],
99
+ "upsample_kernel_sizes": [16,16,4,4],
100
+ "upsample_initial_channel": 128,
101
+ "resblock_kernel_sizes": [3,7,11],
102
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
103
+ },
104
+
105
+ // DATASET
106
+ "data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
107
+ "feature_path": null,
108
+ // "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
109
+ "seq_len": 8192,
110
+ "pad_short": 2000,
111
+ "conv_pad": 0,
112
+ "use_noise_augment": false,
113
+ "use_cache": true,
114
+ "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
115
+
116
+ // TRAINING
117
+ "batch_size": 16, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
118
+
119
+ // VALIDATION
120
+ "run_eval": true,
121
+ "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
122
+ "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
123
+
124
+ // OPTIMIZER
125
+ "epochs": 10000, // total number of epochs to train.
126
+ "wd": 0.0, // Weight decay weight.
127
+ "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
128
+ "disc_clip_grad": -1, // Discriminator gradient clipping threshold.
129
+ // "lr_scheduler_gen": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
130
+ // "lr_scheduler_gen_params": {
131
+ // "gamma": 0.999,
132
+ // "last_epoch": -1
133
+ // },
134
+ // "lr_scheduler_disc": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
135
+ // "lr_scheduler_disc_params": {
136
+ // "gamma": 0.999,
137
+ // "last_epoch": -1
138
+ // },
139
+ "lr_gen": 0.00001, // Initial learning rate. If Noam decay is active, maximum learning rate.
140
+ "lr_disc": 0.00001,
141
+
142
+ // TENSORBOARD and LOGGING
143
+ "print_step": 25, // Number of steps to log training on console.
144
+ "print_eval": false, // If True, it prints loss values for each step in eval run.
145
+ "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
146
+ "checkpoint": true, // If true, it saves checkpoints per "save_step"
147
+ "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
148
+
149
+ // DATA LOADING
150
+ "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
151
+ "num_val_loader_workers": 4, // number of evaluation data loader processes.
152
+ "eval_split_size": 10,
153
+
154
+ // PATHS
155
+ "output_path": "/home/erogol/gdrive/Trainings/sam/"
156
+ }
157
+
158
+
models/collectivat/ljspeech--hifigan_v2_model_file.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
3
+ size 3794153
models/piper/MODEL_CARD ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model card for upc_ona (x-low)
2
+
3
+ * Language: ca (Catalan)
4
+ * Speakers: 1
5
+ * Quality: x-low
6
+ * Samplerate: 16,000Hz
7
+
8
+ ## Dataset
9
+
10
+ * URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
11
+ * License: CC BY-SA 3.0 ES
12
+
13
+ ## Training
14
+
15
+ Trained from scratch.
models/piper/ca-upc_ona-x-low.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
3
+ size 20628813
models/piper/ca-upc_ona-x-low.onnx.json ADDED
@@ -0,0 +1,409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio": {
3
+ "sample_rate": 16000
4
+ },
5
+ "espeak": {
6
+ "voice": "ca"
7
+ },
8
+ "inference": {
9
+ "noise_scale": 0.667,
10
+ "length_scale": 1,
11
+ "noise_w": 0.8
12
+ },
13
+ "phoneme_map": {},
14
+ "phoneme_id_map": {
15
+ "_": [
16
+ 0
17
+ ],
18
+ "^": [
19
+ 1
20
+ ],
21
+ "$": [
22
+ 2
23
+ ],
24
+ " ": [
25
+ 3
26
+ ],
27
+ "!": [
28
+ 4
29
+ ],
30
+ "'": [
31
+ 5
32
+ ],
33
+ "(": [
34
+ 6
35
+ ],
36
+ ")": [
37
+ 7
38
+ ],
39
+ ",": [
40
+ 8
41
+ ],
42
+ "-": [
43
+ 9
44
+ ],
45
+ ".": [
46
+ 10
47
+ ],
48
+ ":": [
49
+ 11
50
+ ],
51
+ ";": [
52
+ 12
53
+ ],
54
+ "?": [
55
+ 13
56
+ ],
57
+ "a": [
58
+ 14
59
+ ],
60
+ "b": [
61
+ 15
62
+ ],
63
+ "c": [
64
+ 16
65
+ ],
66
+ "d": [
67
+ 17
68
+ ],
69
+ "e": [
70
+ 18
71
+ ],
72
+ "f": [
73
+ 19
74
+ ],
75
+ "h": [
76
+ 20
77
+ ],
78
+ "i": [
79
+ 21
80
+ ],
81
+ "j": [
82
+ 22
83
+ ],
84
+ "k": [
85
+ 23
86
+ ],
87
+ "l": [
88
+ 24
89
+ ],
90
+ "m": [
91
+ 25
92
+ ],
93
+ "n": [
94
+ 26
95
+ ],
96
+ "o": [
97
+ 27
98
+ ],
99
+ "p": [
100
+ 28
101
+ ],
102
+ "q": [
103
+ 29
104
+ ],
105
+ "r": [
106
+ 30
107
+ ],
108
+ "s": [
109
+ 31
110
+ ],
111
+ "t": [
112
+ 32
113
+ ],
114
+ "u": [
115
+ 33
116
+ ],
117
+ "v": [
118
+ 34
119
+ ],
120
+ "w": [
121
+ 35
122
+ ],
123
+ "x": [
124
+ 36
125
+ ],
126
+ "y": [
127
+ 37
128
+ ],
129
+ "z": [
130
+ 38
131
+ ],
132
+ "æ": [
133
+ 39
134
+ ],
135
+ "ç": [
136
+ 40
137
+ ],
138
+ "ð": [
139
+ 41
140
+ ],
141
+ "ø": [
142
+ 42
143
+ ],
144
+ "ħ": [
145
+ 43
146
+ ],
147
+ "ŋ": [
148
+ 44
149
+ ],
150
+ "œ": [
151
+ 45
152
+ ],
153
+ "ǀ": [
154
+ 46
155
+ ],
156
+ "ǁ": [
157
+ 47
158
+ ],
159
+ "ǂ": [
160
+ 48
161
+ ],
162
+ "ǃ": [
163
+ 49
164
+ ],
165
+ "ɐ": [
166
+ 50
167
+ ],
168
+ "ɑ": [
169
+ 51
170
+ ],
171
+ "ɒ": [
172
+ 52
173
+ ],
174
+ "ɓ": [
175
+ 53
176
+ ],
177
+ "ɔ": [
178
+ 54
179
+ ],
180
+ "ɕ": [
181
+ 55
182
+ ],
183
+ "ɖ": [
184
+ 56
185
+ ],
186
+ "ɗ": [
187
+ 57
188
+ ],
189
+ "ɘ": [
190
+ 58
191
+ ],
192
+ "ə": [
193
+ 59
194
+ ],
195
+ "ɚ": [
196
+ 60
197
+ ],
198
+ "ɛ": [
199
+ 61
200
+ ],
201
+ "ɜ": [
202
+ 62
203
+ ],
204
+ "ɞ": [
205
+ 63
206
+ ],
207
+ "ɟ": [
208
+ 64
209
+ ],
210
+ "ɠ": [
211
+ 65
212
+ ],
213
+ "ɡ": [
214
+ 66
215
+ ],
216
+ "ɢ": [
217
+ 67
218
+ ],
219
+ "ɣ": [
220
+ 68
221
+ ],
222
+ "ɤ": [
223
+ 69
224
+ ],
225
+ "ɥ": [
226
+ 70
227
+ ],
228
+ "ɦ": [
229
+ 71
230
+ ],
231
+ "ɧ": [
232
+ 72
233
+ ],
234
+ "ɨ": [
235
+ 73
236
+ ],
237
+ "ɪ": [
238
+ 74
239
+ ],
240
+ "ɫ": [
241
+ 75
242
+ ],
243
+ "ɬ": [
244
+ 76
245
+ ],
246
+ "ɭ": [
247
+ 77
248
+ ],
249
+ "ɮ": [
250
+ 78
251
+ ],
252
+ "ɯ": [
253
+ 79
254
+ ],
255
+ "ɰ": [
256
+ 80
257
+ ],
258
+ "ɱ": [
259
+ 81
260
+ ],
261
+ "ɲ": [
262
+ 82
263
+ ],
264
+ "ɳ": [
265
+ 83
266
+ ],
267
+ "ɴ": [
268
+ 84
269
+ ],
270
+ "ɵ": [
271
+ 85
272
+ ],
273
+ "ɶ": [
274
+ 86
275
+ ],
276
+ "ɸ": [
277
+ 87
278
+ ],
279
+ "ɹ": [
280
+ 88
281
+ ],
282
+ "ɺ": [
283
+ 89
284
+ ],
285
+ "ɻ": [
286
+ 90
287
+ ],
288
+ "ɽ": [
289
+ 91
290
+ ],
291
+ "ɾ": [
292
+ 92
293
+ ],
294
+ "ʀ": [
295
+ 93
296
+ ],
297
+ "ʁ": [
298
+ 94
299
+ ],
300
+ "ʂ": [
301
+ 95
302
+ ],
303
+ "ʃ": [
304
+ 96
305
+ ],
306
+ "ʄ": [
307
+ 97
308
+ ],
309
+ "ʈ": [
310
+ 98
311
+ ],
312
+ "ʉ": [
313
+ 99
314
+ ],
315
+ "ʊ": [
316
+ 100
317
+ ],
318
+ "ʋ": [
319
+ 101
320
+ ],
321
+ "ʌ": [
322
+ 102
323
+ ],
324
+ "ʍ": [
325
+ 103
326
+ ],
327
+ "ʎ": [
328
+ 104
329
+ ],
330
+ "ʏ": [
331
+ 105
332
+ ],
333
+ "ʐ": [
334
+ 106
335
+ ],
336
+ "ʑ": [
337
+ 107
338
+ ],
339
+ "ʒ": [
340
+ 108
341
+ ],
342
+ "ʔ": [
343
+ 109
344
+ ],
345
+ "ʕ": [
346
+ 110
347
+ ],
348
+ "ʘ": [
349
+ 111
350
+ ],
351
+ "ʙ": [
352
+ 112
353
+ ],
354
+ "ʛ": [
355
+ 113
356
+ ],
357
+ "ʜ": [
358
+ 114
359
+ ],
360
+ "ʝ": [
361
+ 115
362
+ ],
363
+ "ʟ": [
364
+ 116
365
+ ],
366
+ "ʡ": [
367
+ 117
368
+ ],
369
+ "ʢ": [
370
+ 118
371
+ ],
372
+ "ʲ": [
373
+ 119
374
+ ],
375
+ "ˈ": [
376
+ 120
377
+ ],
378
+ "ˌ": [
379
+ 121
380
+ ],
381
+ "ː": [
382
+ 122
383
+ ],
384
+ "ˑ": [
385
+ 123
386
+ ],
387
+ "˞": [
388
+ 124
389
+ ],
390
+ "β": [
391
+ 125
392
+ ],
393
+ "θ": [
394
+ 126
395
+ ],
396
+ "χ": [
397
+ 127
398
+ ],
399
+ "ᵻ": [
400
+ 128
401
+ ],
402
+ "ⱱ": [
403
+ 129
404
+ ]
405
+ },
406
+ "num_symbols": 130,
407
+ "num_speakers": 1,
408
+ "speaker_id_map": {}
409
+ }