ahnafsamin committed 055be7c (parent: 59ff0f5): Update README.md

Files changed (1): README.md (+287 -0)
---
tags:
- text-to-speech
- gronings
- FastSpeech 2
language: gos
datasets:
- gronings
license: afl-3.0
widget:
- text: "This seems a very pleasant place, and I think I shall enjoy myself very much."
---

## GroTTS Model

This model was trained with the [FastSpeech 2](https://arxiv.org/abs/2006.04558) architecture on approximately 2 hours of Gronings TTS data. For the best results, download the vocoder separately from [here](https://huggingface.co/ahnafsamin/parallelwavegan-gronings) and then use the following code:
```
from espnet2.bin.tts_inference import Text2Speech
from scipy.io.wavfile import write

# Load the FastSpeech 2 model (.pth) together with the ParallelWaveGAN vocoder (.pkl)
model = Text2Speech.from_pretrained(
    model_file="path_to_the_model_file_in_pth_format",
    vocoder_file="path_to_the_vocoder_file_in_pkl_format"
)

# Synthesize speech and write it to a 22.05 kHz WAV file
output = model("This is a simple test.")
write("x.wav", 22050, output['wav'].numpy())
```
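
If you would rather fetch the checkpoints programmatically, the sketch below downloads them with `huggingface_hub` first. The model repository id and file names here are illustrative assumptions, not confirmed by this card; check the actual file listings on the Hub and substitute the real names.

```
from huggingface_hub import hf_hub_download
from espnet2.bin.tts_inference import Text2Speech
from scipy.io.wavfile import write

# NOTE: the repo id and file names below are hypothetical placeholders;
# replace them with the entries actually listed in the model and vocoder repos.
model_path = hf_hub_download(repo_id="ahnafsamin/GroTTS-FastSpeech2",
                             filename="train.loss.ave_5best.pth")
vocoder_path = hf_hub_download(repo_id="ahnafsamin/parallelwavegan-gronings",
                               filename="checkpoint-400000steps.pkl")

model = Text2Speech.from_pretrained(model_file=model_path, vocoder_file=vocoder_path)

# Synthesize the example sentence from the widget metadata above
output = model("This seems a very pleasant place, and I think I shall enjoy myself very much.")
write("gronings_sample.wav", 22050, output["wav"].numpy())
```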

## TTS config

<details><summary>expand</summary>

```
config: conf/tuning/train_fastspeech2.yaml
print_config: false
log_level: INFO
dry_run: false
iterator_type: sequence
output_dir: exp/tts_train_fastspeech2_raw_char_tacotron
ngpu: 1
seed: 0
num_workers: 1
num_att_plot: 3
dist_backend: nccl
dist_init_method: env://
dist_world_size: null
dist_rank: null
local_rank: 0
dist_master_addr: null
dist_master_port: null
dist_launcher: null
multiprocessing_distributed: false
unused_parameters: false
sharded_ddp: false
cudnn_enabled: true
cudnn_benchmark: false
cudnn_deterministic: true
collect_stats: false
write_collected_feats: false
max_epoch: 1000
patience: null
val_scheduler_criterion:
- valid
- loss
early_stopping_criterion:
- valid
- loss
- min
best_model_criterion:
- - valid
  - loss
  - min
- - train
  - loss
  - min
keep_nbest_models: 5
nbest_averaging_interval: 0
grad_clip: 1.0
grad_clip_type: 2.0
grad_noise: false
accum_grad: 8
no_forward_run: false
resume: true
train_dtype: float32
use_amp: false
log_interval: null
use_matplotlib: true
use_tensorboard: true
use_wandb: false
wandb_project: null
wandb_id: null
wandb_entity: null
wandb_name: null
wandb_model_log_interval: -1
detect_anomaly: false
pretrain_path: null
init_param: []
ignore_init_mismatch: false
freeze_param: []
num_iters_per_epoch: 800
batch_size: 20
valid_batch_size: null
batch_bins: 3000000
valid_batch_bins: null
train_shape_file:
- exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/text_shape.char
- exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/speech_shape
valid_shape_file:
- exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/text_shape.char
- exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/speech_shape
batch_type: numel
valid_batch_type: null
fold_length:
- 150
- 204800
sort_in_batch: descending
sort_batch: descending
multiple_iterator: false
chunk_length: 500
chunk_shift_ratio: 0.5
num_cache_chunks: 1024
train_data_path_and_name_and_type:
- - dump/raw/tr_no_dev/text
  - text
  - text
- - exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
  - durations
  - text_int
- - dump/raw/tr_no_dev/wav.scp
  - speech
  - sound
- - exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/collect_feats/pitch.scp
  - pitch
  - npy
- - exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/collect_feats/energy.scp
  - energy
  - npy
valid_data_path_and_name_and_type:
- - dump/raw/dev/text
  - text
  - text
- - exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
  - durations
  - text_int
- - dump/raw/dev/wav.scp
  - speech
  - sound
- - exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/collect_feats/pitch.scp
  - pitch
  - npy
- - exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/collect_feats/energy.scp
  - energy
  - npy
allow_variable_data_keys: false
max_cache_size: 0.0
max_cache_fd: 32
valid_max_cache_size: null
optim: adam
optim_conf:
    lr: 1.0
scheduler: noamlr
scheduler_conf:
    model_size: 384
    warmup_steps: 4000
token_list:
- <blank>
- <unk>
- <space>
- E
- N
- A
- O
- T
- I
- R
- D
- L
- S
- K
- M
- G
- U
- H
- .
- W
- V
- Z
- P
- B
- ','
- J
- C
- F
- '?'
- ''''
- '!'
- Y
- X
- '`'
- <sos/eos>
odim: null
model_conf: {}
use_preprocessor: true
token_type: char
bpemodel: null
non_linguistic_symbols: null
cleaner: tacotron
g2p: g2p_en
feats_extract: fbank
feats_extract_conf:
    n_fft: 1024
    hop_length: 256
    win_length: null
    fs: 22050
    fmin: 80
    fmax: 7600
    n_mels: 80
normalize: global_mvn
normalize_conf:
    stats_file: exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz
tts: fastspeech2
tts_conf:
    adim: 384
    aheads: 2
    elayers: 4
    eunits: 1536
    dlayers: 4
    dunits: 1536
    positionwise_layer_type: conv1d
    positionwise_conv_kernel_size: 3
    duration_predictor_layers: 2
    duration_predictor_chans: 256
    duration_predictor_kernel_size: 3
    postnet_layers: 5
    postnet_filts: 5
    postnet_chans: 256
    use_masking: true
    use_scaled_pos_enc: true
    encoder_normalize_before: true
    decoder_normalize_before: true
    reduction_factor: 1
    init_type: xavier_uniform
    init_enc_alpha: 1.0
    init_dec_alpha: 1.0
    transformer_enc_dropout_rate: 0.2
    transformer_enc_positional_dropout_rate: 0.2
    transformer_enc_attn_dropout_rate: 0.2
    transformer_dec_dropout_rate: 0.2
    transformer_dec_positional_dropout_rate: 0.2
    transformer_dec_attn_dropout_rate: 0.2
    pitch_predictor_layers: 5
    pitch_predictor_chans: 256
    pitch_predictor_kernel_size: 5
    pitch_predictor_dropout: 0.5
    pitch_embed_kernel_size: 1
    pitch_embed_dropout: 0.0
    stop_gradient_from_pitch_predictor: true
    energy_predictor_layers: 2
    energy_predictor_chans: 256
    energy_predictor_kernel_size: 3
    energy_predictor_dropout: 0.5
    energy_embed_kernel_size: 1
    energy_embed_dropout: 0.0
    stop_gradient_from_energy_predictor: false
pitch_extract: dio
pitch_extract_conf:
    fs: 22050
    n_fft: 1024
    hop_length: 256
    f0max: 400
    f0min: 80
    reduction_factor: 1
pitch_normalize: global_mvn
pitch_normalize_conf:
    stats_file: exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz
energy_extract: energy
energy_extract_conf:
    fs: 22050
    n_fft: 1024
    hop_length: 256
    win_length: null
    reduction_factor: 1
energy_normalize: global_mvn
energy_normalize_conf:
    stats_file: exp/tts_train_raw_char_tacotron/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz
required:
- output_dir
- token_list
version: 0.10.7a1
distributed: false

```

</details>
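
For orientation, the `feats_extract_conf` settings above (fs 22050, n_fft 1024, hop_length 256, 80 mel bins between 80 and 7600 Hz) correspond roughly to the log-mel extraction sketched below. This is an illustrative approximation using librosa, not the exact ESPnet feature pipeline used during training.

```
import librosa
import numpy as np

# Approximates the fbank settings from the config above; ESPnet's own
# LogMelFbank implementation may differ in windowing and normalization.
wav, sr = librosa.load("x.wav", sr=22050)
mel = librosa.feature.melspectrogram(
    y=wav, sr=sr, n_fft=1024, hop_length=256,
    n_mels=80, fmin=80, fmax=7600,
)
log_mel = np.log10(np.maximum(mel, 1e-10))
print(log_mel.shape)  # (80, num_frames), one frame every 256 samples (~11.6 ms)
```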