Siddhant committed on
Commit
8ee289a
1 Parent(s): ebcae58

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - vctk
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/vctk_gst_tacotron2`
13
+ ♻️ Imported from https://zenodo.org/record/3986237/
14
+
15
+ This model was trained by kan-bayashi using the vctk/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```bibtex
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/171epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:561f6b5c2136cd747d5d6ecc69706f89fb95dc7e93306f4f177ce483aff2bb37
3
+ size 110376270
exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_gst_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_noise: false
44
+ accum_grad: 1
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ log_interval: null
49
+ pretrain_path: []
50
+ pretrain_key: []
51
+ num_iters_per_epoch: null
52
+ batch_size: 20
53
+ valid_batch_size: null
54
+ batch_bins: 3750000
55
+ valid_batch_bins: null
56
+ train_shape_file:
57
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
58
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
59
+ valid_shape_file:
60
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
61
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
62
+ batch_type: numel
63
+ valid_batch_type: null
64
+ fold_length:
65
+ - 150
66
+ - 240000
67
+ sort_in_batch: descending
68
+ sort_batch: descending
69
+ multiple_iterator: false
70
+ chunk_length: 500
71
+ chunk_shift_ratio: 0.5
72
+ num_cache_chunks: 1024
73
+ train_data_path_and_name_and_type:
74
+ - - dump/raw/tr_no_dev/text
75
+ - text
76
+ - text
77
+ - - dump/raw/tr_no_dev/wav.scp
78
+ - speech
79
+ - sound
80
+ valid_data_path_and_name_and_type:
81
+ - - dump/raw/dev/text
82
+ - text
83
+ - text
84
+ - - dump/raw/dev/wav.scp
85
+ - speech
86
+ - sound
87
+ allow_variable_data_keys: false
88
+ max_cache_size: 0.0
89
+ valid_max_cache_size: null
90
+ optim: adam
91
+ optim_conf:
92
+ lr: 0.001
93
+ eps: 1.0e-06
94
+ weight_decay: 0.0
95
+ scheduler: null
96
+ scheduler_conf: {}
97
+ token_list:
98
+ - <blank>
99
+ - <unk>
100
+ - OY0
101
+ - ''''
102
+ - OY2
103
+ - ER2
104
+ - UH0
105
+ - '!'
106
+ - EY0
107
+ - AW0
108
+ - AA0
109
+ - UH2
110
+ - UW2
111
+ - AY0
112
+ - AO2
113
+ - AO0
114
+ - AE2
115
+ - AH2
116
+ - AE0
117
+ - AA2
118
+ - IY2
119
+ - EH0
120
+ - AW2
121
+ - ZH
122
+ - AY2
123
+ - OY1
124
+ - IH2
125
+ - UW0
126
+ - EY2
127
+ - EH2
128
+ - OW2
129
+ - OW0
130
+ - '?'
131
+ - CH
132
+ - ER1
133
+ - TH
134
+ - UH1
135
+ - AW1
136
+ - JH
137
+ - Y
138
+ - SH
139
+ - NG
140
+ - ','
141
+ - G
142
+ - OW1
143
+ - AO1
144
+ - IY0
145
+ - UW1
146
+ - EY1
147
+ - AY1
148
+ - HH
149
+ - F
150
+ - ER0
151
+ - V
152
+ - P
153
+ - B
154
+ - AH1
155
+ - IY1
156
+ - IH0
157
+ - AA1
158
+ - EH1
159
+ - AE1
160
+ - M
161
+ - W
162
+ - K
163
+ - DH
164
+ - Z
165
+ - .
166
+ - L
167
+ - D
168
+ - IH1
169
+ - R
170
+ - S
171
+ - N
172
+ - T
173
+ - AH0
174
+ - <sos/eos>
175
+ odim: null
176
+ model_conf: {}
177
+ use_preprocessor: true
178
+ token_type: phn
179
+ bpemodel: null
180
+ non_linguistic_symbols: null
181
+ cleaner: tacotron
182
+ g2p: g2p_en_no_space
183
+ feats_extract: fbank
184
+ feats_extract_conf:
185
+ fs: 24000
186
+ fmin: 80
187
+ fmax: 7600
188
+ n_mels: 80
189
+ hop_length: 300
190
+ n_fft: 2048
191
+ win_length: 1200
192
+ normalize: global_mvn
193
+ normalize_conf:
194
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
195
+ tts: tacotron2
196
+ tts_conf:
197
+ embed_dim: 512
198
+ elayers: 1
199
+ eunits: 512
200
+ econv_layers: 3
201
+ econv_chans: 512
202
+ econv_filts: 5
203
+ atype: location
204
+ adim: 512
205
+ aconv_chans: 32
206
+ aconv_filts: 15
207
+ cumulate_att_w: true
208
+ dlayers: 2
209
+ dunits: 1024
210
+ prenet_layers: 2
211
+ prenet_units: 256
212
+ postnet_layers: 5
213
+ postnet_chans: 512
214
+ postnet_filts: 5
215
+ output_activation: null
216
+ use_batch_norm: true
217
+ use_concate: true
218
+ use_residual: false
219
+ use_gst: true
220
+ gst_heads: 8
221
+ gst_tokens: 128
222
+ dropout_rate: 0.5
223
+ zoneout_rate: 0.1
224
+ reduction_factor: 1
225
+ spk_embed_dim: null
226
+ use_masking: true
227
+ bce_pos_weight: 10.0
228
+ use_guided_attn_loss: true
229
+ guided_attn_loss_sigma: 0.4
230
+ guided_attn_loss_lambda: 1.0
231
+ required:
232
+ - output_dir
233
+ - token_list
234
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/171epoch.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1597459309.635388
6
+ torch: 1.5.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/config.yaml