Spaces:
Runtime error
Runtime error
Merge branch 'main' into ps
Browse files
checkpoints/ps_normal_exp/config.yaml
CHANGED
@@ -82,7 +82,6 @@ fvae_kernel_size: 5
|
|
82 |
fvae_noise_scale: 1.0
|
83 |
fvae_strides: 4
|
84 |
gen_dir_name: ''
|
85 |
-
glow_kernel_size: 3
|
86 |
griffin_lim_iters: 30
|
87 |
hidden_size: 192
|
88 |
hop_size: 256
|
@@ -127,8 +126,6 @@ out_wav_norm: false
|
|
127 |
pitch_extractor: parselmouth
|
128 |
pitch_key: pitch
|
129 |
pitch_type: frame
|
130 |
-
post_decoder: false
|
131 |
-
post_decoder_detach_ling: false
|
132 |
post_flow_lr: 0.001
|
133 |
post_glow_hidden: 192
|
134 |
post_glow_kernel_size: 3
|
@@ -157,8 +154,9 @@ preprocess_args:
|
|
157 |
with_phsep: true
|
158 |
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
159 |
print_nan_grads: false
|
160 |
-
|
161 |
-
|
|
|
162 |
processed_data_dir: data/processed/ljspeech
|
163 |
profile_infer: false
|
164 |
raw_data_dir: data/raw/LJSpeech-1.1
|
|
|
82 |
fvae_noise_scale: 1.0
|
83 |
fvae_strides: 4
|
84 |
gen_dir_name: ''
|
|
|
85 |
griffin_lim_iters: 30
|
86 |
hidden_size: 192
|
87 |
hop_size: 256
|
|
|
126 |
pitch_extractor: parselmouth
|
127 |
pitch_key: pitch
|
128 |
pitch_type: frame
|
|
|
|
|
129 |
post_flow_lr: 0.001
|
130 |
post_glow_hidden: 192
|
131 |
post_glow_kernel_size: 3
|
|
|
154 |
with_phsep: true
|
155 |
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
156 |
print_nan_grads: false
|
157 |
+
prior_flow_hidden: 64
|
158 |
+
prior_flow_kernel_size: 3
|
159 |
+
prior_flow_n_blocks: 4
|
160 |
processed_data_dir: data/processed/ljspeech
|
161 |
profile_infer: false
|
162 |
raw_data_dir: data/raw/LJSpeech-1.1
|
checkpoints/ps_small_exp/config.yaml
CHANGED
@@ -82,7 +82,6 @@ fvae_kernel_size: 3
|
|
82 |
fvae_noise_scale: 1.0
|
83 |
fvae_strides: 4
|
84 |
gen_dir_name: ''
|
85 |
-
glow_kernel_size: 3
|
86 |
griffin_lim_iters: 30
|
87 |
hidden_size: 128
|
88 |
hop_size: 256
|
@@ -127,8 +126,6 @@ out_wav_norm: false
|
|
127 |
pitch_extractor: parselmouth
|
128 |
pitch_key: pitch
|
129 |
pitch_type: frame
|
130 |
-
post_decoder: false
|
131 |
-
post_decoder_detach_ling: false
|
132 |
post_flow_lr: 0.001
|
133 |
post_glow_hidden: 128
|
134 |
post_glow_kernel_size: 3
|
@@ -157,8 +154,9 @@ preprocess_args:
|
|
157 |
with_phsep: true
|
158 |
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
159 |
print_nan_grads: false
|
160 |
-
|
161 |
-
|
|
|
162 |
processed_data_dir: data/processed/ljspeech
|
163 |
profile_infer: false
|
164 |
raw_data_dir: data/raw/LJSpeech-1.1
|
|
|
82 |
fvae_noise_scale: 1.0
|
83 |
fvae_strides: 4
|
84 |
gen_dir_name: ''
|
|
|
85 |
griffin_lim_iters: 30
|
86 |
hidden_size: 128
|
87 |
hop_size: 256
|
|
|
126 |
pitch_extractor: parselmouth
|
127 |
pitch_key: pitch
|
128 |
pitch_type: frame
|
|
|
|
|
129 |
post_flow_lr: 0.001
|
130 |
post_glow_hidden: 128
|
131 |
post_glow_kernel_size: 3
|
|
|
154 |
with_phsep: true
|
155 |
preprocess_cls: egs.datasets.audio.lj.preprocess.LJPreprocess
|
156 |
print_nan_grads: false
|
157 |
+
prior_flow_hidden: 32
|
158 |
+
prior_flow_kernel_size: 3
|
159 |
+
prior_flow_n_blocks: 3
|
160 |
processed_data_dir: data/processed/ljspeech
|
161 |
profile_infer: false
|
162 |
raw_data_dir: data/raw/LJSpeech-1.1
|
egs/egs_bases/tts/ps.yaml
CHANGED
@@ -38,14 +38,12 @@ fvae_enc_n_layers: 8
|
|
38 |
fvae_dec_n_layers: 4
|
39 |
fvae_strides: 4
|
40 |
fvae_noise_scale: 1.0
|
41 |
-
post_decoder: false
|
42 |
-
post_decoder_detach_ling: false
|
43 |
|
44 |
# prior flow
|
45 |
use_prior_flow: true
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
|
50 |
###########################
|
51 |
# training and inference
|
|
|
38 |
fvae_dec_n_layers: 4
|
39 |
fvae_strides: 4
|
40 |
fvae_noise_scale: 1.0
|
|
|
|
|
41 |
|
42 |
# prior flow
|
43 |
use_prior_flow: true
|
44 |
+
prior_flow_hidden: 64
|
45 |
+
prior_flow_kernel_size: 3
|
46 |
+
prior_flow_n_blocks: 4
|
47 |
|
48 |
###########################
|
49 |
# training and inference
|
egs/egs_bases/tts/ps_flow_small.yaml
CHANGED
@@ -30,9 +30,9 @@ fvae_noise_scale: 1.0
|
|
30 |
|
31 |
# prior flow
|
32 |
use_prior_flow: true
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
# post flow
|
37 |
post_glow_hidden: 128
|
38 |
post_glow_kernel_size: 3
|
|
|
30 |
|
31 |
# prior flow
|
32 |
use_prior_flow: true
|
33 |
+
prior_flow_hidden: 32
|
34 |
+
prior_flow_kernel_size: 3
|
35 |
+
prior_flow_n_blocks: 3
|
36 |
# post flow
|
37 |
post_glow_hidden: 128
|
38 |
post_glow_kernel_size: 3
|
modules/tts/portaspeech/portaspeech.py
CHANGED
@@ -74,9 +74,9 @@ class PortaSpeech(FastSpeech):
|
|
74 |
dec_n_layers=hparams['fvae_dec_n_layers'],
|
75 |
c_cond=self.hidden_size,
|
76 |
use_prior_flow=hparams['use_prior_flow'],
|
77 |
-
flow_hidden=hparams['
|
78 |
-
flow_kernel_size=hparams['
|
79 |
-
flow_n_steps=hparams['
|
80 |
strides=[hparams['fvae_strides']],
|
81 |
encoder_type=hparams['fvae_encoder_type'],
|
82 |
decoder_type=hparams['fvae_decoder_type'],
|
@@ -88,11 +88,6 @@ class PortaSpeech(FastSpeech):
|
|
88 |
self.pitch_embed = Embedding(300, self.hidden_size, 0)
|
89 |
if self.hparams['add_word_pos']:
|
90 |
self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)
|
91 |
-
if self.hparams['post_decoder']:
|
92 |
-
self.post_decoder_proj_in = Linear(self.out_dims, self.hidden_size)
|
93 |
-
self.post_decoder = ConditionalConvBlocks(
|
94 |
-
self.hidden_size, self.hidden_size, self.out_dims, None,
|
95 |
-
hparams['dec_kernel_size'], num_layers=4)
|
96 |
|
97 |
def build_embedding(self, dictionary, embed_dim):
|
98 |
num_embeddings = len(dictionary)
|
@@ -188,11 +183,6 @@ class PortaSpeech(FastSpeech):
|
|
188 |
z = torch.randn_like(z)
|
189 |
x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
|
190 |
ret['pre_mel_out'] = x_recon
|
191 |
-
if self.hparams['post_decoder']:
|
192 |
-
x_recon = self.post_decoder_proj_in(x_recon.detach())
|
193 |
-
if self.hparams['post_decoder_detach_ling']:
|
194 |
-
decoder_inp = decoder_inp.detach()
|
195 |
-
x_recon = self.post_decoder(x_recon, decoder_inp) * tgt_nonpadding
|
196 |
return x_recon
|
197 |
|
198 |
def forward_dur(self, dur_input, mel2word, ret, **kwargs):
|
|
|
74 |
dec_n_layers=hparams['fvae_dec_n_layers'],
|
75 |
c_cond=self.hidden_size,
|
76 |
use_prior_flow=hparams['use_prior_flow'],
|
77 |
+
flow_hidden=hparams['prior_flow_hidden'],
|
78 |
+
flow_kernel_size=hparams['prior_flow_kernel_size'],
|
79 |
+
flow_n_steps=hparams['prior_flow_n_blocks'],
|
80 |
strides=[hparams['fvae_strides']],
|
81 |
encoder_type=hparams['fvae_encoder_type'],
|
82 |
decoder_type=hparams['fvae_decoder_type'],
|
|
|
88 |
self.pitch_embed = Embedding(300, self.hidden_size, 0)
|
89 |
if self.hparams['add_word_pos']:
|
90 |
self.word_pos_proj = Linear(self.hidden_size, self.hidden_size)
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
def build_embedding(self, dictionary, embed_dim):
|
93 |
num_embeddings = len(dictionary)
|
|
|
183 |
z = torch.randn_like(z)
|
184 |
x_recon = self.fvae.decoder(z, nonpadding=tgt_nonpadding_BHT, cond=x).transpose(1, 2)
|
185 |
ret['pre_mel_out'] = x_recon
|
|
|
|
|
|
|
|
|
|
|
186 |
return x_recon
|
187 |
|
188 |
def forward_dur(self, dur_input, mel2word, ret, **kwargs):
|
tasks/tts/ps.py
CHANGED
@@ -58,8 +58,6 @@ class PortaSpeechTask(FastSpeechTask):
|
|
58 |
losses_kl = min(self.global_step / hparams['kl_start_steps'], 1) * losses_kl
|
59 |
losses_kl = losses_kl * hparams['lambda_kl']
|
60 |
losses['kl'] = losses_kl
|
61 |
-
if hparams['post_decoder']:
|
62 |
-
self.add_mel_loss(output['pre_mel_out'], sample['mels'], losses, '_post')
|
63 |
self.add_mel_loss(output['mel_out'], sample['mels'], losses)
|
64 |
if hparams['dur_level'] == 'word':
|
65 |
self.add_dur_loss(
|
|
|
58 |
losses_kl = min(self.global_step / hparams['kl_start_steps'], 1) * losses_kl
|
59 |
losses_kl = losses_kl * hparams['lambda_kl']
|
60 |
losses['kl'] = losses_kl
|
|
|
|
|
61 |
self.add_mel_loss(output['mel_out'], sample['mels'], losses)
|
62 |
if hparams['dur_level'] == 'word':
|
63 |
self.add_dur_loss(
|