Spaces (Runtime error)

kiramayatu committed
Commit 5e1fe65 • Parent(s): 9d36a59
Update models.py
models.py CHANGED

@@ -6,9 +6,8 @@ from torch.nn import functional as F
 import commons
 import modules
 import attentions
-import monotonic_align
 
-from torch.nn import Conv1d, ConvTranspose1d,
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from commons import init_weights, get_padding
 
@@ -59,7 +58,7 @@ class StochasticDurationPredictor(nn.Module):
       flows = self.flows
       assert w is not None
 
-      logdet_tot_q = 0
+      logdet_tot_q = 0
       h_w = self.post_pre(w)
       h_w = self.post_convs(h_w, x_mask)
       h_w = self.post_proj(h_w) * x_mask
@@ -68,7 +67,7 @@ class StochasticDurationPredictor(nn.Module):
       for flow in self.post_flows:
         z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
         logdet_tot_q += logdet_q
-      z_u, z1 = torch.split(z_q, [1, 1], 1)
+      z_u, z1 = torch.split(z_q, [1, 1], 1)
       u = torch.sigmoid(z_u) * x_mask
       z0 = (w - u) * x_mask
       logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
@@ -140,7 +139,8 @@ class TextEncoder(nn.Module):
       n_heads,
       n_layers,
       kernel_size,
-      p_dropout):
+      p_dropout,
+      emotion_embedding):
     super().__init__()
     self.n_vocab = n_vocab
     self.out_channels = out_channels
@@ -150,9 +150,13 @@ class TextEncoder(nn.Module):
     self.n_layers = n_layers
     self.kernel_size = kernel_size
     self.p_dropout = p_dropout
-
-    self.emb = nn.Embedding(n_vocab, hidden_channels)
-    nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
+    self.emotion_embedding = emotion_embedding
+
+    if self.n_vocab!=0:
+      self.emb = nn.Embedding(n_vocab, hidden_channels)
+      if emotion_embedding:
+        self.emotion_emb = nn.Linear(1024, hidden_channels)
+      nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
 
     self.encoder = attentions.Encoder(
       hidden_channels,
@@ -163,8 +167,11 @@ class TextEncoder(nn.Module):
       p_dropout)
     self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
 
-  def forward(self, x, x_lengths):
-    x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
+  def forward(self, x, x_lengths, emotion_embedding=None):
+    if self.n_vocab!=0:
+      x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
+    if emotion_embedding is not None:
+      x = x + self.emotion_emb(emotion_embedding.unsqueeze(1))
     x = torch.transpose(x, 1, -1) # [b, h, t]
     x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
 
@@ -391,7 +398,7 @@ class SynthesizerTrn(nn.Module):
   Synthesizer for Training
   """
 
-  def __init__(self,
+  def __init__(self,
     n_vocab,
     spec_channels,
     segment_size,
@@ -402,15 +409,16 @@ class SynthesizerTrn(nn.Module):
     n_layers,
     kernel_size,
     p_dropout,
-    resblock,
-    resblock_kernel_sizes,
-    resblock_dilation_sizes,
-    upsample_rates,
-    upsample_initial_channel,
+    resblock,
+    resblock_kernel_sizes,
+    resblock_dilation_sizes,
+    upsample_rates,
+    upsample_initial_channel,
     upsample_kernel_sizes,
     n_speakers=0,
     gin_channels=0,
     use_sdp=True,
+    emotion_embedding=False,
     **kwargs):
 
     super().__init__()
@@ -442,7 +450,8 @@ class SynthesizerTrn(nn.Module):
         n_heads,
         n_layers,
         kernel_size,
-        p_dropout)
+        p_dropout,
+        emotion_embedding)
     self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
     self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
     self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
@@ -452,7 +461,7 @@ class SynthesizerTrn(nn.Module):
     else:
       self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
 
-    if n_speakers
+    if n_speakers > 1:
       self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
   def forward(self, x, x_lengths, y, y_lengths, sid=None):
@@ -485,7 +494,7 @@ class SynthesizerTrn(nn.Module):
     else:
       logw_ = torch.log(w + 1e-6) * x_mask
       logw = self.dp(x, x_mask, g=g)
-      l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
+      l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
 
     # expand prior
     m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
@@ -495,8 +504,8 @@ class SynthesizerTrn(nn.Module):
     o = self.dec(z_slice, g=g)
     return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
 
-  def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
-    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
+  def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):
+    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
     if self.n_speakers > 0:
       g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
     else:
@@ -529,4 +538,4 @@ class SynthesizerTrn(nn.Module):
     z_p = self.flow(z, y_mask, g=g_src)
     z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
     o_hat = self.dec(z_hat * y_mask, g=g_tgt)
-    return o_hat, y_mask, (z, z_p, z_hat)
+    return o_hat, y_mask, (z, z_p, z_hat)
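
Taken together, the commit threads an optional emotion-conditioning path through the model: TextEncoder gains an emotion_embedding flag and a Linear(1024, hidden_channels) projection that is added to the token embeddings, SynthesizerTrn exposes the same flag in its constructor, and infer accepts an emotion_embedding tensor that it forwards to the text encoder. Below is a minimal inference sketch of how the new argument might be used. It is not part of the commit: it assumes the helper functions from the upstream VITS code base (utils.get_hparams_from_file, utils.load_checkpoint, text.symbols), and the file names config.json, G_latest.pth and emotion.npy as well as the source of the 1024-dim emotion vector are hypothetical.

import numpy as np
import torch

import utils                      # upstream VITS helpers
from models import SynthesizerTrn
from text.symbols import symbols

# Hypothetical paths; replace with the Space's actual config and checkpoint.
hps = utils.get_hparams_from_file("config.json")

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    emotion_embedding=True,   # enable the new TextEncoder emotion path
    **hps.model)              # assumes emotion_embedding is not also set in hps.model
net_g.eval()
utils.load_checkpoint("G_latest.pth", net_g, None)

x = torch.LongTensor([[1, 2, 3, 4]])        # placeholder phoneme ids
x_lengths = torch.LongTensor([x.size(1)])

# Precomputed 1024-dim emotion vector (e.g. from an external emotion encoder),
# reshaped to [1, 1024] so TextEncoder can broadcast it over the time axis.
emo = torch.FloatTensor(np.load("emotion.npy")).unsqueeze(0)

with torch.no_grad():
    # For a multi-speaker checkpoint, also pass sid=torch.LongTensor([speaker_id]).
    audio = net_g.infer(x, x_lengths,
                        noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0,
                        emotion_embedding=emo)[0][0, 0].cpu().numpy()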