kiramayatu committed
Commit 5e1fe65
Parent: 9d36a59

Update models.py

Files changed (1): models.py (+31 −22)
models.py CHANGED
@@ -6,9 +6,8 @@ from torch.nn import functional as F
 import commons
 import modules
 import attentions
-import monotonic_align
 
-from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from commons import init_weights, get_padding
 
@@ -59,7 +58,7 @@ class StochasticDurationPredictor(nn.Module):
       flows = self.flows
       assert w is not None
 
-      logdet_tot_q = 0
+      logdet_tot_q = 0
       h_w = self.post_pre(w)
       h_w = self.post_convs(h_w, x_mask)
       h_w = self.post_proj(h_w) * x_mask
@@ -68,7 +67,7 @@ class StochasticDurationPredictor(nn.Module):
       for flow in self.post_flows:
         z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
         logdet_tot_q += logdet_q
-      z_u, z1 = torch.split(z_q, [1, 1], 1)
+      z_u, z1 = torch.split(z_q, [1, 1], 1)
       u = torch.sigmoid(z_u) * x_mask
       z0 = (w - u) * x_mask
       logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
@@ -140,7 +139,8 @@ class TextEncoder(nn.Module):
       n_heads,
       n_layers,
       kernel_size,
-      p_dropout):
+      p_dropout,
+      emotion_embedding):
     super().__init__()
     self.n_vocab = n_vocab
     self.out_channels = out_channels
@@ -150,9 +150,13 @@ class TextEncoder(nn.Module):
     self.n_layers = n_layers
     self.kernel_size = kernel_size
     self.p_dropout = p_dropout
-
-    self.emb = nn.Embedding(n_vocab, hidden_channels)
-    nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
+    self.emotion_embedding = emotion_embedding
+
+    if self.n_vocab!=0:
+      self.emb = nn.Embedding(n_vocab, hidden_channels)
+      if emotion_embedding:
+        self.emotion_emb = nn.Linear(1024, hidden_channels)
+      nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
 
     self.encoder = attentions.Encoder(
       hidden_channels,
@@ -163,8 +167,11 @@ class TextEncoder(nn.Module):
       p_dropout)
     self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
 
-  def forward(self, x, x_lengths):
-    x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
+  def forward(self, x, x_lengths, emotion_embedding=None):
+    if self.n_vocab!=0:
+      x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
+    if emotion_embedding is not None:
+      x = x + self.emotion_emb(emotion_embedding.unsqueeze(1))
     x = torch.transpose(x, 1, -1) # [b, h, t]
     x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
 
@@ -391,7 +398,7 @@ class SynthesizerTrn(nn.Module):
   Synthesizer for Training
   """
 
-  def __init__(self,
+  def __init__(self,
     n_vocab,
     spec_channels,
     segment_size,
@@ -402,15 +409,16 @@ class SynthesizerTrn(nn.Module):
     n_layers,
     kernel_size,
     p_dropout,
-    resblock,
-    resblock_kernel_sizes,
-    resblock_dilation_sizes,
-    upsample_rates,
-    upsample_initial_channel,
+    resblock,
+    resblock_kernel_sizes,
+    resblock_dilation_sizes,
+    upsample_rates,
+    upsample_initial_channel,
     upsample_kernel_sizes,
     n_speakers=0,
     gin_channels=0,
     use_sdp=True,
+    emotion_embedding=False,
    **kwargs):
 
     super().__init__()
@@ -442,7 +450,8 @@ class SynthesizerTrn(nn.Module):
         n_heads,
         n_layers,
         kernel_size,
-        p_dropout)
+        p_dropout,
+        emotion_embedding)
     self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
     self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
     self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
@@ -452,7 +461,7 @@ class SynthesizerTrn(nn.Module):
     else:
       self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
 
-    if n_speakers >= 1:
+    if n_speakers > 1:
       self.emb_g = nn.Embedding(n_speakers, gin_channels)
 
  def forward(self, x, x_lengths, y, y_lengths, sid=None):
@@ -485,7 +494,7 @@ class SynthesizerTrn(nn.Module):
     else:
       logw_ = torch.log(w + 1e-6) * x_mask
       logw = self.dp(x, x_mask, g=g)
-      l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
+      l_length = torch.sum((logw - logw_)**2, [1,2]) / torch.sum(x_mask) # for averaging
 
     # expand prior
     m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
@@ -495,8 +504,8 @@ class SynthesizerTrn(nn.Module):
     o = self.dec(z_slice, g=g)
     return o, l_length, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
 
-  def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None):
-    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
+  def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):
+    x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, emotion_embedding)
     if self.n_speakers > 0:
       g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
     else:
@@ -529,4 +538,4 @@ class SynthesizerTrn(nn.Module):
     z_p = self.flow(z, y_mask, g=g_src)
     z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
     o_hat = self.dec(z_hat * y_mask, g=g_tgt)
-    return o_hat, y_mask, (z, z_p, z_hat)
+    return o_hat, y_mask, (z, z_p, z_hat)
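
Taken together, the diff threads an optional utterance-level emotion vector through the text encoder: TextEncoder gains an emotion_embedding flag plus an nn.Linear(1024, hidden_channels) projection, and SynthesizerTrn.infer forwards the vector to enc_p, where it is broadcast-added to the phoneme embeddings. The sketch below is a minimal, hedged usage example, not part of the commit: the hyperparameter values and the constructor arguments not shown in this diff (inter_channels, filter_channels, etc.) are placeholders following upstream VITS conventions and would normally be read from the checkpoint's JSON config, and the 1024-dimensional vector would come from whatever external emotion encoder the checkpoint was trained with.

# Hedged usage sketch (assumes a models.py matching this commit; all values illustrative)
import torch
from models import SynthesizerTrn

net_g = SynthesizerTrn(
    n_vocab=100,                # placeholder symbol-set size
    spec_channels=513,          # filter_length // 2 + 1 (assumed)
    segment_size=32,
    inter_channels=192,
    hidden_channels=192,
    filter_channels=768,
    n_heads=2,
    n_layers=6,
    kernel_size=3,
    p_dropout=0.1,
    resblock="1",
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2],
    upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    n_speakers=0,
    gin_channels=0,
    emotion_embedding=True)     # enables the new nn.Linear(1024, hidden_channels)
net_g.eval()

x = torch.randint(1, 100, (1, 50))   # [b, t_text] phoneme IDs (dummy)
x_lengths = torch.LongTensor([50])
emo = torch.randn(1, 1024)           # [b, 1024] emotion vector from an external encoder (dummy)

with torch.no_grad():
    # infer unsqueezes emo to [b, 1, 1024] inside TextEncoder.forward, projects it to
    # [b, 1, hidden_channels], and broadcast-adds it over all text positions
    audio = net_g.infer(x, x_lengths, noise_scale=0.667, length_scale=1.0,
                        noise_scale_w=0.8, emotion_embedding=emo)[0][0, 0]

Because emotion_emb produces a [b, 1, hidden_channels] tensor, a single emotion vector conditions every time step of the text sequence; per-phoneme emotion control would in principle require a [b, t, 1024] input instead.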