MarcusSu1216 committed on
Commit
4724805
·
1 Parent(s): eea8df5

Upload 16 files

Browse files
modules/__init__.py ADDED
File without changes
modules/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (125 Bytes). View file
 
modules/__pycache__/attentions.cpython-38.pyc ADDED
Binary file (10.6 kB). View file
 
modules/__pycache__/commons.cpython-38.pyc ADDED
Binary file (6.62 kB). View file
 
modules/__pycache__/crepe.cpython-38.pyc ADDED
Binary file (8.7 kB). View file
 
modules/__pycache__/enhancer.cpython-38.pyc ADDED
Binary file (3.34 kB). View file
 
modules/__pycache__/losses.cpython-38.pyc ADDED
Binary file (1.53 kB). View file
 
modules/__pycache__/mel_processing.cpython-38.pyc ADDED
Binary file (3.45 kB). View file
 
modules/__pycache__/modules.cpython-38.pyc ADDED
Binary file (10.1 kB). View file
 
modules/attentions.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ import modules.commons as commons
9
+ import modules.modules as modules
10
+ from modules.modules import LayerNorm
11
+
12
+
13
class FFT(nn.Module):
    """Stack of causally masked self-attention and causal feed-forward layers.

    Every layer is: self-attention -> dropout -> residual + LayerNorm ->
    FFN -> dropout -> residual + LayerNorm.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0.,
                 proximal_bias=False, proximal_init=True, **kwargs):
        """Build ``n_layers`` attention/FFN pairs; extra ``kwargs`` are ignored."""
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        for _ in range(self.n_layers):
            attn = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads,
                p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)
            self.self_attn_layers.append(attn)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            ffn = FFN(
                hidden_channels, hidden_channels, filter_channels, kernel_size,
                p_dropout=p_dropout, causal=True)
            self.ffn_layers.append(ffn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run the layer stack on ``x`` and return the masked result.

        x: input sequence (presumably [b, hidden_channels, t] -- confirm with callers)
        x_mask: multiplicative mask; its size along dim 2 sets the causal mask length
        """
        # Lower-triangular mask so each position attends only to itself and the past.
        causal_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        x = x * x_mask
        layers = zip(self.self_attn_layers, self.norm_layers_0, self.ffn_layers, self.norm_layers_1)
        for attn, norm_attn, ffn, norm_ffn in layers:
            attended = self.drop(attn(x, x, causal_mask))
            x = norm_attn(x + attended)

            transformed = self.drop(ffn(x, x_mask))
            x = norm_ffn(x + transformed)
        return x * x_mask
57
+
58
+
59
class Encoder(nn.Module):
    """Stack of windowed relative-position self-attention and FFN layers.

    Every layer is: self-attention -> dropout -> residual + LayerNorm ->
    FFN -> dropout -> residual + LayerNorm.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
        """Build ``n_layers`` attention/FFN pairs; extra ``kwargs`` are ignored."""
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            attn = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads,
                p_dropout=p_dropout, window_size=window_size)
            self.attn_layers.append(attn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            ffn = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)
            self.ffn_layers.append(ffn)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run the layer stack on ``x`` and return the masked result.

        x: input sequence (presumably [b, hidden_channels, t] -- confirm with callers)
        x_mask: multiplicative mask broadcastable against ``x``
        """
        # Outer product of the mask with itself: a pair is attendable only if both ends are valid.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2)
        for attn, norm_attn, ffn, norm_ffn in layers:
            attended = self.drop(attn(x, x, attn_mask))
            x = norm_attn(x + attended)

            transformed = self.drop(ffn(x, x_mask))
            x = norm_ffn(x + transformed)
        return x * x_mask
94
+
95
+
96
class Decoder(nn.Module):
    """Stack of causal self-attention, encoder-decoder attention and causal FFN layers.

    Each sub-layer is followed by dropout, a residual connection and a LayerNorm.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
        """Build ``n_layers`` decoder layers; extra ``kwargs`` are ignored."""
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            self_attn = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads,
                p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)
            self.self_attn_layers.append(self_attn)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            cross_attn = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)
            self.encdec_attn_layers.append(cross_attn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            ffn = FFN(
                hidden_channels, hidden_channels, filter_channels, kernel_size,
                p_dropout=p_dropout, causal=True)
            self.ffn_layers.append(ffn)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """Decode ``x`` while attending to ``h``.

        x: decoder input
        x_mask: multiplicative mask for ``x``
        h: encoder output
        h_mask: multiplicative mask for ``h``
        """
        # Lower-triangular mask for the self-attention sub-layer.
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        # A decoder/encoder position pair is attendable only if both positions are valid.
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        layers = zip(
            self.self_attn_layers, self.norm_layers_0,
            self.encdec_attn_layers, self.norm_layers_1,
            self.ffn_layers, self.norm_layers_2)
        for self_attn, norm_0, cross_attn, norm_1, ffn, norm_2 in layers:
            own = self.drop(self_attn(x, x, self_attn_mask))
            x = norm_0(x + own)

            cross = self.drop(cross_attn(x, h, encdec_attn_mask))
            x = norm_1(x + cross)

            transformed = self.drop(ffn(x, x_mask))
            x = norm_2(x + transformed)
        return x * x_mask
145
+
146
+
147
class MultiHeadAttention(nn.Module):
    """Multi-head attention over [b, d, t] tensors using 1x1 convolutions.

    Optional extras: windowed relative-position embeddings (``window_size``),
    a proximal bias favouring nearby positions, and a banded local-attention
    mask (``block_length``). The last attention map is kept in ``self.attn``.
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding table shared by all heads, or one table per head.
            rel_heads = 1 if heads_share else n_heads
            rel_shape = (rel_heads, window_size * 2 + 1, self.k_channels)
            rel_scale = self.k_channels ** -0.5
            self.emb_rel_k = nn.Parameter(torch.randn(*rel_shape) * rel_scale)
            self.emb_rel_v = nn.Parameter(torch.randn(*rel_shape) * rel_scale)

        for projection in (self.conv_q, self.conv_k, self.conv_v):
            nn.init.xavier_uniform_(projection.weight)
        if proximal_init:
            # Start with identical query and key projections.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values) and project the result."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        attended, self.attn = self.attention(q, k, v, mask=attn_mask)
        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; returns (output [b, d, t_t], weights [b, n_h, t_t, t_s])."""
        b, d, t_s = key.size()
        t_t = query.size(2)

        def split_heads(tensor, steps):
            # [b, d, t] -> [b, n_h, t, d_k]
            return tensor.view(b, self.n_heads, self.k_channels, steps).transpose(2, 3)

        query = split_heads(query, t_t)
        key = split_heads(key, t_s)
        value = split_heads(value, t_s)

        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            rel_keys = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(scaled_query, rel_keys)
            scores = scores + self._relative_position_to_absolute_position(rel_logits)
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            bias = self._attention_bias_proximal(t_s)
            scores = scores + bias.to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                # Keep only a band of +/- block_length around the diagonal.
                band = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(band == 0, -1e4)
        p_attn = self.drop(F.softmax(scores, dim=-1))  # [b, n_h, t_t, t_s]
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            rel_weights = self._absolute_position_to_relative_position(p_attn)
            rel_values = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(rel_weights, rel_values)
        # [b, n_h, t_t, d_k] -> [b, d, t_t]
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Return the 2*length-1 embeddings covering offsets -(length-1)..(length-1).

        The table is zero-padded along its position axis when ``length`` exceeds
        the window, and sliced when it is shorter.
        """
        span = self.window_size + 1
        # Pad first, then slice, so no conditional ops are needed on the slice itself.
        pad_length = max(length - span, 0)
        start = max(span - length, 0)
        end = start + 2 * length - 1
        if pad_length > 0:
            relative_embeddings = F.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0])
        return relative_embeddings[:, start:end]

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Append one pad column so that flattening shifts each row by one position.
        x = F.pad(x, [0, 1, 0, 0, 0, 0, 0, 0])

        # Add trailing elements so the flat buffer reshapes to (len+1, 2*len-1).
        flat = x.view([batch, heads, length * 2 * length])
        flat = F.pad(flat, [0, length - 1, 0, 0, 0, 0])

        # Reshape and slice out the padded elements.
        skewed = flat.view([batch, heads, length + 1, 2 * length - 1])
        return skewed[:, :, :length, length - 1:]

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # Pad along the column axis.
        x = F.pad(x, [0, length - 1, 0, 0, 0, 0, 0, 0])
        flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
        # Leading zeros skew the elements after the reshape.
        flat = F.pad(flat, [length, 0, 0, 0, 0, 0])
        return flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        positions = torch.arange(length, dtype=torch.float32)
        distance = torch.abs(positions[None, :] - positions[:, None])
        return (-torch.log1p(distance))[None, None]
301
+
302
+
303
class FFN(nn.Module):
    """Two-layer convolutional feed-forward block with causal or "same" padding.

    The input is masked before each convolution and the output is masked again,
    so the sequence length is preserved and masked positions stay zero.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Pick the padding strategy once; forward() just calls self.padding.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking with ``x_mask`` throughout."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            # Sigmoid approximation of GELU.
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        out = self.conv_2(self.padding(hidden * x_mask))
        return out * x_mask

    def _causal_padding(self, x):
        """Left-pad the last axis by kernel_size - 1 so no output sees future inputs."""
        if self.kernel_size == 1:
            return x
        return F.pad(x, [self.kernel_size - 1, 0, 0, 0, 0, 0])

    def _same_padding(self, x):
        """Pad both sides of the last axis so the convolution preserves length."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, [left, right, 0, 0, 0, 0])
modules/commons.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
def slice_pitch_segments(x, ids_str, segment_size=4):
    """Cut one ``segment_size``-long window per batch row out of a 2-D tensor.

    Row ``i`` of the result is ``x[i, ids_str[i]:ids_str[i] + segment_size]``.
    """
    out = torch.zeros_like(x[:, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        out[row] = x[row, start:start + segment_size]
    return out
14
+
15
def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
    """Randomly crop aligned windows from ``x`` ([b, d, t]) and ``pitch``.

    Returns ``(x_segments, pitch_segments, start_indices)``; the same random
    start index is used for both tensors in each batch row.
    """
    batch, _, total = x.size()
    lengths = total if x_lengths is None else x_lengths
    # Exclusive upper bound for the start index.
    max_start = lengths - segment_size + 1
    uniform = torch.rand([batch]).to(device=x.device)
    ids_str = (uniform * max_start).to(dtype=torch.long)
    x_segments = slice_segments(x, ids_str, segment_size)
    pitch_segments = slice_pitch_segments(pitch, ids_str, segment_size)
    return x_segments, pitch_segments, ids_str
24
+
25
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise ``m.weight`` from N(mean, std) when the class name contains "Conv".

    Modules of any other class are left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
29
+
30
+
31
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    span = kernel_size * dilation - dilation
    return int(span / 2)
33
+
34
+
35
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs into a last-dimension-first list.

    ``[[a0, a1], [b0, b1]]`` becomes ``[b0, b1, a0, a1]``.
    """
    flat = []
    for pair in pad_shape[::-1]:
        flat.extend(pair)
    return flat
39
+
40
+
41
def intersperse(lst, item):
    """Return a list with ``item`` before, between and after the elements of ``lst``."""
    result = [item] * (2 * len(lst) + 1)
    for position, element in enumerate(lst):
        # Original elements land on the odd indices.
        result[2 * position + 1] = element
    return result
45
+
46
+
47
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q) computed element-wise from means and log standard deviations."""
    kl = (logs_q - logs_p) - 0.5
    variance_p = torch.exp(2. * logs_p)
    squared_mean_gap = (m_p - m_q) ** 2
    inverse_variance_q = torch.exp(-2. * logs_q)
    kl += 0.5 * (variance_p + squared_mean_gap) * inverse_variance_q
    return kl
52
+
53
+
54
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Squeeze the uniform draw into [0.00001, 0.99999) so neither log hits 0.
    uniform = torch.rand(shape) * 0.99998 + 0.00001
    inner = torch.log(uniform)
    return -torch.log(-inner)
58
+
59
+
60
def rand_gumbel_like(x):
    """Sample Gumbel noise with the size, dtype and device of ``x``."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
63
+
64
+
65
def slice_segments(x, ids_str, segment_size=4):
    """Cut one ``segment_size``-long window along the last axis per batch row.

    Row ``i`` of the result is ``x[i, :, ids_str[i]:ids_str[i] + segment_size]``.
    """
    out = torch.zeros_like(x[:, :, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        out[row] = x[row, :, start:start + segment_size]
    return out
72
+
73
+
74
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Randomly crop one ``segment_size`` window per batch row of ``x`` ([b, d, t]).

    Returns ``(segments, start_indices)``.
    """
    batch, _, total = x.size()
    lengths = total if x_lengths is None else x_lengths
    # Exclusive upper bound for the start index.
    max_start = lengths - segment_size + 1
    uniform = torch.rand([batch]).to(device=x.device)
    ids_str = (uniform * max_start).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str
82
+
83
+
84
def rand_spec_segments(x, x_lengths=None, segment_size=4):
    """Randomly crop one ``segment_size`` window per batch row of ``x`` ([b, d, t]).

    Unlike ``rand_slice_segments`` the start-index bound here is
    ``x_lengths - segment_size`` (no ``+ 1``). Returns ``(segments, start_indices)``.
    """
    batch, _, total = x.size()
    lengths = total if x_lengths is None else x_lengths
    max_start = lengths - segment_size
    uniform = torch.rand([batch]).to(device=x.device)
    ids_str = (uniform * max_start).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str
92
+
93
+
94
def get_timing_signal_1d(
        length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal position signal of shape [1, channels, length].

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines of the position scaled by geometrically spaced
    timescales from ``min_timescale`` to ``max_timescale``; when ``channels``
    is odd a final all-zero row is appended.

    Args:
        length: number of positions.
        channels: number of output channels.
        min_timescale: smallest timescale.
        max_timescale: largest timescale.

    Returns:
        A float tensor of shape [1, channels, length] (on the default device).
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # With a single timescale (channels 2 or 3) ``num_timescales - 1`` is 0 and
    # the division used to raise ZeroDivisionError; clamp the divisor to 1.
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        max(num_timescales - 1, 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Append one zero row when ``channels`` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
108
+
109
+
110
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the sinusoidal timing signal to ``x`` ([b, channels, length])."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
114
+
115
+
116
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the sinusoidal timing signal onto ``x`` ([b, channels, length]) along ``axis``."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
120
+
121
+
122
def subsequent_mask(length):
    """Return a [1, 1, length, length] lower-triangular mask of ones."""
    lower = torch.tril(torch.ones(length, length))
    return lower[None, None]
125
+
126
+
127
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    # Gated activation: tanh of the first n_channels[0] channels of
    # (input_a + input_b) times sigmoid of the remaining channels.
    split = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split, :])
    gate_part = torch.sigmoid(summed[:, split:, :])
    return tanh_part * gate_part
135
+
136
+
137
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs into a last-dimension-first list."""
    # NOTE: this repeats the identical definition earlier in this module;
    # being later, this is the one that stays bound to the name.
    return [amount for pair in reversed(pad_shape) for amount in pair]
141
+
142
+
143
def shift_1d(x):
    """Shift a 3-D tensor one step right along its last axis, filling with zero."""
    # Pad one zero on the left of the last axis, then drop the last element.
    padded = F.pad(x, [1, 0, 0, 0, 0, 0])
    return padded[:, :, :-1]
146
+
147
+
148
def sequence_mask(length, max_length=None):
    """Return a boolean [len(length), max_length] mask, True where position < length.

    ``max_length`` defaults to the largest entry of ``length``.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]
153
+
154
+
155
def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns a [b, 1, t_y, t_x] path: each input step gets the output frames
    between its previous and its own cumulative duration, multiplied by ``mask``.
    """
    b, _, t_y, t_x = mask.shape
    cum_duration_flat = torch.cumsum(duration, -1).view(b * t_x)

    # Row k marks every output frame before the end of step k ...
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype).view(b, t_x, t_y)
    # ... so subtracting the previous row leaves only the frames of step k.
    previous = F.pad(path, [0, 0, 1, 0, 0, 0])[:, :-1]
    path = path - previous
    return path.unsqueeze(1).transpose(2, 3) * mask
171
+
172
+
173
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients in place to [-clip_value, clip_value] and return their total norm.

    The norm is accumulated from each gradient before it is clamped. With
    ``clip_value=None`` nothing is clamped and only the norm is returned.
    Parameters whose ``grad`` is None are skipped.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    accumulated = 0
    for param in with_grad:
        grad = param.grad.data
        accumulated += grad.norm(norm_type).item() ** norm_type
        if clip_value is not None:
            grad.clamp_(min=-clip_value, max=clip_value)
    return accumulated ** (1. / norm_type)
modules/crepe.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional,Union
2
+ try:
3
+ from typing import Literal
4
+ except Exception as e:
5
+ from typing_extensions import Literal
6
+ import numpy as np
7
+ import torch
8
+ import torchcrepe
9
+ from torch import nn
10
+ from torch.nn import functional as F
11
+ import scipy
12
+
13
+ #from:https://github.com/fishaudio/fish-diffusion
14
+
15
def repeat_expand(
    content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest"
):
    """Repeat content to target length.
    This is a wrapper of torch.nn.functional.interpolate.

    Args:
        content (torch.Tensor | np.ndarray): 1-D, 2-D or 3-D data; the last axis is resized.
        target_len (int): target length
        mode (str, optional): interpolation mode. Defaults to "nearest".

    Returns:
        The resized data, with the same number of dimensions and the same
        container type (tensor or ndarray) as the input.
    """

    ndim = content.ndim

    # interpolate() wants a 3-D input, so add leading singleton axes as needed.
    if ndim == 1:
        content = content[None, None]
    elif ndim == 2:
        content = content[None]

    assert content.ndim == 3

    is_np = isinstance(content, np.ndarray)
    if is_np:
        content = torch.from_numpy(content)

    results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)

    if is_np:
        results = results.numpy()

    # Strip the axes added above so the output rank matches the input rank.
    if ndim == 1:
        return results[0, 0]
    if ndim == 2:
        return results[0]
    # 3-D input: previously fell through and returned None.
    return results
52
+
53
+
54
class BasePitchExtractor:
    """Common configuration and post-processing for pitch (f0) extractors."""

    def __init__(
        self,
        hop_length: int = 512,
        f0_min: float = 50.0,
        f0_max: float = 1100.0,
        keep_zeros: bool = True,
    ):
        """Base pitch extractor.

        Args:
            hop_length (int, optional): Hop length. Defaults to 512.
            f0_min (float, optional): Minimum f0. Defaults to 50.0.
            f0_max (float, optional): Maximum f0. Defaults to 1100.0.
            keep_zeros (bool, optional): Whether keep zeros in pitch. Defaults to True.
        """

        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.keep_zeros = keep_zeros

    def __call__(self, x, sampling_rate=44100, pad_to=None):
        """Subclasses implement the actual extraction; the base class always raises."""
        raise NotImplementedError("BasePitchExtractor is not callable.")

    def post_process(self, x, sampling_rate, f0, pad_to):
        """Resize ``f0`` to ``pad_to`` frames and optionally fill unvoiced gaps.

        Returns ``f0`` alone when ``pad_to`` is None or ``keep_zeros`` is set;
        otherwise returns ``(f0, vuv)`` where zero frames of ``f0`` have been
        linearly interpolated and ``vuv`` is 1 where the resized f0 was positive.
        ``x`` is only used for its device.
        """
        if isinstance(f0, np.ndarray):
            f0 = torch.from_numpy(f0).float().to(x.device)

        if pad_to is None:
            return f0

        f0 = repeat_expand(f0, pad_to)

        if self.keep_zeros:
            return f0

        # Voiced/unvoiced flags taken from the resized (not yet interpolated) f0.
        vuv_vector = torch.where(f0 > 0.0, torch.ones_like(f0), torch.zeros_like(f0))

        # Drop zero-frequency frames, then linearly interpolate across the gaps.
        nzindex = torch.nonzero(f0).squeeze()
        f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()

        n_voiced = f0.shape[0]
        if n_voiced <= 0:
            # Nothing voiced at all: all-zero pitch and all-zero flags.
            return torch.zeros(pad_to, dtype=torch.float, device=x.device),torch.zeros(pad_to, dtype=torch.float, device=x.device)

        if n_voiced == 1:
            # A single voiced frame: spread its value over every frame.
            return torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0],torch.ones(pad_to, dtype=torch.float, device=x.device)

        time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
        time_frame = np.arange(pad_to) * self.hop_length / sampling_rate

        # Could probably be rewritten in torch?
        f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
        vuv_vector = vuv_vector.cpu().numpy()
        zoom_factor = pad_to / len(vuv_vector)
        vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector, zoom_factor, order=0))

        return f0, vuv_vector
113
+
114
+
115
class MaskedAvgPool1d(nn.Module):
    def __init__(
        self, kernel_size: int, stride: Optional[int] = None, padding: Optional[int] = 0
    ):
        """An implementation of mean pooling that supports masked values.

        Args:
            kernel_size (int): The size of the mean pooling window.
            stride (int, optional): The stride of the mean pooling window. Falls back to ``kernel_size`` when None or 0.
            padding (int, optional): The padding of the mean pooling window. Defaults to 0.
        """

        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size
        self.padding = padding

    def forward(self, x, mask=None):
        """Average each window over its unmasked entries only.

        ``x`` is 2-D or 3-D (batch_size, channels, width). ``mask`` is a boolean
        tensor of the same shape and defaults to "not NaN". Outputs equal to 0
        (including windows with no valid entry) are returned as NaN.
        """
        ndim = x.dim()
        if ndim == 2:
            x = x.unsqueeze(1)

        assert (
            x.dim() == 3
        ), "Input tensor must have 2 or 3 dimensions (batch_size, channels, width)"

        # Without an explicit mask, treat NaNs as the masked entries.
        if mask is None:
            mask = ~torch.isnan(x)

        assert x.shape == mask.shape, "Input tensor and mask must have the same shape"

        channels = x.size(1)
        # Depthwise all-ones kernel: convolving with it sums each window per channel.
        ones_kernel = torch.ones(channels, 1, self.kernel_size, device=x.device)

        def window_sum(values):
            return nn.functional.conv1d(
                values,
                ones_kernel,
                stride=self.stride,
                padding=self.padding,
                groups=channels,
            )

        # Masked entries contribute zero to the sum ...
        sum_pooled = window_sum(torch.where(mask, x, torch.zeros_like(x)))
        # ... and are left out of the count (clamped to avoid division by zero).
        valid_count = window_sum(mask.float()).clamp(min=1)

        avg_pooled = sum_pooled / valid_count

        # Fill zero values with NaNs
        avg_pooled[avg_pooled == 0] = float("nan")

        return avg_pooled.squeeze(1) if ndim == 2 else avg_pooled
181
+
182
+
183
class MaskedMedianPool1d(nn.Module):
    def __init__(
        self, kernel_size: int, stride: Optional[int] = None, padding: Optional[int] = 0
    ):
        """An implementation of median pooling that supports masked values.

        This implementation is inspired by the median pooling implementation in
        https://gist.github.com/rwightman/f2d3849281624be7c0f11c85c87c1598

        Args:
            kernel_size (int): The size of the median pooling window.
            stride (int, optional): The stride of the median pooling window. Falls back to ``kernel_size`` when None or 0.
            padding (int, optional): The padding of the median pooling window. Defaults to 0.
        """

        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size
        self.padding = padding

    def forward(self, x, mask=None):
        """Take the (lower) median of the unmasked entries in each window.

        ``x`` is 2-D or 3-D (batch_size, channels, width). ``mask`` is a boolean
        tensor of the same shape and defaults to "not NaN". Windows with no valid
        entry yield NaN.
        """
        ndim = x.dim()
        if ndim == 2:
            x = x.unsqueeze(1)

        assert (
            x.dim() == 3
        ), "Input tensor must have 2 or 3 dimensions (batch_size, channels, width)"

        if mask is None:
            mask = ~torch.isnan(x)

        assert x.shape == mask.shape, "Input tensor and mask must have the same shape"

        edge = (self.padding, self.padding)
        # Values are reflect-padded, but the padded positions are marked invalid.
        values = F.pad(torch.where(mask, x, torch.zeros_like(x)), edge, mode="reflect")
        valid = F.pad(mask.float(), edge, mode="constant", value=0)

        # Slide the window along the width axis: [..., n_windows, kernel_size].
        values = values.unfold(2, self.kernel_size, self.stride)
        valid = valid.unfold(2, self.kernel_size, self.stride)

        values = values.contiguous().view(values.size()[:3] + (-1,))
        valid = valid.contiguous().view(valid.size()[:3] + (-1,)).to(values.device)

        # Push masked entries to +inf so sorting moves them to the end of each window.
        inf_fill = torch.FloatTensor([float("inf")]).to(values.device)
        ordered, _ = torch.sort(torch.where(valid.bool(), values, inf_fill), dim=-1)

        # Number of valid entries per window, and the index of their median.
        valid_count = valid.sum(dim=-1)
        median_idx = torch.div(valid_count - 1, 2, rounding_mode='trunc').clamp(min=0)

        median_pooled = ordered.gather(-1, median_idx.unsqueeze(-1).long()).squeeze(-1)

        # Fill infinite values with NaNs
        median_pooled[torch.isinf(median_pooled)] = float("nan")

        return median_pooled.squeeze(1) if ndim == 2 else median_pooled
253
+
254
+
255
class CrepePitchExtractor(BasePitchExtractor):
    """Pitch extractor backed by ``torchcrepe.predict``."""

    def __init__(
        self,
        hop_length: int = 512,
        f0_min: float = 50.0,
        f0_max: float = 1100.0,
        threshold: float = 0.05,
        keep_zeros: bool = False,
        device = None,
        model: Literal["full", "tiny"] = "full",
        use_fast_filters: bool = True,
    ):
        """Configure the extractor.

        Args:
            hop_length (int, optional): Hop length in samples. Defaults to 512.
            f0_min (float, optional): Minimum f0. Defaults to 50.0.
            f0_max (float, optional): Maximum f0. Defaults to 1100.0.
            threshold (float, optional): Periodicity threshold passed to
                ``torchcrepe.threshold.At``. Defaults to 0.05.
            keep_zeros (bool, optional): Whether keep zeros in pitch. Defaults to False.
            device (optional): Device to run on; CUDA if available, else CPU, when None.
            model (str, optional): torchcrepe model size. Defaults to "full".
            use_fast_filters (bool, optional): Use the masked pooling filters
                from this module instead of ``torchcrepe.filter``. Defaults to True.
        """
        super().__init__(hop_length, f0_min, f0_max, keep_zeros)

        self.threshold = threshold
        self.model = model
        self.use_fast_filters = use_fast_filters
        self.hop_length = hop_length
        if device is None:
            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.dev = torch.device(device)
        if self.use_fast_filters:
            # Use the resolved device, not the raw (possibly None) argument.
            self.median_filter = MaskedMedianPool1d(3, 1, 1).to(self.dev)
            self.mean_filter = MaskedAvgPool1d(3, 1, 1).to(self.dev)

    def __call__(self, x, sampling_rate=44100, pad_to=None):
        """Extract pitch using crepe.


        Args:
            x (torch.Tensor): Audio signal, shape (1, T).
            sampling_rate (int, optional): Sampling rate. Defaults to 44100.
            pad_to (int, optional): Pad to length. Defaults to None.

        Returns:
            Whatever ``post_process`` returns for the filtered pitch track:
            the pitch alone, or a ``(pitch, vuv)`` pair when zeros are
            interpolated away.
        """

        assert x.ndim == 2, f"Expected 2D tensor, got {x.ndim}D tensor."
        assert x.shape[0] == 1, f"Expected 1 channel, got {x.shape[0]} channels."

        x = x.to(self.dev)
        f0, pd = torchcrepe.predict(
            x,
            sampling_rate,
            self.hop_length,
            self.f0_min,
            self.f0_max,
            pad=True,
            model=self.model,
            batch_size=1024,
            device=x.device,
            return_periodicity=True,
        )

        # Filter, remove silence, set uv threshold, refer to the original warehouse readme
        if self.use_fast_filters:
            pd = self.median_filter(pd)
        else:
            pd = torchcrepe.filter.median(pd, 3)

        # The silence gate must use the same hop as predict() so its loudness
        # frames line up with the periodicity frames (was hard-coded to 512).
        pd = torchcrepe.threshold.Silence(-60.0)(pd, x, sampling_rate, self.hop_length)
        f0 = torchcrepe.threshold.At(self.threshold)(f0, pd)

        if self.use_fast_filters:
            f0 = self.mean_filter(f0)
        else:
            f0 = torchcrepe.filter.mean(f0, 3)

        # Unvoiced frames come back as NaN; turn them into 0 and drop the batch axis.
        f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)[0]

        return self.post_process(x, sampling_rate, f0, pad_to)
modules/enhancer.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+ from vdecoder.nsf_hifigan.nvSTFT import STFT
5
+ from vdecoder.nsf_hifigan.models import load_model
6
+ from torchaudio.transforms import Resample
7
+
8
class Enhancer:
    """Run an enhancer model on audio, resampling audio and f0 around it.

    Only the ``'nsf-hifigan'`` enhancer type is supported.
    """

    def __init__(self, enhancer_type, enhancer_ckpt, device=None):
        """Load the enhancer.

        Args:
            enhancer_type: Enhancer name; must be ``'nsf-hifigan'``.
            enhancer_ckpt: Checkpoint path handed to the enhancer.
            device: Target device; defaults to CUDA when available, else CPU.

        Raises:
            ValueError: If ``enhancer_type`` is not recognised.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

        if enhancer_type == 'nsf-hifigan':
            self.enhancer = NsfHifiGAN(enhancer_ckpt, device=self.device)
        else:
            raise ValueError(f" [x] Unknown enhancer: {enhancer_type}")

        # Cache of Resample modules keyed by "<orig_rate><new_rate>".
        self.resample_kernel = {}
        self.enhancer_sample_rate = self.enhancer.sample_rate()
        self.enhancer_hop_size = self.enhancer.hop_size()

    def _get_resampler(self, orig_rate, new_rate):
        """Return the cached Resample module for this rate pair, creating it once."""
        key_str = str(orig_rate) + str(new_rate)
        if key_str not in self.resample_kernel:
            self.resample_kernel[key_str] = Resample(
                orig_rate, new_rate, lowpass_filter_width=128
            ).to(self.device)
        return self.resample_kernel[key_str]

    def enhance(self,
                audio,  # 1, T
                sample_rate,
                f0,  # 1, n_frames, 1
                hop_size,
                adaptive_key=0,
                silence_front=0
                ):
        """Enhance ``audio`` using the f0 contour as conditioning.

        Args:
            audio: Audio tensor, shape (1, T).
            sample_rate: Sampling rate of ``audio``.
            f0: f0 tensor, shape (1, n_frames, 1). It is not modified.
            hop_size: Hop size (in samples of ``audio``) between f0 frames.
            adaptive_key: Key shift in semitones used to derive the sample
                rate the enhancer is run at.
            silence_front: Seconds at the start to skip; the skipped part is
                zero-padded back onto the output.

        Returns:
            Tuple ``(enhanced_audio, enhancer_sample_rate)``.
        """
        # enhancer start time, snapped to a whole number of f0 frames
        start_frame = int(silence_front * sample_rate / hop_size)
        real_silence_front = start_frame * hop_size / sample_rate
        audio = audio[:, int(np.round(real_silence_front * sample_rate)):]
        f0 = f0[:, start_frame:, :]

        # adaptive parameters (sample rate rounded to a multiple of 100)
        adaptive_factor = 2 ** (-adaptive_key / 12)
        adaptive_sample_rate = 100 * int(np.round(self.enhancer_sample_rate / adaptive_factor / 100))
        real_factor = self.enhancer_sample_rate / adaptive_sample_rate

        # resample the input audio to the adaptive rate
        if sample_rate == adaptive_sample_rate:
            audio_res = audio
        else:
            audio_res = self._get_resampler(sample_rate, adaptive_sample_rate)(audio)

        n_frames = int(audio_res.size(-1) // self.enhancer_hop_size + 1)

        # resample f0
        f0_np = f0.squeeze(0).squeeze(-1).cpu().numpy()
        # For CPU tensors this array shares memory with the caller's f0, so
        # scale into a new array instead of multiplying in place.
        f0_scaled = f0_np * real_factor
        time_org = (hop_size / sample_rate) * np.arange(len(f0_scaled)) / real_factor
        time_frame = (self.enhancer_hop_size / self.enhancer_sample_rate) * np.arange(n_frames)
        f0_res = np.interp(time_frame, time_org, f0_scaled, left=f0_scaled[0], right=f0_scaled[-1])
        f0_res = torch.from_numpy(f0_res).unsqueeze(0).float().to(self.device)  # 1, n_frames

        # enhance
        enhanced_audio, enhancer_sample_rate = self.enhancer(audio_res, f0_res)

        # Resample the enhanced output back to the enhancer's rate. The old
        # guard (`adaptive_factor != 0`) was always true; comparing the rates
        # skips the no-op resample when they already match.
        if adaptive_sample_rate != enhancer_sample_rate:
            enhanced_audio = self._get_resampler(adaptive_sample_rate, enhancer_sample_rate)(enhanced_audio)

        # pad the silence frames
        if start_frame > 0:
            enhanced_audio = F.pad(enhanced_audio, (int(np.round(enhancer_sample_rate * real_silence_front)), 0))

        return enhanced_audio, enhancer_sample_rate
76
+
77
+
78
class NsfHifiGAN(torch.nn.Module):
    """Wrapper around a model and its hyper-parameters loaded via ``load_model``."""

    def __init__(self, model_path, device=None):
        """Load the model at ``model_path`` onto ``device`` (CUDA if available by default)."""
        super().__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        print('| Load HifiGAN: ', model_path)
        loaded_model, hparams = load_model(model_path, device=self.device)
        self.model = loaded_model
        self.h = hparams

    def sample_rate(self):
        """Return the sampling rate stored in the loaded hyper-parameters."""
        return self.h.sampling_rate

    def hop_size(self):
        """Return the hop size stored in the loaded hyper-parameters."""
        return self.h.hop_size

    def forward(self, audio, f0):
        """Compute a mel from ``audio`` and run the model on it with ``f0``.

        Returns:
            Tuple of the flattened model output and the model's sampling rate.
        """
        hp = self.h
        stft = STFT(
            hp.sampling_rate,
            hp.num_mels,
            hp.n_fft,
            hp.win_size,
            hp.hop_size,
            hp.fmin,
            hp.fmax)
        with torch.no_grad():
            mel = stft.get_mel(audio)
            # f0 is cut to the number of mel frames before conditioning.
            n_mel_frames = mel.size(-1)
            enhanced_audio = self.model(mel, f0[:, :n_mel_frames]).view(-1)
        return enhanced_audio, hp.sampling_rate
modules/losses.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import modules.commons as commons
5
+
6
+
7
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference of paired feature maps.

    ``fmap_r`` and ``fmap_g`` are iterables of iterables of tensors; the
    ``fmap_r`` tensors are detached so no gradient flows through them.
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real, gen in zip(real_maps, gen_maps):
            diff = real.float().detach() - gen.float()
            total = total + diff.abs().mean()

    return total * 2
16
+
17
+
18
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss over paired real/generated outputs.

    Returns:
        ``(loss, r_losses, g_losses)`` where ``loss`` is the sum of all terms
        and the two lists hold the per-pair real and generated terms as
        Python floats.
    """
    total = 0
    r_losses, g_losses = [], []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        # Real outputs are pushed towards 1, generated outputs towards 0.
        real_term = torch.mean((1 - real_out.float()) ** 2)
        gen_term = torch.mean(gen_out.float() ** 2)
        total = total + (real_term + gen_term)
        r_losses.append(real_term.item())
        g_losses.append(gen_term.item())

    return total, r_losses, g_losses
32
+
33
+
34
def generator_loss(disc_outputs):
    """Least-squares generator loss: push every discriminator output towards 1.

    Returns:
        ``(loss, gen_losses)`` where ``gen_losses`` holds the per-output terms
        as tensors and ``loss`` is their sum.
    """
    total = 0
    gen_losses = []
    for disc_out in disc_outputs:
        term = torch.mean((1 - disc_out.float()) ** 2)
        gen_losses.append(term)
        total = total + term

    return total, gen_losses
44
+
45
+
46
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """Masked KL term, normalised by the sum of the mask.

    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]
    All inputs are cast to float32 before the computation.
    """
    z_p, logs_q = z_p.float(), logs_q.float()
    m_p, logs_p = m_p.float(), logs_p.float()
    z_mask = z_mask.float()

    kl = (logs_p - logs_q - 0.5) + 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2. * logs_p)
    masked_total = torch.sum(kl * z_mask)
    return masked_total / torch.sum(z_mask)
modules/mel_processing.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import random
4
+ import torch
5
+ from torch import nn
6
+ import torch.nn.functional as F
7
+ import torch.utils.data
8
+ import numpy as np
9
+ import librosa
10
+ import librosa.util as librosa_util
11
+ from librosa.util import normalize, pad_center, tiny
12
+ from scipy.signal import get_window
13
+ from scipy.io.wavfile import read
14
+ from librosa.filters import mel as librosa_mel_fn
15
+
16
+ MAX_WAV_VALUE = 32768.0
17
+
18
+
19
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
20
+ """
21
+ PARAMS
22
+ ------
23
+ C: compression factor
24
+ """
25
+ return torch.log(torch.clamp(x, min=clip_val) * C)
26
+
27
+
28
+ def dynamic_range_decompression_torch(x, C=1):
29
+ """
30
+ PARAMS
31
+ ------
32
+ C: compression factor used to compress
33
+ """
34
+ return torch.exp(x) / C
35
+
36
+
37
+ def spectral_normalize_torch(magnitudes):
38
+ output = dynamic_range_compression_torch(magnitudes)
39
+ return output
40
+
41
+
42
+ def spectral_de_normalize_torch(magnitudes):
43
+ output = dynamic_range_decompression_torch(magnitudes)
44
+ return output
45
+
46
+
47
+ mel_basis = {}
48
+ hann_window = {}
49
+
50
+
51
+ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
52
+ if torch.min(y) < -1.:
53
+ print('min value is ', torch.min(y))
54
+ if torch.max(y) > 1.:
55
+ print('max value is ', torch.max(y))
56
+
57
+ global hann_window
58
+ dtype_device = str(y.dtype) + '_' + str(y.device)
59
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
60
+ if wnsize_dtype_device not in hann_window:
61
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
62
+
63
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
64
+ y = y.squeeze(1)
65
+
66
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
67
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
68
+
69
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
70
+ return spec
71
+
72
+
73
+ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
74
+ global mel_basis
75
+ dtype_device = str(spec.dtype) + '_' + str(spec.device)
76
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
77
+ if fmax_dtype_device not in mel_basis:
78
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
79
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
80
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
81
+ spec = spectral_normalize_torch(spec)
82
+ return spec
83
+
84
+
85
+ def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
86
+ if torch.min(y) < -1.:
87
+ print('min value is ', torch.min(y))
88
+ if torch.max(y) > 1.:
89
+ print('max value is ', torch.max(y))
90
+
91
+ global mel_basis, hann_window
92
+ dtype_device = str(y.dtype) + '_' + str(y.device)
93
+ fmax_dtype_device = str(fmax) + '_' + dtype_device
94
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
95
+ if fmax_dtype_device not in mel_basis:
96
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
97
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
98
+ if wnsize_dtype_device not in hann_window:
99
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
100
+
101
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
102
+ y = y.squeeze(1)
103
+
104
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
105
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
106
+
107
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
108
+
109
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
110
+ spec = spectral_normalize_torch(spec)
111
+
112
+ return spec
modules/modules.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import scipy
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
+ from torch.nn.utils import weight_norm, remove_weight_norm
11
+
12
+ import modules.commons as commons
13
+ from modules.commons import init_weights, get_padding
14
+
15
+
16
+ LRELU_SLOPE = 0.1
17
+
18
+
19
class LayerNorm(nn.Module):
    """Layer normalisation over dim 1 with learnable scale and shift."""

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels, self.eps = channels, eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        # F.layer_norm normalises the trailing dim, so swap dim 1 to the end
        # and swap it back afterwards.
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(channels_last, (self.channels,), self.gamma, self.beta, self.eps)
        return normed.transpose(1, -1)
32
+
33
+
34
class ConvReluNorm(nn.Module):
    """Stack of Conv1d -> LayerNorm -> ReLU/Dropout layers with a residual projection.

    The final 1x1 projection is zero-initialised, so a freshly constructed
    module returns ``x * x_mask``.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        """Build the layer stack.

        Raises:
            AssertionError: If ``n_layers`` is not greater than 1.
        """
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message used to say "larger than 0" although the check rejects
        # n_layers == 1; the condition is unchanged, only the text is fixed.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers-1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero-init so the residual branch contributes nothing at first.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Apply the masked conv stack and add the projection to the input."""
        x_org = x
        for conv, norm in zip(self.conv_layers, self.norm_layers):
            x = conv(x * x_mask)
            x = norm(x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
67
+
68
+
69
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            # Dilation grows as kernel_size ** layer index.
            dilation = kernel_size ** layer_idx
            padding = (kernel_size * dilation - dilation) // 2
            depthwise = nn.Conv1d(channels, channels, kernel_size,
                                  groups=channels, dilation=dilation, padding=padding)
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Run the residual layers; ``g`` (if given) is added to ``x`` first."""
        if g is not None:
            x = x + g
        for layer_idx in range(self.n_layers):
            branch = self.convs_sep[layer_idx](x * x_mask)
            branch = F.gelu(self.norms_1[layer_idx](branch))
            branch = self.convs_1x1[layer_idx](branch)
            branch = F.gelu(self.norms_2[layer_idx](branch))
            x = x + self.drop(branch)
        return x * x_mask
108
+
109
+
110
class WN(torch.nn.Module):
    """Stack of weight-normalised dilated convolutions with gated activations.

    Each layer feeds ``commons.fused_add_tanh_sigmoid_multiply`` and a 1x1
    res/skip convolution; an optional conditioning input ``g`` is projected
    once by ``cond_layer`` and sliced per layer.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        """Build the layers.

        Args:
            hidden_channels: Channel count of the input and output.
            kernel_size: Odd kernel size of the dilated convolutions.
            dilation_rate: Layer ``i`` uses dilation ``dilation_rate ** i``.
            n_layers: Number of layers.
            gin_channels: Channels of the conditioning input; 0 disables it.
            p_dropout: Dropout probability applied to the gated activations.

        Raises:
            AssertionError: If ``kernel_size`` is even.
        """
        super(WN, self).__init__()
        assert(kernel_size % 2 == 1)
        self.hidden_channels = hidden_channels
        # A stray trailing comma used to turn this attribute into a 1-tuple.
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One projection produces the conditioning for all layers at once.
            cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # The last layer only needs the skip half, not the residual half.
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of the skip outputs of all layers."""
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                # Slice this layer's share of the projected conditioning.
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset+2*self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                # First half updates the residual stream, second half is skip.
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight normalisation from every layer that has it."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for layer in self.in_layers:
            torch.nn.utils.remove_weight_norm(layer)
        for layer in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(layer)
184
+
185
+
186
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs, all weight-normalised."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()

        def make_conv(rate):
            # Stride-1 conv padded via get_padding for the given dilation.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                      padding=get_padding(kernel_size, rate)))

        # Exactly the first three dilation entries are used.
        self.convs1 = nn.ModuleList([
            make_conv(dilation[0]),
            make_conv(dilation[1]),
            make_conv(dilation[2]),
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1), make_conv(1), make_conv(1)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs, masking around each conv when a mask is given."""
        masked = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            hidden = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                hidden = hidden * x_mask
            hidden = F.leaky_relu(dilated_conv(hidden), LRELU_SLOPE)
            if masked:
                hidden = hidden * x_mask
            x = plain_conv(hidden) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
229
+
230
+
231
class ResBlock2(torch.nn.Module):
    """Residual block of two weight-normalised dilated convolutions."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()

        def make_conv(rate):
            # Stride-1 conv padded via get_padding for the given dilation.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate,
                                      padding=get_padding(kernel_size, rate)))

        # Exactly the first two dilation entries are used.
        self.convs = nn.ModuleList([make_conv(dilation[0]), make_conv(dilation[1])])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convs, masking before each one when a mask is given."""
        masked = x_mask is not None
        for conv in self.convs:
            hidden = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                hidden = hidden * x_mask
            x = conv(hidden) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution."""
        for conv in self.convs:
            remove_weight_norm(conv)
256
+
257
+
258
class Log(nn.Module):
    """Element-wise log transform; the reverse direction applies exp."""

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Return ``(y, logdet)`` going forward, or just the tensor in reverse."""
        if reverse:
            return torch.exp(x) * x_mask

        # Inputs are clamped to at least 1e-5 before the log.
        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        logdet = torch.sum(-y, [1, 2])
        return y, logdet
267
+
268
+
269
class Flip(nn.Module):
    """Reverse the order of dim 1; the forward direction also returns a zero logdet."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Return ``(flipped, logdet)`` going forward, or just ``flipped`` in reverse."""
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped

        # One zero per batch item, matching the input's dtype and device.
        logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
277
+
278
+
279
class ElementwiseAffine(nn.Module):
    """Per-channel affine transform ``m + exp(logs) * x`` and its inverse."""

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        # Both parameters start at zero, i.e. the identity transform.
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Return ``(y, logdet)`` going forward, or the inverted tensor in reverse."""
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask

        y = (self.m + torch.exp(self.logs) * x) * x_mask
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
295
+
296
+
297
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer: the first half of the channels parameterises a
    transform of the second half via a ``WN`` encoder."""

    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        # Emits mean and log-scale, or the mean alone when mean_only is set.
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        # Zero-init so the layer starts out as the identity transform.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Return ``(x, logdet)`` going forward, or the inverted ``x`` in reverse."""
        halves = [self.half_channels] * 2
        x0, x1 = torch.split(x, halves, 1)

        hidden = self.pre(x0) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.post(hidden) * x_mask

        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, halves, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        x = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return x, logdet