Nick1 commited on
Commit
3688793
·
verified ·
1 Parent(s): 5d04347

Delete rmvpe.py

Browse files
Files changed (1) hide show
  1. rmvpe.py +0 -432
rmvpe.py DELETED
@@ -1,432 +0,0 @@
1
- import sys, torch, numpy as np, traceback, pdb
2
- import torch.nn as nn
3
- from time import time as ttime
4
- import torch.nn.functional as F
5
-
6
-
7
- class BiGRU(nn.Module):
8
- def __init__(self, input_features, hidden_features, num_layers):
9
- super(BiGRU, self).__init__()
10
- self.gru = nn.GRU(
11
- input_features,
12
- hidden_features,
13
- num_layers=num_layers,
14
- batch_first=True,
15
- bidirectional=True,
16
- )
17
-
18
- def forward(self, x):
19
- return self.gru(x)[0]
20
-
21
-
22
- class ConvBlockRes(nn.Module):
23
- def __init__(self, in_channels, out_channels, momentum=0.01):
24
- super(ConvBlockRes, self).__init__()
25
- self.conv = nn.Sequential(
26
- nn.Conv2d(
27
- in_channels=in_channels,
28
- out_channels=out_channels,
29
- kernel_size=(3, 3),
30
- stride=(1, 1),
31
- padding=(1, 1),
32
- bias=False,
33
- ),
34
- nn.BatchNorm2d(out_channels, momentum=momentum),
35
- nn.ReLU(),
36
- nn.Conv2d(
37
- in_channels=out_channels,
38
- out_channels=out_channels,
39
- kernel_size=(3, 3),
40
- stride=(1, 1),
41
- padding=(1, 1),
42
- bias=False,
43
- ),
44
- nn.BatchNorm2d(out_channels, momentum=momentum),
45
- nn.ReLU(),
46
- )
47
- if in_channels != out_channels:
48
- self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
49
- self.is_shortcut = True
50
- else:
51
- self.is_shortcut = False
52
-
53
- def forward(self, x):
54
- if self.is_shortcut:
55
- return self.conv(x) + self.shortcut(x)
56
- else:
57
- return self.conv(x) + x
58
-
59
-
60
- class Encoder(nn.Module):
61
- def __init__(
62
- self,
63
- in_channels,
64
- in_size,
65
- n_encoders,
66
- kernel_size,
67
- n_blocks,
68
- out_channels=16,
69
- momentum=0.01,
70
- ):
71
- super(Encoder, self).__init__()
72
- self.n_encoders = n_encoders
73
- self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
74
- self.layers = nn.ModuleList()
75
- self.latent_channels = []
76
- for i in range(self.n_encoders):
77
- self.layers.append(
78
- ResEncoderBlock(
79
- in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
80
- )
81
- )
82
- self.latent_channels.append([out_channels, in_size])
83
- in_channels = out_channels
84
- out_channels *= 2
85
- in_size //= 2
86
- self.out_size = in_size
87
- self.out_channel = out_channels
88
-
89
- def forward(self, x):
90
- concat_tensors = []
91
- x = self.bn(x)
92
- for i in range(self.n_encoders):
93
- _, x = self.layers[i](x)
94
- concat_tensors.append(_)
95
- return x, concat_tensors
96
-
97
-
98
- class ResEncoderBlock(nn.Module):
99
- def __init__(
100
- self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
101
- ):
102
- super(ResEncoderBlock, self).__init__()
103
- self.n_blocks = n_blocks
104
- self.conv = nn.ModuleList()
105
- self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
106
- for i in range(n_blocks - 1):
107
- self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
108
- self.kernel_size = kernel_size
109
- if self.kernel_size is not None:
110
- self.pool = nn.AvgPool2d(kernel_size=kernel_size)
111
-
112
- def forward(self, x):
113
- for i in range(self.n_blocks):
114
- x = self.conv[i](x)
115
- if self.kernel_size is not None:
116
- return x, self.pool(x)
117
- else:
118
- return x
119
-
120
-
121
- class Intermediate(nn.Module): #
122
- def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
123
- super(Intermediate, self).__init__()
124
- self.n_inters = n_inters
125
- self.layers = nn.ModuleList()
126
- self.layers.append(
127
- ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
128
- )
129
- for i in range(self.n_inters - 1):
130
- self.layers.append(
131
- ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
132
- )
133
-
134
- def forward(self, x):
135
- for i in range(self.n_inters):
136
- x = self.layers[i](x)
137
- return x
138
-
139
-
140
- class ResDecoderBlock(nn.Module):
141
- def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
142
- super(ResDecoderBlock, self).__init__()
143
- out_padding = (0, 1) if stride == (1, 2) else (1, 1)
144
- self.n_blocks = n_blocks
145
- self.conv1 = nn.Sequential(
146
- nn.ConvTranspose2d(
147
- in_channels=in_channels,
148
- out_channels=out_channels,
149
- kernel_size=(3, 3),
150
- stride=stride,
151
- padding=(1, 1),
152
- output_padding=out_padding,
153
- bias=False,
154
- ),
155
- nn.BatchNorm2d(out_channels, momentum=momentum),
156
- nn.ReLU(),
157
- )
158
- self.conv2 = nn.ModuleList()
159
- self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
160
- for i in range(n_blocks - 1):
161
- self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
162
-
163
- def forward(self, x, concat_tensor):
164
- x = self.conv1(x)
165
- x = torch.cat((x, concat_tensor), dim=1)
166
- for i in range(self.n_blocks):
167
- x = self.conv2[i](x)
168
- return x
169
-
170
-
171
- class Decoder(nn.Module):
172
- def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
173
- super(Decoder, self).__init__()
174
- self.layers = nn.ModuleList()
175
- self.n_decoders = n_decoders
176
- for i in range(self.n_decoders):
177
- out_channels = in_channels // 2
178
- self.layers.append(
179
- ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
180
- )
181
- in_channels = out_channels
182
-
183
- def forward(self, x, concat_tensors):
184
- for i in range(self.n_decoders):
185
- x = self.layers[i](x, concat_tensors[-1 - i])
186
- return x
187
-
188
-
189
- class DeepUnet(nn.Module):
190
- def __init__(
191
- self,
192
- kernel_size,
193
- n_blocks,
194
- en_de_layers=5,
195
- inter_layers=4,
196
- in_channels=1,
197
- en_out_channels=16,
198
- ):
199
- super(DeepUnet, self).__init__()
200
- self.encoder = Encoder(
201
- in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
202
- )
203
- self.intermediate = Intermediate(
204
- self.encoder.out_channel // 2,
205
- self.encoder.out_channel,
206
- inter_layers,
207
- n_blocks,
208
- )
209
- self.decoder = Decoder(
210
- self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
211
- )
212
-
213
- def forward(self, x):
214
- x, concat_tensors = self.encoder(x)
215
- x = self.intermediate(x)
216
- x = self.decoder(x, concat_tensors)
217
- return x
218
-
219
-
220
- class E2E(nn.Module):
221
- def __init__(
222
- self,
223
- n_blocks,
224
- n_gru,
225
- kernel_size,
226
- en_de_layers=5,
227
- inter_layers=4,
228
- in_channels=1,
229
- en_out_channels=16,
230
- ):
231
- super(E2E, self).__init__()
232
- self.unet = DeepUnet(
233
- kernel_size,
234
- n_blocks,
235
- en_de_layers,
236
- inter_layers,
237
- in_channels,
238
- en_out_channels,
239
- )
240
- self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
241
- if n_gru:
242
- self.fc = nn.Sequential(
243
- BiGRU(3 * 128, 256, n_gru),
244
- nn.Linear(512, 360),
245
- nn.Dropout(0.25),
246
- nn.Sigmoid(),
247
- )
248
- else:
249
- self.fc = nn.Sequential(
250
- nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
251
- )
252
-
253
- def forward(self, mel):
254
- mel = mel.transpose(-1, -2).unsqueeze(1)
255
- x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
256
- x = self.fc(x)
257
- return x
258
-
259
-
260
- from librosa.filters import mel
261
-
262
-
263
- class MelSpectrogram(torch.nn.Module):
264
- def __init__(
265
- self,
266
- is_half,
267
- n_mel_channels,
268
- sampling_rate,
269
- win_length,
270
- hop_length,
271
- n_fft=None,
272
- mel_fmin=0,
273
- mel_fmax=None,
274
- clamp=1e-5,
275
- ):
276
- super().__init__()
277
- n_fft = win_length if n_fft is None else n_fft
278
- self.hann_window = {}
279
- mel_basis = mel(
280
- sr=sampling_rate,
281
- n_fft=n_fft,
282
- n_mels=n_mel_channels,
283
- fmin=mel_fmin,
284
- fmax=mel_fmax,
285
- htk=True,
286
- )
287
- mel_basis = torch.from_numpy(mel_basis).float()
288
- self.register_buffer("mel_basis", mel_basis)
289
- self.n_fft = win_length if n_fft is None else n_fft
290
- self.hop_length = hop_length
291
- self.win_length = win_length
292
- self.sampling_rate = sampling_rate
293
- self.n_mel_channels = n_mel_channels
294
- self.clamp = clamp
295
- self.is_half = is_half
296
-
297
- def forward(self, audio, keyshift=0, speed=1, center=True):
298
- factor = 2 ** (keyshift / 12)
299
- n_fft_new = int(np.round(self.n_fft * factor))
300
- win_length_new = int(np.round(self.win_length * factor))
301
- hop_length_new = int(np.round(self.hop_length * speed))
302
- keyshift_key = str(keyshift) + "_" + str(audio.device)
303
- if keyshift_key not in self.hann_window:
304
- self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
305
- audio.device
306
- )
307
- fft = torch.stft(
308
- audio,
309
- n_fft=n_fft_new,
310
- hop_length=hop_length_new,
311
- win_length=win_length_new,
312
- window=self.hann_window[keyshift_key],
313
- center=center,
314
- return_complex=True,
315
- )
316
- magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
317
- if keyshift != 0:
318
- size = self.n_fft // 2 + 1
319
- resize = magnitude.size(1)
320
- if resize < size:
321
- magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
322
- magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
323
- mel_output = torch.matmul(self.mel_basis, magnitude)
324
- if self.is_half == True:
325
- mel_output = mel_output.half()
326
- log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
327
- return log_mel_spec
328
-
329
-
330
- class RMVPE:
331
- def __init__(self, model_path, is_half, device=None):
332
- self.resample_kernel = {}
333
- model = E2E(4, 1, (2, 2))
334
- ckpt = torch.load(model_path, map_location="cpu")
335
- model.load_state_dict(ckpt)
336
- model.eval()
337
- if is_half == True:
338
- model = model.half()
339
- self.model = model
340
- self.resample_kernel = {}
341
- self.is_half = is_half
342
- if device is None:
343
- device = "cuda" if torch.cuda.is_available() else "cpu"
344
- self.device = device
345
- self.mel_extractor = MelSpectrogram(
346
- is_half, 128, 16000, 1024, 160, None, 30, 8000
347
- ).to(device)
348
- self.model = self.model.to(device)
349
- cents_mapping = 20 * np.arange(360) + 1997.3794084376191
350
- self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
351
-
352
- def mel2hidden(self, mel):
353
- with torch.no_grad():
354
- n_frames = mel.shape[-1]
355
- mel = F.pad(
356
- mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
357
- )
358
- hidden = self.model(mel)
359
- return hidden[:, :n_frames]
360
-
361
- def decode(self, hidden, thred=0.03):
362
- cents_pred = self.to_local_average_cents(hidden, thred=thred)
363
- f0 = 10 * (2 ** (cents_pred / 1200))
364
- f0[f0 == 10] = 0
365
- # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
366
- return f0
367
-
368
- def infer_from_audio(self, audio, thred=0.03):
369
- audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
370
- # torch.cuda.synchronize()
371
- # t0=ttime()
372
- mel = self.mel_extractor(audio, center=True)
373
- # torch.cuda.synchronize()
374
- # t1=ttime()
375
- hidden = self.mel2hidden(mel)
376
- # torch.cuda.synchronize()
377
- # t2=ttime()
378
- hidden = hidden.squeeze(0).cpu().numpy()
379
- if self.is_half == True:
380
- hidden = hidden.astype("float32")
381
- f0 = self.decode(hidden, thred=thred)
382
- # torch.cuda.synchronize()
383
- # t3=ttime()
384
- # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
385
- return f0
386
-
387
- def to_local_average_cents(self, salience, thred=0.05):
388
- # t0 = ttime()
389
- center = np.argmax(salience, axis=1) # 帧长#index
390
- salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368
391
- # t1 = ttime()
392
- center += 4
393
- todo_salience = []
394
- todo_cents_mapping = []
395
- starts = center - 4
396
- ends = center + 5
397
- for idx in range(salience.shape[0]):
398
- todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
399
- todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
400
- # t2 = ttime()
401
- todo_salience = np.array(todo_salience) # 帧长,9
402
- todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9
403
- product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
404
- weight_sum = np.sum(todo_salience, 1) # 帧长
405
- devided = product_sum / weight_sum # 帧长
406
- # t3 = ttime()
407
- maxx = np.max(salience, axis=1) # 帧长
408
- devided[maxx <= thred] = 0
409
- # t4 = ttime()
410
- # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
411
- return devided
412
-
413
-
414
- # if __name__ == '__main__':
415
- # audio, sampling_rate = sf.read("卢本伟语录~1.wav")
416
- # if len(audio.shape) > 1:
417
- # audio = librosa.to_mono(audio.transpose(1, 0))
418
- # audio_bak = audio.copy()
419
- # if sampling_rate != 16000:
420
- # audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
421
- # model_path = "/bili-coeus/jupyter/jupyterhub-liujing04/vits_ch/test-RMVPE/weights/rmvpe_llc_half.pt"
422
- # thred = 0.03 # 0.01
423
- # device = 'cuda' if torch.cuda.is_available() else 'cpu'
424
- # rmvpe = RMVPE(model_path,is_half=False, device=device)
425
- # t0=ttime()
426
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
427
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
428
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
429
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
430
- # f0 = rmvpe.infer_from_audio(audio, thred=thred)
431
- # t1=ttime()
432
- # print(f0.shape,t1-t0)