Aasouy447 committed
Commit 2f866a0 · 1 Parent(s): a613041

Upload infer_uvr5.py

Files changed (1)
  1. infer_uvr5.py +363 -0
infer_uvr5.py ADDED
@@ -0,0 +1,363 @@
+import os, sys, torch, warnings, pdb
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+from json import load as ll
+
+warnings.filterwarnings("ignore")
+import librosa
+import importlib
+import numpy as np
+import hashlib, math
+from tqdm import tqdm
+from lib.uvr5_pack.lib_v5 import spec_utils
+from lib.uvr5_pack.utils import _get_name_params, inference
+from lib.uvr5_pack.lib_v5.model_param_init import ModelParameters
+import soundfile as sf
+from lib.uvr5_pack.lib_v5.nets_new import CascadedNet
+from lib.uvr5_pack.lib_v5 import nets_61968KB as nets
+
+
+# Separator built on the 4band_v2 model parameters and CascadedASPPNet weights.
+class _audio_pre_:
+    def __init__(self, agg, model_path, device, is_half):
+        self.model_path = model_path
+        self.device = device
+        self.data = {
+            # Processing Options
+            "postprocess": False,
+            "tta": False,
+            # Constants
+            "window_size": 512,
+            "agg": agg,
+            "high_end_process": "mirroring",
+        }
+        mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v2.json")
+        model = nets.CascadedASPPNet(mp.param["bins"] * 2)
+        cpk = torch.load(model_path, map_location="cpu")
+        model.load_state_dict(cpk)
+        model.eval()
+        if is_half:
+            model = model.half().to(device)
+        else:
+            model = model.to(device)
+
+        self.mp = mp
+        self.model = model
+
+    def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac"):
+        if ins_root is None and vocal_root is None:
+            return "No save root."
+        name = os.path.basename(music_file)
+        if ins_root is not None:
+            os.makedirs(ins_root, exist_ok=True)
+        if vocal_root is not None:
+            os.makedirs(vocal_root, exist_ok=True)
+        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+        bands_n = len(self.mp.param["band"])
+        # print(bands_n)
+        for d in range(bands_n, 0, -1):
+            bp = self.mp.param["band"][d]
+            if d == bands_n:  # high-end band
+                (
+                    X_wave[d],
+                    _,
+                ) = librosa.core.load(  # In theory librosa may misread some files; loading via ffmpeg would be safer, but it was judged too much hassle and dropped
+                    music_file,
+                    bp["sr"],
+                    False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )
+                if X_wave[d].ndim == 1:
+                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+            else:  # lower bands
+                X_wave[d] = librosa.core.resample(
+                    X_wave[d + 1],
+                    self.mp.param["band"][d + 1]["sr"],
+                    bp["sr"],
+                    res_type=bp["res_type"],
+                )
+            # STFT of wave source
+            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                X_wave[d],
+                bp["hl"],
+                bp["n_fft"],
+                self.mp.param["mid_side"],
+                self.mp.param["mid_side_b2"],
+                self.mp.param["reverse"],
+            )
+            # pdb.set_trace()
+            if d == bands_n and self.data["high_end_process"] != "none":
+                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+                )
+                input_high_end = X_spec_s[d][
+                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+                ]
+
+        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+        aggressive_set = float(self.data["agg"] / 100)
+        aggressiveness = {
+            "value": aggressive_set,
+            "split_bin": self.mp.param["band"][1]["crop_stop"],
+        }
+        with torch.no_grad():
+            pred, X_mag, X_phase = inference(
+                X_spec_m, self.device, self.model, aggressiveness, self.data
+            )
+        # Postprocess
+        if self.data["postprocess"]:
+            pred_inv = np.clip(X_mag - pred, 0, np.inf)
+            pred = spec_utils.mask_silence(pred, pred_inv)
+        y_spec_m = pred * X_phase
+        v_spec_m = X_spec_m - y_spec_m
+
+        if ins_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                )
+                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                    y_spec_m, self.mp, input_high_end_h, input_high_end_
+                )
+            else:
+                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+            print("%s instruments done" % name)
+            if format in ["wav", "flac"]:
+                sf.write(
+                    os.path.join(
+                        ins_root,
+                        "instrument_{}_{}.{}".format(name, self.data["agg"], format),
+                    ),
+                    (np.array(wav_instrument) * 32768).astype("int16"),
+                    self.mp.param["sr"],
+                )
+            else:
+                path = os.path.join(
+                    ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
+                )
+                sf.write(
+                    path,
+                    (np.array(wav_instrument) * 32768).astype("int16"),
+                    self.mp.param["sr"],
+                )
+                if os.path.exists(path):
+                    os.system(
+                        "ffmpeg -i %s -vn %s -q:a 2 -y"
+                        % (path, path[:-4] + ".%s" % format)
+                    )
+        if vocal_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+                )
+                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                    v_spec_m, self.mp, input_high_end_h, input_high_end_
+                )
+            else:
+                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+            print("%s vocals done" % name)
+            if format in ["wav", "flac"]:
+                sf.write(
+                    os.path.join(
+                        vocal_root,
+                        "vocal_{}_{}.{}".format(name, self.data["agg"], format),
+                    ),
+                    (np.array(wav_vocals) * 32768).astype("int16"),
+                    self.mp.param["sr"],
+                )
+            else:
+                path = os.path.join(
+                    vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
+                )
+                sf.write(
+                    path,
+                    (np.array(wav_vocals) * 32768).astype("int16"),
+                    self.mp.param["sr"],
+                )
+                if os.path.exists(path):
+                    os.system(
+                        "ffmpeg -i %s -vn %s -q:a 2 -y"
+                        % (path, path[:-4] + ".%s" % format)
+                    )
+
+
+# Variant for the newer DeEcho/DeReverb checkpoints (4band_v3 + CascadedNet).
+class _audio_pre_new:
+    def __init__(self, agg, model_path, device, is_half):
+        self.model_path = model_path
+        self.device = device
+        self.data = {
+            # Processing Options
+            "postprocess": False,
+            "tta": False,
+            # Constants
+            "window_size": 512,
+            "agg": agg,
+            "high_end_process": "mirroring",
+        }
+        mp = ModelParameters(
+            "/content/Mangio-RVC-Fork/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json"
+        )
+        nout = 64 if "DeReverb" in model_path else 48
+        model = CascadedNet(mp.param["bins"] * 2, nout)
+        cpk = torch.load(model_path, map_location="cpu")
+        model.load_state_dict(cpk)
+        model.eval()
+        if is_half:
+            model = model.half().to(device)
+        else:
+            model = model.to(device)
+
+        self.mp = mp
+        self.model = model
+
+    def _path_audio_(
+        self, music_file, vocal_root=None, ins_root=None, format="flac"
+    ):  # for the 3 VR models, the vocal and ins outputs are swapped
+        if ins_root is None and vocal_root is None:
+            return "No save root."
+        name = os.path.basename(music_file)
+        if ins_root is not None:
+            os.makedirs(ins_root, exist_ok=True)
+        if vocal_root is not None:
+            os.makedirs(vocal_root, exist_ok=True)
+        X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
+        bands_n = len(self.mp.param["band"])
+        # print(bands_n)
+        for d in range(bands_n, 0, -1):
+            bp = self.mp.param["band"][d]
+            if d == bands_n:  # high-end band
+                (
+                    X_wave[d],
+                    _,
+                ) = librosa.core.load(  # In theory librosa may misread some files; loading via ffmpeg would be safer, but it was judged too much hassle and dropped
+                    music_file,
+                    bp["sr"],
+                    False,
+                    dtype=np.float32,
+                    res_type=bp["res_type"],
+                )
+                if X_wave[d].ndim == 1:
+                    X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]])
+            else:  # lower bands
+                X_wave[d] = librosa.core.resample(
+                    X_wave[d + 1],
+                    self.mp.param["band"][d + 1]["sr"],
+                    bp["sr"],
+                    res_type=bp["res_type"],
+                )
+            # STFT of wave source
+            X_spec_s[d] = spec_utils.wave_to_spectrogram_mt(
+                X_wave[d],
+                bp["hl"],
+                bp["n_fft"],
+                self.mp.param["mid_side"],
+                self.mp.param["mid_side_b2"],
+                self.mp.param["reverse"],
+            )
+            # pdb.set_trace()
+            if d == bands_n and self.data["high_end_process"] != "none":
+                input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + (
+                    self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"]
+                )
+                input_high_end = X_spec_s[d][
+                    :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, :
+                ]
+
+        X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp)
+        aggressive_set = float(self.data["agg"] / 100)
+        aggressiveness = {
+            "value": aggressive_set,
+            "split_bin": self.mp.param["band"][1]["crop_stop"],
+        }
+        with torch.no_grad():
+            pred, X_mag, X_phase = inference(
+                X_spec_m, self.device, self.model, aggressiveness, self.data
+            )
+        # Postprocess
+        if self.data["postprocess"]:
+            pred_inv = np.clip(X_mag - pred, 0, np.inf)
+            pred = spec_utils.mask_silence(pred, pred_inv)
+        y_spec_m = pred * X_phase
+        v_spec_m = X_spec_m - y_spec_m
+
+        if ins_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], y_spec_m, input_high_end, self.mp
+                )
+                wav_instrument = spec_utils.cmb_spectrogram_to_wave(
+                    y_spec_m, self.mp, input_high_end_h, input_high_end_
+                )
+            else:
+                wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp)
+            print("%s instruments done" % name)
+            if format in ["wav", "flac"]:
+                sf.write(
+                    os.path.join(
+                        ins_root,
+                        "instrument_{}_{}.{}".format(name, self.data["agg"], format),
+                    ),
+                    (np.array(wav_instrument) * 32768).astype("int16"),
+                    self.mp.param["sr"],
+                )
+            else:
+                path = os.path.join(
+                    ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"])
+                )
+                sf.write(
+                    path,
+                    (np.array(wav_instrument) * 32768).astype("int16"),
+                    self.mp.param["sr"],
+                )
+                if os.path.exists(path):
+                    os.system(
+                        "ffmpeg -i %s -vn %s -q:a 2 -y"
+                        % (path, path[:-4] + ".%s" % format)
+                    )
+        if vocal_root is not None:
+            if self.data["high_end_process"].startswith("mirroring"):
+                input_high_end_ = spec_utils.mirroring(
+                    self.data["high_end_process"], v_spec_m, input_high_end, self.mp
+                )
+                wav_vocals = spec_utils.cmb_spectrogram_to_wave(
+                    v_spec_m, self.mp, input_high_end_h, input_high_end_
+                )
+            else:
+                wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp)
+            print("%s vocals done" % name)
+            if format in ["wav", "flac"]:
+                sf.write(
+                    os.path.join(
+                        vocal_root,
+                        "vocal_{}_{}.{}".format(name, self.data["agg"], format),
+                    ),
+                    (np.array(wav_vocals) * 32768).astype("int16"),
+                    self.mp.param["sr"],
+                )
+            else:
+                path = os.path.join(
+                    vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"])
+                )
+                sf.write(
+                    path,
+                    (np.array(wav_vocals) * 32768).astype("int16"),
+                    self.mp.param["sr"],
+                )
+                if os.path.exists(path):
+                    os.system(
+                        "ffmpeg -i %s -vn %s -q:a 2 -y"
+                        % (path, path[:-4] + ".%s" % format)
+                    )
+
+
+if __name__ == "__main__":
+    device = "cuda"
+    is_half = True
+    # model_path = "uvr5_weights/2_HP-UVR.pth"
+    model_path = "/content/Mangio-RVC-Fork/uvr5_weights/VR-DeEchoDeReverb.pth"
+    # model_path = "uvr5_weights/VR-DeEchoNormal.pth"
+    # model_path = "uvr5_weights/DeEchoNormal.pth"
+    # pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True, agg=10)
+    pre_fun = _audio_pre_new(model_path=model_path, device=device, is_half=True, agg=10)
+    audio_path = "/content/manioiii.mp3"
+    save_path = "/content/"
+    pre_fun._path_audio_(audio_path, save_path, save_path)
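
A minimal usage sketch, not part of the commit: it drives the uploaded module outside the Colab paths hard-coded in `__main__`. Only `_audio_pre_new` and the `_path_audio_` signature come from the file above; the weights path, input file, and output directory below are placeholder assumptions.

    # Hypothetical usage sketch; paths are placeholders, not from the commit.
    from infer_uvr5 import _audio_pre_new

    pre_fun = _audio_pre_new(
        agg=10,               # aggressiveness, 0-100; divided by 100 internally
        model_path="uvr5_weights/VR-DeEchoDeReverb.pth",  # assumed weights location
        device="cuda",        # the script assumes a CUDA device; use "cpu" otherwise
        is_half=False,        # half precision is only worthwhile on GPU
    )
    # Note the argument order in this class: vocal_root comes before ins_root,
    # and for these VR models the vocal/instrument outputs are swapped.
    pre_fun._path_audio_(
        "input.mp3",          # placeholder input file
        vocal_root="output",  # receives the "vocal_..." files
        ins_root="output",    # receives the "instrument_..." files
        format="wav",         # "wav"/"flac" write directly; other formats go through ffmpeg
    )

Passing format="wav" or "flac" avoids the os.system ffmpeg transcode branch, which shells out with unquoted paths and therefore breaks on filenames containing spaces.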