Update vc_infer_pipeline.py
vc_infer_pipeline.py  +13 -140
@@ -263,154 +263,28 @@ class VC(object):
         times[2] += t2 - t1
         return audio1
 
-    def pipeline(
-        self,
-        model,
-        net_g,
-        sid,
-        audio,
-        input_audio_path,
-        times,
-        f0_up_key,
-        f0_method,
-        file_index,
-        index_rate,
-        if_f0,
-        filter_radius,
-        tgt_sr,
-        resample_sr,
-        rms_mix_rate,
-        version,
-        protect,
-        f0_file=None,
-    ):
-        if (
-            file_index != ""
-            and os.path.exists(file_index)
-            and index_rate != 0
-        ):
-            try:
-                index = faiss.read_index(file_index)
-                big_npy = index.reconstruct_n(0, index.ntotal)
-            except:
-                traceback.print_exc()
-                index = big_npy = None
-        else:
-            index = big_npy = None
-        audio = signal.filtfilt(bh, ah, audio)
-        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
-        opt_ts = []
-        if audio_pad.shape[0] > self.t_max:
-            audio_sum = np.zeros_like(audio)
-            for i in range(self.window):
-                audio_sum += audio_pad[i : i - self.window]
-            for t in range(self.t_center, audio.shape[0], self.t_center):
-                opt_ts.append(
-                    t
-                    - self.t_query
-                    + np.where(
-                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
-                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
-                    )[0][0]
-                )
-        s = 0
-        audio_opt = []
-        t = None
-        t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
-        p_len = audio_pad.shape[0] // self.window
-        inp_f0 = None
-        if hasattr(f0_file, "name"):
-            try:
-                with open(f0_file.name, "r") as f:
-                    lines = f.read().strip("\n").split("\n")
-                inp_f0 = []
-                for line in lines:
-                    inp_f0.append([float(i) for i in line.split(",")])
-                inp_f0 = np.array(inp_f0, dtype="float32")
-            except:
-                traceback.print_exc()
-        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
-        pitch, pitchf = None, None
-        if if_f0 == 1:
-            pitch, pitchf = self.get_f0(
-                input_audio_path,
-                audio_pad,
-                p_len,
-                f0_up_key,
-                f0_method,
-                filter_radius,
-                inp_f0,
-            )
-            pitch = pitch[:p_len]
-            pitchf = pitchf[:p_len]
-            if self.device == "mps":
-                pitchf = pitchf.astype(np.float32)
-            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
-            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
-        t2 = ttime()
-        times[1] += t2 - t1
-        for t in opt_ts:
-            t = t // self.window * self.window
-            if if_f0 == 1:
-                audio_opt.append(
-                    self.vc(
-                        model,
-                        net_g,
-                        sid,
-                        audio_pad[s : t + self.t_pad2 + self.window],
-                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
-                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
-                        times,
-                        index,
-                        big_npy,
-                        index_rate,
-                        version,
-                        protect,
-                    )[self.t_pad_tgt : -self.t_pad_tgt]
-                )
-            else:
-                audio_opt.append(
-                    self.vc(
-                        model,
-                        net_g,
-                        sid,
-                        audio_pad[s : t + self.t_pad2 + self.window],
-                        None,
-                        None,
-                        times,
-                        index,
-                        big_npy,
-                        index_rate,
-                        version,
-                        protect,
-                    )[self.t_pad_tgt : -self.t_pad_tgt]
-                )
-            s = t
-        if if_f0 == 1:
-            audio_opt.append(
-                self.vc(
-                    model,
-                    net_g,
-                    sid,
-                    audio_pad[t:],
-                    pitch[:, t // self.window :] if t is not None else pitch,
-                    pitchf[:, t // self.window :] if t is not None else pitchf,
-                    times,
-                    index,
-                    big_npy,
-                    index_rate,
-                    version,
-                    protect,
-                )[self.t_pad_tgt : -self.t_pad_tgt]
-            )
-        else:
-            audio_opt.append(
-                self.vc(
-                    model,
-                    net_g,
-                    sid,
-                    audio_pad[t:],
-                    None,
-                    None,
-                    times,
+        if t is not None:
+            if if_f0 == 1:
+                audio_piece = self.process_batch(
+                    model,
+                    net_g,
+                    sid,
+                    [audio_pad[t:]],
+                    [pitch[:, t // self.window :]],
+                    [pitchf[:, t // self.window :]],
+                    times,
+                    index,
+                    big_npy,
+                    index_rate,
+                    version,
+                    protect,
+                )[0]
+            else:
+                audio_piece = self.process_batch(
+                    model,
+                    net_g,
+                    sid,
+                    [audio_pad[t:]],
+                    None,
+                    None,
+                    times,
@@ -419,8 +293,9 @@ class VC(object):
                     index_rate,
                     version,
                     protect,
-                )[self.t_pad_tgt : -self.t_pad_tgt]
-            )
+                )[0]
+            audio_opt.append(audio_piece[self.t_pad_tgt : -self.t_pad_tgt])
+
         audio_opt = np.concatenate(audio_opt)
         if rms_mix_rate != 1:
             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
@@ -431,9 +306,7 @@ class VC(object):
         if audio_max > 1:
             max_int16 /= audio_max
         audio_opt = (audio_opt * max_int16).astype(np.int16)
-        del pitch, pitchf, sid
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+
         return audio_opt
 
     def parallel_pipeline(self, tasks):
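Note on the new call sites: the hunks above route the tail segment through self.process_batch(...), passing single-element lists ([audio_pad[t:]], [pitch[:, t // self.window :]], [pitchf[:, t // self.window :]]) and taking element [0] of the result. The process_batch method itself lies outside the changed lines, so the following is only a minimal sketch of the shape such a batch wrapper could take, assuming it maps the pre-existing self.vc over parallel lists of segments; the process_batch actually added by this commit is not shown here and may differ.

    # Hypothetical sketch only -- the real process_batch is not in this diff.
    # Assumes self.vc keeps its original signature from vc_infer_pipeline.py.
    def process_batch(
        self,
        model,
        net_g,
        sid,
        audio_segments,  # list of 1-D numpy arrays
        pitches,         # list of pitch tensors, or None when no f0 is used
        pitchfs,         # list of pitchf tensors, or None when no f0 is used
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):
        # Run each segment through the existing single-segment converter and
        # collect the results, so a caller can submit one or many segments.
        outputs = []
        for i, segment in enumerate(audio_segments):
            pitch = pitches[i] if pitches is not None else None
            pitchf = pitchfs[i] if pitchfs is not None else None
            outputs.append(
                self.vc(
                    model,
                    net_g,
                    sid,
                    segment,
                    pitch,
                    pitchf,
                    times,
                    index,
                    big_npy,
                    index_rate,
                    version,
                    protect,
                )
            )
        return outputs  # the call sites above take [0] for a one-segment batch

Under that assumption, self.process_batch(model, net_g, sid, [audio_pad[t:]], None, None, ...)[0] reproduces the old single self.vc(..., audio_pad[t:], None, None, ...) call that the hunk removes.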