Update vc_infer_pipeline.py
Browse files
- vc_infer_pipeline.py +46 -38
vc_infer_pipeline.py
CHANGED
@@ -284,6 +284,7 @@ class VC(object):
|
|
284 |
protect,
|
285 |
f0_file=None,
|
286 |
):
|
|
|
287 |
if (
|
288 |
file_index != ""
|
289 |
and os.path.exists(file_index)
|
@@ -313,6 +314,7 @@ class VC(object):
|
|
313 |
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
|
314 |
)[0][0]
|
315 |
)
|
|
|
316 |
s = 0
|
317 |
audio_opt = []
|
318 |
t = None
|
@@ -350,17 +352,54 @@ class VC(object):
|
|
350 |
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
|
351 |
t2 = ttime()
|
352 |
times[1] += t2 - t1
|
353 |
-
for t in opt_ts:
|
354 |
t = t // self.window * self.window
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
355 |
if if_f0 == 1:
|
356 |
audio_opt.append(
|
357 |
self.vc(
|
358 |
model,
|
359 |
net_g,
|
360 |
sid,
|
361 |
-
audio_pad[
|
362 |
-
pitch[:,
|
363 |
-
pitchf[:,
|
364 |
times,
|
365 |
index,
|
366 |
big_npy,
|
@@ -375,7 +414,7 @@ class VC(object):
|
|
375 |
model,
|
376 |
net_g,
|
377 |
sid,
|
378 |
-
audio_pad[
|
379 |
None,
|
380 |
None,
|
381 |
times,
|
@@ -386,39 +425,8 @@ class VC(object):
|
|
386 |
protect,
|
387 |
)[self.t_pad_tgt : -self.t_pad_tgt]
|
388 |
)
|
389 |
-
|
390 |
-
|
391 |
-
if if_f0 == 1:
|
392 |
-
audio_piece = self.process_batch(
|
393 |
-
model,
|
394 |
-
net_g,
|
395 |
-
sid,
|
396 |
-
[audio_pad[t:]],
|
397 |
-
[pitch[:, t // self.window :]],
|
398 |
-
[pitchf[:, t // self.window :]],
|
399 |
-
times,
|
400 |
-
index,
|
401 |
-
big_npy,
|
402 |
-
index_rate,
|
403 |
-
version,
|
404 |
-
protect,
|
405 |
-
)[0]
|
406 |
-
else:
|
407 |
-
audio_piece = self.process_batch(
|
408 |
-
model,
|
409 |
-
net_g,
|
410 |
-
sid,
|
411 |
-
[audio_pad[t:]],
|
412 |
-
None,
|
413 |
-
None,
|
414 |
-
times,
|
415 |
-
index,
|
416 |
-
big_npy,
|
417 |
-
index_rate,
|
418 |
-
version,
|
419 |
-
protect,
|
420 |
-
)[0]
|
421 |
-
audio_opt.append(audio_piece[self.t_pad_tgt : -self.t_pad_tgt])
|
422 |
|
423 |
if not audio_opt:
|
424 |
raise ValueError("No audio segments were generated")
|
|
|
284 |
protect,
|
285 |
f0_file=None,
|
286 |
):
|
287 |
+
logger.info(f"Starting pipeline with audio shape: {audio.shape}")
|
288 |
if (
|
289 |
file_index != ""
|
290 |
and os.path.exists(file_index)
|
|
|
314 |
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
|
315 |
)[0][0]
|
316 |
)
|
317 |
+
logger.info(f"Number of opt_ts: {len(opt_ts)}")
|
318 |
s = 0
|
319 |
audio_opt = []
|
320 |
t = None
|
|
|
352 |
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
|
353 |
t2 = ttime()
|
354 |
times[1] += t2 - t1
|
355 |
+
for i, t in enumerate(opt_ts):
|
356 |
t = t // self.window * self.window
|
357 |
+
logger.info(f"Processing segment {i+1}/{len(opt_ts)}")
|
358 |
+
if if_f0 == 1:
|
359 |
+
segment = self.vc(
|
360 |
+
model,
|
361 |
+
net_g,
|
362 |
+
sid,
|
363 |
+
audio_pad[s : t + self.t_pad2 + self.window],
|
364 |
+
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
|
365 |
+
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
|
366 |
+
times,
|
367 |
+
index,
|
368 |
+
big_npy,
|
369 |
+
index_rate,
|
370 |
+
version,
|
371 |
+
protect,
|
372 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
373 |
+
else:
|
374 |
+
segment = self.vc(
|
375 |
+
model,
|
376 |
+
net_g,
|
377 |
+
sid,
|
378 |
+
audio_pad[s : t + self.t_pad2 + self.window],
|
379 |
+
None,
|
380 |
+
None,
|
381 |
+
times,
|
382 |
+
index,
|
383 |
+
big_npy,
|
384 |
+
index_rate,
|
385 |
+
version,
|
386 |
+
protect,
|
387 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
388 |
+
|
389 |
+
logger.info(f"Segment {i+1} shape: {segment.shape}")
|
390 |
+
audio_opt.append(segment)
|
391 |
+
s = t
|
392 |
+
if t is not None:
|
393 |
+
logger.info("Processing final segment")
|
394 |
if if_f0 == 1:
|
395 |
audio_opt.append(
|
396 |
self.vc(
|
397 |
model,
|
398 |
net_g,
|
399 |
sid,
|
400 |
+
audio_pad[t:],
|
401 |
+
pitch[:, t // self.window :],
|
402 |
+
pitchf[:, t // self.window :],
|
403 |
times,
|
404 |
index,
|
405 |
big_npy,
|
|
|
414 |
model,
|
415 |
net_g,
|
416 |
sid,
|
417 |
+
audio_pad[t:],
|
418 |
None,
|
419 |
None,
|
420 |
times,
|
|
|
425 |
protect,
|
426 |
)[self.t_pad_tgt : -self.t_pad_tgt]
|
427 |
)
|
428 |
+
|
429 |
+
logger.info(f"Number of audio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
430 |
|
431 |
if not audio_opt:
|
432 |
raise ValueError("No audio segments were generated")
|