MAZALA2024 committed · Commit d8978c2 · verified · 1 Parent(s): 21e20b7

Update vc_infer_pipeline.py

Files changed (1)
  1. vc_infer_pipeline.py +13 -140
vc_infer_pipeline.py CHANGED

@@ -263,154 +263,28 @@ class VC(object):
         times[2] += t2 - t1
         return audio1
 
-    def pipeline(
-        self,
-        model,
-        net_g,
-        sid,
-        audio,
-        input_audio_path,
-        times,
-        f0_up_key,
-        f0_method,
-        file_index,
-        index_rate,
-        if_f0,
-        filter_radius,
-        tgt_sr,
-        resample_sr,
-        rms_mix_rate,
-        version,
-        protect,
-        f0_file=None,
-    ):
-        if (
-            file_index != ""
-            and os.path.exists(file_index)
-            and index_rate != 0
-        ):
-            try:
-                index = faiss.read_index(file_index)
-                big_npy = index.reconstruct_n(0, index.ntotal)
-            except:
-                traceback.print_exc()
-                index = big_npy = None
-        else:
-            index = big_npy = None
-        audio = signal.filtfilt(bh, ah, audio)
-        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
-        opt_ts = []
-        if audio_pad.shape[0] > self.t_max:
-            audio_sum = np.zeros_like(audio)
-            for i in range(self.window):
-                audio_sum += audio_pad[i : i - self.window]
-            for t in range(self.t_center, audio.shape[0], self.t_center):
-                opt_ts.append(
-                    t
-                    - self.t_query
-                    + np.where(
-                        np.abs(audio_sum[t - self.t_query : t + self.t_query])
-                        == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
-                    )[0][0]
-                )
-        s = 0
-        audio_opt = []
-        t = None
-        t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
-        p_len = audio_pad.shape[0] // self.window
-        inp_f0 = None
-        if hasattr(f0_file, "name"):
-            try:
-                with open(f0_file.name, "r") as f:
-                    lines = f.read().strip("\n").split("\n")
-                inp_f0 = []
-                for line in lines:
-                    inp_f0.append([float(i) for i in line.split(",")])
-                inp_f0 = np.array(inp_f0, dtype="float32")
-            except:
-                traceback.print_exc()
-        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
-        pitch, pitchf = None, None
-        if if_f0 == 1:
-            pitch, pitchf = self.get_f0(
-                input_audio_path,
-                audio_pad,
-                p_len,
-                f0_up_key,
-                f0_method,
-                filter_radius,
-                inp_f0,
-            )
-            pitch = pitch[:p_len]
-            pitchf = pitchf[:p_len]
-            if self.device == "mps":
-                pitchf = pitchf.astype(np.float32)
-            pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
-            pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
-        t2 = ttime()
-        times[1] += t2 - t1
-        for t in opt_ts:
-            t = t // self.window * self.window
+        if t is not None:
             if if_f0 == 1:
-                audio_opt.append(
-                    self.vc(
-                        model,
-                        net_g,
-                        sid,
-                        audio_pad[s : t + self.t_pad2 + self.window],
-                        pitch[:, s // self.window : (t + self.t_pad2) // self.window],
-                        pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
-                        times,
-                        index,
-                        big_npy,
-                        index_rate,
-                        version,
-                        protect,
-                    )[self.t_pad_tgt : -self.t_pad_tgt]
-                )
-            else:
-                audio_opt.append(
-                    self.vc(
-                        model,
-                        net_g,
-                        sid,
-                        audio_pad[s : t + self.t_pad2 + self.window],
-                        None,
-                        None,
-                        times,
-                        index,
-                        big_npy,
-                        index_rate,
-                        version,
-                        protect,
-                    )[self.t_pad_tgt : -self.t_pad_tgt]
-                )
-            s = t
-        if if_f0 == 1:
-            audio_opt.append(
-                self.vc(
+                audio_piece = self.process_batch(
                     model,
                     net_g,
                     sid,
-                    audio_pad[t:],
-                    pitch[:, t // self.window :] if t is not None else pitch,
-                    pitchf[:, t // self.window :] if t is not None else pitchf,
+                    [audio_pad[t:]],
+                    [pitch[:, t // self.window :]],
+                    [pitchf[:, t // self.window :]],
                     times,
                     index,
                     big_npy,
                     index_rate,
                     version,
                     protect,
-                )[self.t_pad_tgt : -self.t_pad_tgt]
-            )
-        else:
-            audio_opt.append(
-                self.vc(
+                )[0]
+            else:
+                audio_piece = self.process_batch(
                     model,
                     net_g,
                     sid,
-                    audio_pad[t:],
+                    [audio_pad[t:]],
                     None,
                     None,
                     times,
@@ -419,8 +293,9 @@ class VC(object):
                     index_rate,
                     version,
                     protect,
-                )[self.t_pad_tgt : -self.t_pad_tgt]
-            )
+                )[0]
+            audio_opt.append(audio_piece[self.t_pad_tgt : -self.t_pad_tgt])
+
         audio_opt = np.concatenate(audio_opt)
         if rms_mix_rate != 1:
             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
@@ -431,9 +306,7 @@ class VC(object):
         if audio_max > 1:
             max_int16 /= audio_max
         audio_opt = (audio_opt * max_int16).astype(np.int16)
-        del pitch, pitchf, sid
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
+
         return audio_opt
 
     def parallel_pipeline(self, tasks):
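
A note on the new call sites: process_batch is referenced but not defined anywhere in this hunk, so its contract here is an inference, not something the diff confirms. Judging purely from how it is invoked (single-element lists of audio slices plus matching pitch/pitchf lists or None, with the result unwrapped by [0]), a minimal sketch of such a helper might look like the following, where self.vc is the per-segment converter already defined earlier in this file:

    # Hypothetical sketch -- process_batch is NOT shown in this commit's hunks;
    # the signature below is inferred from the call sites in the new code and
    # would live on the VC class alongside pipeline/parallel_pipeline.
    def process_batch(
        self,
        model,
        net_g,
        sid,
        audio_segments,  # list of padded audio slices (np.ndarray)
        pitches,         # list of pitch tensors, or None when if_f0 == 0
        pitchfs,         # list of pitchf tensors, or None when if_f0 == 0
        times,
        index,
        big_npy,
        index_rate,
        version,
        protect,
    ):
        # Run the existing per-segment converter self.vc over each slice and
        # return the converted segments as a list, which is why the call
        # sites above unwrap the single-segment case with [0].
        results = []
        for i, segment in enumerate(audio_segments):
            pitch = pitches[i] if pitches is not None else None
            pitchf = pitchfs[i] if pitchfs is not None else None
            results.append(
                self.vc(
                    model, net_g, sid, segment, pitch, pitchf,
                    times, index, big_npy, index_rate, version, protect,
                )
            )
        return results

Passing segments as lists keeps a single code path for each converted piece, and would let a batch-oriented caller such as parallel_pipeline (visible at the end of the last hunk) hand over several segments at once.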