justyoung committed on
Commit fb5b598
1 Parent(s): 575c2f6

Update infer/modules/vc/pipeline.py

Files changed (1):
  infer/modules/vc/pipeline.py  +274 -121
infer/modules/vc/pipeline.py CHANGED
@@ -1,26 +1,24 @@
- import os
- import sys
- import traceback
- import logging

- logger = logging.getLogger(__name__)

- from functools import lru_cache
- from time import time as ttime

- import faiss
- import librosa
- import numpy as np
- import parselmouth
- import pyworld
- import torch
  import torch.nn.functional as F
  import torchcrepe
  from scipy import signal

  now_dir = os.getcwd()
  sys.path.append(now_dir)

  bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

  input_audio_path2wav = {}
@@ -40,21 +38,22 @@ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
      return f0


- def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 the output audio; rate is the weight of 2
      # print(data1.max(),data2.max())
-     rms1 = librosa.feature.rms(
-         y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2
-     )  # one point every half second
      rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
      rms1 = torch.from_numpy(rms1)
      rms1 = F.interpolate(
          rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
      ).squeeze()
      rms2 = torch.from_numpy(rms2)
      rms2 = F.interpolate(
          rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
      ).squeeze()
      rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
      data2 *= (
          torch.pow(rms1, torch.tensor(1 - rate))
          * torch.pow(rms2, torch.tensor(rate - 1))
@@ -62,7 +61,7 @@ def change_rms(data1, sr1, data2, sr2, rate):  # 1 is input audio, 2 is output
      return data2


- class Pipeline(object):
      def __init__(self, tgt_sr, config):
          self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
              config.x_pad,
@@ -71,15 +70,181 @@ class Pipeline(object):
              config.x_max,
              config.is_half,
          )
-         self.sr = 16000  # hubert input sampling rate
-         self.window = 160  # samples per frame
-         self.t_pad = self.sr * self.x_pad  # padding duration before and after each segment
          self.t_pad_tgt = tgt_sr * self.x_pad
          self.t_pad2 = self.t_pad * 2
-         self.t_query = self.sr * self.x_query  # query window before and after each cut point
-         self.t_center = self.sr * self.x_center  # spacing between cut-point queries
-         self.t_max = self.sr * self.x_max  # duration threshold below which no cut-point search is done
          self.device = config.device

      def get_f0(
          self,
@@ -89,6 +254,8 @@ class Pipeline(object):
          f0_up_key,
          f0_method,
          filter_radius,
          inp_f0=None,
      ):
          global input_audio_path2wav
@@ -116,69 +283,63 @@ class Pipeline(object):
          elif f0_method == "harvest":
              input_audio_path2wav[input_audio_path] = x.astype(np.double)
              f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
-             if filter_radius > 2:
                  f0 = signal.medfilt(f0, 3)
          elif f0_method == "crepe":
-             model = "full"
-             # Pick a batch size that doesn't cause memory errors on your gpu
-             batch_size = 512
-             # Compute pitch using first gpu
-             audio = torch.tensor(np.copy(x))[None].float()
-             f0, pd = torchcrepe.predict(
-                 audio,
-                 self.sr,
-                 self.window,
-                 f0_min,
-                 f0_max,
-                 model,
-                 batch_size=batch_size,
-                 device=self.device,
-                 return_periodicity=True,
              )
-             pd = torchcrepe.filter.median(pd, 3)
-             f0 = torchcrepe.filter.mean(f0, 3)
-             f0[pd < 0.1] = 0
-             f0 = f0[0].cpu().numpy()
          elif f0_method == "rmvpe":
-             if not hasattr(self, "model_rmvpe"):
-                 from infer.lib.rmvpe import RMVPE

-                 logger.info(
-                     "Loading rmvpe model,%s" % "%s/rmvpe.pt" % os.environ["rmvpe_root"]
-                 )
              self.model_rmvpe = RMVPE(
-                 "%s/rmvpe.pt" % os.environ["rmvpe_root"],
-                 is_half=self.is_half,
-                 device=self.device,
-                 # use_jit=self.config.use_jit,
              )
              f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
-
-             if "privateuseone" in str(self.device):  # clean ortruntime memory
-                 del self.model_rmvpe.model
-                 del self.model_rmvpe
-                 logger.info("Cleaning ortruntime memory")
          elif f0_method == "fcpe":
-             if not hasattr(self, "model_fcpe"):
-                 from torchfcpe import spawn_bundled_infer_model
-
-                 logger.info("Loading fcpe model")
-                 self.model_fcpe = spawn_bundled_infer_model(self.device)
-             f0 = (
-                 self.model_fcpe.infer(
-                     torch.from_numpy(x).to(self.device).unsqueeze(0).float(),
-                     sr=16000,
-                     decoder_mode="local_argmax",
-                     threshold=0.006,
-                 )
-                 .squeeze()
-                 .cpu()
-                 .numpy()
              )

          f0 *= pow(2, f0_up_key / 12)
-         # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
-         tf0 = self.sr // self.window  # number of f0 points per second
          if inp_f0 is not None:
              delta_t = np.round(
                  (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
@@ -190,7 +351,6 @@ class Pipeline(object):
              f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                  :shape
              ]
-         # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
          f0bak = f0.copy()
          f0_mel = 1127 * np.log(1 + f0 / 700)
          f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
@@ -198,8 +358,9 @@ class Pipeline(object):
          ) + 1
          f0_mel[f0_mel <= 1] = 1
          f0_mel[f0_mel > 255] = 255
-         f0_coarse = np.rint(f0_mel).astype(np.int32)
-         return f0_coarse, f0bak  # 1-0

      def vc(
          self,
@@ -209,19 +370,18 @@ class Pipeline(object):
          audio0,
          pitch,
          pitchf,
-         times,
          index,
          big_npy,
          index_rate,
          version,
          protect,
-     ):  # ,file_index,file_big_npy
          feats = torch.from_numpy(audio0)
          if self.is_half:
              feats = feats.half()
          else:
              feats = feats.float()
-         if feats.dim() == 2:  # double channels
              feats = feats.mean(-1)
          assert feats.dim() == 1, feats.dim()
          feats = feats.view(1, -1)
@@ -236,20 +396,17 @@ class Pipeline(object):
          with torch.no_grad():
              logits = model.extract_features(**inputs)
              feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
-         if protect < 0.5 and pitch is not None and pitchf is not None:
              feats0 = feats.clone()
          if (
-             not isinstance(index, type(None))
-             and not isinstance(big_npy, type(None))
              and index_rate != 0
          ):
              npy = feats[0].cpu().numpy()
              if self.is_half:
                  npy = npy.astype("float32")

-             # _, I = index.search(npy, 1)
-             # npy = big_npy[I.squeeze()]
-
              score, ix = index.search(npy, k=8)
              weight = np.square(1 / score)
              weight /= weight.sum(axis=1, keepdims=True)
@@ -263,7 +420,7 @@ class Pipeline(object):
              )

          feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-         if protect < 0.5 and pitch is not None and pitchf is not None:
              feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                  0, 2, 1
              )
@@ -271,11 +428,11 @@ class Pipeline(object):
          p_len = audio0.shape[0] // self.window
          if feats.shape[1] < p_len:
              p_len = feats.shape[1]
-         if pitch is not None and pitchf is not None:
              pitch = pitch[:, :p_len]
              pitchf = pitchf[:, :p_len]

-         if protect < 0.5 and pitch is not None and pitchf is not None:
              pitchff = pitchf.clone()
              pitchff[pitchf > 0] = 1
              pitchff[pitchf < 1] = protect
@@ -284,16 +441,21 @@ class Pipeline(object):
          feats = feats.to(feats0.dtype)
          p_len = torch.tensor([p_len], device=self.device).long()
          with torch.no_grad():
-             hasp = pitch is not None and pitchf is not None
-             arg = (feats, p_len, pitch, pitchf, sid) if hasp else (feats, p_len, sid)
-             audio1 = (net_g.infer(*arg)[0][0, 0]).data.cpu().float().numpy()
-             del hasp, arg
          del feats, p_len, padding_mask
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
          t2 = ttime()
-         times[0] += t1 - t0
-         times[2] += t2 - t1
          return audio1

      def pipeline(
@@ -303,7 +465,6 @@ class Pipeline(object):
          sid,
          audio,
          input_audio_path,
-         times,
          f0_up_key,
          f0_method,
          file_index,
@@ -315,21 +476,16 @@ class Pipeline(object):
          rms_mix_rate,
          version,
          protect,
          f0_file=None,
      ):
-         if (
-             file_index != ""
-             # and file_big_npy != ""
-             # and os.path.exists(file_big_npy) == True
-             and os.path.exists(file_index)
-             and index_rate != 0
-         ):
              try:
                  index = faiss.read_index(file_index)
-                 # big_npy = np.load(file_big_npy)
                  big_npy = index.reconstruct_n(0, index.ntotal)
-             except:
-                 traceback.print_exc()
                  index = big_npy = None
          else:
              index = big_npy = None
@@ -339,14 +495,14 @@ class Pipeline(object):
          if audio_pad.shape[0] > self.t_max:
              audio_sum = np.zeros_like(audio)
              for i in range(self.window):
-                 audio_sum += np.abs(audio_pad[i : i - self.window])
              for t in range(self.t_center, audio.shape[0], self.t_center):
                  opt_ts.append(
                      t
                      - self.t_query
                      + np.where(
-                         audio_sum[t - self.t_query : t + self.t_query]
-                         == audio_sum[t - self.t_query : t + self.t_query].min()
                      )[0][0]
                  )
          s = 0
@@ -356,7 +512,7 @@ class Pipeline(object):
          audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
          p_len = audio_pad.shape[0] // self.window
          inp_f0 = None
-         if hasattr(f0_file, "name"):
              try:
                  with open(f0_file.name, "r") as f:
                      lines = f.read().strip("\n").split("\n")
@@ -364,8 +520,8 @@ class Pipeline(object):
                  for line in lines:
                      inp_f0.append([float(i) for i in line.split(",")])
                  inp_f0 = np.array(inp_f0, dtype="float32")
-             except:
-                 traceback.print_exc()
          sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
          pitch, pitchf = None, None
          if if_f0 == 1:
@@ -376,16 +532,17 @@ class Pipeline(object):
              f0_up_key,
              f0_method,
              filter_radius,
              inp_f0,
          )
          pitch = pitch[:p_len]
          pitchf = pitchf[:p_len]
-         if "mps" not in str(self.device) or "xpu" not in str(self.device):
              pitchf = pitchf.astype(np.float32)
          pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
          pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
          t2 = ttime()
-         times[1] += t2 - t1
          for t in opt_ts:
              t = t // self.window * self.window
              if if_f0 == 1:
@@ -397,7 +554,6 @@ class Pipeline(object):
                          audio_pad[s : t + self.t_pad2 + self.window],
                          pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                          pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
-                         times,
                          index,
                          big_npy,
                          index_rate,
@@ -414,7 +570,6 @@ class Pipeline(object):
                          audio_pad[s : t + self.t_pad2 + self.window],
                          None,
                          None,
-                         times,
                          index,
                          big_npy,
                          index_rate,
@@ -432,7 +587,6 @@ class Pipeline(object):
                      audio_pad[t:],
                      pitch[:, t // self.window :] if t is not None else pitch,
                      pitchf[:, t // self.window :] if t is not None else pitchf,
-                     times,
                      index,
                      big_npy,
                      index_rate,
@@ -449,7 +603,6 @@ class Pipeline(object):
                      audio_pad[t:],
                      None,
                      None,
-                     times,
                      index,
                      big_npy,
                      index_rate,
@@ -460,7 +613,7 @@ class Pipeline(object):
          audio_opt = np.concatenate(audio_opt)
          if rms_mix_rate != 1:
              audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
-         if tgt_sr != resample_sr >= 16000:
              audio_opt = librosa.resample(
                  audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
              )
@@ -472,4 +625,4 @@ class Pipeline(object):
          del pitch, pitchf, sid
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
-         return audio_opt

+ import numpy as np, parselmouth, torch, pdb, sys, os
+ from time import time as ttime
  import torch.nn.functional as F
  import torchcrepe
+ from torch import Tensor
+ import scipy.signal as signal
+ import pyworld, os, faiss, librosa, torchcrepe
  from scipy import signal
+ from functools import lru_cache
+ import random
+ import gc
+ import re

  now_dir = os.getcwd()
  sys.path.append(now_dir)

+ from infer.modules.FCPEF0Predictor import FCPEF0Predictor
+
  bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)

  input_audio_path2wav = {}

      return f0


+ def change_rms(data1, sr1, data2, sr2, rate):
      # print(data1.max(),data2.max())
+     rms1 = librosa.feature.rms(y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2)
      rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+
      rms1 = torch.from_numpy(rms1)
      rms1 = F.interpolate(
          rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
      ).squeeze()
+
      rms2 = torch.from_numpy(rms2)
      rms2 = F.interpolate(
          rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
      ).squeeze()
      rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+
      data2 *= (
          torch.pow(rms1, torch.tensor(1 - rate))
          * torch.pow(rms2, torch.tensor(rate - 1))

      return data2


+ class VC(object):
      def __init__(self, tgt_sr, config):
          self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
              config.x_pad,

              config.x_max,
              config.is_half,
          )
+         self.sr = 16000
+         self.window = 160
+         self.t_pad = self.sr * self.x_pad
          self.t_pad_tgt = tgt_sr * self.x_pad
          self.t_pad2 = self.t_pad * 2
+         self.t_query = self.sr * self.x_query
+         self.t_center = self.sr * self.x_center
+         self.t_max = self.sr * self.x_max
          self.device = config.device
+         self.ref_freqs = [
+             65.41,
+             82.41,
+             110.00,
+             146.83,
+             196.00,
+             246.94,
+             329.63,
+             440.00,
+             587.33,
+             783.99,
+             1046.50,
+         ]
+         # Generate interpolated frequencies
+         self.note_dict = self.generate_interpolated_frequencies()
+
+     def generate_interpolated_frequencies(self):
+         # Generate interpolated frequencies based on the reference frequencies.
+         note_dict = []
+         for i in range(len(self.ref_freqs) - 1):
+             freq_low = self.ref_freqs[i]
+             freq_high = self.ref_freqs[i + 1]
+             # Interpolate between adjacent reference frequencies
+             interpolated_freqs = np.linspace(
+                 freq_low, freq_high, num=10, endpoint=False
+             )
+             note_dict.extend(interpolated_freqs)
+         # Add the last reference frequency
+         note_dict.append(self.ref_freqs[-1])
+         return note_dict
+
+     def autotune_f0(self, f0):
+         # Autotunes the given fundamental frequency (f0) to the nearest musical note.
+         autotuned_f0 = np.zeros_like(f0)
+         for i, freq in enumerate(f0):
+             # Find the closest note
+             closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
+             autotuned_f0[i] = closest_note
+         return autotuned_f0
+
+     def get_optimal_torch_device(self, index: int = 0) -> torch.device:
+         if torch.cuda.is_available():
+             return torch.device(f"cuda:{index % torch.cuda.device_count()}")
+         elif torch.backends.mps.is_available():
+             return torch.device("mps")
+         return torch.device("cpu")
+
+     def get_f0_crepe_computation(
+         self,
+         x,
+         f0_min,
+         f0_max,
+         p_len,
+         hop_length,
+         model="full",
+     ):
+         x = x.astype(np.float32)
+         x /= np.quantile(np.abs(x), 0.999)
+         torch_device = self.get_optimal_torch_device()
+         audio = torch.from_numpy(x).to(torch_device, copy=True)
+         audio = torch.unsqueeze(audio, dim=0)
+         if audio.ndim == 2 and audio.shape[0] > 1:
+             audio = torch.mean(audio, dim=0, keepdim=True).detach()
+         audio = audio.detach()
+         pitch: Tensor = torchcrepe.predict(
+             audio,
+             self.sr,
+             hop_length,
+             f0_min,
+             f0_max,
+             model,
+             batch_size=hop_length * 2,
+             device=torch_device,
+             pad=True,
+         )
+         p_len = p_len or x.shape[0] // hop_length
+         source = np.array(pitch.squeeze(0).cpu().float().numpy())
+         source[source < 0.001] = np.nan
+         target = np.interp(
+             np.arange(0, len(source) * p_len, len(source)) / p_len,
+             np.arange(0, len(source)),
+             source,
+         )
+         f0 = np.nan_to_num(target)
+         return f0
+
+     def get_f0_official_crepe_computation(
+         self,
+         x,
+         f0_min,
+         f0_max,
+         model="full",
+     ):
+         batch_size = 512
+         audio = torch.tensor(np.copy(x))[None].float()
+         f0, pd = torchcrepe.predict(
+             audio,
+             self.sr,
+             self.window,
+             f0_min,
+             f0_max,
+             model,
+             batch_size=batch_size,
+             device=self.device,
+             return_periodicity=True,
+         )
+         pd = torchcrepe.filter.median(pd, 3)
+         f0 = torchcrepe.filter.mean(f0, 3)
+         f0[pd < 0.1] = 0
+         f0 = f0[0].cpu().numpy()
+         return f0
+
+     def get_f0_hybrid_computation(
+         self,
+         methods_str,
+         x,
+         f0_min,
+         f0_max,
+         p_len,
+         hop_length,
+     ):
+         methods_str = re.search("hybrid\[(.+)\]", methods_str)
+         if methods_str:
+             methods = [method.strip() for method in methods_str.group(1).split("+")]
+         f0_computation_stack = []
+         print(f"Calculating f0 pitch estimations for methods {str(methods)}")
+         x = x.astype(np.float32)
+         x /= np.quantile(np.abs(x), 0.999)
+         for method in methods:
+             f0 = None
+             if method == "crepe":
+                 f0 = self.get_f0_crepe_computation(
+                     x, f0_min, f0_max, p_len, int(hop_length)
+                 )
+             elif method == "rmvpe":
+                 if hasattr(self, "model_rmvpe") == False:
+                     from rvc.lib.rmvpe import RMVPE
+
+                     self.model_rmvpe = RMVPE(
+                         "rmvpe.pt", is_half=self.is_half, device=self.device
+                     )
+                 f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+                 f0 = f0[1:]
+             elif method == "fcpe":
+                 self.model_fcpe = FCPEF0Predictor(
+                     "fcpe.pt",
+                     f0_min=int(f0_min),
+                     f0_max=int(f0_max),
+                     dtype=torch.float32,
+                     device=self.device,
+                     sampling_rate=self.sr,
+                     threshold=0.03,
+                 )
+                 f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
+                 del self.model_fcpe
+                 gc.collect()
+             f0_computation_stack.append(f0)
+
+         print(f"Calculating hybrid median f0 from the stack of {str(methods)}")
+         f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
+         f0_median_hybrid = None
+         if len(f0_computation_stack) == 1:
+             f0_median_hybrid = f0_computation_stack[0]
+         else:
+             f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
+         return f0_median_hybrid

      def get_f0(
          self,

          f0_up_key,
          f0_method,
          filter_radius,
+         hop_length,
+         f0autotune,
          inp_f0=None,
      ):
          global input_audio_path2wav

          elif f0_method == "harvest":
              input_audio_path2wav[input_audio_path] = x.astype(np.double)
              f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+             if int(filter_radius) > 2:
                  f0 = signal.medfilt(f0, 3)
+         elif f0_method == "dio":
+             f0, t = pyworld.dio(
+                 x.astype(np.double),
+                 fs=self.sr,
+                 f0_ceil=f0_max,
+                 f0_floor=f0_min,
+                 frame_period=10,
+             )
+             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+             f0 = signal.medfilt(f0, 3)
          elif f0_method == "crepe":
+             f0 = self.get_f0_crepe_computation(
+                 x, f0_min, f0_max, p_len, int(hop_length)
+             )
+         elif f0_method == "crepe-tiny":
+             f0 = self.get_f0_crepe_computation(
+                 x, f0_min, f0_max, p_len, int(hop_length), "tiny"
              )
          elif f0_method == "rmvpe":
+             if hasattr(self, "model_rmvpe") == False:
+                 from rvc.lib.rmvpe import RMVPE

              self.model_rmvpe = RMVPE(
+                 "rmvpe.pt", is_half=self.is_half, device=self.device
              )
              f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
          elif f0_method == "fcpe":
+             self.model_fcpe = FCPEF0Predictor(
+                 "fcpe.pt",
+                 f0_min=int(f0_min),
+                 f0_max=int(f0_max),
+                 dtype=torch.float32,
+                 device=self.device,
+                 sampling_rate=self.sr,
+                 threshold=0.03,
+             )
+             f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
+             del self.model_fcpe
+             gc.collect()
+         elif "hybrid" in f0_method:
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = self.get_f0_hybrid_computation(
+                 f0_method,
+                 x,
+                 f0_min,
+                 f0_max,
+                 p_len,
+                 hop_length,
              )

+         if f0autotune == "True":
+             f0 = self.autotune_f0(f0)
+
          f0 *= pow(2, f0_up_key / 12)
+         tf0 = self.sr // self.window
          if inp_f0 is not None:
              delta_t = np.round(
                  (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1

              f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
                  :shape
              ]
          f0bak = f0.copy()
          f0_mel = 1127 * np.log(1 + f0 / 700)
          f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (

          ) + 1
          f0_mel[f0_mel <= 1] = 1
          f0_mel[f0_mel > 255] = 255
+         f0_coarse = np.rint(f0_mel).astype(np.int)
+
+         return f0_coarse, f0bak

      def vc(
          self,

          audio0,
          pitch,
          pitchf,
          index,
          big_npy,
          index_rate,
          version,
          protect,
+     ):
          feats = torch.from_numpy(audio0)
          if self.is_half:
              feats = feats.half()
          else:
              feats = feats.float()
+         if feats.dim() == 2:
              feats = feats.mean(-1)
          assert feats.dim() == 1, feats.dim()
          feats = feats.view(1, -1)

          with torch.no_grad():
              logits = model.extract_features(**inputs)
              feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+         if protect < 0.5 and pitch != None and pitchf != None:
              feats0 = feats.clone()
          if (
+             isinstance(index, type(None)) == False
+             and isinstance(big_npy, type(None)) == False
              and index_rate != 0
          ):
              npy = feats[0].cpu().numpy()
              if self.is_half:
                  npy = npy.astype("float32")

              score, ix = index.search(npy, k=8)
              weight = np.square(1 / score)
              weight /= weight.sum(axis=1, keepdims=True)

              )

          feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+         if protect < 0.5 and pitch != None and pitchf != None:
              feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
                  0, 2, 1
              )

          p_len = audio0.shape[0] // self.window
          if feats.shape[1] < p_len:
              p_len = feats.shape[1]
+         if pitch != None and pitchf != None:
              pitch = pitch[:, :p_len]
              pitchf = pitchf[:, :p_len]

+         if protect < 0.5 and pitch != None and pitchf != None:
              pitchff = pitchf.clone()
              pitchff[pitchf > 0] = 1
              pitchff[pitchf < 1] = protect

          feats = feats.to(feats0.dtype)
          p_len = torch.tensor([p_len], device=self.device).long()
          with torch.no_grad():
+             if pitch != None and pitchf != None:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                 )
+             else:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
+                 )
          del feats, p_len, padding_mask
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
          t2 = ttime()
          return audio1

      def pipeline(

          sid,
          audio,
          input_audio_path,
          f0_up_key,
          f0_method,
          file_index,

          rms_mix_rate,
          version,
          protect,
+         hop_length,
+         f0autotune,
          f0_file=None,
      ):
+         if file_index != "" and os.path.exists(file_index) == True and index_rate != 0:
              try:
                  index = faiss.read_index(file_index)
                  big_npy = index.reconstruct_n(0, index.ntotal)
+             except Exception as error:
+                 print(error)
                  index = big_npy = None
          else:
              index = big_npy = None

          if audio_pad.shape[0] > self.t_max:
              audio_sum = np.zeros_like(audio)
              for i in range(self.window):
+                 audio_sum += audio_pad[i : i - self.window]
              for t in range(self.t_center, audio.shape[0], self.t_center):
                  opt_ts.append(
                      t
                      - self.t_query
                      + np.where(
+                         np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                      )[0][0]
                  )
          s = 0

          audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
          p_len = audio_pad.shape[0] // self.window
          inp_f0 = None
+         if hasattr(f0_file, "name") == True:
              try:
                  with open(f0_file.name, "r") as f:
                      lines = f.read().strip("\n").split("\n")

                  for line in lines:
                      inp_f0.append([float(i) for i in line.split(",")])
                  inp_f0 = np.array(inp_f0, dtype="float32")
+             except Exception as error:
+                 print(error)
          sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
          pitch, pitchf = None, None
          if if_f0 == 1:

              f0_up_key,
              f0_method,
              filter_radius,
+             hop_length,
+             f0autotune,
              inp_f0,
          )
          pitch = pitch[:p_len]
          pitchf = pitchf[:p_len]
+         if self.device == "mps":
              pitchf = pitchf.astype(np.float32)
          pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
          pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
          t2 = ttime()
          for t in opt_ts:
              t = t // self.window * self.window
              if if_f0 == 1:

                      audio_pad[s : t + self.t_pad2 + self.window],
                      pitch[:, s // self.window : (t + self.t_pad2) // self.window],
                      pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
                      index,
                      big_npy,
                      index_rate,

                      audio_pad[s : t + self.t_pad2 + self.window],
                      None,
                      None,
                      index,
                      big_npy,
                      index_rate,

                  audio_pad[t:],
                  pitch[:, t // self.window :] if t is not None else pitch,
                  pitchf[:, t // self.window :] if t is not None else pitchf,
                  index,
                  big_npy,
                  index_rate,

                  audio_pad[t:],
                  None,
                  None,
                  index,
                  big_npy,
                  index_rate,

          audio_opt = np.concatenate(audio_opt)
          if rms_mix_rate != 1:
              audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
              audio_opt = librosa.resample(
                  audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
              )

          del pitch, pitchf, sid
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
+         return audio_opt
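
The added autotune path (generate_interpolated_frequencies / autotune_f0 above) snaps each f0 value to the nearest entry in a table interpolated from a handful of reference note frequencies. Below is a minimal, self-contained sketch of that idea for reference; the function names are illustrative only and not part of this module's API, and it uses a vectorized argmin rather than the per-frame min() loop in the committed code.

import numpy as np

def build_note_table(ref_freqs, steps=10):
    # Interpolate `steps` candidate pitches between each pair of adjacent reference notes.
    table = []
    for low, high in zip(ref_freqs[:-1], ref_freqs[1:]):
        table.extend(np.linspace(low, high, num=steps, endpoint=False))
    table.append(ref_freqs[-1])
    return np.array(table)

def snap_f0(f0, note_table):
    # For each frame, pick the table entry closest (in absolute distance) to the f0 value.
    idx = np.abs(np.asarray(f0)[:, None] - note_table[None, :]).argmin(axis=1)
    return note_table[idx]

# Example: snap a toy pitch contour onto a table built from the reference frequencies used above.
ref_freqs = [65.41, 82.41, 110.00, 146.83, 196.00, 246.94, 329.63, 440.00, 587.33, 783.99, 1046.50]
contour = [100.0, 223.5, 451.2]
print(snap_f0(contour, build_note_table(ref_freqs)))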