TIMBOVILL committed
Commit af4cdde
Parent: 644c2ad

Upload 2 files

Files changed (2)
  1. rvc/infer/infer.py +257 -0
  2. rvc/infer/vc_infer_pipeline.py +492 -0
rvc/infer/infer.py ADDED
@@ -0,0 +1,257 @@
+ import os
+ import sys
+ import torch
+ import numpy as np
+ import soundfile as sf
+ from vc_infer_pipeline import VC
+ from rvc.lib.utils import load_audio
+ from rvc.lib.tools.split_audio import process_audio, merge_audio
+ from fairseq import checkpoint_utils
+ from rvc.lib.infer_pack.models import (
+     SynthesizerTrnMs256NSFsid,
+     SynthesizerTrnMs256NSFsid_nono,
+     SynthesizerTrnMs768NSFsid,
+     SynthesizerTrnMs768NSFsid_nono,
+ )
+
+ from rvc.configs.config import Config
+
+ config = Config()
+
+ torch.manual_seed(114514)
+ hubert_model = None
+
+
+ def load_hubert():
+     global hubert_model
+     models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+         ["hubert_base.pt"],
+         suffix="",
+     )
+     hubert_model = models[0]
+     hubert_model = hubert_model.to(config.device)
+     if config.is_half:
+         hubert_model = hubert_model.half()
+     else:
+         hubert_model = hubert_model.float()
+     hubert_model.eval()
+
+
+ def vc_single(
+     sid=0,
+     input_audio_path=None,
+     f0_up_key=None,
+     f0_file=None,
+     f0_method=None,
+     file_index=None,
+     index_rate=None,
+     resample_sr=0,
+     rms_mix_rate=1,
+     protect=0.33,
+     hop_length=None,
+     output_path=None,
+     split_audio=False,
+ ):
+     global tgt_sr, net_g, vc, hubert_model, version
+
+     if input_audio_path is None:
+         return "Please, load an audio!", None
+
+     f0_up_key = int(f0_up_key)
+     try:
+         audio = load_audio(input_audio_path, 16000)
+         audio_max = np.abs(audio).max() / 0.95
+
+         if audio_max > 1:
+             audio /= audio_max
+
+         if not hubert_model:
+             load_hubert()
+         if_f0 = cpt.get("f0", 1)
+
+         file_index = (
+             file_index.strip(" ")
+             .strip('"')
+             .strip("\n")
+             .strip('"')
+             .strip(" ")
+             .replace("trained", "added")
+         )
+         if tgt_sr != resample_sr >= 16000:
+             tgt_sr = resample_sr
+         if split_audio == "True":
+             result, new_dir_path = process_audio(input_audio_path)
+             if result == "Error":
+                 return "Error with Split Audio", None
+             dir_path = new_dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+             if dir_path != "":
+                 paths = [
+                     os.path.join(root, name)
+                     for root, _, files in os.walk(dir_path, topdown=False)
+                     for name in files
+                     if name.endswith(".wav") and root == dir_path
+                 ]
+             try:
+                 for path in paths:
+                     info, opt = vc_single(
+                         sid,
+                         path,
+                         f0_up_key,
+                         None,
+                         f0_method,
+                         file_index,
+                         index_rate,
+                         resample_sr,
+                         rms_mix_rate,
+                         protect,
+                         hop_length,
+                         path,
+                         False,
+                     )
+                     #new_dir_path
+             except Exception as error:
+                 print(error)
+                 return "Error", None
+             print("Finished processing segmented audio, now merging audio...")
+             merge_timestamps_file = os.path.join(os.path.dirname(new_dir_path), f"{os.path.basename(input_audio_path).split('.')[0]}_timestamps.txt")
+             tgt_sr, audio_opt = merge_audio(merge_timestamps_file)
+
+         else:
+             audio_opt = vc.pipeline(
+                 hubert_model,
+                 net_g,
+                 sid,
+                 audio,
+                 input_audio_path,
+                 f0_up_key,
+                 f0_method,
+                 file_index,
+                 index_rate,
+                 if_f0,
+                 filter_radius,
+                 tgt_sr,
+                 resample_sr,
+                 rms_mix_rate,
+                 version,
+                 protect,
+                 hop_length,
+                 f0_file=f0_file,
+             )
+
+
+         if output_path is not None:
+             sf.write(output_path, audio_opt, tgt_sr, format="WAV")
+
+         return (tgt_sr, audio_opt)
+
+     except Exception as error:
+         print(error)
+
+
+ def get_vc(weight_root, sid):
+     global n_spk, tgt_sr, net_g, vc, cpt, version
+     if sid == "" or sid == []:
+         global hubert_model
+         if hubert_model is not None:
+             print("clean_empty_cache")
+             del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
+             hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+
+             if_f0 = cpt.get("f0", 1)
+             version = cpt.get("version", "v1")
+             if version == "v1":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs256NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+             elif version == "v2":
+                 if if_f0 == 1:
+                     net_g = SynthesizerTrnMs768NSFsid(
+                         *cpt["config"], is_half=config.is_half
+                     )
+                 else:
+                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+             del net_g, cpt
+             if torch.cuda.is_available():
+                 torch.cuda.empty_cache()
+             cpt = None
+     person = weight_root
+     cpt = torch.load(person, map_location="cpu")
+     tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+     if_f0 = cpt.get("f0", 1)
+
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+     print(net_g.load_state_dict(cpt["weight"], strict=False))
+     net_g.eval().to(config.device)
+     if config.is_half:
+         net_g = net_g.half()
+     else:
+         net_g = net_g.float()
+     vc = VC(tgt_sr, config)
+     n_spk = cpt["config"][-3]
+
+
+ f0up_key = sys.argv[1]
+ filter_radius = int(sys.argv[2])  # used as an integer median-filter radius in get_f0
+ index_rate = float(sys.argv[3])
+ hop_length = int(sys.argv[4])  # crepe hop length must be an integer
+ f0method = sys.argv[5]
+
+ audio_input_path = sys.argv[6]
+ audio_output_path = sys.argv[7]
+
+ model_path = sys.argv[8]
+ index_path = sys.argv[9]
+ split_audio = sys.argv[10]
+
+ sid = f0up_key
+ input_audio = audio_input_path
+ f0_pitch = f0up_key
+ f0_file = None
+ f0_method = f0method
+ file_index = index_path
+ index_rate = index_rate
+ output_file = audio_output_path
+ split_audio = split_audio
+
+ get_vc(model_path, 0)
+
+ try:
+     result, audio_opt = vc_single(
+         sid=0,
+         input_audio_path=input_audio,
+         f0_up_key=f0_pitch,
+         f0_file=None,
+         f0_method=f0_method,
+         file_index=file_index,
+         index_rate=index_rate,
+         hop_length=hop_length,
+         output_path=output_file,
+         split_audio=split_audio
+     )
+
+     if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
+         message = result
+     else:
+         message = result
+
+     print(f"Conversion completed. Output file: '{output_file}'")
+
+ except Exception as error:
+     print(f"Voice conversion failed: {error}")
rvc/infer/vc_infer_pipeline.py ADDED
@@ -0,0 +1,492 @@
+ import numpy as np, parselmouth, torch, pdb, sys, os
+ from time import time as ttime
+ import torch.nn.functional as F
+ import torchcrepe
+ from torch import Tensor
+ import scipy.signal as signal
+ import pyworld, os, faiss, librosa, torchcrepe
+ from scipy import signal
+ from functools import lru_cache
+
+ now_dir = os.getcwd()
+ sys.path.append(now_dir)
+
+ bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+
+ input_audio_path2wav = {}
+
+
+ @lru_cache
+ def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period):
+     audio = input_audio_path2wav[input_audio_path]
+     f0, t = pyworld.harvest(
+         audio,
+         fs=fs,
+         f0_ceil=f0max,
+         f0_floor=f0min,
+         frame_period=frame_period,
+     )
+     f0 = pyworld.stonemask(audio, f0, t, fs)
+     return f0
+
+
+ def change_rms(data1, sr1, data2, sr2, rate):
+     rms1 = librosa.feature.rms(y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2)
+     rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2)
+     rms1 = torch.from_numpy(rms1)
+     rms1 = F.interpolate(
+         rms1.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.from_numpy(rms2)
+     rms2 = F.interpolate(
+         rms2.unsqueeze(0), size=data2.shape[0], mode="linear"
+     ).squeeze()
+     rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6)
+     data2 *= (
+         torch.pow(rms1, torch.tensor(1 - rate))
+         * torch.pow(rms2, torch.tensor(rate - 1))
+     ).numpy()
+     return data2
+
+
+ class VC(object):
+     def __init__(self, tgt_sr, config):
+         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
+             config.x_pad,
+             config.x_query,
+             config.x_center,
+             config.x_max,
+             config.is_half,
+         )
+         self.sr = 16000
+         self.window = 160
+         self.t_pad = self.sr * self.x_pad
+         self.t_pad_tgt = tgt_sr * self.x_pad
+         self.t_pad2 = self.t_pad * 2
+         self.t_query = self.sr * self.x_query
+         self.t_center = self.sr * self.x_center
+         self.t_max = self.sr * self.x_max
+         self.device = config.device
+
+     def get_optimal_torch_device(self, index: int = 0) -> torch.device:
+         if torch.cuda.is_available():
+             return torch.device(f"cuda:{index % torch.cuda.device_count()}")
+         elif torch.backends.mps.is_available():
+             return torch.device("mps")
+         return torch.device("cpu")
+
+     def get_f0_crepe_computation(
+         self,
+         x,
+         f0_min,
+         f0_max,
+         p_len,
+         hop_length=120,
+         model="full",
+     ):
+         x = x.astype(np.float32)
+         x /= np.quantile(np.abs(x), 0.999)
+         torch_device = self.get_optimal_torch_device()
+         audio = torch.from_numpy(x).to(torch_device, copy=True)
+         audio = torch.unsqueeze(audio, dim=0)
+         if audio.ndim == 2 and audio.shape[0] > 1:
+             audio = torch.mean(audio, dim=0, keepdim=True).detach()
+         audio = audio.detach()
+         print("Initiating prediction with a hop_length of: " + str(hop_length))
+         pitch: Tensor = torchcrepe.predict(
+             audio,
+             self.sr,
+             hop_length,
+             f0_min,
+             f0_max,
+             model,
+             batch_size=hop_length * 2,
+             device=torch_device,
+             pad=True,
+         )
+         p_len = p_len or x.shape[0] // hop_length
+         source = np.array(pitch.squeeze(0).cpu().float().numpy())
+         source[source < 0.001] = np.nan
+         target = np.interp(
+             np.arange(0, len(source) * p_len, len(source)) / p_len,
+             np.arange(0, len(source)),
+             source,
+         )
+         f0 = np.nan_to_num(target)
+         return f0
+
+     def get_f0_official_crepe_computation(
+         self,
+         x,
+         f0_min,
+         f0_max,
+         model="full",
+     ):
+         batch_size = 512
+         audio = torch.tensor(np.copy(x))[None].float()
+         f0, pd = torchcrepe.predict(
+             audio,
+             self.sr,
+             self.window,
+             f0_min,
+             f0_max,
+             model,
+             batch_size=batch_size,
+             device=self.device,
+             return_periodicity=True,
+         )
+         pd = torchcrepe.filter.median(pd, 3)
+         f0 = torchcrepe.filter.mean(f0, 3)
+         f0[pd < 0.1] = 0
+         f0 = f0[0].cpu().numpy()
+         return f0
+
+     def get_f0(
+         self,
+         input_audio_path,
+         x,
+         p_len,
+         f0_up_key,
+         f0_method,
+         filter_radius,
+         hop_length,
+         inp_f0=None,
+     ):
+         global input_audio_path2wav
+         time_step = self.window / self.sr * 1000
+         f0_min = 50
+         f0_max = 1100
+         f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+         if f0_method == "pm":
+             f0 = (
+                 parselmouth.Sound(x, self.sr)
+                 .to_pitch_ac(
+                     time_step=time_step / 1000,
+                     voicing_threshold=0.6,
+                     pitch_floor=f0_min,
+                     pitch_ceiling=f0_max,
+                 )
+                 .selected_array["frequency"]
+             )
+             pad_size = (p_len - len(f0) + 1) // 2
+             if pad_size > 0 or p_len - len(f0) - pad_size > 0:
+                 f0 = np.pad(
+                     f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
+                 )
+         elif f0_method == "harvest":
+             input_audio_path2wav[input_audio_path] = x.astype(np.double)
+             f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10)
+             if filter_radius > 2:
+                 f0 = signal.medfilt(f0, 3)
+         elif f0_method == "dio":
+             f0, t = pyworld.dio(
+                 x.astype(np.double),
+                 fs=self.sr,
+                 f0_ceil=f0_max,
+                 f0_floor=f0_min,
+                 frame_period=10,
+             )
+             f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr)
+             f0 = signal.medfilt(f0, 3)
+         elif f0_method == "crepe":
+             f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, hop_length)
+         elif f0_method == "crepe-tiny":
+             f0 = self.get_f0_crepe_computation(
+                 x, f0_min, f0_max, p_len, hop_length, "tiny"
+             )
+         elif f0_method == "rmvpe":
+             if hasattr(self, "model_rmvpe") == False:
+                 from rvc.lib.rmvpe import RMVPE
+
+                 self.model_rmvpe = RMVPE(
+                     "rmvpe.pt", is_half=self.is_half, device=self.device
+                 )
+             f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+
+         f0 *= pow(2, f0_up_key / 12)
+         tf0 = self.sr // self.window
+         if inp_f0 is not None:
+             delta_t = np.round(
+                 (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+             ).astype("int16")
+             replace_f0 = np.interp(
+                 list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+             )
+             shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+             f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                 :shape
+             ]
+         f0bak = f0.copy()
+         f0_mel = 1127 * np.log(1 + f0 / 700)
+         f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+             f0_mel_max - f0_mel_min
+         ) + 1
+         f0_mel[f0_mel <= 1] = 1
+         f0_mel[f0_mel > 255] = 255
+         f0_coarse = np.rint(f0_mel).astype(int)  # np.int was removed from NumPy; plain int is equivalent
+
+         return f0_coarse, f0bak
+
+     def vc(
+         self,
+         model,
+         net_g,
+         sid,
+         audio0,
+         pitch,
+         pitchf,
+         index,
+         big_npy,
+         index_rate,
+         version,
+         protect,
+     ):
+         feats = torch.from_numpy(audio0)
+         if self.is_half:
+             feats = feats.half()
+         else:
+             feats = feats.float()
+         if feats.dim() == 2:
+             feats = feats.mean(-1)
+         assert feats.dim() == 1, feats.dim()
+         feats = feats.view(1, -1)
+         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+
+         inputs = {
+             "source": feats.to(self.device),
+             "padding_mask": padding_mask,
+             "output_layer": 9 if version == "v1" else 12,
+         }
+         t0 = ttime()
+         with torch.no_grad():
+             logits = model.extract_features(**inputs)
+             feats = model.final_proj(logits[0]) if version == "v1" else logits[0]
+         if protect < 0.5 and pitch != None and pitchf != None:
+             feats0 = feats.clone()
+         if (
+             isinstance(index, type(None)) == False
+             and isinstance(big_npy, type(None)) == False
+             and index_rate != 0
+         ):
+             npy = feats[0].cpu().numpy()
+             if self.is_half:
+                 npy = npy.astype("float32")
+
+             score, ix = index.search(npy, k=8)
+             weight = np.square(1 / score)
+             weight /= weight.sum(axis=1, keepdims=True)
+             npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+
+             if self.is_half:
+                 npy = npy.astype("float16")
+             feats = (
+                 torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                 + (1 - index_rate) * feats
+             )
+
+         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+         if protect < 0.5 and pitch != None and pitchf != None:
+             feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                 0, 2, 1
+             )
+         t1 = ttime()
+         p_len = audio0.shape[0] // self.window
+         if feats.shape[1] < p_len:
+             p_len = feats.shape[1]
+             if pitch != None and pitchf != None:
+                 pitch = pitch[:, :p_len]
+                 pitchf = pitchf[:, :p_len]
+
+         if protect < 0.5 and pitch != None and pitchf != None:
+             pitchff = pitchf.clone()
+             pitchff[pitchf > 0] = 1
+             pitchff[pitchf < 1] = protect
+             pitchff = pitchff.unsqueeze(-1)
+             feats = feats * pitchff + feats0 * (1 - pitchff)
+             feats = feats.to(feats0.dtype)
+         p_len = torch.tensor([p_len], device=self.device).long()
+         with torch.no_grad():
+             if pitch != None and pitchf != None:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                     .data.cpu()
+                     .float()
+                     .numpy()
+                 )
+             else:
+                 audio1 = (
+                     (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy()
+                 )
+         del feats, p_len, padding_mask
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         t2 = ttime()
+         return audio1
+
+     def pipeline(
+         self,
+         model,
+         net_g,
+         sid,
+         audio,
+         input_audio_path,
+         f0_up_key,
+         f0_method,
+         file_index,
+         index_rate,
+         if_f0,
+         filter_radius,
+         tgt_sr,
+         resample_sr,
+         rms_mix_rate,
+         version,
+         protect,
+         hop_length,
+         f0_file=None,
+     ):
+         if file_index != "" and os.path.exists(file_index) == True and index_rate != 0:
+             try:
+                 index = faiss.read_index(file_index)
+                 big_npy = index.reconstruct_n(0, index.ntotal)
+             except Exception as error:
+                 print(error)
+                 index = big_npy = None
+         else:
+             index = big_npy = None
+         audio = signal.filtfilt(bh, ah, audio)
+         audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
+         opt_ts = []
+         if audio_pad.shape[0] > self.t_max:
+             audio_sum = np.zeros_like(audio)
+             for i in range(self.window):
+                 audio_sum += audio_pad[i : i - self.window]
+             for t in range(self.t_center, audio.shape[0], self.t_center):
+                 opt_ts.append(
+                     t
+                     - self.t_query
+                     + np.where(
+                         np.abs(audio_sum[t - self.t_query : t + self.t_query])
+                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
+                     )[0][0]
+                 )
+         s = 0
+         audio_opt = []
+         t = None
+         t1 = ttime()
+         audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+         p_len = audio_pad.shape[0] // self.window
+         inp_f0 = None
+         if hasattr(f0_file, "name") == True:
+             try:
+                 with open(f0_file.name, "r") as f:
+                     lines = f.read().strip("\n").split("\n")
+                 inp_f0 = []
+                 for line in lines:
+                     inp_f0.append([float(i) for i in line.split(",")])
+                 inp_f0 = np.array(inp_f0, dtype="float32")
+             except Exception as error:
+                 print(error)
+         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+         pitch, pitchf = None, None
+         if if_f0 == 1:
+             pitch, pitchf = self.get_f0(
+                 input_audio_path,
+                 audio_pad,
+                 p_len,
+                 f0_up_key,
+                 f0_method,
+                 filter_radius,
+                 hop_length,
+                 inp_f0,
+             )
+             pitch = pitch[:p_len]
+             pitchf = pitchf[:p_len]
+             if self.device == "mps":
+                 pitchf = pitchf.astype(np.float32)
+             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
+             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
+         t2 = ttime()
+         for t in opt_ts:
+             t = t // self.window * self.window
+             if if_f0 == 1:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         pitch[:, s // self.window : (t + self.t_pad2) // self.window],
+                         pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             else:
+                 audio_opt.append(
+                     self.vc(
+                         model,
+                         net_g,
+                         sid,
+                         audio_pad[s : t + self.t_pad2 + self.window],
+                         None,
+                         None,
+                         index,
+                         big_npy,
+                         index_rate,
+                         version,
+                         protect,
+                     )[self.t_pad_tgt : -self.t_pad_tgt]
+                 )
+             s = t
+         if if_f0 == 1:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     pitch[:, t // self.window :] if t is not None else pitch,
+                     pitchf[:, t // self.window :] if t is not None else pitchf,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         else:
+             audio_opt.append(
+                 self.vc(
+                     model,
+                     net_g,
+                     sid,
+                     audio_pad[t:],
+                     None,
+                     None,
+                     index,
+                     big_npy,
+                     index_rate,
+                     version,
+                     protect,
+                 )[self.t_pad_tgt : -self.t_pad_tgt]
+             )
+         audio_opt = np.concatenate(audio_opt)
+         if rms_mix_rate != 1:
+             audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate)
+         if resample_sr >= 16000 and tgt_sr != resample_sr:
+             audio_opt = librosa.resample(
+                 audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
+             )
+         audio_max = np.abs(audio_opt).max() / 0.99
+         max_int16 = 32768
+         if audio_max > 1:
+             max_int16 /= audio_max
+         audio_opt = (audio_opt * max_int16).astype(np.int16)
+         del pitch, pitchf, sid
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+         return audio_opt