MarcusSu1216 committed on
Commit
1a8c53e
·
verified ·
1 Parent(s): 2ddcf6b

Update inference/infer_tool.py

Browse files
Files changed (1) hide show
  1. inference/infer_tool.py +25 -133
inference/infer_tool.py CHANGED
@@ -7,21 +7,6 @@ import time
7
  from pathlib import Path
8
  from inference import slicer
9
 
10
- import librosa
11
- import numpy as np
12
- # import onnxruntime
13
- import parselmouth
14
- import soundfile
15
- import torch
16
- import hashlib
17
- import io
18
- import json
19
- import logging
20
- import os
21
- import time
22
- from pathlib import Path
23
- from inference import slicer
24
-
25
  import librosa
26
  import numpy as np
27
  # import onnxruntime
@@ -117,21 +102,12 @@ def pad_array(arr, target_length):
117
  pad_right = pad_width - pad_left
118
  padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
119
  return padded_arr
120
-
121
- def split_list_by_n(list_collection, n, pre=0):
122
- for i in range(0, len(list_collection), n):
123
- yield list_collection[i-pre if i-pre>=0 else i: i + n]
124
 
125
 
126
- class F0FilterException(Exception):
127
- pass
128
-
129
  class Svc(object):
130
  def __init__(self, net_g_path, config_path,
131
  device=None,
132
- cluster_model_path="logs/44k/kmeans_10000.pt",
133
- nsf_hifigan_enhance = False
134
- ):
135
  self.net_g_path = net_g_path
136
  if device is None:
137
  self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -142,15 +118,11 @@ class Svc(object):
142
  self.target_sample = self.hps_ms.data.sampling_rate
143
  self.hop_size = self.hps_ms.data.hop_length
144
  self.spk2id = self.hps_ms.spk
145
- self.nsf_hifigan_enhance = nsf_hifigan_enhance
146
  # 加载hubert
147
  self.hubert_model = utils.get_hubert_model().to(self.dev)
148
  self.load_model()
149
  if os.path.exists(cluster_model_path):
150
  self.cluster_model = cluster.get_cluster_model(cluster_model_path)
151
- if self.nsf_hifigan_enhance:
152
- from modules.enhancer import Enhancer
153
- self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
154
 
155
  def load_model(self):
156
  # 获取模型配置
@@ -166,24 +138,14 @@ class Svc(object):
166
 
167
 
168
 
169
- def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling):
170
 
171
  wav, sr = librosa.load(in_path, sr=self.target_sample)
172
 
173
- if F0_mean_pooling == True:
174
- f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev)
175
- if f0_filter and sum(f0) == 0:
176
- raise F0FilterException("未检测到人声")
177
- f0 = torch.FloatTensor(list(f0))
178
- uv = torch.FloatTensor(list(uv))
179
- if F0_mean_pooling == False:
180
- f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
181
- if f0_filter and sum(f0) == 0:
182
- raise F0FilterException("未检测到人声")
183
- f0, uv = utils.interpolate_f0(f0)
184
- f0 = torch.FloatTensor(f0)
185
- uv = torch.FloatTensor(uv)
186
-
187
  f0 = f0 * 2 ** (tran / 12)
188
  f0 = f0.unsqueeze(0).to(self.dev)
189
  uv = uv.unsqueeze(0).to(self.dev)
@@ -204,107 +166,54 @@ class Svc(object):
204
  def infer(self, speaker, tran, raw_path,
205
  cluster_infer_ratio=0,
206
  auto_predict_f0=False,
207
- noice_scale=0.4,
208
- f0_filter=False,
209
- F0_mean_pooling=False,
210
- enhancer_adaptive_key = 0
211
- ):
212
-
213
  speaker_id = self.spk2id.__dict__.get(speaker)
214
  if not speaker_id and type(speaker) is int:
215
  if len(self.spk2id.__dict__) >= speaker:
216
  speaker_id = speaker
217
  sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
218
- c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling)
219
  if "half" in self.net_g_path and torch.cuda.is_available():
220
  c = c.half()
221
  with torch.no_grad():
222
  start = time.time()
223
  audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
224
- if self.nsf_hifigan_enhance:
225
- audio, _ = self.enhancer.enhance(
226
- audio[None,:],
227
- self.target_sample,
228
- f0[:,:,None],
229
- self.hps_ms.data.hop_length,
230
- adaptive_key = enhancer_adaptive_key)
231
  use_time = time.time() - start
232
  print("vits use time:{}".format(use_time))
233
  return audio, audio.shape[-1]
234
 
235
- def clear_empty(self):
236
- # 清理显存
237
- torch.cuda.empty_cache()
238
-
239
- def slice_inference(self,
240
- raw_audio_path,
241
- spk,
242
- tran,
243
- slice_db,
244
- cluster_infer_ratio,
245
- auto_predict_f0,
246
- noice_scale,
247
- pad_seconds=0.5,
248
- clip_seconds=0,
249
- lg_num=0,
250
- lgr_num =0.75,
251
- F0_mean_pooling = False,
252
- enhancer_adaptive_key = 0
253
- ):
254
  wav_path = raw_audio_path
255
  chunks = slicer.cut(wav_path, db_thresh=slice_db)
256
  audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
257
- per_size = int(clip_seconds*audio_sr)
258
- lg_size = int(lg_num*audio_sr)
259
- lg_size_r = int(lg_size*lgr_num)
260
- lg_size_c_l = (lg_size-lg_size_r)//2
261
- lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
262
- lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
263
-
264
  audio = []
265
  for (slice_tag, data) in audio_data:
266
  print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
267
  # padd
 
 
268
  length = int(np.ceil(len(data) / audio_sr * self.target_sample))
 
 
 
269
  if slice_tag:
270
  print('jump empty segment')
271
  _audio = np.zeros(length)
272
- audio.extend(list(pad_array(_audio, length)))
273
- continue
274
- if per_size != 0:
275
- datas = split_list_by_n(data, per_size,lg_size)
276
  else:
277
- datas = [data]
278
- for k,dat in enumerate(datas):
279
- per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
280
- if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
281
- # padd
282
- pad_len = int(audio_sr * pad_seconds)
283
- dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
284
- raw_path = io.BytesIO()
285
- soundfile.write(raw_path, dat, audio_sr, format="wav")
286
- raw_path.seek(0)
287
  out_audio, out_sr = self.infer(spk, tran, raw_path,
288
  cluster_infer_ratio=cluster_infer_ratio,
289
  auto_predict_f0=auto_predict_f0,
290
- noice_scale=noice_scale,
291
- F0_mean_pooling = F0_mean_pooling,
292
- enhancer_adaptive_key = enhancer_adaptive_key
293
  )
294
  _audio = out_audio.cpu().numpy()
295
- pad_len = int(self.target_sample * pad_seconds)
296
- _audio = _audio[pad_len:-pad_len]
297
- _audio = pad_array(_audio, per_length)
298
- if lg_size!=0 and k!=0:
299
- lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
300
- lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
301
- lg_pre = lg1*(1-lg)+lg2*lg
302
- audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
303
- audio.extend(lg_pre)
304
- _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
305
- audio.extend(list(_audio))
306
  return np.array(audio)
307
 
 
308
  class RealTimeVC:
309
  def __init__(self):
310
  self.last_chunk = None
@@ -314,25 +223,14 @@ class RealTimeVC:
314
 
315
  """输入输出都是1维numpy 音频波形数组"""
316
 
317
- def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
318
- cluster_infer_ratio=0,
319
- auto_predict_f0=False,
320
- noice_scale=0.4,
321
- f0_filter=False):
322
-
323
  import maad
324
  audio, sr = torchaudio.load(input_wav_path)
325
  audio = audio.cpu().numpy()[0]
326
  temp_wav = io.BytesIO()
327
  if self.last_chunk is None:
328
  input_wav_path.seek(0)
329
-
330
- audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path,
331
- cluster_infer_ratio=cluster_infer_ratio,
332
- auto_predict_f0=auto_predict_f0,
333
- noice_scale=noice_scale,
334
- f0_filter=f0_filter)
335
-
336
  audio = audio.cpu().numpy()
337
  self.last_chunk = audio[-self.pre_len:]
338
  self.last_o = audio
@@ -341,15 +239,9 @@ class RealTimeVC:
341
  audio = np.concatenate([self.last_chunk, audio])
342
  soundfile.write(temp_wav, audio, sr, format="wav")
343
  temp_wav.seek(0)
344
-
345
- audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav,
346
- cluster_infer_ratio=cluster_infer_ratio,
347
- auto_predict_f0=auto_predict_f0,
348
- noice_scale=noice_scale,
349
- f0_filter=f0_filter)
350
-
351
  audio = audio.cpu().numpy()
352
  ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
353
  self.last_chunk = audio[-self.pre_len:]
354
  self.last_o = audio
355
- return ret[self.chunk_len:2 * self.chunk_len]
 
7
  from pathlib import Path
8
  from inference import slicer
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  import librosa
11
  import numpy as np
12
  # import onnxruntime
 
102
  pad_right = pad_width - pad_left
103
  padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
104
  return padded_arr
 
 
 
 
105
 
106
 
 
 
 
107
  class Svc(object):
108
  def __init__(self, net_g_path, config_path,
109
  device=None,
110
+ cluster_model_path="logs/44k/kmeans_10000.pt"):
 
 
111
  self.net_g_path = net_g_path
112
  if device is None:
113
  self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
118
  self.target_sample = self.hps_ms.data.sampling_rate
119
  self.hop_size = self.hps_ms.data.hop_length
120
  self.spk2id = self.hps_ms.spk
 
121
  # 加载hubert
122
  self.hubert_model = utils.get_hubert_model().to(self.dev)
123
  self.load_model()
124
  if os.path.exists(cluster_model_path):
125
  self.cluster_model = cluster.get_cluster_model(cluster_model_path)
 
 
 
126
 
127
  def load_model(self):
128
  # 获取模型配置
 
138
 
139
 
140
 
141
+ def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker):
142
 
143
  wav, sr = librosa.load(in_path, sr=self.target_sample)
144
 
145
+ f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
146
+ f0, uv = utils.interpolate_f0(f0)
147
+ f0 = torch.FloatTensor(f0)
148
+ uv = torch.FloatTensor(uv)
 
 
 
 
 
 
 
 
 
 
149
  f0 = f0 * 2 ** (tran / 12)
150
  f0 = f0.unsqueeze(0).to(self.dev)
151
  uv = uv.unsqueeze(0).to(self.dev)
 
166
  def infer(self, speaker, tran, raw_path,
167
  cluster_infer_ratio=0,
168
  auto_predict_f0=False,
169
+ noice_scale=0.4):
 
 
 
 
 
170
  speaker_id = self.spk2id.__dict__.get(speaker)
171
  if not speaker_id and type(speaker) is int:
172
  if len(self.spk2id.__dict__) >= speaker:
173
  speaker_id = speaker
174
  sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
175
+ c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker)
176
  if "half" in self.net_g_path and torch.cuda.is_available():
177
  c = c.half()
178
  with torch.no_grad():
179
  start = time.time()
180
  audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
 
 
 
 
 
 
 
181
  use_time = time.time() - start
182
  print("vits use time:{}".format(use_time))
183
  return audio, audio.shape[-1]
184
 
185
+ def slice_inference(self,raw_audio_path, spk, tran, slice_db,cluster_infer_ratio, auto_predict_f0,noice_scale, pad_seconds=0.5):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  wav_path = raw_audio_path
187
  chunks = slicer.cut(wav_path, db_thresh=slice_db)
188
  audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
189
+
 
 
 
 
 
 
190
  audio = []
191
  for (slice_tag, data) in audio_data:
192
  print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
193
  # padd
194
+ pad_len = int(audio_sr * pad_seconds)
195
+ data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])])
196
  length = int(np.ceil(len(data) / audio_sr * self.target_sample))
197
+ raw_path = io.BytesIO()
198
+ soundfile.write(raw_path, data, audio_sr, format="wav")
199
+ raw_path.seek(0)
200
  if slice_tag:
201
  print('jump empty segment')
202
  _audio = np.zeros(length)
 
 
 
 
203
  else:
 
 
 
 
 
 
 
 
 
 
204
  out_audio, out_sr = self.infer(spk, tran, raw_path,
205
  cluster_infer_ratio=cluster_infer_ratio,
206
  auto_predict_f0=auto_predict_f0,
207
+ noice_scale=noice_scale
 
 
208
  )
209
  _audio = out_audio.cpu().numpy()
210
+
211
+ pad_len = int(self.target_sample * pad_seconds)
212
+ _audio = _audio[pad_len:-pad_len]
213
+ audio.extend(list(_audio))
 
 
 
 
 
 
 
214
  return np.array(audio)
215
 
216
+
217
  class RealTimeVC:
218
  def __init__(self):
219
  self.last_chunk = None
 
223
 
224
  """输入输出都是1维numpy 音频波形数组"""
225
 
226
+ def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path):
 
 
 
 
 
227
  import maad
228
  audio, sr = torchaudio.load(input_wav_path)
229
  audio = audio.cpu().numpy()[0]
230
  temp_wav = io.BytesIO()
231
  if self.last_chunk is None:
232
  input_wav_path.seek(0)
233
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
 
 
 
 
 
 
234
  audio = audio.cpu().numpy()
235
  self.last_chunk = audio[-self.pre_len:]
236
  self.last_o = audio
 
239
  audio = np.concatenate([self.last_chunk, audio])
240
  soundfile.write(temp_wav, audio, sr, format="wav")
241
  temp_wav.seek(0)
242
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav)
 
 
 
 
 
 
243
  audio = audio.cpu().numpy()
244
  ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
245
  self.last_chunk = audio[-self.pre_len:]
246
  self.last_o = audio
247
+ return ret[self.chunk_len:2 * self.chunk_len]