darksakura committed on
Commit ae93010
1 Parent(s): cb73098

Upload 3 files

inference/infer_tool.py CHANGED
@@ -1,15 +1,16 @@
+ import gc
 import hashlib
 import io
 import json
 import logging
 import os
+ import pickle
 import time
 from pathlib import Path
- from inference import slicer
- import gc
 
 import librosa
 import numpy as np
+
 # import onnxruntime
 import soundfile
 import torch
@@ -17,11 +18,9 @@ import torchaudio
 
 import cluster
 import utils
- from models import SynthesizerTrn
- import pickle
-
 from diffusion.unit2mel import load_model_vocoder
- import yaml
+ from inference import slicer
+ from models import SynthesizerTrn
 
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
@@ -136,19 +135,14 @@ class Svc(object):
 self.dev = torch.device(device)
 self.net_g_ms = None
 if not self.only_diffusion:
- self.hps_ms = utils.get_hparams_from_file(config_path)
+ self.hps_ms = utils.get_hparams_from_file(config_path,True)
 self.target_sample = self.hps_ms.data.sampling_rate
 self.hop_size = self.hps_ms.data.hop_length
 self.spk2id = self.hps_ms.spk
- try:
- self.vol_embedding = self.hps_ms.model.vol_embedding
- except Exception as e:
- self.vol_embedding = False
- try:
- self.speech_encoder = self.hps_ms.model.speech_encoder
- except Exception as e:
- self.speech_encoder = 'vec768l12'
-
+ self.unit_interpolate_mode = self.hps_ms.data.unit_interpolate_mode if self.hps_ms.data.unit_interpolate_mode is not None else 'left'
+ self.vol_embedding = self.hps_ms.model.vol_embedding if self.hps_ms.model.vol_embedding is not None else False
+ self.speech_encoder = self.hps_ms.model.speech_encoder if self.hps_ms.model.speech_encoder is not None else 'vec768l12'
+
 self.nsf_hifigan_enhance = nsf_hifigan_enhance
 if self.shallow_diffusion or self.only_diffusion:
 if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
@@ -158,6 +152,7 @@ class Svc(object):
 self.hop_size = self.diffusion_args.data.block_size
 self.spk2id = self.diffusion_args.spk
 self.speech_encoder = self.diffusion_args.data.encoder
+ self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
 if spk_mix_enable:
 self.diffusion_model.init_spkmix(len(self.spk2id))
 else:
@@ -184,7 +179,8 @@ class Svc(object):
 else:
 self.feature_retrieval=False
 
- if self.shallow_diffusion : self.nsf_hifigan_enhance = False
+ if self.shallow_diffusion :
+ self.nsf_hifigan_enhance = False
 if self.nsf_hifigan_enhance:
 from modules.enhancer import Enhancer
 self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
@@ -196,6 +192,7 @@ class Svc(object):
 self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
 **self.hps_ms.model)
 _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
+ self.dtype = list(self.net_g_ms.parameters())[0].dtype
 if "half" in self.net_g_path and torch.cuda.is_available():
 _ = self.net_g_ms.half().eval().to(self.dev)
 else:
@@ -220,7 +217,7 @@ class Svc(object):
 wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
 wav16k = torch.from_numpy(wav16k).to(self.dev)
 c = self.hubert_model.encoder(wav16k)
- c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
 
 if cluster_infer_ratio !=0:
 if self.feature_retrieval:
@@ -232,7 +229,7 @@ class Svc(object):
 speaker_id = speaker
 feature_index = self.cluster_model[speaker_id]
 feat_np = c.transpose(0,1).cpu().numpy()
- if self.big_npy is not None or self.now_spk_id != speaker_id:
+ if self.big_npy is None or self.now_spk_id != speaker_id:
 self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
 self.now_spk_id = speaker_id
 print("starting feature retrieval...")
@@ -272,16 +269,17 @@ class Svc(object):
 sid = speaker[:, frame:frame+n_frames].transpose(0,1)
 else:
 speaker_id = self.spk2id.get(speaker)
- if speaker_id is None:
- raise RuntimeError("The name you entered is not in the speaker list!")
 if not speaker_id and type(speaker) is int:
 if len(self.spk2id.__dict__) >= speaker:
 speaker_id = speaker
+ if speaker_id is None:
+ raise RuntimeError("The name you entered is not in the speaker list!")
 sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
 c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
 n_frames = f0.size(1)
- if "half" in self.net_g_path and torch.cuda.is_available():
- c = c.half()
+ c = c.to(self.dtype)
+ f0 = f0.to(self.dtype)
+ uv = uv.to(self.dtype)
 with torch.no_grad():
 start = time.time()
 vol = None
@@ -293,13 +291,17 @@ class Svc(object):
 else:
 audio = torch.FloatTensor(wav).to(self.dev)
 audio_mel = None
+ if self.dtype != torch.float32:
+ c = c.to(torch.float32)
+ f0 = f0.to(torch.float32)
+ uv = uv.to(torch.float32)
 if self.only_diffusion or self.shallow_diffusion:
- vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol==None else vol[:,:,None]
+ vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
 if self.shallow_diffusion and second_encoding:
 audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
 audio16k = torch.from_numpy(audio16k).to(self.dev)
 c = self.hubert_model.encoder(audio16k)
- c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
 f0 = f0[:,:,None]
 c = c.transpose(-1,-2)
 audio_mel = self.diffusion_model(
@@ -447,7 +449,8 @@ class Svc(object):
 datas = [data]
 for k,dat in enumerate(datas):
 per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
- if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
+ if clip_seconds!=0:
+ print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
 # padd
 pad_len = int(audio_sr * pad_seconds)
 dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
@@ -530,4 +533,4 @@ class RealTimeVC:
 self.last_chunk = audio[-self.pre_len:]
 self.last_o = audio
 return ret[self.chunk_len:2 * self.chunk_len]
-
+
inference/infer_tool_grad.py CHANGED
@@ -1,22 +1,18 @@
- import hashlib
- import json
+ import io
 import logging
 import os
- import time
- from pathlib import Path
- import io
+
 import librosa
- import maad
 import numpy as np
- from inference import slicer
 import parselmouth
 import soundfile
 import torch
 import torchaudio
 
- from hubert import hubert_model
 import utils
+ from inference import slicer
 from models import SynthesizerTrn
+
 logging.getLogger('numba').setLevel(logging.WARNING)
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
@@ -93,7 +89,7 @@ class VitsSvc(object):
 def set_device(self, device):
 self.device = torch.device(device)
 self.hubert_soft.to(self.device)
- if self.SVCVITS != None:
+ if self.SVCVITS is not None:
 self.SVCVITS.to(self.device)
 
 def loadCheckpoint(self, path):