darksakura committed
Commit: ae93010
Parent(s): cb73098
Upload 3 files

Files changed:
- inference/infer_tool.py +30 -27
- inference/infer_tool_grad.py +5 -9
inference/infer_tool.py
CHANGED
@@ -1,15 +1,16 @@
+import gc
 import hashlib
 import io
 import json
 import logging
 import os
+import pickle
 import time
 from pathlib import Path
-from inference import slicer
-import gc
 
 import librosa
 import numpy as np
+
 # import onnxruntime
 import soundfile
 import torch
@@ -17,11 +18,9 @@ import torchaudio
 
 import cluster
 import utils
-from models import SynthesizerTrn
-import pickle
-
 from diffusion.unit2mel import load_model_vocoder
-import
+from inference import slicer
+from models import SynthesizerTrn
 
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
@@ -136,19 +135,14 @@ class Svc(object):
 self.dev = torch.device(device)
 self.net_g_ms = None
 if not self.only_diffusion:
-self.hps_ms = utils.get_hparams_from_file(config_path)
+self.hps_ms = utils.get_hparams_from_file(config_path,True)
 self.target_sample = self.hps_ms.data.sampling_rate
 self.hop_size = self.hps_ms.data.hop_length
 self.spk2id = self.hps_ms.spk
-
-
-
-
-try:
-self.speech_encoder = self.hps_ms.model.speech_encoder
-except Exception as e:
-self.speech_encoder = 'vec768l12'
-
+self.unit_interpolate_mode = self.hps_ms.data.unit_interpolate_mode if self.hps_ms.data.unit_interpolate_mode is not None else 'left'
+self.vol_embedding = self.hps_ms.model.vol_embedding if self.hps_ms.model.vol_embedding is not None else False
+self.speech_encoder = self.hps_ms.model.speech_encoder if self.hps_ms.model.speech_encoder is not None else 'vec768l12'
+
 self.nsf_hifigan_enhance = nsf_hifigan_enhance
 if self.shallow_diffusion or self.only_diffusion:
 if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
@@ -158,6 +152,7 @@ class Svc(object):
 self.hop_size = self.diffusion_args.data.block_size
 self.spk2id = self.diffusion_args.spk
 self.speech_encoder = self.diffusion_args.data.encoder
+self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
 if spk_mix_enable:
 self.diffusion_model.init_spkmix(len(self.spk2id))
 else:
@@ -184,7 +179,8 @@ class Svc(object):
 else:
 self.feature_retrieval=False
 
-if self.shallow_diffusion :
+if self.shallow_diffusion :
+self.nsf_hifigan_enhance = False
 if self.nsf_hifigan_enhance:
 from modules.enhancer import Enhancer
 self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
@@ -196,6 +192,7 @@ class Svc(object):
 self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
 **self.hps_ms.model)
 _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
+self.dtype = list(self.net_g_ms.parameters())[0].dtype
 if "half" in self.net_g_path and torch.cuda.is_available():
 _ = self.net_g_ms.half().eval().to(self.dev)
 else:
@@ -220,7 +217,7 @@ class Svc(object):
 wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
 wav16k = torch.from_numpy(wav16k).to(self.dev)
 c = self.hubert_model.encoder(wav16k)
-c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
+c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
 
 if cluster_infer_ratio !=0:
 if self.feature_retrieval:
@@ -232,7 +229,7 @@ class Svc(object):
 speaker_id = speaker
 feature_index = self.cluster_model[speaker_id]
 feat_np = c.transpose(0,1).cpu().numpy()
-if self.big_npy is
+if self.big_npy is None or self.now_spk_id != speaker_id:
 self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
 self.now_spk_id = speaker_id
 print("starting feature retrieval...")
@@ -272,16 +269,17 @@ class Svc(object):
 sid = speaker[:, frame:frame+n_frames].transpose(0,1)
 else:
 speaker_id = self.spk2id.get(speaker)
-if speaker_id is None:
-raise RuntimeError("The name you entered is not in the speaker list!")
 if not speaker_id and type(speaker) is int:
 if len(self.spk2id.__dict__) >= speaker:
 speaker_id = speaker
+if speaker_id is None:
+raise RuntimeError("The name you entered is not in the speaker list!")
 sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
 c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
 n_frames = f0.size(1)
-
-
+c = c.to(self.dtype)
+f0 = f0.to(self.dtype)
+uv = uv.to(self.dtype)
 with torch.no_grad():
 start = time.time()
 vol = None
@@ -293,13 +291,17 @@ class Svc(object):
 else:
 audio = torch.FloatTensor(wav).to(self.dev)
 audio_mel = None
+if self.dtype != torch.float32:
+c = c.to(torch.float32)
+f0 = f0.to(torch.float32)
+uv = uv.to(torch.float32)
 if self.only_diffusion or self.shallow_diffusion:
-vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol
+vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
 if self.shallow_diffusion and second_encoding:
 audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
 audio16k = torch.from_numpy(audio16k).to(self.dev)
 c = self.hubert_model.encoder(audio16k)
-c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
+c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
 f0 = f0[:,:,None]
 c = c.transpose(-1,-2)
 audio_mel = self.diffusion_model(
@@ -447,7 +449,8 @@ class Svc(object):
 datas = [data]
 for k,dat in enumerate(datas):
 per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
-if clip_seconds!=0:
+if clip_seconds!=0:
+print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
 # padd
 pad_len = int(audio_sr * pad_seconds)
 dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
@@ -530,4 +533,4 @@ class RealTimeVC:
 self.last_chunk = audio[-self.pre_len:]
 self.last_o = audio
 return ret[self.chunk_len:2 * self.chunk_len]
-
+
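
For context on the dtype changes above: the new lines cache the generator's parameter dtype, cast the conditioning tensors (c, f0, uv) to it before VITS inference, and cast them back to float32 for the diffusion and volume-extraction path. A minimal, self-contained sketch of that pattern (the small linear layer and random tensors are illustrative placeholders, not the repository's actual model):

import torch

# Cache the model's parameter dtype once, as the added self.dtype line does.
net = torch.nn.Linear(4, 4).half()        # stands in for a half-precision net_g_ms
dtype = next(net.parameters()).dtype      # torch.float16 in this example

# Conditioning tensors follow the model dtype for the VITS pass...
c = torch.randn(1, 4).to(dtype)
f0 = torch.randn(1, 4).to(dtype)
uv = torch.randn(1, 4).to(dtype)

# ...and are cast back to float32 where the diffusion/volume path expects it.
if dtype != torch.float32:
    c, f0, uv = (t.to(torch.float32) for t in (c, f0, uv))
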
inference/infer_tool_grad.py
CHANGED
@@ -1,22 +1,18 @@
-import
-import json
+import io
 import logging
 import os
-
-from pathlib import Path
-import io
+
 import librosa
-import maad
 import numpy as np
-from inference import slicer
 import parselmouth
 import soundfile
 import torch
 import torchaudio
 
-from hubert import hubert_model
 import utils
+from inference import slicer
 from models import SynthesizerTrn
+
 logging.getLogger('numba').setLevel(logging.WARNING)
 logging.getLogger('matplotlib').setLevel(logging.WARNING)
 
@@ -93,7 +89,7 @@ class VitsSvc(object):
 def set_device(self, device):
 self.device = torch.device(device)
 self.hubert_soft.to(self.device)
-if self.SVCVITS
+if self.SVCVITS is not None:
 self.SVCVITS.to(self.device)
 
 def loadCheckpoint(self, path):
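
A side note on two of the rewritten conditionals: the new lines use explicit identity comparisons against None (`if self.SVCVITS is not None:`, `if self.big_npy is None or self.now_spk_id != speaker_id:`), the conventional PEP 8 style. A small standalone illustration with placeholder names (not the module's real objects):

# Dispatch only when a model object actually exists.
model = None
if model is not None:
    model.to("cpu")

# Rebuild a cached index lazily, as the feature-retrieval branch does.
cache = None
now_spk_id = None
speaker_id = "speaker0"
if cache is None or now_spk_id != speaker_id:
    cache = {}            # placeholder for the reconstructed index
    now_spk_id = speaker_id
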