Spaces: Running on Zero
Staticaliza committed
Update app.py
app.py CHANGED
@@ -39,9 +39,6 @@ torch.set_grad_enabled(False)
 device = torch.device("cpu")
 print(f"[DEVICE] | Using device: {device}")
 
-channel_numbers = 100 # 80 by default
-main_model = "nvidia/bigvgan_24khz_100band" # nvidia/bigvgan_v2_22khz_80band_256x
-
 # ----------------------------
 # Load Models and Configuration
 # ----------------------------
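The removed globals were the only knobs selecting the 100-band 24 kHz checkpoint; after this commit the pipeline is pinned to the 80-band 22 kHz v2 model, so every mel computation below must produce exactly 80 bands. A minimal sketch of a guard that makes that constraint explicit, assuming the BigVGAN checkpoint exposes its hyperparameters as model.h (true for the official NVIDIA implementation, but worth verifying):

# Hypothetical guard, not part of the commit: fail fast if the mel band
# count and the vocoder's training config ever disagree again.
NUM_MELS = 80  # single source of truth after this change

def check_mel_compat(vocoder, num_mels=NUM_MELS):
    # Assumes the official BigVGAN `h` hyperparameter namespace.
    expected = vocoder.h.num_mels
    if expected != num_mels:
        raise ValueError(f"Extractor emits {num_mels} mel bands, vocoder expects {expected}.")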
@@ -93,7 +90,7 @@ campplus_model.to(device)
 print("[INFO] | CAMPPlus model loaded, set to eval mode, and moved to CPU.")
 
 # Load BigVGAN model
-bigvgan_model = bigvgan.BigVGAN.from_pretrained(main_model, use_cuda_kernel=False)
+bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
 bigvgan_model.remove_weight_norm()
 bigvgan_model = bigvgan_model.eval().to(device)
 print("[INFO] | BigVGAN model loaded, weight norm removed, set to eval mode, and moved to CPU.")
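On a CPU-only Space, use_cuda_kernel=False is required, since the fused upsampling kernel only builds against CUDA. A rough sketch of inference with the loaded vocoder, assuming the official forward signature (mel of shape (batch, 80, frames) in, audio of shape (batch, 1, samples) out):

import torch

with torch.inference_mode():
    mel = torch.randn(1, 80, 100)           # placeholder log-mel, 100 frames
    audio = bigvgan_model(mel)              # -> (1, 1, 100 * 256) at hop 256
    wav = audio.squeeze().clamp(-1.0, 1.0)  # 1-D waveform in [-1, 1]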
@@ -110,7 +107,7 @@ codec_encoder = {k: v.eval().to(device) for k, v in codec_encoder.items()}
 print("[INFO] | FAcodec model loaded, set to eval mode, and moved to CPU.")
 
 # Load Whisper model with float32 and compatible size
-whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer, 'whisper_name') else "
+whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer, 'whisper_name') else "openai/whisper-small"
 whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float32).to(device)
 del whisper_model.decoder # Remove decoder as it's not used
 whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
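Deleting whisper_model.decoder frees roughly half the checkpoint's memory, but it means the model can only be driven through its encoder from here on. A sketch of that encoder-only path, where ref_wave_16k stands in for a hypothetical 1-D 16 kHz waveform (Whisper's native rate):

import torch

inputs = whisper_feature_extractor(ref_wave_16k, sampling_rate=16000, return_tensors="pt")
with torch.inference_mode():
    # whisper_model(...) would now fail without its decoder; call the encoder directly.
    hidden = whisper_model.encoder(inputs.input_features.to(torch.float32)).last_hidden_state
print(hidden.shape)  # (1, 1500, hidden_size) for a padded 30 s window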
@@ -121,7 +118,7 @@ mel_fn_args = {
     "n_fft": 1024,
     "win_size": 1024,
     "hop_size": 256,
-    "num_mels": channel_numbers,
+    "num_mels": 80,
     "sampling_rate": sr,
     "fmin": 0,
     "fmax": None,
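These values (1024-point FFT, hop 256, 80 mels) mirror the training config of the 22 kHz 80-band checkpoint. For illustration only, a roughly equivalent torchaudio construction; the app builds its own mel function from mel_fn_args, and BigVGAN additionally expects its specific log scaling, so this sketch is not a drop-in replacement:

import torchaudio

mel_xform = torchaudio.transforms.MelSpectrogram(
    sample_rate=22050,   # assumed value of sr for the 22 kHz checkpoint
    n_fft=1024,
    win_length=1024,
    hop_length=256,
    f_min=0.0,
    f_max=None,          # None -> Nyquist (sample_rate / 2)
    n_mels=80,
)
# One second of audio yields about 22050 // 256 + 1 = 87 frames of 80 bands.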
@@ -156,7 +153,7 @@ mel_fn_args_f0 = {
     "n_fft": config_f0['preprocess_params']['spect_params']['n_fft'],
     "win_size": config_f0['preprocess_params']['spect_params']['win_length'],
     "hop_size": config_f0['preprocess_params']['spect_params']['hop_length'],
-    "num_mels": channel_numbers,
+    "num_mels": 80, # Ensure this matches the primary model
     "sampling_rate": sr_f0,
     "fmin": 0,
     "fmax": None,
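num_mels is now duplicated across two hand-written dicts that feed the same 80-band vocoder, which is exactly the kind of drift this commit had to fix. A one-line guard placed after both dicts are defined would keep them honest:

# Hypothetical consistency check, using the two dicts defined above.
assert mel_fn_args["num_mels"] == mel_fn_args_f0["num_mels"] == 80, \
    "num_mels must match the 80-band BigVGAN vocoder"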
@@ -276,7 +273,7 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
 
     # Extract style features
     print("[INFO] | Extracting style features from reference audio.")
-    feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=channel_numbers, dither=0, sample_frequency=sampling_rate)
+    feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=80, dither=0, sample_frequency=sampling_rate)
     feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
     style2 = campplus_model(feat2.unsqueeze(0))
     print(f"[INFO] | Style2 shape: {style2.shape}")
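The same 80-band constraint applies to CAMPPlus, whose speaker embedding is computed from a Kaldi-style log filterbank rather than the vocoder mel. A self-contained sketch of that feature path on dummy input, assuming a mono 16 kHz reference as in the function above:

import torch
import torchaudio

wave_16k = torch.randn(1, 16000)   # (channels, samples): 1 s of noise at 16 kHz
feat = torchaudio.compliance.kaldi.fbank(
    wave_16k, num_mel_bins=80, dither=0, sample_frequency=16000
)                                  # -> (frames, 80) log-mel filterbank
feat = feat - feat.mean(dim=0, keepdim=True)  # per-bin mean normalization over frames
print(feat.unsqueeze(0).shape)     # (1, frames, 80), the CAMPPlus input layout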