Spaces: Running on Zero
Staticaliza committed
Update app.py
app.py CHANGED
@@ -39,9 +39,6 @@ torch.set_grad_enabled(False)
 device = torch.device("cpu")
 print(f"[DEVICE] | Using device: {device}")
 
-channel_numbers = 100 # 80 by default
-main_model = "nvidia/bigvgan_24khz_100band" # nvidia/bigvgan_v2_22khz_80band_256x
-
 # ----------------------------
 # Load Models and Configuration
 # ----------------------------
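The removed globals were the only knobs selecting the 100-band 24 kHz checkpoint; after this commit the pipeline is pinned to the 80-band 22 kHz v2 model, so every mel computation below must produce exactly 80 bands. A minimal sketch of a guard that makes that constraint explicit, assuming the BigVGAN checkpoint exposes its hyperparameters as model.h (true for the official NVIDIA implementation, but worth verifying):

# Hypothetical guard, not part of the commit: fail fast if the mel band
# count and the vocoder's training config ever disagree again.
NUM_MELS = 80  # single source of truth after this change

def check_mel_compat(vocoder, num_mels=NUM_MELS):
    # Assumes the official BigVGAN `h` hyperparameter namespace.
    expected = vocoder.h.num_mels
    if expected != num_mels:
        raise ValueError(f"Extractor emits {num_mels} mel bands, vocoder expects {expected}.")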
@@ -93,7 +90,7 @@ campplus_model.to(device)
 print("[INFO] | CAMPPlus model loaded, set to eval mode, and moved to CPU.")
 
 # Load BigVGAN model
-bigvgan_model = bigvgan.BigVGAN.from_pretrained(main_model, use_cuda_kernel=False)
+bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
 bigvgan_model.remove_weight_norm()
 bigvgan_model = bigvgan_model.eval().to(device)
 print("[INFO] | BigVGAN model loaded, weight norm removed, set to eval mode, and moved to CPU.")
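On a CPU-only Space, use_cuda_kernel=False is required, since the fused upsampling kernel only builds against CUDA. A rough sketch of inference with the loaded vocoder, assuming the official forward signature (mel of shape (batch, 80, frames) in, audio of shape (batch, 1, samples) out):

import torch

with torch.inference_mode():
    mel = torch.randn(1, 80, 100)           # placeholder log-mel, 100 frames
    audio = bigvgan_model(mel)              # -> (1, 1, 100 * 256) at hop 256
    wav = audio.squeeze().clamp(-1.0, 1.0)  # 1-D waveform in [-1, 1]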
@@ -110,7 +107,7 @@ codec_encoder = {k: v.eval().to(device) for k, v in codec_encoder.items()}
 print("[INFO] | FAcodec model loaded, set to eval mode, and moved to CPU.")
 
 # Load Whisper model with float32 and compatible size
-whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer, 'whisper_name') else "
+whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer, 'whisper_name') else "openai/whisper-small"
 whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float32).to(device)
 del whisper_model.decoder # Remove decoder as it's not used
 whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
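Deleting whisper_model.decoder frees roughly half the checkpoint's memory, but it means the model can only be driven through its encoder from here on. A sketch of that encoder-only path, where ref_wave_16k stands in for a hypothetical 1-D 16 kHz waveform (Whisper's native rate):

import torch

inputs = whisper_feature_extractor(ref_wave_16k, sampling_rate=16000, return_tensors="pt")
with torch.inference_mode():
    # whisper_model(...) would now fail without its decoder; call the encoder directly.
    hidden = whisper_model.encoder(inputs.input_features.to(torch.float32)).last_hidden_state
print(hidden.shape)  # (1, 1500, hidden_size) for a padded 30 s window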
@@ -121,7 +118,7 @@ mel_fn_args = {
     "n_fft": 1024,
     "win_size": 1024,
     "hop_size": 256,
-    "num_mels": channel_numbers,
+    "num_mels": 80,
     "sampling_rate": sr,
     "fmin": 0,
     "fmax": None,
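These values (1024-point FFT, hop 256, 80 mels) mirror the training config of the 22 kHz 80-band checkpoint. For illustration only, a roughly equivalent torchaudio construction; the app builds its own mel function from mel_fn_args, and BigVGAN additionally expects its specific log scaling, so this sketch is not a drop-in replacement:

import torchaudio

mel_xform = torchaudio.transforms.MelSpectrogram(
    sample_rate=22050,   # assumed value of sr for the 22 kHz checkpoint
    n_fft=1024,
    win_length=1024,
    hop_length=256,
    f_min=0.0,
    f_max=None,          # None -> Nyquist (sample_rate / 2)
    n_mels=80,
)
# One second of audio yields about 22050 // 256 + 1 = 87 frames of 80 bands.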
@@ -156,7 +153,7 @@ mel_fn_args_f0 = {
     "n_fft": config_f0['preprocess_params']['spect_params']['n_fft'],
     "win_size": config_f0['preprocess_params']['spect_params']['win_length'],
     "hop_size": config_f0['preprocess_params']['spect_params']['hop_length'],
-    "num_mels": channel_numbers,
+    "num_mels": 80, # Ensure this matches the primary model
     "sampling_rate": sr_f0,
     "fmin": 0,
     "fmax": None,
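num_mels is now duplicated across two hand-written dicts that feed the same 80-band vocoder, which is exactly the kind of drift this commit had to fix. A one-line guard placed after both dicts are defined would keep them honest:

# Hypothetical consistency check, using the two dicts defined above.
assert mel_fn_args["num_mels"] == mel_fn_args_f0["num_mels"] == 80, \
    "num_mels must match the 80-band BigVGAN vocoder"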
@@ -276,7 +273,7 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
 
     # Extract style features
     print("[INFO] | Extracting style features from reference audio.")
-    feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=channel_numbers, dither=0, sample_frequency=sampling_rate)
+    feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=80, dither=0, sample_frequency=sampling_rate)
     feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
     style2 = campplus_model(feat2.unsqueeze(0))
     print(f"[INFO] | Style2 shape: {style2.shape}")
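The same 80-band constraint applies to CAMPPlus, whose speaker embedding is computed from a Kaldi-style log filterbank rather than the vocoder mel. A self-contained sketch of that feature path on dummy input, assuming a mono 16 kHz reference as in the function above:

import torch
import torchaudio

wave_16k = torch.randn(1, 16000)   # (channels, samples): 1 s of noise at 16 kHz
feat = torchaudio.compliance.kaldi.fbank(
    wave_16k, num_mel_bins=80, dither=0, sample_frequency=16000
)                                  # -> (frames, 80) log-mel filterbank
feat = feat - feat.mean(dim=0, keepdim=True)  # per-bin mean normalization over frames
print(feat.unsqueeze(0).shape)     # (1, frames, 80), the CAMPPlus input layout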