Staticaliza committed (verified)
Commit ce2f474 · 1 parent: 9daa517

Update app.py

Files changed (1):
  1. app.py +5 -8
app.py CHANGED
@@ -39,9 +39,6 @@ torch.set_grad_enabled(False)
 device = torch.device("cpu")
 print(f"[DEVICE] | Using device: {device}")
 
-channel_numbers = 100 # 80 by default
-main_model = "nvidia/bigvgan_24khz_100band" # nvidia/bigvgan_v2_22khz_80band_256x
-
 # ----------------------------
 # Load Models and Configuration
 # ----------------------------
@@ -93,7 +90,7 @@ campplus_model.to(device)
 print("[INFO] | CAMPPlus model loaded, set to eval mode, and moved to CPU.")
 
 # Load BigVGAN model
-bigvgan_model = bigvgan.BigVGAN.from_pretrained(main_model, use_cuda_kernel=False)
+bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)
 bigvgan_model.remove_weight_norm()
 bigvgan_model = bigvgan_model.eval().to(device)
 print("[INFO] | BigVGAN model loaded, weight norm removed, set to eval mode, and moved to CPU.")
@@ -110,7 +107,7 @@ codec_encoder = {k: v.eval().to(device) for k, v in codec_encoder.items()}
 print("[INFO] | FAcodec model loaded, set to eval mode, and moved to CPU.")
 
 # Load Whisper model with float32 and compatible size
-whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer, 'whisper_name') else "biodatlab/distill-whisper-th-small"
+whisper_name = model_params.speech_tokenizer.whisper_name if hasattr(model_params.speech_tokenizer, 'whisper_name') else "openai/whisper-small"
 whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float32).to(device)
 del whisper_model.decoder # Remove decoder as it's not used
 whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)
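Note: with the fallback now pointing at `openai/whisper-small`, the pipeline still uses only Whisper's encoder as a speech feature extractor, which is why the decoder is deleted. A minimal sketch of that encoder-only path (shapes are the standard `transformers` ones; the repo's actual preprocessing may differ):

import torch
from transformers import AutoFeatureExtractor, WhisperModel

name = "openai/whisper-small"
model = WhisperModel.from_pretrained(name, torch_dtype=torch.float32)
del model.decoder  # encoder-only use, as in app.py

extractor = AutoFeatureExtractor.from_pretrained(name)

# One second of 16 kHz silence as a stand-in for real audio.
wave = torch.zeros(16000)
inputs = extractor(wave.numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    hidden = model.encoder(inputs.input_features).last_hidden_state
print(hidden.shape)  # e.g. torch.Size([1, 1500, 768]) for whisper-small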
@@ -121,7 +118,7 @@ mel_fn_args = {
     "n_fft": 1024,
     "win_size": 1024,
     "hop_size": 256,
-    "num_mels": channel_numbers,
+    "num_mels": 80,
     "sampling_rate": sr,
     "fmin": 0,
     "fmax": None,
@@ -156,7 +153,7 @@ mel_fn_args_f0 = {
     "n_fft": config_f0['preprocess_params']['spect_params']['n_fft'],
     "win_size": config_f0['preprocess_params']['spect_params']['win_length'],
     "hop_size": config_f0['preprocess_params']['spect_params']['hop_length'],
-    "num_mels": channel_numbers,
+    "num_mels": 80, # Ensure this matches the primary model
     "sampling_rate": sr_f0,
     "fmin": 0,
     "fmax": None,
@@ -276,7 +273,7 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
 
     # Extract style features
     print("[INFO] | Extracting style features from reference audio.")
-    feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=channel_numbers, dither=0, sample_frequency=sampling_rate)
+    feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=80, dither=0, sample_frequency=sampling_rate)
     feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
     style2 = campplus_model(feat2.unsqueeze(0))
     print(f"[INFO] | Style2 shape: {style2.shape}")
 