Staticaliza committed on
Commit
546984b
·
verified ·
1 Parent(s): da0e734

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -3
app.py CHANGED
@@ -207,7 +207,7 @@ def voice_conversion(input, reference, steps, guidance, speed):
207
  # Generate Whisper features
208
  print("[INFO] | Generating Whisper features for source audio.")
209
  if converted_waves_16k.size(-1) <= sampling_rate * 30:
210
- alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate)
211
  alt_input_features = whisper_model._mask_input_features(alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
212
  alt_outputs = whisper_model.encoder(alt_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
213
  S_alt = alt_outputs.last_hidden_state.to(torch.float32)
@@ -229,7 +229,7 @@ def voice_conversion(input, reference, steps, guidance, speed):
229
  chunk = converted_waves_16k[:, traversed_time:traversed_time + chunk_size]
230
  else:
231
  chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + chunk_size - overlap_size]], dim=-1)
232
- alt_inputs = whisper_feature_extractor([chunk.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate)
233
  alt_input_features = whisper_model._mask_input_features(alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
234
  alt_outputs = whisper_model.encoder(alt_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
235
  S_chunk = alt_outputs.last_hidden_state.to(torch.float32)
@@ -251,7 +251,7 @@ def voice_conversion(input, reference, steps, guidance, speed):
251
  # Original Whisper features
252
  print("[INFO] | Generating Whisper features for reference audio.")
253
  ori_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate)
254
- ori_inputs = whisper_feature_extractor([ori_waves_16k.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate)
255
  ori_input_features = whisper_model._mask_input_features(ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
256
  ori_outputs = whisper_model.encoder(ori_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
257
  S_ori = ori_outputs.last_hidden_state.to(torch.float32)
 
207
  # Generate Whisper features
208
  print("[INFO] | Generating Whisper features for source audio.")
209
  if converted_waves_16k.size(-1) <= sampling_rate * 30:
210
+ alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate=sampling_rate)
211
  alt_input_features = whisper_model._mask_input_features(alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
212
  alt_outputs = whisper_model.encoder(alt_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
213
  S_alt = alt_outputs.last_hidden_state.to(torch.float32)
 
229
  chunk = converted_waves_16k[:, traversed_time:traversed_time + chunk_size]
230
  else:
231
  chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + chunk_size - overlap_size]], dim=-1)
232
+ alt_inputs = whisper_feature_extractor([chunk.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate=sampling_rate)
233
  alt_input_features = whisper_model._mask_input_features(alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
234
  alt_outputs = whisper_model.encoder(alt_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
235
  S_chunk = alt_outputs.last_hidden_state.to(torch.float32)
 
251
  # Original Whisper features
252
  print("[INFO] | Generating Whisper features for reference audio.")
253
  ori_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate)
254
+ ori_inputs = whisper_feature_extractor([ori_waves_16k.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate=sampling_rate)
255
  ori_input_features = whisper_model._mask_input_features(ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
256
  ori_outputs = whisper_model.encoder(ori_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
257
  S_ori = ori_outputs.last_hidden_state.to(torch.float32)