Staticaliza committed
Update app.py
app.py CHANGED
@@ -207,7 +207,7 @@ def voice_conversion(input, reference, steps, guidance, speed):
     # Generate Whisper features
     print("[INFO] | Generating Whisper features for source audio.")
     if converted_waves_16k.size(-1) <= sampling_rate * 30:
-        alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate)
+        alt_inputs = whisper_feature_extractor([converted_waves_16k.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate=sampling_rate)
         alt_input_features = whisper_model._mask_input_features(alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
         alt_outputs = whisper_model.encoder(alt_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
         S_alt = alt_outputs.last_hidden_state.to(torch.float32)
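The change is identical in this hunk and the two that follow: sampling_rate was passed positionally after keyword arguments, which Python rejects at parse time ("SyntaxError: positional argument follows keyword argument"), so app.py could not even be imported. The fix passes it by keyword. A minimal sketch of the corrected call, assuming a stock transformers WhisperFeatureExtractor; the checkpoint name and the silent test clip are placeholders, not taken from the Space:

# Minimal sketch of the corrected call; checkpoint and test audio are
# assumptions for illustration only.
import numpy as np
from transformers import WhisperFeatureExtractor

sampling_rate = 16000
extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")  # assumed checkpoint
wave = np.zeros(sampling_rate, dtype=np.float32)  # 1 s of silence as stand-in audio

# Old form (rejected by the parser before the app can start):
#   extractor([wave], return_tensors="pt", return_attention_mask=True, sampling_rate)
# Fixed form, as in this commit:
inputs = extractor([wave], return_tensors="pt", return_attention_mask=True, sampling_rate=sampling_rate)
print(inputs.input_features.shape)  # e.g. torch.Size([1, 80, 3000]) log-mel features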
@@ -229,7 +229,7 @@ def voice_conversion(input, reference, steps, guidance, speed):
             chunk = converted_waves_16k[:, traversed_time:traversed_time + chunk_size]
         else:
             chunk = torch.cat([buffer, converted_waves_16k[:, traversed_time:traversed_time + chunk_size - overlap_size]], dim=-1)
-        alt_inputs = whisper_feature_extractor([chunk.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate)
+        alt_inputs = whisper_feature_extractor([chunk.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate=sampling_rate)
         alt_input_features = whisper_model._mask_input_features(alt_inputs.input_features, attention_mask=alt_inputs.attention_mask).to(device)
         alt_outputs = whisper_model.encoder(alt_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
         S_chunk = alt_outputs.last_hidden_state.to(torch.float32)
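This hunk sits in the long-audio path: source audio over 30 s is split into overlapping windows, and each window reuses the tail of the previous one as a buffer. A minimal sketch of that overlap-chunking pattern; the window and overlap lengths and the bookkeeping here are assumptions, not the Space's exact code:

# Hypothetical sketch of the overlap-chunking pattern around this hunk.
import torch

sampling_rate = 16000
chunk_size = sampling_rate * 30   # Whisper's 30-second window
overlap_size = sampling_rate * 5  # assumed 5-second overlap

def iter_chunks(waves_16k: torch.Tensor):
    # Yield <=30 s windows of a (1, T) waveform, prepending the tail of the
    # previous window so consecutive chunks share overlap_size samples.
    buffer = None
    traversed_time = 0
    while traversed_time < waves_16k.size(-1):
        if buffer is None:
            chunk = waves_16k[:, traversed_time:traversed_time + chunk_size]
            traversed_time += chunk.size(-1)
        else:
            new_part = waves_16k[:, traversed_time:traversed_time + chunk_size - overlap_size]
            chunk = torch.cat([buffer, new_part], dim=-1)
            traversed_time += new_part.size(-1)
        yield chunk
        buffer = chunk[:, -overlap_size:]

waves = torch.zeros(1, sampling_rate * 70)  # 70 s of placeholder audio
for i, chunk in enumerate(iter_chunks(waves)):
    print(i, chunk.size(-1) / sampling_rate, "s")  # 30.0 s, 30.0 s, 20.0 s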
@@ -251,7 +251,7 @@ def voice_conversion(input, reference, steps, guidance, speed):
     # Original Whisper features
     print("[INFO] | Generating Whisper features for reference audio.")
     ori_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate)
-    ori_inputs = whisper_feature_extractor([ori_waves_16k.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate)
+    ori_inputs = whisper_feature_extractor([ori_waves_16k.squeeze(0).cpu().numpy()], return_tensors="pt", return_attention_mask=True, sampling_rate=sampling_rate)
     ori_input_features = whisper_model._mask_input_features(ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
     ori_outputs = whisper_model.encoder(ori_input_features.to(torch.float32), head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True)
     S_ori = ori_outputs.last_hidden_state.to(torch.float32)
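In this third hunk, the reference audio is first resampled to Whisper's 16 kHz before the (now keyword-correct) extractor call. A tiny sketch of that resampling step; the 22.05 kHz source rate and placeholder tensor are assumptions:

# Hypothetical sketch of the resampling step; source rate is assumed.
import torch
import torchaudio

sampling_rate = 16000  # Whisper's expected input rate
sr_current = 22050     # assumed native rate of the reference audio
ref_audio_tensor = torch.zeros(1, sr_current * 2)  # 2 s of placeholder audio

ori_waves_16k = torchaudio.functional.resample(ref_audio_tensor, sr_current, sampling_rate)
print(ori_waves_16k.shape)  # torch.Size([1, 32000]), i.e. 2 s at 16 kHz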