Account for recorded audio format
app.py
@@ -1,4 +1,5 @@
 import gradio as gr
+import librosa
 import logging
 import numpy as np
 import torch
@@ -28,6 +29,15 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze


 def translate(audio):
+    if isinstance(audio, str):
+        # Account for recorded audio
+        audio = {
+            "path": audio,
+            "sampling_rate": 16_000,
+            "array": librosa.load(audio, sr=16_000)[0]
+        }
+    elif audio["sampling_rate"] != 16_000:
+        audio["array"] = librosa.resample(audio["array"], audio["sampling_rate"], 16_000)
     input_features = whisper_processor(audio["array"], sampling_rate=16000, return_tensors="pt").input_features
     predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=decoder_ids)
     translated_text = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
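For context, the new branch appears to handle the case where Gradio passes the callback a bare file path (typically what a microphone recording yields when the Audio input uses type="filepath"), while other inputs already arrive as a dict with "array" and "sampling_rate" keys. One caveat: librosa 0.10 removed the positional form of resample, so on newer versions that call needs orig_sr/target_sr keyword arguments. Below is a minimal sketch of the same normalization written that way; the helper name _to_whisper_input is made up for illustration, and the keyword-argument form assumes librosa >= 0.10.

import librosa

def _to_whisper_input(audio):
    # Hypothetical helper mirroring the normalization added in the diff:
    # recorded audio arrives as a file path, other inputs as a dict.
    if isinstance(audio, str):
        # librosa.load decodes the file and resamples to 16 kHz in one step
        array, _ = librosa.load(audio, sr=16_000)
        audio = {"path": audio, "sampling_rate": 16_000, "array": array}
    elif audio["sampling_rate"] != 16_000:
        # librosa >= 0.10 requires keyword arguments here
        audio["array"] = librosa.resample(
            audio["array"], orig_sr=audio["sampling_rate"], target_sr=16_000
        )
        audio["sampling_rate"] = 16_000
    return audio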