# app.py — Hugging Face Space (mboushaba), revision 8efadfe (3.37 kB)
# NOTE: web-page chrome from the original capture ("raw / history blame")
# converted to this comment so the module is valid Python.
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from transformers import pipeline
import os
from huggingface_hub import login
# Retrieve token from environment variable.
# ASR_CEB_HUGGING_FACE_TOKEN must be set (e.g. as a Space secret);
# os.getenv returns None when it is missing, and login() would then fail.
hugging_face_token = os.getenv("ASR_CEB_HUGGING_FACE_TOKEN")
# Login using the token (needed to download the gated/private models below).
login(token=hugging_face_token)
# ASR pipeline presumably specialized for Cebuano (per the model id) — TODO confirm.
asr_ceb = pipeline("automatic-speech-recognition", model = "sil-ai/wav2vec2-bloom-speech-ceb")
# General-purpose Whisper large v3: used below both for plain transcription
# and, with generate_kwargs={"task": "translate"}, for translation.
asr_whisper_large = pipeline("automatic-speech-recognition", model = "openai/whisper-large-v3")
# Fine-tuned Whisper small; the model id suggests a translation model — TODO confirm.
asr_whisper_ceb = pipeline("automatic-speech-recognition",
model = "nlewins/whisper-small-translate-X-gen2-examples-quality-step4-1e-6")
def transcribe_speech(filepath):
    """Run one audio file through all four ASR/translation pipelines.

    Parameters
    ----------
    filepath : str | None
        Path to the recorded/uploaded audio file; ``None`` when Gradio
        received no audio.

    Returns
    -------
    tuple[str, str, str, str] | str
        ``(ceb_transcription, whisper_large_transcription,
        whisper_large_translation, whisper_ceb_output)``, or ``""`` when
        no audio was supplied.
    """
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return ""
    # Read the sample rate from the file header only — the previous
    # librosa.load(..., sr=None) decoded the whole file just to get the rate.
    sample_rate = sf.info(filepath).samplerate
    model_rate = asr_ceb.feature_extractor.sampling_rate
    if sample_rate != model_rate:
        # Best-effort: returns the original path if resampling fails.
        filepath = resample_audio_for_processing(filepath, model_rate, sample_rate)
    output_ceb = asr_ceb(filepath)
    # "task": "translate" asks Whisper to output English regardless of the
    # (auto-detected) source language.
    generate_kwargs = {
        # "language": "tagalog",  # source language (left unset: auto-detect)
        "task": "translate"
    }
    output_whisper_large_translate = asr_whisper_large(filepath, generate_kwargs = generate_kwargs)
    output_whisper_large = asr_whisper_large(filepath)
    output_whisper_ceb = asr_whisper_ceb(filepath)
    return (output_ceb["text"], output_whisper_large["text"], output_whisper_large_translate["text"],
            output_whisper_ceb["text"])
def resample_audio_for_processing(filepath, model_rate, sample_rate):
    """Resample an audio file to ``model_rate`` and save it to disk.

    Parameters
    ----------
    filepath : str
        Path of the audio file to resample.
    model_rate : int
        Target sampling rate required by the model (Hz).
    sample_rate : int
        Sampling rate the file was reported to have (Hz), for logging.

    Returns
    -------
    str
        Path of the resampled wav file, or the original ``filepath``
        unchanged if resampling fails (best-effort fallback).
    """
    print(f"Audio loaded with rate: {sample_rate} Hz while model requires rate: {model_rate} Hz")
    try:
        print("Resampling audio...")
        # sr=None preserves the file's native rate; librosa returns a NumPy array.
        audio_data, sr = librosa.load(filepath, sr = None)
        # Use the rate we actually loaded (sr) rather than the caller-supplied
        # value — they should match, but sr is authoritative.
        audio_resampled = librosa.resample(np.asarray(audio_data), orig_sr = sr, target_sr = model_rate)
        resampled_audio_path = 'resampled_audio.wav'
        # Bug fix: write at model_rate (was hard-coded 16000), so the saved
        # file's header matches the rate the audio was resampled to.
        sf.write(resampled_audio_path, audio_resampled, model_rate)
        print("Audio resampled successfully.")
        return resampled_audio_path
    except Exception as e:
        # Deliberate best-effort: fall back to the original file on any error.
        print(f"Error resampling audio: {e}, processing with audio as is it !")
        return filepath
# Microphone tab: record audio and feed it through transcribe_speech.
mic_transcribe = gr.Interface(
    fn = transcribe_speech,
    inputs = gr.Audio(sources = ["microphone"], type = "filepath"),
    outputs = [gr.Textbox(label = "Transcription CEB (sil-ai)"),
               gr.Textbox(label = "Transcription (openai)"),
               gr.Textbox(label = "Translation (openai)"),
               # Consistency fix: was "Transcription (nlewins)" — the nlewins
               # model id ("whisper-small-translate-...") is a translation
               # model, and the file-upload tab already labels it so.
               gr.Textbox(label = "Translation (nlewins)")],
    allow_flagging = "never")
# File-upload tab: same pipeline fan-out as the microphone tab, but the
# audio comes from an uploaded file. One textbox per model output.
_file_output_boxes = [
    gr.Textbox(label = "Transcription CEB (sil-ai)"),
    gr.Textbox(label = "Transcription (openai)"),
    gr.Textbox(label = "Translation (openai)"),
    gr.Textbox(label = "Translation (nlewins)"),
]
file_transcribe = gr.Interface(
    fn = transcribe_speech,
    inputs = gr.Audio(sources = ["upload"], type = "filepath"),
    outputs = _file_output_boxes,
    allow_flagging = "never",
)
# Combine both input modes (microphone / file upload) into one tabbed app.
demo = gr.TabbedInterface(
    interface_list = [mic_transcribe, file_transcribe],
    tab_names = ["Use your Microphone", "Upload Audio File"],
)

# Launch the Gradio server only when run as a script.
if __name__ == '__main__':
    demo.launch()