# Source header (from hosting page): nata0801 — "Update app.py", commit 7a8abcb.
import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import Wav2Vec2Tokenizer, Wav2Vec2ForCTC
# The Wav2Vec2 tokenizer and model are loaded inside asr_transcript, per call.
# Define the speech-to-text function.
def asr_transcript(audio_file, language):
    """Transcribe an uploaded audio file to lowercase text with Wav2Vec2.

    Parameters
    ----------
    audio_file : file-like object with a ``.name`` path (Gradio ``type="file"``
        upload).
    language : str
        One of ``"English"``, ``"Russian"``, ``"French"`` — selects the
        Hugging Face checkpoint.

    Returns
    -------
    str
        Concatenated lowercase transcript of all streamed chunks.

    Raises
    ------
    ValueError
        If *language* is not a supported choice (the original if/elif chain
        left ``model_name`` unbound and crashed with a NameError instead).
    """
    # Dict lookup replaces the if/elif chain and gives an explicit error
    # for unsupported languages.
    checkpoints = {
        "English": "facebook/wav2vec2-large-960h-lv60-self",
        "Russian": "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
        "French": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
    }
    try:
        model_name = checkpoints[language]
    except KeyError:
        raise ValueError(f"Unsupported language: {language!r}") from None

    # Load tokenizer and model for the chosen checkpoint (downloaded/cached
    # by transformers on first use).
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)

    # Stream in 20-block chunks so long recordings need not fit in memory.
    # NOTE(review): librosa.stream does not resample; this assumes the file
    # is already at 16 kHz as Wav2Vec2 expects (frame/hop of 16000 samples
    # == 1 s at that rate) — confirm upstream.
    stream = librosa.stream(
        audio_file.name, block_length=20, frame_length=16000, hop_length=16000
    )

    pieces = []
    for speech in stream:
        # Down-mix multi-channel audio to mono by AVERAGING the channels.
        # The original summed the two channels, which doubles the amplitude
        # and can clip/distort; mean() also generalizes beyond exactly 2
        # channels.
        if speech.ndim > 1:
            speech = speech.mean(axis=1)
        input_values = tokenizer(speech, return_tensors="pt").input_values
        # Inference only — disabling autograd saves memory and time.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        pieces.append(tokenizer.batch_decode(predicted_ids)[0].lower())
    # join() instead of quadratic += string building.
    return " ".join(pieces)
# Wire the transcription function into a simple Gradio web UI:
# an audio-file upload plus a language selector in, a text box out.
language_choices = ["English", "Russian", "French"]
gradio_ui = gr.Interface(
    fn=asr_transcript,
    title="Automatic speech recognition with Wav2Vec2",
    description="Upload an audio clip in Russian, English, or French and let AI do the hard work of transcribing",
    inputs=[
        gr.inputs.Audio(label="Upload Audio File", type="file"),
        gr.inputs.Radio(label="Pick a language", choices=language_choices),
    ],
    outputs=gr.outputs.Textbox(label="Auto-Transcript"),
)
gradio_ui.launch()