import torch
import gradio as gr
from transformers import pipeline

MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8

# Run on the first GPU if one is available, otherwise fall back to CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# chunk_length_s=30 enables chunked long-form transcription, since Whisper
# itself only processes 30-second windows at a time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
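
# Illustrative shape of a pipeline result when return_timestamps=True is set,
# as used in transcribe_simple below:
#   {"text": "Full transcript...",
#    "chunks": [{"timestamp": (0.0, 5.2), "text": "First segment..."}, ...]}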


def transcribe_simple(inputs_path, task):
    if inputs_path is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # The pipeline decodes the audio file itself, so the path is passed
    # directly rather than loading the waveform by hand.
    out = pipe(inputs_path, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
    text = out["text"]

    # One sentence per Dataframe row, plus the full transcript for the Textbox.
    return [[transcript.strip()] for transcript in text.split(".") if transcript.strip()], text
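
# Since return_timestamps=True, out["chunks"] could drive timestamped rows
# instead of the naive sentence split above; a minimal sketch (hypothetical
# helper, not wired into the demo below):
def rows_with_timestamps(chunks):
    rows = []
    for chunk in chunks:
        start, end = chunk["timestamp"]
        # The final chunk's end can be None when the audio cuts off mid-token.
        label = f"{start:.1f}s-{end:.1f}s" if end is not None else f"{start:.1f}s-"
        rows.append([label, chunk["text"].strip()])
    return rows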


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # Gradio 4.x expects a list of input sources via `sources`;
            # older 3.x releases used a single `source` string.
            audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio")
            task_input = gr.Dropdown(choices=["transcribe", "translate"], value="transcribe", label="Task")
            submit_button = gr.Button("Transcribe")
        with gr.Column():
            output_text = gr.Dataframe(label="Transcripts")
            output_full_text = gr.Textbox(label="Full Text")

    # Wire the button inside the Blocks context so the event is registered.
    submit_button.click(
        transcribe_simple,
        inputs=[audio_input, task_input],
        outputs=[output_text, output_full_text],
    )

demo.launch()
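
# Usage note (assumption, standard Gradio options): long transcriptions can
# outlast default request timeouts; queueing requests first avoids that:
#   demo.queue().launch()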