import tempfile

import gradio as gr
import pytube as pt
import torch
from transformers import pipeline
# Model identifier handed to `pipeline(model=...)`.
MODEL_NAME = "ales/whisper-small-belarusian"
# Language code used below when building the decoder prompt ids.
lang = "be"

# Use device index 0 when torch reports CUDA as available, else the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"
print(f"Device set to use {device}")

# Shared speech-recognition pipeline; created once at import time and reused
# by every request handler in this file.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=8,
    stride_length_s=1,
)

# Store the tokenizer's decoder prompt ids for `lang` / "transcribe" on the
# model config so they apply to every call made through `pipe`.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    task="transcribe",
    language=lang,
)
def transcribe(audio_file):
    """Transcribe an audio file with the module-level `pipe`.

    Args:
        audio_file: Value forwarded to `pipe`, or None when no audio was
            supplied.

    Returns:
        The "text" entry of the pipeline result, or an error message string
        when `audio_file` is None.
    """
    if audio_file is not None:
        result = pipe(audio_file)
        return result["text"]
    # Nothing was supplied: report it as text instead of calling the model.
    return "ERROR: Please upload or record audio"
def yt_transcribe(yt_url):
    """Download a YouTube video's audio track and transcribe it.

    Args:
        yt_url: URL of the YouTube video. None, empty, or whitespace-only
            values are rejected with an error message.

    Returns:
        A 2-tuple `(embed_html, text)`: an HTML snippet embedding the video
        and the transcription produced by the module-level `pipe`. On invalid
        input the HTML is empty and `text` holds an error message; when the
        video exposes no audio-only stream the embed is still returned
        together with an error message.

    Raises:
        Whatever `pytube` raises for malformed or unavailable videos; those
        errors are not intercepted here.
    """
    yt_url = (yt_url or "").strip()
    if not yt_url:
        return "", "ERROR: You must provide a YouTube URL."

    yt = pt.YouTube(yt_url)
    # Reuse the id pytube already parsed from the URL instead of splitting on
    # "?v=", which breaks for short links, embed links, or reordered query
    # parameters.
    video_id = yt.video_id
    # NOTE(review): the original iframe markup was lost from this file; the
    # snippet below is a reconstruction — confirm size/attributes.
    embed = (
        '<center><iframe width="500" height="320" '
        f'src="https://www.youtube.com/embed/{video_id}"></iframe></center>'
    )

    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        return embed, "ERROR: No audio stream is available for this video."

    # Download into a per-call temporary directory so concurrent requests do
    # not overwrite each other's file and nothing is left behind afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        # Transcribe before leaving the `with` block, while the file exists.
        text = pipe(audio_path)["text"]
    return embed, text
# Two-tab Gradio interface: one tab for recorded/uploaded audio, one for
# YouTube links. Each tab wires a button to its handler function.
with gr.Blocks() as demo:
    # Tab 1: audio arrives as a file path and is passed to `transcribe`.
    with gr.Tab("🎤 Transcribe Audio"):
        # Heading (Belarusian): "Record or upload audio".
        gr.Markdown("## Запішы або загрузі аўдыё")
        audio_input = gr.Audio(label="Record or Upload Audio", type="filepath")
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription")
        transcribe_button.click(
            transcribe,
            [audio_input],
            [transcription_output],
        )

    # Tab 2: a URL is passed to `yt_transcribe`, which returns the embed HTML
    # and the transcription text, in that order.
    with gr.Tab("📺 Transcribe YouTube"):
        # Heading (Belarusian): "Paste a link to a YouTube video".
        gr.Markdown("## Устаў спасылку на YouTube-відэа")
        yt_input = gr.Textbox(
            placeholder="https://www.youtube.com/watch?v=...",
            label="YouTube URL",
        )
        yt_button = gr.Button("Transcribe YouTube")
        yt_embed = gr.HTML()
        yt_text = gr.Textbox(label="Transcription")
        yt_button.click(
            yt_transcribe,
            [yt_input],
            [yt_embed, yt_text],
        )

# Start the web server for the interface defined above.
demo.launch()