"""Gradio demo for Belarusian speech recognition with a fine-tuned Whisper model.

The app exposes two tabs: one transcribes an uploaded or recorded audio file,
the other downloads the audio track of a YouTube video and transcribes it.
"""

import html
import tempfile
from typing import Optional, Tuple

import gradio as gr
import pytube as pt
import torch
from transformers import pipeline

MODEL_NAME = "ales/whisper-small-belarusian"
lang = "be"

# Use the first CUDA device when one is available, otherwise run on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
print(f"Device set to use {device}")

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=8,
    stride_length_s=1,
    device=device,
)
# Pin the decoder prompt so the model always transcribes in `lang` instead of
# auto-detecting the language or translating.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=lang, task="transcribe"
)


def transcribe(audio_file: Optional[str]) -> str:
    """Transcribe an audio file and return the recognized text.

    Args:
        audio_file: Path to the audio file (the Gradio audio input is
            configured with ``type="filepath"``), or ``None`` when nothing
            was uploaded or recorded.

    Returns:
        The transcription, or an error message when no audio was provided.
    """
    if audio_file is None:
        return "ERROR: Please upload or record audio"
    return pipe(audio_file)["text"]


def _youtube_embed_html(video_id: str) -> str:
    """Return an HTML snippet that embeds the YouTube player for *video_id*."""
    # NOTE(review): the original markup was lost from the source file; this is
    # a reconstruction of a standard YouTube iframe embed -- adjust the size
    # if the previous layout differed.
    safe_id = html.escape(video_id, quote=True)
    return (
        '<center><iframe width="500" height="320" '
        f'src="https://www.youtube.com/embed/{safe_id}"></iframe></center>'
    )


def yt_transcribe(yt_url: Optional[str]) -> Tuple[str, str]:
    """Download the audio of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(embed_html, text)`` pair: the HTML embedding the video player and
        the transcription. On a validation failure the HTML is empty and the
        text carries an error message.
    """
    yt_url = (yt_url or "").strip()
    if not yt_url:
        return "", "ERROR: You must provide a YouTube URL."

    yt = pt.YouTube(yt_url)
    # Use the id pytube parsed from the URL rather than splitting on "?v=",
    # which breaks for short links and other URL layouts.
    embed = _youtube_embed_html(yt.video_id)

    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        return "", "ERROR: No audio stream is available for this video."

    # Download into a per-request temporary directory so that concurrent
    # requests do not overwrite each other's file and nothing is left behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = stream.download(output_path=tmp_dir, filename="audio.mp3")
        text = pipe(audio_path)["text"]

    return embed, text


with gr.Blocks() as demo:
    with gr.Tab("🎤 Transcribe Audio"):
        gr.Markdown("## Запішы або загрузі аўдыё")
        audio_input = gr.Audio(type="filepath", label="Record or Upload Audio")
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription")
        transcribe_button.click(
            fn=transcribe,
            inputs=[audio_input],
            outputs=[transcription_output],
        )

    with gr.Tab("📺 Transcribe YouTube"):
        gr.Markdown("## Устаў спасылку на YouTube-відэа")
        yt_input = gr.Textbox(
            label="YouTube URL",
            placeholder="https://www.youtube.com/watch?v=...",
        )
        yt_button = gr.Button("Transcribe YouTube")
        yt_embed = gr.HTML()
        yt_text = gr.Textbox(label="Transcription")
        yt_button.click(
            fn=yt_transcribe,
            inputs=[yt_input],
            outputs=[yt_embed, yt_text],
        )

demo.launch()