# Hugging Face Spaces status banner (page residue): Running on T4
import html
import os
import tempfile
from typing import Tuple

import gradio as gr
import torch
import yt_dlp as yt
from transformers import pipeline
#from transformers.pipelines.audio_utils import ffmpeg_read
from yt_dlp import YoutubeDL
# Checkpoint loaded into the speech-recognition pipeline below.
MODEL_NAME = "openai/whisper-large-v2"
# Batch size passed to every pipeline call.
BATCH_SIZE = 8
# Largest accepted upload, in megabytes (checked in transcribe()).
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Use CUDA device 0 when a GPU is visible to torch, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared ASR pipeline, created once at import time and reused by every request.
pipe = pipeline(
    model=MODEL_NAME,
    task="automatic-speech-recognition",
    device=device,
    chunk_length_s=30,
)
def transcribe(microphone, file_upload, task):
    """Run the speech pipeline on a microphone recording or an uploaded file.

    When both inputs are supplied, the microphone recording is used and a
    warning is prepended to the returned text.

    Args:
        microphone: Path to the recorded audio file, or None if nothing was
            recorded.
        file_upload: Path to the uploaded audio file, or None if nothing was
            uploaded.
        task: Forwarded to the pipeline as ``generate_kwargs["task"]``.

    Returns:
        The text produced by the pipeline, optionally prefixed by a warning.

    Raises:
        gr.InterfaceError: If neither input is given, or if the uploaded file
            is larger than ``FILE_LIMIT_MB``.
    """
    # NOTE(review): confirm the installed Gradio exposes `InterfaceError`;
    # if it does not, these raises surface as AttributeError instead.
    has_recording = microphone is not None
    has_upload = file_upload is not None

    if not (has_recording or has_upload):
        raise gr.InterfaceError("You have to either use the microphone or upload an audio file")

    notice = ""
    if has_recording and has_upload:
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The size limit is checked on the upload whenever one is present, even if
    # the microphone recording ends up being the file that is transcribed.
    if has_upload:
        upload_size_mb = os.stat(file_upload).st_size / (1024 * 1024)
        if upload_size_mb > FILE_LIMIT_MB:
            raise gr.InterfaceError(
                f"File size exceeds file size limit. Got file of size {upload_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."
            )

    # The microphone recording takes precedence over the upload.
    source_path = microphone if has_recording else file_upload
    with open(source_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    result = pipe(audio_bytes, batch_size=BATCH_SIZE, generate_kwargs={"task": task})
    return notice + result["text"]
def download_yt_audio(yt_url, filename):
    """Download the media behind ``yt_url`` to ``filename`` with yt-dlp.

    Args:
        yt_url: URL handed to yt-dlp for download.
        filename: Destination path, passed to yt-dlp as its ``outtmpl`` option.

    Raises:
        gr.InterfaceError: If yt-dlp reports a download or extraction failure;
            the original yt-dlp exception is chained as the cause.
    """
    ydl_opts = {
        "outtmpl": filename,
        "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
    }
    with yt.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        # yt-dlp reports failed downloads as DownloadError, which is a sibling
        # of ExtractorError rather than a subclass, so catching only
        # ExtractorError lets the usual failure escape untranslated.
        except (yt.utils.DownloadError, yt.utils.ExtractorError) as err:
            raise gr.InterfaceError(str(err)) from err
def yt_transcribe(yt_url, task, max_filesize=75.0) -> Tuple[str, str]:
    """Download a video with yt-dlp and run the speech pipeline on it.

    Args:
        yt_url: URL of the video to process.
        task: Forwarded to the pipeline as ``generate_kwargs["task"]``.
        max_filesize: Accepted for backward compatibility; not used by this
            function.

    Returns:
        A ``(html, text)`` tuple: an HTML snippet embedding the video in an
        iframe, and the text produced by the pipeline.
    """
    # NOTE(review): YT_LENGTH_LIMIT_S is defined at module level but is not
    # checked here, and `max_filesize` is ignored - confirm whether either
    # limit is meant to be enforced before downloading.

    # Metadata-only lookup; only the video id is needed for the embed snippet.
    with YoutubeDL({}) as ydl:
        info_dict = ydl.extract_info(yt_url, download=False)

    # The id comes from the extractor for a user-supplied URL, so escape it
    # before placing it inside an HTML attribute. This is a no-op for ids made
    # only of letters, digits, "-" and "_".
    video_id = html.escape(str(info_dict["id"]), quote=True)
    html_embed_str = f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe> </center>'

    # The download lives in a temporary directory; its bytes are read into
    # memory so the directory can be removed before inference starts.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            inputs = f.read()

    # The raw file bytes are handed to the pipeline as-is.
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task})["text"]
    return html_embed_str, text
# Container that will host both tabs; it is populated further down.
demo = gr.Blocks()

# Tab 1: transcribe a microphone recording or an uploaded audio file.
mf_transcribe = gr.Interface(
    title="Whisper Large V2: Transcribe Audio",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    fn=transcribe,
    # Input order matches transcribe(microphone, file_upload, task).
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
        gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
)
# Tab 2: transcribe a video given its URL.
# The name `yt_transcribe` is rebound here from the function to the Interface;
# `fn=yt_transcribe` is evaluated before the assignment, so it still refers to
# the function.
yt_transcribe = gr.Interface(
    title="Whisper Large V2: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    fn=yt_transcribe,
    # Input order matches yt_transcribe(yt_url, task).
    inputs=[
        gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
    ],
    # The function returns (html_embed_str, text), in that order.
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
)
# Mount the two interfaces as tabs inside the shared Blocks container.
with demo:
    gr.TabbedInterface(
        [mf_transcribe, yt_transcribe],
        ["Transcribe Audio", "Transcribe YouTube"],
    )

# Start the app, passing enable_queue=True to launch().
demo.launch(enable_queue=True)