versae committed on
Commit
ea81981
1 Parent(s): c3e0f72

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -35
app.py CHANGED
@@ -4,6 +4,7 @@ import torch
4
 
5
  import gradio as gr
6
  import pytube as pt
 
7
  from transformers import pipeline
8
  from huggingface_hub import model_info
9
 
@@ -12,33 +13,39 @@ lang = "fi"
12
 
13
  share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
14
  auth_token = os.environ.get("AUTH_TOKEN") or True
15
- device = 0 if torch.cuda.is_available() else "cpu"
16
- pipe = pipeline(
17
- task="automatic-speech-recognition",
18
- model=MODEL_NAME,
19
- chunk_length_s=30,
20
- device=device,
21
- token=auth_token,
22
- )
23
-
24
- pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
25
-
26
- def transcribe(microphone, file_upload):
27
- warn_output = ""
28
- if (microphone is not None) and (file_upload is not None):
29
- warn_output = (
30
- "WARNING: You've uploaded an audio file and used the microphone. "
31
- "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
32
- )
33
-
34
- elif (microphone is None) and (file_upload is None):
35
- return "ERROR: You have to either use the microphone or upload an audio file"
36
-
37
- file = microphone if microphone is not None else file_upload
38
-
39
- text = pipe(file)["text"]
40
-
41
- return warn_output + text
 
 
 
 
 
 
42
 
43
 
44
  def _return_yt_html_embed(yt_url):
@@ -50,13 +57,13 @@ def _return_yt_html_embed(yt_url):
50
  return HTML_str
51
 
52
 
53
- def yt_transcribe(yt_url):
54
  yt = pt.YouTube(yt_url)
55
  html_embed_str = _return_yt_html_embed(yt_url)
56
  stream = yt.streams.filter(only_audio=True)[0]
57
  stream.download(filename="audio.mp3")
58
 
59
- text = pipe("audio.mp3")["text"]
60
 
61
  return html_embed_str, text
62
 
@@ -66,11 +73,10 @@ demo = gr.Blocks()
66
  mf_transcribe = gr.Interface(
67
  fn=transcribe,
68
  inputs=[
69
- gr.inputs.Audio(source="microphone", type="filepath", optional=True),
70
- gr.inputs.Audio(source="upload", type="filepath", optional=True),
71
  ],
72
  outputs="text",
73
- layout="horizontal",
74
  theme="huggingface",
75
  title="Whisper Demo: Transcribe Audio",
76
  description=(
@@ -83,10 +89,12 @@ mf_transcribe = gr.Interface(
83
 
84
  yt_transcribe = gr.Interface(
85
  fn=yt_transcribe,
86
- inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
 
 
 
87
  examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
88
  outputs=["html", "text"],
89
- layout="horizontal",
90
  theme="huggingface",
91
  title="Whisper Demo: Transcribe YouTube",
92
  description=(
@@ -100,4 +108,4 @@ yt_transcribe = gr.Interface(
100
  with demo:
101
  gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
102
 
103
- demo.launch(enable_queue=True, share=True)
 
4
 
5
  import gradio as gr
6
  import pytube as pt
7
+ import spaces
8
  from transformers import pipeline
9
  from huggingface_hub import model_info
10
 
 
13
 
14
  share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
15
  auth_token = os.environ.get("AUTH_TOKEN") or True
16
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
17
+ print(f"Using device: {device}")
18
+
19
@spaces.GPU(duration=120)
def pipe(file, return_timestamps=False):
    """Run Whisper speech recognition on an audio file.

    A fresh transformers pipeline is constructed on every call so that all
    heavy model work happens inside the ``spaces.GPU`` execution context
    (ZeroGPU allocates the device per decorated call).

    Args:
        file: Path to the audio file to transcribe.
        return_timestamps: When True, request timestamped chunks from the
            model instead of plain text.

    Returns:
        The raw pipeline output dict: ``{"text": ...}`` or, with
        timestamps, ``{"chunks": [...]}``.
    """
    # MODEL_NAME, device, auth_token and lang are module-level globals
    # defined earlier in this file.
    recognizer = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=30,
        device=device,
        token=auth_token,
    )
    # Force decoding in the configured language; suppress timestamp tokens
    # unless the caller asked for them.
    prompt_ids = recognizer.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    recognizer.model.config.forced_decoder_ids = prompt_ids
    return recognizer(file, return_timestamps=return_timestamps)
35
+
36
def transcribe(file, return_timestamps=False):
    """Transcribe an audio file, optionally prefixing each chunk with times.

    Args:
        file: Path to the audio file to transcribe.
        return_timestamps: When True, return one line per chunk formatted
            as ``[HH:MM:SS -> HH:MM:SS] text``; otherwise plain text.

    Returns:
        The transcription as a single string.
    """
    # NOTE(review): relies on `import time` at the top of the file — confirm.
    if not return_timestamps:
        return pipe(file)["text"]

    def _hms(seconds):
        # Open-ended chunks carry a None timestamp; render a placeholder.
        if seconds is None:
            return "??:??:??"
        return time.strftime('%H:%M:%S', time.gmtime(seconds))

    chunks = pipe(file, return_timestamps=True)["chunks"]
    lines = [
        f"[{_hms(chunk['timestamp'][0])} -> {_hms(chunk['timestamp'][1])}] {chunk['text']}"
        for chunk in chunks
    ]
    return "\n".join(lines)
49
 
50
 
51
  def _return_yt_html_embed(yt_url):
 
57
  return HTML_str
58
 
59
 
60
def yt_transcribe(yt_url, return_timestamps=False):
    """Download a YouTube video's audio track and transcribe it.

    Args:
        yt_url: URL of the YouTube video.
        return_timestamps: Forwarded to ``transcribe``.

    Returns:
        Tuple of (HTML embed snippet for the video, transcription text).
    """
    video = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    # Grab the first audio-only stream and save it to a fixed local path.
    audio_stream = video.streams.filter(only_audio=True)[0]
    audio_stream.download(filename="audio.mp3")
    text = transcribe("audio.mp3", return_timestamps=return_timestamps)
    return html_embed_str, text
69
 
 
73
  mf_transcribe = gr.Interface(
74
  fn=transcribe,
75
  inputs=[
76
+ gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
77
+ # gr.components.Checkbox(label="Return timestamps"),
78
  ],
79
  outputs="text",
 
80
  theme="huggingface",
81
  title="Whisper Demo: Transcribe Audio",
82
  description=(
 
89
 
90
  yt_transcribe = gr.Interface(
91
  fn=yt_transcribe,
92
+ inputs=[
93
+ gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
94
+ # gr.components.Checkbox(label="Return timestamps"),
95
+ ],
96
  examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
97
  outputs=["html", "text"],
 
98
  theme="huggingface",
99
  title="Whisper Demo: Transcribe YouTube",
100
  description=(
 
108
# Assemble the two-tab UI and start the server.
with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

# BUG FIX: the queue must be enabled BEFORE launching. `launch()` blocks
# the main thread and returns server handles, so chaining `.queue()` after
# it (`demo.launch(...).queue()`) never enables request queuing.
# NOTE(review): the module computes a `share` env flag earlier but True is
# hard-coded here — confirm that is intentional.
demo.queue().launch(share=True)