Whisper_speaker_diarization_for_colab_test

Runtime error

App Files Files Community

TrialAccountHF committed on Dec 17, 2023

Commit

58442c5

1 Parent(s): 8cb7f84

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -60

app.py CHANGED Viewed

@@ -27,7 +27,7 @@ import contextlib
 from transformers import pipeline
 import psutil
-whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
 source_languages = {
     "en": "English",
     "zh": "Chinese",
@@ -132,9 +132,6 @@ source_languages = {
 source_language_list = [key[0] for key in source_languages.items()]
-MODEL_NAME = "vumichien/whisper-medium-jp"
-lang = "ja"
 device = 0 if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     task="automatic-speech-recognition",
@@ -149,23 +146,6 @@ embedding_model = PretrainedSpeakerEmbedding(
     "speechbrain/spkrec-ecapa-voxceleb",
     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-def transcribe(microphone, file_upload):
-    warn_output = ""
-    if (microphone is not None) and (file_upload is not None):
-        warn_output = (
-            "WARNING: You've uploaded an audio file and used the microphone. "
-            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
-        )
-    elif (microphone is None) and (file_upload is None):
-        return "ERROR: You have to either use the microphone or upload an audio file"
-    file = microphone if microphone is not None else file_upload
-    text = pipe(file)["text"]
-    return warn_output + text
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
@@ -431,43 +411,4 @@ with demo:
                 system_info.render()
                 gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
-    with gr.Tab("Whisper Transcribe Japanese Audio"):
-        gr.Markdown(f'''
-              <div>
-              <h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
-              </div>
-              Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
-              checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
-          ''')
-        microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
-        upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
-        transcribe_btn = gr.Button("Transcribe Audio")
-        text_output = gr.Textbox()
-        with gr.Row():
-            gr.Markdown('''
-                ### You can test by following examples:
-                ''')
-        examples = gr.Examples(examples=
-              [ "sample1.wav",
-                "sample2.wav",
-                ],
-              label="Examples", inputs=[upload])
-        transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
-    with gr.Tab("Whisper Transcribe Japanese YouTube"):
-        gr.Markdown(f'''
-              <div>
-              <h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
-              </div>
-                Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
-                <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
-            ''')
-        youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
-        yt_transcribe_btn = gr.Button("Transcribe YouTube")
-        text_output2 = gr.Textbox()
-        html_output = gr.Markdown()
-        yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
 demo.launch(debug=True)

 from transformers import pipeline
 import psutil
+whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2", "large-v3"]
 source_languages = {
     "en": "English",
     "zh": "Chinese",
 source_language_list = [key[0] for key in source_languages.items()]
 device = 0 if torch.cuda.is_available() else "cpu"
 pipe = pipeline(
     task="automatic-speech-recognition",
     "speechbrain/spkrec-ecapa-voxceleb",
     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
                 system_info.render()
                 gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
 demo.launch(debug=True)