Commit
·
58442c5
1
Parent(s):
8cb7f84
Update app.py
Browse files
app.py
CHANGED
@@ -27,7 +27,7 @@ import contextlib
|
|
27 |
from transformers import pipeline
|
28 |
import psutil
|
29 |
|
30 |
-
whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2"]
|
31 |
source_languages = {
|
32 |
"en": "English",
|
33 |
"zh": "Chinese",
|
@@ -132,9 +132,6 @@ source_languages = {
|
|
132 |
|
133 |
source_language_list = [key[0] for key in source_languages.items()]
|
134 |
|
135 |
-
MODEL_NAME = "vumichien/whisper-medium-jp"
|
136 |
-
lang = "ja"
|
137 |
-
|
138 |
device = 0 if torch.cuda.is_available() else "cpu"
|
139 |
pipe = pipeline(
|
140 |
task="automatic-speech-recognition",
|
@@ -149,23 +146,6 @@ embedding_model = PretrainedSpeakerEmbedding(
|
|
149 |
"speechbrain/spkrec-ecapa-voxceleb",
|
150 |
device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
|
151 |
|
152 |
-
def transcribe(microphone, file_upload):
|
153 |
-
warn_output = ""
|
154 |
-
if (microphone is not None) and (file_upload is not None):
|
155 |
-
warn_output = (
|
156 |
-
"WARNING: You've uploaded an audio file and used the microphone. "
|
157 |
-
"The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
|
158 |
-
)
|
159 |
-
|
160 |
-
elif (microphone is None) and (file_upload is None):
|
161 |
-
return "ERROR: You have to either use the microphone or upload an audio file"
|
162 |
-
|
163 |
-
file = microphone if microphone is not None else file_upload
|
164 |
-
|
165 |
-
text = pipe(file)["text"]
|
166 |
-
|
167 |
-
return warn_output + text
|
168 |
-
|
169 |
def _return_yt_html_embed(yt_url):
|
170 |
video_id = yt_url.split("?v=")[-1]
|
171 |
HTML_str = (
|
@@ -431,43 +411,4 @@ with demo:
|
|
431 |
system_info.render()
|
432 |
gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
|
433 |
|
434 |
-
|
435 |
-
|
436 |
-
with gr.Tab("Whisper Transcribe Japanese Audio"):
|
437 |
-
gr.Markdown(f'''
|
438 |
-
<div>
|
439 |
-
<h1 style='text-align: center'>Whisper Transcribe Japanese Audio</h1>
|
440 |
-
</div>
|
441 |
-
Transcribe long-form microphone or audio inputs with the click of a button! The fine-tuned
|
442 |
-
checkpoint <a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
|
443 |
-
''')
|
444 |
-
microphone = gr.inputs.Audio(source="microphone", type="filepath", optional=True)
|
445 |
-
upload = gr.inputs.Audio(source="upload", type="filepath", optional=True)
|
446 |
-
transcribe_btn = gr.Button("Transcribe Audio")
|
447 |
-
text_output = gr.Textbox()
|
448 |
-
with gr.Row():
|
449 |
-
gr.Markdown('''
|
450 |
-
### You can test by following examples:
|
451 |
-
''')
|
452 |
-
examples = gr.Examples(examples=
|
453 |
-
[ "sample1.wav",
|
454 |
-
"sample2.wav",
|
455 |
-
],
|
456 |
-
label="Examples", inputs=[upload])
|
457 |
-
transcribe_btn.click(transcribe, [microphone, upload], outputs=text_output)
|
458 |
-
|
459 |
-
with gr.Tab("Whisper Transcribe Japanese YouTube"):
|
460 |
-
gr.Markdown(f'''
|
461 |
-
<div>
|
462 |
-
<h1 style='text-align: center'>Whisper Transcribe Japanese YouTube</h1>
|
463 |
-
</div>
|
464 |
-
Transcribe long-form YouTube videos with the click of a button! The fine-tuned checkpoint:
|
465 |
-
<a href='https://huggingface.co/{MODEL_NAME}' target='_blank'><b>{MODEL_NAME}</b></a> to transcribe audio files of arbitrary length.
|
466 |
-
''')
|
467 |
-
youtube_link = gr.Textbox(label="Youtube url", lines=1, interactive=True)
|
468 |
-
yt_transcribe_btn = gr.Button("Transcribe YouTube")
|
469 |
-
text_output2 = gr.Textbox()
|
470 |
-
html_output = gr.Markdown()
|
471 |
-
yt_transcribe_btn.click(yt_transcribe, [youtube_link], outputs=[html_output, text_output2])
|
472 |
-
|
473 |
demo.launch(debug=True)
|
|
|
27 |
from transformers import pipeline
|
28 |
import psutil
|
29 |
|
30 |
+
whisper_models = ["tiny", "base", "small", "medium", "large-v1", "large-v2", "large-v3"]
|
31 |
source_languages = {
|
32 |
"en": "English",
|
33 |
"zh": "Chinese",
|
|
|
132 |
|
133 |
source_language_list = [key[0] for key in source_languages.items()]
|
134 |
|
|
|
|
|
|
|
135 |
device = 0 if torch.cuda.is_available() else "cpu"
|
136 |
pipe = pipeline(
|
137 |
task="automatic-speech-recognition",
|
|
|
146 |
"speechbrain/spkrec-ecapa-voxceleb",
|
147 |
device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
def _return_yt_html_embed(yt_url):
|
150 |
video_id = yt_url.split("?v=")[-1]
|
151 |
HTML_str = (
|
|
|
411 |
system_info.render()
|
412 |
gr.Markdown('''<center><img src='https://visitor-badge.glitch.me/badge?page_id=WhisperDiarizationSpeakers' alt='visitor badge'><a href="https://opensource.org/licenses/Apache-2.0"><img src='https://img.shields.io/badge/License-Apache_2.0-blue.svg' alt='License: Apache 2.0'></center>''')
|
413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
414 |
demo.launch(debug=True)
|