Spaces:
Runtime error
Runtime error
File size: 2,763 Bytes
9a3ba32 84024ab 1320bd0 84024ab 9a3ba32 84024ab 9a3ba32 99a5348 9a3ba32 5621130 f9b0a05 1320bd0 9a3ba32 1320bd0 9a3ba32 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from transformers import pipeline
import gradio as gr
import whisper
# One Wav2vec speech-recognition pipeline per supported language code.
# All pipelines are created eagerly at import time, in the order listed.
wav2vec_models = {
    lang_code: pipeline("automatic-speech-recognition", model=checkpoint)
    for lang_code, checkpoint in (
        ("en", "facebook/wav2vec2-base-960h"),
        ("fr", "facebook/wav2vec2-large-xlsr-53-french"),
        ("es", "facebook/wav2vec2-large-xlsr-53-spanish"),
        ("it", "facebook/wav2vec2-large-xlsr-53-italian"),
    )
}

# A single Whisper model ("base") is shared by every language.
whisper_model = whisper.load_model("base")
def transcribe_audio(language=None, mic=None, file=None):
    """Transcribe one audio clip with both the Wav2vec and the Whisper model.

    Args:
        language: Language code used to pick the Wav2vec pipeline from
            ``wav2vec_models`` and forwarded to Whisper. ``None`` falls back
            to ``"en"``. A code missing from ``wav2vec_models`` raises
            ``KeyError``.
        mic: Microphone recording to transcribe, or ``None``. Takes
            precedence over ``file`` when both are given.
        file: Uploaded audio to transcribe, or ``None``.

    Returns:
        A ``(wav2vec_text, whisper_text)`` tuple. When neither ``mic`` nor
        ``file`` is provided, both entries contain the same explanatory
        message so the result always has one value per output.
    """
    # The microphone recording wins over the uploaded file.
    audio = mic if mic is not None else file
    if audio is None:
        # Return the message twice: callers expect two values, and a bare
        # string here would not match the shape of the success path.
        message = "You must either provide a mic recording or a file"
        return message, message

    # Without an explicit language, use English instead of failing on the
    # dictionary lookup below.
    if language is None:
        language = "en"

    wav2vec_text = wav2vec_models[language](audio)["text"]
    whisper_text = whisper_model.transcribe(audio, language=language)["text"]
    return wav2vec_text, whisper_text
# Static text handed to gr.Interface below (title, description, article).

# Heading of the demo.
title = "Speech2text comparison (Wav2vec vs Whisper)"
# Introductory text for the demo. The string is not raw, so the "\n" escape
# at the end of the first sentence becomes a newline character in addition to
# the literal line break that follows it.
description = """
This Space allows easy comparisons for transcribed texts between Facebook's Wav2vec model and newly released OpenAI's Whisper model.\n
(Even if Whisper includes a language detection and even an automatic translation, here we have decided to select the language to speed up the transcription and to focus only on the quality of the transcriptions. The default language is english)
"""
# Markdown-formatted text with links to the two upstream model pages.
article = "Check out [the OpenAI Whisper model](https://github.com/openai/whisper) and [the Facebook Wav2vec model](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) that this demo is based off of."
# Example rows for the interface: [language code, microphone input, file].
# The microphone slot is always None because every example is an audio file.
examples = [
    [lang_code, None, audio_file]
    for lang_code, audio_file in (
        ("en", "english_sentence.flac"),
        ("en", "6_Steps_To_Hit_ANY_Goal.mp3000.mp3"),
        ("fr", "2022-a-Droite-un-fauteuil-pour-trois-3034044.mp3000.mp3"),
        ("fr", "podcast-bdl-episode-5-mix-v2.mp3000.mp3"),
        ("es", "momiasartesecretodelantiguoegipto-nationalgeographicespana-ivoox73191074.mp3000.mp3"),
        ("es", "millonarioscohetesrepresentaestanuev-xataka-ivoox73148634.mp3000.mp3"),
        ("it", "Ansa_voice_barbero_no_sigla.mp3000.mp3"),
        ("it", "A304176327.mp3000.mp3"),
    )
]
# Build the comparison UI. The three inputs are listed in the same order as
# the parameters of transcribe_audio (language, mic, file), and the two
# textboxes receive the Wav2vec and Whisper transcriptions respectively.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Radio(label="Language", choices=["en", "fr", "es", "it"], value="en"),
        # NOTE(review): `source=` and `optional=` are kept exactly as written;
        # confirm they are still accepted by the installed Gradio version.
        gr.Audio(source="microphone", type="filepath", optional=True),
        gr.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs=[
        gr.Textbox(label="facebook/wav2vec"),
        gr.Textbox(label="openai/whisper"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
)

# Start the app as soon as the module is executed.
demo.launch(debug=True)