File size: 2,686 Bytes
9a3ba32
 
 
 
bb28acc
 
5621130
9a3ba32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5621130
 
9a3ba32
 
 
 
 
 
 
99a5348
9a3ba32
 
5621130
 
 
 
 
9a3ba32
 
 
 
5621130
9a3ba32
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from transformers import pipeline
import gradio as gr
import whisper

# Pre-load one wav2vec2 ASR pipeline per supported language at import time,
# so each request only runs inference (no per-call model loading).
wav2vec_en_model = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
wav2vec_fr_model = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-xlsr-53-french")
wav2vec_es_model = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-xlsr-53-spanish")
# Single multilingual Whisper model; the language is passed per-request in
# transcribe_audio instead of loading separate checkpoints.
whisper_model = whisper.load_model("base")

def transcribe_audio(language=None, mic=None, file=None):
    """Transcribe an audio clip with both wav2vec2 and Whisper.

    Args:
        language: Language code ('en', 'fr', 'es') selecting the wav2vec2
            pipeline; also forwarded to Whisper to skip language detection.
        mic: Filepath of a microphone recording (takes precedence over file).
        file: Filepath of an uploaded audio file.

    Returns:
        A 2-tuple (wav2vec_transcription, whisper_transcription). When no
        audio is supplied, the same error message is returned for both slots
        so each of the interface's two output textboxes shows it (the
        original returned a single string, leaving the second output
        inconsistent).
    """
    audio = mic if mic is not None else file
    if audio is None:
        msg = "You must either provide a mic recording or a file"
        return msg, msg
    wav2vec_model = load_models(language)
    transcription = wav2vec_model(audio)["text"]
    transcription2 = whisper_model.transcribe(audio, language=language)["text"]
    return transcription, transcription2

def load_models(lang):
    """Return the pre-loaded wav2vec2 pipeline for *lang*.

    Supported codes are 'en', 'fr', and 'es'; any other value falls back
    to the English model.
    """
    pipelines = {
        'en': wav2vec_en_model,
        'fr': wav2vec_fr_model,
        'es': wav2vec_es_model,
    }
    # Unknown / missing language codes default to English.
    return pipelines.get(lang, wav2vec_en_model)

# Static text and example rows for the Gradio interface below.
title = "Speech2text comparison (Wav2vec vs Whisper)"
description = """
This Space allows easy comparisons for transcribed texts between Facebook's Wav2vec model and newly released OpenAI's Whisper model.\n
(Even if Whisper includes a language detection and even an automatic translation, here we have decided to select the language to speed up the transcription and to focus only on the quality of the transcriptions. The default language is english)
"""
article = "Check out [the OpenAI Whisper model](https://github.com/openai/whisper) and [the Facebook Wav2vec model](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) that this demo is based off of."
# Each row matches transcribe_audio's inputs: [language, mic, file].
# mic is always None in the examples; the audio files are expected to be
# bundled alongside this script in the Space.
examples = [["en", None, "english_sentence.flac"], 
            ["en", None, "6_Steps_To_Hit_ANY_Goal.mp3000.mp3"],
            ["fr", None, "2022-a-Droite-un-fauteuil-pour-trois-3034044.mp3000.mp3"],
            ["fr", None, "podcast-bdl-episode-5-mix-v2.mp3000.mp3"],
            ["es", None, "momiasartesecretodelantiguoegipto-nationalgeographicespana-ivoox73191074.mp3000.mp3"]]

# Build and launch the demo UI. Inputs map positionally onto
# transcribe_audio(language, mic, file); the two output textboxes receive
# the wav2vec and Whisper transcriptions respectively.
# NOTE(review): gr.Audio's `source=` / `optional=` keyword arguments were
# removed in newer Gradio releases — confirm the pinned gradio version
# before upgrading.
gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Radio(label="Language", choices=["en", "fr", "es"], value="en"),
        gr.Audio(source="microphone", type="filepath", optional=True),
        gr.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs=[
        gr.Textbox(label="facebook/wav2vec"), 
        gr.Textbox(label="openai/whisper"),],
    title=title,
    description=description,
    article=article,
    examples=examples
).launch(debug=True)  # debug=True blocks and streams errors to the console