File size: 6,274 Bytes
3bb42a7
 
 
21147ce
 
89f022a
21147ce
89f022a
 
 
 
 
21147ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89f022a
21147ce
 
 
 
 
 
 
 
89f022a
 
21147ce
 
89f022a
21147ce
 
 
89f022a
3bb42a7
89f022a
3bb42a7
 
89f022a
3bb42a7
89f022a
3bb42a7
 
 
 
 
 
 
 
89f022a
 
3bb42a7
 
 
 
 
89f022a
 
 
 
 
 
3bb42a7
89f022a
 
 
 
 
 
 
 
 
 
 
3bb42a7
 
 
89f022a
 
 
 
 
 
3bb42a7
 
21147ce
 
3bb42a7
89f022a
3bb42a7
 
 
21147ce
 
 
3bb42a7
 
 
21147ce
3bb42a7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/local/bin/python3
#-*- coding:utf-8 -*-
import gradio as gr
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Identifier of the Whisper checkpoint served by this demo. A local model
# directory can be used instead of a Hub id, for example:
#   "/innev/open-ai/huggingface/models/openai/whisper-medium"
#   "/innev/open-ai/huggingface/models/openai/whisper-base"
checkpoint = "openai/whisper-base"

# Processor and model are created once at import time and shared by every
# request handled by predict().
processor = WhisperProcessor.from_pretrained(checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

def process_audio(sampling_rate, waveform):
    """Prepare raw audio samples for the Whisper processor.

    The samples are scaled to floating point, mixed down to a single
    channel, resampled to 16 kHz and cut to at most 30 seconds.

    Args:
        sampling_rate: Sample rate of ``waveform`` in Hz.
        waveform: NumPy array of samples, either ``(frames,)`` or
            ``(frames, channels)``. Assumed to hold int16 values -- the
            scaling below is only correct for that range.

    Returns:
        A 1-D ``torch.Tensor`` with the processed samples.
    """
    target_rate = 16000   # sample rate used for the model input
    max_seconds = 30      # only the first 30 seconds are kept
    int16_full_scale = 32768.0  # 2**15, magnitude of the most negative int16

    # Convert from int16 to floating point in [-1.0, 1.0).
    waveform = waveform / int16_full_scale

    # Convert to mono if stereo; librosa expects (channels, frames).
    if waveform.ndim > 1:
        waveform = librosa.to_mono(waveform.T)

    # Resample to 16 kHz if necessary.
    if sampling_rate != target_rate:
        waveform = librosa.resample(
            waveform, orig_sr=sampling_rate, target_sr=target_rate
        )

    # Limit to 30 seconds.
    waveform = waveform[:target_rate * max_seconds]

    # Make PyTorch tensor.
    return torch.tensor(waveform)


def predict(language, audio, mic_audio=None):
    """Transcribe one audio clip with the module-level Whisper model.

    Args:
        language: Language name forwarded to
            ``processor.get_decoder_prompt_ids``.
        audio: Uploaded audio as ``(sample_rate, frames)`` or
            ``(sample_rate, (frames, channels))``, or ``None``.
        mic_audio: Recorded audio in the same format, or ``None``. When
            given, it is used instead of ``audio``.

    Returns:
        The decoded transcription of the clip, or the string
        ``"(please provide audio)"`` when neither input is present.
    """
    # The microphone recording wins over the uploaded file.
    clip = mic_audio if mic_audio is not None else audio
    if clip is None:
        return "(please provide audio)"
    sampling_rate, waveform = clip

    # NOTE(review): `language` is passed through unchanged, including None --
    # confirm how get_decoder_prompt_ids treats a missing language.
    prompt_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")

    samples = process_audio(sampling_rate, waveform)
    features = processor(audio=samples, sampling_rate=16000, return_tensors="pt")
    token_ids = model.generate(**features, max_length=400, forced_decoder_ids=prompt_ids)

    # Only one clip is processed per call, so the first decoded entry is the result.
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]

# Languages offered in the language selector; the full list of languages
# accepted by the model is printed in the article text of the page.
supportLangs = [
    'english',
    'chinese',
    'german',
    'spanish',
    'russian',
    'korean',
    'french',
    'japanese',
    'portuguese',
]

# Page heading.
title = "OpenAI Whisper Base"

# Introductory text rendered under the heading (HTML is allowed).
description = """
本例用于演示 <b>openai/whisper-base</b> 模型的语音识别(ASR)能力。基于原始模型开发,没有对模型做微调。 本例默认输出为中文,Whisper识别出的是繁体中文。

Whisper包含多个不同大小的版本,理论来讲模型越大识别效果越好,模型越小速度越快

<b>使用方法:</b> 上传一个音频文件或直接在页面中录制音频。音频会在传递到模型之前转换为单声道并重新采样为16 kHz。
"""

# Footer text of the page: credits for the bundled example clips, reference
# links, the list of languages accepted by the model and a table of the
# available Whisper checkpoints. Mixed HTML and Markdown.
article = """
<div style='margin:20px auto;'>
<p>音频案例:</p>
<ul>
<li>"春日阳光普照大地,正是踏春好时节" 来源: 知琪(Zhiqi)
<li>"这是一年中最美味的团聚,也注定是一顿百感交集的晚餐。" 来源: 知厨(zhichu)
<li>"Hmm, I don't know" 来源: <a href="https://freesound.org/people/InspectorJ/sounds/519189/">InspectorJ</a> (CC BY 4.0 license)
<li>"Henry V" excerpt 来源: <a href="https://freesound.org/people/acclivity/sounds/24096/">acclivity</a> (CC BY-NC 4.0 license)
<li>"You can see it in the eyes" 来源: <a href="https://freesound.org/people/JoyOhJoy/sounds/165348/">JoyOhJoy</a> (CC0 license)
<li>"We yearn for time" 来源: <a href="https://freesound.org/people/Sample_Me/sounds/610529/">Sample_Me</a> (CC0 license)
</ul>

<p>
    参考:  
    <a href="https://huggingface.co/openai/whisper-base">OpenAI Whisper Base</a> |
    <a href="https://github.com/innev">Innev GitHub</a> 
</p>
</div>

<b>多语言支持: </b> english, chinese, german, spanish, russian, korean, french, japanese, portuguese, turkish, polish, catalan, dutch, arabic, swedish, italian, indonesian, hindi, finnish, vietnamese, hebrew, ukrainian, greek, malay, czech, romanian, danish, hungarian, tamil, norwegian, thai, urdu, croatian, bulgarian, lithuanian, latin, maori, malayalam, welsh, slovak, telugu, persian, latvian, bengali, serbian, azerbaijani, slovenian, kannada, estonian, macedonian, breton, basque, icelandic, armenian, nepali, mongolian, bosnian, kazakh, albanian, swahili, galician, marathi, punjabi, sinhala, khmer, shona, yoruba, somali, afrikaans, occitan, georgian, belarusian, tajik, sindhi, gujarati, amharic, yiddish, lao, uzbek, faroese, haitian creole, pashto, turkmen, nynorsk, maltese, sanskrit, luxembourgish, myanmar, tibetan, tagalog, malagasy, assamese, tatar, hawaiian, lingala, hausa, bashkir, javanese, sundanese, burmese, valencian, flemish, haitian, letzeburgesch, pushto, panjabi, moldavian, moldovan, sinhalese, castilian

| 模型版本  | 大小        | 仅英语                                                | 多语言                                               |
|----------|------------|------------------------------------------------------|-----------------------------------------------------|
| tiny     | 39 M       | [✓](https://huggingface.co/openai/whisper-tiny.en)   | [✓](https://huggingface.co/openai/whisper-tiny)     |
| base     | 74 M       | [✓](https://huggingface.co/openai/whisper-base.en)   | [✓](https://huggingface.co/openai/whisper-base)     |
| small    | 244 M      | [✓](https://huggingface.co/openai/whisper-small.en)  | [✓](https://huggingface.co/openai/whisper-small)    |
| medium   | 769 M      | [✓](https://huggingface.co/openai/whisper-medium.en) | [✓](https://huggingface.co/openai/whisper-medium)   |
| large    | 1550 M     | x                                                    | [✓](https://huggingface.co/openai/whisper-large)    |
| large-v2 | 1550 M     | x                                                    | [✓](https://huggingface.co/openai/whisper-large-v2) |
"""

# Example rows, one value per input component in order:
# [language, uploaded audio path, microphone audio].
# Each row names the language of its clip so that selecting an example does
# not clear the language selector and hand `None` to predict().
examples = [
    ["chinese", "examples/zhiqi.wav", None],
    ["chinese", "examples/zhichu.wav", None],
    ["english", "examples/hmm_i_dont_know.wav", None],
    ["english", "examples/henry5.mp3", None],
    ["english", "examples/yearn_for_time.mp3", None],
    ["english", "examples/see_in_eyes.wav", None],
]

# Assemble the page: a language selector plus two audio inputs (file upload
# and microphone) feeding predict(), with a single text box for the result.
demo = gr.Interface(
    fn=predict,
    title=title,
    description=description,
    article=article,
    examples=examples,
    inputs=[
        gr.Radio(label="目标语言", choices=supportLangs, value="chinese"),
        gr.Audio(label="上传语音", source="upload", type="numpy"),
        gr.Audio(label="录制语音", source="microphone", type="numpy"),
    ],
    outputs=[
        gr.Text(label="识别出的文字"),
    ],
)

# Start serving the interface.
demo.launch()