File size: 7,337 Bytes
fa5764d
 
 
ccb8fb5
fa5764d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c9a84d8
fa5764d
 
 
79a3510
fa5764d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ccb8fb5
 
 
 
 
 
 
 
 
fa5764d
 
 
 
 
ccb8fb5
fa5764d
1cdf0cf
fa5764d
 
 
 
ccb8fb5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa5764d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import json

import gradio as gr
from gradio import processing_utils
from huggingface_hub import snapshot_download
from omegaconf import OmegaConf
from vosk import KaldiRecognizer, Model


def load_vosk(model_id: str):
    model_dir = snapshot_download(model_id)
    return Model(model_path=model_dir)


OmegaConf.register_new_resolver("load_vosk", load_vosk)

models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))


def automatic_speech_recognition(model_id: str, dialect_id: str, audio_data: str):
    if isinstance(models_config[model_id]["model"], dict):
        model = models_config[model_id]["model"][dialect_id]
    else:
        model = models_config[model_id]["model"]

    sample_rate, audio_array = audio_data
    if audio_array.ndim == 2:
        audio_array = audio_array[:, 0]

    audio_bytes = audio_array.tobytes()

    rec = KaldiRecognizer(model, sample_rate)

    rec.SetWords(True)

    results = []

    for start in range(0, len(audio_bytes), 4000):
        end = min(start + 4000, len(audio_bytes))
        data = audio_bytes[start:end]
        if rec.AcceptWaveform(data):
            raw_result = json.loads(rec.Result())
            results.append(raw_result)

    final_result = json.loads(rec.FinalResult())
    results.append(final_result)

    filtered_lines = []

    for result in results:
        result["text"] = result["text"]
        if len(result["text"]) > 0:
            filtered_lines.append(result["text"])

    return (", ".join(filtered_lines) + ".").capitalize()


def when_model_selected(model_id: str):
    model_config = models_config[model_id]

    if "dialect_mapping" not in model_config:
        return gr.update(visible=False)

    dialect_drop_down_choices = [
        (k, v) for k, v in model_config["dialect_mapping"].items()
    ]

    return gr.update(
        choices=dialect_drop_down_choices,
        value=dialect_drop_down_choices[0][1],
        visible=True,
    )


demo = gr.Blocks(
    title="臺灣南島語語音辨識系統",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(
        font=(
            "tauhu-oo",
            gr.themes.GoogleFont("Source Sans Pro"),
            "ui-sans-serif",
            "system-ui",
            "sans-serif",
        )
    ),
)

with demo:
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )

    dialect_drop_down = gr.Radio(
        choices=[
            (k, v)
            for k, v in models_config[default_model_id]["dialect_mapping"].items()
        ],
        value=list(models_config[default_model_id]["dialect_mapping"].values())[0],
        label="族別",
    )

    model_drop_down.input(
        when_model_selected,
        inputs=[model_drop_down],
        outputs=[dialect_drop_down],
    )

    audio_source = gr.Audio(
        label="上傳或錄音",
        type="numpy",
        format="wav",
        waveform_options=gr.WaveformOptions(
            sample_rate=16000,
        ),
    )

    with open("DEMO.md") as tong:
        gr.Markdown(tong.read())

    gr.Interface(
        automatic_speech_recognition,
        inputs=[model_drop_down, dialect_drop_down, audio_source],
        outputs=[
            gr.Text(interactive=True, label="辨識結果"),
        ],
        allow_flagging="auto",
    )

    gr.Examples(
        [
            [
                "formosan_ami",
                "南勢",
                processing_utils.audio_from_file(
                    "examples/cb52eb9457a0b74abcf02da6253b29e37f44ee6f.wav"
                ),
                "U payniyaru’ nu pangcah i matiya, u ina haw ku miterungay, mikadavu ku vavainay i vavahiyan a luma’.",
                "阿美族的原始社會,是以女人為主的母系社會,男子授室入贅女家。",
            ],
            [
                "formosan_ami",
                "秀姑巒",
                processing_utils.audio_from_file(
                    "examples/9954bc6c934e098dd9900e1f6efc56223903b9ec.wav"
                ),
                "saka mafana’ ko ina ato mama^ no wawa, patayra han i faki anoca^ i akong no wawa^, somad han to no faki^ ko ngangan haw i.",
                "父母一眼就看出有問題,就送到長輩的住處請他查看,當長輩將名字更換了之後。",
            ],
            [
                "formosan_ami",
                "海岸",
                processing_utils.audio_from_file(
                    "examples/c9080c15a60953ee6f2b099a7e3036846583dce6.wav"
                ),
                "Orasaka ora “pataloma’” hananay a sowal, pakalafi han no Pangcah, todongay pakalafi to malinaay, nika oni pataloma’ hananay, manga’ay misaparod han ko sowal.",
                "因此「結婚」一詞,阿美族稱pakalafi,有「請吃晚餐」的意思,但較正式的用法是pataloma’,直譯為「成家」。",
            ],
            [
                "formosan_ami",
                "馬蘭",
                processing_utils.audio_from_file(
                    "examples/eb3364be43c8c133c9bc8cd71f1925aa20a66cc0.wav"
                ),
                "O sata’angayay a pisanga’an to tilong ko Tafalong itiya ho, mapaliwal i kasaniyaroaro’ ko misatilongan to sakacaloway no finawlan i ’orip a lalosidan.",
                "而太巴塱部落則是當時最大的製造陶埸域,供應各部落族人日常生活的陶器用品。",
            ],
            [
                "formosan_sdq",
                "德固達亞",
                processing_utils.audio_from_file(
                    "examples/b02ee31b7dee33bc9195c5b201b2943610b6308f.wav"
                ),
                "Pure macu, ani naq baso ciida we ini snagi beras na, asi hrigi ribo ma psaan rqeda baro, ciida ka seengun posa qsiya.",
                "烹煮小米(粟)、黍時,通常不須清洗就直接入鍋,移置爐灶上再加水。",
            ],
            [
                "formosan_trv",
                "",
                processing_utils.audio_from_file(
                    "examples/d76cb5e64a2ba1bade35edd0d8b12262c27707a7.wav"
                ),
                "Pthangan hangan Truku brah na siida o mniq ska hangan Embgala. Hangan ta siida o Embgala hraan hidaw sun.",
                "太魯閣族正名之前太魯閣族被編入泰雅爾族,當時的名字被稱為東部泰雅爾族。",
            ],
            [
                "formosan_pwn",
                "東",
                processing_utils.audio_from_file(
                    "examples/ef5780bceb44a41368a831513925cc59ebcfe14f.wav"
                ),
                "anema sikavaljualjut na sepaiwan kasicuayan, mavan a semualap ta cemel ta kasiv sa ljamayi sa sanqumayi, kata qemaljup tjepana.",
                "過去排灣族群靠甚麼維生呢?排灣族群的經濟生產是以「山田燒墾」的農耕為主,狩獵和捕魚為副業。",
            ],
        ],
        label="範例",
        inputs=[
            dialect_drop_down,
            gr.Text(label="方言", visible=False),
            audio_source,
            gr.Text(label="族語", visible=False),
            gr.Text(label="中文", visible=False),
        ],
    )


demo.launch()