txya900619 committed on
Commit
fa5764d
1 Parent(s): 4188725

feat: upload app.py and needed file

Browse files
Files changed (4) hide show
  1. DEMO.md +17 -0
  2. app.py +134 -0
  3. configs/models.yaml +12 -0
  4. requirements.txt +2 -0
DEMO.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 原語會族語語音辨識系統
2
+
3
+ ILRDF Formosan Automatic-Speech-Recognition System
4
+
5
+ ## 研發團隊
6
+
7
+ - [李鴻欣 Hung-Shin Lee](mailto:hungshinlee@gmail.com)
8
+ - [陳力瑋 Li-Wei Chen](mailto:wayne900619@gmail.com)
9
+ - [意傳科技](https://ithuan.tw/)
10
+ - [原住民族語言研究發展基金會](https://www.ilrdf.org.tw/)
11
+
12
+ ## 特別致謝
13
+ - [聯和科創](https://www.104.com.tw/company/1a2x6bmu75)
14
+ - [台灣阿美族語言永續發展學會/原民會阿美族語言推動組織](https://www.facebook.com/groups/ypspt/about)
15
+ - [台灣太魯閣族語言發展學會](https://qkktt.com/)
16
+ - [台灣原住民族賽德克族語言文化學會](https://www.facebook.com/3S3TBL/)
17
+ - 族語老師們
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import gradio as gr
4
+ from huggingface_hub import snapshot_download
5
+ from omegaconf import OmegaConf
6
+ from vosk import KaldiRecognizer, Model
7
+
8
+
9
def load_vosk(model_id: str):
    """Fetch a model repository from the Hugging Face Hub and load it with Vosk.

    Args:
        model_id: Hub repository id handed to ``snapshot_download``.

    Returns:
        A ``vosk.Model`` built from the downloaded snapshot directory.
    """
    # snapshot_download returns the local directory holding the repo files.
    return Model(model_path=snapshot_download(model_id))
12
+
13
+
14
# Expose load_vosk to the YAML config so `${load_vosk:<repo id>}` entries are
# replaced by loaded models when the config is resolved.
OmegaConf.register_new_resolver("load_vosk", load_vosk)

# Load the model table, then convert it to plain Python containers; the
# conversion is what resolves the interpolations above.
_models_yaml = OmegaConf.load("configs/models.yaml")
models_config = OmegaConf.to_object(_models_yaml)
17
+
18
+
19
def _to_pcm16(samples):
    """Return *samples* as 16-bit signed integers.

    The recognizer is fed raw bytes, so the in-memory sample width must be
    16-bit. Float input is assumed to be in [-1.0, 1.0] (TODO confirm against
    the audio source) and is clipped before scaling; signed integers of
    another width are rescaled by bit shifting. Any other dtype, including
    int16 itself, is returned unchanged.
    """
    dtype = samples.dtype
    if dtype.kind == "f":
        return (samples.clip(-1.0, 1.0) * 32767).astype("int16")
    if dtype.kind == "i" and dtype.itemsize != 2:
        bits = 8 * dtype.itemsize
        if bits > 16:
            return (samples >> (bits - 16)).astype("int16")
        return samples.astype("int16") << (16 - bits)
    return samples


def automatic_speech_recognition(model_id: str, dialect_id: str, audio_data):
    """Transcribe one audio clip with the selected Vosk model.

    Args:
        model_id: Key into ``models_config``.
        dialect_id: Key into the model entry's ``model`` mapping; ignored when
            that entry is a single model rather than a dict.
        audio_data: ``(sample_rate, samples)`` pair where ``samples`` is a
            NumPy array, or ``None`` when no audio was provided.

    Returns:
        The non-empty recognised segments joined with "," and terminated
        with "。".

    Raises:
        gr.Error: If ``audio_data`` is ``None``.
    """
    if audio_data is None:
        # Without this guard the tuple unpacking below fails with an opaque
        # TypeError; surface a readable message in the UI instead.
        raise gr.Error("請先上傳或錄製音檔。")

    model_entry = models_config[model_id]["model"]
    model = model_entry[dialect_id] if isinstance(model_entry, dict) else model_entry

    sample_rate, audio_array = audio_data
    if audio_array.ndim == 2:
        # Multi-channel input: keep only the first channel.
        audio_array = audio_array[:, 0]

    audio_bytes = _to_pcm16(audio_array).tobytes()

    rec = KaldiRecognizer(model, sample_rate)
    rec.SetWords(True)

    results = []
    # Feed the recognizer in 4000-byte chunks; slicing past the end is safe.
    for start in range(0, len(audio_bytes), 4000):
        if rec.AcceptWaveform(audio_bytes[start : start + 4000]):
            results.append(json.loads(rec.Result()))
    results.append(json.loads(rec.FinalResult()))

    # NOTE(review): spaces are stripped and a CJK full stop is appended, which
    # looks inherited from a Han-character recogniser -- confirm this is the
    # desired formatting for the Formosan models.
    texts = (result["text"].replace(" ", "") for result in results)
    return ",".join(text for text in texts if text) + "。"
55
+
56
+
57
def when_model_selected(model_id: str):
    """Build the update for the dialect selector after a model is chosen.

    Args:
        model_id: Key into ``models_config``.

    Returns:
        A ``gr.update`` that hides the selector when the model entry has no
        ``dialect_mapping``; otherwise one that shows it with the mapping's
        (key, value) pairs as choices and the first pair's value selected.
    """
    entry = models_config[model_id]

    if "dialect_mapping" in entry:
        pairs = list(entry["dialect_mapping"].items())
        first_value = pairs[0][1]
        return gr.update(choices=pairs, value=first_value, visible=True)

    return gr.update(visible=False)
72
+
73
+
74
# Font fallback chain: the web font pulled in by the CSS import below comes
# first, followed by a Google font and generic families.
_font_stack = (
    "tauhu-oo",
    gr.themes.GoogleFont("Source Sans Pro"),
    "ui-sans-serif",
    "system-ui",
    "sans-serif",
)

demo = gr.Blocks(
    title="臺灣南島語語音辨識系統",
    css="@import url(https://tauhu.tw/tauhu-oo.css);",
    theme=gr.themes.Default(font=_font_stack),
)
87
+
88
with demo:
    # The first model listed in the config is preselected.
    default_model_id = next(iter(models_config))
    model_drop_down = gr.Dropdown(
        list(models_config),
        value=default_model_id,
        label="模型",
    )

    # Mirror when_model_selected: a model without a dialect mapping gets a
    # hidden, empty selector instead of raising KeyError at start-up.
    default_dialects = list(
        models_config[default_model_id].get("dialect_mapping", {}).items()
    )
    dialect_drop_down = gr.Radio(
        choices=default_dialects,
        value=default_dialects[0][1] if default_dialects else None,
        label="族別",
        visible=bool(default_dialects),
    )

    # Refresh the dialect choices whenever the user picks another model.
    model_drop_down.input(
        when_model_selected,
        inputs=[model_drop_down],
        outputs=[dialect_drop_down],
    )

    # DEMO.md contains non-ASCII text; read it as UTF-8 explicitly so the
    # result does not depend on the platform's default encoding.
    with open("DEMO.md", encoding="utf-8") as tong:
        gr.Markdown(tong.read())

    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            dialect_drop_down,
            gr.Audio(
                label="上傳或錄音",
                type="numpy",
                format="wav",
                waveform_options=gr.WaveformOptions(
                    sample_rate=16000,
                ),
            ),
        ],
        outputs=[
            # Previously labelled "客語漢字" (Hakka Hanzi), which does not
            # describe the output of a Formosan-language recogniser.
            gr.Text(interactive=False, label="辨識結果"),
        ],
        allow_flagging="auto",
    )

demo.launch()
configs/models.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Each top-level key is a model id offered in the app's model dropdown.
formosan-kaldi:
  # Dialect id -> model; `${load_vosk:<repo id>}` is resolved by the
  # `load_vosk` resolver that app.py registers with OmegaConf.
  model:
    formosan_ami: ${load_vosk:united-link/kaldi-formosan-ami}
    formosan_sdq: ${load_vosk:united-link/kaldi-formosan-sdq}
    formosan_trv: ${load_vosk:united-link/kaldi-formosan-trv}
    formosan_pwn: ${load_vosk:united-link/kaldi-formosan-pwn}

  # Label shown in the UI -> dialect id (a key of `model` above).
  dialect_mapping:
    阿美: formosan_ami
    賽德克: formosan_sdq
    太魯閣: formosan_trv
    排灣: formosan_pwn
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ vosk
2
+ omegaconf