txya900619 committed on
Commit
b0744ce
1 Parent(s): 960c9a4

feat: upload app script

Files changed (3)
  1. app.py +116 -0
  2. configs/models.yaml +6 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,116 @@
+ import gradio as gr
+ import torch
+ from omegaconf import OmegaConf
+ from transformers import pipeline
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+
+ def load_pipe(model_id: str):
+     return pipeline(
+         "automatic-speech-recognition",
+         model=model_id,
+         max_new_tokens=128,
+         chunk_length_s=30,
+         batch_size=8,
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+ OmegaConf.register_new_resolver("load_pipe", load_pipe)
+
+ models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
+
+ def automatic_speech_recognition(model_id: str, dialect_id: str, audio_file: str):
+     model = models_config[model_id]["model"]
+     generate_kwargs = {
+         "task": "transcribe",
+         "language": "id",
+         "num_beams": 1,
+         "prompt_ids": torch.from_numpy(model.tokenizer.get_prompt_ids(dialect_id)).to(
+             device
+         ),
+     }
+     return model(audio_file, generate_kwargs=generate_kwargs)["text"].replace(f" {dialect_id}", "")
+
+
+ def when_model_selected(model_id: str):
+     model_config = models_config[model_id]
+
+     dialect_drop_down_choices = [
+         (k, v) for k, v in model_config["dialect_mapping"].items()
+     ]
+
+     return gr.update(
+         choices=dialect_drop_down_choices,
+         value=dialect_drop_down_choices[0][1],
+     )
+
+
+ demo = gr.Blocks(
+     title="三族語音辨識系統",
+     css="@import url(https://tauhu.tw/tauhu-oo.css);",
+     theme=gr.themes.Default(
+         font=(
+             "tauhu-oo",
+             gr.themes.GoogleFont("Source Sans Pro"),
+             "ui-sans-serif",
+             "system-ui",
+             "sans-serif",
+         )
+     ),
+ )
+
+ with demo:
+     default_model_id = list(models_config.keys())[0]
+     model_drop_down = gr.Dropdown(
+         models_config.keys(),
+         value=default_model_id,
+         label="模型",
+     )
+
+     dialect_drop_down = gr.Radio(
+         choices=[
+             (k, v)
+             for k, v in models_config[default_model_id]["dialect_mapping"].items()
+         ],
+         value=list(models_config[default_model_id]["dialect_mapping"].values())[0],
+         label="族別",
+     )
+
+     model_drop_down.input(
+         when_model_selected,
+         inputs=[model_drop_down],
+         outputs=[dialect_drop_down],
+     )
+
+     gr.Markdown(
+         """
+         # 三族語音辨識系統
+         ### Formosan Automatic-Speech-Recognition System
+         ### 研發團隊
+         - **[李鴻欣 Hung-Shin Lee](mailto:hungshinlee@gmail.com)([聯和科創](https://www.104.com.tw/company/1a2x6bmu75))**
+         - **[陳力瑋 Li-Wei Chen](mailto:wayne900619@gmail.com)([聯和科創](https://www.104.com.tw/company/1a2x6bmu75))**
+         """
+     )
+     gr.Interface(
+         automatic_speech_recognition,
+         inputs=[
+             model_drop_down,
+             dialect_drop_down,
+             gr.Audio(
+                 label="上傳或錄音",
+                 type="filepath",
+                 waveform_options=gr.WaveformOptions(
+                     sample_rate=16000,
+                 ),
+             ),
+         ],
+         outputs=[
+             gr.Text(interactive=False, label="辨識結果"),
+         ],
+         allow_flagging="auto",
+     )
+
+ demo.launch()
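
The `${load_pipe:...}` interpolation used in configs/models.yaml works because app.py registers `load_pipe` as a custom OmegaConf resolver before loading the file, so `OmegaConf.to_object` replaces each `model` field with the pipeline that `load_pipe` returns. A minimal sketch of that mechanism, with a made-up resolver name and checkpoint id standing in for the real ones:

from omegaconf import OmegaConf

# Hypothetical resolver standing in for load_pipe: it wraps the id in a string
# instead of loading a real transformers pipeline.
def fake_load(model_id: str):
    return f"<pipeline for {model_id}>"

OmegaConf.register_new_resolver("fake_load", fake_load)

cfg = OmegaConf.create({"demo-model": {"model": "${fake_load:some-org/some-checkpoint}"}})
resolved = OmegaConf.to_object(cfg)  # interpolations, including custom resolvers, run here
print(resolved["demo-model"]["model"])  # -> <pipeline for some-org/some-checkpoint>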
configs/models.yaml ADDED
@@ -0,0 +1,6 @@
+ whisper-large-v3:
+   model: ${load_pipe:formospeech/whisper-large-v3-formosan-iso-prompt}
+   dialect_mapping:
+     阿美: ami
+     賽德克: sdq
+     太魯閣: trv
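
Each top-level key in this file becomes a choice in the app's model dropdown, `model` resolves to a loaded ASR pipeline through the `load_pipe` resolver, and `dialect_mapping` maps the display names shown as radio buttons (Amis 阿美, Seediq 賽德克, Truku 太魯閣) to the language codes sent to Whisper as a decoder prompt. A second model would presumably follow the same shape; sketched below as the Python structure the YAML loads into, where the second entry and its values are placeholders, not part of this commit:

# Approximate shape of models_config after OmegaConf resolves configs/models.yaml.
models_config = {
    "whisper-large-v3": {
        "model": "<ASR pipeline returned by load_pipe>",  # a transformers pipeline object in the real app
        "dialect_mapping": {"阿美": "ami", "賽德克": "sdq", "太魯閣": "trv"},
    },
    "some-other-model": {  # hypothetical extra entry, for illustration only
        "model": "<ASR pipeline returned by load_pipe>",  # would come from another ${load_pipe:...} id
        "dialect_mapping": {"阿美": "ami"},
    },
}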
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ omegaconf
+ torch
+ transformers
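
Note that gradio is imported by app.py but not listed here; on Hugging Face Spaces it is presumably installed from the Space's gradio SDK setting, so a local run would need it installed separately. A quick sanity check that the whole stack imports:

# Verify that app.py's runtime dependencies (plus gradio) are importable locally.
import gradio
import omegaconf
import torch
import transformers

print(gradio.__version__, omegaconf.__version__, torch.__version__, transformers.__version__)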