Spaces:
Running
Running
import gradio as gr | |
import torch | |
from omegaconf import OmegaConf | |
from transformers import pipeline | |
device = "cuda" if torch.cuda.is_available() else "cpu" | |
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 | |
def load_pipe(model_id: str): | |
return pipeline( | |
"automatic-speech-recognition", | |
model=model_id, | |
max_new_tokens=128, | |
chunk_length_s=30, | |
batch_size=8, | |
torch_dtype=torch_dtype, | |
device=device, | |
) | |
OmegaConf.register_new_resolver("load_pipe", load_pipe) | |
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml")) | |
def automatic_speech_recognition(model_id: str, dialect_id: str, audio_file: str): | |
model = models_config[model_id]["model"] | |
generate_kwargs = { | |
"task": "transcribe", | |
"language": "id", | |
"num_beams": 1, | |
"prompt_ids": torch.from_numpy(model.tokenizer.get_prompt_ids(dialect_id)).to( | |
device | |
), | |
} | |
return model(audio_file, generate_kwargs=generate_kwargs)["text"].replace(f" {dialect_id}", "") | |
def when_model_selected(model_id: str): | |
model_config = models_config[model_id] | |
dialect_drop_down_choices = [ | |
(k, v) for k, v in model_config["dialect_mapping"].items() | |
] | |
return gr.update( | |
choices=dialect_drop_down_choices, | |
value=dialect_drop_down_choices[0][1], | |
) | |
demo = gr.Blocks( | |
title="三族語音辨識系統", | |
css="@import url(https://tauhu.tw/tauhu-oo.css);", | |
theme=gr.themes.Default( | |
font=( | |
"tauhu-oo", | |
gr.themes.GoogleFont("Source Sans Pro"), | |
"ui-sans-serif", | |
"system-ui", | |
"sans-serif", | |
) | |
), | |
) | |
with demo: | |
default_model_id = list(models_config.keys())[0] | |
model_drop_down = gr.Dropdown( | |
models_config.keys(), | |
value=default_model_id, | |
label="模型", | |
) | |
dialect_drop_down = gr.Radio( | |
choices=[ | |
(k, v) | |
for k, v in models_config[default_model_id]["dialect_mapping"].items() | |
], | |
value=list(models_config[default_model_id]["dialect_mapping"].values())[0], | |
label="族別", | |
) | |
model_drop_down.input( | |
when_model_selected, | |
inputs=[model_drop_down], | |
outputs=[dialect_drop_down], | |
) | |
gr.Markdown( | |
""" | |
# 三族語音辨識系統 | |
### Formosan Automatic-Speech-Recognition System | |
### 研發團隊 | |
- **[李鴻欣 Hung-Shin Lee](mailto:hungshinlee@gmail.com)([聯和科創](https://www.104.com.tw/company/1a2x6bmu75))** | |
- **[陳力瑋 Li-Wei Chen](mailto:wayne900619@gmail.com)([聯和科創](https://www.104.com.tw/company/1a2x6bmu75))** | |
""" | |
) | |
gr.Interface( | |
automatic_speech_recognition, | |
inputs=[ | |
model_drop_down, | |
dialect_drop_down, | |
gr.Audio( | |
label="上傳或錄音", | |
type="filepath", | |
waveform_options=gr.WaveformOptions( | |
sample_rate=16000, | |
), | |
), | |
], | |
outputs=[ | |
gr.Text(interactive=False, label="辨識結果"), | |
], | |
allow_flagging="auto", | |
) | |
demo.launch() | |