txya900619
committed on
Commit
·
784aace
1
Parent(s):
e43ab87
feat: upload inference script
Browse files- app.py +69 -0
- configs/models.yaml +2 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import torch
|
5 |
+
from huggingface_hub import snapshot_download
|
6 |
+
from nemo.collections.asr.models import ASRModel
|
7 |
+
from omegaconf import OmegaConf
|
8 |
+
|
9 |
+
# Probe CUDA once and derive both inference settings from the same answer:
# GPU + half precision when CUDA is usable, otherwise CPU + single precision.
_cuda_available = torch.cuda.is_available()
device = "cuda" if _cuda_available else "cpu"
torch_dtype = torch.float16 if _cuda_available else torch.float32
|
11 |
+
|
12 |
+
|
13 |
+
def load_model(model_id: str):
    """Fetch an ASR checkpoint by repository id and return it ready for inference.

    The repository snapshot is obtained through ``snapshot_download`` and is
    expected to contain a ``model.nemo`` file at its root. The restored model
    is switched to evaluation mode and moved to the module-level ``device``.

    Args:
        model_id: Repository identifier handed to ``snapshot_download``.

    Returns:
        The restored ``ASRModel`` placed on ``device``.
    """
    snapshot_root = snapshot_download(model_id)
    checkpoint = os.path.join(snapshot_root, "model.nemo")

    restored = ASRModel.restore_from(checkpoint)
    restored.eval()
    return restored.to(device)
|
21 |
+
|
22 |
+
|
23 |
+
# Make `load_model` callable from YAML as `${load_model:<model_id>}`, which is
# how configs/models.yaml declares each entry's `model` value.
OmegaConf.register_new_resolver(name="load_model", resolver=load_model)

# Convert the parsed config into plain Python containers. The app later calls
# `.transcribe` on each entry's `model`, so the interpolation is presumably
# resolved (i.e. the models are loaded) at this point — TODO confirm.
_models_yaml = OmegaConf.load("configs/models.yaml")
models_config = OmegaConf.to_object(_models_yaml)
|
26 |
+
|
27 |
+
|
28 |
+
def automatic_speech_recognition(model_id: str, audio_file: str):
    """Transcribe one audio file with the selected model and return plain text.

    Args:
        model_id: Key into ``models_config`` selecting which loaded model to use.
        audio_file: Path to the audio file to transcribe. May be ``None`` or
            empty when the form is submitted without any audio.

    Returns:
        The transcript as a string, or ``""`` when there is no audio or the
        model produced no result.

    Raises:
        KeyError: If ``model_id`` is not present in ``models_config``.
    """
    # Submitting without audio yields no path; avoid handing None to the model.
    if not audio_file:
        return ""

    model = models_config[model_id]["model"]

    # transcribe() works on a batch and returns one result per input file, so
    # pass a one-element batch and unwrap it instead of returning the container
    # (which would be shown in the UI as a list repr).
    results = model.transcribe([audio_file])

    # Some model families return a (best_hypotheses, all_hypotheses) tuple;
    # keep only the best hypotheses in that case.
    if isinstance(results, tuple):
        results = results[0]
    if not results:
        return ""

    first = results[0]
    # Results are either plain strings or hypothesis objects exposing `.text`,
    # depending on the installed NeMo version.
    return getattr(first, "text", first)
|
32 |
+
|
33 |
+
|
34 |
+
# Build the Gradio application: a model selector, a heading, and a single
# audio-in / text-out interface wired to `automatic_speech_recognition`.
demo = gr.Blocks(title="康統語音辨識系統")

with demo:
    # Preselect the first model listed in the config.
    default_model_id = tuple(models_config)[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )

    gr.Markdown("\n# 康統語音辨識系統\n")

    # Components are instantiated in the same order as before
    # (audio input first, then the text output) and then handed to Interface.
    audio_input = gr.Audio(
        label="上傳或錄音",
        type="filepath",
        waveform_options=gr.WaveformOptions(sample_rate=16000),
    )
    result_box = gr.Text(interactive=False, label="辨識結果")

    gr.Interface(
        automatic_speech_recognition,
        inputs=[model_drop_down, audio_input],
        outputs=[result_box],
        allow_flagging="auto",
    )

demo.launch()
|
configs/models.yaml
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
parakeet-0.6b-custom-tokenizer:
|
2 |
+
model: ${load_model:kenkone/parakeet-ctc-0.6b-custom-tokenizer}
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
omegaconf
|
2 |
+
git+https://github.com/NVIDIA/NeMo.git@main#egg=nemo_toolkit[asr]
|
3 |
+
IPython
|
4 |
+
numpy==1.*
|