"""Gradio web app for speech recognition using NeMo ASR models.

Models are declared in ``configs/models.yaml``; each entry's ``model`` field
uses the ``${load_model:...}`` OmegaConf resolver registered below, so every
configured checkpoint is downloaded from the Hugging Face Hub and loaded
eagerly when the config is materialized.
"""

import os

import gradio as gr
import torch
from huggingface_hub import snapshot_download
from nemo.collections.asr.models import ASRModel
from omegaconf import OmegaConf

# Run on GPU when available; fall back to CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): torch_dtype is currently unused — kept for future half-precision
# inference; confirm before removing.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32


def load_model(model_id: str) -> ASRModel:
    """Download a ``.nemo`` checkpoint from the Hub and load it for inference.

    Args:
        model_id: Hugging Face Hub repository id containing ``model.nemo``.

    Returns:
        The restored ``ASRModel`` in eval mode, moved to ``device``.
    """
    model_dir = snapshot_download(model_id)
    model_ckpt_path = os.path.join(model_dir, "model.nemo")
    asr_model = ASRModel.restore_from(model_ckpt_path)
    asr_model.eval()
    asr_model = asr_model.to(device)
    return asr_model


# Let models.yaml reference ``${load_model:<hub-id>}`` to embed loaded models
# directly in the config; ``to_object`` resolves every entry up front.
OmegaConf.register_new_resolver("load_model", load_model)
models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))


def automatic_speech_recognition(model_id: str, audio_file: str) -> str:
    """Transcribe one audio file with the selected model.

    Args:
        model_id: Key into ``models_config`` chosen in the UI dropdown.
        audio_file: Filesystem path of the uploaded/recorded audio.

    Returns:
        The transcription for the file (empty string if nothing returned).
    """
    model = models_config[model_id]["model"]
    # transcribe() canonically takes a list of paths and returns a list of
    # results, one per input; unwrap the single result so the gr.Text output
    # shows plain text instead of a list repr.
    # NOTE(review): some NeMo versions return Hypothesis objects rather than
    # strings here — verify against the pinned NeMo release.
    results = model.transcribe([audio_file])
    return results[0] if results else ""


demo = gr.Blocks(
    title="康統語音辨識系統",
)

with demo:
    default_model_id = list(models_config.keys())[0]
    model_drop_down = gr.Dropdown(
        models_config.keys(),
        value=default_model_id,
        label="模型",
    )
    gr.Markdown(
        """
        # 康統語音辨識系統
        """
    )
    gr.Interface(
        automatic_speech_recognition,
        inputs=[
            model_drop_down,
            gr.Audio(
                label="上傳或錄音",
                type="filepath",
                waveform_options=gr.WaveformOptions(
                    sample_rate=16000,
                ),
            ),
        ],
        outputs=[
            gr.Text(interactive=False, label="辨識結果"),
        ],
        allow_flagging="auto",
    )

# Guard the server launch so importing this module (e.g. for tests) does not
# start the web app as a side effect.
if __name__ == "__main__":
    demo.launch()