import gradio as gr
import torch
from nemo.collections.asr.models import ASRModel

# Load the NeMo Canary ASR model once at startup.
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# torch.hub.load for 'snakers4/silero-vad' returns a (model, utils) tuple;
# the original code bound the whole tuple and then tried to call it.
kws_model, vad_utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')

# NOTE(review): Silero VAD detects generic *speech activity*, not a specific
# wake word such as "Hey Alexa". A dedicated keyword-spotting model is
# required for true trigger-word detection -- confirm intended behavior.


def detect_trigger(audio):
    """Return a truthy value when the trigger is detected in *audio*.

    Raises:
        gr.Error: if no audio was provided.
    """
    if audio is None:
        # gr.Error (not the nonexistent gr.InterfaceError) is how Gradio
        # surfaces an error message to the user.
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
    # Perform keyword spotting.
    # NOTE(review): the Silero model expects a 1-D float tensor at a supported
    # sample rate (e.g. loaded via vad_utils' read_audio) -- adapt this call
    # to the actual input format before production use.
    is_triggered = kws_model(audio)
    return is_triggered


def transcribe_triggered(audio):
    """Transcribe *audio* with the Canary model if the trigger fires.

    Args:
        audio: path to an audio file (see the Audio component below).

    Returns:
        The transcription string, or a fixed message when no trigger
        was detected.

    Raises:
        gr.Error: if no audio was provided.
    """
    if audio is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    # Check if the trigger word is detected before doing the expensive ASR pass.
    if not detect_trigger(audio):
        return "Trigger word not detected."

    # ASRModel.transcribe takes a list of audio file paths and returns a
    # list of transcriptions in the same order.
    transcription = model.transcribe([audio])
    return transcription[0]


# type="filepath" makes Gradio hand the callback a path on disk, which is
# what ASRModel.transcribe expects; the default "numpy" type would yield a
# (sample_rate, array) tuple that transcribe cannot consume.
audio_input = gr.components.Audio(type="filepath")

iface = gr.Interface(
    transcribe_triggered,
    audio_input,
    "text",
    title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')",
)

if __name__ == "__main__":
    iface.launch()