import gradio as gr
import torch
from nemo.collections.asr.models import ASRModel

# Load the NeMo Canary ASR model once at startup.
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# torch.hub.load for 'snakers4/silero-vad' returns a (model, utils) tuple;
# the original code bound the whole tuple and then tried to call it.
kws_model, vad_utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')

# NOTE(review): Silero VAD detects generic *speech activity*, not a specific
# wake word such as "Hey Alexa". A dedicated keyword-spotting model is
# required for true trigger-word detection -- confirm intended behavior.


def detect_trigger(audio):
    """Return a truthy value when the trigger is detected in *audio*.

    Raises:
        gr.Error: if no audio was provided.
    """
    if audio is None:
        # gr.Error (not the nonexistent gr.InterfaceError) is how Gradio
        # surfaces an error message to the user.
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
    # Perform keyword spotting.
    # NOTE(review): the Silero model expects a 1-D float tensor at a supported
    # sample rate (e.g. loaded via vad_utils' read_audio) -- adapt this call
    # to the actual input format before production use.
    is_triggered = kws_model(audio)
    return is_triggered


def transcribe_triggered(audio):
    """Transcribe *audio* with the Canary model if the trigger fires.

    Args:
        audio: path to an audio file (see the Audio component below).

    Returns:
        The transcription string, or a fixed message when no trigger
        was detected.

    Raises:
        gr.Error: if no audio was provided.
    """
    if audio is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    # Check if the trigger word is detected before doing the expensive ASR pass.
    if not detect_trigger(audio):
        return "Trigger word not detected."

    # ASRModel.transcribe takes a list of audio file paths and returns a
    # list of transcriptions in the same order.
    transcription = model.transcribe([audio])
    return transcription[0]


# type="filepath" makes Gradio hand the callback a path on disk, which is
# what ASRModel.transcribe expects; the default "numpy" type would yield a
# (sample_rate, array) tuple that transcribe cannot consume.
audio_input = gr.components.Audio(type="filepath")

iface = gr.Interface(
    transcribe_triggered,
    audio_input,
    "text",
    title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')",
)

if __name__ == "__main__":
    iface.launch()