|
import gradio as gr |
|
import torch |
|
import sounddevice as sd |
|
import numpy as np |
|
from nemo.collections.asr.models import ASRModel |
|
|
|
|
|
# Load the NeMo Canary ASR model once at startup and switch it to
# inference mode (disables dropout / batch-norm updates).
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# BUG FIX: torch.hub.load for silero-vad returns a (model, utils) tuple,
# not the model alone -- the original bound the tuple to kws_model, which
# is not callable. Unpack so kws_model is the actual VAD module.
# NOTE(review): silero-vad is a voice-activity detector, not a keyword
# spotter; it cannot recognize the specific phrase "hey alexa".
kws_model, _vad_utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
|
|
|
|
|
# Wake phrase the app is meant to respond to.
# NOTE(review): nothing in this script actually matches this string --
# detect_trigger() only runs voice-activity detection; confirm intent.
TRIGGER_WORD = "hey alexa"

# Length of each listening window, in seconds.
TRIGGER_DURATION = 2

# Audio sample rate in Hz used for both recording and VAD.
SAMPLE_RATE = 16000
|
|
|
def start_recording():
    """Capture one fixed-length mono clip from the default microphone.

    Blocks until TRIGGER_DURATION seconds of audio at SAMPLE_RATE have
    been recorded, then returns the samples as a 1-D float32 array.
    """
    print("Recording started...")
    n_samples = int(TRIGGER_DURATION * SAMPLE_RATE)
    clip = sd.rec(n_samples, samplerate=SAMPLE_RATE, channels=1, dtype='float32')
    sd.wait()  # block until the capture buffer is full
    return clip.flatten()
|
|
|
def detect_trigger(audio):
    """Return True when the recorded clip contains speech.

    Parameters
    ----------
    audio : np.ndarray
        1-D float32 waveform at SAMPLE_RATE Hz (from start_recording()).

    Returns
    -------
    bool
        True if the silero-vad speech probability is >= 0.5.

    NOTE(review): silero-vad only detects *speech activity*; it cannot
    confirm the speech is the TRIGGER_WORD phrase.
    """
    # BUG FIX: silero-vad's forward signature is forward(x, sr) -- it
    # expects a torch tensor and a positional sample-rate argument.
    # Passing a raw numpy array with a `sample_rate=` keyword fails.
    waveform = torch.from_numpy(np.ascontiguousarray(audio, dtype=np.float32))
    speech_prob = kws_model(waveform, SAMPLE_RATE).item()
    return speech_prob >= 0.5
|
|
|
def transcribe_triggered():
    """Block until speech is detected, then return its transcription.

    Repeatedly records TRIGGER_DURATION-second clips until one passes
    detect_trigger(), then transcribes that clip with the Canary model
    and returns the first result.

    NOTE(review): detect_trigger() only checks for speech activity, so
    any utterance -- not just "hey alexa" -- starts transcription.
    NOTE(review): ASRModel.transcribe() classically takes audio file
    paths; passing raw numpy arrays requires a recent NeMo -- confirm.
    """
    while True:
        print("Listening for trigger word...")
        clip = start_recording()
        if not detect_trigger(clip):
            continue  # nothing heard in this window; keep listening
        print("Trigger word detected. Transcribing...")
        results = model.transcribe([clip])
        return results[0]
|
|
|
# BUG FIX: the legacy `gr.inputs` namespace was removed in Gradio 3.x and
# `NoInput` was never a Gradio component -- for a function that takes no
# arguments, pass inputs=None.
iface = gr.Interface(
    fn=transcribe_triggered,
    inputs=None,
    outputs="text",
    title="ASR with NeMo Canary Model (Triggered by 'Hey Alexa')",
)

iface.launch()
|
|