|
''' |
|
import gradio as gr |
|
from transformers import pipeline |
|
|
|
# Load pipelines for Canary ASR, Llama 3 QA, and VITS TTS
|
asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b", device=0) |
|
qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa") |
|
tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0) |
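
# NOTE: the QA and TTS model identifiers above may not correspond to real Hugging Face
# Hub repositories; verify or replace them before enabling this block.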
|
''' |
|
|
|
import gradio as gr |
|
import json |
|
import librosa |
|
import os |
|
import soundfile as sf |
|
import tempfile |
|
import uuid |
|
from transformers import pipeline |
|
|
|
import torch |
|
|
|
|
|
SAMPLE_RATE = 16000  # Canary expects 16 kHz mono audio
MAX_AUDIO_SECS = 30  # reject inputs longer than this in the demo

# Default task settings: English-to-English transcription with
# punctuation and capitalization ("pnc") disabled.
src_lang = "en"
tgt_lang = "en"
pnc = "no"
|
|
|
def convert_audio(audio_filepath, tmpdir, utt_id):
    """
    Convert the input file to a mono-channel 16 kHz WAV file.
    Raise an error instead of converting if the audio is too long.
    Returns the output filename and the duration in seconds.
    """
    data, sr = librosa.load(audio_filepath, sr=None, mono=True)

    duration = librosa.get_duration(y=data, sr=sr)

    if duration > MAX_AUDIO_SECS:
        raise gr.Error(
            f"This demo can transcribe up to {MAX_AUDIO_SECS} seconds of audio. "
            "If you wish, you may trim the audio using the Audio viewer in Step 1 "
            "(click on the scissors icon to start trimming audio)."
        )

    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    out_filename = os.path.join(tmpdir, utt_id + '.wav')

    # Write the resampled mono audio out as a 16 kHz WAV file
    sf.write(out_filename, data, SAMPLE_RATE)

    return out_filename, duration
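
# Example usage (hypothetical filename, for illustration only):
#   wav_path, dur = convert_audio("sample.wav", tempfile.gettempdir(), "demo_utt")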
|
|
|
|
|
|
|
asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b") |
|
|
|
def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
    if audio_filepath is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    utt_id = uuid.uuid4()

    with tempfile.TemporaryDirectory() as tmpdir:
        # Convert the input to mono 16 kHz WAV (this also enforces the length limit)
        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))

        # NeMo-style manifest describing the request; the Hugging Face pipeline below
        # takes the WAV path directly, so this file only records the task settings.
        manifest_data = {
            "audio_filepath": converted_audio_filepath,
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "taskname": "asr",
            "pnc": pnc,
            "answer": "predict",
            "duration": duration,
        }

        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')

        with open(manifest_filepath, 'w') as fout:
            json.dump(manifest_data, fout)

        # The transformers ASR pipeline returns a dict with a "text" key
        output_text = asr_pipeline(converted_audio_filepath)["text"]

    return output_text
|
|
|
|
|
|
|
with gr.Blocks( |
|
title="NeMo Canary Model", |
|
css=""" |
|
textarea { font-size: 18px;} |
|
#model_output_text_box span { |
|
font-size: 18px; |
|
font-weight: bold; |
|
} |
|
""", |
|
theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg) |
|
) as demo: |
|
|
|
gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
gr.HTML( |
|
"<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>" |
|
|
|
"<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. " |
|
"You can transcribe longer files locally with this NeMo " |
|
"<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>" |
|
) |
|
|
|
audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath") |
|
|
|
gr.HTML("<p><b>Step 2:</b> Choose the input and output language.</p>") |
|
|
|
|
|
with gr.Column(): |
|
|
|
gr.HTML("<p><b>Step 3:</b> Run the model.</p>") |
|
|
|
go_button = gr.Button( |
|
value="Run model", |
|
variant="primary", |
|
) |
|
|
|
model_output_text_box = gr.Textbox( |
|
label="Model Output", |
|
elem_id="model_output_text_box", |
|
) |
|
|
|
with gr.Row(): |
|
|
|
gr.HTML( |
|
"<p style='text-align: center'>" |
|
"π€ <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | " |
|
"π§βπ» <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>" |
|
"</p>" |
|
) |
|
|
|
    go_button.click(
        fn=transcribe,
        inputs=[audio_file, src_lang_box, tgt_lang_box, pnc_box],
        outputs=[model_output_text_box],
    )
|
|
|
|
|
demo.queue() |
|
demo.launch() |
|
|
|
''' |
|
|
|
|
|
# Function to capture audio using Canary ASR |
|
def capture_audio(audio_filepath):
|
utt_id = uuid.uuid4() |
|
with tempfile.TemporaryDirectory() as tmpdir: |
|
converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id)) |
|
|
|
manifest_data = { |
|
"audio_filepath": converted_audio_filepath, |
|
"source_lang": "en", |
|
"target_lang": "en", |
|
"taskname": taskname, |
|
"pnc": pnc, |
|
"answer": "predict", |
|
"duration": 10, |
|
} |
|
|
|
manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json') |
|
|
|
print("Listening for cue words...") |
|
while True: |
|
audio_input = asr_pipeline(None)[0]['input_values'] |
|
transcript = asr_pipeline(audio_input)[0]['transcription'] |
|
if "hey canary" in transcript.lower(): |
|
print("Cue word detected!") |
|
break |
|
print("Listening...") |
|
return audio_input |
|
|
|
# AI assistant function |
|
def ai_assistant(audio_input): |
|
# Perform automatic speech recognition (ASR) |
|
    transcript = asr_pipeline(audio_input)["text"]
|
|
|
# Perform question answering (QA) |
|
qa_result = qa_pipeline(question=transcript, context="Insert your context here") |
|
|
|
# Convert the QA result to speech using text-to-speech (TTS) |
|
    tts_output = tts_pipeline(qa_result['answer'])

    # The transformers TTS pipeline returns a dict with "audio" and "sampling_rate";
    # Gradio's Audio component accepts a (sample_rate, waveform) tuple.
    return (tts_output["sampling_rate"], tts_output["audio"])
|
|
|
if __name__ == "__main__": |
|
# Create a Gradio interface |
|
    gr.Interface(
        fn=ai_assistant,
        # gr.inputs / gr.outputs were removed in Gradio 3+; use gr.Audio components directly
        inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak Here"),
        outputs=gr.Audio(label="Assistant's Response"),
        title="AI Assistant",
        description="An AI Assistant that answers questions based on your speech input.",
    ).launch()
|
''' |