File size: 5,921 Bytes
1c50673 ea225b5 25a5cc8 b7a5199 10a5181 25a5cc8 b7a5199 1c50673 2ca3e3f 1c50673 2ca3e3f 3bdfcbc 1c50673 2ca3e3f 1c50673 3bdfcbc 4911c46 cd9e9bd 1c73271 cd9e9bd 1c73271 3bdfcbc 1c73271 3bdfcbc 1c73271 3bdfcbc 1c73271 3bdfcbc 1c73271 1c50673 cd9e9bd 1c50673 25a5cc8 beebab3 1c50673 b7a5199 4710fcc beebab3 b7a5199 4710fcc beebab3 4710fcc 25a5cc8 b7a5199 25a5cc8 b7a5199 25a5cc8 b7a5199 25a5cc8 4710fcc 10a5181 25a5cc8 b7a5199 10a5181 1c50673 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
# Earlier draft of the model setup (ASR, question-answering and TTS
# pipelines). It is wrapped in a bare string literal, so none of these
# statements are executed when the module is imported.
'''
import gradio as gr
from transformers import pipeline
# Load pipelines for Canary ASR, LLama3 QA, and VITS TTS
asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b", device=0)
qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa")
tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0)
'''
import gradio as gr
import json
import librosa
import os
import soundfile as sf
import tempfile
import uuid
import torch
# --- Audio settings ---------------------------------------------------------
# Sample rate (Hz) that convert_audio resamples to and writes out with.
SAMPLE_RATE = 16_000
# Inputs longer than this many seconds are rejected by convert_audio.
MAX_AUDIO_SECS = 30

# --- Default request settings -----------------------------------------------
# Source and target language codes; both default to English.
src_lang = tgt_lang = "en"
# presumably the punctuation-and-capitalization switch ("yes"/"no") — TODO confirm
pnc = "no"
def convert_audio(audio_filepath, tmpdir, utt_id):
    """
    Convert an audio file to a mono-channel 16 kHz WAV file.

    The input is loaded at its native sample rate, down-mixed to mono,
    resampled to ``SAMPLE_RATE`` when necessary and written to
    ``<tmpdir>/<utt_id>.wav``.

    Args:
        audio_filepath: Path of the audio file to convert.
        tmpdir: Directory in which the converted file is written.
        utt_id: Base name (without extension) of the output file; it is
            converted with ``str()``, so a ``uuid.UUID`` is accepted too.

    Returns:
        A tuple ``(out_filename, duration)``: the path of the written WAV
        file and the duration of the input audio in seconds.

    Raises:
        gr.Error: If the audio is longer than ``MAX_AUDIO_SECS``. Nothing is
            converted or written in that case.
    """
    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
    duration = librosa.get_duration(y=data, sr=sr)

    if duration > MAX_AUDIO_SECS:
        # The limit is expressed in seconds. The message used to interpolate
        # an undefined MAX_AUDIO_MINUTES, which raised NameError instead of
        # showing this error to the user.
        raise gr.Error(
            f"This demo can transcribe up to {MAX_AUDIO_SECS} seconds of audio. "
            "If you wish, you may trim the audio using the Audio viewer in Step 1 "
            "(click on the scissors icon to start trimming audio)."
        )

    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    out_filename = os.path.join(tmpdir, str(utt_id) + '.wav')

    # save output audio
    sf.write(out_filename, data, SAMPLE_RATE)

    return out_filename, duration
# Load the ASR pipeline.
# `pipeline` must be imported here: the only other import of it in this file
# sits inside an inert string literal, so without this line the call below
# fails with NameError at module load.
# NOTE(review): confirm that "nvidia/canary-1b" can actually be loaded through
# the transformers pipeline API in the deployed environment.
from transformers import pipeline

asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b")
def transcribe(audio_filepath, src_lang="en", tgt_lang="en", pnc="no"):
    """
    Transcribe an audio file with the module-level ASR pipeline.

    Args:
        audio_filepath: Path of the audio file to transcribe, or ``None``
            when the user supplied no audio.
        src_lang: Source language code. Currently not forwarded to the
            pipeline; kept so existing callers keep working.
        tgt_lang: Target language code. Currently not forwarded to the
            pipeline; kept so existing callers keep working.
        pnc: Punctuation/capitalization setting. Currently not forwarded to
            the pipeline; kept so existing callers keep working.

    Returns:
        The transcribed text.

    Raises:
        gr.Error: If ``audio_filepath`` is ``None``.
    """
    if audio_filepath is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    # The previous implementation also wrote a JSON manifest into a temporary
    # directory, but that file was never read before the directory was
    # removed, so the write has been dropped.

    # Transcribe audio using ASR pipeline
    result = asr_pipeline(audio_filepath)

    # Depending on the backend the result is either a single mapping or a
    # list with one mapping per input; normalise to the first mapping.
    if isinstance(result, (list, tuple)):
        result = result[0]

    # The transformers ASR pipeline stores the transcript under "text"; fall
    # back to the "transcription" key the original code expected.
    if "text" in result:
        return result["text"]
    return result["transcription"]
# Build the demo page: a title, an audio input column, a "run" column with the
# output text box, and a footer with links. The button click is wired to
# `transcribe`, then the app is queued and launched.
with gr.Blocks(
    title="NeMo Canary Model",
    css="""
textarea { font-size: 18px;}
#model_output_text_box span {
font-size: 18px;
font-weight: bold;
}
""",
    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg)  # make text slightly bigger (default is text_md )
) as demo:

    gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")

    with gr.Row():
        with gr.Column():
            # NOTE(review): this text promises 10 minutes, while MAX_AUDIO_SECS
            # is 30 and no duration check runs on this path — confirm the
            # intended limit.
            gr.HTML(
                "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
                "<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
                "You can transcribe longer files locally with this NeMo "
                "<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
            )

            audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")

            # NOTE(review): no language selectors are defined for this step.
            gr.HTML("<p><b>Step 2:</b> Choose the input and output language.</p>")

        with gr.Column():
            gr.HTML("<p><b>Step 3:</b> Run the model.</p>")

            go_button = gr.Button(
                value="Run model",
                variant="primary",  # make "primary" so it stands out (default is "secondary")
            )

            model_output_text_box = gr.Textbox(
                label="Model Output",
                elem_id="model_output_text_box",
            )

    with gr.Row():
        # The two icons are written as escape sequences (hugging-face emoji and
        # "technologist" emoji) so they cannot be garbled by a wrong file
        # encoding; they previously rendered as mojibake.
        gr.HTML(
            "<p style='text-align: center'>"
            "\U0001F917 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
            "\U0001F9D1\u200D\U0001F4BB <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
            "</p>"
        )

    # Only the audio component is a UI input; the language and punctuation
    # settings come from the module-level defaults. Passing `transcribe`
    # directly with a single input would call it with too few arguments.
    go_button.click(
        fn=lambda audio_path: transcribe(audio_path, src_lang, tgt_lang, pnc),
        inputs=[audio_file],
        outputs=[model_output_text_box],
    )

demo.queue()
demo.launch()
# Disabled voice-assistant prototype, kept as a bare string literal so none of
# it runs. It sketches a wake-phrase listener (capture_audio) and an
# ASR -> question-answering -> text-to-speech chain (ai_assistant) exposed
# through gr.Interface.
# NOTE(review): the sketch refers to qa_pipeline, tts_pipeline, taskname and
# audio_filepath, none of which are defined in the active code of this file,
# and it uses the gr.inputs / gr.outputs namespaces (presumably the legacy
# Gradio API) — verify all of these before re-enabling.
'''
# Function to capture audio using Canary ASR
def capture_audio():
utt_id = uuid.uuid4()
with tempfile.TemporaryDirectory() as tmpdir:
converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
manifest_data = {
"audio_filepath": converted_audio_filepath,
"source_lang": "en",
"target_lang": "en",
"taskname": taskname,
"pnc": pnc,
"answer": "predict",
"duration": 10,
}
manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
print("Listening for cue words...")
while True:
audio_input = asr_pipeline(None)[0]['input_values']
transcript = asr_pipeline(audio_input)[0]['transcription']
if "hey canary" in transcript.lower():
print("Cue word detected!")
break
print("Listening...")
return audio_input
# AI assistant function
def ai_assistant(audio_input):
# Perform automatic speech recognition (ASR)
transcript = asr_pipeline(audio_input)[0]['transcription']
# Perform question answering (QA)
qa_result = qa_pipeline(question=transcript, context="Insert your context here")
# Convert the QA result to speech using text-to-speech (TTS)
tts_output = tts_pipeline(qa_result['answer'])
return tts_output[0]['audio']
if __name__ == "__main__":
# Create a Gradio interface
gr.Interface(ai_assistant,
inputs=gr.inputs.Audio(capture=capture_audio, label="Speak Here"),
outputs=gr.outputs.Audio(type="audio", label="Assistant's Response"),
title="AI Assistant",
description="An AI Assistant that answers questions based on your speech input.").launch()
''' |