import gradio as gr
from nemo.collections.asr.models import ASRModel

# Load the NeMo ASR model
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

def transcribe(audio):
    if audio is None:
        # gr.Error (not gr.InterfaceError, which does not exist) surfaces the message in the UI
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
    # Perform speech recognition
    transcription = model.transcribe([audio])
    return transcription[0]

# NeMo's transcribe() expects file paths, so ask Gradio for a filepath
audio_input = gr.components.Audio(type="filepath")
iface = gr.Interface(transcribe, audio_input, "text", title="ASR with NeMo Canary Model")
iface.launch()
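
# ---------------------------------------------------------------------------
# Everything below is inactive reference code, commented out with a
# triple-quoted string: a pipeline-based prototype, the full Canary demo with
# buffered inference, and a voice-assistant sketch.
# ---------------------------------------------------------------------------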
'''
import gradio as gr
from transformers import pipeline
# Load pipelines for Canary ASR, Llama 3 QA, and VITS TTS
asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b", device=0)
qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa")
tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0)
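
# NOTE: the pipeline targets above are illustrative: nvidia/canary-1b is a NeMo
# checkpoint (not loadable through a transformers pipeline), and the Llama 3 QA
# and VITS checkpoint names may not exist on the Hugging Face Hub. Swap in real
# checkpoints before trying to run this block.

# ---------------------------------------------------------------------------
# Full Canary demo with buffered inference for long audio.
# ---------------------------------------------------------------------------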
import gradio as gr
import json
import librosa
import os
import soundfile as sf
import tempfile
import uuid
from transformers import pipeline
import torch
from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
SAMPLE_RATE = 16000  # Hz
MAX_AUDIO_SECS = 30  # won't try to transcribe if longer than this

# default task settings
src_lang = "en"
tgt_lang = "en"
pnc = "no"
taskname = "asr"  # referenced in the manifests below, so it must be defined
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# make sure beam size is always 1 for consistency
model.change_decoding_strategy(None)
decoding_cfg = model.cfg.decoding
decoding_cfg.beam.beam_size = 1
model.change_decoding_strategy(decoding_cfg)

# setup for buffered inference
model.cfg.preprocessor.dither = 0.0
model.cfg.preprocessor.pad_to = 0

feature_stride = model.cfg.preprocessor['window_stride']
model_stride_in_secs = feature_stride * 8  # model stride is 8 for FastConformer

frame_asr = FrameBatchMultiTaskAED(
    asr_model=model,
    frame_len=40.0,
    total_buffer=40.0,
    batch_size=16,
)

amp_dtype = torch.float16
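# Roughly speaking, FrameBatchMultiTaskAED handles audio longer than the 40 s
# frame length by running the AED model over successive buffered windows and
# merging the per-window hypotheses; shorter audio takes the direct
# model.transcribe() path in transcribe() below.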
def convert_audio(audio_filepath, tmpdir, utt_id):
    """
    Convert all files to mono-channel 16 kHz wav files.
    Do not convert, and raise an error, if the audio is too long.
    Returns output filename and duration.
    """
    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
    duration = librosa.get_duration(y=data, sr=sr)

    if duration > MAX_AUDIO_SECS:
        raise gr.Error(
            f"This demo can transcribe up to {MAX_AUDIO_SECS} seconds of audio. "
            "If you wish, you may trim the audio using the Audio viewer in Step 1 "
            "(click on the scissors icon to start trimming audio)."
        )

    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    out_filename = os.path.join(tmpdir, utt_id + '.wav')

    # save output audio
    sf.write(out_filename, data, SAMPLE_RATE)

    return out_filename, duration
def transcribe(audio_filepath, src_lang="en", tgt_lang="en", pnc="no"):
    # the UI below only collects audio, so the task settings default to English ASR
    if audio_filepath is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    utt_id = uuid.uuid4()
    with tempfile.TemporaryDirectory() as tmpdir:
        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))

        # make manifest file and save
        manifest_data = {
            "audio_filepath": converted_audio_filepath,
            "source_lang": src_lang,
            "target_lang": tgt_lang,
            "taskname": taskname,
            "pnc": pnc,
            "answer": "predict",
            "duration": str(duration),
        }
        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
        with open(manifest_filepath, 'w') as fout:
            line = json.dumps(manifest_data)
            fout.write(line + '\n')

        # call transcribe, passing in manifest filepath
        if duration < 40:
            output_text = model.transcribe(manifest_filepath)[0]
        else:  # do buffered inference
            with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
                with torch.no_grad():
                    hyps = get_buffered_pred_feat_multitaskAED(
                        frame_asr,
                        model.cfg.preprocessor,
                        model_stride_in_secs,
                        model.device,
                        manifest=manifest_filepath,
                        filepaths=None,
                    )
                    output_text = hyps[0].text

    return output_text
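# Each manifest line written above is one JSON record, e.g. (illustrative values):
# {"audio_filepath": "/tmp/<tmpdir>/<utt_id>.wav", "source_lang": "en",
#  "target_lang": "en", "taskname": "asr", "pnc": "no", "answer": "predict",
#  "duration": "5.0"}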
with gr.Blocks(
    title="NeMo Canary Model",
    css="""
        textarea { font-size: 18px; }
        #model_output_text_box span {
            font-size: 18px;
            font-weight: bold;
        }
    """,
    theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg)  # make text slightly bigger (default is text_md)
) as demo:
    gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")

    with gr.Row():
        with gr.Column():
            gr.HTML(
                "<p><b>Step 1:</b> Record with your microphone.</p>"
            )
            audio_file = gr.Audio(sources=["microphone"], type="filepath")

        with gr.Column():
            gr.HTML("<p><b>Step 2:</b> Run the model.</p>")
            go_button = gr.Button(
                value="Run model",
                variant="primary",  # make "primary" so it stands out (default is "secondary")
            )
            model_output_text_box = gr.Textbox(
                label="Model Output",
                elem_id="model_output_text_box",
            )

    with gr.Row():
        gr.HTML(
            "<p style='text-align: center'>"
            "🤗 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
            "🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
            "</p>"
        )

    go_button.click(
        fn=transcribe,
        inputs=[audio_file],
        outputs=[model_output_text_box],
    )

demo.queue()
demo.launch()
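
# ---------------------------------------------------------------------------
# Voice-assistant sketch: cue-word capture -> ASR -> QA -> TTS.
# Depends on the asr_pipeline / qa_pipeline / tts_pipeline objects defined at
# the top of this block; unfinished and kept for reference.
# ---------------------------------------------------------------------------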
# Function to capture audio and wait for the cue phrase "hey canary"
# NOTE: this is an unfinished sketch. A transformers pipeline cannot record from
# a microphone, so `get_microphone_chunk()` below is a hypothetical helper that
# would wrap a real capture loop (e.g. sounddevice or pyaudio).
def capture_audio():
    # leftover manifest setup from the NeMo workflow above; unused in this
    # sketch (`audio_filepath` is not even defined here)
    utt_id = uuid.uuid4()
    with tempfile.TemporaryDirectory() as tmpdir:
        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
        manifest_data = {
            "audio_filepath": converted_audio_filepath,
            "source_lang": "en",
            "target_lang": "en",
            "taskname": taskname,
            "pnc": pnc,
            "answer": "predict",
            "duration": 10,
        }
        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')

    print("Listening for cue words...")
    while True:
        audio_input = get_microphone_chunk()  # hypothetical helper, not a real API
        transcript = asr_pipeline(audio_input)["text"]
        if "hey canary" in transcript.lower():
            print("Cue word detected!")
            break
        print("Listening...")
    return audio_input
# AI assistant function
def ai_assistant(audio_input):
    # Perform automatic speech recognition (ASR); the transformers ASR pipeline
    # returns a dict with a "text" key
    transcript = asr_pipeline(audio_input)["text"]
    # Perform question answering (QA) against a fixed context
    qa_result = qa_pipeline(question=transcript, context="Insert your context here")
    # Convert the QA answer to speech using text-to-speech (TTS)
    tts_output = tts_pipeline(qa_result['answer'])
    # gr.Audio with type="numpy" expects a (sample_rate, waveform) tuple
    return tts_output["sampling_rate"], tts_output["audio"]

if __name__ == "__main__":
    # Create a Gradio interface
    # (gr.inputs/gr.outputs were removed in Gradio 4, and gr.Audio has no
    # `capture=` parameter, so the cue-word loop above would have to run separately)
    gr.Interface(
        ai_assistant,
        inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak Here"),
        outputs=gr.Audio(type="numpy", label="Assistant's Response"),
        title="AI Assistant",
        description="An AI Assistant that answers questions based on your speech input.",
    ).launch()
'''