jiuuee committed on
Commit 1c50673 · verified · 1 Parent(s): da5219e

Update app.py

Files changed (1)
  1. app.py +234 -0
app.py CHANGED
@@ -1,3 +1,8 @@
+
+
+
+
+ '''
 import gradio as gr
 from transformers import pipeline
 
@@ -5,9 +10,237 @@ from transformers import pipeline
 asr_pipeline = pipeline("automatic-speech-recognition", model="nvidia/canary-1b", device=0)
 qa_pipeline = pipeline("question-answering", model="LLAMA/llama3-base-qa", tokenizer="LLAMA/llama3-base-qa")
 tts_pipeline = pipeline("text-to-speech", model="patrickvonplaten/vits-large", device=0)
+ '''
+ import gradio as gr
+ import json
+ import librosa
+ import os
+ import soundfile as sf
+ import tempfile
+ import uuid
+
+ import torch
+
+ from nemo.collections.asr.models import ASRModel
+ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
+ from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
+
+ SAMPLE_RATE = 16000  # Hz
+ MAX_AUDIO_MINUTES = 10  # won't try to transcribe if longer than this
+
+ model = ASRModel.from_pretrained("nvidia/canary-1b")
+ model.eval()
+
+ # make sure beam size always 1 for consistency
+ model.change_decoding_strategy(None)
+ decoding_cfg = model.cfg.decoding
+ decoding_cfg.beam.beam_size = 1
+ model.change_decoding_strategy(decoding_cfg)
+
+ # setup for buffered inference
+ model.cfg.preprocessor.dither = 0.0
+ model.cfg.preprocessor.pad_to = 0
+
+ feature_stride = model.cfg.preprocessor['window_stride']
+ model_stride_in_secs = feature_stride * 8  # 8 = model stride, which is 8 for FastConformer
+
+ frame_asr = FrameBatchMultiTaskAED(
+     asr_model=model,
+     frame_len=40.0,
+     total_buffer=40.0,
+     batch_size=16,
+ )
+
+ amp_dtype = torch.float16
+
+ def convert_audio(audio_filepath, tmpdir, utt_id):
+     """
+     Convert all files to monochannel 16 kHz wav files.
+     Do not convert and raise error if audio too long.
+     Returns output filename and duration.
+     """
+
+     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+
+     duration = librosa.get_duration(y=data, sr=sr)
+
+     if duration / 60.0 > MAX_AUDIO_MINUTES:
+         raise gr.Error(
+             f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
+             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
+             "(click on the scissors icon to start trimming audio)."
+         )
+
+     if sr != SAMPLE_RATE:
+         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
+
+     out_filename = os.path.join(tmpdir, utt_id + '.wav')
+
+     # save output audio
+     sf.write(out_filename, data, SAMPLE_RATE)
+
+     return out_filename, duration
+
+
+ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
+     src_lang = "en"
+     tgt_lang = "en"
+     pnc = "no"
+
+     if audio_filepath is None:
+         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
+
+     utt_id = uuid.uuid4()
+     with tempfile.TemporaryDirectory() as tmpdir:
+         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+
+         # map src_lang and tgt_lang from long versions to short
+         LANG_LONG_TO_LANG_SHORT = {
+             "English": "en",
+             "Spanish": "es",
+             "French": "fr",
+             "German": "de",
+         }
+         if src_lang not in LANG_LONG_TO_LANG_SHORT.keys():
+             raise ValueError(f"src_lang must be one of {LANG_LONG_TO_LANG_SHORT.keys()}")
+         else:
+             src_lang = LANG_LONG_TO_LANG_SHORT[src_lang]
+
+         if tgt_lang not in LANG_LONG_TO_LANG_SHORT.keys():
+             raise ValueError(f"tgt_lang must be one of {LANG_LONG_TO_LANG_SHORT.keys()}")
+         else:
+             tgt_lang = LANG_LONG_TO_LANG_SHORT[tgt_lang]
+
+
+         # infer taskname from src_lang and tgt_lang
+         if src_lang == tgt_lang:
+             taskname = "asr"
+         else:
+             taskname = "s2t_translation"
+
+         # update pnc variable to be "yes" or "no"
+         pnc = "yes" if pnc else "no"
+
+         # make manifest file and save
+         manifest_data = {
+             "audio_filepath": converted_audio_filepath,
+             "source_lang": src_lang,
+             "target_lang": tgt_lang,
+             "taskname": taskname,
+             "pnc": pnc,
+             "answer": "predict",
+             "duration": str(duration),
+         }
+
+         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
+
+         with open(manifest_filepath, 'w') as fout:
+             line = json.dumps(manifest_data)
+             fout.write(line + '\n')
+
+         # call transcribe, passing in manifest filepath
+         if duration < 40:
+             output_text = model.transcribe(manifest_filepath)[0]
+         else:  # do buffered inference
+             with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
+                 with torch.no_grad():
+                     hyps = get_buffered_pred_feat_multitaskAED(
+                         frame_asr,
+                         model.cfg.preprocessor,
+                         model_stride_in_secs,
+                         model.device,
+                         manifest=manifest_filepath,
+                         filepaths=None,
+                     )
+
+                     output_text = hyps[0].text
+
+     return output_text
+
+
+ with gr.Blocks(
+     title="NeMo Canary Model",
+     css="""
+     textarea { font-size: 18px;}
+     #model_output_text_box span {
+         font-size: 18px;
+         font-weight: bold;
+     }
+     """,
+     theme=gr.themes.Default(text_size=gr.themes.sizes.text_lg)  # make text slightly bigger (default is text_md)
+ ) as demo:
+
+     gr.HTML("<h1 style='text-align: center'>NeMo Canary model: Transcribe & Translate audio</h1>")
+
+     with gr.Row():
+         with gr.Column():
+             gr.HTML(
+                 "<p><b>Step 1:</b> Upload an audio file or record with your microphone.</p>"
+
+                 "<p style='color: #A0A0A0;'>This demo supports audio files up to 10 mins long. "
+                 "You can transcribe longer files locally with this NeMo "
+                 "<a href='https://github.com/NVIDIA/NeMo/blob/main/examples/asr/speech_multitask/speech_to_text_aed_chunked_infer.py'>script</a>.</p>"
+             )
+
+             audio_file = gr.Audio(sources=["microphone", "upload"], type="filepath")
+
+             gr.HTML("<p><b>Step 2:</b> Choose the input and output language.</p>")
+
+
+         with gr.Column():
+
+             gr.HTML("<p><b>Step 3:</b> Run the model.</p>")
+
+             go_button = gr.Button(
+                 value="Run model",
+                 variant="primary",  # make "primary" so it stands out (default is "secondary")
+             )
+
+             model_output_text_box = gr.Textbox(
+                 label="Model Output",
+                 elem_id="model_output_text_box",
+             )
+
+     with gr.Row():
+
+         gr.HTML(
+             "<p style='text-align: center'>"
+             "🐀 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
+             "🧑‍💻 <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
+             "</p>"
+         )
+
+     go_button.click(
+         fn=transcribe,
+         inputs=[audio_file, src_lang, tgt_lang, pnc],
+         outputs=[model_output_text_box]
+     )
+
+
+ demo.queue()
+ demo.launch()
+
+ '''
+
 
 # Function to capture audio using Canary ASR
 def capture_audio():
+     utt_id = uuid.uuid4()
+     with tempfile.TemporaryDirectory() as tmpdir:
+         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+
+         manifest_data = {
+             "audio_filepath": converted_audio_filepath,
+             "source_lang": "en",
+             "target_lang": "en",
+             "taskname": taskname,
+             "pnc": pnc,
+             "answer": "predict",
+             "duration": 10,
+         }
+
+         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
+
     print("Listening for cue words...")
     while True:
         audio_input = asr_pipeline(None)[0]['input_values']
@@ -38,3 +271,4 @@ if __name__ == "__main__":
     outputs=gr.outputs.Audio(type="audio", label="Assistant's Response"),
     title="AI Assistant",
     description="An AI Assistant that answers questions based on your speech input.").launch()
+ '''
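
For reference, the manifest-driven call that the new transcribe() function builds up can be exercised outside of Gradio roughly as follows. This is a minimal sketch based only on the code in this commit: the path sample.wav is a placeholder, and the clip is assumed to be mono, 16 kHz, and shorter than 40 seconds so the direct (non-buffered) branch applies.

import json
import os
import tempfile

from nemo.collections.asr.models import ASRModel

# Load the same checkpoint the app uses.
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# One manifest entry per utterance, mirroring the fields transcribe() writes.
manifest_entry = {
    "audio_filepath": "sample.wav",  # placeholder: mono 16 kHz wav, < 40 s
    "source_lang": "en",
    "target_lang": "en",
    "taskname": "asr",
    "pnc": "yes",
    "answer": "predict",
    "duration": "10",  # approximate clip length in seconds (illustrative)
}

with tempfile.TemporaryDirectory() as tmpdir:
    manifest_filepath = os.path.join(tmpdir, "manifest.json")
    with open(manifest_filepath, "w") as fout:
        fout.write(json.dumps(manifest_entry) + "\n")

    # Short audio takes the direct path; longer audio would go through the
    # buffered FrameBatchMultiTaskAED route shown in the diff above.
    output_text = model.transcribe(manifest_filepath)[0]
    print(output_text)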