Update app.py
Browse files
app.py
CHANGED
@@ -1,33 +1,3 @@
|
|
1 |
-
# import gradio as gr
|
2 |
-
# import torch
|
3 |
-
# from transformers import pipeline, AutoTokenizer
|
4 |
-
# from nemo.collections.asr.models import EncDecMultiTaskModel
|
5 |
-
|
6 |
-
# # load model
|
7 |
-
# canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
|
8 |
-
|
9 |
-
# # update dcode params
|
10 |
-
# decode_cfg = canary_model.cfg.decoding
|
11 |
-
# decode_cfg.beam.beam_size = 1
|
12 |
-
# canary_model.change_decoding_strategy(decode_cfg)
|
13 |
-
|
14 |
-
# pipe = pipeline(
|
15 |
-
# "automatic-speech-recognition",
|
16 |
-
# model="nvidia/canary-1b"
|
17 |
-
# )
|
18 |
-
|
19 |
-
# # pipe = pipeline(
|
20 |
-
# # "text-generation",
|
21 |
-
# # model="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
|
22 |
-
# # model_kwargs={"torch_dtype": torch.bfloat16},
|
23 |
-
# # device_map="auto"
|
24 |
-
# # )
|
25 |
-
|
26 |
-
# gr.Interface.from_pipeline(pipe,
|
27 |
-
# title="ASR",
|
28 |
-
# description="Using pipeline with Canary-1B",
|
29 |
-
# ).launch(inbrowser=True)
|
30 |
-
|
31 |
import gradio as gr
|
32 |
import json
|
33 |
import librosa
|
@@ -171,6 +141,104 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
|
|
171 |
|
172 |
return output_text
|
173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
with gr.Blocks(
|
175 |
title="NeMo Canary Model",
|
176 |
css="""
|
@@ -230,32 +298,32 @@ with gr.Blocks(
|
|
230 |
elem_id="model_output_text_box",
|
231 |
)
|
232 |
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
|
260 |
|
261 |
demo.queue()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
import librosa
|
|
|
141 |
|
142 |
return output_text
|
143 |
|
144 |
+
# add logic to make sure dropdown menus only suggest valid combos
|
145 |
+
def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
    """Rebuild the language dropdowns and the PnC checkbox after a language change.

    Intended as the callback for the ``change`` event of the source-language
    and target-language dropdowns, so that each menu only offers choices that
    lead to an allowed (source, target) combination.

    Args:
        src_lang_value: currently selected source language (string).
        tgt_lang_value: currently selected target language (string).
        pnc_value: current state of the punctuation & capitalization
            checkbox (bool).

    Returns:
        A ``(src_lang, tgt_lang, pnc)`` tuple of freshly built Gradio
        components (two dropdowns and one checkbox) to be displayed.

    Allowed combinations are same-language transcription, X -> English and
    English -> X translation. Viewed as a matrix ("Y" marks an allowed state):

                       tgt lang
                 - |EN |ES |FR |DE
                ------------------
                EN| Y | Y | Y | Y
                ------------------
        src     ES| Y | Y |   |
        lang    ------------------
                FR| Y |   | Y |
                ------------------
                DE| Y |   |   | Y

    The choices offered by each dropdown are exactly the cells reachable from
    the current state by changing only that dropdown.
    """
    all_languages = ("English", "Spanish", "French", "German")
    src_is_english = src_lang_value == "English"
    tgt_is_english = tgt_lang_value == "English"

    # Source menu: unrestricted while the target is English; otherwise the
    # source may only be English or the one non-English language in play.
    if tgt_is_english:
        src_choices = list(all_languages)
    elif src_is_english:
        src_choices = ["English", tgt_lang_value]
    else:
        src_choices = ["English", src_lang_value]

    # Target menu: the mirror image of the rule above.
    if src_is_english:
        tgt_choices = list(all_languages)
    elif tgt_is_english:
        tgt_choices = ["English", src_lang_value]
    else:
        tgt_choices = ["English", tgt_lang_value]

    src_lang = gr.Dropdown(
        choices=src_choices,
        value=src_lang_value,
        label="Input audio is spoken in:"
    )
    tgt_lang = gr.Dropdown(
        choices=tgt_choices,
        value=tgt_lang_value,
        label="Transcribe in language:"
    )

    # The checkbox stays user-controlled only when source and target match;
    # otherwise it is forced to True and locked.
    if src_lang_value == tgt_lang_value:
        pnc_kwargs = {"value": pnc_value, "interactive": True}
    else:
        pnc_kwargs = {"value": True, "interactive": False}
    pnc = gr.Checkbox(
        label="Punctuation & Capitalization in transcript?",
        **pnc_kwargs
    )

    return src_lang, tgt_lang, pnc
|
240 |
+
|
241 |
+
|
242 |
with gr.Blocks(
|
243 |
title="NeMo Canary Model",
|
244 |
css="""
|
|
|
298 |
elem_id="model_output_text_box",
|
299 |
)
|
300 |
|
301 |
+
with gr.Row():
|
302 |
+
|
303 |
+
gr.HTML(
|
304 |
+
"<p style='text-align: center'>"
|
305 |
+
"π€ <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
|
306 |
+
"π§βπ» <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
|
307 |
+
"</p>"
|
308 |
+
)
|
309 |
+
|
310 |
+
go_button.click(
|
311 |
+
fn=transcribe,
|
312 |
+
inputs = [audio_file, src_lang, tgt_lang, pnc],
|
313 |
+
outputs = [model_output_text_box]
|
314 |
+
)
|
315 |
+
|
316 |
+
# call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
|
317 |
+
src_lang.change(
|
318 |
+
fn=on_src_or_tgt_lang_change,
|
319 |
+
inputs=[src_lang, tgt_lang, pnc],
|
320 |
+
outputs=[src_lang, tgt_lang, pnc],
|
321 |
+
)
|
322 |
+
tgt_lang.change(
|
323 |
+
fn=on_src_or_tgt_lang_change,
|
324 |
+
inputs=[src_lang, tgt_lang, pnc],
|
325 |
+
outputs=[src_lang, tgt_lang, pnc],
|
326 |
+
)
|
327 |
|
328 |
|
329 |
demo.queue()
|