almncarlo committed on
Commit
4b980ad
·
verified ·
1 Parent(s): 7d4aec1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -56
app.py CHANGED
@@ -1,33 +1,3 @@
1
- # import gradio as gr
2
- # import torch
3
- # from transformers import pipeline, AutoTokenizer
4
- # from nemo.collections.asr.models import EncDecMultiTaskModel
5
-
6
- # # load model
7
- # canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
8
-
9
- # # update dcode params
10
- # decode_cfg = canary_model.cfg.decoding
11
- # decode_cfg.beam.beam_size = 1
12
- # canary_model.change_decoding_strategy(decode_cfg)
13
-
14
- # pipe = pipeline(
15
- # "automatic-speech-recognition",
16
- # model="nvidia/canary-1b"
17
- # )
18
-
19
- # # pipe = pipeline(
20
- # # "text-generation",
21
- # # model="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
22
- # # model_kwargs={"torch_dtype": torch.bfloat16},
23
- # # device_map="auto"
24
- # # )
25
-
26
- # gr.Interface.from_pipeline(pipe,
27
- # title="ASR",
28
- # description="Using pipeline with Canary-1B",
29
- # ).launch(inbrowser=True)
30
-
31
  import gradio as gr
32
  import json
33
  import librosa
@@ -171,6 +141,104 @@ def transcribe(audio_filepath, src_lang, tgt_lang, pnc):
171
 
172
  return output_text
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  with gr.Blocks(
175
  title="NeMo Canary Model",
176
  css="""
@@ -230,32 +298,32 @@ with gr.Blocks(
230
  elem_id="model_output_text_box",
231
  )
232
 
233
- with gr.Row():
234
-
235
- gr.HTML(
236
- "<p style='text-align: center'>"
237
- "🐀 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
238
- "πŸ§‘β€πŸ’» <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
239
- "</p>"
240
- )
241
-
242
- go_button.click(
243
- fn=transcribe,
244
- inputs = [audio_file, src_lang, tgt_lang, pnc],
245
- outputs = [model_output_text_box]
246
- )
247
-
248
- # call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
249
- src_lang.change(
250
- fn=on_src_or_tgt_lang_change,
251
- inputs=[src_lang, tgt_lang, pnc],
252
- outputs=[src_lang, tgt_lang, pnc],
253
- )
254
- tgt_lang.change(
255
- fn=on_src_or_tgt_lang_change,
256
- inputs=[src_lang, tgt_lang, pnc],
257
- outputs=[src_lang, tgt_lang, pnc],
258
- )
259
 
260
 
261
  demo.queue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import json
3
  import librosa
 
141
 
142
  return output_text
143
 
144
+ # add logic to make sure dropdown menus only suggest valid combos
145
def on_src_or_tgt_lang_change(src_lang_value, tgt_lang_value, pnc_value):
    """Rebuild the language dropdowns and the PnC checkbox after a language change.

    The allowed (source, target) pairs are same-language transcription plus
    translation to or from English. Viewed as a matrix ("Y" = allowed state):

                  tgt lang
                | EN | ES | FR | DE
        src  EN | Y  | Y  | Y  | Y
        lang ES | Y  | Y  |    |
             FR | Y  |    | Y  |
             DE | Y  |    |    | Y

    Each dropdown only offers the values that keep the pair inside an allowed
    cell, given the current value of the other dropdown.

    Args:
        src_lang_value (str): currently selected source language.
        tgt_lang_value (str): currently selected target language.
        pnc_value (bool): current value of the punctuation & capitalization
            checkbox.

    Returns:
        tuple: ``(src_lang, tgt_lang, pnc)`` - a new ``gr.Dropdown`` for the
        source language, a new ``gr.Dropdown`` for the target language and a
        new ``gr.Checkbox`` for punctuation & capitalization.
    """

    def every_language():
        # Return a fresh list on each call so the two dropdowns never share
        # one list object.
        return ["English", "Spanish", "French", "German"]

    src_is_english = src_lang_value == "English"
    tgt_is_english = tgt_lang_value == "English"

    if src_is_english and tgt_is_english:
        # English -> English: either dropdown may move anywhere.
        src_choices = every_language()
        tgt_choices = every_language()
    elif src_is_english:
        # English -> X: source may stay English or become X; target is free.
        src_choices = ["English", tgt_lang_value]
        tgt_choices = every_language()
    elif tgt_is_english:
        # X -> English: source is free; target may stay English or become X.
        src_choices = every_language()
        tgt_choices = ["English", src_lang_value]
    else:
        # Neither side is English: each side may only stay put or switch to
        # English.
        src_choices = ["English", src_lang_value]
        tgt_choices = ["English", tgt_lang_value]

    src_lang = gr.Dropdown(
        choices=src_choices,
        value=src_lang_value,
        label="Input audio is spoken in:",
    )
    tgt_lang = gr.Dropdown(
        choices=tgt_choices,
        value=tgt_lang_value,
        label="Transcribe in language:",
    )

    # The checkbox is user-editable only when source and target match;
    # otherwise it is forced to True and locked.
    same_language = src_lang_value == tgt_lang_value
    pnc = gr.Checkbox(
        value=pnc_value if same_language else True,
        label="Punctuation & Capitalization in transcript?",
        interactive=same_language,
    )
    return src_lang, tgt_lang, pnc
240
+
241
+
242
  with gr.Blocks(
243
  title="NeMo Canary Model",
244
  css="""
 
298
  elem_id="model_output_text_box",
299
  )
300
 
301
+ with gr.Row():
302
+
303
+ gr.HTML(
304
+ "<p style='text-align: center'>"
305
+ "🐀 <a href='https://huggingface.co/nvidia/canary-1b' target='_blank'>Canary model</a> | "
306
+ "πŸ§‘β€πŸ’» <a href='https://github.com/NVIDIA/NeMo' target='_blank'>NeMo Repository</a>"
307
+ "</p>"
308
+ )
309
+
310
+ go_button.click(
311
+ fn=transcribe,
312
+ inputs = [audio_file, src_lang, tgt_lang, pnc],
313
+ outputs = [model_output_text_box]
314
+ )
315
+
316
+ # call on_src_or_tgt_lang_change whenever src_lang or tgt_lang dropdown menus are changed
317
+ src_lang.change(
318
+ fn=on_src_or_tgt_lang_change,
319
+ inputs=[src_lang, tgt_lang, pnc],
320
+ outputs=[src_lang, tgt_lang, pnc],
321
+ )
322
+ tgt_lang.change(
323
+ fn=on_src_or_tgt_lang_change,
324
+ inputs=[src_lang, tgt_lang, pnc],
325
+ outputs=[src_lang, tgt_lang, pnc],
326
+ )
327
 
328
 
329
  demo.queue()