ssiidd committed
Commit 59df896 · 1 Parent(s): 8878265

Update code

Files changed (1)
  1. app.py +21 -298
app.py CHANGED
@@ -15,305 +15,27 @@ from espnet_model_zoo.downloader import ModelDownloader
  # vocoder_tagen = "none"
 
 
-
- audio_class_str='0."dog", 1."rooster", 2."pig", 3."cow", 4."frog", 5."cat", 6."hen", 7."insects", 8."sheep", 9."crow", 10."rain", 11."sea waves", 12."crackling fire", 13."crickets", 14."chirping birds", 15."water drops", 16."wind", 17."pouring water", 18."toilet flush", 19."thunderstorm", 20."crying baby", 21."sneezing", 22."clapping", 23."breathing", 24."coughing", 25."footsteps", 26."laughing", 27."brushing teeth", 28."snoring", 29."drinking sipping", 30."door wood knock", 31."mouse click", 32."keyboard typing", 33."door wood creaks", 34."can opening", 35."washing machine", 36."vacuum cleaner", 37."clock alarm", 38."clock tick", 39."glass breaking", 40."helicopter", 41."chainsaw", 42."siren", 43."car horn", 44."engine", 45."train", 46."church bells", 47."airplane", 48."fireworks", 49."hand saw".'
- audio_class_arr=audio_class_str.split(", ")
- audio_class_arr=[k.split('"')[1] for k in audio_class_arr]
-
- def inference(wav,data):
      # import pdb;pdb.set_trace()
      with torch.no_grad():
          speech, rate = soundfile.read(wav)
          if len(speech.shape)==2:
              speech=speech[:,0]
-         if data == "english_slurp":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|ner|> <|SLURP|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 beam_size=20,
-                 ctc_weight=0.0,
-                 penalty=0.1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("in:","")
-             scenario=intent.split("_")[0]
-             action=intent.split("_")[1]
-             ner_text=text.split(" SEP ")[1:-1]
-             text="INTENT: {scenario: "+scenario+", action: "+action+"}\n"
-             text=text+"NAMED ENTITIES: {"
-             for k in ner_text:
-                 slot_name=k.split(" FILL ")[0].replace("sl:","")
-                 slot_val=k.split(" FILL ")[1]
-                 text=text+" "+slot_name+" : "+slot_val+","
-             text=text+"}"
-         elif data == "english_fsc":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|ic|> <|fsc|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("in:","")
-             action=intent.split("_")[0]
-             objects=intent.split("_")[1]
-             location=intent.split("_")[2]
-             text="INTENT: {action: "+action+", object: "+objects+", location: "+location+"}"
-         elif data == "english_snips":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|ic|> <|SNIPS|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("in:","")
-             text="INTENT: "+intent
-         elif data == "dutch_scr":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|nl|> <|scr|> <|grabo_scr|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=20,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0]
-             text="SPEECH COMMAND: "+intent
-         elif data == "english_scr":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|scr|> <|google_scr|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("command:","")
-             text="SPEECH COMMAND: "+intent
-         elif data == "lithuanian_scr":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|lt|> <|scr|> <|lt_scr|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text
-             text="SPEECH COMMAND: "+intent
-         elif data == "arabic_scr":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|ar|> <|scr|> <|ar_scr|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("command:","")
-             text="SPEECH COMMAND: "+intent
-         elif data == "lid_voxforge":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lid_prompt=True,
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             # import pdb;pdb.set_trace()
-             lang=speech2text.converter.tokenizer.tokenizer.convert_ids_to_tokens(nbests[0][2][0]).replace("|>","").replace("<|","")
-             text="LANG: "+lang
-         elif data == "fake_speech_detection_asvspoof":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|fsd|> <|asvspoof|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("class:","")
-             text="SPEECH CLASS: "+intent
-         elif data == "emotion_rec_iemocap":
-             replace_dict={}
-             replace_dict["em:neu"]="Neutral"
-             replace_dict["em:ang"]="Angry"
-             replace_dict["em:sad"]="Sad"
-             replace_dict["em:hap"]="Happy"
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|er|> <|iemocap|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=replace_dict[text.split(" ")[0]]
-             text="EMOTION: "+intent
-         elif data == "accent_classify_accentdb":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|accent_rec|> <|accentdb|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("accent:","")
-             text="ACCENT: "+intent
-         elif data == "sarcasm_mustard":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|scd|> <|mustard|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("class:","")
-             text="SARCASM CLASS: "+intent
-         elif data == "sarcasm_mustard_plus":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|scd|> <|mustard_plus_plus|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("class:","")
-             text="SARCASM CLASS: "+intent
-         elif data == "gender_voxceleb1":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|gid|> <|voxceleb|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("gender:f","female").replace("gender:m","male")
-             text="GENDER: "+intent
-         elif data == "audio_classification_esc50":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|audio|> <|auc|> <|esc50|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("audio_class:","")
-             text="AUDIO EVENT CLASS: "+audio_class_arr[int(intent)]
-         elif data == "semantic_parsing_stop":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|sp|> <|STOP|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=20,
-                 penalty=0.1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1].replace("_STOP","")
-             text="SEMANTIC PARSE SEQUENCE: "+text
-         elif data == "vad_freesound":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lid_prompt=True,
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             lang=speech2text.converter.tokenizer.tokenizer.convert_ids_to_tokens(nbests[0][2][0])
-             if lang=="<|nospeech|>":
-                 text="VAD: no speech"
-             else:
-                 text="VAD: speech"
      # if lang == "chinese":
      # wav = text2speechch(text)["wav"]
      # scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
@@ -322,16 +44,17 @@ def inference(wav,data):
      # scipy.io.wavfile.write("out.wav",text2speechjp.fs , wav.view(-1).cpu().numpy())
      return text
 
- title = "UniverSLU"
- description = "Gradio demo for UniverSLU Task Specifier (https://huggingface.co/espnet/UniverSLU-17-Task-Specifier). UniverSLU-17 Task Specifier is a Multi-task Spoken Language Understanding model from CMU WAVLab. It adapts Whisper to additional tasks using single-token task specifiers. To use it, simply record your audio or click one of the examples to load them. More details about the SLU tasks that the model is trained on and it's performance on these tasks can be found in our paper: https://aclanthology.org/2024.naacl-long.151/"
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
 
- examples=[['audio_slurp_ner.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch_scr"],['audio_english_scr.wav',"english_scr"],['audio_lt_scr.wav',"lithuanian_scr"],['audio_ar_scr.wav',"arabic_scr"],['audio_snips.wav',"english_snips"],['audio_lid.wav',"lid_voxforge"],['audio_fsd.wav',"fake_speech_detection_asvspoof"],['audio_er.wav',"emotion_rec_iemocap"],['audio_acc.wav',"accent_classify_accentdb"],['audio_mustard.wav',"sarcasm_mustard"],['audio_mustard_plus.wav',"sarcasm_mustard_plus"],['audio_voxceleb1.wav',"gender_voxceleb1"],['audio_esc50.wav',"audio_classification_esc50"],['audio_stop.wav',"semantic_parsing_stop"],['audio_freesound.wav',"vad_freesound"]]
 
 
  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
  gr.Interface(
      inference,
-     [gr.Audio(label="input audio",sources=["microphone"],type="filepath"),gr.Radio(choices=["english_slurp","english_fsc","dutch_scr","english_scr","lithuanian_scr","arabic_scr","english_snips","lid_voxforge","fake_speech_detection_asvspoof","emotion_rec_iemocap","accent_classify_accentdb","sarcasm_mustard","sarcasm_mustard_plus","gender_voxceleb1","audio_classification_esc50","semantic_parsing_stop","vad_freesound"], type="value", label="Task")],
      gr.Textbox(type="text", label="Output"),
      title=title,
      cache_examples=False,
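
The seventeen removed branches above differ only in their prompt tokens and a couple of decoding knobs; the checkpoint paths, prompt_token_file, ctc_weight, and nbest are repeated verbatim in every branch. For comparison with the single-prompt replacement shown next, here is a table-driven sketch of the same dispatch. It is illustrative only: TASK_PROMPTS and make_decoder are hypothetical names, not part of app.py, and the sketch assumes the Speech2Text import and UniverSLU-17-Task-Specifier checkpoint layout already used above.

# Hypothetical condensation of the removed if/elif chain: one row per task,
# with (prompt tokens, beam_size) copied from the branches above. Per-task
# output post-processing and the penalty/lid_prompt special cases are elided.
TASK_PROMPTS = {
    "english_slurp": ("<|en|> <|ner|> <|SLURP|>", 20),
    "dutch_scr": ("<|nl|> <|scr|> <|grabo_scr|>", 20),
    "english_scr": ("<|en|> <|scr|> <|google_scr|>", 1),
    "lithuanian_scr": ("<|lt|> <|scr|> <|lt_scr|>", 1),
    "arabic_scr": ("<|ar|> <|scr|> <|ar_scr|>", 1),
    # ... remaining tasks follow the same pattern ...
}

def make_decoder(task):
    # Same checkpoint and shared kwargs as every removed branch; only the
    # task-specifier prompt and the beam size vary per task.
    prompt, beam = TASK_PROMPTS[task]
    return Speech2Text.from_pretrained(
        asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
        asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
        lang_prompt_token=prompt,
        prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
        ctc_weight=0.0,
        beam_size=beam,
        nbest=1,
    )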
 
  # vocoder_tagen = "none"
 
 
+ def inference(wav,instruction):
      # import pdb;pdb.set_trace()
      with torch.no_grad():
          speech, rate = soundfile.read(wav)
          if len(speech.shape)==2:
              speech=speech[:,0]
+         speech2text = Speech2Text.from_pretrained(
+             asr_train_config="UniverSLU-17-Natural-Phrase/exp/asr_train_asr_whisper_full_correct_specaug_target_raw_en_whisper_multilingual/config.yaml",
+             asr_model_file="UniverSLU-17-Natural-Phrase/exp/asr_train_asr_whisper_full_correct_specaug_target_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+             # Decoding parameters are not included in the model file
+             nlp_prompt_prev_token=instruction,
+             prompt_token_file="UniverSLU-17-Natural-Phrase/add_tokens-Copy1.txt",
+             ctc_weight=0.0,
+             beam_size=1,
+             nbest=1
+         )
+         nbests = speech2text(speech)
+         text, *_ = nbests[0]
+         instruction=instruction.split(" <|")[0]
+         # import pdb;pdb.set_trace()
+         text=text.replace(instruction,"").replace("_STOP","").split(".")[-1]
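
The rewritten inference builds one decoder and passes the user's instruction through nlp_prompt_prev_token instead of dispatching on a task name. A minimal sketch of driving it without the Gradio front end, assuming the UniverSLU-17-Natural-Phrase checkpoint directory is present next to app.py as the demo expects (the audio file and instruction string are taken from the examples list further down):

# Sketch: call the new inference() directly. Nothing here is new API; the
# instruction reproduces the audio_er.wav entry from the demo's examples.
if __name__ == "__main__":
    instruction = (
        'Emotion recognition of spoken utterance. The options are '
        '0."angry", 1."neutral", 2."sad", 3."happy", 4."other". '
        '<|startoftranscript|> <|en|>'
    )
    print(inference("audio_er.wav", instruction))
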
      # if lang == "chinese":
      # wav = text2speechch(text)["wav"]
      # scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
      # scipy.io.wavfile.write("out.wav",text2speechjp.fs , wav.view(-1).cpu().numpy())
      return text
 
+ title = "UniverSLU Natural Phrase"
48
+ description = "Gradio demo for UniverSLU Natural Phrase (https://huggingface.co/espnet/UniverSLU-17-Natural-Phrase). UniverSLU-17 Natural-Phrase is a Multi-task Spoken Language Understanding model from CMU WAVLab. It adapts Whisper to additional tasks through instruction tuning, i.e., finetuning by describing the task using natural language instructions followed by the list of label options. To use it, simply record your audio or click one of the examples to load them. More details about the SLU tasks that the model is trained on and it's performance on these tasks can be found in our paper: https://aclanthology.org/2024.naacl-long.151/"
49
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
50
 
51
+ # examples=[['audio_slurp_ner.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch_scr"],['audio_english_scr.wav',"english_scr"],['audio_lt_scr.wav',"lithuanian_scr"],['audio_ar_scr.wav',"arabic_scr"],['audio_snips.wav',"english_snips"],['audio_lid.wav',"lid_voxforge"],['audio_fsd.wav',"fake_speech_detection_asvspoof"],['audio_er.wav',"emotion_rec_iemocap"],['audio_acc.wav',"accent_classify_accentdb"],['audio_mustard.wav',"sarcasm_mustard"],['audio_mustard_plus.wav',"sarcasm_mustard_plus"],['audio_voxceleb1.wav',"gender_voxceleb1"],['audio_esc50.wav',"audio_classification_esc50"],['audio_stop.wav',"semantic_parsing_stop"]]
52
+ examples=[['audio_slurp_ner.flac','Identify the named entities in the spoken words. <|startoftranscript|> <|en|>'],['audio_fsc.wav','Intent classification of spoken utterance. The options are 0."increase heat washroom", 1."deactivate lights", 2."deactivate lights bedroom", 3."decrease heat", 4."deactivate lights kitchen", 5."change language", 6."activate music", 7."change language English", 8."activate lights", 9."deactivate lights washroom", 10."change language German", 11."decrease heat kitchen", 12."increase volume", 13."decrease heat bedroom", 14."deactivate music", 15."decrease volume", 16."change language Chinese", 17."decrease heat washroom", 18."change language Korean", 19."increase heat", 20."bring newspaper", 21."activate lamp", 22."deactivate lamp", 23."bring juice", 24."activate lights kitchen", 25."increase heat kitchen", 26."bring socks", 27."activate lights bedroom", 28."increase heat bedroom", 29."activate lights washroom", 30."bring shoes". <|startoftranscript|> <|en|>'],['audio_grabo.wav','Recognize speech command. The options are 0."lift position up", 1."pointer state on", 2."turn relative slow south", 3."turn absolute south", 4."move relative slow alot forward", 5."turn relative fast south", 6."turn relative fast west", 7."turn relative slow west", 8."move relative slow alot backward", 9."move absolute slow right down", 10."move relative fast alot backward", 11."pointer state off", 12."grab grabber open", 13."move relative slow normal backward", 14."move absolute fast centerx centery", 15."approach slow", 16."turn absolute west", 17."move relative slow normal forward", 18."move absolute fast left up", 19."turn relative slow east", 20."move relative fast alot forward", 21."lift position down", 22."turn relative fast east", 23."move relative fast little forward", 24."move relative fast little backward", 25."move relative fast normal backward", 26."approach fast", 27."move absolute fast right down", 28."grab grabber close", 29."move absolute slow centerx centery", 30."turn absolute east", 31."move relative slow little forward", 32."turn absolute north", 33."move relative slow little backward", 34."move absolute slow left up", 35."move relative fast normal forward". <|startoftranscript|> <|nl|>'],['audio_english_scr.wav','Recognize speech command. The options are 0."yes", 1."down", 2."no", 3."stop", 4."go", 5."on", 6."left", 7."right", 8."unknown", 9."silence", 10."off", 11."up". <|startoftranscript|> <|en|>'],['audio_lt_scr.wav','Recognize speech command. The options are 0."ačiū", 1."iki", 2."išjunk", 3."labas", 4."ne", 5."pauzė", 6."startas", 7."stop", 8."unknown", 9."į_apačią", 10."į_dešinę", 11."į_kairę", 12."į_viršų", 13."įjunk". <|startoftranscript|> <|lt|>'],['audio_ar_scr.wav','Recognize speech command. The options are 0."A", 1."B", 2."C", 3."D", 4."E", 5."F", 6."0", 7."1", 8."2", 9."3", 10."4", 11."5", 12."6", 13."7", 14."8", 15."9". <|startoftranscript|> <|ar|>'],['audio_snips.wav','Intent classification of spoken utterance. The options are 0."Increase brightness", 1."Set light color", 2."Set light brightness", 3."Switch light on", 4."Decrease brightness", 5."Switch light off". <|startoftranscript|> <|en|>'],['audio_lid.wav','Determining the language in spoken speech. The options are 0."<|ru|>", 1."<|es|>", 2."<|it|>", 3."<|en|>", 4."<|fr|>", 5."<|de|>". <|startoftranscript|>'],['audio_fsd.wav','Distinguish between synthesized and converted speech from actual speech. The options are 0."spoof", 1."bonafide". 
<|startoftranscript|> <|en|>'],['audio_er.wav','Emotion recognition of spoken utterance. The options are 0."angry", 1."neutral", 2."sad", 3."happy", 4."other". <|startoftranscript|> <|en|>'],['audio_acc.wav','Accent classification in speech. The options are 0."american", 1."australian", 2."bangla", 3."british", 4."indian", 5."malayalam", 6."odiya", 7."telugu", 8."welsh". <|startoftranscript|> <|en|>'],['audio_mustard.wav','Determine if the speech is sarcastic. The options are 0."sarcasm", 1."not sarcasm". <|startoftranscript|> <|en|>'],['audio_voxceleb1.wav','Recognize the gender of the speaker. The options are 0."female", 1."male". <|startoftranscript|> <|en|>'],['audio_esc50.wav','Categorize the background noise in the audio. The options are 0."dog", 1."rooster", 2."pig", 3."cow", 4."frog", 5."cat", 6."hen", 7."insects", 8."sheep", 9."crow", 10."rain", 11."sea waves", 12."crackling fire", 13."crickets", 14."chirping birds", 15."water drops", 16."wind", 17."pouring water", 18."toilet flush", 19."thunderstorm", 20."crying baby", 21."sneezing", 22."clapping", 23."breathing", 24."coughing", 25."footsteps", 26."laughing", 27."brushing teeth", 28."snoring", 29."drinking sipping", 30."door wood knock", 31."mouse click", 32."keyboard typing", 33."door wood creaks", 34."can opening", 35."washing machine", 36."vacuum cleaner", 37."clock alarm", 38."clock tick", 39."glass breaking", 40."helicopter", 41."chainsaw", 42."siren", 43."car horn", 44."engine", 45."train", 46."church bells", 47."airplane", 48."fireworks", 49."hand saw". <|startoftranscript|> <|audio|>'],['audio_stop.wav','Develop the semantic parse of the spoken content. <|startoftranscript|> <|en|>'],['audio_freesound.wav','Identify if there is speech in the provided audio. The options are 0."no speech",1."speech". <|startoftranscript|>']]
 
  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
  gr.Interface(
      inference,
+     [gr.Audio(label="input audio",sources=["microphone"],type="filepath"),gr.Textbox(type="text", label="Instruction")],
      gr.Textbox(type="text", label="Output"),
      title=title,
      cache_examples=False,