ssiidd committed
Commit 59df896 · 1 Parent(s): 8878265

Update code

Files changed (1)
  1. app.py +21 -298
app.py CHANGED
@@ -15,305 +15,27 @@ from espnet_model_zoo.downloader import ModelDownloader
  # vocoder_tagen = "none"
 
 
-
- audio_class_str='0."dog", 1."rooster", 2."pig", 3."cow", 4."frog", 5."cat", 6."hen", 7."insects", 8."sheep", 9."crow", 10."rain", 11."sea waves", 12."crackling fire", 13."crickets", 14."chirping birds", 15."water drops", 16."wind", 17."pouring water", 18."toilet flush", 19."thunderstorm", 20."crying baby", 21."sneezing", 22."clapping", 23."breathing", 24."coughing", 25."footsteps", 26."laughing", 27."brushing teeth", 28."snoring", 29."drinking sipping", 30."door wood knock", 31."mouse click", 32."keyboard typing", 33."door wood creaks", 34."can opening", 35."washing machine", 36."vacuum cleaner", 37."clock alarm", 38."clock tick", 39."glass breaking", 40."helicopter", 41."chainsaw", 42."siren", 43."car horn", 44."engine", 45."train", 46."church bells", 47."airplane", 48."fireworks", 49."hand saw".'
- audio_class_arr=audio_class_str.split(", ")
- audio_class_arr=[k.split('"')[1] for k in audio_class_arr]
-
- def inference(wav,data):
      # import pdb;pdb.set_trace()
      with torch.no_grad():
          speech, rate = soundfile.read(wav)
          if len(speech.shape)==2:
              speech=speech[:,0]
-         if data == "english_slurp":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|ner|> <|SLURP|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 beam_size=20,
-                 ctc_weight=0.0,
-                 penalty=0.1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("in:","")
-             scenario=intent.split("_")[0]
-             action=intent.split("_")[1]
-             ner_text=text.split(" SEP ")[1:-1]
-             text="INTENT: {scenario: "+scenario+", action: "+action+"}\n"
-             text=text+"NAMED ENTITIES: {"
-             for k in ner_text:
-                 slot_name=k.split(" FILL ")[0].replace("sl:","")
-                 slot_val=k.split(" FILL ")[1]
-                 text=text+" "+slot_name+" : "+slot_val+","
-             text=text+"}"
-         elif data == "english_fsc":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|ic|> <|fsc|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("in:","")
-             action=intent.split("_")[0]
-             objects=intent.split("_")[1]
-             location=intent.split("_")[2]
-             text="INTENT: {action: "+action+", object: "+objects+", location: "+location+"}"
-         elif data == "english_snips":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|ic|> <|SNIPS|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("in:","")
-             text="INTENT: "+intent
-         elif data == "dutch_scr":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|nl|> <|scr|> <|grabo_scr|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=20,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0]
-             text="SPEECH COMMAND: "+intent
-         elif data == "english_scr":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|scr|> <|google_scr|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("command:","")
-             text="SPEECH COMMAND: "+intent
-         elif data == "lithuanian_scr":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|lt|> <|scr|> <|lt_scr|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text
-             text="SPEECH COMMAND: "+intent
-         elif data == "arabic_scr":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|ar|> <|scr|> <|ar_scr|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("command:","")
-             text="SPEECH COMMAND: "+intent
-         elif data == "lid_voxforge":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lid_prompt=True,
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             # import pdb;pdb.set_trace()
-             lang=speech2text.converter.tokenizer.tokenizer.convert_ids_to_tokens(nbests[0][2][0]).replace("|>","").replace("<|","")
-             text="LANG: "+lang
-         elif data == "fake_speech_detection_asvspoof":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|fsd|> <|asvspoof|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("class:","")
-             text="SPEECH CLASS: "+intent
-         elif data == "emotion_rec_iemocap":
-             replace_dict={}
-             replace_dict["em:neu"]="Neutral"
-             replace_dict["em:ang"]="Angry"
-             replace_dict["em:sad"]="Sad"
-             replace_dict["em:hap"]="Happy"
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|er|> <|iemocap|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=replace_dict[text.split(" ")[0]]
-             text="EMOTION: "+intent
-         elif data == "accent_classify_accentdb":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|accent_rec|> <|accentdb|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("accent:","")
-             text="ACCENT: "+intent
-         elif data == "sarcasm_mustard":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|scd|> <|mustard|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("class:","")
-             text="SARCASM CLASS: "+intent
-         elif data == "sarcasm_mustard_plus":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|scd|> <|mustard_plus_plus|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("class:","")
-             text="SARCASM CLASS: "+intent
-         elif data == "gender_voxceleb1":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|gid|> <|voxceleb|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("gender:f","female").replace("gender:m","male")
-             text="GENDER: "+intent
-         elif data == "audio_classification_esc50":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|audio|> <|auc|> <|esc50|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1]
-             intent=text.split(" ")[0].replace("audio_class:","")
-             text="AUDIO EVENT CLASS: "+audio_class_arr[int(intent)]
-         elif data == "semantic_parsing_stop":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lang_prompt_token="<|en|> <|sp|> <|STOP|>",
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=20,
-                 penalty=0.1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             text, *_ = nbests[0]
-             text=text.split("|>")[-1].replace("_STOP","")
-             text="SEMANTIC PARSE SEQUENCE: "+text
-         elif data == "vad_freesound":
-             speech2text = Speech2Text.from_pretrained(
-                 asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
-                 asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
-                 # Decoding parameters are not included in the model file
-                 lid_prompt=True,
-                 prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
-                 ctc_weight=0.0,
-                 beam_size=1,
-                 nbest=1
-             )
-             nbests = speech2text(speech)
-             lang=speech2text.converter.tokenizer.tokenizer.convert_ids_to_tokens(nbests[0][2][0])
-             if lang=="<|nospeech|>":
-                 text="VAD: no speech"
-             else:
-                 text="VAD: speech"
      # if lang == "chinese":
      # wav = text2speechch(text)["wav"]
      # scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
@@ -322,16 +44,17 @@ def inference(wav,data):
      # scipy.io.wavfile.write("out.wav",text2speechjp.fs , wav.view(-1).cpu().numpy())
      return text
 
- title = "UniverSLU"
- description = "Gradio demo for UniverSLU Task Specifier (https://huggingface.co/espnet/UniverSLU-17-Task-Specifier). UniverSLU-17 Task Specifier is a Multi-task Spoken Language Understanding model from CMU WAVLab. It adapts Whisper to additional tasks using single-token task specifiers. To use it, simply record your audio or click one of the examples to load them. More details about the SLU tasks that the model is trained on and it's performance on these tasks can be found in our paper: https://aclanthology.org/2024.naacl-long.151/"
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
 
- examples=[['audio_slurp_ner.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch_scr"],['audio_english_scr.wav',"english_scr"],['audio_lt_scr.wav',"lithuanian_scr"],['audio_ar_scr.wav',"arabic_scr"],['audio_snips.wav',"english_snips"],['audio_lid.wav',"lid_voxforge"],['audio_fsd.wav',"fake_speech_detection_asvspoof"],['audio_er.wav',"emotion_rec_iemocap"],['audio_acc.wav',"accent_classify_accentdb"],['audio_mustard.wav',"sarcasm_mustard"],['audio_mustard_plus.wav',"sarcasm_mustard_plus"],['audio_voxceleb1.wav',"gender_voxceleb1"],['audio_esc50.wav',"audio_classification_esc50"],['audio_stop.wav',"semantic_parsing_stop"],['audio_freesound.wav',"vad_freesound"]]
 
 
  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
  gr.Interface(
      inference,
-     [gr.Audio(label="input audio",sources=["microphone"],type="filepath"),gr.Radio(choices=["english_slurp","english_fsc","dutch_scr","english_scr","lithuanian_scr","arabic_scr","english_snips","lid_voxforge","fake_speech_detection_asvspoof","emotion_rec_iemocap","accent_classify_accentdb","sarcasm_mustard","sarcasm_mustard_plus","gender_voxceleb1","audio_classification_esc50","semantic_parsing_stop","vad_freesound"], type="value", label="Task")],
      gr.Textbox(type="text", label="Output"),
      title=title,
      cache_examples=False,
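
The seventeen removed branches above differ only in their prompt tokens and a couple of decoding knobs; the checkpoint paths, prompt_token_file, ctc_weight, and nbest are repeated verbatim in every branch. For comparison with the single-prompt replacement shown next, here is a table-driven sketch of the same dispatch. It is illustrative only: TASK_PROMPTS and make_decoder are hypothetical names, not part of app.py, and the sketch assumes the Speech2Text import and UniverSLU-17-Task-Specifier checkpoint layout already used above.

# Hypothetical condensation of the removed if/elif chain: one row per task,
# with (prompt tokens, beam_size) copied from the branches above. Per-task
# output post-processing and the penalty/lid_prompt special cases are elided.
TASK_PROMPTS = {
    "english_slurp": ("<|en|> <|ner|> <|SLURP|>", 20),
    "dutch_scr": ("<|nl|> <|scr|> <|grabo_scr|>", 20),
    "english_scr": ("<|en|> <|scr|> <|google_scr|>", 1),
    "lithuanian_scr": ("<|lt|> <|scr|> <|lt_scr|>", 1),
    "arabic_scr": ("<|ar|> <|scr|> <|ar_scr|>", 1),
    # ... remaining tasks follow the same pattern ...
}

def make_decoder(task):
    # Same checkpoint and shared kwargs as every removed branch; only the
    # task-specifier prompt and the beam size vary per task.
    prompt, beam = TASK_PROMPTS[task]
    return Speech2Text.from_pretrained(
        asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
        asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
        lang_prompt_token=prompt,
        prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
        ctc_weight=0.0,
        beam_size=beam,
        nbest=1,
    )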
 
  # vocoder_tagen = "none"
 
 
+ def inference(wav,instruction):
      # import pdb;pdb.set_trace()
      with torch.no_grad():
          speech, rate = soundfile.read(wav)
          if len(speech.shape)==2:
              speech=speech[:,0]
+         speech2text = Speech2Text.from_pretrained(
+             asr_train_config="UniverSLU-17-Natural-Phrase/exp/asr_train_asr_whisper_full_correct_specaug_target_raw_en_whisper_multilingual/config.yaml",
+             asr_model_file="UniverSLU-17-Natural-Phrase/exp/asr_train_asr_whisper_full_correct_specaug_target_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
+             # Decoding parameters are not included in the model file
+             nlp_prompt_prev_token=instruction,
+             prompt_token_file="UniverSLU-17-Natural-Phrase/add_tokens-Copy1.txt",
+             ctc_weight=0.0,
+             beam_size=1,
+             nbest=1
+         )
+         nbests = speech2text(speech)
+         text, *_ = nbests[0]
+         instruction=instruction.split(" <|")[0]
+         # import pdb;pdb.set_trace()
+         text=text.replace(instruction,"").replace("_STOP","").split(".")[-1]
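
The rewritten inference builds one decoder and passes the user's instruction through nlp_prompt_prev_token instead of dispatching on a task name. A minimal sketch of driving it without the Gradio front end, assuming the UniverSLU-17-Natural-Phrase checkpoint directory is present next to app.py as the demo expects (the audio file and instruction string are taken from the examples list further down):

# Sketch: call the new inference() directly. Nothing here is new API; the
# instruction reproduces the audio_er.wav entry from the demo's examples.
if __name__ == "__main__":
    instruction = (
        'Emotion recognition of spoken utterance. The options are '
        '0."angry", 1."neutral", 2."sad", 3."happy", 4."other". '
        '<|startoftranscript|> <|en|>'
    )
    print(inference("audio_er.wav", instruction))
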
      # if lang == "chinese":
      # wav = text2speechch(text)["wav"]
      # scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
      # scipy.io.wavfile.write("out.wav",text2speechjp.fs , wav.view(-1).cpu().numpy())
      return text
 
+ title = "UniverSLU Natural Phrase"
48
+ description = "Gradio demo for UniverSLU Natural Phrase (https://huggingface.co/espnet/UniverSLU-17-Natural-Phrase). UniverSLU-17 Natural-Phrase is a Multi-task Spoken Language Understanding model from CMU WAVLab. It adapts Whisper to additional tasks through instruction tuning, i.e., finetuning by describing the task using natural language instructions followed by the list of label options. To use it, simply record your audio or click one of the examples to load them. More details about the SLU tasks that the model is trained on and it's performance on these tasks can be found in our paper: https://aclanthology.org/2024.naacl-long.151/"
49
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
50
 
51
+ # examples=[['audio_slurp_ner.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch_scr"],['audio_english_scr.wav',"english_scr"],['audio_lt_scr.wav',"lithuanian_scr"],['audio_ar_scr.wav',"arabic_scr"],['audio_snips.wav',"english_snips"],['audio_lid.wav',"lid_voxforge"],['audio_fsd.wav',"fake_speech_detection_asvspoof"],['audio_er.wav',"emotion_rec_iemocap"],['audio_acc.wav',"accent_classify_accentdb"],['audio_mustard.wav',"sarcasm_mustard"],['audio_mustard_plus.wav',"sarcasm_mustard_plus"],['audio_voxceleb1.wav',"gender_voxceleb1"],['audio_esc50.wav',"audio_classification_esc50"],['audio_stop.wav',"semantic_parsing_stop"]]
52
+ examples=[['audio_slurp_ner.flac','Identify the named entities in the spoken words. <|startoftranscript|> <|en|>'],['audio_fsc.wav','Intent classification of spoken utterance. The options are 0."increase heat washroom", 1."deactivate lights", 2."deactivate lights bedroom", 3."decrease heat", 4."deactivate lights kitchen", 5."change language", 6."activate music", 7."change language English", 8."activate lights", 9."deactivate lights washroom", 10."change language German", 11."decrease heat kitchen", 12."increase volume", 13."decrease heat bedroom", 14."deactivate music", 15."decrease volume", 16."change language Chinese", 17."decrease heat washroom", 18."change language Korean", 19."increase heat", 20."bring newspaper", 21."activate lamp", 22."deactivate lamp", 23."bring juice", 24."activate lights kitchen", 25."increase heat kitchen", 26."bring socks", 27."activate lights bedroom", 28."increase heat bedroom", 29."activate lights washroom", 30."bring shoes". <|startoftranscript|> <|en|>'],['audio_grabo.wav','Recognize speech command. The options are 0."lift position up", 1."pointer state on", 2."turn relative slow south", 3."turn absolute south", 4."move relative slow alot forward", 5."turn relative fast south", 6."turn relative fast west", 7."turn relative slow west", 8."move relative slow alot backward", 9."move absolute slow right down", 10."move relative fast alot backward", 11."pointer state off", 12."grab grabber open", 13."move relative slow normal backward", 14."move absolute fast centerx centery", 15."approach slow", 16."turn absolute west", 17."move relative slow normal forward", 18."move absolute fast left up", 19."turn relative slow east", 20."move relative fast alot forward", 21."lift position down", 22."turn relative fast east", 23."move relative fast little forward", 24."move relative fast little backward", 25."move relative fast normal backward", 26."approach fast", 27."move absolute fast right down", 28."grab grabber close", 29."move absolute slow centerx centery", 30."turn absolute east", 31."move relative slow little forward", 32."turn absolute north", 33."move relative slow little backward", 34."move absolute slow left up", 35."move relative fast normal forward". <|startoftranscript|> <|nl|>'],['audio_english_scr.wav','Recognize speech command. The options are 0."yes", 1."down", 2."no", 3."stop", 4."go", 5."on", 6."left", 7."right", 8."unknown", 9."silence", 10."off", 11."up". <|startoftranscript|> <|en|>'],['audio_lt_scr.wav','Recognize speech command. The options are 0."ačiū", 1."iki", 2."išjunk", 3."labas", 4."ne", 5."pauzė", 6."startas", 7."stop", 8."unknown", 9."į_apačią", 10."į_dešinę", 11."į_kairę", 12."į_viršų", 13."įjunk". <|startoftranscript|> <|lt|>'],['audio_ar_scr.wav','Recognize speech command. The options are 0."A", 1."B", 2."C", 3."D", 4."E", 5."F", 6."0", 7."1", 8."2", 9."3", 10."4", 11."5", 12."6", 13."7", 14."8", 15."9". <|startoftranscript|> <|ar|>'],['audio_snips.wav','Intent classification of spoken utterance. The options are 0."Increase brightness", 1."Set light color", 2."Set light brightness", 3."Switch light on", 4."Decrease brightness", 5."Switch light off". <|startoftranscript|> <|en|>'],['audio_lid.wav','Determining the language in spoken speech. The options are 0."<|ru|>", 1."<|es|>", 2."<|it|>", 3."<|en|>", 4."<|fr|>", 5."<|de|>". <|startoftranscript|>'],['audio_fsd.wav','Distinguish between synthesized and converted speech from actual speech. The options are 0."spoof", 1."bonafide". 
<|startoftranscript|> <|en|>'],['audio_er.wav','Emotion recognition of spoken utterance. The options are 0."angry", 1."neutral", 2."sad", 3."happy", 4."other". <|startoftranscript|> <|en|>'],['audio_acc.wav','Accent classification in speech. The options are 0."american", 1."australian", 2."bangla", 3."british", 4."indian", 5."malayalam", 6."odiya", 7."telugu", 8."welsh". <|startoftranscript|> <|en|>'],['audio_mustard.wav','Determine if the speech is sarcastic. The options are 0."sarcasm", 1."not sarcasm". <|startoftranscript|> <|en|>'],['audio_voxceleb1.wav','Recognize the gender of the speaker. The options are 0."female", 1."male". <|startoftranscript|> <|en|>'],['audio_esc50.wav','Categorize the background noise in the audio. The options are 0."dog", 1."rooster", 2."pig", 3."cow", 4."frog", 5."cat", 6."hen", 7."insects", 8."sheep", 9."crow", 10."rain", 11."sea waves", 12."crackling fire", 13."crickets", 14."chirping birds", 15."water drops", 16."wind", 17."pouring water", 18."toilet flush", 19."thunderstorm", 20."crying baby", 21."sneezing", 22."clapping", 23."breathing", 24."coughing", 25."footsteps", 26."laughing", 27."brushing teeth", 28."snoring", 29."drinking sipping", 30."door wood knock", 31."mouse click", 32."keyboard typing", 33."door wood creaks", 34."can opening", 35."washing machine", 36."vacuum cleaner", 37."clock alarm", 38."clock tick", 39."glass breaking", 40."helicopter", 41."chainsaw", 42."siren", 43."car horn", 44."engine", 45."train", 46."church bells", 47."airplane", 48."fireworks", 49."hand saw". <|startoftranscript|> <|audio|>'],['audio_stop.wav','Develop the semantic parse of the spoken content. <|startoftranscript|> <|en|>'],['audio_freesound.wav','Identify if there is speech in the provided audio. The options are 0."no speech",1."speech". <|startoftranscript|>']]
 
  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
  gr.Interface(
      inference,
+     [gr.Audio(label="input audio",sources=["microphone"],type="filepath"),gr.Textbox(type="text", label="Instruction")],
      gr.Textbox(type="text", label="Output"),
      title=title,
      cache_examples=False,