Spaces:
Sleeping
Sleeping
Complete demo
Browse files- app.py +295 -64
- audio_acc.wav +0 -0
- audio_ar_scr.wav +0 -0
- audio_english_scr.wav +0 -0
- audio_er.wav +0 -0
- audio_esc50.wav +0 -0
- audio_freesound.wav +0 -0
- audio_fsd.wav +0 -0
- audio_lid.wav +0 -0
- audio_lt_scr.wav +0 -0
- audio_mustard.wav +0 -0
- audio_mustard_plus.wav +0 -0
- audio_slurp_ner.flac +0 -0
- audio_snips.wav +0 -0
- audio_stop.wav +0 -0
- audio_voxceleb1.wav +0 -0
app.py
CHANGED
@@ -14,74 +14,306 @@ from espnet_model_zoo.downloader import ModelDownloader
|
|
14 |
# tagen = 'kan-bayashi/ljspeech_vits'
|
15 |
# vocoder_tagen = "none"
|
16 |
|
17 |
-
speech2text_slurp = Speech2Text.from_pretrained(
|
18 |
-
asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
|
19 |
-
asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
|
20 |
-
# Decoding parameters are not included in the model file
|
21 |
-
lang_prompt_token="<|en|> <|ner|> <|SLURP|>",
|
22 |
-
prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
|
23 |
-
nbest=1
|
24 |
-
)
|
25 |
|
26 |
-
speech2text_fsc = Speech2Text.from_pretrained(
|
27 |
-
asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
|
28 |
-
asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
|
29 |
-
# Decoding parameters are not included in the model file
|
30 |
-
lang_prompt_token="<|en|> <|ic|> <|fsc|>",
|
31 |
-
prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
|
32 |
-
nbest=1
|
33 |
-
)
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
# Decoding parameters are not included in the model file
|
39 |
-
lang_prompt_token="<|nl|> <|scr|> <|grabo_scr|>",
|
40 |
-
prompt_token_file="UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt",
|
41 |
-
nbest=1
|
42 |
-
)
|
43 |
|
44 |
def inference(wav,data):
|
|
|
45 |
with torch.no_grad():
|
|
|
|
|
|
|
46 |
if data == "english_slurp":
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
text, *_ = nbests[0]
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
elif data == "english_fsc":
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
63 |
text, *_ = nbests[0]
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
78 |
text, *_ = nbests[0]
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
# if lang == "chinese":
|
86 |
# wav = text2speechch(text)["wav"]
|
87 |
# scipy.io.wavfile.write("out.wav",text2speechch.fs , wav.view(-1).cpu().numpy())
|
@@ -91,19 +323,18 @@ def inference(wav,data):
|
|
91 |
return text
|
92 |
|
93 |
title = "UniverSLU"
|
94 |
-
description = "Gradio demo for UniverSLU
|
95 |
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
|
96 |
|
97 |
-
examples=[['
|
98 |
|
99 |
# gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
|
100 |
gr.Interface(
|
101 |
inference,
|
102 |
-
[gr.Audio(label="input audio",
|
103 |
-
gr.Textbox(type="
|
104 |
title=title,
|
105 |
description=description,
|
106 |
article=article,
|
107 |
-
enable_queue=True,
|
108 |
examples=examples
|
109 |
-
).launch(debug=True)
|
|
|
14 |
# tagen = 'kan-bayashi/ljspeech_vits'
|
15 |
# vocoder_tagen = "none"
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
+
# Human-readable ESC-50 label table in the form `<index>."<name>", ...`.
# The position of each entry matches the numeric class id used to index
# `audio_class_arr`.
audio_class_str = '0."dog", 1."rooster", 2."pig", 3."cow", 4."frog", 5."cat", 6."hen", 7."insects", 8."sheep", 9."crow", 10."rain", 11."sea waves", 12."crackling fire", 13."crickets", 14."chirping birds", 15."water drops", 16."wind", 17."pouring water", 18."toilet flush", 19."thunderstorm", 20."crying baby", 21."sneezing", 22."clapping", 23."breathing", 24."coughing", 25."footsteps", 26."laughing", 27."brushing teeth", 28."snoring", 29."drinking sipping", 30."door wood knock", 31."mouse click", 32."keyboard typing", 33."door wood creaks", 34."can opening", 35."washing machine", 36."vacuum cleaner", 37."clock alarm", 38."clock tick", 39."glass breaking", 40."helicopter", 41."chainsaw", 42."siren", 43."car horn", 44."engine", 45."train", 46."church bells", 47."airplane", 48."fireworks", 49."hand saw".'

# Keep only the text between the double quotes of each comma-separated entry,
# so that audio_class_arr[i] is the name of class i.
audio_class_arr = [entry.split('"')[1] for entry in audio_class_str.split(", ")]
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
# Checkpoint artefacts shared by every task; only the decoding options differ.
_UNIVERSLU_EXP_DIR = (
    "UniverSLU-17-Task-Specifier/exp/"
    "asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual"
)
_UNIVERSLU_TRAIN_CONFIG = _UNIVERSLU_EXP_DIR + "/config.yaml"
_UNIVERSLU_MODEL_FILE = _UNIVERSLU_EXP_DIR + "/valid.acc.ave_10best.pth"
_UNIVERSLU_PROMPT_TOKEN_FILE = "UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt"

# Per-task decoding options (decoding parameters are not included in the
# model file). Options that are absent are left at the library default.
_TASK_DECODE_OPTIONS = {
    "english_slurp": {
        "lang_prompt_token": "<|en|> <|ner|> <|SLURP|>",
        "beam_size": 20,
        "penalty": 0.1,
    },
    "english_fsc": {"lang_prompt_token": "<|en|> <|ic|> <|fsc|>"},
    "english_snips": {"lang_prompt_token": "<|en|> <|ic|> <|SNIPS|>"},
    "dutch_scr": {
        "lang_prompt_token": "<|nl|> <|scr|> <|grabo_scr|>",
        "beam_size": 20,
    },
    "english_scr": {
        "lang_prompt_token": "<|en|> <|scr|> <|google_scr|>",
        "beam_size": 1,
    },
    "lithuanian_scr": {
        "lang_prompt_token": "<|lt|> <|scr|> <|lt_scr|>",
        "beam_size": 1,
    },
    "arabic_scr": {
        "lang_prompt_token": "<|ar|> <|scr|> <|ar_scr|>",
        "beam_size": 1,
    },
    "lid_voxforge": {"lid_prompt": True, "beam_size": 1},
    "fake_speech_detection_asvspoof": {
        "lang_prompt_token": "<|en|> <|fsd|> <|asvspoof|>",
        "beam_size": 1,
    },
    "emotion_rec_iemocap": {
        "lang_prompt_token": "<|en|> <|er|> <|iemocap|>",
        "beam_size": 1,
    },
    "accent_classify_accentdb": {
        "lang_prompt_token": "<|en|> <|accent_rec|> <|accentdb|>",
        "beam_size": 1,
    },
    "sarcasm_mustard": {
        "lang_prompt_token": "<|en|> <|scd|> <|mustard|>",
        "beam_size": 1,
    },
    "sarcasm_mustard_plus": {
        "lang_prompt_token": "<|en|> <|scd|> <|mustard_plus_plus|>",
        "beam_size": 1,
    },
    "gender_voxceleb1": {
        "lang_prompt_token": "<|en|> <|gid|> <|voxceleb|>",
        "beam_size": 1,
    },
    "audio_classification_esc50": {
        "lang_prompt_token": "<|audio|> <|auc|> <|esc50|>",
        "beam_size": 1,
    },
    "semantic_parsing_stop": {
        "lang_prompt_token": "<|en|> <|sp|> <|STOP|>",
        "beam_size": 20,
        "penalty": 0.1,
    },
    "vad_freesound": {"lid_prompt": True, "beam_size": 1},
}

# Tasks whose answer is read from the first predicted token id instead of the
# decoded text.
_FIRST_TOKEN_TASKS = frozenset({"lid_voxforge", "vad_freesound"})

# Tasks whose answer is simply the first word of the decoded text:
# task -> (marker removed from that word, heading shown to the user).
_LABEL_TASKS = {
    "english_snips": ("in:", "INTENT: "),
    "dutch_scr": ("", "SPEECH COMMAND: "),
    "english_scr": ("command:", "SPEECH COMMAND: "),
    "arabic_scr": ("command:", "SPEECH COMMAND: "),
    "fake_speech_detection_asvspoof": ("class:", "SPEECH CLASS: "),
    "accent_classify_accentdb": ("accent:", "ACCENT: "),
    "sarcasm_mustard": ("class:", "SARCASM CLASS: "),
    "sarcasm_mustard_plus": ("class:", "SARCASM CLASS: "),
}

_EMOTION_NAMES = {
    "em:neu": "Neutral",
    "em:ang": "Angry",
    "em:sad": "Sad",
    "em:hap": "Happy",
}

# Holds at most one loaded decoder, keyed by task name. Consecutive requests
# for the same task reuse it instead of re-reading the checkpoint from disk.
# NOTE(review): assumes requests are served one at a time (Gradio's default
# queue behaviour) - confirm before raising the concurrency limit.
_decoder_cache = {}


def _get_decoder(task):
    """Return the Speech2Text decoder for *task*, loading it on a cache miss."""
    if task not in _decoder_cache:
        # Drop the previous decoder first so that no more than one copy of the
        # model is kept alive, as when a fresh model was loaded per request.
        _decoder_cache.clear()
        _decoder_cache[task] = Speech2Text.from_pretrained(
            asr_train_config=_UNIVERSLU_TRAIN_CONFIG,
            asr_model_file=_UNIVERSLU_MODEL_FILE,
            prompt_token_file=_UNIVERSLU_PROMPT_TOKEN_FILE,
            ctc_weight=0.0,
            nbest=1,
            **_TASK_DECODE_OPTIONS[task],
        )
    return _decoder_cache[task]


def _leading_label(decoded, marker=""):
    """Return the first space-delimited word of *decoded* without *marker*."""
    label = decoded.split(" ")[0]
    return label.replace(marker, "") if marker else label


def _format_slurp(decoded):
    """Render a SLURP hypothesis as an intent line plus a named-entity line."""
    # Split on the first underscore only: SLURP actions can themselves contain
    # underscores (e.g. "iot_hue_lightoff"), which must not be truncated.
    scenario, _, action = _leading_label(decoded, "in:").partition("_")
    slots = []
    # The first " SEP " chunk holds the intent; the last chunk is skipped
    # (presumably the transcript - TODO confirm against the training recipe).
    for chunk in decoded.split(" SEP ")[1:-1]:
        fields = chunk.split(" FILL ")
        if len(fields) < 2:
            continue  # malformed slot without a value: skip instead of crashing
        slots.append(" " + fields[0].replace("sl:", "") + " : " + fields[1] + ",")
    return (
        "INTENT: {scenario: " + scenario + ", action: " + action + "}\n"
        + "NAMED ENTITIES: {" + "".join(slots) + "}"
    )


def _format_fsc(decoded):
    """Render a Fluent Speech Commands hypothesis as action/object/location."""
    parts = _leading_label(decoded, "in:").split("_")
    # Pad so that a short or malformed intent yields empty fields, not IndexError.
    action, target, location = (parts + ["", "", ""])[:3]
    return (
        "INTENT: {action: " + action + ", object: " + target
        + ", location: " + location + "}"
    )


def _format_decoded(task, decoded):
    """Turn the decoded text of a text-based task into the string shown to the user."""
    if task == "english_slurp":
        return _format_slurp(decoded)
    if task == "english_fsc":
        return _format_fsc(decoded)
    if task == "lithuanian_scr":
        # The whole decoded text is the command for this task.
        return "SPEECH COMMAND: " + decoded
    if task == "emotion_rec_iemocap":
        label = _leading_label(decoded)
        # Fall back to the raw label when the model emits an unexpected tag.
        return "EMOTION: " + _EMOTION_NAMES.get(label, label)
    if task == "gender_voxceleb1":
        label = _leading_label(decoded)
        return "GENDER: " + label.replace("gender:f", "female").replace("gender:m", "male")
    if task == "audio_classification_esc50":
        label = _leading_label(decoded, "audio_class:")
        try:
            label = audio_class_arr[int(label)]
        except (ValueError, IndexError):
            pass  # not a valid class id: show the raw label instead of crashing
        return "AUDIO EVENT CLASS: " + label
    if task == "semantic_parsing_stop":
        return "SEMANTIC PARSE SEQUENCE: " + decoded.replace("_STOP", "")
    marker, heading = _LABEL_TASKS[task]
    return heading + _leading_label(decoded, marker)


def _format_first_token(task, token):
    """Turn the first predicted token of a token-based task into the shown string."""
    if task == "lid_voxforge":
        return "LANG: " + token.replace("|>", "").replace("<|", "")
    # vad_freesound: anything other than the no-speech token counts as speech.
    return "VAD: no speech" if token == "<|nospeech|>" else "VAD: speech"


def inference(wav, data):
    """Run the UniverSLU model on one audio file for the selected task.

    Args:
        wav: Path of the audio file to analyse.
        data: Task name; must be one of the keys of ``_TASK_DECODE_OPTIONS``.

    Returns:
        A human-readable result string for the task. If the task is missing
        or unknown, or no audio was supplied, an ``"ERROR: ..."`` message is
        returned instead (previously an unknown task raised
        ``UnboundLocalError``).
    """
    # Validate cheaply before touching the audio file or the model.
    if data not in _TASK_DECODE_OPTIONS:
        return "ERROR: unknown or missing task: " + repr(data)
    if not wav:
        return "ERROR: no input audio was provided."

    with torch.no_grad():
        # NOTE(review): the sample rate is read but never checked or
        # resampled - confirm the model tolerates the recorder's rate.
        speech, _rate = soundfile.read(wav)
        if speech.ndim == 2:
            speech = speech[:, 0]  # multi-channel input: keep the first channel

        decoder = _get_decoder(data)
        best = decoder(speech)[0]

        if data in _FIRST_TOKEN_TASKS:
            token = decoder.converter.tokenizer.tokenizer.convert_ids_to_tokens(
                best[2][0]
            )
            return _format_first_token(data, token)

        # Everything up to the last "|>" is prompt/specifier tokens.
        return _format_decoded(data, best[0].split("|>")[-1])
|
324 |
|
325 |
# ---- Gradio demo wiring ---------------------------------------------------
title = "UniverSLU"
description = (
    "Gradio demo for UniverSLU Task Specifier (https://huggingface.co/espnet/UniverSLU-17-Task-Specifier). "
    "UniverSLU-17 Task Specifier is a Multi-task Spoken Language Understanding model from CMU WAVLab. "
    "It adapts Whisper to additional tasks using single-token task specifiers. "
    "To use it, simply record your audio or click one of the examples to load them. "
    "More details about the SLU tasks that the model is trained on and it's performance on these tasks can be found in our paper: https://aclanthology.org/2024.naacl-long.151/"
)
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

# One [audio file, task name] pair per supported task; the order here is also
# the order of the task selector below.
# NOTE(review): confirm every referenced audio file is present in the repo.
examples = [
    ['audio_slurp_ner.flac', "english_slurp"],
    ['audio_fsc.wav', "english_fsc"],
    ['audio_grabo.wav', "dutch_scr"],
    ['audio_english_scr.wav', "english_scr"],
    ['audio_lt_scr.wav', "lithuanian_scr"],
    ['audio_ar_scr.wav', "arabic_scr"],
    ['audio_snips.wav', "english_snips"],
    ['audio_lid.wav', "lid_voxforge"],
    ['audio_fsd.wav', "fake_speech_detection_asvspoof"],
    ['audio_er.wav', "emotion_rec_iemocap"],
    ['audio_acc.wav', "accent_classify_accentdb"],
    ['audio_mustard.wav', "sarcasm_mustard"],
    ['audio_mustard_plus.wav', "sarcasm_mustard_plus"],
    ['audio_voxceleb1.wav', "gender_voxceleb1"],
    ['audio_esc50.wav', "audio_classification_esc50"],
    ['audio_stop.wav', "semantic_parsing_stop"],
    ['audio_freesound.wav', "vad_freesound"],
]

# Inputs: a microphone recording handed to `inference` as a file path, plus
# the task to run on it (one choice per example, in the same order).
audio_input = gr.Audio(label="input audio", sources=["microphone"], type="filepath")
task_input = gr.Radio(
    choices=[task_name for _, task_name in examples],
    type="value",
    label="Task",
)
result_output = gr.Textbox(type="text", label="Output")

demo = gr.Interface(
    inference,
    [audio_input, task_input],
    result_output,
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch(debug=True)
|
audio_acc.wav
ADDED
Binary file (159 kB). View file
|
|
audio_ar_scr.wav
ADDED
Binary file (68.5 kB). View file
|
|
audio_english_scr.wav
ADDED
Binary file (32 kB). View file
|
|
audio_er.wav
ADDED
Binary file (193 kB). View file
|
|
audio_esc50.wav
ADDED
Binary file (441 kB). View file
|
|
audio_freesound.wav
ADDED
Binary file (30.3 kB). View file
|
|
audio_fsd.wav
ADDED
Binary file (40 kB). View file
|
|
audio_lid.wav
ADDED
Binary file (320 kB). View file
|
|
audio_lt_scr.wav
ADDED
Binary file (32 kB). View file
|
|
audio_mustard.wav
ADDED
Binary file (225 kB). View file
|
|
audio_mustard_plus.wav
ADDED
Binary file (201 kB). View file
|
|
audio_slurp_ner.flac
ADDED
Binary file (59.7 kB). View file
|
|
audio_snips.wav
ADDED
Binary file (112 kB). View file
|
|
audio_stop.wav
ADDED
Binary file (132 kB). View file
|
|
audio_voxceleb1.wav
ADDED
Binary file (141 kB). View file
|
|