File size: 6,442 Bytes
ffa317c b1ff38e ffa317c b1ff38e ffa317c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import gradio as gr
import numpy as np
import librosa
from asr.run_asr import run_asr_inference, load_asr_model
from nlu.run_nlu import run_nlu_inference, load_nlu_model
############### strings
# Markdown links that are spliced into the description paragraphs below.
mhubert_link = "[mHuBERT-147 model](https://huggingface.co/utter-project/mHuBERT-147)"
massive_link = "[Speech-MASSIVE dataset](https://huggingface.co/datasets/FBK-MT/Speech-MASSIVE)"
blog_post_link = "[blog post](https://huggingface.co/blog/mzboito/naver-demo-french-slu)"

# Page heading, written as a Markdown level-1 header.
title = "# DEMO: French Spoken Language Understanding using mHuBERT-147 and Speech-MASSIVE"

# Description paragraphs; the app renders one Markdown component per entry.
# The first entry keeps its leading and trailing newline.
description = [
    "\n**Interspeech 2024 DEMO.** Cascaded SLU using "
    + mhubert_link
    + " and "
    + massive_link
    + " components.\n",
    "For more details on the implementation, check our " + blog_post_link + ".",
]
# Example utterances offered in the UI. The four lists below are parallel:
# index i of each list describes the same utterance.
examples = [
    "resources/audios/utt_286.wav",
    "resources/audios/utt_2414.wav",
    "resources/audios/utt_16032.wav",
    "resources/audios/utt_3060.wav",
    "resources/audios/utt_1264.wav",
    "resources/audios/utt_9912.wav",
    "resources/audios/utt_14684.wav",
    "resources/audios/utt_5410.wav",
]

# Transcription shown for each example file.
transcriptions = [
    "allume les lumières dans la cuisine",
    "je veux commander une pizza chez michael's pizza",
    "veuillez envoyer un e-mail à sally concernant la réunion de demain",
    "quelles sont les nouvelles de newsource",
    "mon réveil est-il réglé pour demain matin",
    "olly combien de temps dois-je faire bouillir les oeufs",
    "qui est le premier ministre de russie",
    "lis moi les derniers gros titres du new york times",
]

# Intent label shown for each example file.
intents = [
    "iot_hue_lighton",
    "takeaway_order",
    "email_sendemail",
    "news_query",
    "alarm_query",
    "cooking_recipe",
    "qa_factoid",
    "news_query",
]

# Slot labels shown for each example file (one list of labels per utterance).
slots = [
    ["Other", "Other", "Other", "Other", "Other", "house_place"],
    ["Other", "Other", "Other", "Other", "food_type", "Other", "business_name", "business_name"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "person", "Other", "Other", "event_name", "Other", "date"],
    ["Other", "Other", "Other", "Other", "Other", "media_type"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "date", "timeofday"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "Other", "cooking_type", "Other", "food_type"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "place_name"],
    ["Other", "Other", "Other", "Other", "Other", "Other", "Other", "media_type", "media_type", "media_type"],
]

# Acknowledgment texts (Markdown).
utter_ack_text = """This is an output of the European Project UTTER (Unified Transcription and Translation for Extended Reality) funded by European Union’s Horizon Europe Research and Innovation programme under grant agreement number 101070631.
For more information please visit https://he-utter.eu/"""
ack_authors = """This demo was made by [Beomseok Lee](https://mt.fbk.eu/author/blee/) and [Marcely Zanon Boito](https://sites.google.com/view/mzboito/marcely-zanon-boito)."""

# Logo snippets (raw HTML). All but the EU flag are wrapped in a link.
eu_logo = """<img src="https://huggingface.co/spaces/naver/French-SLU-DEMO-Interspeech2024/resolve/main/resources/logos/EU_flag.jpg" width="100" height="100">"""
utter_logo = """<a href="https://he-utter.eu/" target="_blank"><img src="https://huggingface.co/spaces/naver/French-SLU-DEMO-Interspeech2024/resolve/main/resources/logos/Utter_logo.png" width="50" height="50"></a>"""
nle_logo = """<a href="https://europe.naverlabs.com/" target="_blank"><img src="https://huggingface.co/spaces/naver/French-SLU-DEMO-Interspeech2024/resolve/main/resources/logos/NAVERLABS_2_BLACK.png" width="100" height="100"></a>"""
fbk_logo = """<a href="https://mt.fbk.eu/" target="_blank"><img src="https://huggingface.co/spaces/naver/French-SLU-DEMO-Interspeech2024/resolve/main/resources/logos/FBK_logo.png" width="100" height="100"></a>"""

# zip() would silently drop rows if the parallel lists drifted apart, so fail
# loudly at import time instead of rendering an incomplete table.
if not (len(examples) == len(transcriptions) == len(intents) == len(slots)):
    raise ValueError(
        "examples, transcriptions, intents and slots must have the same length"
    )

_TABLE_HEADER = (
    "| File | Transcription | Slots | Intent |\n"
    "| ------------ | ------------------- | ---------- | -----------|"
)

# Markdown table with one row per example, generated from the parallel lists
# so that adding or removing an example never requires editing the table.
# Each row shows the file's base name, transcription, slot list and intent.
table = "\n".join(
    [_TABLE_HEADER]
    + [
        f"| {path.split('/')[-1]} | {text} | {tags} | {intent} |"
        for path, text, tags, intent in zip(examples, transcriptions, slots, intents)
    ]
)
############### calls
def run_inference(audio_file):
    """Transcribe an audio file and run NLU on the transcription.

    Args:
        audio_file: Path of the audio file to process. May be ``None`` or
            empty when the user has not recorded or uploaded anything.

    Returns:
        The value returned by ``run_nlu_inference`` for the transcription,
        or ``None`` when no audio file was provided.
    """
    if not audio_file:
        # Nothing to process: skip inference rather than letting the audio
        # loader fail on a missing path.
        return None

    print(audio_file)
    # librosa.load returns a (samples, sampling_rate) pair; sr=16000 requests
    # a 16 kHz sampling rate.
    samples, sampling_rate = librosa.load(audio_file, sr=16000)
    print((samples, sampling_rate))
    audio = {'sampling_rate': sampling_rate, 'array': samples}

    transcription = run_asr_inference(asr_model, processor, audio)
    print(transcription)

    return run_nlu_inference(nlu_model, tokenizer, transcription)
############### app
# Load both models once at start-up; run_inference reads them as module globals.
asr_model, processor = load_asr_model()
nlu_model, tokenizer = load_nlu_model()

demo = gr.Blocks(
    # `title` is a Markdown heading ("# ..."); strip the heading marker so it
    # does not end up in the page title.
    title=title.lstrip("#").strip(),
    analytics_enabled=False,
    theme=gr.themes.Base(),
)

# NOTE(review): the nesting of the layout below was reconstructed from the
# statement order (original indentation was not available) — confirm which
# components belong inside each gr.Row().
with demo:
    # Page heading and description paragraphs.
    gr.Markdown(title)
    for line in description:
        gr.Markdown(line)

    # Input audio next to the highlighted ASR + NLU output.
    with gr.Row():
        waveform_options = gr.WaveformOptions(sample_rate=16000)
        audio_file = gr.Audio(
            label="Audio file",
            sources=['microphone', 'upload'],
            type="filepath",
            format='wav',
            waveform_options=waveform_options,
            show_download_button=False,
            show_share_button=False,
            max_length=20,
        )
        output = gr.HighlightedText(label="ASR result + NLU result")

    gr.Button("Run Inference", variant='primary').click(
        run_inference,
        concurrency_limit=2,
        inputs=audio_file,
        outputs=output,
    )

    # Clickable example files next to the table describing them.
    with gr.Row():
        gr.Examples(label="Speech-MASSIVE test utterances:", inputs=audio_file, examples=examples)
        gr.Markdown(table)

    gr.Markdown("# Acknowledgments")
    gr.Markdown(utter_ack_text)
    gr.Markdown(ack_authors)
    with gr.Row():
        gr.Markdown(eu_logo)
        gr.Markdown(utter_logo)
        gr.Markdown(nle_logo)
        gr.Markdown(fbk_logo)

demo.queue()
demo.launch()