Spaces:
Running
on
L4
Running
on
L4
Update app.py
Browse files
app.py
CHANGED
@@ -17,6 +17,25 @@ from data.data import get_audiotext_dataloader
|
|
17 |
from src.factory import create_model_and_transforms
|
18 |
from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
def int16_to_float32(x):
|
21 |
return (x / 32767.0).astype(np.float32)
|
22 |
|
@@ -219,16 +238,23 @@ def predict(filepath, question):
|
|
219 |
|
220 |
return output_decoded
|
221 |
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
outputs=[gr.Textbox(label="Audio Flamingo 2 Output")],
|
229 |
cache_examples=True,
|
|
|
230 |
title="Audio Flamingo 2 Demo",
|
231 |
-
description="Audio Flamingo 2 is NVIDIA's latest Large Audio-Language Model that is capable of understanding audio inputs and answer any open-ended question about it.
|
232 |
"**Audio Flamingo 2 is not an ASR model and has limited ability to recognize the speech content. It primarily focuses on perception and understanding of non-speech sounds and music.**<br>" +
|
233 |
-
"The demo is hosted on the Stage 2 checkpoints and supports upto 90 seconds of audios. Stage 3 checkpoints that support upto 5 minutes will be released at a later
|
234 |
demo.launch(share=True)
|
|
|
17 |
from src.factory import create_model_and_transforms
|
18 |
from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
|
19 |
|
20 |
+
# Banner HTML shown at the top of the demo page (rendered through gr.HTML).
# It contains the project logo linking to the GitHub repository, the paper
# title, a short call to action, and a row of badge links (GitHub, arXiv,
# Hugging Face checkpoints, GitHub stars).
HEADER = """
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
    <a href="https://github.com/NVIDIA/audio-flamingo" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
        <img src="https://github.com/NVIDIA/audio-flamingo/blob/main/assets/af_logo.png?raw=true" alt="Audio Flamingo 2 🔥🚀🔥" style="max-width: 120px; height: auto;">
    </a>
    <div>
        <h1>Audio Flamingo 2: An Audio-Language Model with Long-Audio Understanding and Expert Reasoning Abilities</h1>
        <h5 style="margin: 0;">If you like this demo, please give us a star ⭐ on GitHub or 💖 on this space.</h5>
    </div>
</div>

<div style="display: flex; justify-content: center; margin-top: 10px;">
    <a href="https://github.com/NVIDIA/audio-flamingo"><img src='https://img.shields.io/badge/Github-AudioFlamingo2-9C276A' style="margin-right: 5px;"></a>
    <a href="https://arxiv.org/abs/2503.03983"><img src="https://img.shields.io/badge/Arxiv-2503.03983-AD1C18" style="margin-right: 5px;"></a>
    <a href="https://huggingface.co/nvidia/audio-flamingo-2"><img src="https://img.shields.io/badge/🤗-Checkpoints-ED5A22.svg" style="margin-right: 5px;"></a>
    <a href="https://github.com/NVIDIA/audio-flamingo/stargazers"><img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social"></a>
</div>
"""
|
38 |
+
|
39 |
def int16_to_float32(x):
    """Scale 16-bit integer samples to float32.

    Divides ``x`` by 32767.0 and casts the result to ``np.float32``, so an
    input of 32767 maps to 1.0 and -32767 maps to -1.0 (-32768 lands very
    slightly below -1.0).

    Args:
        x: A NumPy array (or NumPy scalar) of samples; the result of the
            division must expose ``.astype``.

    Returns:
        The scaled samples as ``np.float32``.
    """
    return (x / 32767.0).astype(np.float32)
|
41 |
|
|
|
238 |
|
239 |
return output_decoded
|
240 |
|
241 |
+
# Example rows offered in the demo UI. Each row is [audio file path, question],
# matching the order of the interface inputs (audio first, then the question).
audio_examples = [
    ["./examples/soundcap1.wav", "What is the soundscape in this audio?"],
    ["./examples/muscicap1.wav", "Summarize the music content in a sentence."],
    ["./examples/mmau1.wav", "What specific sounds can be distinguished from the audio clip? (A) Helicopter and impact sounds (B) Whistling and chatter (C) Car honking and raindrops (D) Birds chirping and water flowing"],
]
|
246 |
+
|
247 |
+
|
248 |
+
# Build the demo page: a Blocks container holding the HTML banner followed by
# the question-answering interface wired to predict().
demo = gr.Blocks()
with demo:
    gr.HTML(HEADER)
    gr.Interface(
        fn=predict,
        # Inputs are passed to predict() in this order: audio file path, question.
        inputs=[
            gr.Audio(type="filepath"),
            gr.Textbox(value='Describe the audio.', label='Question'),
        ],
        outputs=[gr.Textbox(label="Audio Flamingo 2 Output")],
        cache_examples=True,
        examples=audio_examples,
        title="Audio Flamingo 2 Demo",
        description=(
            "Audio Flamingo 2 is NVIDIA's latest Large Audio-Language Model that is capable of understanding audio inputs and answering any open-ended question about them. <br>"
            "**Audio Flamingo 2 is not an ASR model and has limited ability to recognize the speech content. It primarily focuses on perception and understanding of non-speech sounds and music.**<br>"
            "The demo is hosted on the Stage 2 checkpoints and supports up to 90 seconds of audio. Stage 3 checkpoints that support up to 5 minutes will be released at a later point."
        ),
    )

# NOTE(review): placed at module level because this line is an unchanged
# context line in the diff; confirm against the real file's indentation.
demo.launch(share=True)
|