SreyanG-NVIDIA committed on
Commit
a40255d
·
verified ·
1 Parent(s): 9973c8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -8
app.py CHANGED
@@ -17,6 +17,25 @@ from data.data import get_audiotext_dataloader
17
  from src.factory import create_model_and_transforms
18
  from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def int16_to_float32(x):
21
  return (x / 32767.0).astype(np.float32)
22
 
@@ -219,16 +238,23 @@ def predict(filepath, question):
219
 
220
  return output_decoded
221
 
222
- link = "TBD"
223
- text = "[Github]"
224
- paper_link = "https://github.com/NVIDIA/audio-flamingo/"
225
- paper_text = "TBD"
226
- demo = gr.Interface(fn=predict,
227
- inputs=[gr.Audio(type="filepath"), gr.Textbox(value='Describe the audio.', label='Edit the textbox to ask your own questions!')],
 
 
 
 
 
 
228
  outputs=[gr.Textbox(label="Audio Flamingo 2 Output")],
229
  cache_examples=True,
 
230
  title="Audio Flamingo 2 Demo",
231
- description="Audio Flamingo 2 is NVIDIA's latest Large Audio-Language Model that is capable of understanding audio inputs and answer any open-ended question about it." + f"<a href='{paper_link}'>{paper_text}</a> " + f"<a href='{link}'>{text}</a> <br>" +
232
  "**Audio Flamingo 2 is not an ASR model and has limited ability to recognize the speech content. It primarily focuses on perception and understanding of non-speech sounds and music.**<br>" +
233
- "The demo is hosted on the Stage 2 checkpoints and supports upto 90 seconds of audios. Stage 3 checkpoints that support upto 5 minutes will be released at a later points.")
234
  demo.launch(share=True)
 
17
  from src.factory import create_model_and_transforms
18
  from train.train_utils import Dict2Class, get_autocast, get_cast_dtype
19
 
20
+ HEADER = ("""
21
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
22
+ <a href="https://github.com/NVIDIA/audio-flamingo" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
23
+ <img src="https://github.com/NVIDIA/audio-flamingo/blob/main/assets/af_logo.png?raw=true" alt="Audio Flamingo 2 🔥🚀🔥" style="max-width: 120px; height: auto;">
24
+ </a>
25
+ <div>
26
+ <h1>Audio Flamingo 2: An Audio-Language Model with Long-Audio Understanding and Expert Reasoning Abilities</h1>
27
+ <h5 style="margin: 0;">If this demo please you, please give us a star ⭐ on Github or 💖 on this space.</h5>
28
+ </div>
29
+ </div>
30
+
31
+ <div style="display: flex; justify-content: center; margin-top: 10px;">
32
+ <a href="https://github.com/NVIDIA/audio-flamingo"><img src='https://img.shields.io/badge/Github-AudioFlamingo2-9C276A' style="margin-right: 5px;"></a>
33
+ <a href="https://arxiv.org/abs/2503.03983"><img src="https://img.shields.io/badge/Arxiv-2503.03983-AD1C18" style="margin-right: 5px;"></a>
34
+ <a href="https://huggingface.co/nvidia/audio-flamingo-2"><img src="https://img.shields.io/badge/🤗-Checkpoints-ED5A22.svg" style="margin-right: 5px;"></a>
35
+ <a href="https://github.com/NVIDIA/audio-flamingo/stargazers"><img src="https://img.shields.io/github/stars/NVIDIA/audio-flamingo.svg?style=social"></a>
36
+ </div>
37
+ """)
38
+
39
  def int16_to_float32(x):
40
  return (x / 32767.0).astype(np.float32)
41
 
 
238
 
239
  return output_decoded
240
 
241
+ audio_examples = [
242
+ ["./examples/soundcap1.wav", "What is the soundscape in this audio?"],
243
+ ["./examples/muscicap1.wav", "Summarize the music content in a sentence."],
244
+ ["./examples/mmau1.wav", "What specific sounds can be distinguished from the audio clip? (A) Helicopter and impact sounds (B) Whistling and chatter (C) Car honking and raindrops (D) Birds chirping and water flowing"],
245
+ ]
246
+
247
+
248
+ demo = gr.Blocks()
249
+ with demo:
250
+ gr.HTML(HEADER)
251
+ gr.Interface(fn=predict,
252
+ inputs=[gr.Audio(type="filepath"), gr.Textbox(value='Describe the audio.', label='Question')],
253
  outputs=[gr.Textbox(label="Audio Flamingo 2 Output")],
254
  cache_examples=True,
255
+ examples=audio_examples,
256
  title="Audio Flamingo 2 Demo",
257
+ description="Audio Flamingo 2 is NVIDIA's latest Large Audio-Language Model that is capable of understanding audio inputs and answer any open-ended question about it. <br>" +
258
  "**Audio Flamingo 2 is not an ASR model and has limited ability to recognize the speech content. It primarily focuses on perception and understanding of non-speech sounds and music.**<br>" +
259
+ "The demo is hosted on the Stage 2 checkpoints and supports upto 90 seconds of audios. Stage 3 checkpoints that support upto 5 minutes will be released at a later point.")
260
  demo.launch(share=True)