Spaces: Running on Zero

sanchit-gandhi committed · Commit 29309b0 · 1 Parent(s): e9c24a1
Update app.py

app.py CHANGED
@@ -15,6 +15,38 @@ import spaces
 model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
 processor = MusicgenProcessor.from_pretrained("facebook/musicgen-small")
 
+title = "MusicGen Streaming"
+
+description = """
+Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
+Demo uses [MusicGen Small](https://huggingface.co/facebook/musicgen-small) in the 🤗 Transformers library.
+"""
+
+article = """
+## How Does It Work?
+
+MusicGen is an auto-regressive transformer-based model, meaning it generates audio codes (tokens) in a causal fashion.
+At each decoding step, the model generates a new set of audio codes, conditional on the text input and all previous audio codes. Given the
+frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
+each set of generated audio codes corresponds to 0.02 seconds of audio. This means we require a total of 1000 decoding steps to generate
+20 seconds of audio.
+
+Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
+playing the audio once a specified number of decoding steps have been reached, a technique known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
+For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
+750 decoding steps to complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
+to our output waveform on-the-fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
+chunks of audio, each corresponding to 250 tokens.
+
+This method of playing incremental generations reduces the latency of the MusicGen model from the total time taken to generate 1000 tokens
+to the time taken to play the first chunk of audio (250 tokens). This can result in significant improvements to perceived latency,
+particularly when the chunk size is chosen to be small. In practice, the chunk size should be tuned to your device: using a
+smaller chunk size means the first chunk is ready faster, but it should not be chosen so small that the model generates slower
+than the time it takes to play the audio.
+
+For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L51).
+"""
+
 
 class MusicgenStreamer(BaseStreamer):
     def __init__(
@@ -181,12 +213,24 @@ demo = gr.Interface(
     fn=generate_audio,
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
-        gr.Slider(10, 30, value=15, step=5, label="Audio length in
-        gr.Slider(
+        gr.Slider(10, 30, value=15, step=5, label="Audio length in seconds"),
+        gr.Slider(0.5, 2.5, value=0.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
+        gr.Slider(0, 10, value=5, step=1, label="Seed for random generations"),
     ],
     outputs=[
         gr.Audio(label="Generated Music", streaming=True, autoplay=True)
     ],
+    examples=[
+        ["An 80s driving pop song with heavy drums and synth pads in the background", 20, 0.5, 5],
+        ["A cheerful country song with acoustic guitars", 15, 0.5, 5],
+        ["90s rock song with electric guitar and heavy drums", 15, 0.5, 5],
+        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", 30, 0.5, 5],
+        ["lofi slow bpm electro chill with organic samples", 30, 0.5, 5],
+    ],
+    title=title,
+    description=description,
+    article=article,
 )
 
+
 demo.queue().launch()
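The timing arithmetic in the article above follows directly from the EnCodec frame rate and is worth making explicit. A minimal sanity-check sketch (not part of the commit; the 50 Hz figure is inferred from the stated 0.02 seconds per decoding step):

```python
# The EnCodec 32 kHz checkpoint decodes at 50 frames per second, so each
# autoregressive decoding step corresponds to 1 / 50 = 0.02 s of audio.
FRAME_RATE = 50  # Hz, inferred from the 0.02 s-per-step figure in the article

def steps_for(duration_s: float) -> int:
    """Decoding steps needed to generate `duration_s` seconds of audio."""
    return round(duration_s * FRAME_RATE)

assert steps_for(20.0) == 1000  # the full 20 s clip needs 1000 decoding steps
assert steps_for(5.0) == 250    # the first 5 s chunk is ready after 250 steps
```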
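The buffering pattern the article describes can be sketched as a `BaseStreamer` subclass: `generate()` pushes the new audio codes in with `put()` at every decoding step, and every `play_steps` steps the accumulated codes are decoded and the not-yet-emitted samples are placed on a queue. This is a simplified, hypothetical sketch, not the Space's actual `MusicgenStreamer` (see the linked source); the `decode_fn` hook and all names are assumptions, and the real class handles further details such as decoding strides:

```python
from queue import Queue
from transformers.generation.streamers import BaseStreamer

class ChunkedAudioStreamer(BaseStreamer):
    """Hypothetical, simplified take on the chunked audio-streaming pattern."""

    def __init__(self, decode_fn, play_steps: int = 250):
        self.decode_fn = decode_fn    # assumed helper: list of codes -> 1-D waveform
        self.play_steps = play_steps  # decoding steps per emitted audio chunk
        self.token_cache = []
        self.audio_queue = Queue()
        self.emitted_samples = 0      # samples already pushed to the queue

    def put(self, value):
        # Called by `generate()` once per decoding step with the new codes.
        self.token_cache.append(value)
        if len(self.token_cache) % self.play_steps == 0:
            audio = self.decode_fn(self.token_cache)
            self.audio_queue.put(audio[self.emitted_samples:])  # only new samples
            self.emitted_samples = len(audio)

    def end(self):
        # Flush whatever follows the last full chunk, then signal completion.
        audio = self.decode_fn(self.token_cache)
        self.audio_queue.put(audio[self.emitted_samples:])
        self.audio_queue.put(None)  # sentinel: generation finished

    def __iter__(self):
        return self

    def __next__(self):
        chunk = self.audio_queue.get()  # blocks until the next chunk is decoded
        if chunk is None:
            raise StopIteration
        return chunk
```

With `play_steps=250` and the 50 Hz frame rate, each emitted chunk is 5 seconds of audio, matching the four-chunk example in the article.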
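On the Gradio side, an output `gr.Audio(streaming=True)` component plays chunks as a generator function yields them as `(sample_rate, numpy_array)` tuples. A self-contained toy sketch of that wiring (noise in place of MusicGen output, with hypothetical names; the Space's real `generate_audio` runs `model.generate()` with the streamer in a background thread and yields from it):

```python
import numpy as np
import gradio as gr

SAMPLE_RATE = 32_000  # MusicGen Small decodes to 32 kHz audio

def fake_generate_audio(prompt: str, audio_length_s: float):
    # Stand-in generator: yields 0.5 s chunks of noise the same way the Space
    # yields decoded MusicGen chunks as they become available.
    chunk_s = 0.5
    emitted_s = 0.0
    while emitted_s < audio_length_s:
        chunk = np.random.uniform(-0.1, 0.1, int(chunk_s * SAMPLE_RATE)).astype(np.float32)
        emitted_s += chunk_s
        yield SAMPLE_RATE, chunk  # streaming gr.Audio consumes (rate, array) tuples

demo = gr.Interface(
    fn=fake_generate_audio,
    inputs=[
        gr.Text(label="Prompt"),
        gr.Slider(10, 30, value=15, step=5, label="Audio length in seconds"),
    ],
    outputs=gr.Audio(label="Generated Music", streaming=True, autoplay=True),
)

if __name__ == "__main__":
    demo.queue().launch()  # match the Space: enable queueing for streamed outputs
```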