Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -24,7 +24,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
 import copy
 import torch
-import
+import torchaudio
 
 from demucs import pretrained
 from demucs.apply import apply_model
@@ -34,7 +34,7 @@ logger = logging.get_logger(__name__)
 
 
 class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditionalGeneration):
-    stride_longform =
+    stride_longform = 750
 
 
     def _prepare_audio_encoder_kwargs_for_longform_generation(
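The new `stride_longform` attribute sets how many of the most recently generated audio codes are carried over as context for the next long-form window (see the `output_ids[None, :, :, - self.stride_longform:]` line in a later hunk). As a rough sanity check, and assuming the 0.02 s-per-code EnCodec frame rate quoted in the demo text further down, 750 codes amount to 15 seconds of overlapping context:

```python
# Back-of-the-envelope check; assumes the 0.02 s-per-code frame rate stated in the demo article.
frame_rate = 50                      # audio codes generated per second
stride_longform = 750                # codes carried over between long-form windows
print(stride_longform / frame_rate)  # 15.0 -> seconds of context re-used by each new window
```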
@@ -65,20 +65,14 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
         **kwargs,
     ):
         """
-
         Generates sequences of token ids for models with a language modeling head.
-
         <Tip warning={true}>
-
         Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
         model's default generation configuration. You can override any `generation_config` by passing the corresponding
         parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
-
         For an overview of generation strategies and code examples, check out the [following
         guide](./generation_strategies).
-
         </Tip>
-
         Parameters:
             inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                 The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
@@ -109,20 +103,15 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
                 Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                 specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
-
         Return:
             [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
             or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
-
             If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
             [`~utils.ModelOutput`] types are:
-
                 - [`~generation.GenerateDecoderOnlyOutput`],
                 - [`~generation.GenerateBeamDecoderOnlyOutput`]
-
             If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
             [`~utils.ModelOutput`] types are:
-
                 - [`~generation.GenerateEncoderDecoderOutput`],
                 - [`~generation.GenerateBeamEncoderDecoderOutput`]
         """
@@ -272,7 +261,10 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
         # the first timestamps corresponds to decoder_start_token
         current_generated_length = input_ids.shape[1] - 1
 
-
+        max_new_tokens = generation_config.max_new_tokens
+
+        while current_generated_length + 20 <= max_longform_generation_length:
+            generation_config.max_new_tokens = min(max_new_tokens, max_longform_generation_length - current_generated_length)
             if is_greedy_gen_mode:
                 if generation_config.num_return_sequences > 1:
                     raise ValueError(
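The loop added in this hunk windows the generation: each pass may generate at most `max_new_tokens` codes, and generation stops once the running length is within 20 codes of `max_longform_generation_length`. The sketch below spells out that windowing arithmetic in isolation; it ignores the stride overlap handled later in the loop, and the numeric values are purely illustrative:

```python
# Illustrative only: how the per-window token budget is capped and when the loop stops.
max_longform_generation_length = 2250  # e.g. ~45 s worth of codes at 50 codes per second
max_new_tokens = 1500                  # per-window generation budget
current_generated_length = 0

window_sizes = []
while current_generated_length + 20 <= max_longform_generation_length:
    step = min(max_new_tokens, max_longform_generation_length - current_generated_length)
    window_sizes.append(step)
    current_generated_length += step

print(window_sizes)  # [1500, 750]
```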
@@ -343,12 +335,12 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
                 generated_tokens.append(output_ids[:, :, self.stride_longform:])
             else:
                 generated_tokens.append(output_ids)
-
+
             current_generated_length += generated_tokens[-1].shape[-1]
 
             # append the frame dimension back to the audio codes
             # use last generated tokens as begining of the newest generation
-            output_ids = output_ids[None, :, :,
+            output_ids = output_ids[None, :, :, - self.stride_longform:]
 
             model_kwargs = self._prepare_audio_encoder_kwargs_for_longform_generation(output_ids, model_kwargs)
 
@@ -417,18 +409,24 @@ processor = AutoProcessor.from_pretrained("facebook/musicgen-melody", revision="
 
 demucs = pretrained.get_model('htdemucs')
 
-title = "MusicGen
+title = "Streaming Long-form MusicGen"
 
 description = """
-Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
-
+Stream the outputs of the MusicGen Melody text-to-music model by playing the generated audio as soon as the first chunk is ready.
+
+The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the music generated, but in theory, it could run **endlessly**.
+
+Demo uses [MusicGen Melody](https://huggingface.co/facebook/musicgen-melody) in the 🤗 Transformers library. Note that the
 demo works best on the Chrome browser. If there is no audio output, try switching browser to Chrome.
 """
 
 article = """
-##
+## FAQ
+
+### How Does It Work?
 
 MusicGen is an auto-regressive transformer-based model, meaning generates audio codes (tokens) in a causal fashion.
+
 At each decoding step, the model generates a new set of audio codes, conditional on the text input and all previous audio codes. From the
 frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
 each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
@@ -436,18 +434,28 @@ each set of generated audio codes corresponds to 0.02 seconds. This means we req
 
 Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
 playing the audio after a specified number of decoding steps have been reached, a techinque known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
+
 For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
 750 decoding steps to be complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
 to our output waveform on-the-fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
 chunks of audio, each corresponding to 250 tokens.
 
-This method of playing incremental generations reduces the latency of the MusicGen model from the total time to generate 1000 tokens,
-to the time taken to play the first chunk of audio (250 tokens). This can result in significant improvements to perceived latency,
-particularly when the chunk size is chosen to be small.
-
+This method of playing incremental generations **reduces the latency** of the MusicGen model from the total time to generate 1000 tokens,
+to the time taken to play the first chunk of audio (250 tokens). This can result in **significant improvements** to perceived latency,
+particularly when the chunk size is chosen to be small.
+
+In practice, the chunk size should be tuned to your device: using a smaller chunk size will mean that the first chunk is ready faster, but should not be chosen so small that the model generates slower
 than the time it takes to play the audio.
 
 For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L52).
+
+### Could this be used for stereo music generation?
+
+In theory, yes, but you would have to adapt the current demo a bit and use a checkpoint specificaly made for stereo generation, for example, this [one](https://huggingface.co/facebook/musicgen-stereo-melody).
+
+### Why is there a delay between the moment the first chunk is generated and the moment the audio starts playing?
+
+This behaviour is specific to gradio and the different components it uses. If you ever adapt this demo for a streaming use-case, you could have lower latency.
 """
 
 
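The chunk arithmetic described in the article text above, written out explicitly using the numbers it quotes (0.02 s per set of codes, 1000 total steps, chunks of 250 steps):

```python
# Worked example of the streaming numbers quoted in the article text.
seconds_per_code = 0.02                 # one set of audio codes decodes to 0.02 s of audio
total_steps = 1000                      # decoding steps for the full clip
play_steps = 250                        # decoding steps per streamed chunk

print(total_steps * seconds_per_code)   # 20.0 -> seconds of audio in the full clip
print(play_steps * seconds_per_code)    # 5.0  -> seconds ready after the first chunk
print(total_steps // play_steps)        # 4    -> chunks played back-to-back
```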
@@ -465,7 +473,6 @@ class MusicgenStreamer(BaseStreamer):
     Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
     useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
     Gradio demo).
-
     Parameters:
         model (`MusicgenForConditionalGeneration`):
            The MusicGen model used to generate the audio waveform.
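For readers adapting the demo, the streamer documented above follows the usual `transformers` streaming pattern: run `generate` in a background thread with the streamer attached, then iterate over the streamer for audio chunks. The sketch below is not the demo's exact code; in particular, the `MusicgenStreamer(model, play_steps=...)` constructor arguments are assumptions based on the attributes referenced elsewhere in this file:

```python
# Minimal usage sketch (assumed MusicgenStreamer signature; see the real class in app.py).
from threading import Thread

from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-melody")
model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody")

inputs = processor(text=["80s pop track with synth and instrumentals"], return_tensors="pt")
streamer = MusicgenStreamer(model, play_steps=50)  # assumed: model + chunk size in decoding steps

# Run generation in the background so the main thread can consume chunks as they are produced.
thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=500))
thread.start()

for audio_chunk in streamer:  # blocks until the next chunk of waveform is ready
    ...                       # e.g. play it, or yield (sampling_rate, audio_chunk) to Gradio

thread.join()
```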
@@ -530,12 +537,23 @@ class MusicgenStreamer(BaseStreamer):
 
         # send the input_ids to the correct device
         input_ids = input_ids.to(self.audio_encoder.device)
+
 
-
-
-
-
-
+        if self.decoder.config.audio_channels == 1:
+            output_values = self.audio_encoder.decode(
+                input_ids,
+                audio_scales=[None],
+            ).audio_values
+        else:
+            codec_outputs_left = self.audio_encoder.decode(input_ids[:, :, ::2, :], audio_scales=[None])
+            output_values_left = codec_outputs_left.audio_values
+
+            codec_outputs_right = self.audio_encoder.decode(input_ids[:, :, 1::2, :], audio_scales=[None])
+            output_values_right = codec_outputs_right.audio_values
+
+            output_values = torch.cat([output_values_left, output_values_right], dim=1)
+
+        audio_values = output_values[0, 0]
         return audio_values.cpu().float().numpy()
 
     def put(self, value):
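The stereo branch above splits the codebook dimension before decoding: even-indexed codebooks are decoded as the left channel, odd-indexed codebooks as the right, and the two waveforms are concatenated along the channel dimension. A tiny illustration of the slicing it relies on:

```python
# Illustration of the even/odd codebook split used for stereo checkpoints above.
import torch

codebooks = torch.arange(8)   # pretend indices of a stereo checkpoint's codebooks (4 per channel)
print(codebooks[::2])         # tensor([0, 2, 4, 6]) -> decoded as the left channel
print(codebooks[1::2])        # tensor([1, 3, 5, 7]) -> decoded as the right channel
```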
@@ -546,14 +564,13 @@ class MusicgenStreamer(BaseStreamer):
         if self.token_cache is None:
             self.token_cache = value
         else:
+            # if self.is_longform and not self.longform_stride_applied:
+            #     value = value[self.longform_stride:]
+            #     self.longform_stride_applied = True
             self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
 
         if self.token_cache.shape[-1] % self.play_steps == 0:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
-            if self.is_longform:
-                if not self.longform_stride_applied:
-                    self.to_yield = self.to_yield + self.longform_stride
-                    self.longform_stride_applied = True
             self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
             self.to_yield += len(audio_values) - self.to_yield - self.stride
 
@@ -607,13 +624,14 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
 
     return wav_buf.read()
 
-@spaces.GPU()
+@spaces.GPU(duration=90)
 def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2.0, seed=0):
     max_new_tokens = int(frame_rate * audio_length_in_s)
     play_steps = int(frame_rate * play_steps_in_s)
 
     if audio is not None:
-        audio =
+        audio = torchaudio.load(audio)
+        audio = convert_audio(audio[0], audio[1], demucs.samplerate, demucs.audio_channels)
         audio = apply_model(demucs, audio[None])
 
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
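The new conditioning-audio path loads the file with `torchaudio`, converts it to the sample rate and channel count Demucs expects, and runs source separation with `apply_model`. Below is a standalone sketch of that preprocessing chain; the filename is a placeholder (the demo receives a Gradio filepath instead), and `convert_audio` is assumed to be imported from `demucs.audio` elsewhere in app.py:

```python
# Standalone sketch of the Demucs preprocessing used in generate_audio above.
import torchaudio
from demucs import pretrained
from demucs.apply import apply_model
from demucs.audio import convert_audio

demucs = pretrained.get_model("htdemucs")

wav, sr = torchaudio.load("conditioning.wav")                           # placeholder path
wav = convert_audio(wav, sr, demucs.samplerate, demucs.audio_channels)  # resample + remix channels
stems = apply_model(demucs, wav[None])                                  # (1, sources, channels, samples)
print(demucs.sources)                                                   # e.g. ['drums', 'bass', 'other', 'vocals']
```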
@@ -666,20 +684,20 @@ demo = gr.Interface(
     fn=generate_audio,
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
-        gr.Audio(type="
-        gr.Slider(
-        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
-        gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations"),
+        gr.Audio(type="filepath", label="Conditioning audio. Use this for melody-guided generation."),
+        gr.Slider(35, 60, value=45, step=5, label="(Approximate) Audio length in seconds."),
+        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds.", info="Lower = shorter chunks, lower latency, more codec steps."),
+        gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations."),
     ],
     outputs=[
         gr.Audio(label="Generated Music", autoplay=True, interactive=False, streaming=True)
     ],
     examples=[
-        ["An 80s driving pop song with heavy drums and synth pads in the background", None,
-        ["
-        ["90s rock song with electric guitar and heavy drums", None,
-        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None,
-        ["lofi slow bpm electro chill with organic samples", None,
+        ["An 80s driving pop song with heavy drums and synth pads in the background", None, 45, 1.5, 5],
+        ["Bossa nova with guitars and synthesizer", None, 45, 1.5, 5],
+        ["90s rock song with electric guitar and heavy drums", None, 45, 1.5, 5],
+        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None, 45, 1.5, 5],
+        ["lofi slow bpm electro chill with organic samples", None, 45, 1.5, 5],
     ],
     title=title,
     description=description,