Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -24,7 +24,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
 import copy
 import torch
-import
+import torchaudio
 
 from demucs import pretrained
 from demucs.apply import apply_model
@@ -34,7 +34,7 @@ logger = logging.get_logger(__name__)
 
 
 class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditionalGeneration):
-    stride_longform =
+    stride_longform = 750
 
 
     def _prepare_audio_encoder_kwargs_for_longform_generation(
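The new `stride_longform` attribute sets how many of the most recently generated audio codes are carried over as context for the next long-form window (see the `output_ids[None, :, :, - self.stride_longform:]` line in a later hunk). As a rough sanity check, and assuming the 0.02 s-per-code EnCodec frame rate quoted in the demo text further down, 750 codes amount to 15 seconds of overlapping context:

```python
# Back-of-the-envelope check; assumes the 0.02 s-per-code frame rate stated in the demo article.
frame_rate = 50                      # audio codes generated per second
stride_longform = 750                # codes carried over between long-form windows
print(stride_longform / frame_rate)  # 15.0 -> seconds of context re-used by each new window
```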
@@ -65,20 +65,14 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
         **kwargs,
     ):
         """
-
         Generates sequences of token ids for models with a language modeling head.
-
         <Tip warning={true}>
-
         Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
         model's default generation configuration. You can override any `generation_config` by passing the corresponding
         parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
-
         For an overview of generation strategies and code examples, check out the [following
         guide](./generation_strategies).
-
         </Tip>
-
         Parameters:
             inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                 The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
@@ -109,20 +103,15 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
                 Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                 specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
-
         Return:
             [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
             or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
-
             If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
             [`~utils.ModelOutput`] types are:
-
                 - [`~generation.GenerateDecoderOnlyOutput`],
                 - [`~generation.GenerateBeamDecoderOnlyOutput`]
-
             If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
             [`~utils.ModelOutput`] types are:
-
                 - [`~generation.GenerateEncoderDecoderOutput`],
                 - [`~generation.GenerateBeamEncoderDecoderOutput`]
         """
@@ -272,7 +261,10 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
         # the first timestamps corresponds to decoder_start_token
         current_generated_length = input_ids.shape[1] - 1
 
-
+        max_new_tokens = generation_config.max_new_tokens
+
+        while current_generated_length + 20 <= max_longform_generation_length:
+            generation_config.max_new_tokens = min(max_new_tokens, max_longform_generation_length - current_generated_length)
             if is_greedy_gen_mode:
                 if generation_config.num_return_sequences > 1:
                     raise ValueError(
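The loop added in this hunk windows the generation: each pass may generate at most `max_new_tokens` codes, and generation stops once the running length is within 20 codes of `max_longform_generation_length`. The sketch below spells out that windowing arithmetic in isolation; it ignores the stride overlap handled later in the loop, and the numeric values are purely illustrative:

```python
# Illustrative only: how the per-window token budget is capped and when the loop stops.
max_longform_generation_length = 2250  # e.g. ~45 s worth of codes at 50 codes per second
max_new_tokens = 1500                  # per-window generation budget
current_generated_length = 0

window_sizes = []
while current_generated_length + 20 <= max_longform_generation_length:
    step = min(max_new_tokens, max_longform_generation_length - current_generated_length)
    window_sizes.append(step)
    current_generated_length += step

print(window_sizes)  # [1500, 750]
```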
@@ -343,12 +335,12 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
                 generated_tokens.append(output_ids[:, :, self.stride_longform:])
             else:
                 generated_tokens.append(output_ids)
-
+
             current_generated_length += generated_tokens[-1].shape[-1]
 
             # append the frame dimension back to the audio codes
             # use last generated tokens as begining of the newest generation
-            output_ids = output_ids[None, :, :,
+            output_ids = output_ids[None, :, :, - self.stride_longform:]
 
             model_kwargs = self._prepare_audio_encoder_kwargs_for_longform_generation(output_ids, model_kwargs)
 
@@ -417,18 +409,24 @@ processor = AutoProcessor.from_pretrained("facebook/musicgen-melody", revision="
 
 demucs = pretrained.get_model('htdemucs')
 
-title = "MusicGen
+title = "Streaming Long-form MusicGen"
 
 description = """
-Stream the outputs of the MusicGen text-to-music model by playing the generated audio as soon as the first chunk is ready.
-
+Stream the outputs of the MusicGen Melody text-to-music model by playing the generated audio as soon as the first chunk is ready.
+
+The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the music generated, but in theory, it could run **endlessly**.
+
+Demo uses [MusicGen Melody](https://huggingface.co/facebook/musicgen-melody) in the 🤗 Transformers library. Note that the
 demo works best on the Chrome browser. If there is no audio output, try switching browser to Chrome.
 """
 
 article = """
-##
+## FAQ
+
+### How Does It Work?
 
 MusicGen is an auto-regressive transformer-based model, meaning generates audio codes (tokens) in a causal fashion.
+
 At each decoding step, the model generates a new set of audio codes, conditional on the text input and all previous audio codes. From the
 frame rate of the [EnCodec model](https://huggingface.co/facebook/encodec_32khz) used to decode the generated codes to audio waveform,
 each set of generated audio codes corresponds to 0.02 seconds. This means we require a total of 1000 decoding steps to generate
@@ -436,18 +434,28 @@ each set of generated audio codes corresponds to 0.02 seconds. This means we req
 
 Rather than waiting for the entire audio sequence to be generated, which would require the full 1000 decoding steps, we can start
 playing the audio after a specified number of decoding steps have been reached, a techinque known as [*streaming*](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming).
+
 For example, after 250 steps we have the first 5 seconds of audio ready, and so can play this without waiting for the remaining
 750 decoding steps to be complete. As we continue to generate with the MusicGen model, we append new chunks of generated audio
 to our output waveform on-the-fly. After the full 1000 decoding steps, the generated audio is complete, and is composed of four
 chunks of audio, each corresponding to 250 tokens.
 
-This method of playing incremental generations reduces the latency of the MusicGen model from the total time to generate 1000 tokens,
-to the time taken to play the first chunk of audio (250 tokens). This can result in significant improvements to perceived latency,
-particularly when the chunk size is chosen to be small.
-
+This method of playing incremental generations **reduces the latency** of the MusicGen model from the total time to generate 1000 tokens,
+to the time taken to play the first chunk of audio (250 tokens). This can result in **significant improvements** to perceived latency,
+particularly when the chunk size is chosen to be small.
+
+In practice, the chunk size should be tuned to your device: using a smaller chunk size will mean that the first chunk is ready faster, but should not be chosen so small that the model generates slower
 than the time it takes to play the audio.
 
 For details on how the streaming class works, check out the source code for the [MusicgenStreamer](https://huggingface.co/spaces/sanchit-gandhi/musicgen-streaming/blob/main/app.py#L52).
+
+### Could this be used for stereo music generation?
+
+In theory, yes, but you would have to adapt the current demo a bit and use a checkpoint specificaly made for stereo generation, for example, this [one](https://huggingface.co/facebook/musicgen-stereo-melody).
+
+### Why is there a delay between the moment the first chunk is generated and the moment the audio starts playing?
+
+This behaviour is specific to gradio and the different components it uses. If you ever adapt this demo for a streaming use-case, you could have lower latency.
 """
 
 
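The chunk arithmetic described in the article text above, written out explicitly using the numbers it quotes (0.02 s per set of codes, 1000 total steps, chunks of 250 steps):

```python
# Worked example of the streaming numbers quoted in the article text.
seconds_per_code = 0.02                 # one set of audio codes decodes to 0.02 s of audio
total_steps = 1000                      # decoding steps for the full clip
play_steps = 250                        # decoding steps per streamed chunk

print(total_steps * seconds_per_code)   # 20.0 -> seconds of audio in the full clip
print(play_steps * seconds_per_code)    # 5.0  -> seconds ready after the first chunk
print(total_steps // play_steps)        # 4    -> chunks played back-to-back
```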
@@ -465,7 +473,6 @@ class MusicgenStreamer(BaseStreamer):
     Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
     useful for applications that benefit from accessing the generated audio in a non-blocking way (e.g. in an interactive
     Gradio demo).
-
     Parameters:
         model (`MusicgenForConditionalGeneration`):
            The MusicGen model used to generate the audio waveform.
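For readers adapting the demo, the streamer documented above follows the usual `transformers` streaming pattern: run `generate` in a background thread with the streamer attached, then iterate over the streamer for audio chunks. The sketch below is not the demo's exact code; in particular, the `MusicgenStreamer(model, play_steps=...)` constructor arguments are assumptions based on the attributes referenced elsewhere in this file:

```python
# Minimal usage sketch (assumed MusicgenStreamer signature; see the real class in app.py).
from threading import Thread

from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-melody")
model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody")

inputs = processor(text=["80s pop track with synth and instrumentals"], return_tensors="pt")
streamer = MusicgenStreamer(model, play_steps=50)  # assumed: model + chunk size in decoding steps

# Run generation in the background so the main thread can consume chunks as they are produced.
thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=500))
thread.start()

for audio_chunk in streamer:  # blocks until the next chunk of waveform is ready
    ...                       # e.g. play it, or yield (sampling_rate, audio_chunk) to Gradio

thread.join()
```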
@@ -530,12 +537,23 @@ class MusicgenStreamer(BaseStreamer):
 
         # send the input_ids to the correct device
         input_ids = input_ids.to(self.audio_encoder.device)
+
 
-
-
-
-
-
+        if self.decoder.config.audio_channels == 1:
+            output_values = self.audio_encoder.decode(
+                input_ids,
+                audio_scales=[None],
+            ).audio_values
+        else:
+            codec_outputs_left = self.audio_encoder.decode(input_ids[:, :, ::2, :], audio_scales=[None])
+            output_values_left = codec_outputs_left.audio_values
+
+            codec_outputs_right = self.audio_encoder.decode(input_ids[:, :, 1::2, :], audio_scales=[None])
+            output_values_right = codec_outputs_right.audio_values
+
+            output_values = torch.cat([output_values_left, output_values_right], dim=1)
+
+        audio_values = output_values[0, 0]
         return audio_values.cpu().float().numpy()
 
     def put(self, value):
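The stereo branch above splits the codebook dimension before decoding: even-indexed codebooks are decoded as the left channel, odd-indexed codebooks as the right, and the two waveforms are concatenated along the channel dimension. A tiny illustration of the slicing it relies on:

```python
# Illustration of the even/odd codebook split used for stereo checkpoints above.
import torch

codebooks = torch.arange(8)   # pretend indices of a stereo checkpoint's codebooks (4 per channel)
print(codebooks[::2])         # tensor([0, 2, 4, 6]) -> decoded as the left channel
print(codebooks[1::2])        # tensor([1, 3, 5, 7]) -> decoded as the right channel
```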
@@ -546,14 +564,13 @@ class MusicgenStreamer(BaseStreamer):
         if self.token_cache is None:
             self.token_cache = value
         else:
+            # if self.is_longform and not self.longform_stride_applied:
+            #     value = value[self.longform_stride:]
+            #     self.longform_stride_applied = True
             self.token_cache = torch.concatenate([self.token_cache, value[:, None]], dim=-1)
 
         if self.token_cache.shape[-1] % self.play_steps == 0:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
-            if self.is_longform:
-                if not self.longform_stride_applied:
-                    self.to_yield = self.to_yield + self.longform_stride
-                    self.longform_stride_applied = True
             self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
             self.to_yield += len(audio_values) - self.to_yield - self.stride
 
@@ -607,13 +624,14 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
 
     return wav_buf.read()
 
-@spaces.GPU()
+@spaces.GPU(duration=90)
 def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2.0, seed=0):
     max_new_tokens = int(frame_rate * audio_length_in_s)
     play_steps = int(frame_rate * play_steps_in_s)
 
     if audio is not None:
-        audio =
+        audio = torchaudio.load(audio)
+        audio = convert_audio(audio[0], audio[1], demucs.samplerate, demucs.audio_channels)
         audio = apply_model(demucs, audio[None])
 
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
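The new conditioning-audio path loads the file with `torchaudio`, converts it to the sample rate and channel count Demucs expects, and runs source separation with `apply_model`. Below is a standalone sketch of that preprocessing chain; the filename is a placeholder (the demo receives a Gradio filepath instead), and `convert_audio` is assumed to be imported from `demucs.audio` elsewhere in app.py:

```python
# Standalone sketch of the Demucs preprocessing used in generate_audio above.
import torchaudio
from demucs import pretrained
from demucs.apply import apply_model
from demucs.audio import convert_audio

demucs = pretrained.get_model("htdemucs")

wav, sr = torchaudio.load("conditioning.wav")                           # placeholder path
wav = convert_audio(wav, sr, demucs.samplerate, demucs.audio_channels)  # resample + remix channels
stems = apply_model(demucs, wav[None])                                  # (1, sources, channels, samples)
print(demucs.sources)                                                   # e.g. ['drums', 'bass', 'other', 'vocals']
```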
@@ -666,20 +684,20 @@ demo = gr.Interface(
     fn=generate_audio,
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
-        gr.Audio(type="
-        gr.Slider(
-        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps"),
-        gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations"),
+        gr.Audio(type="filepath", label="Conditioning audio. Use this for melody-guided generation."),
+        gr.Slider(35, 60, value=45, step=5, label="(Approximate) Audio length in seconds."),
+        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds.", info="Lower = shorter chunks, lower latency, more codec steps."),
+        gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations."),
     ],
     outputs=[
         gr.Audio(label="Generated Music", autoplay=True, interactive=False, streaming=True)
     ],
     examples=[
-        ["An 80s driving pop song with heavy drums and synth pads in the background", None,
-        ["
-        ["90s rock song with electric guitar and heavy drums", None,
-        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None,
-        ["lofi slow bpm electro chill with organic samples", None,
+        ["An 80s driving pop song with heavy drums and synth pads in the background", None, 45, 1.5, 5],
+        ["Bossa nova with guitars and synthesizer", None, 45, 1.5, 5],
+        ["90s rock song with electric guitar and heavy drums", None, 45, 1.5, 5],
+        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None, 45, 1.5, 5],
+        ["lofi slow bpm electro chill with organic samples", None, 45, 1.5, 5],
     ],
     title=title,
     description=description,