radames committed
Commit daf3ca1
2 parents: 17e0c31 + eaf8326

Merge remote-tracking branch 'upstream/main'

Files changed (3)
  1. README.md +1 -1
  2. app.py +260 -107
  3. app_batched.py +4 -2
README.md CHANGED
@@ -5,7 +5,7 @@ tags:
  - music generation
  - language models
  - LLMs
- app_file: app_batched.py
+ app_file: app.py
  emoji: 🎵
  colorFrom: white
  colorTo: blue
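The only README change is the `app_file` field, which tells the Hugging Face Space which script to serve, so the deployment switches from the batched demo to the new single-request app below. For reference, the resulting front matter looks roughly like this, showing only the fields visible in this diff (the Space's remaining metadata, such as `title` and `sdk`, is omitted here):

```yaml
tags:
  - music generation
  - language models
  - LLMs
app_file: app.py  # script the Space runs on startup
emoji: 🎵
colorFrom: white
colorTo: blue
```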
app.py CHANGED
@@ -7,14 +7,18 @@ LICENSE file in the root directory of this source tree.
  """
 
  from tempfile import NamedTemporaryFile
+ import argparse
  import torch
+ import torchaudio
  import gradio as gr
+ import os
  from audiocraft.models import MusicGen
-
  from audiocraft.data.audio import audio_write
 
+ from share_btn import community_icon_html, loading_icon_html, share_js, css
 
  MODEL = None
+ IS_SHARED_SPACE = "radames/MusicGen-Continuation" in os.environ.get("SPACE_ID", "")
 
 
  def load_model(version):
@@ -22,14 +26,18 @@ def load_model(version):
      return MusicGen.get_pretrained(version)
 
 
- def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
+ def predict(
+     text, melody_input, duration, continuation, topk, topp, temperature, cfg_coef
+ ):
      global MODEL
      topk = int(topk)
-     if MODEL is None or MODEL.name != model:
-         MODEL = load_model(model)
+     if MODEL is None:
+         MODEL = load_model("melody")
 
      if duration > MODEL.lm.cfg.dataset.segment_duration:
          raise gr.Error("MusicGen currently supports durations of up to 30 seconds!")
+     if continuation >= duration:
+         raise gr.Error("The continuation length can't be greater than or equal to the duration!")
      MODEL.set_generation_params(
          use_sampling=True,
          top_k=topk,
@@ -39,120 +47,265 @@ def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
          duration=duration,
      )
 
-     if melody:
-         sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
-         print(melody.shape)
+     if melody_input:
+         melody, sr = torchaudio.load(melody_input)
          if melody.dim() == 2:
              melody = melody[None]
-         melody = melody[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
-         output = MODEL.generate_with_chroma(
-             descriptions=[text],
-             melody_wavs=melody,
-             melody_sample_rate=sr,
-             progress=False
-         )
+         if continuation:
+             prompt_waveform = melody[..., -int(sr * continuation):]
+             output = MODEL.generate_continuation(
+                 prompt=prompt_waveform,
+                 prompt_sample_rate=sr,
+                 descriptions=[text],
+                 progress=True,
+             )
+         else:
+             melody_wavform = melody[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
+             output = MODEL.generate_with_chroma(
+                 descriptions=[text],
+                 melody_wavs=melody_wavform,
+                 melody_sample_rate=sr,
+                 progress=True,
+             )
      else:
          output = MODEL.generate(descriptions=[text], progress=False)
 
      output = output.detach().cpu().float()[0]
      with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-         audio_write(file.name, output, MODEL.sample_rate, strategy="loudness", add_suffix=False)
+         audio_write(
+             file.name,
+             output,
+             MODEL.sample_rate,
+             strategy="loudness",
+             loudness_headroom_db=16,
+             loudness_compressor=True,
+             add_suffix=False,
+         )
          waveform_video = gr.make_waveform(file.name)
-     return waveform_video
+     return waveform_video, melody_input
 
- def toggle(choice):
-     if choice == "mic":
-         return gr.update(source="microphone", value=None, label="Microphone")
-     else:
-         return gr.update(source="upload", value=None, label="File")
-
- with gr.Blocks() as demo:
-     gr.Markdown(
-         """
-         # MusicGen
-
-         This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
-         presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
-         <br/>
-         <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-         <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-         for longer sequences, more control and no queue.
-         """
-     )
-     with gr.Row():
-         with gr.Column():
-             with gr.Row():
-                 text = gr.Text(label="Input Text", interactive=True)
-                 with gr.Column():
-                     radio = gr.Radio(["file", "mic"], value="file", label="Melody Condition (optional) File or Mic")
-                     melody = gr.Audio(source="upload", type="numpy", label="File", interactive=True)
-             with gr.Row():
-                 submit = gr.Button("Submit")
-             with gr.Row():
-                 model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
-             with gr.Row():
-                 duration = gr.Slider(minimum=1, maximum=30, value=10, label="Duration", interactive=True)
-             with gr.Row():
-                 topk = gr.Number(label="Top-k", value=250, interactive=True)
-                 topp = gr.Number(label="Top-p", value=0, interactive=True)
-                 temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
-                 cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
-         with gr.Column():
-             output = gr.Video(label="Generated Music")
-     submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
-     radio.change(toggle, radio, [melody], queue=False, show_progress=False)
-     gr.Examples(
-         fn=predict,
-         examples=[
-             [
-                 "An 80s driving pop song with heavy drums and synth pads in the background",
-                 "./assets/bach.mp3",
-                 "melody"
-             ],
-             [
-                 "A cheerful country song with acoustic guitars",
-                 "./assets/bolero_ravel.mp3",
-                 "melody"
-             ],
-             [
-                 "90s rock song with electric guitar and heavy drums",
-                 None,
-                 "medium"
-             ],
-             [
-                 "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
-                 "./assets/bach.mp3",
-                 "melody"
-             ],
-             [
-                 "lofi slow bpm electro chill with organic samples",
-                 None,
-                 "medium",
-             ],
-         ],
-         inputs=[text, melody, model],
-         outputs=[output]
-     )
-     gr.Markdown(
-         """
-         ### More details
-
-         The model will generate a short music extract based on the description you provided.
-         You can generate up to 30 seconds of audio.
-
-         We present 4 model variations:
-         1. Melody -- a music generation model capable of generating music conditioned on text and melody inputs. **Note**, you can also use text only.
-         2. Small -- a 300M transformer decoder conditioned on text only.
-         3. Medium -- a 1.5B transformer decoder conditioned on text only.
-         4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences).
-
-         When using `melody`, you can optionally provide a reference audio from
-         which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
-
-         You can also use your own GPU or a Google Colab by following the instructions on our repo.
-         See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-         for more details.
-         """
-     )
-
- demo.launch()
+
+ def ui(**kwargs):
+     def toggle(choice):
+         if choice == "mic":
+             return gr.update(source="microphone", value=None, label="Microphone")
+         else:
+             return gr.update(source="upload", value=None, label="File")
+
+     with gr.Blocks(css=css) as interface:
+         gr.Markdown(
+             """
+             # MusicGen
+             This is your private demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
+             """
+         )
+         if IS_SHARED_SPACE:
+             gr.Markdown(
+                 """
+                 This Space doesn't work in this shared UI ⚠
+
+                 <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+                 <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+                 to use it privately, or use the <a href="https://huggingface.co/spaces/facebook/MusicGen">public demo</a>
+                 """
+             )
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     text = gr.Text(
+                         label="Describe your music",
+                         lines=2,
+                         interactive=True,
+                         elem_id="text-input",
+                     )
+                     with gr.Column():
+                         radio = gr.Radio(
+                             ["file", "mic"],
+                             value="file",
+                             label="Melody Condition (optional) File or Mic",
+                         )
+                         melody = gr.Audio(
+                             source="upload",
+                             type="filepath",
+                             label="File",
+                             interactive=True,
+                             elem_id="melody-input",
+                         )
+                 with gr.Row():
+                     submit = gr.Button("Submit")
+                 # with gr.Row():
+                 #     model = gr.Radio(
+                 #         ["melody", "medium", "small", "large"],
+                 #         label="Model",
+                 #         value="melody",
+                 #         interactive=True,
+                 #     )
+                 with gr.Row():
+                     duration = gr.Slider(
+                         minimum=1,
+                         maximum=30,
+                         value=10,
+                         label="Duration",
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     continuation = gr.Slider(
+                         minimum=0,
+                         maximum=30,
+                         value=0,
+                         label="Continue from the end (seconds)",
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     topk = gr.Number(label="Top-k", value=250, interactive=True)
+                     topp = gr.Number(label="Top-p", value=0, interactive=True)
+                     temperature = gr.Number(
+                         label="Temperature", value=1.0, interactive=True
+                     )
+                     cfg_coef = gr.Number(
+                         label="Classifier Free Guidance", value=3.0, interactive=True
+                     )
+             with gr.Column():
+                 output = gr.Video(label="Generated Music", elem_id="generated-video")
+                 output_melody = gr.Audio(label="Melody", elem_id="melody-output")
+                 with gr.Row(visible=False) as share_row:
+                     with gr.Group(elem_id="share-btn-container"):
+                         community_icon = gr.HTML(community_icon_html)
+                         loading_icon = gr.HTML(loading_icon_html)
+                         share_button = gr.Button(
+                             "Share to community", elem_id="share-btn"
+                         )
+                         share_button.click(None, [], [], _js=share_js)
+         submit.click(
+             lambda: gr.update(visible=False),
+             None,
+             [share_row],
+             queue=False,
+             show_progress=False,
+         ).then(
+             predict,
+             inputs=[
+                 text,
+                 melody,
+                 duration,
+                 continuation,
+                 topk,
+                 topp,
+                 temperature,
+                 cfg_coef,
+             ],
+             outputs=[output, output_melody],
+         ).then(
+             lambda: gr.update(visible=True),
+             None,
+             [share_row],
+             queue=False,
+             show_progress=False,
+         )
+         radio.change(toggle, radio, [melody], queue=False, show_progress=False)
+         gr.Examples(
+             fn=predict,
+             examples=[
+                 [
+                     "An 80s driving pop song with heavy drums and synth pads in the background",
+                     "./assets/bach.mp3",
+                 ],
+                 [
+                     "A cheerful country song with acoustic guitars",
+                     "./assets/bolero_ravel.mp3",
+                 ],
+                 ["90s rock song with electric guitar and heavy drums", None],
+                 [
+                     "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
+                     "./assets/bach.mp3",
+                 ],
+                 [
+                     "lofi slow bpm electro chill with organic samples",
+                     None,
+                 ],
+             ],
+             inputs=[text, melody],
+             outputs=[output],
+         )
+         gr.Markdown(
+             """
+             ### More details
+
+             The model will generate a short music extract based on the description you provided.
+             You can generate up to 30 seconds of audio.
+
+             We present 4 model variations:
+             1. Melody -- a music generation model capable of generating music conditioned on text and melody inputs. **Note**, you can also use text only.
+             2. Small -- a 300M transformer decoder conditioned on text only.
+             3. Medium -- a 1.5B transformer decoder conditioned on text only.
+             4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences).
+
+             When using `melody`, you can optionally provide a reference audio from
+             which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
+
+             You can also use your own GPU or a Google Colab by following the instructions on our repo.
+             See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+             for more details.
+             """
+         )
+
+     # Show the interface
+     launch_kwargs = {}
+     username = kwargs.get("username")
+     password = kwargs.get("password")
+     server_port = kwargs.get("server_port", 0)
+     inbrowser = kwargs.get("inbrowser", False)
+     share = kwargs.get("share", False)
+     server_name = kwargs.get("listen")
+
+     launch_kwargs["server_name"] = server_name
+
+     if username and password:
+         launch_kwargs["auth"] = (username, password)
+     if server_port > 0:
+         launch_kwargs["server_port"] = server_port
+     if inbrowser:
+         launch_kwargs["inbrowser"] = inbrowser
+     if share:
+         launch_kwargs["share"] = share
+
+     interface.queue().launch(**launch_kwargs, max_threads=1)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--listen",
+         type=str,
+         default="127.0.0.1",
+         help="IP to listen on for connections to Gradio",
+     )
+     parser.add_argument(
+         "--username", type=str, default="", help="Username for authentication"
+     )
+     parser.add_argument(
+         "--password", type=str, default="", help="Password for authentication"
+     )
+     parser.add_argument(
+         "--server_port",
+         type=int,
+         default=0,
+         help="Port to run the server listener on",
+     )
+     parser.add_argument("--inbrowser", action="store_true", help="Open in browser")
+     parser.add_argument("--share", action="store_true", help="Share the gradio UI")
+
+     args = parser.parse_args()
+
+     ui(
+         username=args.username,
+         password=args.password,
+         inbrowser=args.inbrowser,
+         server_port=args.server_port,
+         share=args.share,
+         listen=args.listen,
+     )
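The core of this commit is the new continuation path in `predict`: the app now always loads the `melody` model, and when the continuation slider is non-zero it takes the last `continuation` seconds of the uploaded audio and extends them with `MODEL.generate_continuation` instead of conditioning on the melody's chroma. A minimal sketch of that path outside Gradio, using only calls that appear in the diff (the file name `melody.wav` and the 3-second continuation are illustrative):

```python
import torchaudio
from audiocraft.models import MusicGen

model = MusicGen.get_pretrained("melody")  # the variant the app now hardcodes
model.set_generation_params(duration=10)   # total output length in seconds

melody, sr = torchaudio.load("melody.wav")  # (channels, samples); illustrative path
if melody.dim() == 2:
    melody = melody[None]  # add a batch dimension

continuation = 3  # seconds; the app requires this to stay below `duration`
prompt_waveform = melody[..., -int(sr * continuation):]  # last N seconds as prompt

# The model extends the prompt while following the text description.
output = model.generate_continuation(
    prompt=prompt_waveform,
    prompt_sample_rate=sr,
    descriptions=["lofi slow bpm electro chill with organic samples"],
    progress=True,
)
```

The new `__main__` block also makes the script launchable directly, e.g. `python app.py --listen 0.0.0.0 --server_port 7860 --share`.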
app_batched.py CHANGED
@@ -67,10 +67,13 @@ def predict(texts, melodies):
              output,
              MODEL.sample_rate,
              strategy="loudness",
+             loudness_headroom_db=16,
+             loudness_compressor=True,
              add_suffix=False,
          )
          waveform_video = gr.make_waveform(file.name)
          out_files.append(waveform_video)
+
      return [out_files, melodies]
 
 
@@ -189,5 +192,4 @@ with gr.Blocks(css=css) as demo:
      for more details.
      """
      )
-
- demo.queue(max_size=15).launch()
+ demo.queue(max_size=60).launch()
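Both apps now pass `loudness_headroom_db=16` and `loudness_compressor=True` to `audio_write`, so generated audio is loudness-normalized with extra headroom and soft compression instead of being written at raw level, and the batched demo's queue grows from 15 to 60. A sketch of the updated call in isolation (the noise tensor and the 32000 Hz rate are placeholders for real model output):

```python
import torch
from audiocraft.data.audio import audio_write

wav = torch.randn(1, 32000 * 5) * 0.1  # placeholder: 5 s of noise, shape (channels, samples)
audio_write(
    "out.wav",                 # add_suffix=False keeps this exact file name
    wav,
    32000,                     # placeholder rate; the apps pass MODEL.sample_rate
    strategy="loudness",       # normalize perceived loudness rather than peak level
    loudness_headroom_db=16,   # headroom below full scale, in dB
    loudness_compressor=True,  # gentle compression to tame residual peaks
    add_suffix=False,
)
```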