Spaces:
Sleeping
Sleeping
File size: 11,849 Bytes
f1069cc 9e0eee2 f1069cc f165857 f1069cc 9e0eee2 f1069cc 9e0eee2 f1069cc b3bab00 f1069cc 9e0eee2 b3bab00 f1069cc 9e0eee2 f1069cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 |
import gradio as gr
import json
import torch
import wavio
import numpy as np
from tqdm import tqdm
from huggingface_hub import snapshot_download
from audioldm.audio.stft import TacotronSTFT
from audioldm.variational_autoencoder import AutoencoderKL
from transformers import AutoTokenizer, T5ForConditionalGeneration
from modelling_deberta_v2 import DebertaV2ForTokenClassificationRegression
import sys
sys.path.insert(0, "diffusers/src")
from diffusers import DDPMScheduler
from models import MusicAudioDiffusion
from gradio import Markdown
class MusicFeaturePredictor:
    """Predict beat timings and chords for a text caption.

    Two fine-tuned models are used: a DeBERTa-v3-large token
    classification/regression model for the beats and a Flan-T5-large
    sequence-to-sequence model for the chords. Base weights and tokenizers
    are fetched with ``from_pretrained``; the fine-tuned state dicts are read
    from ``{path}/beats`` and ``{path}/chords``.
    """

    def __init__(self, path, device="cuda:0", cache_dir=None, local_files_only=False):
        """Load both tokenizers and models and move the models to ``device``.

        Args:
            path: Directory containing ``beats/microsoft-deberta-v3-large.pt``
                and ``chords/flan-t5-large.bin``.
            device: Torch device string the models are moved to.
            cache_dir: Forwarded to every ``from_pretrained`` call.
            local_files_only: Forwarded to every ``from_pretrained`` call.
        """
        self.beats_tokenizer = AutoTokenizer.from_pretrained(
            "microsoft/deberta-v3-large",
            use_fast=False,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.beats_model = DebertaV2ForTokenClassificationRegression.from_pretrained(
            "microsoft/deberta-v3-large",
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.beats_model.eval()
        self.beats_model.to(device)

        # NOTE: torch.load unpickles the checkpoint, which can execute
        # arbitrary code; only point ``path`` at a trusted source.
        beats_ckpt = f"{path}/beats/microsoft-deberta-v3-large.pt"
        beats_weight = torch.load(beats_ckpt, map_location="cpu")
        self.beats_model.load_state_dict(beats_weight)

        self.chords_tokenizer = AutoTokenizer.from_pretrained(
            "google/flan-t5-large",
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.chords_model = T5ForConditionalGeneration.from_pretrained(
            "google/flan-t5-large",
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.chords_model.eval()
        self.chords_model.to(device)

        # NOTE: same unpickling caveat as for the beats checkpoint above.
        chords_ckpt = f"{path}/chords/flan-t5-large.bin"
        chords_weight = torch.load(chords_ckpt, map_location="cpu")
        self.chords_model.load_state_dict(chords_weight)

    @staticmethod
    def _select_beat_times(intervals, max_time=10, max_beats=50):
        """Turn predicted intervals into cumulative beat times.

        The intervals are accumulated with ``np.cumsum`` (which flattens a
        nested list); times are rounded to two decimals and collected until
        the first one that is not below ``max_time``. At most ``max_beats``
        times are returned.
        """
        beat_times = []
        for t in np.cumsum(intervals):
            if t < max_time:
                beat_times.append(round(t, 2))
            else:
                break
        return beat_times[:max_beats]

    @staticmethod
    def _count_beats(n_beats, max_beat):
        """Return ``n_beats`` counts cycling 1.0, 2.0, ..., ``max_beat``."""
        return [float(1 + i % max_beat) for i in range(n_beats)]

    @staticmethod
    def _parse_chords(decoded):
        """Parse decoded chord text into ``(chords, chord_times)``.

        ``decoded`` is split on ``" n "`` into items of the form
        ``"<chord> at <time>"``. Items that do not have exactly that shape,
        or whose time is not a valid float (e.g. a sequence cut off by the
        generation length limit), are skipped instead of raising, so the two
        returned lists always stay aligned.
        """
        chords, chord_times = [], []
        for item in decoded.split(" n "):
            parts = item.split(" at ")
            if len(parts) != 2:
                continue
            chord, stamp = parts
            try:
                chord_time = float(stamp)
            except ValueError:
                continue
            chords.append(chord)
            chord_times.append(chord_time)
        return chords, chord_times

    def generate_beats(self, prompt):
        """Predict the beat structure for ``prompt``.

        Returns:
            A tuple ``(max_beat, predicted_beats_times, predicted_beats)``:
            ``max_beat`` is 1 plus the argmax of the logits at token position
            0 of the first batch item; ``predicted_beats_times`` is the list
            of selected cumulative beat times; ``predicted_beats`` is
            ``[[times, counts]]``, or ``[[], []]`` when no beat was selected.
        """
        tokenized = self.beats_tokenizer(
            prompt, max_length=512, padding=True, truncation=True, return_tensors="pt"
        )
        tokenized = {k: v.to(self.beats_model.device) for k, v in tokenized.items()}

        with torch.no_grad():
            out = self.beats_model(**tokenized)

        max_beat = (
            1 + torch.argmax(out["logits"][:, 0, :], -1).detach().cpu().numpy()
        ).tolist()[0]
        intervals = (
            out["values"][:, :, 0]
            .detach()
            .cpu()
            .numpy()
            .astype("float32")
            .round(4)
            .tolist()
        )

        predicted_beats_times = self._select_beat_times(intervals)
        if not predicted_beats_times:
            # NOTE(review): this empty shape differs from the non-empty
            # ``[[times, counts]]`` shape; kept as-is because the consumer is
            # not visible here - confirm which one it expects.
            predicted_beats = [[], []]
        else:
            beat_counts = self._count_beats(len(predicted_beats_times), max_beat)
            predicted_beats = [[predicted_beats_times, beat_counts]]

        return max_beat, predicted_beats_times, predicted_beats

    def generate(self, prompt):
        """Predict beats and chords for ``prompt``.

        The chord model is prompted with the caption, the predicted beat
        timestamps and the predicted max beat; its decoded output is parsed
        by ``_parse_chords``.

        Returns:
            A tuple ``(predicted_beats, predicted_chords,
            predicted_chords_times)``.
        """
        max_beat, predicted_beats_times, predicted_beats = self.generate_beats(prompt)

        # The separator is a literal backslash followed by "n", not a newline.
        chords_prompt = "Caption: {} \\n Timestamps: {} \\n Max Beat: {}".format(
            prompt,
            " , ".join(str(round(t, 2)) for t in predicted_beats_times),
            max_beat,
        )
        tokenized = self.chords_tokenizer(
            chords_prompt,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        tokenized = {k: v.to(self.chords_model.device) for k, v in tokenized.items()}

        generated_chords = self.chords_model.generate(
            input_ids=tokenized["input_ids"],
            attention_mask=tokenized["attention_mask"],
            min_length=8,
            max_length=128,
            num_beams=5,
            early_stopping=True,
            num_return_sequences=1,
        )
        decoded = self.chords_tokenizer.decode(
            generated_chords[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        predicted_chords, predicted_chords_times = self._parse_chords(decoded)

        return predicted_beats, predicted_chords, predicted_chords_times
class Mustango:
    """Text-to-music pipeline assembled from a downloaded model snapshot.

    Combines a ``MusicFeaturePredictor`` (beats and chords), an
    ``AutoencoderKL`` VAE, a ``TacotronSTFT`` module, a
    ``MusicAudioDiffusion`` model and a ``DDPMScheduler``.
    """

    def __init__(
        self,
        name="declare-lab/mustango",
        device="cuda:0",
        cache_dir=None,
        local_files_only=False,
    ):
        """Download the snapshot ``name`` and load every component.

        Args:
            name: Hub repository id passed to ``snapshot_download``.
            device: Torch device string all components are moved to.
            cache_dir: Cache directory for the snapshot and the feature
                predictor's base models.
            local_files_only: When true, the snapshot and the feature
                predictor's base models are resolved from the local cache
                only.
        """
        # Forward local_files_only so offline mode also covers the snapshot,
        # not just the feature predictor's base models.
        path = snapshot_download(
            repo_id=name, cache_dir=cache_dir, local_files_only=local_files_only
        )

        self.music_model = MusicFeaturePredictor(
            path, device, cache_dir=cache_dir, local_files_only=local_files_only
        )

        vae_config = self._load_json(f"{path}/configs/vae_config.json")
        stft_config = self._load_json(f"{path}/configs/stft_config.json")
        main_config = self._load_json(f"{path}/configs/main_config.json")

        self.vae = AutoencoderKL(**vae_config).to(device)
        self.stft = TacotronSTFT(**stft_config).to(device)
        self.model = MusicAudioDiffusion(
            main_config["text_encoder_name"],
            main_config["scheduler_name"],
            unet_model_config_path=f"{path}/configs/music_diffusion_model_config.json",
        ).to(device)
        self.model.device = device

        # NOTE: torch.load unpickles the checkpoints, which can execute
        # arbitrary code; only load snapshots from a trusted repository.
        vae_weights = torch.load(
            f"{path}/vae/pytorch_model_vae.bin", map_location=device
        )
        stft_weights = torch.load(
            f"{path}/stft/pytorch_model_stft.bin", map_location=device
        )
        main_weights = torch.load(
            f"{path}/ldm/pytorch_model_ldm.bin", map_location=device
        )

        self.vae.load_state_dict(vae_weights)
        self.stft.load_state_dict(stft_weights)
        self.model.load_state_dict(main_weights)

        print("Successfully loaded checkpoint from:", name)

        self.vae.eval()
        self.stft.eval()
        self.model.eval()

        self.scheduler = DDPMScheduler.from_pretrained(
            main_config["scheduler_name"], subfolder="scheduler"
        )

    @staticmethod
    def _load_json(path):
        """Read and parse the JSON file at ``path``, closing it afterwards."""
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    def generate(self, prompt, steps=100, guidance=3, samples=1, disable_progress=True):
        """Generate music for a single prompt string.

        Beats and chords are predicted from ``prompt`` and passed, with the
        prompt, to the diffusion model's ``inference``; the resulting latents
        are decoded by the VAE into a mel representation and then a waveform.

        Args:
            prompt: Text description of the music.
            steps: Forwarded to ``inference`` after the scheduler.
            guidance: Forwarded to ``inference`` after ``steps``.
            samples: Forwarded to ``inference`` after ``guidance``.
            disable_progress: Forwarded to ``inference`` as the last argument.

        Returns:
            The first element of the decoded waveform batch.
        """
        with torch.no_grad():
            beats, chords, chords_times = self.music_model.generate(prompt)
            latents = self.model.inference(
                [prompt],
                beats,
                [chords],
                [chords_times],
                self.scheduler,
                steps,
                guidance,
                samples,
                disable_progress,
            )
            mel = self.vae.decode_first_stage(latents)
            wave = self.vae.decode_to_waveform(mel)

        return wave[0]
# Build the shared Mustango pipeline once at import time: on "cuda:0" when
# CUDA is available, otherwise on the CPU.
mustango = Mustango(device="cuda:0" if torch.cuda.is_available() else "cpu")

# Example of calling the pipeline directly:
# output_wave = mustango.generate("This techno song features a synth lead playing the main melody.", 5, 3, disable_progress=False)
def gradio_generate(prompt, steps, guidance):
    """Generate music for ``prompt`` and return the path of the WAV file.

    Args:
        prompt: Text description of the music.
        steps: Passed to ``mustango.generate`` as ``steps``.
        guidance: Passed to ``mustango.generate`` as ``guidance``.

    Returns:
        Path of a newly created ``.wav`` file holding the generated audio,
        written at 16000 Hz with 2-byte samples.
    """
    import tempfile

    output_wave = mustango.generate(prompt, steps, guidance)

    # Each request gets its own file: a single shared "temp.wav" could be
    # overwritten by an overlapping request before it is served. The file is
    # intentionally not deleted here because the caller still has to read it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        output_filename = handle.name

    wavio.write(output_filename, output_wave, rate=16000, sampwidth=2)
    return output_filename
# Longer HTML description, currently disabled in favour of an empty one:
# description_text = """
# <p><a href="https://huggingface.co/spaces/declare-lab/mustango/blob/main/app.py?duplicate=true"> <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> For faster inference without waiting in queue, you may duplicate the space and upgrade to a GPU in the settings. <br/><br/>
# Generate music using Mustango by providing a text prompt.
# <br/><br/> Meet Mustango, an exciting addition to the vibrant landscape of Multimodal Large Language Models \
# designed for controlled music generation. Mustango leverages Latent Diffusion Model (LDM), Flan-T5, and \
# musical features to do the magic! \
# <p/>
# """
description_text = ""

# Gradio input and output components. The top-level component classes are
# used throughout (as the sliders already did); the legacy ``gr.inputs`` /
# ``gr.outputs`` namespaces are deprecated.
input_text = gr.Textbox(lines=2, label="Prompt")
output_audio = gr.Audio(label="Generated Music", type="filepath")
denoising_steps = gr.Slider(minimum=100, maximum=200, value=100, step=1, label="Steps", interactive=True)
guidance_scale = gr.Slider(minimum=1, maximum=10, value=3, step=0.1, label="Guidance Scale", interactive=True)

# Gradio interface. Each example supplies only the prompt.
gr_interface = gr.Interface(
    fn=gradio_generate,
    inputs=[input_text, denoising_steps, guidance_scale],
    outputs=[output_audio],
    title="Mustango: Toward Controllable Text-to-Music Generation",
    description=description_text,
    # "never" is the documented string form of the deprecated boolean False.
    allow_flagging="never",
    examples=[
        ["This techno song features a synth lead playing the main melody. This is accompanied by programmed percussion playing a simple kick focused beat. The hi-hat is accented in an open position on the 3-and count of every bar. The synth plays the bass part with a voicing that sounds like a cello. This techno song can be played in a club. The chord sequence is Gm, A7, Eb, Bb, C, F, Gm. The beat counts to 2. The tempo of this song is 128.0 beats per minute. The key of this song is G minor."],
        ["This is a new age piece. There is a flute playing the main melody with a lot of staccato notes. The rhythmic background consists of a medium tempo electronic drum beat with percussive elements all over the spectrum. There is a playful atmosphere to the piece. This piece can be used in the soundtrack of a children's TV show or an advertisement jingle."],
        ["The song is an instrumental. The song is in medium tempo with a classical guitar playing a lilting melody in accompaniment style. The song is emotional and romantic. The song is a romantic instrumental song. The chord sequence is Gm, F6, Ebm. The time signature is 4/4. This song is in Adagio. The key of this song is G minor."],
        ["This folk song features a female voice singing the main melody. This is accompanied by a tabla playing the percussion. A guitar strums chords. For most parts of the song, only one chord is played. At the last bar, a different chord is played. This song has minimal instruments. This song has a story-telling mood. This song can be played in a village scene in an Indian movie. The chord sequence is Bbm, Ab. The beat is 3. The tempo of this song is Allegro. The key of this song is Bb minor."],
        ["This is a live performance of a classical music piece. There is an orchestra performing the piece with a violin lead playing the main melody. The atmosphere is sentimental and heart-touching. This piece could be playing in the background at a classy restaurant. The chord progression in this song is Am7, Gm, Dm, A7, Dm. The beat is 3. This song is in Largo. The key of this song is D minor."],
        ["This is a techno piece with drums and beats and a leading melody. A synth plays chords. The music kicks off with a powerful and relentless drumbeat. Over the pounding beats, a leading melody emerges. In the middle of the song, a flock of seagulls flies over the venue and make loud bird sounds. It has strong danceability and can be played in a club. The tempo is 120 bpm. The chords played by the synth are Am, Cm, Dm, Gm."],
    ],
    cache_examples=False,
)

# Launch Gradio app
gr_interface.launch()