"""Fine-tune a pretrained audio transformer with a flow-matching objective.

The module wraps the transformer of a pretrained model, conditions it on text
prompts, and trains it on pre-encoded latents stored in ``.pt`` files.
"""

import einops
import numpy as np
import pytorch_lightning as pl
import torch
import wandb
# import wandb logging
from pytorch_lightning.loggers import WandbLogger
from stable_audio_tools import get_pretrained_model
from torch import nn
from tqdm import tqdm
from transformers import T5Tokenizer, T5EncoderModel


class SinActivation(nn.Module):
    """Element-wise sine activation."""

    def forward(self, x):
        return torch.sin(x)


class FourierFeatures(nn.Module):
    """Stack of ``n_layers`` linear layers, each followed by a sine activation.

    The first layer maps ``in_features`` to ``out_features``; every further
    layer maps ``out_features`` to ``out_features``.
    """

    def __init__(self, in_features, out_features, n_layers):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.n_layers = n_layers

        layers = [nn.Linear(in_features, out_features), SinActivation()]
        for _ in range(n_layers - 1):
            layers += [nn.Linear(out_features, out_features), SinActivation()]
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)


class FlowMatchingModule(pl.LightningModule):
    """Lightning module that trains a velocity field with a flow-matching loss.

    Inputs are 4-D latents laid out as ``(batch, time, channel, feature)``.
    Each channel is run through the wrapped transformer layers independently;
    after every layer a projected mean over channels is added back to each
    channel so that channels can exchange information.

    Args:
        main_model: Object exposing ``.transformer`` with ``project_in``,
            ``project_out``, ``layers`` and ``rotary_pos_emb`` attributes.
        text_conditioner: Callable ``(text, device=...)`` returning
            ``(conditioning, conditioning_mask)``.
        max_tokens: Stored as a hyperparameter only; not used by this class.
        n_channels: Number of channels (size of the learned per-channel
            embedding).
        t_input: Number of time steps used when drawing noise for sampling.
    """

    def __init__(self, main_model=None, text_conditioner=None, max_tokens=128, n_channels=None, t_input=None):
        super().__init__()
        self.save_hyperparameters(ignore=['main_model', "text_conditioner"])

        self.model = main_model.transformer
        self.input_layer = main_model.transformer.project_in
        self.output_layer = main_model.transformer.project_out
        self.text_conditioner = text_conditioner

        # Linear weights are (out_features, in_features).
        self.d_model = self.input_layer.weight.shape[0]
        self.d_input = self.input_layer.weight.shape[1]

        # use fourier features for schedule
        self.schedule_embedding = FourierFeatures(1, self.d_model, 2)
        # learned embedding, one vector per channel
        self.pitch_embedding = nn.Parameter(torch.randn(n_channels, self.d_model))
        self.channels = n_channels

        # One projection per transformer layer for the cross-channel mean.
        self.mean_proj = nn.ModuleList(
            nn.Linear(self.d_model, self.d_model) for _ in self.model.layers
        )

    def _encode_text(self, text, device):
        """Run the text conditioner and repeat its outputs once per channel.

        Returns:
            ``(conditioning, conditioning_mask)`` with layouts ``b t c d`` and
            ``b t c`` respectively.
        """
        conditioning, conditioning_mask = self.text_conditioner(text, device=device)
        conditioning = einops.repeat(
            conditioning, 'b t d -> b t c d', c=self.channels
        )
        conditioning_mask = einops.repeat(
            conditioning_mask, 'b t -> b t c', c=self.channels
        )
        return conditioning, conditioning_mask

    def get_example_inputs(self):
        """Build one random ``(z, conditioning, conditioning_mask, t)`` tuple
        suitable for calling :meth:`forward`."""
        text = "A piano playing a C major chord"
        conditioning, conditioning_mask = self._encode_text(text, self.device)
        t = torch.rand(1, device=self.device)
        z = torch.randn(
            1,
            self.hparams.t_input,
            self.hparams.n_channels,
            self.d_input,
            device=self.device,
        )
        return z, conditioning, conditioning_mask, t

    def forward(self, x, conditioning, conditioning_mask, t):
        """Predict the velocity for latents ``x`` at schedule position ``t``.

        Args:
            x: Tensor of shape ``(batch, time, channel, d_input)``.
            conditioning: Tensor laid out as ``b t c d``.
            conditioning_mask: Tensor laid out as ``b t c``.
            t: 1-D tensor of schedule positions; it is broadcast against the
                batch dimension of ``x``.

        Returns:
            The output of ``project_out`` applied to the final hidden states,
            laid out as ``(batch, time, channel, ...)``.
        """
        batch, t_input, n_channels, d_input = x.shape

        # Embed the input and add schedule and per-channel embeddings.
        x = self.input_layer(x)
        tz = self.schedule_embedding(t[:, None, None, None])
        pitch_z = self.pitch_embedding[None, None, :n_channels, :]
        x = x + tz + pitch_z

        rot = self.model.rotary_pos_emb.forward_from_seq_len(x.shape[1])

        # Fold channels into the batch axis so each channel is one sequence.
        conditioning = einops.rearrange(
            conditioning, 'b t c d -> (b c) t d', c=self.channels
        )
        conditioning_mask = einops.rearrange(
            conditioning_mask, 'b t c -> (b c) t', c=self.channels
        )

        for layer_idx, layer in enumerate(self.model.layers):
            x = einops.rearrange(x, 'b t c d -> (b c) t d')
            x = layer(
                x,
                rotary_pos_emb=rot,
                context=conditioning,
                context_mask=conditioning_mask,
            )
            x = einops.rearrange(x, '(b c) t d -> b t c d', c=self.channels)

            # Mix information across channels: add the projected channel mean
            # back to every channel.
            x_ch_mean = self.mean_proj[layer_idx](x.mean(dim=2))
            x += x_ch_mean[:, :, None, :]

        return self.output_layer(x)

    def step(self, batch, batch_idx):
        """Compute the flow-matching loss for one batch.

        ``batch["z"]`` is read in ``b c d t`` layout and ``batch["text"]`` is
        passed to the text conditioner. The target velocity is ``z1 - z0``
        for the linear path ``zt = t * z1 + (1 - t) * z0``.
        """
        x = batch["z"]
        text = batch["text"]
        conditioning, conditioning_mask = self._encode_text(text, self.device)

        x = einops.rearrange(x, 'b c d t -> b t c d')
        z0 = torch.randn(x.shape, device=x.device)
        z1 = x
        t = torch.rand(x.shape[0], device=x.device)
        t_b = t[:, None, None, None]
        zt = t_b * z1 + (1 - t_b) * z0

        vt = self(zt, conditioning, conditioning_mask, t)
        return (vt - (z1 - z0)).pow(2).mean()

    @torch.inference_mode()
    def sample(self, batch_size, text, steps=10, same_latent=False):
        """Generate latents by Euler integration of the predicted velocity.

        Args:
            batch_size: Number of samples to draw.
            text: Prompt(s) passed to the text conditioner.
            steps: Number of uniform integration steps over ``[0, 1)``.
            same_latent: If true, every sample starts from the same noise.

        Returns:
            The integrated latents, same shape as the initial noise
            ``(batch_size, t_input, n_channels, d_input)``.
        """
        # Ensure everything lives on the module's device and dtype.
        device = next(self.parameters()).device
        dtype = self.input_layer.weight.dtype

        conditioning, conditioning_mask = self._encode_text(text, device)
        conditioning = conditioning.to(device, dtype=dtype)
        conditioning_mask = conditioning_mask.to(device)

        # Sample in eval mode, then restore whatever mode the caller had.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                z0 = torch.randn(
                    batch_size,
                    self.hparams.t_input,
                    self.hparams.n_channels,
                    self.d_input,
                    device=device,
                    dtype=dtype,
                )
                if same_latent:
                    z0 = z0[0].repeat(batch_size, 1, 1, 1)

                zt = z0
                for step in tqdm(range(steps)):
                    t = torch.tensor([step / steps], device=device, dtype=dtype)
                    zt = zt + (1 / steps) * self.forward(
                        zt, conditioning, conditioning_mask, t
                    )
                return zt
        finally:
            self.train(was_training)

    def training_step(self, batch, batch_idx):
        loss = self.step(batch, batch_idx)
        self.log('trn_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.step(batch, batch_idx)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)


class EncodedAudioDataset(torch.utils.data.Dataset):
    """Dataset of pre-encoded records loaded from ``torch.save``-d lists.

    Each file in ``paths`` must load to an iterable of dict-like records.
    Only records containing a ``"z"`` entry are kept. Items are returned as
    ``{"z": ..., "text": ...}`` where ``z`` is sliced along its first axis to
    ``pitch_range`` and ``text`` is a randomly composed prompt.
    """

    def __init__(self, paths, pitch_range):
        records = []
        print("Loading data")
        for path in tqdm(paths):
            # NOTE: torch.load unpickles the file; only load trusted data.
            records += torch.load(path)
        self.pitch_range = pitch_range
        # keep only records with z
        self.records = [r for r in records if "z" in r]
        print(f"Loaded {len(self.records)} records")

    def compose_prompt(self, record):
        """Build a random lowercase prompt from a record's title and tags.

        A random subset of the shuffled tags is taken; with 75% probability a
        "head" (title, or one of title / type_group / type when both of the
        latter keys exist) is added; the elements are shuffled and joined
        with spaces.
        """
        title = record["name"] if "name" in record else record["title"]
        tags = record["tags"]

        # shuffle, then keep a random-length prefix (possibly empty)
        tags = np.random.choice(tags, len(tags), replace=False)
        tags = list(tags[:np.random.randint(0, len(tags) + 1)])

        # pick the head from whichever descriptive fields are available
        if "type_group" in record and "type" in record:
            type_group = record["type_group"]
            record_type = record["type"]
            head = np.random.choice([title, type_group, record_type])
        else:
            head = np.random.choice([title])

        # with 75% chance add head
        elements = tags
        if np.random.rand() < 0.75:
            elements = [head] + elements

        # shuffle elements
        elements = np.random.choice(elements, len(elements), replace=False)
        return " ".join(elements).lower()

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        record = self.records[idx]
        low, high = self.pitch_range[0], self.pitch_range[1]
        return {
            "z": record["z"][low:high],
            "text": self.compose_prompt(record),
        }

    def check_for_nans(self):
        """Raise ``ValueError`` if any record's ``z`` contains a NaN."""
        for r in self.records:
            if np.isnan(r["z"]).any():
                raise ValueError("Nan values in z")

    def get_z_shape(self):
        """Return the list of distinct ``z`` shapes found in the records."""
        return list({r["z"].shape for r in self.records})


if __name__ == "__main__":
    # set seed
    SEED = 0
    torch.manual_seed(SEED)

    BATCH_SIZE = 1
    LATENT_T = 86

    # initialize wandb logger
    wandb.init()
    logger = WandbLogger(project="synth_flow")
    # don't log models
    wandb.config.log_model = False

    DATASET = "dataset_a"

    # NOTE(review): the validation paths below point at the "artefacts/"
    # directory rather than a .pt file; the file name appears to be missing
    # and must be filled in before this script can run.
    if DATASET == "dataset_a":
        PITCH_RANGE = [2, 12]
        trn_paths = [f"artefacts/synth_data_{i}.pt" for i in range(9)]
        val_paths = [f"artefacts/"]
    elif DATASET == "dataset_b":
        PITCH_RANGE = [0, 10]
        trn_paths = [f"artefacts/synth_data_2_joined_{i}.pt" for i in range(3)]
        val_paths = [f"artefacts/"]
    else:
        raise ValueError(f"Unknown DATASET: {DATASET!r}")

    trn_ds = EncodedAudioDataset(trn_paths, PITCH_RANGE)
    trn_ds.check_for_nans()
    trn_dl = torch.utils.data.DataLoader(trn_ds, batch_size=BATCH_SIZE, shuffle=True)

    val_ds = EncodedAudioDataset(val_paths, PITCH_RANGE)
    val_ds.check_for_nans()
    val_dl = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True)

    src_model = get_pretrained_model("stabilityai/stable-audio-open-1.0")[0].to("cpu")
    src_model = src_model.to("cpu")

    transformer_model = src_model.model.model
    transformer_model = transformer_model.train()

    text_conditioner = src_model.conditioner.conditioners.prompt

    t5_version = "google-t5/t5-base"

    lr_callback = pl.callbacks.LearningRateMonitor(logging_interval='step')

    model = FlowMatchingModule(
        main_model=transformer_model,
        text_conditioner=text_conditioner,
        n_channels=PITCH_RANGE[1] - PITCH_RANGE[0],
        t_input=LATENT_T,
    )

    trainer = pl.Trainer(
        devices=[3],
        logger=logger,
        gradient_clip_val=1.0,
        callbacks=[lr_callback],
        max_epochs=1000,
        precision="16-mixed",
    )

    trainer.fit(
        model,
        trn_dl,
        val_dl,
        ckpt_path="synth_flow/9gzpz0i6/epoch=85-step=774000.ckpt",
    )

    # save checkpoint
    trainer.save_checkpoint("artefacts/model_finetuned_2.ckpt")