Use float16 for inference and float32 for training
Files changed:
- rct_diffusion_pipeline.py  +24 -27
- test_pipeline.py  +11 -83
- train_model.py  +60 -97
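
The change keeps the UNet, VAE and text encoder weights in float32 while training and drops them to float16 only around inference. As a minimal sketch of that pattern (the function and model here are illustrative placeholders, not code from this repo):

import torch
import torch.nn as nn

def sample_in_fp16(model: nn.Module, latents: torch.Tensor) -> torch.Tensor:
    # cast weights to float16 for a cheap, gradient-free forward pass
    model.half()
    with torch.no_grad():
        out = model(latents.half())
    # restore float32 so the next optimizer step accumulates in full precision
    model.float()
    return out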
rct_diffusion_pipeline.py
CHANGED
@@ -30,7 +30,7 @@ class RCTDiffusionPipeline(DiffusionPipeline):
        self.text_tokenizer = text_tokenizer

        # channels for 1 image
-        self.num_channels = int(self.unet.config.in_channels
+        self.num_channels = int(self.unet.config.in_channels)
        self.load_dictionaries_from_dataset()
        self.register_modules(unet=unet, scheduler=scheduler, vae=vae, text_tokenizer=text_tokenizer, text_encoder=text_encoder)

@@ -171,13 +171,12 @@ class RCTDiffusionPipeline(DiffusionPipeline):
        return self.pack_labels_to_tensor(batch_size, object_descriptions, colors1, colors2, colors3).to(device='cuda',dtype=torch.float16)

    def generate_noise_batches(self, batch_size):
-        noise_batches = torch.Tensor(size=(batch_size,
+        noise_batches = torch.Tensor(size=(batch_size, self.num_channels, self.latent_size, self.latent_size)).to(dtype=torch.float16, device='cuda')
        for batch_index in range(batch_size):
-
-
-            noise_batches[batch_index, view_index] = noise
+            noise = torch.randn(self.num_channels, self.latent_size, self.latent_size).to(dtype=torch.float16, device='cuda')
+            noise_batches[batch_index] = noise

-        return torch.reshape(noise_batches, (batch_size,
+        return torch.reshape(noise_batches, (batch_size, self.num_channels, self.latent_size, self.latent_size)).to(dtype=torch.float16, device='cuda')

    def test_generate_embeddings(self, object_description, color1, color2, color3) -> torch.Tensor:
        batch_size = len(object_description)

@@ -190,7 +189,7 @@ class RCTDiffusionPipeline(DiffusionPipeline):
            with torch.no_grad():
                embeddings[batch_index] = self.text_encoder(tokens.input_ids.to('cuda'))[0]

-        return embeddings
+        return embeddings

    def generate_embeddings(self, object_description, color1, color2, color3) -> torch.Tensor:
        batch_size = len(object_description)

@@ -244,11 +243,11 @@ class RCTDiffusionPipeline(DiffusionPipeline):
        if res == False:
            return None
        embeddings = self.test_generate_embeddings(object_description, color1, color2, color3)
-        embeddings = embeddings.to('cuda')
+        embeddings = embeddings.to(device='cuda', dtype=torch.float16)

        # set the inference steps
        self.scheduler.set_timesteps(num_inference_steps)
-        noise_batches = self.generate_noise_batches(batch_size)
+        noise_batches = self.generate_noise_batches(batch_size).to(dtype=torch.float16)

        # now call the model for the n interations
        progress_bar = tqdm(total=num_inference_steps)

@@ -257,36 +256,34 @@ class RCTDiffusionPipeline(DiffusionPipeline):
            progress_bar.set_description(f'Inference step {epoch}')

            for batch_index in range(batch_size):
-
+                noise_batch = self.scheduler.scale_model_input(noise_batches, timestep=t)
                with torch.no_grad():
-                    noise_residual = self.unet(
-                    previous_noisy_sample = self.scheduler.step(noise_residual, t,
+                    noise_residual = self.unet(noise_batch, t, encoder_hidden_states=embeddings).sample
+                    previous_noisy_sample = self.scheduler.step(noise_residual, t, noise_batch).prev_sample
                noise_batches[batch_index] = previous_noisy_sample
            progress_bar.update(1)
            epoch = epoch + 1

        # reshape the data so we get back 4 RGB images
-        noise_batches = torch.reshape(noise_batches, (batch_size,
-
+        noise_batches = torch.reshape(noise_batches, (batch_size, self.num_channels, self.latent_size, self.latent_size))
+        noise_batches = noise_batches.to('cuda')
+        images = torch.Tensor(size=(batch_size, 3, self.sample_size, self.sample_size)).to('cuda')

        with torch.no_grad():
-
-
-
-            images[:, image_index] = result
+            image = noise_batches
+            result = self.vae.decode(image).sample
+            images = result

        # convert those tensors to PIL images
        output_images = []
        for batch_index in range(batch_size):
-
-
-
-
-
-
-
-            image.save(f'test{image_index}.png')
-            output_images.append(image)
+            image = images[batch_index]
+            image = (image / 2 + 0.5).clamp(0, 1).squeeze()
+            image = (image.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
+            image = (image * 255).round().astype("uint8")
+            image = Image.fromarray(image)
+            image.save(f'test{batch_index}.png')
+            output_images.append(image)

        # for now just return the images
        return output_images
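
For reference, the usual diffusers recipe for turning decoded latents into PIL images looks roughly like the sketch below. It assumes an AutoencoderKL `vae` and float16 latents shaped (batch, 4, 32, 32); the names are illustrative, not the pipeline's exact code:

import torch
from PIL import Image

@torch.no_grad()
def latents_to_pil(vae, latents):
    # SD-style VAEs are trained on latents scaled by 0.18215, so undo that before decoding
    images = vae.decode(latents / 0.18215).sample
    # map from [-1, 1] to [0, 1], then to uint8 HWC arrays for PIL
    images = (images / 2 + 0.5).clamp(0, 1)
    images = (images.permute(0, 2, 3, 1) * 255).round().to(torch.uint8).cpu().numpy()
    return [Image.fromarray(img) for img in images]

Note that the scaling to 255 happens exactly once here; scaling twice, as in the loop above, saturates every pixel.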
test_pipeline.py
CHANGED
@@ -2,6 +2,7 @@ from rct_diffusion_pipeline import RCTDiffusionPipeline
from diffusers import UNet2DConditionModel, DDPMScheduler, AutoencoderKL
import torch
from transformers import CLIPTextModel, CLIPTokenizer
+import torch.nn as nn

torch_device = "cuda"

@@ -22,11 +23,17 @@ test2 = tokenizer('dark green', padding="max_length", max_length=tokenizer.model
with torch.no_grad():
    test2 = text_encoder(test2.input_ids.to('cuda'))[0]

-unet = UNet2DConditionModel(sample_size=32, in_channels=
-    down_block_types=(
-    up_block_types=(
-    block_out_channels=(
+unet = UNet2DConditionModel(sample_size=32, in_channels=4, out_channels=4, \
+    down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D", "DownBlock2D"),\
+    up_block_types=("UpBlock2D","CrossAttnUpBlock2D","CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), cross_attention_dim=768,
+    block_out_channels=(320, 640, 1280, 1280), norm_num_groups=32)
unet = unet.to('cuda', dtype=torch.float16)
+
+# put float32 for the accumulation
+for layer in unet.modules():
+    if isinstance(layer, nn.BatchNorm2d):
+        layer.float()
+
scheduler = DDPMScheduler(num_train_timesteps=20)
vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", use_safetensors=True)
vae = vae.to('cuda', dtype=torch.float16)

@@ -34,83 +41,4 @@ vae = vae.to('cuda', dtype=torch.float16)
pipeline = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
output = pipeline(['aleppo pine tree'], ['dark green'])
pipeline.save_pretrained('test')
-
-# from PIL import Image
-# import torch
-# from transformers import CLIPTextModel, CLIPTokenizer
-# from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
-
-# vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", use_safetensors=True)
-# tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
-# text_encoder = CLIPTextModel.from_pretrained(
-#     "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
-# )
-# unet = UNet2DConditionModel.from_pretrained(
-#     "CompVis/stable-diffusion-v1-4", subfolder="unet", use_safetensors=True
-# )
-
-# from diffusers import UniPCMultistepScheduler
-
-# scheduler = UniPCMultistepScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
-# torch_device = "cuda"
-# vae.to(torch_device)
-# text_encoder.to(torch_device)
-# unet.to(torch_device)
-
-# prompt = ["a photograph of an astronaut riding a horse"]
-# height = 512  # default height of Stable Diffusion
-# width = 512  # default width of Stable Diffusion
-# num_inference_steps = 25  # Number of denoising steps
-# guidance_scale = 7.5  # Scale for classifier-free guidance
-# generator = torch.manual_seed(0)  # Seed generator to create the inital latent noise
-# batch_size = len(prompt)
-
-# text_input = tokenizer(
-#     prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
-# )
-
-# with torch.no_grad():
-#     text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
-
-# text_input = tokenizer(
-#     prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
-# )
-
-# with torch.no_grad():
-#     text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
-
-# max_length = text_input.input_ids.shape[-1]
-# uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
-# uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
-
-# text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
-
-# latents = torch.randn(
-#     (batch_size, unet.in_channels, height // 8, width // 8),
-#     generator=generator,
-# )
-# latents = latents.to(torch_device)
-
-# latents = latents * scheduler.init_noise_sigma
-
-# from tqdm.auto import tqdm
-
-# scheduler.set_timesteps(num_inference_steps)
-
-# for t in tqdm(scheduler.timesteps):
-#     # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
-#     latent_model_input = torch.cat([latents] * 2)
-
-#     latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
-
-#     # predict the noise residual
-#     with torch.no_grad():
-#         noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
-
-#     # perform guidance
-#     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-#     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-#     # compute the previous noisy sample x_t -> x_t-1
-#     latents = scheduler.step(noise_pred, t, latents).prev_sample
print('test')
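
The `# put float32 for the accumulation` loop follows the usual half-precision advice (see the discuss.pytorch.org thread referenced in train_model.py below): keep normalization layers in float32 so their reductions do not overflow. One caveat: UNet2DConditionModel is built from GroupNorm blocks (hence the `norm_num_groups` argument), so a check on `nn.BatchNorm2d` alone may match nothing. A broader version of the same idea, as a hedged sketch rather than the script's code:

import torch.nn as nn

def keep_norms_in_float32(model: nn.Module) -> None:
    # run normalization layers in float32 even when the rest of the model is float16
    norm_types = (nn.BatchNorm2d, nn.GroupNorm, nn.LayerNorm)
    for module in model.modules():
        if isinstance(module, norm_types):
            module.float()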
train_model.py
CHANGED
@@ -12,11 +12,13 @@ from tqdm.auto import tqdm
from accelerate import Accelerator
from diffusers import DDPMScheduler, UNet2DConditionModel, AutoencoderKL
from transformers import CLIPTextModel, CLIPTokenizer
+import torch.nn as nn

SAMPLE_SIZE = 256
LATENT_SIZE = 32
SAMPLE_NUM_CHANNELS = 3
LATENT_NUM_CHANNELS = 4
+from torchvision import transforms

def save_and_test(pipeline, epoch):
    outputs = pipeline(['aleppo pine tree'], ['dark green'])

@@ -28,42 +30,22 @@ def save_and_test(pipeline, epoch):
    pipeline.save_pretrained(model_file)

def convert_images(dataset):
-
-
-
-
-
-
-
-
-    # convert those images to 256x256 by cropping and scaling up the image
-    image_views = []
-    for view_index in range(4):
-        images = []
-        for entry in views[view_index]:
-            image = entry['image']
-
-            scale_factor = np.minimum(LATENT_SIZE / image.width, LATENT_SIZE / image.height)
-            image = Image.resize(image, size=(int(scale_factor * image.width), int(scale_factor * image.height)), resample=Resampling.NEAREST)
-
-            new_image = PIL.Image.new('RGBA', (LATENT_SIZE, LATENT_SIZE))
-            new_image.paste(image, box=(int((LATENT_SIZE - image.width)/2), int((LATENT_SIZE - image.height)/2)))
-            images.append(new_image)
-        image_views.append(images)
-
-    del views
+    preprocess = transforms.Compose(
+        [
+            transforms.Resize((LATENT_SIZE, LATENT_SIZE)),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )

-
-
-
+    images = [preprocess(image.convert("RGBA")) for image in dataset["image"]]
+    object_descriptions = [obj_desc for obj_desc in dataset["object_description"]]
+    colors1 = [color1 for color1 in dataset['color1']]
+    colors2 = [color1 for color1 in dataset['color2']]
+    colors3 = [color1 for color1 in dataset['color3']]

-
-
-            targets[image_index, view_index] = pillow_to_tensor(image_views[view_index][image_index]).to(dtype=torch.float16)
-    del image_views
-    del entries
-
-    return torch.reshape(targets, (num_images, 4 * LATENT_NUM_CHANNELS, LATENT_SIZE, LATENT_SIZE))
+    return {"image": images, 'object_description':object_descriptions, 'color1':colors1, \
+        'color2':colors2, 'color3':colors3}

def convert_labels(dataset, model, num_images):
    # get the labels

@@ -97,115 +79,96 @@ def convert_labels(dataset, model, num_images):
    del dataset
    return class_labels.to(dtype=torch.float16, device='cuda')

-def
-
-
+def create_embeddings(dataset, model):
+    object_descriptions = dataset['object_description']
+    colors1 = dataset['color1']
+    colors2 = dataset['color2']
+    colors3 = dataset['color3']
+    return model.test_generate_embeddings(object_descriptions, colors1, colors2, colors3)
+

-
+def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timesteps=20, save_model_interval=10, start_learning_rate=1e-3, lr_warmup_steps=1):
+    dataset = load_dataset('frutiemax/rct_dataset', split=f'train[0:{total_images}]')
+    dataset.set_transform(convert_images)
    num_images = int(dataset.num_rows / 4) if total_images == None else int(total_images / 4)

-    unet = UNet2DConditionModel(sample_size=LATENT_SIZE, in_channels=LATENT_NUM_CHANNELS
+    unet = UNet2DConditionModel(sample_size=LATENT_SIZE, in_channels=LATENT_NUM_CHANNELS, out_channels=LATENT_NUM_CHANNELS, \
        down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D", "DownBlock2D"),\
        up_block_types=("UpBlock2D","CrossAttnUpBlock2D","CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), cross_attention_dim=768,
-        block_out_channels=(
-    unet = unet.to(dtype=torch.
+        block_out_channels=(128, 256, 512, 512), norm_num_groups=32)
+    unet = unet.to(dtype=torch.float32)
+
+    #https://discuss.pytorch.org/t/training-with-half-precision/11815
+    for layer in unet.modules():
+        if isinstance(layer, nn.BatchNorm2d):
+            layer.float()
+
    scheduler = DDPMScheduler(num_train_timesteps=scheduler_num_timesteps)
    tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
    text_encoder = CLIPTextModel.from_pretrained(
        "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
    ).to('cuda')
    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", use_safetensors=True)
-    vae = vae.to(dtype=torch.float16)
+    vae = vae.to(dtype=torch.float16, device='cuda')

-    optimizer = torch.optim.
+    optimizer = torch.optim.AdamW(unet.parameters(), lr=start_learning_rate)
    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=lr_warmup_steps,
        num_training_steps=num_images * epochs
    )
    model = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
-
-
-
-    colors1 = dataset['color1']
-    colors2 = dataset['color2']
-    colors3 = dataset['color3']
-
-    # we only need 1 of the 4 views
-    object_descriptions = [object_descriptions[desc_index] for desc_index in range(0, len(object_descriptions), 4)]
-    colors1 = [colors1[desc_index] for desc_index in range(0, len(colors1), 4)]
-    colors2 = [colors2[desc_index] for desc_index in range(0, len(colors2), 4)]
-    colors3 = [colors3[desc_index] for desc_index in range(0, len(colors3), 4)]
-    #embeddings = model.generate_embeddings(object_descriptions, colors1, colors2, colors3)
-    embeddings = model.test_generate_embeddings(object_descriptions, colors1, colors2, colors3)
-
-    labels = convert_labels(dataset, model, num_images)
-    del model
-
-    if total_images != None:
-        targets = targets[:int(total_images/4)]
-        label_indices = [index for index in range(0, total_images, 4)]
-        labels = labels[label_indices]
+    unet = unet.to('cuda')
+
+    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # lets train for 100 epoch for each sprite in the dataset with a random noise level
    progress_bar = tqdm(total=epochs)
-    accelerator = Accelerator(mixed_precision='fp16')
-    accelerator.clip_grad_norm_(unet.parameters(), 1.0)
-    unet, scheduler, lr_scheduler, vae = accelerator.prepare(unet, scheduler, lr_scheduler, vae)

    loss_fn = torch.nn.MSELoss()

    tensor_to_pillow = T.ToPILImage()
    for epoch in range(epochs):
        # create a noisy version of each sprite
-        for
-
-
-
-
+        for step, batch in enumerate(train_dataloader):
+            clean_images = batch['image']
+            batch_size = clean_images.size(0)
+            embeddings = create_embeddings(batch, model)
+            clean_images = torch.reshape(clean_images, (batch['image'].size(0), LATENT_NUM_CHANNELS, LATENT_SIZE, LATENT_SIZE)).\
+                to(device='cuda')

-            noise = torch.randn(clean_images.shape, dtype=torch.
-            timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (
+            noise = torch.randn(clean_images.shape, dtype=torch.float32, device='cuda')
+            timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (batch_size, )).to(device='cuda')

            #timesteps = timesteps.to(dtype=torch.int, device='cuda')
            noisy_images = scheduler.add_noise(clean_images, noise, timesteps)
-
-            # with accelerator.accumulate(unet):
-            # assert not torch.any(torch.isnan(timesteps))
-
-            # batch_embeddings = embeddings[batch_index:batch_end]
-            # batch_embeddings = batch_embeddings.to('cuda')
-
-            # optimizer.zero_grad()
-            # unet_results = unet(noisy_images, timesteps, batch_embeddings).sample
-            # unet_results = unet_results.to(dtype=torch.float16)
-
-            # loss = loss_fn(unet_results, noise)
-            # accelerator.backward(loss)
-
-            # optimizer.step()
-            # lr_scheduler.step()
-            # optimizer.zero_grad()

-            batch_embeddings = embeddings
+            batch_embeddings = embeddings
            batch_embeddings = batch_embeddings.to('cuda')

            optimizer.zero_grad()
            unet_results = unet(noisy_images, timesteps, batch_embeddings).sample
-            unet_results = unet_results.to(dtype=torch.float16)
            loss = loss_fn(unet_results, noise)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

-            progress_bar.set_description(f'epoch={epoch}, batch_index={
+            progress_bar.set_description(f'epoch={epoch}, batch_index={step}, last_loss={loss.item()}')

        if (epoch + 1) % save_model_interval == 0:
-
+            # inference in float16
+            model = RCTDiffusionPipeline(unet.to(dtype=torch.float16), scheduler, \
+                vae.to(dtype=torch.float16), tokenizer, text_encoder.to(dtype=torch.float16))
            save_and_test(model, epoch)
+
+            # training in float32
+            unet.to(dtype=torch.float32)
+            vae.to(dtype=torch.float32)
+            text_encoder.to(dtype=torch.float32)
+
        progress_bar.update(1)


if __name__ == '__main__':
-    train_model(1, total_images=4, save_model_interval=
+    train_model(1, total_images=4, save_model_interval=100, epochs=1000)