Use VAE to speed up inference
- rct_diffusion_pipeline.py +43 -37
- test_pipeline.py +11 -3
- train_model.py +63 -33
rct_diffusion_pipeline.py
CHANGED

@@ -4,6 +4,7 @@ from diffusers import DDPMScheduler, UNet2DConditionModel
 import torch
 import torchvision.transforms as T
 from PIL import Image
+import PIL.Image
 from transformers import AutoTokenizer
 from datasets import load_dataset
 import numpy as np
@@ -11,13 +12,7 @@ import pandas as pd
 from tqdm.auto import tqdm
 
 class RCTDiffusionPipeline(DiffusionPipeline):
-    def get_default_unet(hidden_dim):
-        return UNet2DConditionModel(sample_size=256, in_channels=12, out_channels=12, \
-            down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),\
-            up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'), cross_attention_dim=hidden_dim,
-            block_out_channels=(64, 128, 256), norm_num_groups=32)
-
-    def __init__(self):
+    def __init__(self, unet, scheduler, vae):
         super().__init__()
 
         # dictionary that keeps the different classes of object description, color1, color2 and color3
@@ -25,16 +20,13 @@ class RCTDiffusionPipeline(DiffusionPipeline):
         self.color1_dict = {}
         self.color2_dict = {}
         self.color3_dict = {}
-        self.load_dictionaries_from_dataset()
 
-        self.scheduler = None
-        self.unet = None
-
-    def set_unet(self, unet):
-        self.unet = unet
-
-    def set_scheduler(self, scheduler):
         self.scheduler = scheduler
+        self.unet = unet
+        self.vae = vae
+
+        # channels for 1 image
+        self.num_channels = int(self.unet.config.in_channels / 4)
 
     def load_dictionaries_from_dataset(self):
         dataset = load_dataset('frutiemax/rct_dataset')
@@ -127,11 +119,10 @@ class RCTDiffusionPipeline(DiffusionPipeline):
 
         class_labels = torch.reshape(class_labels, (num_images, 1, self.get_class_labels_size()))
         return class_labels
 
-    def __call__(self, object_description : list[list[tuple[str, float]]], color1 : list[list[tuple[str, float]]], \
+    def get_class_labels(self, object_description : list[list[tuple[str, float]]], color1 : list[list[tuple[str, float]]], \
         color2 : list[list[tuple[str, float]]] = None, color3 : list[list[tuple[str, float]]] = None, \
-        batch_size=1, num_inference_steps=20, generator=torch.manual_seed(torch.random.seed())):
-
+        batch_size=1):
         # check if the labels are the correct size
         if len(object_description) != batch_size:
             return None
@@ -171,25 +162,29 @@ class RCTDiffusionPipeline(DiffusionPipeline):
         colors3.append(c3)
 
         # now put those weights into a tensor
-        class_labels = self.pack_labels_to_tensor(batch_size, object_descriptions, colors1, colors2, colors3)
-
-        #class_labels = new_class_labels.to(device='cuda', dtype=torch.float16)
-        #del new_class_labels
-
-        # set the inference steps
-        self.scheduler.set_timesteps(num_inference_steps)
-
-        noise_batches = torch.Tensor(size=(batch_size, 4, 3, 256, 256)).to(dtype=torch.float16, device='cuda')
+        return self.pack_labels_to_tensor(batch_size, object_descriptions, colors1, colors2, colors3).to(device='cuda', dtype=torch.float16)
+
+    # generate 64x64 latents
+    def generate_noise_batches(self, batch_size):
+        noise_batches = torch.Tensor(size=(batch_size, 4, self.num_channels, 64, 64)).to(dtype=torch.float16, device='cuda')
         for batch_index in range(batch_size):
             for view_index in range(4):
-                noise = torch.randn(3, 256, 256).to(dtype=torch.float16, device='cuda')
+                noise = torch.randn(self.num_channels, 64, 64).to(dtype=torch.float16, device='cuda')
                 noise_batches[batch_index, view_index] = noise
 
+        return torch.reshape(noise_batches, (batch_size, 1, self.num_channels*4, 64, 64)).to(dtype=torch.float16, device='cuda')
+
+    def __call__(self, object_description : list[list[tuple[str, float]]], color1 : list[list[tuple[str, float]]], \
+        color2 : list[list[tuple[str, float]]] = None, color3 : list[list[tuple[str, float]]] = None, \
+        batch_size=1, num_inference_steps=20, generator=torch.manual_seed(torch.random.seed())):
+
+        class_labels = self.get_class_labels(object_description, color1, color2, color3, batch_size)
+        if class_labels is None:
+            return None
+        class_labels = class_labels.to(device='cuda', dtype=torch.float16)
+
+        # set the inference steps
+        self.scheduler.set_timesteps(num_inference_steps)
+        noise_batches = self.generate_noise_batches(batch_size)
 
         # now call the model for the n iterations
         progress_bar = tqdm(total=num_inference_steps)
@@ -206,15 +201,26 @@ class RCTDiffusionPipeline(DiffusionPipeline):
             epoch = epoch + 1
 
         # reshape the data so we get back 4 RGB images
-        noise_batches = torch.reshape(noise_batches, (batch_size, 4, 3, 256, 256))
+        noise_batches = torch.reshape(noise_batches, (batch_size, 4, self.num_channels, 64, 64))
+        images = torch.Tensor(size=(batch_size, 4, 3, 512, 512))
+
+        # run the latents through the vae decoder
+        with torch.no_grad():
+            for image_index in range(4):
+                image = noise_batches[:, image_index]
+                result = self.vae.decode(image).sample
+                images[:, image_index] = result
 
         # convert those tensors to PIL images
         output_images = []
-        tensor_to_pil = T.ToPILImage('RGB')
-
         for batch_index in range(batch_size):
             for image_index in range(4):
-                output_images.append(tensor_to_pil(noise_batches[batch_index, image_index]))
+                # scale from [-1, 1] to 8-bit RGB
+                image = images[batch_index, image_index]
+                image = (image / 2 + 0.5).clamp(0, 1).squeeze()
+                image = (image.permute(1, 2, 0) * 255).round().to(torch.uint8).cpu().numpy()
+                image = Image.fromarray(image)
+                output_images.append(image)
 
         # for now just return the images
        return output_images
test_pipeline.py
CHANGED

@@ -1,11 +1,19 @@
 from rct_diffusion_pipeline import RCTDiffusionPipeline
-from diffusers import UNet2DConditionModel
+from diffusers import UNet2DConditionModel, DDPMScheduler, AutoencoderKL
 
 
 torch_device = "cuda"
 
-pipeline = RCTDiffusionPipeline()
+unet = UNet2DConditionModel(sample_size=64, in_channels=16, out_channels=16, \
+    down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),\
+    up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'), cross_attention_dim=160,
+    block_out_channels=(64, 128, 256), norm_num_groups=32)
+scheduler = DDPMScheduler(num_train_timesteps=20)
+vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae", use_safetensors=True)
+vae.tile_sample_min_size = 256
+
+pipeline = RCTDiffusionPipeline(unet, scheduler, vae)
 output = pipeline([[('aleppo pine tree', 1.0)]], [[('dark green', 1.0)]])
 
 # from PIL import Image
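The test now wires the three components together by hand. The UNet gets in_channels=16 because the four sprite views are concatenated along the channel axis: 4 views x 4 latent channels, where the old RGB-space model used 4 views x 3 channels = 12 at 256x256. A quick shape check of that layout, as a sketch; the random encoder_hidden_states tensor below is only a stand-in for the packed class labels:

import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel(sample_size=64, in_channels=16, out_channels=16,
    down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),
    up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'),
    cross_attention_dim=160, block_out_channels=(64, 128, 256), norm_num_groups=32)

sample = torch.randn(1, 16, 64, 64)  # 4 views x 4 latent channels
labels = torch.randn(1, 1, 160)      # stand-in for pack_labels_to_tensor output
noise_pred = unet(sample, timestep=0, encoder_hidden_states=labels).sample
assert noise_pred.shape == sample.shape  # the UNet predicts noise of the input shape

One detail worth flagging: setting vae.tile_sample_min_size only takes effect once tiled decoding is enabled on the VAE (via vae.enable_tiling()); as written, the attribute is set but tiling stays off.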
train_model.py
CHANGED

@@ -10,7 +10,12 @@ import torch.nn.functional as F
 from diffusers.optimization import get_cosine_schedule_with_warmup
 from tqdm.auto import tqdm
 from accelerate import Accelerator
-from diffusers import DDPMScheduler, UNet2DConditionModel
+from diffusers import DDPMScheduler, UNet2DConditionModel, AutoencoderKL
+
+SAMPLE_SIZE = 512
+LATENT_SIZE = 64
+SAMPLE_NUM_CHANNELS = 3
+LATENT_NUM_CHANNELS = 4
 
 def save_and_test(pipeline, epoch):
     outputs = pipeline([[('aleppo pine tree', 1.0)]], [[('dark green', 1.0)]])
@@ -21,14 +26,10 @@ def save_and_test(pipeline, epoch):
     model_file = f'rct_foliage_{epoch}.pth'
     pipeline.save_pretrained(model_file)
 
-def train_model(batch_size=4, epochs=100, scheduler_num_timesteps=20, save_model_interval=10, start_learning_rate=1e-3, lr_warmup_steps=500):
-    dataset = load_dataset('frutiemax/rct_dataset')
-    dataset = dataset['train']
-
-    num_images = int(dataset.num_rows / 4)
-
-    # let's get all the entries for the 4 views split in four lists
+def convert_images(dataset):
+    # let's get all the entries for the 4 views split in four lists
     views = []
+    num_images = int(dataset.num_rows / 4)
 
     for view_index in range(4):
         entries = [entry for entry in dataset if entry['view'] == view_index]
@@ -41,18 +42,18 @@ def train_model(batch_size=4, epochs=100, scheduler_num_timesteps=20, save_model
         for entry in views[view_index]:
             image = entry['image']
 
-            scale_factor = int(np.minimum(256 / image.width, 256 / image.height))
+            scale_factor = int(np.minimum(SAMPLE_SIZE / image.width, SAMPLE_SIZE / image.height))
             image = Image.resize(image, size=(scale_factor * image.width, scale_factor * image.height), resample=Resampling.NEAREST)
 
-            new_image = PIL.Image.new('RGB', (256, 256))
-            new_image.paste(image, box=(int((256 - image.width)/2), int((256 - image.height)/2)))
+            new_image = PIL.Image.new('RGB', (SAMPLE_SIZE, SAMPLE_SIZE))
+            new_image.paste(image, box=(int((SAMPLE_SIZE - image.width)/2), int((SAMPLE_SIZE - image.height)/2)))
             images.append(new_image)
         image_views.append(images)
 
     del views
 
     # convert those views into tensors
-    targets = torch.Tensor(size=(num_images, 4, 3, 256, 256))
+    targets = torch.Tensor(size=(num_images, 4, SAMPLE_NUM_CHANNELS, SAMPLE_SIZE, SAMPLE_SIZE)).to(dtype=torch.float16)
     pillow_to_tensor = T.ToTensor()
 
     for image_index in range(num_images):
@@ -61,8 +62,9 @@ def train_model(batch_size=4, epochs=100, scheduler_num_timesteps=20, save_model
     del image_views
     del entries
 
+    return torch.reshape(targets, (num_images, 4 * SAMPLE_NUM_CHANNELS, SAMPLE_SIZE, SAMPLE_SIZE))
 
+def convert_labels(dataset, model, num_images):
     # get the labels
     view0_entries = [row for row in dataset if row['view'] == 0]
     obj_descriptions = [row['object_description'] for row in view0_entries]
@@ -79,7 +81,7 @@ def train_model(batch_size=4, epochs=100, scheduler_num_timesteps=20, save_model
     colors3 = [[(color3, 1.0)] for color3 in colors3]
 
     # convert those tuples into numpy arrays using the helper function of the model
-    model = RCTDiffusionPipeline()
+
     obj_descriptions = [model.get_object_description_weights(obj_desc) for obj_desc in obj_descriptions]
     colors1 = [model.get_color1_weights(color1) for color1 in colors1]
     colors2 = [model.get_color2_weights(color2) for color2 in colors2]
@@ -92,51 +94,79 @@ def train_model(batch_size=4, epochs=100, scheduler_num_timesteps=20, save_model
     del colors2
     del colors3
     del dataset
+    return class_labels.to(dtype=torch.float16, device='cuda')
+
+def train_model(batch_size=4, epochs=100, scheduler_num_timesteps=20, save_model_interval=10, start_learning_rate=1e-3, lr_warmup_steps=500):
+    dataset = load_dataset('frutiemax/rct_dataset')
+    dataset = dataset['train']
+
+    targets = convert_images(dataset)
+    num_images = int(dataset.num_rows / 4)
 
-    unet = RCTDiffusionPipeline.get_default_unet(160)
+    unet = UNet2DConditionModel(sample_size=LATENT_SIZE, in_channels=LATENT_NUM_CHANNELS * 4, out_channels=LATENT_NUM_CHANNELS * 4, \
+        down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),\
+        up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'), cross_attention_dim=160,
+        block_out_channels=(64, 128, 256), norm_num_groups=32)
+    unet = unet.to(dtype=torch.float16)
+    scheduler = DDPMScheduler(num_train_timesteps=20)
+    vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae", use_safetensors=True, variant='fp16')
+    vae = vae.to(dtype=torch.float16)
+
     optimizer = torch.optim.Adam(unet.parameters(), lr=start_learning_rate)
     lr_scheduler = get_cosine_schedule_with_warmup(
         optimizer=optimizer,
         num_warmup_steps=lr_warmup_steps,
         num_training_steps=num_images * epochs
     )
+    model = RCTDiffusionPipeline(unet, scheduler, vae)
+    model.load_dictionaries_from_dataset()
+    labels = convert_labels(dataset, model, num_images)
 
     # let's train for 100 epochs for each sprite in the dataset with a random noise level
     progress_bar = tqdm(total=epochs)
+    accelerator = Accelerator(mixed_precision='fp16')
+    unet, scheduler, lr_scheduler, vae = accelerator.prepare(unet, scheduler, lr_scheduler, vae)
 
-    scheduler = DDPMScheduler(scheduler_num_timesteps)
-    unet = unet.to(device='cuda', dtype=torch.float16)
-    scheduler.set_timesteps(scheduler_num_timesteps)
-
     for epoch in range(epochs):
         # create a noisy version of each sprite
         for batch_index in range(0, num_images, batch_size):
             progress_bar.set_description(f'epoch={epoch}, batch_index={batch_index}')
             batch_end = np.minimum(num_images, batch_index + batch_size)
             clean_images = targets[batch_index:batch_end]
-            clean_images = torch.reshape(clean_images, ((batch_end - batch_index), 12, 256, 256)).to(device='cuda', dtype=torch.float16)
+            clean_images = torch.reshape(clean_images, ((batch_end - batch_index), SAMPLE_NUM_CHANNELS * 4, SAMPLE_SIZE, SAMPLE_SIZE)).to(device='cuda', dtype=torch.float16)
 
             noise = torch.randn(clean_images.shape, dtype=torch.float16, device='cuda')
             timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (batch_end - batch_index, )).to(device='cuda')
             #timesteps = timesteps.to(dtype=torch.int, device='cuda')
             noisy_images = scheduler.add_noise(clean_images, noise, timesteps)
+
+            # encode through the vae
+            with accelerator.accumulate(unet):
+                latent_images = torch.Tensor(size=(batch_end - batch_index, LATENT_NUM_CHANNELS * 4, LATENT_SIZE, LATENT_SIZE)).to(device='cuda', dtype=torch.float16)
+                latent_noises = torch.Tensor(size=(batch_end - batch_index, LATENT_NUM_CHANNELS * 4, LATENT_SIZE, LATENT_SIZE)).to(device='cuda', dtype=torch.float16)
+                for view_index in range(4):
+                    images = noisy_images[:, view_index*SAMPLE_NUM_CHANNELS:(view_index+1)*SAMPLE_NUM_CHANNELS]
+                    result = vae.encode(images).latent_dist.sample()
+                    latent_images[:, view_index*LATENT_NUM_CHANNELS:(view_index+1)*LATENT_NUM_CHANNELS] = result
+
+                    images = noise[:, view_index*SAMPLE_NUM_CHANNELS:(view_index+1)*SAMPLE_NUM_CHANNELS]
+                    result = vae.encode(images).latent_dist.sample()
+                    latent_noises[:, view_index*LATENT_NUM_CHANNELS:(view_index+1)*LATENT_NUM_CHANNELS] = result
+
+                unet_results = unet(latent_images, timesteps, labels[batch_index:batch_end])[0]
+                unet_results = unet_results.to(dtype=torch.float16)
+
+                loss = F.mse_loss(unet_results, latent_noises)
+                accelerator.backward(loss)
+                optimizer.step()
+                lr_scheduler.step()
+                optimizer.zero_grad()
 
         if (epoch + 1) % save_model_interval == 0:
-            model.unet = unet
-            model.scheduler = scheduler
+            model = RCTDiffusionPipeline(accelerator.unwrap_model(unet), scheduler, vae)
             save_and_test(model, epoch)
-            del model.unet
-            del model.scheduler
         progress_bar.update(1)
 
 
 if __name__ == '__main__':
-    train_model(
+    train_model(1, save_model_interval=1)
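The training loop now matches the pipeline: each 512x512 RGB view (and its pixel-space noise target) is pushed through the frozen VAE encoder to a 4-channel 64x64 latent, so the UNet trains on 16x64x64 stacks instead of 12x256x256 ones, and only unet.parameters() are optimized. A minimal sketch of that per-view encoding, under the same SD v1.5 VAE assumption (the encoder downsamples 8x per side):

import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='vae')

# a batch of two images for one view; the VAE expects values roughly in [-1, 1]
images = torch.randn(2, 3, 512, 512)
with torch.no_grad():
    latents = vae.encode(images).latent_dist.sample()

assert latents.shape == (2, 4, 64, 64)  # 8x smaller per side, 4 latent channels

Since the VAE is frozen, the clean targets could in principle be encoded once before the loop; the code above re-encodes every batch because the noise is added in pixel space before encoding.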