Use ConditionalUnetModel
- rct_diffusion_pipeline.py +24 -13
- test_pipeline.py +80 -0
- train_model.py +4 -4
rct_diffusion_pipeline.py
CHANGED
@@ -1,6 +1,6 @@
 from diffusers import DiffusionPipeline
 from diffusers import DDPMPipeline
-from diffusers import DDPMScheduler,
+from diffusers import DDPMScheduler, UNet2DConditionModel
 import torch
 import torchvision.transforms as T
 from PIL import Image
@@ -24,11 +24,13 @@ class RCTDiffusionPipeline(DiffusionPipeline):
         self.scheduler = DDPMScheduler()

         # the number of hidden features is dependant on the loaded dictionaries!
-
-
-
+        hidden_dim = self.get_class_labels_size()
+        self.unet = UNet2DConditionModel(sample_size=256, in_channels=12, out_channels=12, \
+            down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),\
+            up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'), cross_attention_dim=160,
+            block_out_channels=(12, 24, 30), norm_num_groups=6)

-        self.unet.to('cuda')
+        self.unet.to(device='cuda', dtype=torch.float16)

     def load_dictionaries_from_dataset(self):
         dataset = load_dataset('frutiemax/rct_dataset')
@@ -118,6 +120,8 @@ class RCTDiffusionPipeline(DiffusionPipeline):

             offset += len(self.color2_dict.items())
             class_labels[batch_index, offset:offset + len(self.color3_dict)] = torch.from_numpy(colors3[batch_index])
+
+        class_labels = torch.reshape(class_labels, (num_images, 1, self.get_class_labels_size()))
         return class_labels

     def __call__(self, object_description : list[list[tuple[str, float]]], color1 : list[list[tuple[str, float]]], \
@@ -164,29 +168,36 @@ class RCTDiffusionPipeline(DiffusionPipeline):

         # now put those weights into a tensor
         class_labels = self.pack_labels_to_tensor(batch_size, object_descriptions, colors1, colors2, colors3)
-
+
+        # we need those class labels for the 12 channels
+        #new_class_labels = torch.Tensor(size=(batch_size, 12, self.get_class_labels_size()))
+        #new_class_labels[:, :] = class_labels
+        #class_labels = new_class_labels.to(device='cuda', dtype=torch.float16)
+        #del new_class_labels

         # set the inference steps
         self.scheduler.set_timesteps(num_inference_steps)

-        noise_batches = torch.Tensor(size=(batch_size, 4, 3, 256, 256)).to('cuda')
+        noise_batches = torch.Tensor(size=(batch_size, 4, 3, 256, 256)).to(dtype=torch.float16, device='cuda')
         for batch_index in range(batch_size):
             for view_index in range(4):
-                noise = torch.randn(3, 256, 256).to('cuda')
+                noise = torch.randn(3, 256, 256).to(dtype=torch.float16, device='cuda')
                 noise_batches[batch_index, view_index] = noise

         # reshape the data so it's (batch_size, 12, 256, 256)
-        noise_batches = torch.reshape(noise_batches, (batch_size, 12, 256, 256)).to('cuda')
+        noise_batches = torch.reshape(noise_batches, (batch_size, 1, 12, 256, 256)).to(dtype=torch.float16, device='cuda')

         # now call the model for the n interations
         progress_bar = tqdm(total=num_inference_steps)
         epoch = 0
         for t in self.scheduler.timesteps:
             progress_bar.set_description(f'Inference step {epoch}')
-
-
-
-
+
+            for batch_index in range(batch_size):
+                with torch.no_grad():
+                    noise_residual = self.unet(noise_batches[batch_index], t, encoder_hidden_states=class_labels).sample
+                    previous_noisy_sample = self.scheduler.step(noise_residual, t, noise_batches[batch_index]).prev_sample
+                    noise_batches[batch_index] = previous_noisy_sample
             progress_bar.update(1)
             epoch = epoch + 1

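For reference, the denoising loop added to __call__ follows the standard diffusers pattern: a UNet2DConditionModel predicts the noise residual from the current sample and the class labels passed as encoder_hidden_states, and DDPMScheduler.step walks the sample back one timestep. Below is a minimal, self-contained sketch of that pattern; the sizes (64x64 samples, block_out_channels=(32, 64, 64), a single conditioning token of width 160) and the random conditioning tensor are illustrative assumptions, not the repository's configuration.

    import torch
    from diffusers import DDPMScheduler, UNet2DConditionModel

    # Toy conditional UNet: 12-channel samples (4 views x RGB), cross-attention
    # conditioning on a single 160-wide embedding per image.
    unet = UNet2DConditionModel(
        sample_size=64,
        in_channels=12,
        out_channels=12,
        down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),
        up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'),
        block_out_channels=(32, 64, 64),
        cross_attention_dim=160,
        norm_num_groups=8,
    )
    scheduler = DDPMScheduler()

    batch_size = 1
    # encoder_hidden_states must be (batch, sequence_length, cross_attention_dim);
    # the pipeline reshapes its packed class labels to (num_images, 1, size) for the same reason.
    class_labels = torch.randn(batch_size, 1, 160)

    sample = torch.randn(batch_size, 12, 64, 64)
    scheduler.set_timesteps(10)

    for t in scheduler.timesteps:
        with torch.no_grad():
            noise_residual = unet(sample, t, encoder_hidden_states=class_labels).sample
        sample = scheduler.step(noise_residual, t, sample).prev_sample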
test_pipeline.py
CHANGED
@@ -1,4 +1,5 @@
 from rct_diffusion_pipeline import RCTDiffusionPipeline
+from diffusers import UNet2DConditionModel


 torch_device = "cuda"
@@ -6,4 +7,83 @@ torch_device = "cuda"
 pipeline = RCTDiffusionPipeline()
 pipeline.print_class_tokens_to_csv()
 output = pipeline([[('aleppo pine tree', 1.0)]], [[('dark green', 1.0)]])
+
+# from PIL import Image
+# import torch
+# from transformers import CLIPTextModel, CLIPTokenizer
+# from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
+
+# vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae", use_safetensors=True)
+# tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")
+# text_encoder = CLIPTextModel.from_pretrained(
+#     "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
+# )
+# unet = UNet2DConditionModel.from_pretrained(
+#     "CompVis/stable-diffusion-v1-4", subfolder="unet", use_safetensors=True
+# )
+
+# from diffusers import UniPCMultistepScheduler
+
+# scheduler = UniPCMultistepScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
+# torch_device = "cuda"
+# vae.to(torch_device)
+# text_encoder.to(torch_device)
+# unet.to(torch_device)
+
+# prompt = ["a photograph of an astronaut riding a horse"]
+# height = 512  # default height of Stable Diffusion
+# width = 512  # default width of Stable Diffusion
+# num_inference_steps = 25  # Number of denoising steps
+# guidance_scale = 7.5  # Scale for classifier-free guidance
+# generator = torch.manual_seed(0)  # Seed generator to create the inital latent noise
+# batch_size = len(prompt)
+
+# text_input = tokenizer(
+#     prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
+# )
+
+# with torch.no_grad():
+#     text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
+
+# text_input = tokenizer(
+#     prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
+# )
+
+# with torch.no_grad():
+#     text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
+
+# max_length = text_input.input_ids.shape[-1]
+# uncond_input = tokenizer([""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt")
+# uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
+
+# text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+# latents = torch.randn(
+#     (batch_size, unet.in_channels, height // 8, width // 8),
+#     generator=generator,
+# )
+# latents = latents.to(torch_device)
+
+# latents = latents * scheduler.init_noise_sigma
+
+# from tqdm.auto import tqdm
+
+# scheduler.set_timesteps(num_inference_steps)
+
+# for t in tqdm(scheduler.timesteps):
+#     # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+#     latent_model_input = torch.cat([latents] * 2)
+
+#     latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
+
+#     # predict the noise residual
+#     with torch.no_grad():
+#         noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+#     # perform guidance
+#     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+#     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+#     # compute the previous noisy sample x_t -> x_t-1
+#     latents = scheduler.step(noise_pred, t, latents).prev_sample
 print('test')
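The call pipeline([[('aleppo pine tree', 1.0)]], [[('dark green', 1.0)]]) passes weighted (token, weight) pairs; pack_labels_to_tensor writes those weights into one flat class-label vector (one contiguous slot range per dictionary) and, after this commit, reshapes it to (num_images, 1, get_class_labels_size()) so it can serve as encoder_hidden_states. The sketch below illustrates that packing idea with made-up dictionaries and a hypothetical helper; it is not the repository's implementation.

    import torch

    # Hypothetical dictionaries standing in for the object-description and colour
    # dictionaries the pipeline loads from the frutiemax/rct_dataset dataset.
    object_dict = {'aleppo pine tree': 0, 'oak tree': 1}
    color_dict = {'dark green': 0, 'brown': 1}

    def pack_labels(object_descriptions, colors1):
        # One row per image; each dictionary owns a contiguous block of slots and
        # every (token, weight) pair writes its weight at the token's index.
        size = len(object_dict) + len(color_dict)
        labels = torch.zeros(len(object_descriptions), size)
        for i, (desc, col) in enumerate(zip(object_descriptions, colors1)):
            for token, weight in desc:
                labels[i, object_dict[token]] = weight
            offset = len(object_dict)
            for token, weight in col:
                labels[i, offset + color_dict[token]] = weight
        # Add a sequence dimension: (batch, sequence_length=1, cross_attention_dim).
        return torch.reshape(labels, (len(object_descriptions), 1, size))

    class_labels = pack_labels([[('aleppo pine tree', 1.0)]], [[('dark green', 1.0)]])
    print(class_labels.shape)  # torch.Size([1, 1, 4])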
train_model.py
CHANGED
@@ -105,13 +105,13 @@ def train_model(batch_size=4, epochs=100, save_model_interval=10, start_learning
         for batch_index in range(0, num_images, batch_size):
             progress_bar.set_description(f'epoch={epoch}, batch_index={batch_index}')
             batch_end = np.minimum(num_images, batch_index + batch_size)
-            clean_images = targets[batch_index:batch_end].to('cuda')
-
+            clean_images = targets[batch_index:batch_end].to(device='cuda', dtype=torch.float16)
+            clean_images = torch.reshape(clean_images, (batch_size, 12, 256, 256))

             noise = torch.randn(clean_images.shape).to('cuda')
             timesteps = torch.randint(0, model.scheduler.config.num_train_timesteps, (batch_size, )).to('cuda')
-            noisy_images = model.scheduler.add_noise(clean_images, noise, timesteps)
-            noise_pred = model.unet(noisy_images, timesteps,
+            noisy_images = model.scheduler.add_noise(clean_images, noise, timesteps).to(device='cuda', dtype=torch.float16)
+            noise_pred = model.unet(noisy_images, timesteps, class_labels[batch_index:batch_end].to(device='cuda',dtype=torch.float16), return_dict=False)[0]
             loss = F.mse_loss(noise_pred, noise)
             loss.backward()

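The updated training loop is the usual conditional DDPM objective: noise the clean targets to a random timestep with scheduler.add_noise, ask the UNet for the noise residual given the class labels, and minimise the MSE against the true noise. A minimal sketch of one such step follows; the model sizes, dummy tensors, and the AdamW optimiser are placeholder assumptions rather than the repository's actual configuration.

    import torch
    import torch.nn.functional as F
    from diffusers import DDPMScheduler, UNet2DConditionModel

    # Toy model reusing the conditional-UNet layout from the pipeline sketch above.
    unet = UNet2DConditionModel(
        sample_size=64, in_channels=12, out_channels=12,
        down_block_types=('CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'),
        up_block_types=('UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'),
        block_out_channels=(32, 64, 64), cross_attention_dim=160, norm_num_groups=8,
    )
    scheduler = DDPMScheduler()
    optimizer = torch.optim.AdamW(unet.parameters(), lr=1e-4)

    batch_size = 2
    clean_images = torch.randn(batch_size, 12, 64, 64)   # stand-in for the 4-view training targets
    class_labels = torch.randn(batch_size, 1, 160)       # stand-in for the packed label vectors

    # One training step: noise the targets to random timesteps and regress the noise.
    noise = torch.randn_like(clean_images)
    timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (batch_size,))
    noisy_images = scheduler.add_noise(clean_images, noise, timesteps)

    noise_pred = unet(noisy_images, timesteps, encoder_hidden_states=class_labels).sample
    loss = F.mse_loss(noise_pred, noise)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()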