frutiemax committed on
Commit 9bde8da
1 Parent(s): 42f8b67

Use the VAE for encoding and decoding during training

Files changed (3):
  1. rct_diffusion_pipeline.py +21 -10
  2. test_pipeline.py +4 -2
  3. train_model.py +29 -16
rct_diffusion_pipeline.py CHANGED
@@ -172,8 +172,12 @@ class RCTDiffusionPipeline(DiffusionPipeline):
 
     def generate_noise_batches(self, batch_size):
         noise_batches = torch.Tensor(size=(batch_size, self.num_channels, self.latent_size, self.latent_size)).to(dtype=torch.float16, device='cuda')
+        seed = int(0)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
         for batch_index in range(batch_size):
-            noise = torch.randn(self.num_channels, self.latent_size, self.latent_size).to(dtype=torch.float16, device='cuda')
+            noise = torch.randn((self.num_channels, self.latent_size, self.latent_size)).to(dtype=torch.float16, device='cuda')
             noise_batches[batch_index] = noise
 
         return torch.reshape(noise_batches, (batch_size, self.num_channels, self.latent_size, self.latent_size)).to(dtype=torch.float16, device='cuda')
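Note what this hunk does: it reseeds NumPy and both torch RNGs with a fixed seed of 0 on every call, so the pipeline always starts from identical noise and the `generator` argument of `__call__` goes unused. If determinism should be opt-in instead, a minimal sketch (assuming the caller passes a CPU `torch.Generator`; not part of this commit) is:

    def generate_noise_batches(self, batch_size, generator=None):
        # Sample the whole batch in one randn call; the optional generator
        # controls reproducibility without touching global RNG state.
        noise = torch.randn(
            (batch_size, self.num_channels, self.latent_size, self.latent_size),
            generator=generator)
        return noise.to(dtype=torch.float16, device='cuda')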
@@ -237,7 +241,11 @@ class RCTDiffusionPipeline(DiffusionPipeline):
 
     def __call__(self, object_description : list[str], color1 : list[str], \
                  color2 : list[str] = None, color3 : list[str] = None, \
-                 batch_size=1, num_inference_steps=20, generator=torch.manual_seed(torch.random.seed())):
+                 batch_size=1, num_inference_steps=100, generator=torch.manual_seed(torch.random.seed())):
+
+        self.unet.to(device='cuda', dtype=torch.float16)
+        self.vae.to(device='cuda', dtype=torch.float16)
+        self.text_encoder.to(device='cuda', dtype=torch.float16)
 
         res, object_description, color1, color2, color3 = self.validate_inputs(object_description, color1, color2, color3, batch_size)
         if res == False:
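A caveat on the signature above: the default `generator=torch.manual_seed(torch.random.seed())` is evaluated once, at function-definition time, so every call shares that one generator object. A common pattern (a sketch, not the commit's code) is to default to `None` and build the generator inside the call:

    def __call__(self, object_description, color1, color2=None, color3=None,
                 batch_size=1, num_inference_steps=100, generator=None):
        if generator is None:
            # Fresh generator per call unless the caller supplies one.
            generator = torch.Generator().manual_seed(torch.seed())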
@@ -268,20 +276,23 @@ class RCTDiffusionPipeline(DiffusionPipeline):
         noise_batches = torch.reshape(noise_batches, (batch_size, self.num_channels, self.latent_size, self.latent_size))
         noise_batches = noise_batches.to('cuda')
         images = torch.Tensor(size=(batch_size, 3, self.sample_size, self.sample_size)).to('cuda')
+        images = noise_batches[:, :3]
 
-        with torch.no_grad():
-            image = noise_batches
-            result = self.vae.decode(image).sample
-            images = result
+        #with torch.no_grad():
+        #    image = noise_batches
+        #    result = self.vae.decode(image).sample
+        #    images = result
 
         # convert those tensors to PIL images
+        tensor_to_pil = T.ToPILImage()
         output_images = []
         for batch_index in range(batch_size):
             image = images[batch_index]
-            image = (image / 2 + 0.5).clamp(0, 1).squeeze()
-            image = (image.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
-            image = (image * 255).round().astype("uint8")
-            image = Image.fromarray(image)
+            image = (image / 2 + 0.5).clamp(0, 1)
+            #image = (image.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
+            #image = (image * 255).round().astype("uint8")
+            #image = Image.fromarray(image)
+            image = tensor_to_pil(image)
             image.save(f'test{batch_index}.png')
             output_images.append(image)
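With the `vae.decode` block commented out, `images = noise_batches[:, :3]` previews the first three latent channels directly, at `latent_size` resolution rather than the `sample_size` the preallocated tensor advertises. If the decode is re-enabled, latents conventionally need un-scaling first; a hedged sketch assuming the Stable Diffusion scaling convention (`scaling_factor` of 0.18215 for sd-vae-ft-mse):

    with torch.no_grad():
        latents = noise_batches / self.vae.config.scaling_factor  # assumed SD convention
        images = self.vae.decode(latents).sample  # (B, 3, sample_size, sample_size) in [-1, 1]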
 
 
test_pipeline.py CHANGED
@@ -38,7 +38,9 @@ scheduler = DDPMScheduler(num_train_timesteps=20)
 vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", use_safetensors=True)
 vae = vae.to('cuda', dtype=torch.float16)
 
-pipeline = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
-output = pipeline(['aleppo pine tree'], ['dark green'])
+#pipeline = RCTDiffusionPipeline(unet, scheduler, vae, tokenizer, text_encoder)
+pipeline = RCTDiffusionPipeline.from_pretrained('rct_foliage_999')
+output = pipeline(['pagoda pine tree'], ['green'], ['grey'])
+output[0].save('out.png')
 pipeline.save_pretrained('test')
 print('test')
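The test now round-trips a trained checkpoint instead of constructing a fresh pipeline: `rct_foliage_999` matches the `f'rct_foliage_{epoch}'` naming used by `save_and_test` in train_model.py, i.e. the save from epoch 999 of a 1000-epoch run.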
train_model.py CHANGED
@@ -6,6 +6,7 @@ import numpy as np
 from rct_diffusion_pipeline import RCTDiffusionPipeline
 import torch
 import torchvision.transforms as T
+import torchvision
 import torch.nn.functional as F
 from diffusers.optimization import get_cosine_schedule_with_warmup
 from tqdm.auto import tqdm
@@ -29,16 +30,21 @@ def save_and_test(pipeline, epoch):
     model_file = f'rct_foliage_{epoch}'
     pipeline.save_pretrained(model_file)
 
-def convert_images(dataset):
-    preprocess = transforms.Compose(
-        [
-            transforms.Resize((LATENT_SIZE, LATENT_SIZE)),
-            transforms.ToTensor(),
-            transforms.Normalize([0.5], [0.5]),
-        ]
-    )
+def transform_images(image):
+    res = torch.Tensor(SAMPLE_NUM_CHANNELS, SAMPLE_SIZE, SAMPLE_SIZE)
+    pil_to_tensor = T.PILToTensor()
+
+    res_index = 0
+    scale_factor = np.minimum(SAMPLE_SIZE / image.width, SAMPLE_SIZE / image.height)
+    image = Image.resize(image, size=(int(scale_factor * image.width), int(scale_factor * image.height)), resample=Resampling.NEAREST)
 
-    images = [preprocess(image.convert("RGBA")) for image in dataset["image"]]
+    new_image = PIL.Image.new('RGB', (SAMPLE_SIZE, SAMPLE_SIZE))
+    new_image.paste(image, box=(int((SAMPLE_SIZE - image.width)/2), int((SAMPLE_SIZE - image.height)/2)))
+    res = pil_to_tensor(new_image)
+    return res
+
+def convert_images(dataset):
+    images = [transform_images(image) for image in dataset["image"]]
     object_descriptions = [obj_desc for obj_desc in dataset["object_description"]]
     colors1 = [color1 for color1 in dataset['color1']]
     colors2 = [color1 for color1 in dataset['color2']]
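As committed, `transform_images` has a few rough edges: `Image.resize` is invoked as a module-level function (Pillow exposes `resize` as a method on image instances), `Resampling` is referenced without a visible import, and `res_index` plus the new `torchvision` import are unused. A hedged sketch of the same letterbox transform, assuming `from PIL import Image` and Pillow >= 9.1:

    def transform_images(image):
        # Fit the sprite inside a SAMPLE_SIZE square while preserving aspect ratio.
        scale = min(SAMPLE_SIZE / image.width, SAMPLE_SIZE / image.height)
        image = image.resize((int(scale * image.width), int(scale * image.height)),
                             resample=Image.Resampling.NEAREST)

        # Center it on a black canvas and convert to a CHW uint8 tensor.
        canvas = Image.new('RGB', (SAMPLE_SIZE, SAMPLE_SIZE))
        canvas.paste(image, box=((SAMPLE_SIZE - image.width) // 2,
                                 (SAMPLE_SIZE - image.height) // 2))
        return T.PILToTensor()(canvas)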
@@ -87,10 +93,10 @@ def create_embeddings(dataset, model):
     return model.test_generate_embeddings(object_descriptions, colors1, colors2, colors3)
 
 
-def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timesteps=20, save_model_interval=10, start_learning_rate=1e-3, lr_warmup_steps=1):
+def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timesteps=100, save_model_interval=10, start_learning_rate=1e-4, lr_warmup_steps=500):
     dataset = load_dataset('frutiemax/rct_dataset', split=f'train[0:{total_images}]')
     dataset.set_transform(convert_images)
-    num_images = int(dataset.num_rows / 4) if total_images == None else int(total_images / 4)
+    num_images = dataset.num_rows
 
     unet = UNet2DConditionModel(sample_size=LATENT_SIZE, in_channels=LATENT_NUM_CHANNELS, out_channels=LATENT_NUM_CHANNELS, \
         down_block_types=("CrossAttnDownBlock2D","CrossAttnDownBlock2D","CrossAttnDownBlock2D", "DownBlock2D"),\
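One side effect of the defaults above: with `total_images=-1`, the split string becomes `train[0:-1]`, which in the datasets slice syntax selects everything except the last example. An explicit guard (hypothetical, not in the commit) avoids the off-by-one:

    split = 'train' if total_images in (None, -1) else f'train[:{total_images}]'
    dataset = load_dataset('frutiemax/rct_dataset', split=split)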
@@ -109,7 +115,7 @@ def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timeste
         "CompVis/stable-diffusion-v1-4", subfolder="text_encoder", use_safetensors=True
     ).to('cuda')
     vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", use_safetensors=True)
-    vae = vae.to(dtype=torch.float16, device='cuda')
+    vae = vae.to(dtype=torch.float32, device='cuda')
 
     optimizer = torch.optim.AdamW(unet.parameters(), lr=start_learning_rate)
     lr_scheduler = get_cosine_schedule_with_warmup(
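Keeping the VAE in float32 for training sidesteps the overflow issues fp16 autoencoding is prone to; `__call__` still casts it to fp16 at inference time. Since only `unet.parameters()` are handed to the optimizer the VAE is never updated, but it still tracks gradients inside the training graph; a common extra step (assumed, not in this commit) is to freeze it explicitly:

    vae.requires_grad_(False)  # no grads accumulated in VAE weights
    vae.eval()                 # gradients still flow through to the UNet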
@@ -134,7 +140,7 @@ def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timeste
             clean_images = batch['image']
             batch_size = clean_images.size(0)
             embeddings = create_embeddings(batch, model)
-            clean_images = torch.reshape(clean_images, (batch['image'].size(0), LATENT_NUM_CHANNELS, LATENT_SIZE, LATENT_SIZE)).\
+            clean_images = torch.reshape(clean_images, (batch['image'].size(0), SAMPLE_NUM_CHANNELS, SAMPLE_SIZE, SAMPLE_SIZE)).\
                 to(device='cuda')
 
             noise = torch.randn(clean_images.shape, dtype=torch.float32, device='cuda')
@@ -146,9 +152,16 @@ def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timeste
             batch_embeddings = embeddings
             batch_embeddings = batch_embeddings.to('cuda')
 
+            # use the vae to get the latent images
+            latent_images = vae.encode(noisy_images).latent_dist.sample()
+
             optimizer.zero_grad()
-            unet_results = unet(noisy_images, timesteps, batch_embeddings).sample
-            loss = loss_fn(unet_results, noise)
+            unet_results = unet(latent_images, timesteps, batch_embeddings).sample
+
+            # get back the upscale result
+            noise_pred = vae.decode(unet_results).sample
+
+            loss = loss_fn(noise_pred, noise)
             loss.backward()
             optimizer.step()
             lr_scheduler.step()
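For context, this hunk keeps the loss in pixel space: the already-noised images are encoded, the UNet runs on those latents, the prediction is decoded, and the result is compared against pixel-space noise, with both VAE passes inside the backward graph. The conventional latent-diffusion recipe instead encodes the clean images once, adds noise in latent space, and computes the loss on latent noise with no decode in the loop. A hedged sketch of that recipe, reusing this file's names where they exist (`noise_scheduler` and the scaling factor are assumptions):

    with torch.no_grad():
        # Encode clean images once; the VAE stays frozen.
        latents = vae.encode(clean_images).latent_dist.sample()
        latents = latents * vae.config.scaling_factor  # SD convention, assumed

    noise = torch.randn_like(latents)
    noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

    optimizer.zero_grad()
    noise_pred = unet(noisy_latents, timesteps, batch_embeddings).sample
    loss = loss_fn(noise_pred, noise)  # latent-space loss; no decode needed
    loss.backward()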
@@ -171,4 +184,4 @@ def train_model(batch_size=4, total_images=-1, epochs=100, scheduler_num_timeste
 
 
 if __name__ == '__main__':
-    train_model(1, total_images=4, save_model_interval=100, epochs=1000)
+    train_model(1, save_model_interval=10, epochs=100)
 