imsuperkong committed
Commit d3bdeec • 1 parent: dc47947
Upload 6 files
- requirements.txt +7 -0
- sd/core.py +435 -0
- sd/dift_sd.py +240 -0
- sd/gradio_utils.py +85 -0
- sd/pnp_utils.py +569 -0
- weights/dpt_beit_large_512.pt +3 -0
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch==2.0.1
torchvision
timm==0.6.12
gradio==3.40.1
diffusers==0.17.1
numpy==1.20.3
wget
sd/core.py
ADDED
@@ -0,0 +1,435 @@
import torch
import numpy as np
import torch.nn.functional as F
from diffusers import StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from typing import Any, Callable, Dict, List, Optional, Union
from sd.pnp_utils import register_time, register_attention_control_efficient_kv_w_mask, register_conv_control_efficient_w_mask
import torch.nn as nn
from sd.dift_sd import MyUNet2DConditionModel, OneStepSDPipeline
import ipdb
from tqdm import tqdm
from lib.midas import MiDas


class DDIMBackward(StableDiffusionPipeline):
    def __init__(
        self, vae, text_encoder, tokenizer, unet, scheduler,
        safety_checker, feature_extractor,
        requires_safety_checker: bool = True,
        device='cuda', model_id='ckpt/stable-diffusion-2-1-base', depth_model='dpt_swin2_large_384'
    ):
        super().__init__(
            vae, text_encoder, tokenizer, unet, scheduler,
            safety_checker, feature_extractor, requires_safety_checker,
        )

        self.dift_unet = MyUNet2DConditionModel.from_pretrained(model_id, subfolder="unet", torch_dtype=torch.float16 if 'cuda' in device else torch.float32)
        self.onestep_pipe = OneStepSDPipeline.from_pretrained(model_id, unet=self.dift_unet, safety_checker=None, torch_dtype=torch.float16 if 'cuda' in device else torch.float32)
        self.onestep_pipe = self.onestep_pipe.to(device)

        if 'cuda' in device:
            self.onestep_pipe.enable_attention_slicing()
            self.onestep_pipe.enable_xformers_memory_efficient_attention()
        self.ensemble_size = 4
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)

        self.midas_model = MiDas(device, model_type=depth_model)

        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        t_start=None,
    ):
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        do_classifier_free_guidance = guidance_scale > 1.0
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if t_start and t >= t_start:
                    progress_bar.update()
                    continue

                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        if output_type == "latent":
            image = latents
            has_nsfw_concept = None
        elif output_type == "pil":
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
            image = self.numpy_to_pil(image)
        else:
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    def denoise_w_injection(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        t_start=None,
        attn=0.8,
        f=0.5,
        latent_mask=None,
        guidance_loss_scale=0,
        cfg_decay=False,
        cfg_norm=False,
        lr=1.0,
        up_ft_indexes=[1, 2],
        img_tensor=None,
        early_stop=50,
        intrinsic=None, extrinsic=None, threshold=20, depth=None,
    ):
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        do_classifier_free_guidance = guidance_scale > 1.0
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat((prompt_embeds[1:], prompt_embeds[1:], prompt_embeds[:1]), dim=0)
        else:
            prompt_embeds = torch.cat([prompt_embeds] * 2, dim=0)

        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        kv_injection_timesteps = self.scheduler.timesteps[:int(len(self.scheduler.timesteps) * attn)]
        f_injection_timesteps = self.scheduler.timesteps[:int(len(self.scheduler.timesteps) * f)]
        register_attention_control_efficient_kv_w_mask(self, kv_injection_timesteps, mask=latent_mask, do_classifier_free_guidance=do_classifier_free_guidance)
        register_conv_control_efficient_w_mask(self, f_injection_timesteps, mask=latent_mask)
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if t_start and t >= t_start:
                    progress_bar.update()
                    continue
                if i > early_stop: guidance_loss_scale = 0  # Early stop (optional)
                # if t > 300: guidance_loss_scale = 0 # Early stop (optional)
                register_time(self, t.item())
                # Set requires grad
                if guidance_loss_scale != 0:
                    latents = latents.detach().requires_grad_()

                # expand the latents if we are doing classifier free guidance
                latent_model_input = latents  # latents: ori_z + wrap_z
                if do_classifier_free_guidance:
                    latent_model_input = torch.cat([latent_model_input, latent_model_input[1:]], dim=0)
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                if guidance_loss_scale != 0:
                    with torch.no_grad():
                        noise_pred = self.unet(
                            latent_model_input,
                            t,
                            encoder_hidden_states=prompt_embeds,
                            cross_attention_kwargs=cross_attention_kwargs,
                        ).sample
                else:
                    with torch.no_grad():
                        noise_pred = self.unet(
                            latent_model_input,
                            t,
                            encoder_hidden_states=prompt_embeds,
                            cross_attention_kwargs=cross_attention_kwargs,
                        ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    cfg_scale = guidance_scale
                    if cfg_decay: cfg_scale = 1 + guidance_scale * (1 - i / num_inference_steps)
                    noise_pred_text, wrap_noise_pred_text, wrap_noise_pred_uncond = noise_pred.chunk(3)
                    noise_pred = wrap_noise_pred_text + cfg_scale * (wrap_noise_pred_text - wrap_noise_pred_uncond)
                else:
                    noise_pred_text, wrap_noise_pred_text = noise_pred.chunk(3)
                    noise_pred = wrap_noise_pred_text

                if cfg_norm:
                    noise_pred = noise_pred * (torch.linalg.norm(wrap_noise_pred_uncond) / torch.linalg.norm(noise_pred))

                if guidance_loss_scale != 0:
                    for up_ft_index in up_ft_indexes:

                        alpha_prod_t = self.scheduler.alphas_cumprod[t]
                        alpha_prod_t_prev = (
                            self.scheduler.alphas_cumprod[timesteps[i - 0]]
                            if i > 0 else self.scheduler.final_alpha_cumprod
                        )

                        mu = alpha_prod_t ** 0.5
                        mu_prev = alpha_prod_t_prev ** 0.5
                        sigma = (1 - alpha_prod_t) ** 0.5
                        sigma_prev = (1 - alpha_prod_t_prev) ** 0.5

                        pred_x0 = (latents - sigma_prev * noise_pred[:latents.shape[0]]) / mu_prev

                        unet_ft_all = self.onestep_pipe(
                            latents=pred_x0[:1].repeat(self.ensemble_size, 1, 1, 1),
                            t=t,
                            up_ft_indices=[up_ft_index],
                            prompt_embeds=prompt_embeds[:1].repeat(self.ensemble_size, 1, 1)
                        )
                        unet_ft1 = unet_ft_all['up_ft'][up_ft_index].mean(0, keepdim=True)  # 1,c,h,w
                        unet_ft1_norm = unet_ft1 / torch.norm(unet_ft1, dim=1, keepdim=True)

                        unet_ft1_norm = self.midas_model.wrap_img_tensor_w_fft_ext(
                            unet_ft1_norm.to(self.torch_dtype),
                            torch.from_numpy(depth).to(device).to(self.torch_dtype),
                            intrinsic,
                            extrinsic[:3, :3], extrinsic[:3, 3], threshold=threshold).to(self.torch_dtype)

                        unet_ft_all = self.onestep_pipe(
                            latents=pred_x0[1:2].repeat(self.ensemble_size, 1, 1, 1),
                            t=t,
                            up_ft_indices=[up_ft_index],
                            prompt_embeds=prompt_embeds[:1].repeat(self.ensemble_size, 1, 1)
                        )
                        unet_ft2 = unet_ft_all['up_ft'][up_ft_index].mean(0, keepdim=True)  # 1,c,h,w
                        unet_ft2_norm = unet_ft2 / torch.norm(unet_ft2, dim=1, keepdim=True)
                        c = unet_ft2.shape[1]
                        loss = (-self.cos(unet_ft1_norm.squeeze().view(c, -1).T, unet_ft2_norm.squeeze().view(c, -1).T).mean() + 1) / 2.
                        # Get gradient
                        cond_grad = torch.autograd.grad(loss * guidance_loss_scale, latents)[0][1:2]

                    # compute the previous noisy sample x_t -> x_t-1
                    noise_pred_ = noise_pred - sigma_prev * cond_grad * lr
                    noise_pred_ = torch.cat([noise_pred_text, noise_pred_], dim=0)

                # compute the previous noisy sample x_t -> x_t-1
                with torch.no_grad():
                    latents = self.scheduler.step(noise_pred_, t, latents, **extra_step_kwargs).prev_sample
                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        if output_type == "latent":
            image = latents
            has_nsfw_concept = None
        elif output_type == "pil":
            with torch.no_grad():
                image = self.decode_latents(latents)
                image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
                image = self.numpy_to_pil(image)
        else:
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    @torch.no_grad()
    def decoder(self, latents):
        with torch.autocast(device_type=self.device, dtype=torch.float32):
            latents = 1 / 0.18215 * latents
            imgs = self.vae.decode(latents).sample
            imgs = (imgs / 2 + 0.5).clamp(0, 1)
        return imgs

    def ddim_inversion_w_grad(self, latent, cond, stop_t, guidance_loss_scale=1.0, lr=1.0):
        timesteps = reversed(self.scheduler.timesteps)
        with torch.autocast(device_type=self.device, dtype=torch.float32):

            for i, t in enumerate(tqdm(timesteps)):
                if t >= stop_t:
                    break

                if guidance_loss_scale != 0:
                    latent = latent.detach().requires_grad_()
                cond_batch = cond.repeat(latent.shape[0], 1, 1)

                alpha_prod_t = self.scheduler.alphas_cumprod[t]
                alpha_prod_t_prev = (
                    self.scheduler.alphas_cumprod[timesteps[i - 1]]
                    if i > 0 else self.scheduler.final_alpha_cumprod
                )

                mu = alpha_prod_t ** 0.5
                mu_prev = alpha_prod_t_prev ** 0.5
                sigma = (1 - alpha_prod_t) ** 0.5
                sigma_prev = (1 - alpha_prod_t_prev) ** 0.5

                eps = self.onestep_pipe.unet(latent, t, encoder_hidden_states=cond_batch, up_ft_indices=[3], output_eps=True)['eps']
                pred_x0 = (latent - sigma_prev * eps) / mu_prev

                unet_ft_all = self.onestep_pipe(
                    latents=pred_x0[:1].repeat(self.ensemble_size, 1, 1, 1),
                    t=t,
                    up_ft_indices=[1],
                    prompt_embeds=cond_batch[:1].repeat(self.ensemble_size, 1, 1)
                )
                unet_ft1 = unet_ft_all['up_ft'][1].mean(0, keepdim=True)  # 1,c,h,w
                unet_ft1_norm = unet_ft1 / torch.norm(unet_ft1, dim=1, keepdim=True)

                unet_ft_all = self.onestep_pipe(
                    latents=pred_x0[1:2].repeat(self.ensemble_size, 1, 1, 1),
                    t=t,
                    up_ft_indices=[1],
                    prompt_embeds=cond_batch[:1].repeat(self.ensemble_size, 1, 1)
                )
                unet_ft2 = unet_ft_all['up_ft'][1].mean(0, keepdim=True)  # 1,c,h,w
                unet_ft2_norm = unet_ft2 / torch.norm(unet_ft2, dim=1, keepdim=True)
                c = unet_ft2.shape[1]
                loss = (-self.cos(unet_ft1_norm.squeeze().view(c, -1).T.detach(), unet_ft2_norm.squeeze().view(c, -1).T).mean() + 1) / 2.
                print(f'loss: {loss.item()}')
                # Get gradient
                cond_grad = torch.autograd.grad(loss * guidance_loss_scale, latent)[0]

                # latent = latent.detach() - cond_grad * lr
                latent = mu * pred_x0 + sigma * eps - cond_grad * lr

        return latent


@torch.no_grad()
def DDPM_forward(x_t_dot, t_start, delta_t, ddpm_scheduler, generator):
    # just simple implementation, this should have an analytical expression
    # TODO: implementation analytical form
    for delta in range(1, delta_t):
        # noise = torch.randn_like(x_t_dot, generator=generator)
        noise = torch.empty_like(x_t_dot).normal_(generator=generator)

        beta = ddpm_scheduler.betas[t_start + delta]
        std_ = beta ** 0.5
        mu_ = ((1 - beta) ** 0.5) * x_t_dot
        x_t_dot = mu_ + std_ * noise
    return x_t_dot
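A minimal usage sketch may help make the two entry points above concrete. It is not part of this commit: `pipe` is assumed to be an already-constructed DDIMBackward instance, and `init_latents`, `mask`, `K`, `E`, and `depth` stand in for the stacked original/warped latents, the latent-space mask, the camera intrinsics/extrinsics, and the depth map (a NumPy array, since the code calls torch.from_numpy on it) that the Space prepares elsewhere.

# Hypothetical usage sketch (not part of this commit).
# Assumes `pipe` is a DDIMBackward instance and that init_latents stacks the
# original and warped latents along the batch dimension, as the comment
# "latents: ori_z + wrap_z" in denoise_w_injection suggests.
import torch

generator = torch.Generator('cuda').manual_seed(0)

# Plain DDIM sampling, identical in spirit to StableDiffusionPipeline.__call__.
image = pipe(prompt='a photo of a living room',
             num_inference_steps=50,
             guidance_scale=7.5,
             generator=generator).images[0]

# Denoising with K/V + conv-feature injection and DIFT-feature guidance.
# attn and f set the fraction of timesteps that inject attention K/V and conv
# features; guidance_loss_scale weights the cosine loss between the DIFT
# features of the original and warped branches.
result = pipe.denoise_w_injection(
    prompt='a photo of a living room',
    latents=init_latents,        # shape [2, 4, h, w]: original + warped
    latent_mask=mask,
    attn=0.8, f=0.5,
    guidance_loss_scale=1.0,
    intrinsic=K, extrinsic=E,    # E: 4x4 (or 3x4) camera pose, sliced as E[:3, :3], E[:3, 3]
    depth=depth,                 # NumPy depth map
    num_inference_steps=50,
)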
sd/dift_sd.py
ADDED
@@ -0,0 +1,240 @@
from diffusers import StableDiffusionPipeline
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
from typing import Any, Callable, Dict, List, Optional, Union
from diffusers.models.unet_2d_condition import UNet2DConditionModel
from diffusers import DDIMScheduler
import gc
from PIL import Image

class MyUNet2DConditionModel(UNet2DConditionModel):
    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        up_ft_indices,
        encoder_hidden_states: torch.Tensor,
        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        output_eps=False):
        r"""
        Args:
            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
        """
        # By default samples have to be AT least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            # logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # prepare attention_mask
        if attention_mask is not None:
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # 0. center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=self.dtype)

        emb = self.time_embedding(t_emb, timestep_cond)

        if self.class_embedding is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when num_class_embeds > 0")

            if self.config.class_embed_type == "timestep":
                class_labels = self.time_proj(class_labels)

            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
            emb = emb + class_emb

        # 2. pre-process
        sample = self.conv_in(sample)

        # 3. down
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

            down_block_res_samples += res_samples

        # 4. mid
        if self.mid_block is not None:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
            )

        # 5. up
        up_ft = {}
        for i, upsample_block in enumerate(self.up_blocks):

            if i > np.max(up_ft_indices):
                break

            is_final_block = i == len(self.up_blocks) - 1

            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
                )

            if i in up_ft_indices:
                up_ft[i] = sample

        output = {}
        output['up_ft'] = up_ft
        if output_eps:
            sample = self.conv_norm_out(sample)
            sample = self.conv_act(sample)
            sample = self.conv_out(sample)
            output['eps'] = sample
        return output

class OneStepSDPipeline(StableDiffusionPipeline):
    # @torch.no_grad()
    def __call__(
        self,
        t,
        up_ft_indices,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        img_tensor=None,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        latents=None
    ):
        device = self._execution_device
        if latents is None:
            latents = self.vae.encode(img_tensor).latent_dist.sample() * self.vae.config.scaling_factor
        t = torch.tensor(t.clone().detach(), dtype=torch.long, device=device)
        noise = torch.randn_like(latents).to(device)
        latents_noisy = self.scheduler.add_noise(latents, noise, t)
        unet_output = self.unet(latents_noisy,
                                t,
                                up_ft_indices,
                                encoder_hidden_states=prompt_embeds,
                                cross_attention_kwargs=cross_attention_kwargs)
        return unet_output


class SDFeaturizer:
    def __init__(self, sd_id='ckpt/stable-diffusion-2-1-base'):
        unet = MyUNet2DConditionModel.from_pretrained(sd_id, subfolder="unet")
        onestep_pipe = OneStepSDPipeline.from_pretrained(sd_id, unet=unet, safety_checker=None)
        onestep_pipe.vae.decoder = None
        onestep_pipe.scheduler = DDIMScheduler.from_pretrained(sd_id, subfolder="scheduler")
        gc.collect()
        onestep_pipe = onestep_pipe.to("cuda")
        onestep_pipe.enable_attention_slicing()
        onestep_pipe.enable_xformers_memory_efficient_attention()
        self.pipe = onestep_pipe

    @torch.no_grad()
    def forward(self,
                img_tensor,
                prompt,
                t=261,
                up_ft_index=1,
                ensemble_size=8):
        '''
        Args:
            img_tensor: should be a single torch tensor in the shape of [1, C, H, W] or [C, H, W]
            prompt: the prompt to use, a string
            t: the time step to use, should be an int in the range of [0, 1000]
            up_ft_index: which upsampling block of the U-Net to extract feature, you can choose [0, 1, 2, 3]
            ensemble_size: the number of repeated images used in the batch to extract features
        Return:
            unet_ft: a torch tensor in the shape of [1, c, h, w]
        '''
        img_tensor = img_tensor.repeat(ensemble_size, 1, 1, 1).cuda()  # ensem, c, h, w
        prompt_embeds = self.pipe._encode_prompt(
            prompt=prompt,
            device='cuda',
            num_images_per_prompt=1,
            do_classifier_free_guidance=False)  # [1, 77, dim]
        prompt_embeds = prompt_embeds.repeat(ensemble_size, 1, 1)
        unet_ft_all = self.pipe(
            img_tensor=img_tensor,
            t=t,
            up_ft_indices=[up_ft_index],
            prompt_embeds=prompt_embeds)
        unet_ft = unet_ft_all['up_ft'][up_ft_index]  # ensem, c, h, w
        unet_ft = unet_ft.mean(0, keepdim=True)  # 1,c,h,w
        return unet_ft
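For reference, a small sketch of how SDFeaturizer might be driven; it is not part of this commit. The image path and prompt are placeholders, the checkpoint path mirrors the default sd_id above, and t is passed as a tensor because OneStepSDPipeline.__call__ calls .clone() on it. It also assumes a CUDA machine with xformers installed, since the constructor enables it.

# Hypothetical usage sketch (not part of this commit).
import torch
from PIL import Image
from torchvision.transforms.functional import to_tensor
from sd.dift_sd import SDFeaturizer

featurizer = SDFeaturizer(sd_id='ckpt/stable-diffusion-2-1-base')

img = Image.open('example.png').convert('RGB').resize((768, 768))
img_tensor = to_tensor(img) * 2 - 1                       # [3, H, W], scaled to [-1, 1]

ft = featurizer.forward(img_tensor.unsqueeze(0),           # [1, 3, H, W]
                        prompt='a photo of a room',
                        t=torch.tensor(261),               # tensor, not int: __call__ calls t.clone()
                        up_ft_index=1,
                        ensemble_size=8)
print(ft.shape)                                            # [1, c, h, w]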
sd/gradio_utils.py
ADDED
@@ -0,0 +1,85 @@

import copy
import math
import os
import urllib.request
from typing import List, Optional, Tuple

import numpy as np
import PIL
import PIL.Image
import PIL.ImageDraw
import torch
import torch.optim
from tqdm import tqdm
import ipdb

def tensor_to_PIL(img: torch.Tensor) -> PIL.Image.Image:
    """
    Converts a tensor image to a PIL Image.

    Args:
        img (torch.Tensor): The tensor image of shape [batch_size, num_channels, height, width].

    Returns:
        A PIL Image object.
    """
    img = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)
    return PIL.Image.fromarray(img[0].cpu().numpy(), "RGB")


def get_ellipse_coords(
    point: Tuple[int, int], radius: int = 5
) -> Tuple[int, int, int, int]:
    """
    Returns the coordinates of an ellipse centered at the given point.

    Args:
        point (Tuple[int, int]): The center point of the ellipse.
        radius (int): The radius of the ellipse.

    Returns:
        A tuple containing the coordinates of the ellipse in the format (x_min, y_min, x_max, y_max).
    """
    center = point
    return (
        center[0] - radius,
        center[1] - radius,
        center[0] + radius,
        center[1] + radius,
    )


def draw_handle_target_points(
    img: PIL.Image.Image,
    # handle_points: List[Tuple[int, int]],
    target_points: List[Tuple[int, int]],
    radius: int = 5):
    """
    Draws handle and target points with arrow pointing towards the target point.

    Args:
        img (PIL.Image.Image): The image to draw on.
        handle_points (List[Tuple[int, int]]): A list of handle [x,y] points.
        target_points (List[Tuple[int, int]]): A list of target [x,y] points.
        radius (int): The radius of the handle and target points.
    """
    if not isinstance(img, PIL.Image.Image):
        img = PIL.Image.fromarray(img)

    # if len(handle_points) == len(target_points) + 1:
    #     target_points = copy.deepcopy(target_points) + [None]

    draw = PIL.ImageDraw.Draw(img)
    for handle_point, target_point in zip(target_points, target_points):
        # handle_point = [handle_point[1], handle_point[0]]
        # Draw the handle point
        # ipdb.set_trace()

        target_coords = get_ellipse_coords(target_point, radius)
        draw.ellipse((target_coords), fill="red")

    return np.array(img)
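A short sketch of how these helpers might be used inside a Gradio callback; it is not part of this commit and the point coordinates are illustrative.

# Hypothetical usage sketch (not part of this commit).
import PIL.Image
from sd.gradio_utils import draw_handle_target_points

img = PIL.Image.new('RGB', (512, 512), 'white')
points = [(100, 150), (300, 220)]                  # clicked (x, y) pixel coordinates

overlay = draw_handle_target_points(img, points, radius=5)    # returns an np.ndarray copy
PIL.Image.fromarray(overlay).save('overlay.png')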
sd/pnp_utils.py
ADDED
@@ -0,0 +1,569 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import numpy as np
|
5 |
+
import ipdb
|
6 |
+
import torch.nn.functional as F
|
7 |
+
|
8 |
+
def seed_everything(seed):
|
9 |
+
torch.manual_seed(seed)
|
10 |
+
torch.cuda.manual_seed(seed)
|
11 |
+
random.seed(seed)
|
12 |
+
np.random.seed(seed)
|
13 |
+
|
14 |
+
def register_time(model, t):
|
15 |
+
conv_module = model.unet.up_blocks[1].resnets[1]
|
16 |
+
setattr(conv_module, 't', t)
|
17 |
+
down_res_dict = {0: [0, 1], 1: [0, 1], 2: [0, 1]}
|
18 |
+
up_res_dict = {1: [0, 1, 2], 2: [0, 1, 2], 3: [0, 1, 2]}
|
19 |
+
for res in up_res_dict:
|
20 |
+
for block in up_res_dict[res]:
|
21 |
+
module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
|
22 |
+
setattr(module, 't', t)
|
23 |
+
for res in down_res_dict:
|
24 |
+
for block in down_res_dict[res]:
|
25 |
+
module = model.unet.down_blocks[res].attentions[block].transformer_blocks[0].attn1
|
26 |
+
setattr(module, 't', t)
|
27 |
+
module = model.unet.mid_block.attentions[0].transformer_blocks[0].attn1
|
28 |
+
setattr(module, 't', t)
|
29 |
+
|
30 |
+
|
31 |
+
def load_source_latents_t(t, latents_path):
|
32 |
+
latents_t_path = os.path.join(latents_path, f'noisy_latents_{t}.pt')
|
33 |
+
assert os.path.exists(latents_t_path), f'Missing latents at t {t} path {latents_t_path}'
|
34 |
+
latents = torch.load(latents_t_path)
|
35 |
+
return latents
|
36 |
+
|
37 |
+
def register_attention_control_efficient(model, injection_schedule):
|
38 |
+
def sa_forward(self):
|
39 |
+
to_out = self.to_out
|
40 |
+
if type(to_out) is torch.nn.modules.container.ModuleList:
|
41 |
+
to_out = self.to_out[0]
|
42 |
+
else:
|
43 |
+
to_out = self.to_out
|
44 |
+
|
45 |
+
def forward(x, encoder_hidden_states=None, attention_mask=None):
|
46 |
+
batch_size, sequence_length, dim = x.shape
|
47 |
+
h = self.heads
|
48 |
+
|
49 |
+
is_cross = encoder_hidden_states is not None
|
50 |
+
encoder_hidden_states = encoder_hidden_states if is_cross else x
|
51 |
+
if not is_cross and self.injection_schedule is not None and (
|
52 |
+
self.t in self.injection_schedule or self.t == 1000):
|
53 |
+
q = self.to_q(x)
|
54 |
+
k = self.to_k(encoder_hidden_states)
|
55 |
+
|
56 |
+
source_batch_size = int(q.shape[0] // 3)
|
57 |
+
# inject unconditional
|
58 |
+
q[source_batch_size:2 * source_batch_size] = q[:source_batch_size]
|
59 |
+
k[source_batch_size:2 * source_batch_size] = k[:source_batch_size]
|
60 |
+
# inject conditional
|
61 |
+
q[2 * source_batch_size:] = q[:source_batch_size]
|
62 |
+
k[2 * source_batch_size:] = k[:source_batch_size]
|
63 |
+
|
64 |
+
q = self.head_to_batch_dim(q)
|
65 |
+
k = self.head_to_batch_dim(k)
|
66 |
+
else:
|
67 |
+
q = self.to_q(x)
|
68 |
+
k = self.to_k(encoder_hidden_states)
|
69 |
+
q = self.head_to_batch_dim(q)
|
70 |
+
k = self.head_to_batch_dim(k)
|
71 |
+
|
72 |
+
v = self.to_v(encoder_hidden_states)
|
73 |
+
v = self.head_to_batch_dim(v)
|
74 |
+
|
75 |
+
sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
|
76 |
+
|
77 |
+
if attention_mask is not None:
|
78 |
+
attention_mask = attention_mask.reshape(batch_size, -1)
|
79 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
80 |
+
attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
|
81 |
+
sim.masked_fill_(~attention_mask, max_neg_value)
|
82 |
+
|
83 |
+
# attention, what we cannot get enough of
|
84 |
+
attn = sim.softmax(dim=-1)
|
85 |
+
out = torch.einsum("b i j, b j d -> b i d", attn, v)
|
86 |
+
out = self.batch_to_head_dim(out)
|
87 |
+
|
88 |
+
return to_out(out)
|
89 |
+
|
90 |
+
return forward
|
91 |
+
res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]} # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
|
92 |
+
for res in res_dict:
|
93 |
+
for block in res_dict[res]:
|
94 |
+
module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
|
95 |
+
module.forward = sa_forward(module)
|
96 |
+
setattr(module, 'injection_schedule', injection_schedule)
|
97 |
+
|
98 |
+
def register_attention_control_efficient_kv(model, injection_schedule):
|
99 |
+
def sa_forward(self):
|
100 |
+
to_out = self.to_out
|
101 |
+
if type(to_out) is torch.nn.modules.container.ModuleList:
|
102 |
+
to_out = self.to_out[0]
|
103 |
+
else:
|
104 |
+
to_out = self.to_out
|
105 |
+
|
106 |
+
def forward(x, encoder_hidden_states=None, attention_mask=None):
|
107 |
+
batch_size, sequence_length, dim = x.shape
|
108 |
+
h = self.heads
|
109 |
+
# if encoder_hidden_states is None:
|
110 |
+
# ipdb.set_trace()
|
111 |
+
|
112 |
+
is_cross = encoder_hidden_states is not None
|
113 |
+
encoder_hidden_states = encoder_hidden_states if is_cross else x
|
114 |
+
|
115 |
+
q = self.to_q(x)
|
116 |
+
q = self.head_to_batch_dim(q)
|
117 |
+
|
118 |
+
if not is_cross and self.injection_schedule is not None and (
|
119 |
+
self.t in self.injection_schedule or self.t == 1000):
|
120 |
+
# q = self.to_q(x)
|
121 |
+
k = self.to_k(encoder_hidden_states)
|
122 |
+
v = self.to_v(encoder_hidden_states)
|
123 |
+
|
124 |
+
source_batch_size = int(v.shape[0] // 3)
|
125 |
+
# inject unconditional
|
126 |
+
k[source_batch_size:2 * source_batch_size] = k[:source_batch_size]
|
127 |
+
v[source_batch_size:2 * source_batch_size] = v[:source_batch_size]
|
128 |
+
|
129 |
+
# inject conditional
|
130 |
+
k[2 * source_batch_size:] = k[:source_batch_size]
|
131 |
+
v[2 * source_batch_size:] = v[:source_batch_size]
|
132 |
+
|
133 |
+
# q = self.head_to_batch_dim(q)
|
134 |
+
k = self.head_to_batch_dim(k)
|
135 |
+
v = self.head_to_batch_dim(v)
|
136 |
+
else:
|
137 |
+
# q = self.to_q(x)
|
138 |
+
k = self.to_k(encoder_hidden_states)
|
139 |
+
# q = self.head_to_batch_dim(q)
|
140 |
+
k = self.head_to_batch_dim(k)
|
141 |
+
|
142 |
+
v = self.to_v(encoder_hidden_states)
|
143 |
+
v = self.head_to_batch_dim(v)
|
144 |
+
|
145 |
+
sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
|
146 |
+
|
147 |
+
if attention_mask is not None:
|
148 |
+
attention_mask = attention_mask.reshape(batch_size, -1)
|
149 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
150 |
+
attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
|
151 |
+
sim.masked_fill_(~attention_mask, max_neg_value)
|
152 |
+
|
153 |
+
# attention, what we cannot get enough of
|
154 |
+
attn = sim.softmax(dim=-1)
|
155 |
+
out = torch.einsum("b i j, b j d -> b i d", attn, v)
|
156 |
+
out = self.batch_to_head_dim(out)
|
157 |
+
|
158 |
+
return to_out(out)
|
159 |
+
|
160 |
+
return forward
|
161 |
+
|
162 |
+
res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]} # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
|
163 |
+
for res in res_dict:
|
164 |
+
for block in res_dict[res]:
|
165 |
+
module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
|
166 |
+
module.forward = sa_forward(module)
|
167 |
+
setattr(module, 'injection_schedule', injection_schedule)
|
168 |
+
|
169 |
+
|
170 |
+
def register_conv_control_efficient(model, injection_schedule):
|
171 |
+
def conv_forward(self):
|
172 |
+
def forward(input_tensor, temb):
|
173 |
+
hidden_states = input_tensor
|
174 |
+
|
175 |
+
hidden_states = self.norm1(hidden_states)
|
176 |
+
hidden_states = self.nonlinearity(hidden_states)
|
177 |
+
|
178 |
+
if self.upsample is not None:
|
179 |
+
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
|
180 |
+
if hidden_states.shape[0] >= 64:
|
181 |
+
input_tensor = input_tensor.contiguous()
|
182 |
+
hidden_states = hidden_states.contiguous()
|
183 |
+
input_tensor = self.upsample(input_tensor)
|
184 |
+
hidden_states = self.upsample(hidden_states)
|
185 |
+
elif self.downsample is not None:
|
186 |
+
input_tensor = self.downsample(input_tensor)
|
187 |
+
hidden_states = self.downsample(hidden_states)
|
188 |
+
|
189 |
+
hidden_states = self.conv1(hidden_states)
|
190 |
+
|
191 |
+
if temb is not None:
|
192 |
+
temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
|
193 |
+
|
194 |
+
if temb is not None and self.time_embedding_norm == "default":
|
195 |
+
hidden_states = hidden_states + temb
|
196 |
+
|
197 |
+
hidden_states = self.norm2(hidden_states)
|
198 |
+
|
199 |
+
if temb is not None and self.time_embedding_norm == "scale_shift":
|
200 |
+
scale, shift = torch.chunk(temb, 2, dim=1)
|
201 |
+
hidden_states = hidden_states * (1 + scale) + shift
|
202 |
+
|
203 |
+
hidden_states = self.nonlinearity(hidden_states)
|
204 |
+
|
205 |
+
hidden_states = self.dropout(hidden_states)
|
206 |
+
hidden_states = self.conv2(hidden_states)
|
207 |
+
if self.injection_schedule is not None and (self.t in self.injection_schedule or self.t == 1000):
|
208 |
+
source_batch_size = int(hidden_states.shape[0] // 3)
|
209 |
+
# inject unconditional
|
210 |
+
hidden_states[source_batch_size:2 * source_batch_size] = hidden_states[:source_batch_size]
|
211 |
+
# inject conditional
|
212 |
+
hidden_states[2 * source_batch_size:] = hidden_states[:source_batch_size]
|
213 |
+
|
214 |
+
if self.conv_shortcut is not None:
|
215 |
+
input_tensor = self.conv_shortcut(input_tensor)
|
216 |
+
|
217 |
+
output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
|
218 |
+
|
219 |
+
return output_tensor
|
220 |
+
|
221 |
+
return forward
|
222 |
+
|
223 |
+
conv_module = model.unet.up_blocks[1].resnets[1]
|
224 |
+
conv_module.forward = conv_forward(conv_module)
|
225 |
+
setattr(conv_module, 'injection_schedule', injection_schedule)
|
226 |
+
|
227 |
+
|
228 |
+
def register_attention_control_efficient_kv_2nd_to_1st(model, injection_schedule, mask=None):
|
229 |
+
def sa_forward(self):
|
230 |
+
to_out = self.to_out
|
231 |
+
if type(to_out) is torch.nn.modules.container.ModuleList:
|
232 |
+
to_out = self.to_out[0]
|
233 |
+
else:
|
234 |
+
to_out = self.to_out
|
235 |
+
|
236 |
+
def forward(x, mask=mask, encoder_hidden_states=None, attention_mask=None):
|
237 |
+
batch_size, sequence_length, dim = x.shape
|
238 |
+
h = self.heads
|
239 |
+
# if encoder_hidden_states is None:
|
240 |
+
# ipdb.set_trace()
|
241 |
+
is_cross = encoder_hidden_states is not None
|
242 |
+
encoder_hidden_states = encoder_hidden_states if is_cross else x
|
243 |
+
|
244 |
+
q = self.to_q(x)
|
245 |
+
q = self.head_to_batch_dim(q)
|
246 |
+
|
247 |
+
if not is_cross and self.injection_schedule is not None and (
|
248 |
+
self.t in self.injection_schedule or self.t == 1000):
|
249 |
+
# q = self.to_q(x)
|
250 |
+
target_size = int(np.sqrt(encoder_hidden_states.shape[1]))
|
251 |
+
target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
|
252 |
+
target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)
|
253 |
+
k = self.to_k(encoder_hidden_states) # k: bx256x1280
|
254 |
+
v = self.to_v(encoder_hidden_states)
|
255 |
+
|
256 |
+
source_batch_size = int(v.shape[0] // 2)
|
257 |
+
# inject
|
258 |
+
k[:source_batch_size] = k[source_batch_size:2 * source_batch_size] * (1-target_mask) + k[:source_batch_size] * target_mask
|
259 |
+
v[:source_batch_size] = v[source_batch_size:2 * source_batch_size] * (1-target_mask) + v[:source_batch_size] * target_mask
|
260 |
+
|
261 |
+
# q = self.head_to_batch_dim(q)
|
262 |
+
k = self.head_to_batch_dim(k)
|
263 |
+
v = self.head_to_batch_dim(v)
|
264 |
+
else:
|
265 |
+
# q = self.to_q(x)
|
266 |
+
k = self.to_k(encoder_hidden_states)
|
267 |
+
# q = self.head_to_batch_dim(q)
|
268 |
+
k = self.head_to_batch_dim(k)
|
269 |
+
|
270 |
+
v = self.to_v(encoder_hidden_states)
|
271 |
+
v = self.head_to_batch_dim(v)
|
272 |
+
|
273 |
+
sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
|
274 |
+
|
275 |
+
if attention_mask is not None:
|
276 |
+
attention_mask = attention_mask.reshape(batch_size, -1)
|
277 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
278 |
+
attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
|
279 |
+
sim.masked_fill_(~attention_mask, max_neg_value)
|
280 |
+
|
281 |
+
# attention, what we cannot get enough of
|
282 |
+
attn = sim.softmax(dim=-1)
|
283 |
+
out = torch.einsum("b i j, b j d -> b i d", attn, v)
|
284 |
+
out = self.batch_to_head_dim(out)
|
285 |
+
|
286 |
+
return to_out(out)
|
287 |
+
|
288 |
+
return forward
|
289 |
+
|
290 |
+
# res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]} # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
|
291 |
+
res_dict = {1: [1, 2], 2: [0, 1, 2]} # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
|
292 |
+
|
293 |
+
for res in res_dict:
|
294 |
+
for block in res_dict[res]:
|
295 |
+
module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
|
296 |
+
module.forward = sa_forward(module)
|
297 |
+
setattr(module, 'injection_schedule', injection_schedule)
|
298 |
+
|
299 |
+
def register_conv_control_efficient_2nd_to_1st(model, injection_schedule, mask=None):
|
300 |
+
def conv_forward(self):
|
301 |
+
def forward(input_tensor, temb):
|
302 |
+
hidden_states = input_tensor
|
303 |
+
|
304 |
+
hidden_states = self.norm1(hidden_states)
|
305 |
+
hidden_states = self.nonlinearity(hidden_states)
|
306 |
+
|
307 |
+
if self.upsample is not None:
|
308 |
+
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
|
309 |
+
if hidden_states.shape[0] >= 64:
|
310 |
+
input_tensor = input_tensor.contiguous()
|
311 |
+
hidden_states = hidden_states.contiguous()
|
312 |
+
input_tensor = self.upsample(input_tensor)
|
313 |
+
hidden_states = self.upsample(hidden_states)
|
314 |
+
elif self.downsample is not None:
|
315 |
+
input_tensor = self.downsample(input_tensor)
|
316 |
+
hidden_states = self.downsample(hidden_states)
|
317 |
+
|
318 |
+
hidden_states = self.conv1(hidden_states)
|
319 |
+
|
320 |
+
if temb is not None:
|
321 |
+
temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
|
322 |
+
|
323 |
+
if temb is not None and self.time_embedding_norm == "default":
|
324 |
+
hidden_states = hidden_states + temb
|
325 |
+
|
326 |
+
hidden_states = self.norm2(hidden_states)
|
327 |
+
|
328 |
+
if temb is not None and self.time_embedding_norm == "scale_shift":
|
329 |
+
scale, shift = torch.chunk(temb, 2, dim=1)
|
330 |
+
hidden_states = hidden_states * (1 + scale) + shift
|
331 |
+
|
332 |
+
hidden_states = self.nonlinearity(hidden_states)
|
333 |
+
|
334 |
+
hidden_states = self.dropout(hidden_states)
|
335 |
+
hidden_states = self.conv2(hidden_states)
|
336 |
+
if self.injection_schedule is not None and (self.t in self.injection_schedule or self.t == 1000):
|
337 |
+
source_batch_size = int(hidden_states.shape[0] // 2)
|
338 |
+
# inject unconditional
|
339 |
+
# hidden_states[source_batch_size:2 * source_batch_size] = hidden_states[:source_batch_size]
|
340 |
+
# inject conditional
|
341 |
+
target_size = int(np.sqrt(hidden_states.shape[-1]))
|
342 |
+
target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
|
343 |
+
target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)
|
344 |
+
|
345 |
+
hidden_states[:source_batch_size] = hidden_states[source_batch_size:] * (1-target_mask) + hidden_states[:source_batch_size] * target_mask
|
346 |
+
|
347 |
+
if self.conv_shortcut is not None:
|
348 |
+
input_tensor = self.conv_shortcut(input_tensor)
|
349 |
+
|
350 |
+
output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
|
351 |
+
|
352 |
+
return output_tensor
|
353 |
+
|
354 |
+
return forward
|
355 |
+
|
356 |
+
conv_module = model.unet.up_blocks[1].resnets[1]
|
357 |
+
conv_module.forward = conv_forward(conv_module)
|
358 |
+
setattr(conv_module, 'injection_schedule', injection_schedule)
|
359 |
+
|
360 |
+
|
361 |
+
def register_attention_control_efficient_qk_w_mask(model, injection_schedule, mask):
    def sa_forward(self):
        to_out = self.to_out
        if type(to_out) is torch.nn.modules.container.ModuleList:
            to_out = self.to_out[0]
        else:
            to_out = self.to_out

        def forward(x, encoder_hidden_states=None, attention_mask=None):
            batch_size, sequence_length, dim = x.shape
            h = self.heads

            is_cross = encoder_hidden_states is not None
            encoder_hidden_states = encoder_hidden_states if is_cross else x
            if not is_cross and self.injection_schedule is not None and (
                    self.t in self.injection_schedule or self.t == 1000):
                q = self.to_q(x)
                k = self.to_k(encoder_hidden_states)

                target_size = int(np.sqrt(encoder_hidden_states.shape[1]))
                target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
                target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)

                source_batch_size = int(q.shape[0] // 3)
                # inject unconditional
                q[source_batch_size:2 * source_batch_size] = q[:source_batch_size] * target_mask + q[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                k[source_batch_size:2 * source_batch_size] = k[:source_batch_size] * target_mask + k[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                # inject conditional
                q[2 * source_batch_size:] = q[:source_batch_size] * target_mask + q[2 * source_batch_size:] * (1 - target_mask)
                k[2 * source_batch_size:] = k[:source_batch_size] * target_mask + k[2 * source_batch_size:] * (1 - target_mask)

                q = self.head_to_batch_dim(q)
                k = self.head_to_batch_dim(k)
            else:
                q = self.to_q(x)
                k = self.to_k(encoder_hidden_states)
                q = self.head_to_batch_dim(q)
                k = self.head_to_batch_dim(k)

            v = self.to_v(encoder_hidden_states)
            v = self.head_to_batch_dim(v)

            sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale

            if attention_mask is not None:
                attention_mask = attention_mask.reshape(batch_size, -1)
                max_neg_value = -torch.finfo(sim.dtype).max
                attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
                sim.masked_fill_(~attention_mask, max_neg_value)

            # attention, what we cannot get enough of
            attn = sim.softmax(dim=-1)
            out = torch.einsum("b i j, b j d -> b i d", attn, v)
            out = self.batch_to_head_dim(out)

            return to_out(out)

        return forward

    res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]}  # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution

    for res in res_dict:
        for block in res_dict[res]:
            module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
            module.forward = sa_forward(module)
            setattr(module, 'injection_schedule', injection_schedule)

def register_attention_control_efficient_kv_w_mask(model, injection_schedule, mask, do_classifier_free_guidance):
    def sa_forward(self):
        to_out = self.to_out
        if type(to_out) is torch.nn.modules.container.ModuleList:
            to_out = self.to_out[0]
        else:
            to_out = self.to_out

        def forward(x, encoder_hidden_states=None, attention_mask=None):
            batch_size, sequence_length, dim = x.shape
            h = self.heads

            is_cross = encoder_hidden_states is not None
            encoder_hidden_states = encoder_hidden_states if is_cross else x

            q = self.to_q(x)
            q = self.head_to_batch_dim(q)

            if not is_cross and self.injection_schedule is not None and (
                    self.t in self.injection_schedule or self.t == 1000):
                # if False:
                k = self.to_k(encoder_hidden_states)  # k: bx256x1280
                v = self.to_v(encoder_hidden_states)

                target_size = int(np.sqrt(encoder_hidden_states.shape[1]))
                target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
                target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)

                source_batch_size = int(v.shape[0] // 3)
                if do_classifier_free_guidance:
                    # inject unconditional
                    v[source_batch_size:2 * source_batch_size] = v[:source_batch_size] * target_mask + v[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                    k[source_batch_size:2 * source_batch_size] = k[:source_batch_size] * target_mask + k[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                    # inject conditional
                    v[2 * source_batch_size:] = v[:source_batch_size] * target_mask + v[2 * source_batch_size:] * (1 - target_mask)
                    k[2 * source_batch_size:] = k[:source_batch_size] * target_mask + k[2 * source_batch_size:] * (1 - target_mask)
                else:
                    v[source_batch_size:2 * source_batch_size] = v[:source_batch_size] * target_mask + v[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                    k[source_batch_size:2 * source_batch_size] = k[:source_batch_size] * target_mask + k[source_batch_size:2 * source_batch_size] * (1 - target_mask)

                k = self.head_to_batch_dim(k)
                v = self.head_to_batch_dim(v)
            else:
                # q = self.to_q(x)
                k = self.to_k(encoder_hidden_states)
                # q = self.head_to_batch_dim(q)
                k = self.head_to_batch_dim(k)

                v = self.to_v(encoder_hidden_states)
                v = self.head_to_batch_dim(v)

            sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale

            if attention_mask is not None:
                attention_mask = attention_mask.reshape(batch_size, -1)
                max_neg_value = -torch.finfo(sim.dtype).max
                attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
                sim.masked_fill_(~attention_mask, max_neg_value)

            # attention, what we cannot get enough of
            attn = sim.softmax(dim=-1)
            out = torch.einsum("b i j, b j d -> b i d", attn, v)
            out = self.batch_to_head_dim(out)

            return to_out(out)

        return forward

    res_dict = {1: [0, 1, 2], 2: [0, 1, 2], 3: [0, 1, 2]}  # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
    # res_dict = {1: [2], 2: [2], 3: [2]}  # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution

    for res in res_dict:
        for block in res_dict[res]:
            module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
            module.forward = sa_forward(module)
            setattr(module, 'injection_schedule', injection_schedule)
    # down_res_dict = {0: [0, 1], 1: [0, 1], 2: [0, 1]}
    # for res in down_res_dict:
    #     for block in down_res_dict[res]:
    #         module = model.unet.down_blocks[res].attentions[block].transformer_blocks[0].attn1
    #         module.forward = sa_forward(module)
    #         setattr(module, 'injection_schedule', injection_schedule)

def register_conv_control_efficient_w_mask(model, injection_schedule, mask):
    def conv_forward(self):
        def forward(input_tensor, temb):
            hidden_states = input_tensor

            hidden_states = self.norm1(hidden_states)
            hidden_states = self.nonlinearity(hidden_states)

            if self.upsample is not None:
                # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
                if hidden_states.shape[0] >= 64:
                    input_tensor = input_tensor.contiguous()
                    hidden_states = hidden_states.contiguous()
                input_tensor = self.upsample(input_tensor)
                hidden_states = self.upsample(hidden_states)
            elif self.downsample is not None:
                input_tensor = self.downsample(input_tensor)
                hidden_states = self.downsample(hidden_states)

            hidden_states = self.conv1(hidden_states)

            if temb is not None:
                temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]

            if temb is not None and self.time_embedding_norm == "default":
                hidden_states = hidden_states + temb

            hidden_states = self.norm2(hidden_states)

            if temb is not None and self.time_embedding_norm == "scale_shift":
                scale, shift = torch.chunk(temb, 2, dim=1)
                hidden_states = hidden_states * (1 + scale) + shift

            hidden_states = self.nonlinearity(hidden_states)

            hidden_states = self.dropout(hidden_states)
            hidden_states = self.conv2(hidden_states)
            if self.injection_schedule is not None and (self.t in self.injection_schedule or self.t == 1000):
                # if False:
                source_batch_size = int(hidden_states.shape[0] // 3)
                target_size = int(np.sqrt(hidden_states.shape[-1]))
                target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
                target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)

                # inject unconditional
                hidden_states[source_batch_size:2 * source_batch_size] = hidden_states[:source_batch_size] * target_mask + hidden_states[source_batch_size:2 * source_batch_size] * (1-target_mask)
                # inject conditional
                hidden_states[2 * source_batch_size:] = hidden_states[:source_batch_size] * target_mask + hidden_states[2 * source_batch_size:] * (1-target_mask)

            if self.conv_shortcut is not None:
                input_tensor = self.conv_shortcut(input_tensor)

            output_tensor = (input_tensor + hidden_states) / self.output_scale_factor

            return output_tensor

        return forward

    conv_module = model.unet.up_blocks[1].resnets[1]
    conv_module.forward = conv_forward(conv_module)
    setattr(conv_module, 'injection_schedule', injection_schedule)
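The registration helpers above monkey-patch self-attention and ResNet modules inside the pipeline's UNet so that, at scheduled timesteps, features under a spatial mask are copied from the source branch of a 3x (source / unconditional / conditional) batch. The sketch below shows one way this could be wired up; it is not part of the uploaded files. The helper name enable_masked_feature_injection and the injection_ratio parameter are illustrative, and the register_time(pipe, t) call signature is an assumption inferred from its import in sd/core.py.

import torch
from sd.pnp_utils import (register_time,
                          register_attention_control_efficient_kv_w_mask,
                          register_conv_control_efficient_w_mask)

def enable_masked_feature_injection(pipe, mask, num_inference_steps=50, injection_ratio=0.8):
    """Patch `pipe`'s UNet so masked regions reuse source features during denoising.

    mask: (B, H, W) float tensor in [0, 1]; 1 marks regions whose K/V and ResNet
    features are overwritten with the source branch's at the scheduled timesteps.
    """
    pipe.scheduler.set_timesteps(num_inference_steps)
    timesteps = pipe.scheduler.timesteps
    # Inject only during the earliest (noisiest) portion of the schedule.
    injection_schedule = timesteps[: int(len(timesteps) * injection_ratio)].tolist()

    register_attention_control_efficient_kv_w_mask(pipe, injection_schedule, mask,
                                                   do_classifier_free_guidance=True)
    register_conv_control_efficient_w_mask(pipe, injection_schedule, mask)
    return injection_schedule

# Inside the denoising loop, the current timestep must be pushed to the patched
# modules before each UNet call, e.g. register_time(pipe, t.item()) (signature
# assumed), since the hooks compare self.t against injection_schedule.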
weights/dpt_beit_large_512.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e9e900747e9e8b3112df716979219836a27716277b3d0dc53889cbba8b82328
size 1581966003
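The three lines above are a Git LFS pointer rather than the checkpoint itself: the oid and size identify the roughly 1.5 GB MiDaS dpt_beit_large_512 depth-estimation weights tracked in LFS. The snippet below is a small sanity-check sketch, not part of the commit; it assumes the pointer has been resolved to the real file (for example via `git lfs pull`) and that the checkpoint is a plain state_dict, possibly wrapped under a "model" key.

import torch

ckpt_path = "weights/dpt_beit_large_512.pt"
state = torch.load(ckpt_path, map_location="cpu")
# Unwrap a possible {"model": state_dict} container (layout is an assumption).
state_dict = state.get("model", state) if isinstance(state, dict) else state
print(f"loaded {len(state_dict)} tensors from {ckpt_path}")
for name in list(state_dict)[:5]:
    print(name, tuple(state_dict[name].shape))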