imsuperkong committed
Commit d3bdeec • 1 parent: dc47947
Upload 6 files
- requirements.txt +7 -0
- sd/core.py +435 -0
- sd/dift_sd.py +240 -0
- sd/gradio_utils.py +85 -0
- sd/pnp_utils.py +569 -0
- weights/dpt_beit_large_512.pt +3 -0
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch==2.0.1
torchvision
timm==0.6.12
gradio==3.40.1
diffusers==0.17.1
numpy==1.20.3
wget
sd/core.py
ADDED
@@ -0,0 +1,435 @@
import torch
import numpy as np
import torch.nn.functional as F
from diffusers import StableDiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from typing import Any, Callable, Dict, List, Optional, Union
from sd.pnp_utils import register_time, register_attention_control_efficient_kv_w_mask, register_conv_control_efficient_w_mask
import torch.nn as nn
from sd.dift_sd import MyUNet2DConditionModel, OneStepSDPipeline
import ipdb
from tqdm import tqdm
from lib.midas import MiDas


class DDIMBackward(StableDiffusionPipeline):
    def __init__(
        self, vae, text_encoder, tokenizer, unet, scheduler,
        safety_checker, feature_extractor,
        requires_safety_checker: bool = True,
        device='cuda', model_id='ckpt/stable-diffusion-2-1-base', depth_model='dpt_swin2_large_384'
    ):
        super().__init__(
            vae, text_encoder, tokenizer, unet, scheduler,
            safety_checker, feature_extractor, requires_safety_checker,
        )

        self.dift_unet = MyUNet2DConditionModel.from_pretrained(model_id, subfolder="unet", torch_dtype=torch.float16 if 'cuda' in device else torch.float32)
        self.onestep_pipe = OneStepSDPipeline.from_pretrained(model_id, unet=self.dift_unet, safety_checker=None, torch_dtype=torch.float16 if 'cuda' in device else torch.float32)
        self.onestep_pipe = self.onestep_pipe.to(device)

        if 'cuda' in device:
            self.onestep_pipe.enable_attention_slicing()
            self.onestep_pipe.enable_xformers_memory_efficient_attention()
        self.ensemble_size = 4
        self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)

        self.midas_model = MiDas(device, model_type=depth_model)

        self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32

    @torch.no_grad()
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        t_start=None,
    ):
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        do_classifier_free_guidance = guidance_scale > 1.0
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if t_start and t >= t_start:
                    progress_bar.update()
                    continue

                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
                ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        if output_type == "latent":
            image = latents
            has_nsfw_concept = None
        elif output_type == "pil":
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
            image = self.numpy_to_pil(image)
        else:
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    def denoise_w_injection(
        self,
        prompt: Union[str, List[str]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        t_start=None,
        attn=0.8,
        f=0.5,
        latent_mask=None,
        guidance_loss_scale=0,
        cfg_decay=False,
        cfg_norm=False,
        lr=1.0,
        up_ft_indexes=[1, 2],
        img_tensor=None,
        early_stop=50,
        intrinsic=None, extrinsic=None, threshold=20, depth=None,
    ):
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
        self.check_inputs(
            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
        do_classifier_free_guidance = guidance_scale > 1.0
        prompt_embeds = self._encode_prompt(
            prompt,
            device,
            num_images_per_prompt,
            do_classifier_free_guidance,
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
        )
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat((prompt_embeds[1:], prompt_embeds[1:], prompt_embeds[:1]), dim=0)
        else:
            prompt_embeds = torch.cat([prompt_embeds] * 2, dim=0)

        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        kv_injection_timesteps = self.scheduler.timesteps[:int(len(self.scheduler.timesteps) * attn)]
        f_injection_timesteps = self.scheduler.timesteps[:int(len(self.scheduler.timesteps) * f)]
        register_attention_control_efficient_kv_w_mask(self, kv_injection_timesteps, mask=latent_mask, do_classifier_free_guidance=do_classifier_free_guidance)
        register_conv_control_efficient_w_mask(self, f_injection_timesteps, mask=latent_mask)
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if t_start and t >= t_start:
                    progress_bar.update()
                    continue
                if i > early_stop: guidance_loss_scale = 0  # Early stop (optional)
                # if t > 300: guidance_loss_scale = 0 # Early stop (optional)
                register_time(self, t.item())
                # Set requires grad
                if guidance_loss_scale != 0:
                    latents = latents.detach().requires_grad_()

                # expand the latents if we are doing classifier free guidance
                latent_model_input = latents  # latents: ori_z + wrap_z
                if do_classifier_free_guidance:
                    latent_model_input = torch.cat([latent_model_input, latent_model_input[1:]], dim=0)
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

                # predict the noise residual
                if guidance_loss_scale != 0:
                    with torch.no_grad():
                        noise_pred = self.unet(
                            latent_model_input,
                            t,
                            encoder_hidden_states=prompt_embeds,
                            cross_attention_kwargs=cross_attention_kwargs,
                        ).sample
                else:
                    with torch.no_grad():
                        noise_pred = self.unet(
                            latent_model_input,
                            t,
                            encoder_hidden_states=prompt_embeds,
                            cross_attention_kwargs=cross_attention_kwargs,
                        ).sample

                # perform guidance
                if do_classifier_free_guidance:
                    cfg_scale = guidance_scale
                    if cfg_decay: cfg_scale = 1 + guidance_scale * (1 - i / num_inference_steps)
                    noise_pred_text, wrap_noise_pred_text, wrap_noise_pred_uncond = noise_pred.chunk(3)
                    noise_pred = wrap_noise_pred_text + cfg_scale * (wrap_noise_pred_text - wrap_noise_pred_uncond)
                else:
                    noise_pred_text, wrap_noise_pred_text = noise_pred.chunk(3)
                    noise_pred = wrap_noise_pred_text

                if cfg_norm:
                    noise_pred = noise_pred * (torch.linalg.norm(wrap_noise_pred_uncond) / torch.linalg.norm(noise_pred))

                if guidance_loss_scale != 0:
                    for up_ft_index in up_ft_indexes:

                        alpha_prod_t = self.scheduler.alphas_cumprod[t]
                        alpha_prod_t_prev = (
                            self.scheduler.alphas_cumprod[timesteps[i - 0]]
                            if i > 0 else self.scheduler.final_alpha_cumprod
                        )

                        mu = alpha_prod_t ** 0.5
                        mu_prev = alpha_prod_t_prev ** 0.5
                        sigma = (1 - alpha_prod_t) ** 0.5
                        sigma_prev = (1 - alpha_prod_t_prev) ** 0.5

                        pred_x0 = (latents - sigma_prev * noise_pred[:latents.shape[0]]) / mu_prev

                        unet_ft_all = self.onestep_pipe(
                            latents=pred_x0[:1].repeat(self.ensemble_size, 1, 1, 1),
                            t=t,
                            up_ft_indices=[up_ft_index],
                            prompt_embeds=prompt_embeds[:1].repeat(self.ensemble_size, 1, 1)
                        )
                        unet_ft1 = unet_ft_all['up_ft'][up_ft_index].mean(0, keepdim=True)  # 1,c,h,w
                        unet_ft1_norm = unet_ft1 / torch.norm(unet_ft1, dim=1, keepdim=True)

                        unet_ft1_norm = self.midas_model.wrap_img_tensor_w_fft_ext(
                            unet_ft1_norm.to(self.torch_dtype),
                            torch.from_numpy(depth).to(device).to(self.torch_dtype),
                            intrinsic,
                            extrinsic[:3, :3], extrinsic[:3, 3], threshold=threshold).to(self.torch_dtype)

                        unet_ft_all = self.onestep_pipe(
                            latents=pred_x0[1:2].repeat(self.ensemble_size, 1, 1, 1),
                            t=t,
                            up_ft_indices=[up_ft_index],
                            prompt_embeds=prompt_embeds[:1].repeat(self.ensemble_size, 1, 1)
                        )
                        unet_ft2 = unet_ft_all['up_ft'][up_ft_index].mean(0, keepdim=True)  # 1,c,h,w
                        unet_ft2_norm = unet_ft2 / torch.norm(unet_ft2, dim=1, keepdim=True)
                        c = unet_ft2.shape[1]
                        loss = (-self.cos(unet_ft1_norm.squeeze().view(c, -1).T, unet_ft2_norm.squeeze().view(c, -1).T).mean() + 1) / 2.
                        # Get gradient
                        cond_grad = torch.autograd.grad(loss * guidance_loss_scale, latents)[0][1:2]

                    # compute the previous noisy sample x_t -> x_t-1
                    noise_pred_ = noise_pred - sigma_prev * cond_grad * lr
                    noise_pred_ = torch.cat([noise_pred_text, noise_pred_], dim=0)

                # compute the previous noisy sample x_t -> x_t-1
                with torch.no_grad():
                    latents = self.scheduler.step(noise_pred_, t, latents, **extra_step_kwargs).prev_sample
                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        callback(i, t, latents)

        if output_type == "latent":
            image = latents
            has_nsfw_concept = None
        elif output_type == "pil":
            with torch.no_grad():
                image = self.decode_latents(latents)
                image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
                image = self.numpy_to_pil(image)
        else:
            image = self.decode_latents(latents)
            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)

        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    @torch.no_grad()
    def decoder(self, latents):
        with torch.autocast(device_type=self.device, dtype=torch.float32):
            latents = 1 / 0.18215 * latents
            imgs = self.vae.decode(latents).sample
            imgs = (imgs / 2 + 0.5).clamp(0, 1)
        return imgs

    def ddim_inversion_w_grad(self, latent, cond, stop_t, guidance_loss_scale=1.0, lr=1.0):
        timesteps = reversed(self.scheduler.timesteps)
        with torch.autocast(device_type=self.device, dtype=torch.float32):

            for i, t in enumerate(tqdm(timesteps)):
                if t >= stop_t:
                    break

                if guidance_loss_scale != 0:
                    latent = latent.detach().requires_grad_()
                cond_batch = cond.repeat(latent.shape[0], 1, 1)

                alpha_prod_t = self.scheduler.alphas_cumprod[t]
                alpha_prod_t_prev = (
                    self.scheduler.alphas_cumprod[timesteps[i - 1]]
                    if i > 0 else self.scheduler.final_alpha_cumprod
                )

                mu = alpha_prod_t ** 0.5
                mu_prev = alpha_prod_t_prev ** 0.5
                sigma = (1 - alpha_prod_t) ** 0.5
                sigma_prev = (1 - alpha_prod_t_prev) ** 0.5

                eps = self.onestep_pipe.unet(latent, t, encoder_hidden_states=cond_batch, up_ft_indices=[3], output_eps=True)['eps']
                pred_x0 = (latent - sigma_prev * eps) / mu_prev

                unet_ft_all = self.onestep_pipe(
                    latents=pred_x0[:1].repeat(self.ensemble_size, 1, 1, 1),
                    t=t,
                    up_ft_indices=[1],
                    prompt_embeds=cond_batch[:1].repeat(self.ensemble_size, 1, 1)
                )
                unet_ft1 = unet_ft_all['up_ft'][1].mean(0, keepdim=True)  # 1,c,h,w
                unet_ft1_norm = unet_ft1 / torch.norm(unet_ft1, dim=1, keepdim=True)

                unet_ft_all = self.onestep_pipe(
                    latents=pred_x0[1:2].repeat(self.ensemble_size, 1, 1, 1),
                    t=t,
                    up_ft_indices=[1],
                    prompt_embeds=cond_batch[:1].repeat(self.ensemble_size, 1, 1)
                )
                unet_ft2 = unet_ft_all['up_ft'][1].mean(0, keepdim=True)  # 1,c,h,w
                unet_ft2_norm = unet_ft2 / torch.norm(unet_ft2, dim=1, keepdim=True)
                c = unet_ft2.shape[1]
                loss = (-self.cos(unet_ft1_norm.squeeze().view(c, -1).T.detach(), unet_ft2_norm.squeeze().view(c, -1).T).mean() + 1) / 2.
                print(f'loss: {loss.item()}')
                # Get gradient
                cond_grad = torch.autograd.grad(loss * guidance_loss_scale, latent)[0]

                # latent = latent.detach() - cond_grad * lr
                latent = mu * pred_x0 + sigma * eps - cond_grad * lr

        return latent


@torch.no_grad()
def DDPM_forward(x_t_dot, t_start, delta_t, ddpm_scheduler, generator):
    # just simple implementation, this should have an analytical expression
    # TODO: implementation analytical form
    for delta in range(1, delta_t):
        # noise = torch.randn_like(x_t_dot, generator=generator)
        noise = torch.empty_like(x_t_dot).normal_(generator=generator)

        beta = ddpm_scheduler.betas[t_start + delta]
        std_ = beta ** 0.5
        mu_ = ((1 - beta) ** 0.5) * x_t_dot
        x_t_dot = mu_ + std_ * noise
    return x_t_dot
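A minimal usage sketch may help make the two entry points above concrete. It is not part of this commit: `pipe` is assumed to be an already-constructed DDIMBackward instance, and `init_latents`, `mask`, `K`, `E`, and `depth` stand in for the stacked original/warped latents, the latent-space mask, the camera intrinsics/extrinsics, and the depth map (a NumPy array, since the code calls torch.from_numpy on it) that the Space prepares elsewhere.

# Hypothetical usage sketch (not part of this commit).
# Assumes `pipe` is a DDIMBackward instance and that init_latents stacks the
# original and warped latents along the batch dimension, as the comment
# "latents: ori_z + wrap_z" in denoise_w_injection suggests.
import torch

generator = torch.Generator('cuda').manual_seed(0)

# Plain DDIM sampling, identical in spirit to StableDiffusionPipeline.__call__.
image = pipe(prompt='a photo of a living room',
             num_inference_steps=50,
             guidance_scale=7.5,
             generator=generator).images[0]

# Denoising with K/V + conv-feature injection and DIFT-feature guidance.
# attn and f set the fraction of timesteps that inject attention K/V and conv
# features; guidance_loss_scale weights the cosine loss between the DIFT
# features of the original and warped branches.
result = pipe.denoise_w_injection(
    prompt='a photo of a living room',
    latents=init_latents,        # shape [2, 4, h, w]: original + warped
    latent_mask=mask,
    attn=0.8, f=0.5,
    guidance_loss_scale=1.0,
    intrinsic=K, extrinsic=E,    # E: 4x4 (or 3x4) camera pose, sliced as E[:3, :3], E[:3, 3]
    depth=depth,                 # NumPy depth map
    num_inference_steps=50,
)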
sd/dift_sd.py
ADDED
@@ -0,0 +1,240 @@
from diffusers import StableDiffusionPipeline
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
from typing import Any, Callable, Dict, List, Optional, Union
from diffusers.models.unet_2d_condition import UNet2DConditionModel
from diffusers import DDIMScheduler
import gc
from PIL import Image

class MyUNet2DConditionModel(UNet2DConditionModel):
    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        up_ft_indices,
        encoder_hidden_states: torch.Tensor,
        class_labels: Optional[torch.Tensor] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        output_eps=False):
        r"""
        Args:
            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under
                `self.processor` in
                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
        """
        # By default samples have to be AT least a multiple of the overall upsampling factor.
        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
        # However, the upsampling interpolation output size can be forced to fit any upsampling size
        # on the fly if necessary.
        default_overall_up_factor = 2**self.num_upsamplers

        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
        forward_upsample_size = False
        upsample_size = None

        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
            # logger.info("Forward upsample size to force interpolation output size.")
            forward_upsample_size = True

        # prepare attention_mask
        if attention_mask is not None:
            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # 0. center input if necessary
        if self.config.center_input_sample:
            sample = 2 * sample - 1.0

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timesteps = timesteps.expand(sample.shape[0])

        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=self.dtype)

        emb = self.time_embedding(t_emb, timestep_cond)

        if self.class_embedding is not None:
            if class_labels is None:
                raise ValueError("class_labels should be provided when num_class_embeds > 0")

            if self.config.class_embed_type == "timestep":
                class_labels = self.time_proj(class_labels)

            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
            emb = emb + class_emb

        # 2. pre-process
        sample = self.conv_in(sample)

        # 3. down
        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=attention_mask,
                    cross_attention_kwargs=cross_attention_kwargs,
                )
            else:
                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)

            down_block_res_samples += res_samples

        # 4. mid
        if self.mid_block is not None:
            sample = self.mid_block(
                sample,
                emb,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=attention_mask,
                cross_attention_kwargs=cross_attention_kwargs,
            )

        # 5. up
        up_ft = {}
        for i, upsample_block in enumerate(self.up_blocks):

            if i > np.max(up_ft_indices):
                break

            is_final_block = i == len(self.up_blocks) - 1

            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

            # if we have not reached the final block and need to forward the
            # upsample size, we do it here
            if not is_final_block and forward_upsample_size:
                upsample_size = down_block_res_samples[-1].shape[2:]

            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    upsample_size=upsample_size,
                    attention_mask=attention_mask,
                )
            else:
                sample = upsample_block(
                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
                )

            if i in up_ft_indices:
                up_ft[i] = sample

        output = {}
        output['up_ft'] = up_ft
        if output_eps:
            sample = self.conv_norm_out(sample)
            sample = self.conv_act(sample)
            sample = self.conv_out(sample)
            output['eps'] = sample
        return output

class OneStepSDPipeline(StableDiffusionPipeline):
    # @torch.no_grad()
    def __call__(
        self,
        t,
        up_ft_indices,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        img_tensor=None,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        latents=None
    ):
        device = self._execution_device
        if latents is None:
            latents = self.vae.encode(img_tensor).latent_dist.sample() * self.vae.config.scaling_factor
        t = torch.tensor(t.clone().detach(), dtype=torch.long, device=device)
        noise = torch.randn_like(latents).to(device)
        latents_noisy = self.scheduler.add_noise(latents, noise, t)
        unet_output = self.unet(latents_noisy,
                                t,
                                up_ft_indices,
                                encoder_hidden_states=prompt_embeds,
                                cross_attention_kwargs=cross_attention_kwargs)
        return unet_output


class SDFeaturizer:
    def __init__(self, sd_id='ckpt/stable-diffusion-2-1-base'):
        unet = MyUNet2DConditionModel.from_pretrained(sd_id, subfolder="unet")
        onestep_pipe = OneStepSDPipeline.from_pretrained(sd_id, unet=unet, safety_checker=None)
        onestep_pipe.vae.decoder = None
        onestep_pipe.scheduler = DDIMScheduler.from_pretrained(sd_id, subfolder="scheduler")
        gc.collect()
        onestep_pipe = onestep_pipe.to("cuda")
        onestep_pipe.enable_attention_slicing()
        onestep_pipe.enable_xformers_memory_efficient_attention()
        self.pipe = onestep_pipe

    @torch.no_grad()
    def forward(self,
                img_tensor,
                prompt,
                t=261,
                up_ft_index=1,
                ensemble_size=8):
        '''
        Args:
            img_tensor: should be a single torch tensor in the shape of [1, C, H, W] or [C, H, W]
            prompt: the prompt to use, a string
            t: the time step to use, should be an int in the range of [0, 1000]
            up_ft_index: which upsampling block of the U-Net to extract feature, you can choose [0, 1, 2, 3]
            ensemble_size: the number of repeated images used in the batch to extract features
        Return:
            unet_ft: a torch tensor in the shape of [1, c, h, w]
        '''
        img_tensor = img_tensor.repeat(ensemble_size, 1, 1, 1).cuda()  # ensem, c, h, w
        prompt_embeds = self.pipe._encode_prompt(
            prompt=prompt,
            device='cuda',
            num_images_per_prompt=1,
            do_classifier_free_guidance=False)  # [1, 77, dim]
        prompt_embeds = prompt_embeds.repeat(ensemble_size, 1, 1)
        unet_ft_all = self.pipe(
            img_tensor=img_tensor,
            t=t,
            up_ft_indices=[up_ft_index],
            prompt_embeds=prompt_embeds)
        unet_ft = unet_ft_all['up_ft'][up_ft_index]  # ensem, c, h, w
        unet_ft = unet_ft.mean(0, keepdim=True)  # 1,c,h,w
        return unet_ft
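For reference, a small sketch of how SDFeaturizer might be driven; it is not part of this commit. The image path and prompt are placeholders, the checkpoint path mirrors the default sd_id above, and t is passed as a tensor because OneStepSDPipeline.__call__ calls .clone() on it. It also assumes a CUDA machine with xformers installed, since the constructor enables it.

# Hypothetical usage sketch (not part of this commit).
import torch
from PIL import Image
from torchvision.transforms.functional import to_tensor
from sd.dift_sd import SDFeaturizer

featurizer = SDFeaturizer(sd_id='ckpt/stable-diffusion-2-1-base')

img = Image.open('example.png').convert('RGB').resize((768, 768))
img_tensor = to_tensor(img) * 2 - 1                       # [3, H, W], scaled to [-1, 1]

ft = featurizer.forward(img_tensor.unsqueeze(0),           # [1, 3, H, W]
                        prompt='a photo of a room',
                        t=torch.tensor(261),               # tensor, not int: __call__ calls t.clone()
                        up_ft_index=1,
                        ensemble_size=8)
print(ft.shape)                                            # [1, c, h, w]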
sd/gradio_utils.py
ADDED
@@ -0,0 +1,85 @@

import copy
import math
import os
import urllib.request
from typing import List, Optional, Tuple

import numpy as np
import PIL
import PIL.Image
import PIL.ImageDraw
import torch
import torch.optim
from tqdm import tqdm
import ipdb

def tensor_to_PIL(img: torch.Tensor) -> PIL.Image.Image:
    """
    Converts a tensor image to a PIL Image.

    Args:
        img (torch.Tensor): The tensor image of shape [batch_size, num_channels, height, width].

    Returns:
        A PIL Image object.
    """
    img = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)
    return PIL.Image.fromarray(img[0].cpu().numpy(), "RGB")


def get_ellipse_coords(
    point: Tuple[int, int], radius: int = 5
) -> Tuple[int, int, int, int]:
    """
    Returns the coordinates of an ellipse centered at the given point.

    Args:
        point (Tuple[int, int]): The center point of the ellipse.
        radius (int): The radius of the ellipse.

    Returns:
        A tuple containing the coordinates of the ellipse in the format (x_min, y_min, x_max, y_max).
    """
    center = point
    return (
        center[0] - radius,
        center[1] - radius,
        center[0] + radius,
        center[1] + radius,
    )


def draw_handle_target_points(
    img: PIL.Image.Image,
    # handle_points: List[Tuple[int, int]],
    target_points: List[Tuple[int, int]],
    radius: int = 5):
    """
    Draws handle and target points with arrow pointing towards the target point.

    Args:
        img (PIL.Image.Image): The image to draw on.
        handle_points (List[Tuple[int, int]]): A list of handle [x,y] points.
        target_points (List[Tuple[int, int]]): A list of target [x,y] points.
        radius (int): The radius of the handle and target points.
    """
    if not isinstance(img, PIL.Image.Image):
        img = PIL.Image.fromarray(img)

    # if len(handle_points) == len(target_points) + 1:
    #     target_points = copy.deepcopy(target_points) + [None]

    draw = PIL.ImageDraw.Draw(img)
    for handle_point, target_point in zip(target_points, target_points):
        # handle_point = [handle_point[1], handle_point[0]]
        # Draw the handle point
        # ipdb.set_trace()

        target_coords = get_ellipse_coords(target_point, radius)
        draw.ellipse((target_coords), fill="red")

    return np.array(img)
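A short sketch of how these helpers might be used inside a Gradio callback; it is not part of this commit and the point coordinates are illustrative.

# Hypothetical usage sketch (not part of this commit).
import PIL.Image
from sd.gradio_utils import draw_handle_target_points

img = PIL.Image.new('RGB', (512, 512), 'white')
points = [(100, 150), (300, 220)]                  # clicked (x, y) pixel coordinates

overlay = draw_handle_target_points(img, points, radius=5)    # returns an np.ndarray copy
PIL.Image.fromarray(overlay).save('overlay.png')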
sd/pnp_utils.py
ADDED
@@ -0,0 +1,569 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import numpy as np
|
5 |
+
import ipdb
|
6 |
+
import torch.nn.functional as F
|
7 |
+
|
8 |
+
def seed_everything(seed):
|
9 |
+
torch.manual_seed(seed)
|
10 |
+
torch.cuda.manual_seed(seed)
|
11 |
+
random.seed(seed)
|
12 |
+
np.random.seed(seed)
|
13 |
+
|
14 |
+
def register_time(model, t):
|
15 |
+
conv_module = model.unet.up_blocks[1].resnets[1]
|
16 |
+
setattr(conv_module, 't', t)
|
17 |
+
down_res_dict = {0: [0, 1], 1: [0, 1], 2: [0, 1]}
|
18 |
+
up_res_dict = {1: [0, 1, 2], 2: [0, 1, 2], 3: [0, 1, 2]}
|
19 |
+
for res in up_res_dict:
|
20 |
+
for block in up_res_dict[res]:
|
21 |
+
module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
|
22 |
+
setattr(module, 't', t)
|
23 |
+
for res in down_res_dict:
|
24 |
+
for block in down_res_dict[res]:
|
25 |
+
module = model.unet.down_blocks[res].attentions[block].transformer_blocks[0].attn1
|
26 |
+
setattr(module, 't', t)
|
27 |
+
module = model.unet.mid_block.attentions[0].transformer_blocks[0].attn1
|
28 |
+
setattr(module, 't', t)
|
29 |
+
|
30 |
+
|
31 |
+
def load_source_latents_t(t, latents_path):
|
32 |
+
latents_t_path = os.path.join(latents_path, f'noisy_latents_{t}.pt')
|
33 |
+
assert os.path.exists(latents_t_path), f'Missing latents at t {t} path {latents_t_path}'
|
34 |
+
latents = torch.load(latents_t_path)
|
35 |
+
return latents
|
36 |
+
|
37 |
+
def register_attention_control_efficient(model, injection_schedule):
|
38 |
+
def sa_forward(self):
|
39 |
+
to_out = self.to_out
|
40 |
+
if type(to_out) is torch.nn.modules.container.ModuleList:
|
41 |
+
to_out = self.to_out[0]
|
42 |
+
else:
|
43 |
+
to_out = self.to_out
|
44 |
+
|
45 |
+
def forward(x, encoder_hidden_states=None, attention_mask=None):
|
46 |
+
batch_size, sequence_length, dim = x.shape
|
47 |
+
h = self.heads
|
48 |
+
|
49 |
+
is_cross = encoder_hidden_states is not None
|
50 |
+
encoder_hidden_states = encoder_hidden_states if is_cross else x
|
51 |
+
if not is_cross and self.injection_schedule is not None and (
|
52 |
+
self.t in self.injection_schedule or self.t == 1000):
|
53 |
+
q = self.to_q(x)
|
54 |
+
k = self.to_k(encoder_hidden_states)
|
55 |
+
|
56 |
+
source_batch_size = int(q.shape[0] // 3)
|
57 |
+
# inject unconditional
|
58 |
+
q[source_batch_size:2 * source_batch_size] = q[:source_batch_size]
|
59 |
+
k[source_batch_size:2 * source_batch_size] = k[:source_batch_size]
|
60 |
+
# inject conditional
|
61 |
+
q[2 * source_batch_size:] = q[:source_batch_size]
|
62 |
+
k[2 * source_batch_size:] = k[:source_batch_size]
|
63 |
+
|
64 |
+
q = self.head_to_batch_dim(q)
|
65 |
+
k = self.head_to_batch_dim(k)
|
66 |
+
else:
|
67 |
+
q = self.to_q(x)
|
68 |
+
k = self.to_k(encoder_hidden_states)
|
69 |
+
q = self.head_to_batch_dim(q)
|
70 |
+
k = self.head_to_batch_dim(k)
|
71 |
+
|
72 |
+
v = self.to_v(encoder_hidden_states)
|
73 |
+
v = self.head_to_batch_dim(v)
|
74 |
+
|
75 |
+
sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
|
76 |
+
|
77 |
+
if attention_mask is not None:
|
78 |
+
attention_mask = attention_mask.reshape(batch_size, -1)
|
79 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
80 |
+
attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
|
81 |
+
sim.masked_fill_(~attention_mask, max_neg_value)
|
82 |
+
|
83 |
+
# attention, what we cannot get enough of
|
84 |
+
attn = sim.softmax(dim=-1)
|
85 |
+
out = torch.einsum("b i j, b j d -> b i d", attn, v)
|
86 |
+
out = self.batch_to_head_dim(out)
|
87 |
+
|
88 |
+
return to_out(out)
|
89 |
+
|
90 |
+
return forward
|
91 |
+
res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]} # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
|
92 |
+
for res in res_dict:
|
93 |
+
for block in res_dict[res]:
|
94 |
+
module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
|
95 |
+
module.forward = sa_forward(module)
|
96 |
+
setattr(module, 'injection_schedule', injection_schedule)
|
97 |
+
|
98 |
+
def register_attention_control_efficient_kv(model, injection_schedule):
|
99 |
+
def sa_forward(self):
|
100 |
+
to_out = self.to_out
|
101 |
+
if type(to_out) is torch.nn.modules.container.ModuleList:
|
102 |
+
to_out = self.to_out[0]
|
103 |
+
else:
|
104 |
+
to_out = self.to_out
|
105 |
+
|
106 |
+
def forward(x, encoder_hidden_states=None, attention_mask=None):
|
107 |
+
batch_size, sequence_length, dim = x.shape
|
108 |
+
h = self.heads
|
109 |
+
# if encoder_hidden_states is None:
|
110 |
+
# ipdb.set_trace()
|
111 |
+
|
112 |
+
is_cross = encoder_hidden_states is not None
|
113 |
+
encoder_hidden_states = encoder_hidden_states if is_cross else x
|
114 |
+
|
115 |
+
q = self.to_q(x)
|
116 |
+
q = self.head_to_batch_dim(q)
|
117 |
+
|
118 |
+
if not is_cross and self.injection_schedule is not None and (
|
119 |
+
self.t in self.injection_schedule or self.t == 1000):
|
120 |
+
# q = self.to_q(x)
|
121 |
+
k = self.to_k(encoder_hidden_states)
|
122 |
+
v = self.to_v(encoder_hidden_states)
|
123 |
+
|
124 |
+
source_batch_size = int(v.shape[0] // 3)
|
125 |
+
# inject unconditional
|
126 |
+
k[source_batch_size:2 * source_batch_size] = k[:source_batch_size]
|
127 |
+
v[source_batch_size:2 * source_batch_size] = v[:source_batch_size]
|
128 |
+
|
129 |
+
# inject conditional
|
130 |
+
k[2 * source_batch_size:] = k[:source_batch_size]
|
131 |
+
v[2 * source_batch_size:] = v[:source_batch_size]
|
132 |
+
|
133 |
+
# q = self.head_to_batch_dim(q)
|
134 |
+
k = self.head_to_batch_dim(k)
|
135 |
+
v = self.head_to_batch_dim(v)
|
136 |
+
else:
|
137 |
+
# q = self.to_q(x)
|
138 |
+
k = self.to_k(encoder_hidden_states)
|
139 |
+
# q = self.head_to_batch_dim(q)
|
140 |
+
k = self.head_to_batch_dim(k)
|
141 |
+
|
142 |
+
v = self.to_v(encoder_hidden_states)
|
143 |
+
v = self.head_to_batch_dim(v)
|
144 |
+
|
145 |
+
sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
|
146 |
+
|
147 |
+
if attention_mask is not None:
|
148 |
+
attention_mask = attention_mask.reshape(batch_size, -1)
|
149 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
150 |
+
attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
|
151 |
+
sim.masked_fill_(~attention_mask, max_neg_value)
|
152 |
+
|
153 |
+
# attention, what we cannot get enough of
|
154 |
+
attn = sim.softmax(dim=-1)
|
155 |
+
out = torch.einsum("b i j, b j d -> b i d", attn, v)
|
156 |
+
out = self.batch_to_head_dim(out)
|
157 |
+
|
158 |
+
return to_out(out)
|
159 |
+
|
160 |
+
return forward
|
161 |
+
|
162 |
+
res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]} # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
|
163 |
+
for res in res_dict:
|
164 |
+
for block in res_dict[res]:
|
165 |
+
module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
|
166 |
+
module.forward = sa_forward(module)
|
167 |
+
setattr(module, 'injection_schedule', injection_schedule)
|
168 |
+
|
169 |
+
|
170 |
+
def register_conv_control_efficient(model, injection_schedule):
|
171 |
+
def conv_forward(self):
|
172 |
+
def forward(input_tensor, temb):
|
173 |
+
hidden_states = input_tensor
|
174 |
+
|
175 |
+
hidden_states = self.norm1(hidden_states)
|
176 |
+
hidden_states = self.nonlinearity(hidden_states)
|
177 |
+
|
178 |
+
if self.upsample is not None:
|
179 |
+
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
|
180 |
+
if hidden_states.shape[0] >= 64:
|
181 |
+
input_tensor = input_tensor.contiguous()
|
182 |
+
hidden_states = hidden_states.contiguous()
|
183 |
+
input_tensor = self.upsample(input_tensor)
|
184 |
+
hidden_states = self.upsample(hidden_states)
|
185 |
+
elif self.downsample is not None:
|
186 |
+
input_tensor = self.downsample(input_tensor)
|
187 |
+
hidden_states = self.downsample(hidden_states)
|
188 |
+
|
189 |
+
hidden_states = self.conv1(hidden_states)
|
190 |
+
|
191 |
+
if temb is not None:
|
192 |
+
temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
|
193 |
+
|
194 |
+
if temb is not None and self.time_embedding_norm == "default":
|
195 |
+
hidden_states = hidden_states + temb
|
196 |
+
|
197 |
+
hidden_states = self.norm2(hidden_states)
|
198 |
+
|
199 |
+
if temb is not None and self.time_embedding_norm == "scale_shift":
|
200 |
+
scale, shift = torch.chunk(temb, 2, dim=1)
|
201 |
+
hidden_states = hidden_states * (1 + scale) + shift
|
202 |
+
|
203 |
+
hidden_states = self.nonlinearity(hidden_states)
|
204 |
+
|
205 |
+
hidden_states = self.dropout(hidden_states)
|
206 |
+
hidden_states = self.conv2(hidden_states)
|
207 |
+
if self.injection_schedule is not None and (self.t in self.injection_schedule or self.t == 1000):
|
208 |
+
source_batch_size = int(hidden_states.shape[0] // 3)
|
209 |
+
# inject unconditional
|
210 |
+
hidden_states[source_batch_size:2 * source_batch_size] = hidden_states[:source_batch_size]
|
211 |
+
# inject conditional
|
212 |
+
hidden_states[2 * source_batch_size:] = hidden_states[:source_batch_size]
|
213 |
+
|
214 |
+
if self.conv_shortcut is not None:
|
215 |
+
input_tensor = self.conv_shortcut(input_tensor)
|
216 |
+
|
217 |
+
output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
|
218 |
+
|
219 |
+
return output_tensor
|
220 |
+
|
221 |
+
return forward
|
222 |
+
|
223 |
+
conv_module = model.unet.up_blocks[1].resnets[1]
|
224 |
+
conv_module.forward = conv_forward(conv_module)
|
225 |
+
setattr(conv_module, 'injection_schedule', injection_schedule)
|
226 |
+
|
227 |
+
|
228 |
+
def register_attention_control_efficient_kv_2nd_to_1st(model, injection_schedule, mask=None):
|
229 |
+
def sa_forward(self):
|
230 |
+
to_out = self.to_out
|
231 |
+
if type(to_out) is torch.nn.modules.container.ModuleList:
|
232 |
+
to_out = self.to_out[0]
|
233 |
+
else:
|
234 |
+
to_out = self.to_out
|
235 |
+
|
236 |
+
def forward(x, mask=mask, encoder_hidden_states=None, attention_mask=None):
|
237 |
+
batch_size, sequence_length, dim = x.shape
|
238 |
+
h = self.heads
|
239 |
+
# if encoder_hidden_states is None:
|
240 |
+
# ipdb.set_trace()
|
241 |
+
is_cross = encoder_hidden_states is not None
|
242 |
+
encoder_hidden_states = encoder_hidden_states if is_cross else x
|
243 |
+
|
244 |
+
q = self.to_q(x)
|
245 |
+
q = self.head_to_batch_dim(q)
|
246 |
+
|
247 |
+
if not is_cross and self.injection_schedule is not None and (
|
248 |
+
self.t in self.injection_schedule or self.t == 1000):
|
249 |
+
# q = self.to_q(x)
|
250 |
+
target_size = int(np.sqrt(encoder_hidden_states.shape[1]))
|
251 |
+
target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
|
252 |
+
target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)
|
253 |
+
k = self.to_k(encoder_hidden_states) # k: bx256x1280
|
254 |
+
v = self.to_v(encoder_hidden_states)
|
255 |
+
|
256 |
+
source_batch_size = int(v.shape[0] // 2)
|
257 |
+
# inject
|
258 |
+
k[:source_batch_size] = k[source_batch_size:2 * source_batch_size] * (1-target_mask) + k[:source_batch_size] * target_mask
|
259 |
+
v[:source_batch_size] = v[source_batch_size:2 * source_batch_size] * (1-target_mask) + v[:source_batch_size] * target_mask
|
260 |
+
|
261 |
+
# q = self.head_to_batch_dim(q)
|
262 |
+
k = self.head_to_batch_dim(k)
|
263 |
+
v = self.head_to_batch_dim(v)
|
264 |
+
else:
|
265 |
+
# q = self.to_q(x)
|
266 |
+
k = self.to_k(encoder_hidden_states)
|
267 |
+
# q = self.head_to_batch_dim(q)
|
268 |
+
k = self.head_to_batch_dim(k)
|
269 |
+
|
270 |
+
v = self.to_v(encoder_hidden_states)
|
271 |
+
v = self.head_to_batch_dim(v)
|
272 |
+
|
273 |
+
sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale
|
274 |
+
|
275 |
+
if attention_mask is not None:
|
276 |
+
attention_mask = attention_mask.reshape(batch_size, -1)
|
277 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
278 |
+
attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
|
279 |
+
sim.masked_fill_(~attention_mask, max_neg_value)
|
280 |
+
|
281 |
+
# attention, what we cannot get enough of
|
282 |
+
attn = sim.softmax(dim=-1)
|
283 |
+
out = torch.einsum("b i j, b j d -> b i d", attn, v)
|
284 |
+
out = self.batch_to_head_dim(out)
|
285 |
+
|
286 |
+
return to_out(out)
|
287 |
+
|
288 |
+
return forward
|
289 |
+
|
290 |
+
# res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]} # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
|
291 |
+
res_dict = {1: [1, 2], 2: [0, 1, 2]} # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
|
292 |
+
|
293 |
+
for res in res_dict:
|
294 |
+
for block in res_dict[res]:
|
295 |
+
module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
|
296 |
+
module.forward = sa_forward(module)
|
297 |
+
setattr(module, 'injection_schedule', injection_schedule)
|
298 |
+
|
299 |
+
def register_conv_control_efficient_2nd_to_1st(model, injection_schedule, mask=None):
|
300 |
+
def conv_forward(self):
|
301 |
+
def forward(input_tensor, temb):
|
302 |
+
hidden_states = input_tensor
|
303 |
+
|
304 |
+
hidden_states = self.norm1(hidden_states)
|
305 |
+
hidden_states = self.nonlinearity(hidden_states)
|
306 |
+
|
307 |
+
if self.upsample is not None:
|
308 |
+
# upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
|
309 |
+
if hidden_states.shape[0] >= 64:
|
310 |
+
input_tensor = input_tensor.contiguous()
|
311 |
+
hidden_states = hidden_states.contiguous()
|
312 |
+
input_tensor = self.upsample(input_tensor)
|
313 |
+
hidden_states = self.upsample(hidden_states)
|
314 |
+
elif self.downsample is not None:
|
315 |
+
input_tensor = self.downsample(input_tensor)
|
316 |
+
hidden_states = self.downsample(hidden_states)
|
317 |
+
|
318 |
+
hidden_states = self.conv1(hidden_states)
|
319 |
+
|
320 |
+
if temb is not None:
|
321 |
+
temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
|
322 |
+
|
323 |
+
if temb is not None and self.time_embedding_norm == "default":
|
324 |
+
hidden_states = hidden_states + temb
|
325 |
+
|
326 |
+
hidden_states = self.norm2(hidden_states)
|
327 |
+
|
328 |
+
if temb is not None and self.time_embedding_norm == "scale_shift":
|
329 |
+
scale, shift = torch.chunk(temb, 2, dim=1)
|
330 |
+
hidden_states = hidden_states * (1 + scale) + shift
|
331 |
+
|
332 |
+
hidden_states = self.nonlinearity(hidden_states)
|
333 |
+
|
334 |
+
hidden_states = self.dropout(hidden_states)
|
335 |
+
hidden_states = self.conv2(hidden_states)
|
336 |
+
if self.injection_schedule is not None and (self.t in self.injection_schedule or self.t == 1000):
|
337 |
+
source_batch_size = int(hidden_states.shape[0] // 2)
|
338 |
+
# inject unconditional
|
339 |
+
# hidden_states[source_batch_size:2 * source_batch_size] = hidden_states[:source_batch_size]
|
340 |
+
# inject conditional
|
341 |
+
target_size = int(np.sqrt(hidden_states.shape[-1]))
|
342 |
+
target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
|
343 |
+
target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)
|
344 |
+
|
345 |
+
hidden_states[:source_batch_size] = hidden_states[source_batch_size:] * (1-target_mask) + hidden_states[:source_batch_size] * target_mask
|
346 |
+
|
347 |
+
if self.conv_shortcut is not None:
|
348 |
+
input_tensor = self.conv_shortcut(input_tensor)
|
349 |
+
|
350 |
+
output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
|
351 |
+
|
352 |
+
return output_tensor
|
353 |
+
|
354 |
+
return forward
|
355 |
+
|
356 |
+
conv_module = model.unet.up_blocks[1].resnets[1]
|
357 |
+
conv_module.forward = conv_forward(conv_module)
|
358 |
+
setattr(conv_module, 'injection_schedule', injection_schedule)
|
359 |
+
|
360 |
+
|
361 |
+
def register_attention_control_efficient_qk_w_mask(model, injection_schedule, mask):
    def sa_forward(self):
        to_out = self.to_out
        if type(to_out) is torch.nn.modules.container.ModuleList:
            to_out = self.to_out[0]
        else:
            to_out = self.to_out

        def forward(x, encoder_hidden_states=None, attention_mask=None):
            batch_size, sequence_length, dim = x.shape
            h = self.heads

            is_cross = encoder_hidden_states is not None
            encoder_hidden_states = encoder_hidden_states if is_cross else x
            if not is_cross and self.injection_schedule is not None and (
                    self.t in self.injection_schedule or self.t == 1000):
                q = self.to_q(x)
                k = self.to_k(encoder_hidden_states)

                target_size = int(np.sqrt(encoder_hidden_states.shape[1]))
                target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
                target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)

                source_batch_size = int(q.shape[0] // 3)
                # inject unconditional
                q[source_batch_size:2 * source_batch_size] = q[:source_batch_size] * target_mask + q[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                k[source_batch_size:2 * source_batch_size] = k[:source_batch_size] * target_mask + k[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                # inject conditional
                q[2 * source_batch_size:] = q[:source_batch_size] * target_mask + q[2 * source_batch_size:] * (1 - target_mask)
                k[2 * source_batch_size:] = k[:source_batch_size] * target_mask + k[2 * source_batch_size:] * (1 - target_mask)

                q = self.head_to_batch_dim(q)
                k = self.head_to_batch_dim(k)
            else:
                q = self.to_q(x)
                k = self.to_k(encoder_hidden_states)
                q = self.head_to_batch_dim(q)
                k = self.head_to_batch_dim(k)

            v = self.to_v(encoder_hidden_states)
            v = self.head_to_batch_dim(v)

            sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale

            if attention_mask is not None:
                attention_mask = attention_mask.reshape(batch_size, -1)
                max_neg_value = -torch.finfo(sim.dtype).max
                attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
                sim.masked_fill_(~attention_mask, max_neg_value)

            # attention, what we cannot get enough of
            attn = sim.softmax(dim=-1)
            out = torch.einsum("b i j, b j d -> b i d", attn, v)
            out = self.batch_to_head_dim(out)

            return to_out(out)

        return forward

    res_dict = {1: [1, 2], 2: [0, 1, 2], 3: [0, 1, 2]}  # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution

    for res in res_dict:
        for block in res_dict[res]:
            module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
            module.forward = sa_forward(module)
            setattr(module, 'injection_schedule', injection_schedule)

def register_attention_control_efficient_kv_w_mask(model, injection_schedule, mask, do_classifier_free_guidance):
    def sa_forward(self):
        to_out = self.to_out
        if type(to_out) is torch.nn.modules.container.ModuleList:
            to_out = self.to_out[0]
        else:
            to_out = self.to_out

        def forward(x, encoder_hidden_states=None, attention_mask=None):
            batch_size, sequence_length, dim = x.shape
            h = self.heads

            is_cross = encoder_hidden_states is not None
            encoder_hidden_states = encoder_hidden_states if is_cross else x

            q = self.to_q(x)
            q = self.head_to_batch_dim(q)

            if not is_cross and self.injection_schedule is not None and (
                    self.t in self.injection_schedule or self.t == 1000):
                # if False:
                k = self.to_k(encoder_hidden_states)  # k: bx256x1280
                v = self.to_v(encoder_hidden_states)

                target_size = int(np.sqrt(encoder_hidden_states.shape[1]))
                target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
                target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)

                source_batch_size = int(v.shape[0] // 3)
                if do_classifier_free_guidance:
                    # inject unconditional
                    v[source_batch_size:2 * source_batch_size] = v[:source_batch_size] * target_mask + v[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                    k[source_batch_size:2 * source_batch_size] = k[:source_batch_size] * target_mask + k[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                    # inject conditional
                    v[2 * source_batch_size:] = v[:source_batch_size] * target_mask + v[2 * source_batch_size:] * (1 - target_mask)
                    k[2 * source_batch_size:] = k[:source_batch_size] * target_mask + k[2 * source_batch_size:] * (1 - target_mask)
                else:
                    v[source_batch_size:2 * source_batch_size] = v[:source_batch_size] * target_mask + v[source_batch_size:2 * source_batch_size] * (1 - target_mask)
                    k[source_batch_size:2 * source_batch_size] = k[:source_batch_size] * target_mask + k[source_batch_size:2 * source_batch_size] * (1 - target_mask)

                k = self.head_to_batch_dim(k)
                v = self.head_to_batch_dim(v)
            else:
                # q = self.to_q(x)
                k = self.to_k(encoder_hidden_states)
                # q = self.head_to_batch_dim(q)
                k = self.head_to_batch_dim(k)

                v = self.to_v(encoder_hidden_states)
                v = self.head_to_batch_dim(v)

            sim = torch.einsum("b i d, b j d -> b i j", q, k) * self.scale

            if attention_mask is not None:
                attention_mask = attention_mask.reshape(batch_size, -1)
                max_neg_value = -torch.finfo(sim.dtype).max
                attention_mask = attention_mask[:, None, :].repeat(h, 1, 1)
                sim.masked_fill_(~attention_mask, max_neg_value)

            # attention, what we cannot get enough of
            attn = sim.softmax(dim=-1)
            out = torch.einsum("b i j, b j d -> b i d", attn, v)
            out = self.batch_to_head_dim(out)

            return to_out(out)

        return forward

    res_dict = {1: [0, 1, 2], 2: [0, 1, 2], 3: [0, 1, 2]}  # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution
    # res_dict = {1: [2], 2: [2], 3: [2]}  # we are injecting attention in blocks 4 - 11 of the decoder, so not in the first block of the lowest resolution

    for res in res_dict:
        for block in res_dict[res]:
            module = model.unet.up_blocks[res].attentions[block].transformer_blocks[0].attn1
            module.forward = sa_forward(module)
            setattr(module, 'injection_schedule', injection_schedule)
    # down_res_dict = {0: [0, 1], 1: [0, 1], 2: [0, 1]}
    # for res in down_res_dict:
    #     for block in down_res_dict[res]:
    #         module = model.unet.down_blocks[res].attentions[block].transformer_blocks[0].attn1
    #         module.forward = sa_forward(module)
    #         setattr(module, 'injection_schedule', injection_schedule)

def register_conv_control_efficient_w_mask(model, injection_schedule, mask):
    def conv_forward(self):
        def forward(input_tensor, temb):
            hidden_states = input_tensor

            hidden_states = self.norm1(hidden_states)
            hidden_states = self.nonlinearity(hidden_states)

            if self.upsample is not None:
                # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
                if hidden_states.shape[0] >= 64:
                    input_tensor = input_tensor.contiguous()
                    hidden_states = hidden_states.contiguous()
                input_tensor = self.upsample(input_tensor)
                hidden_states = self.upsample(hidden_states)
            elif self.downsample is not None:
                input_tensor = self.downsample(input_tensor)
                hidden_states = self.downsample(hidden_states)

            hidden_states = self.conv1(hidden_states)

            if temb is not None:
                temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]

            if temb is not None and self.time_embedding_norm == "default":
                hidden_states = hidden_states + temb

            hidden_states = self.norm2(hidden_states)

            if temb is not None and self.time_embedding_norm == "scale_shift":
                scale, shift = torch.chunk(temb, 2, dim=1)
                hidden_states = hidden_states * (1 + scale) + shift

            hidden_states = self.nonlinearity(hidden_states)

            hidden_states = self.dropout(hidden_states)
            hidden_states = self.conv2(hidden_states)
            if self.injection_schedule is not None and (self.t in self.injection_schedule or self.t == 1000):
                # if False:
                source_batch_size = int(hidden_states.shape[0] // 3)
                target_size = int(np.sqrt(hidden_states.shape[-1]))
                target_mask = F.interpolate(mask.unsqueeze(1),size=(target_size, target_size))[:,0,:,:]
                target_mask = target_mask.view(target_mask.shape[0], -1).unsqueeze(-1)

                # inject unconditional
                hidden_states[source_batch_size:2 * source_batch_size] = hidden_states[:source_batch_size] * target_mask + hidden_states[source_batch_size:2 * source_batch_size] * (1-target_mask)
                # inject conditional
                hidden_states[2 * source_batch_size:] = hidden_states[:source_batch_size] * target_mask + hidden_states[2 * source_batch_size:] * (1-target_mask)

            if self.conv_shortcut is not None:
                input_tensor = self.conv_shortcut(input_tensor)

            output_tensor = (input_tensor + hidden_states) / self.output_scale_factor

            return output_tensor

        return forward

    conv_module = model.unet.up_blocks[1].resnets[1]
    conv_module.forward = conv_forward(conv_module)
    setattr(conv_module, 'injection_schedule', injection_schedule)
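The registration helpers above monkey-patch self-attention and ResNet modules inside the pipeline's UNet so that, at scheduled timesteps, features under a spatial mask are copied from the source branch of a 3x (source / unconditional / conditional) batch. The sketch below shows one way this could be wired up; it is not part of the uploaded files. The helper name enable_masked_feature_injection and the injection_ratio parameter are illustrative, and the register_time(pipe, t) call signature is an assumption inferred from its import in sd/core.py.

import torch
from sd.pnp_utils import (register_time,
                          register_attention_control_efficient_kv_w_mask,
                          register_conv_control_efficient_w_mask)

def enable_masked_feature_injection(pipe, mask, num_inference_steps=50, injection_ratio=0.8):
    """Patch `pipe`'s UNet so masked regions reuse source features during denoising.

    mask: (B, H, W) float tensor in [0, 1]; 1 marks regions whose K/V and ResNet
    features are overwritten with the source branch's at the scheduled timesteps.
    """
    pipe.scheduler.set_timesteps(num_inference_steps)
    timesteps = pipe.scheduler.timesteps
    # Inject only during the earliest (noisiest) portion of the schedule.
    injection_schedule = timesteps[: int(len(timesteps) * injection_ratio)].tolist()

    register_attention_control_efficient_kv_w_mask(pipe, injection_schedule, mask,
                                                   do_classifier_free_guidance=True)
    register_conv_control_efficient_w_mask(pipe, injection_schedule, mask)
    return injection_schedule

# Inside the denoising loop, the current timestep must be pushed to the patched
# modules before each UNet call, e.g. register_time(pipe, t.item()) (signature
# assumed), since the hooks compare self.t against injection_schedule.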
weights/dpt_beit_large_512.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9e9e900747e9e8b3112df716979219836a27716277b3d0dc53889cbba8b82328
size 1581966003
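The three lines above are a Git LFS pointer rather than the checkpoint itself: the oid and size identify the roughly 1.5 GB MiDaS dpt_beit_large_512 depth-estimation weights tracked in LFS. The snippet below is a small sanity-check sketch, not part of the commit; it assumes the pointer has been resolved to the real file (for example via `git lfs pull`) and that the checkpoint is a plain state_dict, possibly wrapped under a "model" key.

import torch

ckpt_path = "weights/dpt_beit_large_512.pt"
state = torch.load(ckpt_path, map_location="cpu")
# Unwrap a possible {"model": state_dict} container (layout is an assumption).
state_dict = state.get("model", state) if isinstance(state, dict) else state
print(f"loaded {len(state_dict)} tensors from {ckpt_path}")
for name in list(state_dict)[:5]:
    print(name, tuple(state_dict[name].shape))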