SunderAli17 committed on
Commit
489a5bc
1 Parent(s): 2333fe7

Create aggregator.py

Browse files
Files changed (1) hide show
  1. module/aggregator.py +973 -0
module/aggregator.py ADDED
@@ -0,0 +1,973 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, Dict, List, Optional, Tuple, Union
3
+
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.loaders.single_file_model import FromOriginalModelMixin
10
+ from diffusers.utils import BaseOutput, logging
11
+ from diffusers.models.attention_processor import (
12
+ ADDED_KV_ATTENTION_PROCESSORS,
13
+ CROSS_ATTENTION_PROCESSORS,
14
+ AttentionProcessor,
15
+ AttnAddedKVProcessor,
16
+ AttnProcessor,
17
+ )
18
+ from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
19
+ from diffusers.models.modeling_utils import ModelMixin
20
+ from diffusers.models.unets.unet_2d_blocks import (
21
+ CrossAttnDownBlock2D,
22
+ DownBlock2D,
23
+ UNetMidBlock2D,
24
+ UNetMidBlock2DCrossAttn,
25
+ get_down_block,
26
+ )
27
+ from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
28
+
29
+
30
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
31
+
32
+
33
class ZeroConv(nn.Module):
    """Fuse a condition map into a feature map with a single 1x1 convolution.

    The condition and the features are concatenated along the channel axis
    (``dim=1``) and projected back to ``norm_nc`` channels. The projection is
    wrapped in ``zero_module`` (defined elsewhere in this file; presumably it
    zero-initialises the weights -- confirm against its definition).

    Args:
        label_nc: Number of channels of the condition tensor.
        norm_nc: Number of channels of the feature tensor (also the output width).
        mask: When truthy, the fused result is multiplied by an all-zero tensor.
    """

    def __init__(self, label_nc, norm_nc, mask=False):
        super().__init__()
        fuse = nn.Conv2d(label_nc + norm_nc, norm_nc, 1, 1, 0)
        self.zero_conv = zero_module(fuse)
        self.mask = mask

    def forward(self, hidden_states, h_ori=None):
        """Return the fused features, optionally prefixed with ``h_ori``.

        Args:
            hidden_states: A ``(condition, features)`` pair.
            h_ori: Optional tensor concatenated in front of the result on ``dim=1``.
        """
        cond, feat = hidden_states
        fused = self.zero_conv(torch.cat([cond, feat], dim=1))
        if self.mask:
            # Masked mode: same shape and dtype as the features, but all zeros.
            fused = fused * torch.zeros_like(feat)
        if h_ori is None:
            return fused
        return torch.cat([h_ori, fused], dim=1)
49
+
50
+
51
class SFT(nn.Module):
    """Modulate a feature map with a scale and shift predicted from a condition map.

    A shared 3x3 conv + SiLU embeds the condition into 128 hidden channels;
    two further 3x3 convs predict ``gamma`` and ``beta`` with ``norm_nc``
    channels each. The output is ``h * (gamma + 1) + beta``.

    Args:
        label_nc: Number of channels of the condition tensor.
        norm_nc: Number of channels of the feature tensor being modulated.
        mask: Stored on the module. Masked modulation is not supported, so
            ``forward`` raises if this (or its own ``mask`` argument) is set.
    """

    def __init__(self, label_nc, norm_nc, mask=False):
        super().__init__()

        kernel_size = 3
        padding = kernel_size // 2  # keeps the spatial size unchanged
        hidden_channels = 128

        self.mask = mask

        self.mlp_shared = nn.Sequential(
            nn.Conv2d(label_nc, hidden_channels, kernel_size=kernel_size, padding=padding),
            nn.SiLU(),
        )
        self.mul = nn.Conv2d(hidden_channels, norm_nc, kernel_size=kernel_size, padding=padding)
        self.add = nn.Conv2d(hidden_channels, norm_nc, kernel_size=kernel_size, padding=padding)

    def forward(self, hidden_states, mask=False):
        """Apply the learned affine modulation.

        Args:
            hidden_states: A ``(condition, features)`` pair.
            mask: Must resolve to ``False`` together with ``self.mask``.

        Returns:
            The modulated features, ``features * (gamma + 1) + beta``.

        Raises:
            AssertionError: If masking is requested through either ``mask`` or
                ``self.mask``.
        """
        c, h = hidden_states

        # Raised explicitly instead of via `assert`, so the check is not
        # stripped when Python runs with -O. The exception type is unchanged.
        if (mask or self.mask) is not False:
            raise AssertionError("SFT does not support masked modulation; `mask` must be False.")

        actv = self.mlp_shared(c)
        gamma = self.mul(actv)
        beta = self.add(actv)

        return h * (gamma + 1) + beta
91
+
92
+
93
@dataclass
class AggregatorOutput(BaseOutput):
    """
    The output of [`Aggregator`].
    Args:
        down_block_res_samples (`tuple[torch.Tensor]`):
            A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
            be of shape `(batch_size, channel * resolution, height // resolution, width // resolution)`. Output can be
            used to condition the original UNet's downsampling activations.
        mid_block_res_sample (`torch.Tensor`):
            The activation of the middle block (the lowest sample resolution). The tensor should be of shape
            `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
            Output can be used to condition the original UNet's middle block activation.
    """

    down_block_res_samples: Tuple[torch.Tensor]
    mid_block_res_sample: torch.Tensor
110
+
111
+
112
class ConditioningEmbedding(nn.Module):
    """
    Small convolutional encoder that maps an image-space condition to a feature map.

    The design follows the "tiny network E(.)" described in the ControlNet paper
    (https://arxiv.org/abs/2302.05543). Here it is built as an input 3x3 conv, then for every
    consecutive pair of widths in `block_out_channels` a same-width 3x3 conv followed by a
    stride-2 3x3 conv, and finally an output 3x3 conv wrapped in `zero_module` (defined
    elsewhere in this file). SiLU is applied after every conv except the last one.
    """

    def __init__(
        self,
        conditioning_embedding_channels: int,
        conditioning_channels: int = 3,
        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
    ):
        super().__init__()

        self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)

        # Two convs per transition: keep the width, then widen while halving the resolution.
        stages = []
        for width_in, width_out in zip(block_out_channels[:-1], block_out_channels[1:]):
            stages.append(nn.Conv2d(width_in, width_in, kernel_size=3, padding=1))
            stages.append(nn.Conv2d(width_in, width_out, kernel_size=3, padding=1, stride=2))
        self.blocks = nn.ModuleList(stages)

        projection = nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
        self.conv_out = zero_module(projection)

    def forward(self, conditioning):
        """Encode `conditioning` and return the projected embedding."""
        features = F.silu(self.conv_in(conditioning))

        for stage in self.blocks:
            features = F.silu(stage(features))

        return self.conv_out(features)
155
+
156
+
157
+ class Aggregator(ModelMixin, ConfigMixin, FromOriginalModelMixin):
158
+ """
159
+ Aggregator model.
160
+ Args:
161
+ in_channels (`int`, defaults to 4):
162
+ The number of channels in the input sample.
163
+ flip_sin_to_cos (`bool`, defaults to `True`):
164
+ Whether to flip the sin to cos in the time embedding.
165
+ freq_shift (`int`, defaults to 0):
166
+ The frequency shift to apply to the time embedding.
167
+ down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
168
+ The tuple of downsample blocks to use.
169
+ only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
170
+ block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
171
+ The tuple of output channels for each block.
172
+ layers_per_block (`int`, defaults to 2):
173
+ The number of layers per block.
174
+ downsample_padding (`int`, defaults to 1):
175
+ The padding to use for the downsampling convolution.
176
+ mid_block_scale_factor (`float`, defaults to 1):
177
+ The scale factor to use for the mid block.
178
+ act_fn (`str`, defaults to "silu"):
179
+ The activation function to use.
180
+ norm_num_groups (`int`, *optional*, defaults to 32):
181
+ The number of groups to use for the normalization. If None, normalization and activation layers is skipped
182
+ in post-processing.
183
+ norm_eps (`float`, defaults to 1e-5):
184
+ The epsilon to use for the normalization.
185
+ cross_attention_dim (`int`, defaults to 1280):
186
+ The dimension of the cross attention features.
187
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
188
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
189
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
190
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
191
+ encoder_hid_dim (`int`, *optional*, defaults to None):
192
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
193
+ dimension to `cross_attention_dim`.
194
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
195
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
196
+ embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
197
+ attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8):
198
+ The dimension of the attention heads.
199
+ use_linear_projection (`bool`, defaults to `False`):
200
+ class_embed_type (`str`, *optional*, defaults to `None`):
201
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
202
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
203
+ addition_embed_type (`str`, *optional*, defaults to `None`):
204
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
205
+ "text". "text" will use the `TextTimeEmbedding` layer.
206
+ num_class_embeds (`int`, *optional*, defaults to 0):
207
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
208
+ class conditioning with `class_embed_type` equal to `None`.
209
+ upcast_attention (`bool`, defaults to `False`):
210
+ resnet_time_scale_shift (`str`, defaults to `"default"`):
211
+ Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
212
+ projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`):
213
+ The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when
214
+ `class_embed_type="projection"`.
215
+ controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
216
+ The channel order of conditional image. Will convert to `rgb` if it's `bgr`.
217
+ conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
218
+ The tuple of output channel for each block in the `conditioning_embedding` layer.
219
+ global_pool_conditions (`bool`, defaults to `False`):
220
+ TODO(Patrick) - unused parameter.
221
+ addition_embed_type_num_heads (`int`, defaults to 64):
222
+ The number of heads to use for the `TextTimeEmbedding` layer.
223
+ """
224
+
225
+ _supports_gradient_checkpointing = True
226
+
227
@register_to_config
def __init__(
    self,
    in_channels: int = 4,
    conditioning_channels: int = 3,
    flip_sin_to_cos: bool = True,
    freq_shift: int = 0,
    down_block_types: Tuple[str, ...] = (
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "CrossAttnDownBlock2D",
        "DownBlock2D",
    ),
    mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
    only_cross_attention: Union[bool, Tuple[bool]] = False,
    block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
    layers_per_block: int = 2,
    downsample_padding: int = 1,
    mid_block_scale_factor: float = 1,
    act_fn: str = "silu",
    norm_num_groups: Optional[int] = 32,
    norm_eps: float = 1e-5,
    cross_attention_dim: int = 1280,
    transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
    encoder_hid_dim: Optional[int] = None,
    encoder_hid_dim_type: Optional[str] = None,
    attention_head_dim: Union[int, Tuple[int, ...]] = 8,
    num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
    use_linear_projection: bool = False,
    class_embed_type: Optional[str] = None,
    addition_embed_type: Optional[str] = None,
    addition_time_embed_dim: Optional[int] = None,
    num_class_embeds: Optional[int] = None,
    upcast_attention: bool = False,
    resnet_time_scale_shift: str = "default",
    projection_class_embeddings_input_dim: Optional[int] = None,
    controlnet_conditioning_channel_order: str = "rgb",
    conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
    global_pool_conditions: bool = False,
    addition_embed_type_num_heads: int = 64,
    pad_concat: bool = False,
):
    """
    Build the aggregator: two input convs (`conv_in`, `ref_conv_in`), the time / class / additional
    embeddings, the down blocks, the mid block, and one SFT + 1x1-conv block per produced feature map.

    See the class docstring for the meaning of the individual arguments.

    NOTE(review): `conditioning_channels`, `controlnet_conditioning_channel_order`,
    `conditioning_embedding_out_channels` and `global_pool_conditions` are not read anywhere in this
    constructor body; presumably they are only recorded in the config by `@register_to_config`.

    Raises:
        ValueError: If the per-block arguments disagree in length with `down_block_types`, if
            `encoder_hid_dim_type` is set without `encoder_hid_dim`, or if `encoder_hid_dim_type`,
            `class_embed_type="projection"` (without its input dim), `addition_embed_type` or
            `mid_block_type` has an unsupported value.
    """
    super().__init__()

    # If `num_attention_heads` is not defined (which is the case for most models)
    # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
    # The reason for this behavior is to correct for incorrectly named variables that were introduced
    # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
    # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
    # which is why we correct for the naming here.
    num_attention_heads = num_attention_heads or attention_head_dim
    self.pad_concat = pad_concat

    # Check inputs
    if len(block_out_channels) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
        )

    if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
        raise ValueError(
            f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
        )

    # A scalar applies to every down block; expand it so it can be indexed per block below.
    if isinstance(transformer_layers_per_block, int):
        transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)

    # input
    conv_in_kernel = 3
    conv_in_padding = (conv_in_kernel - 1) // 2
    self.conv_in = nn.Conv2d(
        in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
    )

    # time
    time_embed_dim = block_out_channels[0] * 4
    self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
    timestep_input_dim = block_out_channels[0]
    self.time_embedding = TimestepEmbedding(
        timestep_input_dim,
        time_embed_dim,
        act_fn=act_fn,
    )

    # Giving only `encoder_hid_dim` selects the "text_proj" projection and records that in the config.
    if encoder_hid_dim_type is None and encoder_hid_dim is not None:
        encoder_hid_dim_type = "text_proj"
        self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
        logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")

    if encoder_hid_dim is None and encoder_hid_dim_type is not None:
        raise ValueError(
            f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
        )

    if encoder_hid_dim_type == "text_proj":
        self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
    elif encoder_hid_dim_type == "text_image_proj":
        # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
        # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
        # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
        self.encoder_hid_proj = TextImageProjection(
            text_embed_dim=encoder_hid_dim,
            image_embed_dim=cross_attention_dim,
            cross_attention_dim=cross_attention_dim,
        )

    elif encoder_hid_dim_type is not None:
        raise ValueError(
            f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
        )
    else:
        self.encoder_hid_proj = None

    # class embedding
    if class_embed_type is None and num_class_embeds is not None:
        self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
    elif class_embed_type == "timestep":
        self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
    elif class_embed_type == "identity":
        self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
    elif class_embed_type == "projection":
        if projection_class_embeddings_input_dim is None:
            raise ValueError(
                "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
            )
        # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
        # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
        # 2. it projects from an arbitrary input dimension.
        #
        # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
        # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
        # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
        self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
    else:
        self.class_embedding = None

    # additional embedding; note that no `add_embedding` attribute is created when
    # `addition_embed_type` is None
    if addition_embed_type == "text":
        if encoder_hid_dim is not None:
            text_time_embedding_from_dim = encoder_hid_dim
        else:
            text_time_embedding_from_dim = cross_attention_dim

        self.add_embedding = TextTimeEmbedding(
            text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
        )
    elif addition_embed_type == "text_image":
        # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
        # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
        # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
        self.add_embedding = TextImageTimeEmbedding(
            text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
        )
    elif addition_embed_type == "text_time":
        # NOTE(review): this branch passes `addition_time_embed_dim` and
        # `projection_class_embeddings_input_dim` straight through without checking them for None.
        self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)

    elif addition_embed_type is not None:
        # NOTE(review): the message below does not list 'text_time', which is accepted above.
        raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")

    # second input conv with the same geometry as `conv_in`
    # (presumably for the reference/conditioning latent -- confirm against `forward`)
    self.ref_conv_in = nn.Conv2d(
        in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
    )

    self.down_blocks = nn.ModuleList([])
    self.controlnet_down_blocks = nn.ModuleList([])

    # Expand scalar per-model settings into one value per down block.
    if isinstance(only_cross_attention, bool):
        only_cross_attention = [only_cross_attention] * len(down_block_types)

    if isinstance(attention_head_dim, int):
        attention_head_dim = (attention_head_dim,) * len(down_block_types)

    if isinstance(num_attention_heads, int):
        num_attention_heads = (num_attention_heads,) * len(down_block_types)

    # down
    output_channel = block_out_channels[0]

    # First aggregation block, sized to `block_out_channels[0]`. Each aggregation block is an SFT
    # followed by a 1x1 conv wrapped in `zero_module` (defined elsewhere in this file).
    # controlnet_block = ZeroConv(output_channel, output_channel)
    controlnet_block = nn.Sequential(
        SFT(output_channel, output_channel),
        zero_module(nn.Conv2d(output_channel, output_channel, kernel_size=1))
    )
    self.controlnet_down_blocks.append(controlnet_block)

    for i, down_block_type in enumerate(down_block_types):
        input_channel = output_channel
        output_channel = block_out_channels[i]
        is_final_block = i == len(block_out_channels) - 1

        down_block = get_down_block(
            down_block_type,
            num_layers=layers_per_block,
            transformer_layers_per_block=transformer_layers_per_block[i],
            in_channels=input_channel,
            out_channels=output_channel,
            temb_channels=time_embed_dim,
            add_downsample=not is_final_block,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            resnet_groups=norm_num_groups,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads[i],
            attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
            downsample_padding=downsample_padding,
            use_linear_projection=use_linear_projection,
            only_cross_attention=only_cross_attention[i],
            upcast_attention=upcast_attention,
            resnet_time_scale_shift=resnet_time_scale_shift,
        )
        self.down_blocks.append(down_block)

        # One aggregation block per layer of this down block.
        for _ in range(layers_per_block):
            # controlnet_block = ZeroConv(output_channel, output_channel)
            controlnet_block = nn.Sequential(
                SFT(output_channel, output_channel),
                zero_module(nn.Conv2d(output_channel, output_channel, kernel_size=1))
            )
            self.controlnet_down_blocks.append(controlnet_block)

        # Every block except the last gets `add_downsample=True` above, and one extra aggregation block here.
        if not is_final_block:
            # controlnet_block = ZeroConv(output_channel, output_channel)
            controlnet_block = nn.Sequential(
                SFT(output_channel, output_channel),
                zero_module(nn.Conv2d(output_channel, output_channel, kernel_size=1))
            )
            self.controlnet_down_blocks.append(controlnet_block)

    # mid
    mid_block_channel = block_out_channels[-1]

    # controlnet_block = ZeroConv(mid_block_channel, mid_block_channel)
    controlnet_block = nn.Sequential(
        SFT(mid_block_channel, mid_block_channel),
        zero_module(nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1))
    )
    self.controlnet_mid_block = controlnet_block

    if mid_block_type == "UNetMidBlock2DCrossAttn":
        self.mid_block = UNetMidBlock2DCrossAttn(
            transformer_layers_per_block=transformer_layers_per_block[-1],
            in_channels=mid_block_channel,
            temb_channels=time_embed_dim,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_time_scale_shift=resnet_time_scale_shift,
            cross_attention_dim=cross_attention_dim,
            num_attention_heads=num_attention_heads[-1],
            resnet_groups=norm_num_groups,
            use_linear_projection=use_linear_projection,
            upcast_attention=upcast_attention,
        )
    elif mid_block_type == "UNetMidBlock2D":
        self.mid_block = UNetMidBlock2D(
            in_channels=block_out_channels[-1],
            temb_channels=time_embed_dim,
            num_layers=0,
            resnet_eps=norm_eps,
            resnet_act_fn=act_fn,
            output_scale_factor=mid_block_scale_factor,
            resnet_groups=norm_num_groups,
            resnet_time_scale_shift=resnet_time_scale_shift,
            add_attention=False,
        )
    else:
        # `mid_block_type=None` also ends up here: no mid-block-free configuration is supported.
        raise ValueError(f"unknown mid_block_type : {mid_block_type}")
500
+
501
@classmethod
def from_unet(
    cls,
    unet: UNet2DConditionModel,
    controlnet_conditioning_channel_order: str = "rgb",
    conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
    load_weights_from_unet: bool = True,
    conditioning_channels: int = 3,
):
    r"""
    Instantiate this model from a [`UNet2DConditionModel`].

    Parameters:
        unet (`UNet2DConditionModel`):
            The UNet whose configuration is mirrored and, optionally, whose weights are copied.
        controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
            Forwarded unchanged to the constructor.
        conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
            Forwarded unchanged to the constructor.
        load_weights_from_unet (`bool`, defaults to `True`):
            When `True`, copy the UNet's `conv_in` (into both `conv_in` and `ref_conv_in`), `time_proj`,
            `time_embedding`, `class_embedding` and `add_embedding` (when present), `down_blocks` and
            `mid_block` weights into the new model.
        conditioning_channels (`int`, defaults to 3):
            Forwarded unchanged to the constructor.

    Returns:
        The newly constructed model.
    """
    # These keys are read defensively, falling back to the constructor defaults when absent.
    transformer_layers_per_block = (
        unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
    )
    encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None
    encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None
    addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None
    addition_time_embed_dim = (
        unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
    )

    controlnet = cls(
        encoder_hid_dim=encoder_hid_dim,
        encoder_hid_dim_type=encoder_hid_dim_type,
        addition_embed_type=addition_embed_type,
        addition_time_embed_dim=addition_time_embed_dim,
        transformer_layers_per_block=transformer_layers_per_block,
        in_channels=unet.config.in_channels,
        flip_sin_to_cos=unet.config.flip_sin_to_cos,
        freq_shift=unet.config.freq_shift,
        down_block_types=unet.config.down_block_types,
        only_cross_attention=unet.config.only_cross_attention,
        block_out_channels=unet.config.block_out_channels,
        layers_per_block=unet.config.layers_per_block,
        downsample_padding=unet.config.downsample_padding,
        mid_block_scale_factor=unet.config.mid_block_scale_factor,
        act_fn=unet.config.act_fn,
        norm_num_groups=unet.config.norm_num_groups,
        norm_eps=unet.config.norm_eps,
        cross_attention_dim=unet.config.cross_attention_dim,
        attention_head_dim=unet.config.attention_head_dim,
        num_attention_heads=unet.config.num_attention_heads,
        use_linear_projection=unet.config.use_linear_projection,
        class_embed_type=unet.config.class_embed_type,
        num_class_embeds=unet.config.num_class_embeds,
        upcast_attention=unet.config.upcast_attention,
        resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
        projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
        mid_block_type=unet.config.mid_block_type,
        controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
        conditioning_embedding_out_channels=conditioning_embedding_out_channels,
        conditioning_channels=conditioning_channels,
    )

    if load_weights_from_unet:
        # Both input convs start from the UNet's `conv_in` weights.
        controlnet.conv_in.load_state_dict(unet.conv_in.state_dict())
        controlnet.ref_conv_in.load_state_dict(unet.conv_in.state_dict())
        controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
        controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())

        # Explicit None check: module truthiness is not a reliable "is set" test.
        if controlnet.class_embedding is not None:
            controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())

        # `add_embedding` only exists when an `addition_embed_type` was configured.
        if hasattr(controlnet, "add_embedding"):
            controlnet.add_embedding.load_state_dict(unet.add_embedding.state_dict())

        controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict())
        controlnet.mid_block.load_state_dict(unet.mid_block.state_dict())

    return controlnet
576
+
577
@property
# Adapted from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
def attn_processors(self) -> Dict[str, AttentionProcessor]:
    r"""
    Returns:
        `dict` of attention processors: every attention processor found in the model, keyed by the
        dotted path of the module that owns it followed by `.processor`.
    """
    found = {}

    # Explicit-stack pre-order walk; children are pushed reversed so they pop in declaration order.
    pending = list(reversed(list(self.named_children())))
    while pending:
        path, module = pending.pop()

        if hasattr(module, "get_processor"):
            found[f"{path}.processor"] = module.get_processor(return_deprecated_lora=True)

        nested = [(f"{path}.{child_name}", child) for child_name, child in module.named_children()]
        pending.extend(reversed(nested))

    return found
601
+
602
# Adapted from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
    r"""
    Sets the attention processor to use to compute attention.

    Parameters:
        processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
            Either a single processor instance that is installed on **all** layers exposing
            `set_processor`, or a dict mapping `"<module path>.processor"` to the processor for that
            layer. A dict must contain exactly one entry per attention layer; its entries are popped
            as they are consumed, so the caller's dict is emptied.

    Raises:
        ValueError: If a dict is passed whose length differs from the number of attention layers.
    """
    count = len(self.attn_processors)
    per_layer = isinstance(processor, dict)

    if per_layer and len(processor) != count:
        raise ValueError(
            f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
            f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
        )

    def assign(path, module):
        # Install on this module first, then descend into its children.
        if hasattr(module, "set_processor"):
            chosen = processor.pop(f"{path}.processor") if per_layer else processor
            module.set_processor(chosen)

        for child_name, child in module.named_children():
            assign(f"{path}.{child_name}", child)

    for top_name, top_module in self.named_children():
        assign(top_name, top_module)
633
+
634
# Adapted from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
def set_default_attn_processor(self):
    """
    Replace every attention processor with the library default.

    `AttnAddedKVProcessor` is used when all current processors belong to
    `ADDED_KV_ATTENTION_PROCESSORS`, `AttnProcessor` when they all belong to
    `CROSS_ATTENTION_PROCESSORS`.

    Raises:
        ValueError: If the current processors fit neither family.
    """
    current = list(self.attn_processors.values())
    current_classes = [proc.__class__ for proc in current]

    if all(proc_cls in ADDED_KV_ATTENTION_PROCESSORS for proc_cls in current_classes):
        default_processor = AttnAddedKVProcessor()
    elif all(proc_cls in CROSS_ATTENTION_PROCESSORS for proc_cls in current_classes):
        default_processor = AttnProcessor()
    else:
        # `current` cannot be empty here: both `all(...)` checks pass on an empty list.
        raise ValueError(
            f"Cannot call `set_default_attn_processor` when attention processors are of type {current[0]}"
        )

    self.set_attn_processor(default_processor)
649
+
650
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
651
def set_attention_slice(self, slice_size: Union[str, int, List[int]] = "auto") -> None:
    r"""
    Enable sliced attention computation.

    When this option is enabled, the attention module splits the input tensor in slices to compute attention in
    several steps. This is useful for saving some memory in exchange for a small decrease in speed.

    Args:
        slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
            When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
            `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
            provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
            must be a multiple of `slice_size`. A list supplies one value per sliceable layer, in traversal order.

    Raises:
        ValueError: If a list of the wrong length is given, or if any requested size exceeds the
            `sliceable_head_dim` of the layer it is meant for.
    """
    sliceable_head_dims = []

    def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
        # Depth-first, parent before children: this order defines how a list `slice_size` maps onto layers.
        if hasattr(module, "set_attention_slice"):
            sliceable_head_dims.append(module.sliceable_head_dim)

        for child in module.children():
            fn_recursive_retrieve_sliceable_dims(child)

    # retrieve number of attention layers
    for module in self.children():
        fn_recursive_retrieve_sliceable_dims(module)

    num_sliceable_layers = len(sliceable_head_dims)

    if slice_size == "auto":
        # half the attention head size is usually a good trade-off between
        # speed and memory
        slice_size = [dim // 2 for dim in sliceable_head_dims]
    elif slice_size == "max":
        # make smallest slice possible
        slice_size = num_sliceable_layers * [1]

    # A scalar applies uniformly to every sliceable layer.
    if not isinstance(slice_size, list):
        slice_size = num_sliceable_layers * [slice_size]

    if len(slice_size) != len(sliceable_head_dims):
        raise ValueError(
            f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
            f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
        )

    for size, dim in zip(slice_size, sliceable_head_dims):
        if size is not None and size > dim:
            raise ValueError(f"size {size} has to be smaller or equal to {dim}.")

    # Hand the sizes out in the same traversal order used to collect the head dims above.
    # Iterating (instead of popping) leaves a caller-supplied list untouched.
    pending_sizes = iter(slice_size)

    # Recursively walk through all the children.
    # Any children which exposes the set_attention_slice method
    # gets the message
    def fn_recursive_set_attention_slice(module: torch.nn.Module):
        if hasattr(module, "set_attention_slice"):
            module.set_attention_slice(next(pending_sizes))

        for child in module.children():
            fn_recursive_set_attention_slice(child)

    for module in self.children():
        fn_recursive_set_attention_slice(module)
713
+
714
def process_encoder_hidden_states(
    self, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Optional[Dict[str, Any]]
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    Apply `self.encoder_hid_proj` to the conditioning according to `self.config.encoder_hid_dim_type`.

    Args:
        encoder_hidden_states (`torch.Tensor`):
            The encoder hidden states as received by the caller.
        added_cond_kwargs (`dict`, *optional*):
            Extra conditions. Must contain `image_embeds` when `encoder_hid_dim_type` is `"text_image_proj"`,
            `"image_proj"` or `"ip_image_proj"`. `None` is treated as an empty dict.

    Returns:
        - the input unchanged when `self.encoder_hid_proj` is `None` or the type is not one handled here;
        - `encoder_hid_proj(encoder_hidden_states)` for `"text_proj"`;
        - `encoder_hid_proj(encoder_hidden_states, image_embeds)` for `"text_image_proj"`;
        - `encoder_hid_proj(image_embeds)` for `"image_proj"`;
        - the tuple `(encoder_hidden_states, encoder_hid_proj(image_embeds))` for `"ip_image_proj"`.

    Raises:
        ValueError: If an image-conditioned type is configured but `image_embeds` is missing.
    """
    projection = self.encoder_hid_proj
    if projection is None:
        return encoder_hidden_states

    proj_type = self.config.encoder_hid_dim_type
    if proj_type == "text_proj":
        return projection(encoder_hidden_states)

    if proj_type not in ("text_image_proj", "image_proj", "ip_image_proj"):
        return encoder_hidden_states

    # Every remaining type needs `image_embeds`. Treat a missing dict like an empty one so the caller
    # gets the explanatory ValueError below instead of a TypeError from `in None`.
    conditions = added_cond_kwargs if added_cond_kwargs is not None else {}
    if "image_embeds" not in conditions:
        raise ValueError(
            f"{self.__class__} has the config param `encoder_hid_dim_type` set to '{proj_type}' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
        )
    image_embeds = conditions["image_embeds"]

    if proj_type == "text_image_proj":
        # Kandinsky 2.1 - style
        return projection(encoder_hidden_states, image_embeds)
    if proj_type == "image_proj":
        # Kandinsky 2.2 - style
        return projection(image_embeds)

    # "ip_image_proj": keep the text states and pair them with the projected image embeddings.
    return (encoder_hidden_states, projection(image_embeds))
745
+
746
def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
    """
    Set `module.gradient_checkpointing` to `value` for down blocks; leave any other module untouched.

    Only instances of `CrossAttnDownBlock2D` or `DownBlock2D` are affected.
    """
    checkpointable_blocks = (CrossAttnDownBlock2D, DownBlock2D)
    if not isinstance(module, checkpointable_blocks):
        return
    module.gradient_checkpointing = value
749
+
750
def forward(
    self,
    sample: torch.FloatTensor,
    timestep: Union[torch.Tensor, float, int],
    encoder_hidden_states: torch.Tensor,
    controlnet_cond: torch.FloatTensor,
    cat_dim: int = -2,
    conditioning_scale: float = 1.0,
    class_labels: Optional[torch.Tensor] = None,
    timestep_cond: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    return_dict: bool = True,
) -> Union[AggregatorOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]:
    """
    The [`Aggregator`] forward method.

    `sample` and `controlnet_cond` are projected by `self.conv_in` and `self.ref_conv_in`, concatenated along the
    spatial dimension `cat_dim` (optionally separated by a one-pixel zero strip when `self.pad_concat` is set),
    run through the down blocks and the mid block, then split back into their two halves, which are handed as a
    pair to the `controlnet_down_blocks` / `controlnet_mid_block` layers.

    Args:
        sample (`torch.FloatTensor`):
            The noisy input tensor.
        timestep (`Union[torch.Tensor, float, int]`):
            The number of timesteps to denoise an input.
        encoder_hidden_states (`torch.Tensor`):
            The encoder hidden states.
        controlnet_cond (`torch.FloatTensor`):
            The conditional (reference) input tensor, fed to `self.ref_conv_in`. Its projection is concatenated
            with the projection of `sample`, so both must agree on every dimension except `cat_dim`.
        cat_dim (`int`, defaults to `-2`):
            Spatial dimension along which the two latents are concatenated: `-2`/`2` (height) or `-1`/`3`
            (width).
        conditioning_scale (`float`, defaults to `1.0`):
            The scale factor applied to all outputs.
        class_labels (`torch.Tensor`, *optional*, defaults to `None`):
            Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
        timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
            Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
            timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
            embeddings.
        attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
            An attention mask of shape `(batch, key_tokens)`. It is converted into an additive bias here, but
            within this method the result is not passed on to any block.
        added_cond_kwargs (`dict`, *optional*, defaults to `None`):
            Additional conditions. Must contain `text_embeds` and `time_ids` when
            `config.addition_embed_type == "text_time"`. `None` is treated as an empty dict.
        cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
            A kwargs dictionary that if specified is passed along to the down blocks and the mid block.
        return_dict (`bool`, defaults to `True`):
            Whether or not to return an [`AggregatorOutput`] instead of a plain tuple.

    Returns:
        [`AggregatorOutput`] **or** `tuple`:
            If `return_dict` is `True`, an [`AggregatorOutput`] is returned, otherwise a tuple
            `(down_block_res_samples, mid_block_res_sample)`.

    Raises:
        ValueError: If the configured channel order is not `"rgb"`, if `cat_dim` is not a spatial dimension, if
            `class_labels` is required but missing, or if a required entry of `added_cond_kwargs` is missing.
    """
    # check channel order (only the default "rgb" order is supported)
    channel_order = self.config.controlnet_conditioning_channel_order
    if channel_order != "rgb":
        raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")

    # Validate `cat_dim` once, whatever `pad_concat` is: every split below relies on it, and an unsupported
    # value would otherwise fall through the branches and silently reuse stale halves.
    if cat_dim in (-2, 2):
        concat_along_height = True
    elif cat_dim in (-1, 3):
        concat_along_height = False
    else:
        raise ValueError(f"Aggregator shall concat along spatial dimension, but is asked to concat dim: {cat_dim}.")

    def split_halves(stacked):
        """Return the leading and trailing halves of `stacked` along `cat_dim` (a middle pad row/column is dropped)."""
        _, _, stacked_height, stacked_width = stacked.shape
        if concat_along_height:
            half = stacked_height // 2
            return stacked[:, :, :half, :], stacked[:, :, -half:, :]
        half = stacked_width // 2
        return stacked[:, :, :, :half], stacked[:, :, :, -half:]

    def join_with_pad(cond_part, ref_part):
        """Concatenate both parts along `cat_dim` with a one-pixel zero strip in between."""
        n, c, h, w = cond_part.shape
        pad_shape = (n, c, 1, w) if concat_along_height else (n, c, h, 1)
        # new_zeros allocates directly with cond_part's dtype and device.
        return torch.cat([cond_part, cond_part.new_zeros(pad_shape), ref_part], dim=cat_dim)

    cond_kwargs = added_cond_kwargs if added_cond_kwargs is not None else {}

    # prepare attention_mask
    # NOTE(review): the converted mask is not consumed anywhere below — confirm whether that is intended.
    if attention_mask is not None:
        attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
        attention_mask = attention_mask.unsqueeze(1)

    # 1. time
    timesteps = timestep
    if not torch.is_tensor(timesteps):
        # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
        # This would be a good case for the `match` statement (Python 3.10+)
        is_mps = sample.device.type == "mps"
        if isinstance(timestep, float):
            dtype = torch.float32 if is_mps else torch.float64
        else:
            dtype = torch.int32 if is_mps else torch.int64
        timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
    elif len(timesteps.shape) == 0:
        timesteps = timesteps[None].to(sample.device)

    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
    timesteps = timesteps.expand(sample.shape[0])

    t_emb = self.time_proj(timesteps)

    # timesteps does not contain any weights and will always return f32 tensors
    # but time_embedding might actually be running in fp16. so we need to cast here.
    # there might be better ways to encapsulate this.
    t_emb = t_emb.to(dtype=sample.dtype)

    emb = self.time_embedding(t_emb, timestep_cond)
    aug_emb = None

    if self.class_embedding is not None:
        if class_labels is None:
            raise ValueError("class_labels should be provided when num_class_embeds > 0")

        if self.config.class_embed_type == "timestep":
            class_labels = self.time_proj(class_labels)

        class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
        emb = emb + class_emb

    if self.config.addition_embed_type is not None:
        if self.config.addition_embed_type == "text":
            aug_emb = self.add_embedding(encoder_hidden_states)

        elif self.config.addition_embed_type == "text_time":
            if "text_embeds" not in cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
                )
            text_embeds = cond_kwargs["text_embeds"]
            if "time_ids" not in cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
                )
            time_ids = cond_kwargs["time_ids"]
            time_embeds = self.add_time_proj(time_ids.flatten())
            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))

            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
            add_embeds = add_embeds.to(emb.dtype)
            aug_emb = self.add_embedding(add_embeds)

    if aug_emb is not None:
        emb = emb + aug_emb

    encoder_hidden_states = self.process_encoder_hidden_states(
        encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=cond_kwargs
    )

    # 2. prepare input
    cond_latent = self.conv_in(sample)
    ref_latent = self.ref_conv_in(controlnet_cond)
    if self.pad_concat:
        sample = join_with_pad(cond_latent, ref_latent)
    else:
        sample = torch.cat([cond_latent, ref_latent], dim=cat_dim)

    # 3. down
    down_block_res_samples = (sample,)
    for downsample_block in self.down_blocks:
        sample, res_samples = downsample_block(
            hidden_states=sample,
            temb=emb,
            cross_attention_kwargs=cross_attention_kwargs,
        )

        # rebuild sample: split and concat, so the zero strip is re-inserted after each block;
        # the rebuilt tensor also replaces the block's last residual.
        if self.pad_concat:
            sample = join_with_pad(*split_halves(sample))
            res_samples = res_samples[:-1] + (sample,)

        down_block_res_samples += res_samples

    # 4. mid
    if self.mid_block is not None:
        sample = self.mid_block(
            sample,
            emb,
            cross_attention_kwargs=cross_attention_kwargs,
        )

    # 5. split samples and SFT.
    controlnet_down_block_res_samples = ()
    for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
        cond_latent, ref_latent = split_halves(down_block_res_sample)
        down_block_res_sample = controlnet_block((cond_latent, ref_latent))
        controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)

    down_block_res_samples = controlnet_down_block_res_samples

    cond_latent, ref_latent = split_halves(sample)
    mid_block_res_sample = self.controlnet_mid_block((cond_latent, ref_latent))

    # 6. scaling
    down_block_res_samples = [res_sample * conditioning_scale for res_sample in down_block_res_samples]
    mid_block_res_sample = mid_block_res_sample * conditioning_scale

    if self.config.global_pool_conditions:
        down_block_res_samples = [
            torch.mean(res_sample, dim=(2, 3), keepdim=True) for res_sample in down_block_res_samples
        ]
        mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)

    if not return_dict:
        return (down_block_res_samples, mid_block_res_sample)

    return AggregatorOutput(
        down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
    )
968
+
969
+
970
def zero_module(module):
    """Fill every parameter of `module` with zeros in place and return the same module."""
    weights = module.parameters()
    for weight in weights:
        nn.init.zeros_(weight)
    return module