Spaces:

HikariDawn
/

This-and-That

Running on Zero

App Files Files Community

This-and-That / svd /temporal_controlnet.py

HikariDawn777

feat: initial push

59b2a81 about 1 month ago

raw

history blame contribute delete

30 kB

	# Copyright 2023 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	'''
	This is a ControlNet for the spatio-temporal UNet (SVD).
	'''
	from dataclasses import dataclass
	from typing import Any, Dict, List, Optional, Tuple, Union
	import os, sys
	import random
	import torch
	from torch import nn
	from torch.nn import functional as F

	from diffusers import AutoencoderKLTemporalDecoder
	from diffusers.configuration_utils import ConfigMixin, register_to_config
	from diffusers.loaders import FromOriginalControlnetMixin
	from diffusers.utils import BaseOutput, logging
	from diffusers.models.attention_processor import (
	ADDED_KV_ATTENTION_PROCESSORS,
	CROSS_ATTENTION_PROCESSORS,
	AttentionProcessor,
	AttnAddedKVProcessor,
	AttnProcessor,
	)
	from diffusers.models.embeddings import TimestepEmbedding, Timesteps
	from diffusers.models.modeling_utils import ModelMixin

	# Import files from the local folder
	root_path = os.path.abspath('.')
	sys.path.append(root_path)
	from svd.unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
	from svd.diffusion_arch.unet_3d_blocks import UNetMidBlockSpatioTemporal, get_down_block

	logger = logging.get_logger(__name__) # pylint: disable=invalid-name


	def zero_module(module):
	    """Fill every parameter of ``module`` with zeros in place and return ``module``.

	    Used to build the "zero convolutions" of the ControlNet.
	    """
	    parameters = module.parameters()
	    for tensor in parameters:
	        nn.init.zeros_(tensor)
	    return module


	@dataclass
	class ControlNetOutput(BaseOutput):
	    """
	    The output of [`ControlNetModel`].

	    Args:
	        down_block_res_samples (`tuple[torch.Tensor]`):
	            The downsample activations, one per zero convolution in
	            `ControlNetModel.controlnet_down_blocks`, after scaling. Output can be used to condition the
	            original UNet's downsampling activations. Note that `ControlNetModel.forward` currently stores
	            a `list` here rather than a tuple.
	        mid_block_res_sample (`torch.Tensor`):
	            The activation of the middle block (the lowest sample resolution) after the mid-block zero
	            convolution and scaling. Output can be used to condition the original UNet's middle block
	            activation.
	    """

	    down_block_res_samples: Tuple[torch.Tensor]
	    mid_block_res_sample: torch.Tensor



	class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
	    """
	    A ControlNet for the spatio-temporal (SVD) UNet.

	    The model owns its own down blocks and mid block (built with `get_down_block` and
	    `UNetMidBlockSpatioTemporal`) and projects every intermediate activation through a
	    zero-initialised 1x1 convolution. The noisy sample and the conditioning tensor are concatenated
	    channel-wise and merged by a zero-initialised 3x3 convolution (`conv_in_concat`) before the first
	    down block.

	    Args:
	        in_channels (`int`, defaults to 8):
	            Recorded in the config only. `conv_in_concat` is built with a fixed 12 input channels
	            (8 sample + 4 condition channels) and does not read this value.
	        conditioning_channels (`int`, defaults to 3):
	            Recorded in the config only; no layer is built from it.
	        flip_sin_to_cos (`bool`, defaults to `True`):
	            Whether to flip the sin to cos in the time embedding (`time_proj`).
	        freq_shift (`int`, defaults to 0):
	            The frequency shift to apply to the time embedding (`time_proj`).
	        down_block_types (`tuple[str]`, defaults to three `"CrossAttnDownBlockSpatioTemporal"` followed by
	            one `"DownBlockSpatioTemporal"`):
	            The tuple of downsample blocks to use.
	        mid_block_type (`str`, defaults to `"UNetMidBlockSpatioTemporal"`):
	            The mid block to use. Any other value raises `ValueError`.
	        block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
	            The tuple of output channels for each block.
	        addition_time_embed_dim (`int`, defaults to 256):
	            Number of channels `add_time_proj` produces for each entry of `added_time_ids`.
	        layers_per_block (`int` or sequence of `int`, defaults to 2):
	            The number of layers per block. A sequence must have one entry per down block.
	        act_fn (`str`, defaults to `"silu"`):
	            The activation function of `time_embedding`. The down blocks are always built with
	            `resnet_act_fn="silu"`.
	        cross_attention_dim (`int` or sequence of `int`, defaults to 1024):
	            The dimension of the cross attention features. A sequence must have one entry per down block.
	        projection_class_embeddings_input_dim (`int`, defaults to 768):
	            Input dimension of `add_embedding`, i.e. the size of the flattened additional time embeddings
	            per batch element.
	        conditioning_embedding_out_channels (`tuple[int]`, optional, defaults to `(16, 32, 96, 256)`):
	            Recorded in the config only; no layer is built from it.
	        transformer_layers_per_block (`int` or sequence of `int`, defaults to 1):
	            The number of transformer blocks passed to each down block and to the mid block.
	        num_attention_heads (`int` or `tuple[int]`, defaults to `(5, 10, 20, 20)`):
	            The number of attention heads per down block (SVD default setting).
	        encoder_hid_dim (`int`, optional, defaults to `None`):
	            Only validated: it must be set whenever `encoder_hid_dim_type` is set.
	        encoder_hid_dim_type (`str`, optional, defaults to `None`):
	            Only validated together with `encoder_hid_dim`.
	        controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
	            Stored on the instance. `forward` does not currently apply any channel reordering.
	    """

	    _supports_gradient_checkpointing = True

	    @register_to_config
	    def __init__(
	        self,
	        in_channels: int = 8,
	        conditioning_channels: int = 3,
	        flip_sin_to_cos: bool = True,
	        freq_shift: int = 0,
	        down_block_types: Tuple[str, ...] = (
	            "CrossAttnDownBlockSpatioTemporal",
	            "CrossAttnDownBlockSpatioTemporal",
	            "CrossAttnDownBlockSpatioTemporal",
	            "DownBlockSpatioTemporal",
	        ),
	        mid_block_type: Optional[str] = "UNetMidBlockSpatioTemporal",
	        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
	        addition_time_embed_dim: int = 256,
	        layers_per_block: int = 2,
	        act_fn: str = "silu",
	        cross_attention_dim: int = 1024,
	        projection_class_embeddings_input_dim: Optional[int] = 768,
	        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
	        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
	        num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),  # SVD config setting for the default case
	        encoder_hid_dim: Optional[int] = None,
	        encoder_hid_dim_type: Optional[str] = None,
	        controlnet_conditioning_channel_order: str = 'rgb',
	    ):
	        super().__init__()

	        self.controlnet_conditioning_channel_order = controlnet_conditioning_channel_order

	        ########################## Check inputs ##########################
	        if len(block_out_channels) != len(down_block_types):
	            raise ValueError(
	                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
	            )

	        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
	            raise ValueError(
	                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
	            )

	        # Tuples are length-checked as well as lists, matching the other per-block arguments.
	        if isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) != len(down_block_types):
	            raise ValueError(
	                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
	            )

	        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
	            raise ValueError(
	                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
	            )

	        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
	            raise ValueError(
	                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
	            )

	        ########################## First convolution for sample (noise) + condition ##########################
	        conv_in_kernel = 3
	        conv_in_padding = (conv_in_kernel - 1) // 2
	        # Input is hard-coded to 12 channels (8 + 4) right now; `in_channels` is not consulted.
	        self.conv_in_concat = zero_module(
	            nn.Conv2d(12, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding)
	        )

	        ########################## Time embedding and so on ##########################
	        time_embed_dim = block_out_channels[0] * 4

	        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)  # default flip_sin_to_cos is True
	        timestep_input_dim = block_out_channels[0]
	        self.time_embedding = TimestepEmbedding(
	            timestep_input_dim,
	            time_embed_dim,
	            act_fn=act_fn,
	        )

	        # Additional time embedding for hyperparameters like fps, motion_bucket_id, noise_aug_strength
	        self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
	        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)

	        ############################# Down and Mid Blocks Init #############################
	        self.down_blocks = nn.ModuleList([])
	        output_channel = block_out_channels[0]

	        # Broadcast scalar settings so that every down block has its own entry
	        num_down_blocks = len(down_block_types)
	        if isinstance(num_attention_heads, int):
	            num_attention_heads = (num_attention_heads,) * num_down_blocks

	        if isinstance(cross_attention_dim, int):
	            cross_attention_dim = (cross_attention_dim,) * num_down_blocks

	        if isinstance(layers_per_block, int):
	            layers_per_block = [layers_per_block] * num_down_blocks

	        if isinstance(transformer_layers_per_block, int):
	            transformer_layers_per_block = [transformer_layers_per_block] * num_down_blocks

	        blocks_time_embed_dim = time_embed_dim

	        # ControlNet module: zero convolution for the output of `conv_in_concat`
	        self.controlnet_down_blocks = nn.ModuleList([])
	        self.controlnet_down_blocks.append(self._zero_conv(output_channel))

	        # Down block init one by one
	        for i, down_block_type in enumerate(down_block_types):
	            input_channel = output_channel
	            output_channel = block_out_channels[i]
	            is_final_block = i == len(block_out_channels) - 1

	            down_block = get_down_block(
	                down_block_type,
	                num_layers=layers_per_block[i],
	                transformer_layers_per_block=transformer_layers_per_block[i],
	                in_channels=input_channel,
	                out_channels=output_channel,
	                temb_channels=blocks_time_embed_dim,
	                add_downsample=not is_final_block,
	                resnet_eps=1e-5,
	                cross_attention_dim=cross_attention_dim[i],
	                num_attention_heads=num_attention_heads[i],
	                resnet_act_fn="silu",
	            )
	            self.down_blocks.append(down_block)

	            # ControlNet module: one zero convolution per layer of THIS block. The previous code used
	            # `layers_per_block[0]` for every block, which miscounts when the per-block values differ
	            # (`forward` zips residuals with these convs and would silently drop the surplus).
	            # Presumably each layer of the down block yields one residual -- confirm in `get_down_block`.
	            for _ in range(layers_per_block[i]):
	                self.controlnet_down_blocks.append(self._zero_conv(output_channel))

	            # One more zero convolution for the downsampler output (every block except the last)
	            if not is_final_block:
	                self.controlnet_down_blocks.append(self._zero_conv(output_channel))

	        # Mid block
	        mid_block_channel = block_out_channels[-1]

	        # ControlNet module: zero convolution for the mid block output
	        self.controlnet_mid_block = self._zero_conv(mid_block_channel)

	        if mid_block_type == "UNetMidBlockSpatioTemporal":
	            self.mid_block = UNetMidBlockSpatioTemporal(
	                block_out_channels[-1],
	                temb_channels=blocks_time_embed_dim,
	                transformer_layers_per_block=transformer_layers_per_block[-1],
	                cross_attention_dim=cross_attention_dim[-1],
	                num_attention_heads=num_attention_heads[-1],
	            )
	        else:
	            raise ValueError(f"unknown mid_block_type : {mid_block_type}")

	    @staticmethod
	    def _zero_conv(channels: int) -> nn.Conv2d:
	        """Build a 1x1 convolution with `channels` in/out channels whose parameters are all zero."""
	        return zero_module(nn.Conv2d(channels, channels, kernel_size=1))

	    @classmethod
	    def from_unet(
	        cls,
	        unet: UNetSpatioTemporalConditionModel,
	        conditioning_channels: int = 3,
	        load_weights_from_unet: bool = True,
	    ):
	        r"""
	        Instantiate a [`ControlNetModel`] from [`UNetSpatioTemporalConditionModel`].

	        Parameters:
	            unet (`UNetSpatioTemporalConditionModel`):
	                The UNet model whose weights are copied to the [`ControlNetModel`].
	            conditioning_channels (`int`, defaults to 3):
	                Forwarded to the constructor.
	            load_weights_from_unet (bool):
	                Whether to use the unet as a trainable copy (should be True by default). When True, the
	                weights of `time_proj`, `time_embedding`, `down_blocks` and `mid_block` are copied.

	        Returns:
	            The newly created [`ControlNetModel`].
	        """
	        # NOTE(review): the controlnet is built with this class's default arguments; the UNet's config is
	        # not copied. `load_state_dict` below will raise if the UNet's architecture differs from them.
	        controlnet = cls(conditioning_channels=conditioning_channels)

	        if load_weights_from_unet:
	            # `unet.conv_in` is deliberately not loaded: it is replaced by the zero conv `conv_in_concat`.
	            # `add_time_proj` / `add_embedding` are not copied either and keep their fresh initialisation.
	            controlnet.time_proj.load_state_dict(unet.time_proj.state_dict())
	            controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict())

	            controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict())
	            controlnet.mid_block.load_state_dict(unet.mid_block.state_dict())

	        return controlnet

	    @property
	    def attn_processors(self) -> Dict[str, AttentionProcessor]:
	        r"""
	        Returns:
	            `dict` of attention processors: A dictionary containing all attention processors used in the
	            model, keyed by `"<module path>.processor"`.
	        """
	        processors = {}

	        def fn_recursive_add_processors(
	            name: str,
	            module: torch.nn.Module,
	            processors: Dict[str, AttentionProcessor],
	        ):
	            # Any module exposing `get_processor` contributes one entry
	            if hasattr(module, "get_processor"):
	                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)

	            for sub_name, child in module.named_children():
	                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

	            return processors

	        for name, module in self.named_children():
	            fn_recursive_add_processors(name, module, processors)

	        return processors

	    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
	        r"""
	        Sets the attention processor to use to compute attention.

	        Parameters:
	            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
	                The instantiated processor class or a dictionary of processor classes that will be set as
	                the processor for all `Attention` layers.

	                If `processor` is a dict, the key needs to define the path to the corresponding cross
	                attention processor (the same keys `attn_processors` returns). Entries are popped from
	                the dict as they are assigned, so the passed dict is consumed.

	        Raises:
	            ValueError: If a dict is passed whose size differs from the number of attention layers.
	        """
	        count = len(self.attn_processors.keys())

	        if isinstance(processor, dict) and len(processor) != count:
	            raise ValueError(
	                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
	                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
	            )

	        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
	            if hasattr(module, "set_processor"):
	                if not isinstance(processor, dict):
	                    module.set_processor(processor)
	                else:
	                    module.set_processor(processor.pop(f"{name}.processor"))

	            for sub_name, child in module.named_children():
	                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

	        for name, module in self.named_children():
	            fn_recursive_attn_processor(name, module, processor)

	    def set_default_attn_processor(self):
	        """
	        Disables custom attention processors and sets the default attention implementation.

	        Raises:
	            ValueError: If any current processor is not one of `CROSS_ATTENTION_PROCESSORS`.
	        """
	        if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
	            processor = AttnProcessor()
	        else:
	            raise ValueError(
	                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
	            )

	        self.set_attn_processor(processor)

	    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
	        """Set `module.gradient_checkpointing` to `value` when the module has that attribute."""
	        if hasattr(module, "gradient_checkpointing"):
	            module.gradient_checkpointing = value

	    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
	        """
	        Sets the attention processor to use [feed forward
	        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

	        Parameters:
	            chunk_size (`int`, optional):
	                The chunk size of the feed-forward layers. If not specified (or 0), a chunk size of 1 is
	                used, i.e. the feed-forward layer runs individually over each tensor of dim=`dim`.
	            dim (`int`, optional, defaults to `0`):
	                The dimension over which the feed-forward computation should be chunked. Choose between
	                dim=0 (batch) or dim=1 (sequence length).

	        Raises:
	            ValueError: If `dim` is neither 0 nor 1.
	        """
	        if dim not in [0, 1]:
	            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")

	        # By default chunk size is 1
	        chunk_size = chunk_size or 1

	        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
	            if hasattr(module, "set_chunk_feed_forward"):
	                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)

	            for child in module.children():
	                fn_recursive_feed_forward(child, chunk_size, dim)

	        for module in self.children():
	            fn_recursive_feed_forward(module, chunk_size, dim)

	    def forward(
	        self,
	        sample: torch.FloatTensor,
	        timestep: Union[torch.Tensor, float, int],
	        encoder_hidden_states: torch.Tensor,
	        added_time_ids: torch.Tensor,
	        added_positions: torch.Tensor = None,
	        controlnet_cond: torch.FloatTensor = None,
	        conditioning_scale: float = 1.0,
	        inner_conditioning_scale: float = 1.0,
	        timestep_cond: Optional[torch.Tensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        guess_mode: bool = False,
	        return_dict: bool = True,
	    ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]:
	        """
	        The [`ControlNetModel`] forward method.

	        Args:
	            sample (`torch.FloatTensor`):
	                The noisy input tensor. Its first two dimensions are read as `(batch, frames)` and are
	                flattened into one before the convolution.
	            timestep (`Union[torch.Tensor, float, int]`):
	                The number of timesteps to denoise an input.
	            encoder_hidden_states (`torch.Tensor`):
	                The encoder hidden states. Repeated `frames` times along dim 0.
	            added_time_ids (`torch.Tensor`):
	                Additional conditioning ids (motion score, fps, augmentation strength). They are
	                flattened, embedded with `add_time_proj`, reshaped to `(batch, -1)` and projected with
	                `add_embedding`.
	            added_positions (`torch.Tensor`, optional):
	                Accepted but not used by this implementation.
	            controlnet_cond (`torch.FloatTensor`):
	                The conditional input tensor, already encoded by the VAE. Required despite the `None`
	                default. It is concatenated with the flattened `sample` along dim 1, so it must match
	                `(batch * frames, C, H, W)` in every dimension but the channels; the two together must
	                provide the 12 channels `conv_in_concat` expects.
	            conditioning_scale (`float`, defaults to `1.0`):
	                The scale factor for ControlNet outputs.
	            inner_conditioning_scale (`float`, defaults to `1.0`):
	                Accepted but not used by this implementation.
	            timestep_cond (`torch.Tensor`, optional, defaults to `None`):
	                Accepted but not used by this implementation.
	            attention_mask (`torch.Tensor`, optional, defaults to `None`):
	                Accepted but not used by this implementation (it is not forwarded to any block).
	            guess_mode (`bool`, defaults to `False`):
	                If set, the outputs are scaled by log-spaced factors from 0.1 to 1.0 (multiplied by
	                `conditioning_scale`) instead of by `conditioning_scale` alone.
	            return_dict (`bool`, defaults to `True`):
	                Whether or not to return a [`ControlNetOutput`] instead of a plain tuple.

	        Returns:
	            [`ControlNetOutput`] or `tuple`:
	                If `return_dict` is `True`, a [`ControlNetOutput`] is returned, otherwise a tuple
	                `(down_block_res_samples, mid_block_res_sample)` where the first element is a list of
	                tensors.

	        Raises:
	            ValueError: If `controlnet_cond` is `None`.
	        """
	        # Fail early with a clear message instead of an opaque error inside `torch.cat` below
	        if controlnet_cond is None:
	            raise ValueError(
	                "`controlnet_cond` must be provided: it is concatenated with `sample` before `conv_in_concat`."
	            )

	        # NOTE: `self.controlnet_conditioning_channel_order` is intentionally not applied here; the
	        # rgb/bgr flip of the 2D ControlNet was disabled in this model.

	        # 1. time
	        timesteps = timestep
	        if not torch.is_tensor(timesteps):
	            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
	            is_mps = sample.device.type == "mps"
	            if isinstance(timestep, float):
	                dtype = torch.float32 if is_mps else torch.float64
	            else:
	                dtype = torch.int32 if is_mps else torch.int64
	            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
	        elif len(timesteps.shape) == 0:
	            timesteps = timesteps[None].to(sample.device)

	        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
	        batch_size, num_frames = sample.shape[:2]  # classifier-free guidance copies are part of the batch
	        timesteps = timesteps.expand(batch_size)

	        t_emb = self.time_proj(timesteps)

	        # `time_proj` does not contain any weights and will always return f32 tensors
	        # but time_embedding might actually be running in fp16. so we need to cast here.
	        t_emb = t_emb.to(dtype=sample.dtype)

	        emb = self.time_embedding(t_emb)  # `timestep_cond` is not used because usually this is None

	        # motion score + fps + aug strength embeds
	        time_embeds = self.add_time_proj(added_time_ids.flatten())
	        time_embeds = time_embeds.reshape((batch_size, -1))
	        time_embeds = time_embeds.to(emb.dtype)
	        aug_emb = self.add_embedding(time_embeds)

	        emb = emb + aug_emb

	        # Merge the batch and frame dimensions of the sample
	        sample = sample.flatten(0, 1)
	        # Repeat the embeddings num_video_frames times
	        # emb: [batch, channels] -> [batch * frames, channels]
	        emb = emb.repeat_interleave(num_frames, dim=0)
	        # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
	        encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)

	        # 2. Pre-Process
	        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)

	        # Feature: concat the sample and controlnet_cond at dim 1 (channel-wise), then merge them
	        sample = torch.cat([sample, controlnet_cond], dim=1)
	        sample = self.conv_in_concat(sample)

	        # 3. Down blocks
	        down_block_res_samples = (sample,)
	        for downsample_block in self.down_blocks:
	            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
	                sample, res_samples = downsample_block(
	                    hidden_states=sample,  # Vae encode + noise
	                    temb=emb,
	                    encoder_hidden_states=encoder_hidden_states,  # Clip encode
	                    image_only_indicator=image_only_indicator,
	                )
	            else:
	                sample, res_samples = downsample_block(
	                    hidden_states=sample,
	                    temb=emb,
	                    image_only_indicator=image_only_indicator,
	                )

	            down_block_res_samples += res_samples

	        # 4. Mid block
	        sample = self.mid_block(
	            hidden_states=sample,
	            temb=emb,
	            encoder_hidden_states=encoder_hidden_states,
	            image_only_indicator=image_only_indicator,
	        )

	        # 5. ControlNet blocks: pass every residual through its zero convolution
	        controlnet_down_block_res_samples = ()
	        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
	            down_block_res_sample = controlnet_block(down_block_res_sample)
	            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)

	        down_block_res_samples = controlnet_down_block_res_samples
	        mid_block_res_sample = self.controlnet_mid_block(sample)

	        # 6. Scaling
	        if guess_mode:
	            scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device)  # 0.1 to 1.0
	            scales = scales * conditioning_scale
	            down_block_res_samples = [res * scale for res, scale in zip(down_block_res_samples, scales)]
	            mid_block_res_sample = mid_block_res_sample * scales[-1]  # last one
	        else:
	            down_block_res_samples = [res * conditioning_scale for res in down_block_res_samples]
	            mid_block_res_sample = mid_block_res_sample * conditioning_scale

	        if not return_dict:
	            return (down_block_res_samples, mid_block_res_sample)

	        return ControlNetOutput(
	            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
	        )