from typing import Callable, Iterable, Union

import torch
from einops import rearrange, repeat

from sgm.modules.diffusionmodules.model import (
    XFORMERS_IS_AVAILABLE,
    AttnBlock,
    Decoder,
    MemoryEfficientAttnBlock,
    ResnetBlock,
)
from sgm.modules.diffusionmodules.openaimodel import ResBlock, timestep_embedding
from sgm.modules.video_attention import VideoTransformerBlock
from sgm.util import partialclass


class VideoResBlock(ResnetBlock):
    def __init__(
        self,
        out_channels,
        *args,
        dropout=0.0,
        video_kernel_size=3,
        alpha=0.0,
        merge_strategy="learned",
        **kwargs,
    ):
        super().__init__(out_channels=out_channels, dropout=dropout, *args, **kwargs)

        if video_kernel_size is None:
            video_kernel_size = [3, 1, 1]
        self.time_stack = ResBlock(
            channels=out_channels,
            emb_channels=0,
            dropout=dropout,
            dims=3,
            use_scale_shift_norm=False,
            use_conv=False,
            up=False,
            down=False,
            kernel_size=video_kernel_size,
            use_checkpoint=False,
            skip_t_emb=True,
        )

        self.merge_strategy = merge_strategy
        if self.merge_strategy == "fixed":
            self.register_buffer("mix_factor", torch.Tensor([alpha]))
        elif self.merge_strategy == "learned":
            self.register_parameter(
                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
            )
        else:
            raise ValueError(f"unknown merge strategy {self.merge_strategy}")

    def get_alpha(self, bs):
        if self.merge_strategy == "fixed":
            return self.mix_factor
        elif self.merge_strategy == "learned":
            return torch.sigmoid(self.mix_factor)
        else:
            raise NotImplementedError()

    def forward(self, x, temb, skip_video=False, timesteps=None):
        if timesteps is None:
            timesteps = self.timesteps

        b, c, h, w = x.shape

        x = super().forward(x, temb)

        if not skip_video:
            x_mix = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)

            x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)

            x = self.time_stack(x, temb)

            alpha = self.get_alpha(bs=b // timesteps)
            x = alpha * x + (1.0 - alpha) * x_mix

            x = rearrange(x, "b c t h w -> (b t) c h w")
        return x


class AE3DConv(torch.nn.Conv2d):
    def __init__(self, in_channels, out_channels, video_kernel_size=3, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, **kwargs)
        if isinstance(video_kernel_size, Iterable):
            padding = [int(k // 2) for k in video_kernel_size]
        else:
            padding = int(video_kernel_size // 2)

        self.time_mix_conv = torch.nn.Conv3d(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=video_kernel_size,
            padding=padding,
        )

    def forward(self, input, timesteps, skip_video=False):
        x = super().forward(input)
        if skip_video:
            return x
        x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
        x = self.time_mix_conv(x)
        return rearrange(x, "b c t h w -> (b t) c h w")
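
# Illustrative note (not part of the original module): the temporal layers in this
# file expect video frames stacked along the batch axis. A clip of t frames enters
# the 2D/spatial layers as (b * t, c, h, w) and is folded into (b, c, t, h, w) for
# the 3D convolution / temporal attention, then unfolded again, e.g.:
#
#   x = torch.randn(2 * 5, 8, 16, 16)                    # b=2 clips of t=5 frames
#   x5d = rearrange(x, "(b t) c h w -> b c t h w", t=5)  # (2, 8, 5, 16, 16)
#   x4d = rearrange(x5d, "b c t h w -> (b t) c h w")     # (10, 8, 16, 16)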


class VideoBlock(AttnBlock):
    def __init__(
        self, in_channels: int, alpha: float = 0, merge_strategy: str = "learned"
    ):
        super().__init__(in_channels)
        # no context, single headed, as in base class
        self.time_mix_block = VideoTransformerBlock(
            dim=in_channels,
            n_heads=1,
            d_head=in_channels,
            checkpoint=False,
            ff_in=True,
            attn_mode="softmax",
        )

        time_embed_dim = self.in_channels * 4
        self.video_time_embed = torch.nn.Sequential(
            torch.nn.Linear(self.in_channels, time_embed_dim),
            torch.nn.SiLU(),
            torch.nn.Linear(time_embed_dim, self.in_channels),
        )

        self.merge_strategy = merge_strategy
        if self.merge_strategy == "fixed":
            self.register_buffer("mix_factor", torch.Tensor([alpha]))
        elif self.merge_strategy == "learned":
            self.register_parameter(
                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
            )
        else:
            raise ValueError(f"unknown merge strategy {self.merge_strategy}")

    def forward(self, x, timesteps, skip_video=False):
        if skip_video:
            return super().forward(x)

        x_in = x
        x = self.attention(x)
        h, w = x.shape[2:]
        x = rearrange(x, "b c h w -> b (h w) c")

        x_mix = x
        num_frames = torch.arange(timesteps, device=x.device)
        num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
        num_frames = rearrange(num_frames, "b t -> (b t)")
        t_emb = timestep_embedding(num_frames, self.in_channels, repeat_only=False)
        emb = self.video_time_embed(t_emb)  # b, n_channels
        emb = emb[:, None, :]
        x_mix = x_mix + emb

        alpha = self.get_alpha()
        x_mix = self.time_mix_block(x_mix, timesteps=timesteps)
        x = alpha * x + (1.0 - alpha) * x_mix  # alpha merge

        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
        x = self.proj_out(x)

        return x_in + x

    def get_alpha(self):
        if self.merge_strategy == "fixed":
            return self.mix_factor
        elif self.merge_strategy == "learned":
            return torch.sigmoid(self.mix_factor)
        else:
            raise NotImplementedError(f"unknown merge strategy {self.merge_strategy}")


class MemoryEfficientVideoBlock(MemoryEfficientAttnBlock):
    def __init__(
        self, in_channels: int, alpha: float = 0, merge_strategy: str = "learned"
    ):
        super().__init__(in_channels)
        # no context, single headed, as in base class
        self.time_mix_block = VideoTransformerBlock(
            dim=in_channels,
            n_heads=1,
            d_head=in_channels,
            checkpoint=False,
            ff_in=True,
            attn_mode="softmax-xformers",
        )

        time_embed_dim = self.in_channels * 4
        self.video_time_embed = torch.nn.Sequential(
            torch.nn.Linear(self.in_channels, time_embed_dim),
            torch.nn.SiLU(),
            torch.nn.Linear(time_embed_dim, self.in_channels),
        )

        self.merge_strategy = merge_strategy
        if self.merge_strategy == "fixed":
            self.register_buffer("mix_factor", torch.Tensor([alpha]))
        elif self.merge_strategy == "learned":
            self.register_parameter(
                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
            )
        else:
            raise ValueError(f"unknown merge strategy {self.merge_strategy}")

    def forward(self, x, timesteps, skip_time_block=False):
        if skip_time_block:
            return super().forward(x)

        x_in = x
        x = self.attention(x)
        h, w = x.shape[2:]
        x = rearrange(x, "b c h w -> b (h w) c")

        x_mix = x
        num_frames = torch.arange(timesteps, device=x.device)
        num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
        num_frames = rearrange(num_frames, "b t -> (b t)")
        t_emb = timestep_embedding(num_frames, self.in_channels, repeat_only=False)
        emb = self.video_time_embed(t_emb)  # b, n_channels
        emb = emb[:, None, :]
        x_mix = x_mix + emb

        alpha = self.get_alpha()
        x_mix = self.time_mix_block(x_mix, timesteps=timesteps)
        x = alpha * x + (1.0 - alpha) * x_mix  # alpha merge

        x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
        x = self.proj_out(x)

        return x_in + x

    def get_alpha(self):
        if self.merge_strategy == "fixed":
            return self.mix_factor
        elif self.merge_strategy == "learned":
            return torch.sigmoid(self.mix_factor)
        else:
            raise NotImplementedError(f"unknown merge strategy {self.merge_strategy}")


def make_time_attn(
    in_channels,
    attn_type="vanilla",
    attn_kwargs=None,
    alpha: float = 0,
    merge_strategy: str = "learned",
):
    assert attn_type in [
        "vanilla",
        "vanilla-xformers",
    ], f"attn_type {attn_type} not supported for spatio-temporal attention"
    print(
        f"making spatial and temporal attention of type '{attn_type}' with {in_channels} in_channels"
    )
    if not XFORMERS_IS_AVAILABLE and attn_type == "vanilla-xformers":
        print(
            f"Attention mode '{attn_type}' is not available. Falling back to vanilla attention. "
            f"This is not a problem in PyTorch >= 2.0. "
            f"FYI, you are running with PyTorch version {torch.__version__}."
        )
        attn_type = "vanilla"

    if attn_type == "vanilla":
        assert attn_kwargs is None
        return partialclass(
            VideoBlock, in_channels, alpha=alpha, merge_strategy=merge_strategy
        )
    elif attn_type == "vanilla-xformers":
        print(f"building MemoryEfficientAttnBlock with {in_channels} in_channels...")
        return partialclass(
            MemoryEfficientVideoBlock,
            in_channels,
            alpha=alpha,
            merge_strategy=merge_strategy,
        )
    else:
        # unreachable given the assert above; raise instead of returning the exception
        raise NotImplementedError()


class Conv2DWrapper(torch.nn.Conv2d):
    def forward(self, input: torch.Tensor, **kwargs) -> torch.Tensor:
        return super().forward(input)


class VideoDecoder(Decoder):
    available_time_modes = ["all", "conv-only", "attn-only"]

    def __init__(
        self,
        *args,
        video_kernel_size: Union[int, list] = 3,
        alpha: float = 0.0,
        merge_strategy: str = "learned",
        time_mode: str = "conv-only",
        **kwargs,
    ):
        self.video_kernel_size = video_kernel_size
        self.alpha = alpha
        self.merge_strategy = merge_strategy
        self.time_mode = time_mode
        assert (
            self.time_mode in self.available_time_modes
        ), f"time_mode parameter has to be in {self.available_time_modes}"
        super().__init__(*args, **kwargs)

    def get_last_layer(self, skip_time_mix=False, **kwargs):
        if self.time_mode == "attn-only":
            raise NotImplementedError("TODO")
        else:
            return (
                self.conv_out.time_mix_conv.weight
                if not skip_time_mix
                else self.conv_out.weight
            )

    def _make_attn(self) -> Callable:
        if self.time_mode not in ["conv-only", "only-last-conv"]:
            return partialclass(
                make_time_attn,
                alpha=self.alpha,
                merge_strategy=self.merge_strategy,
            )
        else:
            return super()._make_attn()

    def _make_conv(self) -> Callable:
        if self.time_mode != "attn-only":
            return partialclass(AE3DConv, video_kernel_size=self.video_kernel_size)
        else:
            return Conv2DWrapper

    def _make_resblock(self) -> Callable:
        if self.time_mode not in ["attn-only", "only-last-conv"]:
            return partialclass(
                VideoResBlock,
                video_kernel_size=self.video_kernel_size,
                alpha=self.alpha,
                merge_strategy=self.merge_strategy,
            )
        else:
            return super()._make_resblock()
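

if __name__ == "__main__":
    # Minimal usage sketch (illustrative assumptions, not a canonical config): the
    # ch / ch_mult / resolution values below are made up for demonstration, and the
    # call assumes the parent sgm Decoder forwards extra keyword arguments such as
    # `timesteps` down to its sub-blocks, as in the generative-models repository.
    decoder = VideoDecoder(
        ch=128,
        out_ch=3,
        ch_mult=(1, 2, 4, 4),
        num_res_blocks=2,
        attn_resolutions=(),
        in_channels=3,
        resolution=256,
        z_channels=4,
        video_kernel_size=[3, 1, 1],
        time_mode="conv-only",
    )
    t = 4  # frames per clip
    # t consecutive latent frames stacked along the batch axis: (b * t, z_channels, h, w)
    z = torch.randn(1 * t, 4, 32, 32)
    frames = decoder(z, timesteps=t)
    print(frames.shape)  # expected (b * t, 3, 256, 256)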