""" This module defines various 3D UNet blocks used in the video model. The blocks include: - UNetMidBlock3DCrossAttn: The middle block of the UNet with cross attention. - CrossAttnDownBlock3D: The downsampling block with cross attention. - DownBlock3D: The standard downsampling block without cross attention. - CrossAttnUpBlock3D: The upsampling block with cross attention. - UpBlock3D: The standard upsampling block without cross attention. These blocks are used to construct the 3D UNet architecture for video-related tasks. """ import torch from einops import rearrange from torch import nn from .motion_module import get_motion_module from .resnet import Downsample3D, ResnetBlock3D, Upsample3D from .transformer_3d import Transformer3DModel def get_down_block( down_block_type, num_layers, in_channels, out_channels, temb_channels, add_downsample, resnet_eps, resnet_act_fn, attn_num_head_channels, resnet_groups=None, cross_attention_dim=None, audio_attention_dim=None, downsample_padding=None, dual_cross_attention=False, use_linear_projection=False, only_cross_attention=False, upcast_attention=False, resnet_time_scale_shift="default", unet_use_cross_frame_attention=None, unet_use_temporal_attention=None, use_inflated_groupnorm=None, use_motion_module=None, motion_module_type=None, motion_module_kwargs=None, use_audio_module=None, depth=0, stack_enable_blocks_name=None, stack_enable_blocks_depth=None, ): """ Factory function to instantiate a down-block module for the 3D UNet architecture. Down blocks are used in the downsampling part of the U-Net to reduce the spatial dimensions of the feature maps while increasing the depth. This function can create blocks with or without cross attention based on the specified parameters. Parameters: - down_block_type (str): The type of down block to instantiate. - num_layers (int): The number of layers in the block. - in_channels (int): The number of input channels. - out_channels (int): The number of output channels. - temb_channels (int): The number of token embedding channels. - add_downsample (bool): Flag to add a downsampling layer. - resnet_eps (float): Epsilon for residual block stability. - resnet_act_fn (callable): Activation function for the residual block. - ... (remaining parameters): Additional parameters for configuring the block. Returns: - nn.Module: An instance of a down-sampling block module. """ down_block_type = ( down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type ) if down_block_type == "DownBlock3D": return DownBlock3D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, add_downsample=add_downsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, use_inflated_groupnorm=use_inflated_groupnorm, use_motion_module=use_motion_module, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, ) if down_block_type == "CrossAttnDownBlock3D": if cross_attention_dim is None: raise ValueError( "cross_attention_dim must be specified for CrossAttnDownBlock3D" ) return CrossAttnDownBlock3D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, add_downsample=add_downsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, downsample_padding=downsample_padding, cross_attention_dim=cross_attention_dim, audio_attention_dim=audio_attention_dim, attn_num_head_channels=attn_num_head_channels, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, unet_use_cross_frame_attention=unet_use_cross_frame_attention, unet_use_temporal_attention=unet_use_temporal_attention, use_inflated_groupnorm=use_inflated_groupnorm, use_motion_module=use_motion_module, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, use_audio_module=use_audio_module, depth=depth, stack_enable_blocks_name=stack_enable_blocks_name, stack_enable_blocks_depth=stack_enable_blocks_depth, ) raise ValueError(f"{down_block_type} does not exist.") def get_up_block( up_block_type, num_layers, in_channels, out_channels, prev_output_channel, temb_channels, add_upsample, resnet_eps, resnet_act_fn, attn_num_head_channels, resnet_groups=None, cross_attention_dim=None, audio_attention_dim=None, dual_cross_attention=False, use_linear_projection=False, only_cross_attention=False, upcast_attention=False, resnet_time_scale_shift="default", unet_use_cross_frame_attention=None, unet_use_temporal_attention=None, use_inflated_groupnorm=None, use_motion_module=None, motion_module_type=None, motion_module_kwargs=None, use_audio_module=None, depth=0, stack_enable_blocks_name=None, stack_enable_blocks_depth=None, ): """ Factory function to instantiate an up-block module for the 3D UNet architecture. Up blocks are used in the upsampling part of the U-Net to increase the spatial dimensions of the feature maps while decreasing the depth. This function can create blocks with or without cross attention based on the specified parameters. Parameters: - up_block_type (str): The type of up block to instantiate. - num_layers (int): The number of layers in the block. - in_channels (int): The number of input channels. - out_channels (int): The number of output channels. - prev_output_channel (int): The number of channels from the previous layer's output. - temb_channels (int): The number of token embedding channels. - add_upsample (bool): Flag to add an upsampling layer. - resnet_eps (float): Epsilon for residual block stability. - resnet_act_fn (callable): Activation function for the residual block. - ... (remaining parameters): Additional parameters for configuring the block. Returns: - nn.Module: An instance of an up-sampling block module. """ up_block_type = ( up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type ) if up_block_type == "UpBlock3D": return UpBlock3D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, prev_output_channel=prev_output_channel, temb_channels=temb_channels, add_upsample=add_upsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, use_inflated_groupnorm=use_inflated_groupnorm, use_motion_module=use_motion_module, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, ) if up_block_type == "CrossAttnUpBlock3D": if cross_attention_dim is None: raise ValueError( "cross_attention_dim must be specified for CrossAttnUpBlock3D" ) return CrossAttnUpBlock3D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, prev_output_channel=prev_output_channel, temb_channels=temb_channels, add_upsample=add_upsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, cross_attention_dim=cross_attention_dim, audio_attention_dim=audio_attention_dim, attn_num_head_channels=attn_num_head_channels, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, resnet_time_scale_shift=resnet_time_scale_shift, unet_use_cross_frame_attention=unet_use_cross_frame_attention, unet_use_temporal_attention=unet_use_temporal_attention, use_inflated_groupnorm=use_inflated_groupnorm, use_motion_module=use_motion_module, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, use_audio_module=use_audio_module, depth=depth, stack_enable_blocks_name=stack_enable_blocks_name, stack_enable_blocks_depth=stack_enable_blocks_depth, ) raise ValueError(f"{up_block_type} does not exist.") class UNetMidBlock3DCrossAttn(nn.Module): """ A 3D UNet middle block with cross attention mechanism. This block is part of the U-Net architecture and is used for feature extraction in the middle of the downsampling path. Parameters: - in_channels (int): Number of input channels. - temb_channels (int): Number of token embedding channels. - dropout (float): Dropout rate. - num_layers (int): Number of layers in the block. - resnet_eps (float): Epsilon for residual block. - resnet_time_scale_shift (str): Time scale shift for time embedding normalization. - resnet_act_fn (str): Activation function for the residual block. - resnet_groups (int): Number of groups for the convolutions in the residual block. - resnet_pre_norm (bool): Whether to use pre-normalization in the residual block. - attn_num_head_channels (int): Number of attention heads. - cross_attention_dim (int): Dimensionality of the cross attention layers. - audio_attention_dim (int): Dimensionality of the audio attention layers. - dual_cross_attention (bool): Whether to use dual cross attention. - use_linear_projection (bool): Whether to use linear projection in attention. - upcast_attention (bool): Whether to upcast attention to the original input dimension. - unet_use_cross_frame_attention (bool): Whether to use cross frame attention in U-Net. - unet_use_temporal_attention (bool): Whether to use temporal attention in U-Net. - use_inflated_groupnorm (bool): Whether to use inflated group normalization. - use_motion_module (bool): Whether to use motion module. - motion_module_type (str): Type of motion module. - motion_module_kwargs (dict): Keyword arguments for the motion module. - use_audio_module (bool): Whether to use audio module. - depth (int): Depth of the block in the network. - stack_enable_blocks_name (str): Name of the stack enable blocks. - stack_enable_blocks_depth (int): Depth of the stack enable blocks. Forward method: The forward method applies the residual blocks, cross attention, and optional motion and audio modules to the input hidden states. It returns the transformed hidden states. """ def __init__( self, in_channels: int, temb_channels: int, dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, attn_num_head_channels=1, output_scale_factor=1.0, cross_attention_dim=1280, audio_attention_dim=1024, dual_cross_attention=False, use_linear_projection=False, upcast_attention=False, unet_use_cross_frame_attention=None, unet_use_temporal_attention=None, use_inflated_groupnorm=None, use_motion_module=None, motion_module_type=None, motion_module_kwargs=None, use_audio_module=None, depth=0, stack_enable_blocks_name=None, stack_enable_blocks_depth=None, ): super().__init__() self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels resnet_groups = ( resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) ) # there is always at least one resnet resnets = [ ResnetBlock3D( in_channels=in_channels, out_channels=in_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, use_inflated_groupnorm=use_inflated_groupnorm, ) ] attentions = [] motion_modules = [] audio_modules = [] for _ in range(num_layers): if dual_cross_attention: raise NotImplementedError attentions.append( Transformer3DModel( attn_num_head_channels, in_channels // attn_num_head_channels, in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, unet_use_cross_frame_attention=unet_use_cross_frame_attention, unet_use_temporal_attention=unet_use_temporal_attention, ) ) audio_modules.append( Transformer3DModel( attn_num_head_channels, in_channels // attn_num_head_channels, in_channels=in_channels, num_layers=1, cross_attention_dim=audio_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, use_audio_module=use_audio_module, depth=depth, unet_block_name="mid", stack_enable_blocks_name=stack_enable_blocks_name, stack_enable_blocks_depth=stack_enable_blocks_depth, ) if use_audio_module else None ) motion_modules.append( get_motion_module( in_channels=in_channels, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, ) if use_motion_module else None ) resnets.append( ResnetBlock3D( in_channels=in_channels, out_channels=in_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, use_inflated_groupnorm=use_inflated_groupnorm, ) ) self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) self.audio_modules = nn.ModuleList(audio_modules) self.motion_modules = nn.ModuleList(motion_modules) def forward( self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, full_mask=None, face_mask=None, lip_mask=None, audio_embedding=None, motion_scale=None, ): """ Forward pass for the UNetMidBlock3DCrossAttn class. Args: self (UNetMidBlock3DCrossAttn): An instance of the UNetMidBlock3DCrossAttn class. hidden_states (Tensor): The input hidden states tensor. temb (Tensor, optional): The input temporal embedding tensor. Defaults to None. encoder_hidden_states (Tensor, optional): The encoder hidden states tensor. Defaults to None. attention_mask (Tensor, optional): The attention mask tensor. Defaults to None. full_mask (Tensor, optional): The full mask tensor. Defaults to None. face_mask (Tensor, optional): The face mask tensor. Defaults to None. lip_mask (Tensor, optional): The lip mask tensor. Defaults to None. audio_embedding (Tensor, optional): The audio embedding tensor. Defaults to None. Returns: Tensor: The output tensor after passing through the UNetMidBlock3DCrossAttn layers. """ hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet, audio_module, motion_module in zip( self.attentions, self.resnets[1:], self.audio_modules, self.motion_modules ): hidden_states, motion_frame = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, return_dict=False, ) # .sample if len(motion_frame[0]) > 0: # if motion_frame[0][0].numel() > 0: motion_frames = motion_frame[0][0] motion_frames = rearrange( motion_frames, "b f (d1 d2) c -> b c f d1 d2", d1=hidden_states.size(-1), ) else: motion_frames = torch.zeros( hidden_states.shape[0], hidden_states.shape[1], 4, hidden_states.shape[3], hidden_states.shape[4], ) n_motion_frames = motion_frames.size(2) if audio_module is not None: hidden_states = ( audio_module( hidden_states, encoder_hidden_states=audio_embedding, attention_mask=attention_mask, full_mask=full_mask, face_mask=face_mask, lip_mask=lip_mask, motion_scale=motion_scale, return_dict=False, ) )[0] # .sample if motion_module is not None: motion_frames = motion_frames.to( device=hidden_states.device, dtype=hidden_states.dtype ) _hidden_states = ( torch.cat([motion_frames, hidden_states], dim=2) if n_motion_frames > 0 else hidden_states ) hidden_states = motion_module( _hidden_states, encoder_hidden_states=encoder_hidden_states ) hidden_states = hidden_states[:, :, n_motion_frames:] hidden_states = resnet(hidden_states, temb) return hidden_states class CrossAttnDownBlock3D(nn.Module): """ A 3D downsampling block with cross attention for the U-Net architecture. Parameters: - (same as above, refer to the constructor for details) Forward method: The forward method downsamples the input hidden states using residual blocks and cross attention. It also applies optional motion and audio modules. The method supports gradient checkpointing to save memory during training. """ def __init__( self, in_channels: int, out_channels: int, temb_channels: int, dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, attn_num_head_channels=1, cross_attention_dim=1280, audio_attention_dim=1024, output_scale_factor=1.0, downsample_padding=1, add_downsample=True, dual_cross_attention=False, use_linear_projection=False, only_cross_attention=False, upcast_attention=False, unet_use_cross_frame_attention=None, unet_use_temporal_attention=None, use_inflated_groupnorm=None, use_motion_module=None, motion_module_type=None, motion_module_kwargs=None, use_audio_module=None, depth=0, stack_enable_blocks_name=None, stack_enable_blocks_depth=None, ): super().__init__() resnets = [] attentions = [] audio_modules = [] motion_modules = [] self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels resnets.append( ResnetBlock3D( in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, use_inflated_groupnorm=use_inflated_groupnorm, ) ) if dual_cross_attention: raise NotImplementedError attentions.append( Transformer3DModel( attn_num_head_channels, out_channels // attn_num_head_channels, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, unet_use_cross_frame_attention=unet_use_cross_frame_attention, unet_use_temporal_attention=unet_use_temporal_attention, ) ) # TODO:检查维度 audio_modules.append( Transformer3DModel( attn_num_head_channels, in_channels // attn_num_head_channels, in_channels=out_channels, num_layers=1, cross_attention_dim=audio_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, use_audio_module=use_audio_module, depth=depth, unet_block_name="down", stack_enable_blocks_name=stack_enable_blocks_name, stack_enable_blocks_depth=stack_enable_blocks_depth, ) if use_audio_module else None ) motion_modules.append( get_motion_module( in_channels=out_channels, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, ) if use_motion_module else None ) self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) self.audio_modules = nn.ModuleList(audio_modules) self.motion_modules = nn.ModuleList(motion_modules) if add_downsample: self.downsamplers = nn.ModuleList( [ Downsample3D( out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op", ) ] ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, full_mask=None, face_mask=None, lip_mask=None, audio_embedding=None, motion_scale=None, ): """ Defines the forward pass for the CrossAttnDownBlock3D class. Parameters: - hidden_states : torch.Tensor The input tensor to the block. temb : torch.Tensor, optional The token embeddings from the previous block. encoder_hidden_states : torch.Tensor, optional The hidden states from the encoder. attention_mask : torch.Tensor, optional The attention mask for the cross-attention mechanism. full_mask : torch.Tensor, optional The full mask for the cross-attention mechanism. face_mask : torch.Tensor, optional The face mask for the cross-attention mechanism. lip_mask : torch.Tensor, optional The lip mask for the cross-attention mechanism. audio_embedding : torch.Tensor, optional The audio embedding for the cross-attention mechanism. Returns: -- torch.Tensor The output tensor from the block. """ output_states = () for _, (resnet, attn, audio_module, motion_module) in enumerate( zip(self.resnets, self.attentions, self.audio_modules, self.motion_modules) ): # self.gradient_checkpointing = False if self.training and self.gradient_checkpointing: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): if return_dict is not None: return module(*inputs, return_dict=return_dict) return module(*inputs) return custom_forward hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(resnet), hidden_states, temb ) motion_frames = [] hidden_states, motion_frame = torch.utils.checkpoint.checkpoint( create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, ) if len(motion_frame[0]) > 0: motion_frames = motion_frame[0][0] # motion_frames = torch.cat(motion_frames, dim=0) motion_frames = rearrange( motion_frames, "b f (d1 d2) c -> b c f d1 d2", d1=hidden_states.size(-1), ) else: motion_frames = torch.zeros( hidden_states.shape[0], hidden_states.shape[1], 4, hidden_states.shape[3], hidden_states.shape[4], ) n_motion_frames = motion_frames.size(2) if audio_module is not None: # audio_embedding = audio_embedding hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(audio_module, return_dict=False), hidden_states, audio_embedding, attention_mask, full_mask, face_mask, lip_mask, motion_scale, )[0] # add motion module if motion_module is not None: motion_frames = motion_frames.to( device=hidden_states.device, dtype=hidden_states.dtype ) _hidden_states = torch.cat( [motion_frames, hidden_states], dim=2 ) # if n_motion_frames > 0 else hidden_states hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(motion_module), _hidden_states, encoder_hidden_states, ) hidden_states = hidden_states[:, :, n_motion_frames:] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, ).sample if audio_module is not None: hidden_states = audio_module( hidden_states, audio_embedding, attention_mask=attention_mask, full_mask=full_mask, face_mask=face_mask, lip_mask=lip_mask, return_dict=False, )[0] # add motion module if motion_module is not None: hidden_states = motion_module( hidden_states, encoder_hidden_states=encoder_hidden_states ) output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) output_states += (hidden_states,) return hidden_states, output_states class DownBlock3D(nn.Module): """ A 3D downsampling block for the U-Net architecture. This block performs downsampling operations using residual blocks and an optional motion module. Parameters: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - temb_channels (int): Number of token embedding channels. - dropout (float): Dropout rate for the block. - num_layers (int): Number of layers in the block. - resnet_eps (float): Epsilon for residual block stability. - resnet_time_scale_shift (str): Time scale shift for the residual block's time embedding. - resnet_act_fn (str): Activation function used in the residual block. - resnet_groups (int): Number of groups for the convolutions in the residual block. - resnet_pre_norm (bool): Whether to use pre-normalization in the residual block. - output_scale_factor (float): Scaling factor for the block's output. - add_downsample (bool): Whether to add a downsampling layer. - downsample_padding (int): Padding for the downsampling layer. - use_inflated_groupnorm (bool): Whether to use inflated group normalization. - use_motion_module (bool): Whether to include a motion module. - motion_module_type (str): Type of motion module to use. - motion_module_kwargs (dict): Keyword arguments for the motion module. Forward method: The forward method processes the input hidden states through the residual blocks and optional motion modules, followed by an optional downsampling step. It supports gradient checkpointing during training to reduce memory usage. """ def __init__( self, in_channels: int, out_channels: int, temb_channels: int, dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, output_scale_factor=1.0, add_downsample=True, downsample_padding=1, use_inflated_groupnorm=None, use_motion_module=None, motion_module_type=None, motion_module_kwargs=None, ): super().__init__() resnets = [] motion_modules = [] # use_motion_module = False for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels resnets.append( ResnetBlock3D( in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, use_inflated_groupnorm=use_inflated_groupnorm, ) ) motion_modules.append( get_motion_module( in_channels=out_channels, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, ) if use_motion_module else None ) self.resnets = nn.ModuleList(resnets) self.motion_modules = nn.ModuleList(motion_modules) if add_downsample: self.downsamplers = nn.ModuleList( [ Downsample3D( out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op", ) ] ) else: self.downsamplers = None self.gradient_checkpointing = False def forward( self, hidden_states, temb=None, encoder_hidden_states=None, ): """ forward method for the DownBlock3D class. Args: hidden_states (Tensor): The input tensor to the DownBlock3D layer. temb (Tensor, optional): The token embeddings, if using transformer. encoder_hidden_states (Tensor, optional): The hidden states from the encoder. Returns: Tensor: The output tensor after passing through the DownBlock3D layer. """ output_states = () for resnet, motion_module in zip(self.resnets, self.motion_modules): # print(f"DownBlock3D {self.gradient_checkpointing = }") if self.training and self.gradient_checkpointing: def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs) return custom_forward hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(resnet), hidden_states, temb ) else: hidden_states = resnet(hidden_states, temb) # add motion module hidden_states = ( motion_module( hidden_states, encoder_hidden_states=encoder_hidden_states ) if motion_module is not None else hidden_states ) output_states += (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: hidden_states = downsampler(hidden_states) output_states += (hidden_states,) return hidden_states, output_states class CrossAttnUpBlock3D(nn.Module): """ Standard 3D downsampling block for the U-Net architecture. This block performs downsampling operations in the U-Net using residual blocks and an optional motion module. Parameters: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - temb_channels (int): Number of channels for the temporal embedding. - dropout (float): Dropout rate for the block. - num_layers (int): Number of layers in the block. - resnet_eps (float): Epsilon for residual block stability. - resnet_time_scale_shift (str): Time scale shift for the residual block's time embedding. - resnet_act_fn (str): Activation function used in the residual block. - resnet_groups (int): Number of groups for the convolutions in the residual block. - resnet_pre_norm (bool): Whether to use pre-normalization in the residual block. - output_scale_factor (float): Scaling factor for the block's output. - add_downsample (bool): Whether to add a downsampling layer. - downsample_padding (int): Padding for the downsampling layer. - use_inflated_groupnorm (bool): Whether to use inflated group normalization. - use_motion_module (bool): Whether to include a motion module. - motion_module_type (str): Type of motion module to use. - motion_module_kwargs (dict): Keyword arguments for the motion module. Forward method: The forward method processes the input hidden states through the residual blocks and optional motion modules, followed by an optional downsampling step. It supports gradient checkpointing during training to reduce memory usage. """ def __init__( self, in_channels: int, out_channels: int, prev_output_channel: int, temb_channels: int, dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, attn_num_head_channels=1, cross_attention_dim=1280, audio_attention_dim=1024, output_scale_factor=1.0, add_upsample=True, dual_cross_attention=False, use_linear_projection=False, only_cross_attention=False, upcast_attention=False, unet_use_cross_frame_attention=None, unet_use_temporal_attention=None, use_motion_module=None, use_inflated_groupnorm=None, motion_module_type=None, motion_module_kwargs=None, use_audio_module=None, depth=0, stack_enable_blocks_name=None, stack_enable_blocks_depth=None, ): super().__init__() resnets = [] attentions = [] audio_modules = [] motion_modules = [] self.has_cross_attention = True self.attn_num_head_channels = attn_num_head_channels for i in range(num_layers): res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( ResnetBlock3D( in_channels=resnet_in_channels + res_skip_channels, out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, use_inflated_groupnorm=use_inflated_groupnorm, ) ) if dual_cross_attention: raise NotImplementedError attentions.append( Transformer3DModel( attn_num_head_channels, out_channels // attn_num_head_channels, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, unet_use_cross_frame_attention=unet_use_cross_frame_attention, unet_use_temporal_attention=unet_use_temporal_attention, ) ) audio_modules.append( Transformer3DModel( attn_num_head_channels, in_channels // attn_num_head_channels, in_channels=out_channels, num_layers=1, cross_attention_dim=audio_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, upcast_attention=upcast_attention, use_audio_module=use_audio_module, depth=depth, unet_block_name="up", stack_enable_blocks_name=stack_enable_blocks_name, stack_enable_blocks_depth=stack_enable_blocks_depth, ) if use_audio_module else None ) motion_modules.append( get_motion_module( in_channels=out_channels, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, ) if use_motion_module else None ) self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) self.audio_modules = nn.ModuleList(audio_modules) self.motion_modules = nn.ModuleList(motion_modules) if add_upsample: self.upsamplers = nn.ModuleList( [Upsample3D(out_channels, use_conv=True, out_channels=out_channels)] ) else: self.upsamplers = None self.gradient_checkpointing = False def forward( self, hidden_states, res_hidden_states_tuple, temb=None, encoder_hidden_states=None, upsample_size=None, attention_mask=None, full_mask=None, face_mask=None, lip_mask=None, audio_embedding=None, motion_scale=None, ): """ Forward pass for the CrossAttnUpBlock3D class. Args: self (CrossAttnUpBlock3D): An instance of the CrossAttnUpBlock3D class. hidden_states (Tensor): The input hidden states tensor. res_hidden_states_tuple (Tuple[Tensor]): A tuple of residual hidden states tensors. temb (Tensor, optional): The token embeddings tensor. Defaults to None. encoder_hidden_states (Tensor, optional): The encoder hidden states tensor. Defaults to None. upsample_size (int, optional): The upsample size. Defaults to None. attention_mask (Tensor, optional): The attention mask tensor. Defaults to None. full_mask (Tensor, optional): The full mask tensor. Defaults to None. face_mask (Tensor, optional): The face mask tensor. Defaults to None. lip_mask (Tensor, optional): The lip mask tensor. Defaults to None. audio_embedding (Tensor, optional): The audio embedding tensor. Defaults to None. Returns: Tensor: The output tensor after passing through the CrossAttnUpBlock3D. """ for _, (resnet, attn, audio_module, motion_module) in enumerate( zip(self.resnets, self.attentions, self.audio_modules, self.motion_modules) ): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) if self.training and self.gradient_checkpointing: def create_custom_forward(module, return_dict=None): def custom_forward(*inputs): if return_dict is not None: return module(*inputs, return_dict=return_dict) return module(*inputs) return custom_forward hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(resnet), hidden_states, temb ) motion_frames = [] hidden_states, motion_frame = torch.utils.checkpoint.checkpoint( create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, ) if len(motion_frame[0]) > 0: motion_frames = motion_frame[0][0] # motion_frames = torch.cat(motion_frames, dim=0) motion_frames = rearrange( motion_frames, "b f (d1 d2) c -> b c f d1 d2", d1=hidden_states.size(-1), ) else: motion_frames = torch.zeros( hidden_states.shape[0], hidden_states.shape[1], 4, hidden_states.shape[3], hidden_states.shape[4], ) n_motion_frames = motion_frames.size(2) if audio_module is not None: # audio_embedding = audio_embedding hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(audio_module, return_dict=False), hidden_states, audio_embedding, attention_mask, full_mask, face_mask, lip_mask, motion_scale, )[0] # add motion module if motion_module is not None: motion_frames = motion_frames.to( device=hidden_states.device, dtype=hidden_states.dtype ) _hidden_states = ( torch.cat([motion_frames, hidden_states], dim=2) if n_motion_frames > 0 else hidden_states ) hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(motion_module), _hidden_states, encoder_hidden_states, ) hidden_states = hidden_states[:, :, n_motion_frames:] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, ).sample if audio_module is not None: hidden_states = ( audio_module( hidden_states, encoder_hidden_states=audio_embedding, attention_mask=attention_mask, full_mask=full_mask, face_mask=face_mask, lip_mask=lip_mask, ) ).sample # add motion module hidden_states = ( motion_module( hidden_states, encoder_hidden_states=encoder_hidden_states ) if motion_module is not None else hidden_states ) if self.upsamplers is not None: for upsampler in self.upsamplers: hidden_states = upsampler(hidden_states, upsample_size) return hidden_states class UpBlock3D(nn.Module): """ 3D upsampling block with cross attention for the U-Net architecture. This block performs upsampling operations and incorporates cross attention mechanisms, which allow the model to focus on different parts of the input when upscaling. Parameters: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - prev_output_channel (int): Number of channels from the previous layer's output. - temb_channels (int): Number of channels for the temporal embedding. - dropout (float): Dropout rate for the block. - num_layers (int): Number of layers in the block. - resnet_eps (float): Epsilon for residual block stability. - resnet_time_scale_shift (str): Time scale shift for the residual block's time embedding. - resnet_act_fn (str): Activation function used in the residual block. - resnet_groups (int): Number of groups for the convolutions in the residual block. - resnet_pre_norm (bool): Whether to use pre-normalization in the residual block. - attn_num_head_channels (int): Number of attention heads for the cross attention mechanism. - cross_attention_dim (int): Dimensionality of the cross attention layers. - audio_attention_dim (int): Dimensionality of the audio attention layers. - output_scale_factor (float): Scaling factor for the block's output. - add_upsample (bool): Whether to add an upsampling layer. - dual_cross_attention (bool): Whether to use dual cross attention (not implemented). - use_linear_projection (bool): Whether to use linear projection in the cross attention. - only_cross_attention (bool): Whether to use only cross attention (no self-attention). - upcast_attention (bool): Whether to upcast attention to the original input dimension. - unet_use_cross_frame_attention (bool): Whether to use cross frame attention in U-Net. - unet_use_temporal_attention (bool): Whether to use temporal attention in U-Net. - use_motion_module (bool): Whether to include a motion module. - use_inflated_groupnorm (bool): Whether to use inflated group normalization. - motion_module_type (str): Type of motion module to use. - motion_module_kwargs (dict): Keyword arguments for the motion module. - use_audio_module (bool): Whether to include an audio module. - depth (int): Depth of the block in the network. - stack_enable_blocks_name (str): Name of the stack enable blocks. - stack_enable_blocks_depth (int): Depth of the stack enable blocks. Forward method: The forward method upsamples the input hidden states and residual hidden states, processes them through the residual and cross attention blocks, and optional motion and audio modules. It supports gradient checkpointing during training. """ def __init__( self, in_channels: int, prev_output_channel: int, out_channels: int, temb_channels: int, dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, output_scale_factor=1.0, add_upsample=True, use_inflated_groupnorm=None, use_motion_module=None, motion_module_type=None, motion_module_kwargs=None, ): super().__init__() resnets = [] motion_modules = [] # use_motion_module = False for i in range(num_layers): res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels resnets.append( ResnetBlock3D( in_channels=resnet_in_channels + res_skip_channels, out_channels=out_channels, temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, time_embedding_norm=resnet_time_scale_shift, non_linearity=resnet_act_fn, output_scale_factor=output_scale_factor, pre_norm=resnet_pre_norm, use_inflated_groupnorm=use_inflated_groupnorm, ) ) motion_modules.append( get_motion_module( in_channels=out_channels, motion_module_type=motion_module_type, motion_module_kwargs=motion_module_kwargs, ) if use_motion_module else None ) self.resnets = nn.ModuleList(resnets) self.motion_modules = nn.ModuleList(motion_modules) if add_upsample: self.upsamplers = nn.ModuleList( [Upsample3D(out_channels, use_conv=True, out_channels=out_channels)] ) else: self.upsamplers = None self.gradient_checkpointing = False def forward( self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, encoder_hidden_states=None, ): """ Forward pass for the UpBlock3D class. Args: self (UpBlock3D): An instance of the UpBlock3D class. hidden_states (Tensor): The input hidden states tensor. res_hidden_states_tuple (Tuple[Tensor]): A tuple of residual hidden states tensors. temb (Tensor, optional): The token embeddings tensor. Defaults to None. upsample_size (int, optional): The upsample size. Defaults to None. encoder_hidden_states (Tensor, optional): The encoder hidden states tensor. Defaults to None. Returns: Tensor: The output tensor after passing through the UpBlock3D layers. """ for resnet, motion_module in zip(self.resnets, self.motion_modules): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] res_hidden_states_tuple = res_hidden_states_tuple[:-1] hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) # print(f"UpBlock3D {self.gradient_checkpointing = }") if self.training and self.gradient_checkpointing: def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs) return custom_forward hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(resnet), hidden_states, temb ) else: hidden_states = resnet(hidden_states, temb) hidden_states = ( motion_module( hidden_states, encoder_hidden_states=encoder_hidden_states ) if motion_module is not None else hidden_states ) if self.upsamplers is not None: for upsampler in self.upsamplers: hidden_states = upsampler(hidden_states, upsample_size) return hidden_states