erichardson committed on
Commit 43d3c68
1 Parent(s): 66d4261

VAE: Support more configurations for Encoder and Decoder blocks

VAE: Define encoder compress-all block with channel multiplier

VAE: Support residual connection in the decoder

VAE: Refactor CausalConv3d parameters

lint
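
Taken together, a block entry can now be either ("name", num_layers) or ("name", {param: value}): "res_x" reads "num_layers", "res_x_y" and the new encoder-only "compress_all_x_y" read a channel "multiplier", and the decoder's "compress_all" reads "residual". Configs may also provide separate "encoder_blocks" and "decoder_blocks" lists, falling back to the shared "blocks" key. The sketch below is purely illustrative and not part of this commit; the keys mirror the config handling in the diff, while the values are made up:

# Illustrative only -- not a configuration shipped with this commit.
config = {
    "dims": 3,
    "in_channels": 3,
    "out_channels": 3,
    "latent_channels": 128,
    "patch_size": 4,
    "norm_layer": "group_norm",
    "causal_decoder": False,
    # Encoder and decoder block lists may now differ; both fall back to "blocks".
    "encoder_blocks": [
        ("res_x", 4),                              # plain int is treated as {"num_layers": 4}
        ("compress_all_x_y", {"multiplier": 2}),   # new: strided conv that also widens channels
        ("res_x_y", {"multiplier": 2}),
        ("compress_all", 1),
        ("res_x", {"num_layers": 3}),
    ],
    "decoder_blocks": [
        ("res_x", {"num_layers": 3}),
        ("compress_all", {"residual": True}),      # new: residual depth-to-space upsample
        ("res_x_y", {"multiplier": 2}),
        ("res_x", 4),
    ],
}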

xora/models/autoencoders/causal_conv3d.py CHANGED
@@ -11,6 +11,8 @@ class CausalConv3d(nn.Module):
         out_channels,
         kernel_size: int = 3,
         stride: Union[int, Tuple[int]] = 1,
+        dilation: int = 1,
+        groups: int = 1,
         **kwargs,
     ):
         super().__init__()
@@ -21,7 +23,6 @@ class CausalConv3d(nn.Module):
         kernel_size = (kernel_size, kernel_size, kernel_size)
         self.time_kernel_size = kernel_size[0]

-        dilation = kwargs.pop("dilation", 1)
         dilation = (dilation, 1, 1)

         height_pad = kernel_size[1] // 2
@@ -36,6 +37,7 @@ class CausalConv3d(nn.Module):
             dilation=dilation,
             padding=padding,
             padding_mode="zeros",
+            groups=groups,
         )

    def forward(self, x, causal: bool = True):
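
A minimal usage sketch of the refactored layer follows (not repository test code; it assumes in_channels is the constructor's first parameter, which sits outside the hunks shown):

# Minimal sketch, not repository test code. The in_channels parameter is assumed
# from context; dilation and groups are the newly explicit parameters.
import torch
from xora.models.autoencoders.causal_conv3d import CausalConv3d

conv = CausalConv3d(
    in_channels=64,
    out_channels=64,
    kernel_size=3,
    stride=1,
    dilation=2,   # previously popped from **kwargs, now an explicit parameter
    groups=4,     # now forwarded to the underlying convolution
)
x = torch.randn(1, 64, 9, 32, 32)  # (batch, channels, frames, height, width)
y = conv(x, causal=True)
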
xora/models/autoencoders/causal_video_autoencoder.py CHANGED
@@ -78,7 +78,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
             dims=config["dims"],
             in_channels=config.get("in_channels", 3),
             out_channels=config["latent_channels"],
-            blocks=config["blocks"],
+            blocks=config.get("encoder_blocks", config.get("blocks")),
             patch_size=config.get("patch_size", 1),
             latent_log_var=latent_log_var,
             norm_layer=config.get("norm_layer", "group_norm"),
@@ -88,7 +88,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
             dims=config["dims"],
             in_channels=config["latent_channels"],
             out_channels=config.get("out_channels", 3),
-            blocks=config["blocks"],
+            blocks=config.get("decoder_blocks", config.get("blocks")),
             patch_size=config.get("patch_size", 1),
             norm_layer=config.get("norm_layer", "group_norm"),
             causal=config.get("causal_decoder", False),
@@ -112,7 +112,8 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
             out_channels=self.decoder.conv_out.out_channels
             // self.decoder.patch_size**2,
             latent_channels=self.decoder.conv_in.in_channels,
-            blocks=self.encoder.blocks_desc,
+            encoder_blocks=self.encoder.blocks_desc,
+            decoder_blocks=self.decoder.blocks_desc,
             scaling_factor=1.0,
             norm_layer=self.encoder.norm_layer,
             patch_size=self.encoder.patch_size,
@@ -242,7 +243,7 @@ class Encoder(nn.Module):
         dims: Union[int, Tuple[int, int]] = 3,
         in_channels: int = 3,
         out_channels: int = 3,
-        blocks: List[Tuple[str, int]] = [("res_x", 1)],
+        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
         base_channels: int = 128,
         norm_num_groups: int = 32,
         patch_size: Union[int, Tuple[int]] = 1,
@@ -271,20 +272,22 @@ class Encoder(nn.Module):

         self.down_blocks = nn.ModuleList([])

-        for block_name, num_layers in blocks:
+        for block_name, block_params in blocks:
             input_channel = output_channel
+            if isinstance(block_params, int):
+                block_params = {"num_layers": block_params}

             if block_name == "res_x":
                 block = UNetMidBlock3D(
                     dims=dims,
                     in_channels=input_channel,
-                    num_layers=num_layers,
+                    num_layers=block_params["num_layers"],
                     resnet_eps=1e-6,
                     resnet_groups=norm_num_groups,
                     norm_layer=norm_layer,
                 )
             elif block_name == "res_x_y":
-                output_channel = 2 * output_channel
+                output_channel = block_params.get("multiplier", 2) * output_channel
                 block = ResnetBlock3D(
                     dims=dims,
                     in_channels=input_channel,
@@ -320,6 +323,16 @@ class Encoder(nn.Module):
                     stride=(2, 2, 2),
                     causal=True,
                 )
+            elif block_name == "compress_all_x_y":
+                output_channel = block_params.get("multiplier", 2) * output_channel
+                block = make_conv_nd(
+                    dims=dims,
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    kernel_size=3,
+                    stride=(2, 2, 2),
+                    causal=True,
+                )
             else:
                 raise ValueError(f"unknown block: {block_name}")

@@ -421,7 +434,7 @@ class Decoder(nn.Module):
         dims,
         in_channels: int = 3,
         out_channels: int = 3,
-        blocks: List[Tuple[str, int]] = [("res_x", 1)],
+        blocks: List[Tuple[str, int | dict]] = [("res_x", 1)],
         base_channels: int = 128,
         layers_per_block: int = 2,
         norm_num_groups: int = 32,
@@ -433,9 +446,15 @@ class Decoder(nn.Module):
         self.patch_size = patch_size
         self.layers_per_block = layers_per_block
         out_channels = out_channels * patch_size**2
-        num_channel_doubles = len([x for x in blocks if x[0] == "res_x_y"])
-        output_channel = base_channels * 2**num_channel_doubles
         self.causal = causal
+        self.blocks_desc = blocks
+
+        # Compute output channel to be product of all channel-multiplier blocks
+        output_channel = base_channels
+        for block_name, block_params in list(reversed(blocks)):
+            block_params = block_params if isinstance(block_params, dict) else {}
+            if block_name == "res_x_y":
+                output_channel = output_channel * block_params.get("multiplier", 2)

         self.conv_in = make_conv_nd(
             dims,
@@ -449,20 +468,22 @@ class Decoder(nn.Module):

         self.up_blocks = nn.ModuleList([])

-        for block_name, num_layers in list(reversed(blocks)):
+        for block_name, block_params in list(reversed(blocks)):
             input_channel = output_channel
+            if isinstance(block_params, int):
+                block_params = {"num_layers": block_params}

             if block_name == "res_x":
                 block = UNetMidBlock3D(
                     dims=dims,
                     in_channels=input_channel,
-                    num_layers=num_layers,
+                    num_layers=block_params["num_layers"],
                     resnet_eps=1e-6,
                     resnet_groups=norm_num_groups,
                     norm_layer=norm_layer,
                 )
             elif block_name == "res_x_y":
-                output_channel = output_channel // 2
+                output_channel = output_channel // block_params.get("multiplier", 2)
                 block = ResnetBlock3D(
                     dims=dims,
                     in_channels=input_channel,
@@ -481,7 +502,10 @@ class Decoder(nn.Module):
                 )
             elif block_name == "compress_all":
                 block = DepthToSpaceUpsample(
-                    dims=dims, in_channels=input_channel, stride=(2, 2, 2)
+                    dims=dims,
+                    in_channels=input_channel,
+                    stride=(2, 2, 2),
+                    residual=block_params.get("residual", False),
                 )
             else:
                 raise ValueError(f"unknown layer: {block_name}")
@@ -590,7 +614,7 @@ class UNetMidBlock3D(nn.Module):


 class DepthToSpaceUpsample(nn.Module):
-    def __init__(self, dims, in_channels, stride):
+    def __init__(self, dims, in_channels, stride, residual=False):
         super().__init__()
         self.stride = stride
         self.out_channels = np.prod(stride) * in_channels
@@ -602,8 +626,21 @@ class DepthToSpaceUpsample(nn.Module):
             stride=1,
             causal=True,
         )
+        self.residual = residual

     def forward(self, x, causal: bool = True):
+        if self.residual:
+            # Reshape and duplicate the input to match the output shape
+            x_in = rearrange(
+                x,
+                "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
+                p1=self.stride[0],
+                p2=self.stride[1],
+                p3=self.stride[2],
+            )
+            x_in = x_in.repeat(1, np.prod(self.stride), 1, 1, 1)
+            if self.stride[0] == 2:
+                x_in = x_in[:, :, 1:, :, :]
         x = self.conv(x, causal=causal)
         x = rearrange(
             x,
@@ -614,6 +651,8 @@ class DepthToSpaceUpsample(nn.Module):
         )
         if self.stride[0] == 2:
             x = x[:, :, 1:, :, :]
+        if self.residual:
+            x = x + x_in
         return x


@@ -647,7 +686,6 @@ class ResnetBlock3D(nn.Module):
         dims: Union[int, Tuple[int, int]],
         in_channels: int,
         out_channels: Optional[int] = None,
-        conv_shortcut: bool = False,
         dropout: float = 0.0,
         groups: int = 32,
         eps: float = 1e-6,
@@ -657,7 +695,6 @@ class ResnetBlock3D(nn.Module):
         self.in_channels = in_channels
         out_channels = in_channels if out_channels is None else out_channels
         self.out_channels = out_channels
-        self.use_conv_shortcut = conv_shortcut

         if norm_layer == "group_norm":
             self.norm1 = nn.GroupNorm(
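
For context on the decoder's new residual option, the standalone sketch below (not repository code) reproduces the residual branch of DepthToSpaceUpsample.forward from the diff: the input is depth-to-space reshaped to the upsampled resolution, channel-repeated to match the conv branch's width, trimmed by one frame when the temporal stride is 2, and then added to the main output.

# Standalone sketch of the residual branch added to DepthToSpaceUpsample.forward;
# the shapes are illustrative, the rearrange/repeat logic mirrors the diff.
import numpy as np
import torch
from einops import rearrange

stride = (2, 2, 2)
in_channels, d, h, w = 32, 4, 8, 8           # in_channels must be divisible by prod(stride)
x = torch.randn(1, in_channels, d, h, w)

# Depth-to-space on the raw input: (1, 32, 4, 8, 8) -> (1, 4, 8, 16, 16)
x_in = rearrange(
    x,
    "b (c p1 p2 p3) d h w -> b c (d p1) (h p2) (w p3)",
    p1=stride[0],
    p2=stride[1],
    p3=stride[2],
)
# Repeat channels so the residual matches the conv branch's width:
# (1, 4, 8, 16, 16) -> (1, 32, 8, 16, 16)
x_in = x_in.repeat(1, int(np.prod(stride)), 1, 1, 1)
# Drop the extra leading frame when upsampling in time, as the main branch does.
if stride[0] == 2:
    x_in = x_in[:, :, 1:, :, :]
print(x_in.shape)  # torch.Size([1, 32, 7, 16, 16])

The main branch ends up with the same shape after its own rearrange and frame trim, so the sum in the diff is well-defined as long as in_channels is divisible by the product of the stride.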