tree3po committed
Commit 12aae2e
1 Parent(s): d983af9

Upload 21 files

.gitattributes CHANGED
@@ -38,3 +38,8 @@ open-oasis-master/media/sample_1.gif filter=lfs diff=lfs merge=lfs -text
  open-oasis-master/sample_data/Player729-f153ac423f61-20210806-224813.chunk_000.mp4 filter=lfs diff=lfs merge=lfs -text
  open-oasis-master/sample_data/snippy-chartreuse-mastiff-f79998db196d-20220401-224517.chunk_001.mp4 filter=lfs diff=lfs merge=lfs -text
  open-oasis-master/sample_data/treechop-f153ac423f61-20210916-183423.chunk_000.mp4 filter=lfs diff=lfs merge=lfs -text
+ open_oasis_master/media/sample_0.gif filter=lfs diff=lfs merge=lfs -text
+ open_oasis_master/media/sample_1.gif filter=lfs diff=lfs merge=lfs -text
+ open_oasis_master/sample_data/Player729-f153ac423f61-20210806-224813.chunk_000.mp4 filter=lfs diff=lfs merge=lfs -text
+ open_oasis_master/sample_data/snippy-chartreuse-mastiff-f79998db196d-20220401-224517.chunk_001.mp4 filter=lfs diff=lfs merge=lfs -text
+ open_oasis_master/sample_data/treechop-f153ac423f61-20210916-183423.chunk_000.mp4 filter=lfs diff=lfs merge=lfs -text
open_oasis_master/.gitattributes ADDED
@@ -0,0 +1 @@
+ video.mp4 filter=lfs diff=lfs merge=lfs -text
open_oasis_master/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Etched & Decart
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
open_oasis_master/README.md ADDED
@@ -0,0 +1,37 @@
+ # Oasis 500M
+
+ ![](./media/arch.png)
+
+ ![](./media/thumb.png)
+
+ Oasis is an interactive world model developed by [Decart](https://www.decart.ai/) and [Etched](https://www.etched.com/). Based on diffusion transformers, Oasis takes in user keyboard input and generates gameplay in an autoregressive manner. We release the weights for Oasis 500M, a downscaled version of the model, along with inference code for action-conditional frame generation.
+
+ For more details, see our [joint blog post](https://oasis-model.github.io/).
+
+ To try the most powerful version of the model, check out the [live demo](https://oasis.us.decart.ai/)!
+
+ ## Setup
+ ```
+ git clone https://github.com/etched-ai/open-oasis.git
+ cd open-oasis
+ pip install -r requirements.txt
+ ```
+
+ ## Download the model weights
+ ```
+ huggingface-cli login
+ huggingface-cli download Etched/oasis-500m oasis500m.pt # DiT checkpoint
+ huggingface-cli download Etched/oasis-500m vit-l-20.pt # ViT VAE checkpoint
+ ```
+
+ ## Basic Usage
+ We include a basic inference script that loads a prompt frame from a video and generates additional frames conditioned on actions.
+ ```
+ python generate.py
+ ```
+ The resulting video will be saved to `video.mp4`. Here are some example generations from this 500M model!
+
+ ![](media/sample_0.gif)
+ ![](media/sample_1.gif)
+
+ > Hint: try swapping out the `.mp4` input file in the script to try different environments!
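For example, a minimal edit near the top of `generate.py` switches the prompt to another bundled clip; the `.mp4` and `.actions.pt` files must come from the same chunk (these variable names are the ones used in the bundled script):

```python
# generate.py: swap the prompt clip; both files ship in sample_data/
video_id = "treechop-f153ac423f61-20210916-183423.chunk_000"
mp4_path = f"sample_data/{video_id}.mp4"
actions_path = f"sample_data/{video_id}.actions.pt"
```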
open_oasis_master/attention.py ADDED
@@ -0,0 +1,137 @@
+ """
+ Based on https://github.com/buoyancy99/diffusion-forcing/blob/main/algorithms/diffusion_forcing/models/attention.py
+ """
+ from typing import Optional
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from einops import rearrange
+ from rotary_embedding_torch import RotaryEmbedding, apply_rotary_emb
+ from embeddings import TimestepEmbedding, Timesteps, Positions2d
+
+ class TemporalAxialAttention(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         heads: int = 4,
+         dim_head: int = 32,
+         is_causal: bool = True,
+         rotary_emb: Optional[RotaryEmbedding] = None,
+     ):
+         super().__init__()
+         self.heads = heads
+         self.head_dim = dim_head
+         self.inner_dim = dim_head * heads
+         self.to_qkv = nn.Linear(dim, self.inner_dim * 3, bias=False)
+         self.to_out = nn.Linear(self.inner_dim, dim)
+
+         self.rotary_emb = rotary_emb
+         # fall back to a learned sinusoidal time embedding when no rotary embedding is given
+         self.time_pos_embedding = (
+             nn.Sequential(
+                 Timesteps(dim),
+                 TimestepEmbedding(in_channels=dim, time_embed_dim=dim * 4, out_dim=dim),
+             )
+             if rotary_emb is None
+             else None
+         )
+         self.is_causal = is_causal
+
+     def forward(self, x: torch.Tensor):
+         B, T, H, W, D = x.shape
+
+         if self.time_pos_embedding is not None:
+             time_emb = self.time_pos_embedding(
+                 torch.arange(T, device=x.device)
+             )
+             x = x + rearrange(time_emb, "t d -> 1 t 1 1 d")
+
+         q, k, v = self.to_qkv(x).chunk(3, dim=-1)
+
+         q = rearrange(q, "B T H W (h d) -> (B H W) h T d", h=self.heads)
+         k = rearrange(k, "B T H W (h d) -> (B H W) h T d", h=self.heads)
+         v = rearrange(v, "B T H W (h d) -> (B H W) h T d", h=self.heads)
+
+         if self.rotary_emb is not None:
+             q = self.rotary_emb.rotate_queries_or_keys(q, self.rotary_emb.freqs)
+             k = self.rotary_emb.rotate_queries_or_keys(k, self.rotary_emb.freqs)
+
+         q, k, v = map(lambda t: t.contiguous(), (q, k, v))
+
+         x = F.scaled_dot_product_attention(
+             query=q, key=k, value=v, is_causal=self.is_causal
+         )
+
+         x = rearrange(x, "(B H W) h T d -> B T H W (h d)", B=B, H=H, W=W)
+         x = x.to(q.dtype)
+
+         # linear proj
+         x = self.to_out(x)
+         return x
+
+ class SpatialAxialAttention(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         heads: int = 4,
+         dim_head: int = 32,
+         rotary_emb: Optional[RotaryEmbedding] = None,
+     ):
+         super().__init__()
+         self.heads = heads
+         self.head_dim = dim_head
+         self.inner_dim = dim_head * heads
+         self.to_qkv = nn.Linear(dim, self.inner_dim * 3, bias=False)
+         self.to_out = nn.Linear(self.inner_dim, dim)
+
+         self.rotary_emb = rotary_emb
+         # fall back to a learned 2D sinusoidal position embedding when no rotary embedding is given
+         self.space_pos_embedding = (
+             nn.Sequential(
+                 Positions2d(dim),
+                 TimestepEmbedding(in_channels=dim, time_embed_dim=dim * 4, out_dim=dim),
+             )
+             if rotary_emb is None
+             else None
+         )
+
+     def forward(self, x: torch.Tensor):
+         B, T, H, W, D = x.shape
+
+         if self.space_pos_embedding is not None:
+             h_steps = torch.arange(H, device=x.device)
+             w_steps = torch.arange(W, device=x.device)
+             grid = torch.meshgrid(h_steps, w_steps, indexing="ij")
+             space_emb = self.space_pos_embedding(grid)
+             x = x + rearrange(space_emb, "h w d -> 1 1 h w d")
+
+         q, k, v = self.to_qkv(x).chunk(3, dim=-1)
+
+         q = rearrange(q, "B T H W (h d) -> (B T) h H W d", h=self.heads)
+         k = rearrange(k, "B T H W (h d) -> (B T) h H W d", h=self.heads)
+         v = rearrange(v, "B T H W (h d) -> (B T) h H W d", h=self.heads)
+
+         if self.rotary_emb is not None:
+             freqs = self.rotary_emb.get_axial_freqs(H, W)
+             q = apply_rotary_emb(freqs, q)
+             k = apply_rotary_emb(freqs, k)
+
+         # prepare for attn
+         q = rearrange(q, "(B T) h H W d -> (B T) h (H W) d", B=B, T=T, h=self.heads)
+         k = rearrange(k, "(B T) h H W d -> (B T) h (H W) d", B=B, T=T, h=self.heads)
+         v = rearrange(v, "(B T) h H W d -> (B T) h (H W) d", B=B, T=T, h=self.heads)
+
+         q, k, v = map(lambda t: t.contiguous(), (q, k, v))
+
+         x = F.scaled_dot_product_attention(
+             query=q, key=k, value=v, is_causal=False
+         )
+
+         x = rearrange(x, "(B T) h (H W) d -> B T H W (h d)", B=B, H=H, W=W)
+         x = x.to(q.dtype)
+
+         # linear proj
+         x = self.to_out(x)
+         return x
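Both modules consume and produce tensors in the `(B, T, H, W, D)` layout; temporal attention mixes information only along `T`, spatial attention only along `(H, W)`. A minimal shape check, assuming the repo modules are on the import path (the sizes below are arbitrary):

```python
import torch
from rotary_embedding_torch import RotaryEmbedding
from attention import TemporalAxialAttention, SpatialAxialAttention

dim, heads = 64, 4
t_attn = TemporalAxialAttention(dim, heads=heads, dim_head=dim // heads,
                                rotary_emb=RotaryEmbedding(dim=dim // heads))
s_attn = SpatialAxialAttention(dim, heads=heads, dim_head=dim // heads,
                               rotary_emb=RotaryEmbedding(dim=dim // heads // 2,
                                                          freqs_for="pixel", max_freq=256))
x = torch.randn(2, 8, 6, 10, dim)  # (B, T, H, W, D)
assert t_attn(x).shape == x.shape  # attends along T for each (h, w) location
assert s_attn(x).shape == x.shape  # attends over (H, W) within each frame
```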
open_oasis_master/dit.py ADDED
@@ -0,0 +1,310 @@
+ """
+ References:
+ - DiT: https://github.com/facebookresearch/DiT/blob/main/models.py
+ - Diffusion Forcing: https://github.com/buoyancy99/diffusion-forcing/blob/main/algorithms/diffusion_forcing/models/unet3d.py
+ - Latte: https://github.com/Vchitect/Latte/blob/main/models/latte.py
+ """
+ from typing import Optional
+ import torch
+ from torch import nn
+ from rotary_embedding_torch import RotaryEmbedding
+ from einops import rearrange
+ from embeddings import Timesteps, TimestepEmbedding
+ from attention import SpatialAxialAttention, TemporalAxialAttention
+ from timm.models.vision_transformer import Mlp
+ from timm.layers.helpers import to_2tuple
+ import math
+
+ def modulate(x, shift, scale):
+     fixed_dims = [1] * len(shift.shape[1:])
+     shift = shift.repeat(x.shape[0] // shift.shape[0], *fixed_dims)
+     scale = scale.repeat(x.shape[0] // scale.shape[0], *fixed_dims)
+     while shift.dim() < x.dim():
+         shift = shift.unsqueeze(-2)
+         scale = scale.unsqueeze(-2)
+     return x * (1 + scale) + shift
+
+ def gate(x, g):
+     fixed_dims = [1] * len(g.shape[1:])
+     g = g.repeat(x.shape[0] // g.shape[0], *fixed_dims)
+     while g.dim() < x.dim():
+         g = g.unsqueeze(-2)
+     return g * x
+
+ class PatchEmbed(nn.Module):
+     """2D Image to Patch Embedding"""
+
+     def __init__(
+         self,
+         img_height=256,
+         img_width=256,
+         patch_size=16,
+         in_chans=3,
+         embed_dim=768,
+         norm_layer=None,
+         flatten=True,
+     ):
+         super().__init__()
+         img_size = (img_height, img_width)
+         patch_size = to_2tuple(patch_size)
+         self.img_size = img_size
+         self.patch_size = patch_size
+         self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+         self.num_patches = self.grid_size[0] * self.grid_size[1]
+         self.flatten = flatten
+
+         self.proj = nn.Conv2d(
+             in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
+         )
+         self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+
+     def forward(self, x, random_sample=False):
+         B, C, H, W = x.shape
+         assert random_sample or (
+             H == self.img_size[0] and W == self.img_size[1]
+         ), f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+         x = self.proj(x)
+         if self.flatten:
+             x = rearrange(x, "B C H W -> B (H W) C")
+         else:
+             x = rearrange(x, "B C H W -> B H W C")
+         x = self.norm(x)
+         return x
+
+ class TimestepEmbedder(nn.Module):
+     """
+     Embeds scalar timesteps into vector representations.
+     """
+     def __init__(self, hidden_size, frequency_embedding_size=256):
+         super().__init__()
+         self.mlp = nn.Sequential(
+             nn.Linear(frequency_embedding_size, hidden_size, bias=True),  # hidden_size is diffusion model hidden size
+             nn.SiLU(),
+             nn.Linear(hidden_size, hidden_size, bias=True),
+         )
+         self.frequency_embedding_size = frequency_embedding_size
+
+     @staticmethod
+     def timestep_embedding(t, dim, max_period=10000):
+         """
+         Create sinusoidal timestep embeddings.
+         :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional.
+         :param dim: the dimension of the output.
+         :param max_period: controls the minimum frequency of the embeddings.
+         :return: an (N, D) Tensor of positional embeddings.
+         """
+         # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
+         half = dim // 2
+         freqs = torch.exp(
+             -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+         ).to(device=t.device)
+         args = t[:, None].float() * freqs[None]
+         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+         if dim % 2:
+             embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+         return embedding
+
+     def forward(self, t):
+         t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
+         t_emb = self.mlp(t_freq)
+         return t_emb
+
+ class FinalLayer(nn.Module):
+     """
+     The final layer of DiT.
+     """
+     def __init__(self, hidden_size, patch_size, out_channels):
+         super().__init__()
+         self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+         self.adaLN_modulation = nn.Sequential(
+             nn.SiLU(),
+             nn.Linear(hidden_size, 2 * hidden_size, bias=True)
+         )
+
+     def forward(self, x, c):
+         shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
+         x = modulate(self.norm_final(x), shift, scale)
+         x = self.linear(x)
+         return x
+
+ class SpatioTemporalDiTBlock(nn.Module):
+     def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, is_causal=True, spatial_rotary_emb: Optional[RotaryEmbedding] = None, temporal_rotary_emb: Optional[RotaryEmbedding] = None):
+         super().__init__()
+         self.is_causal = is_causal
+         mlp_hidden_dim = int(hidden_size * mlp_ratio)
+         approx_gelu = lambda: nn.GELU(approximate="tanh")
+
+         self.s_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.s_attn = SpatialAxialAttention(hidden_size, heads=num_heads, dim_head=hidden_size // num_heads, rotary_emb=spatial_rotary_emb)
+         self.s_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.s_mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+         self.s_adaLN_modulation = nn.Sequential(
+             nn.SiLU(),
+             nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+         )
+
+         self.t_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.t_attn = TemporalAxialAttention(hidden_size, heads=num_heads, dim_head=hidden_size // num_heads, is_causal=is_causal, rotary_emb=temporal_rotary_emb)
+         self.t_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+         self.t_mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
+         self.t_adaLN_modulation = nn.Sequential(
+             nn.SiLU(),
+             nn.Linear(hidden_size, 6 * hidden_size, bias=True)
+         )
+
+     def forward(self, x, c):
+         B, T, H, W, D = x.shape
+
+         # spatial block
+         s_shift_msa, s_scale_msa, s_gate_msa, s_shift_mlp, s_scale_mlp, s_gate_mlp = self.s_adaLN_modulation(c).chunk(6, dim=-1)
+         x = x + gate(self.s_attn(modulate(self.s_norm1(x), s_shift_msa, s_scale_msa)), s_gate_msa)
+         x = x + gate(self.s_mlp(modulate(self.s_norm2(x), s_shift_mlp, s_scale_mlp)), s_gate_mlp)
+
+         # temporal block
+         t_shift_msa, t_scale_msa, t_gate_msa, t_shift_mlp, t_scale_mlp, t_gate_mlp = self.t_adaLN_modulation(c).chunk(6, dim=-1)
+         x = x + gate(self.t_attn(modulate(self.t_norm1(x), t_shift_msa, t_scale_msa)), t_gate_msa)
+         x = x + gate(self.t_mlp(modulate(self.t_norm2(x), t_shift_mlp, t_scale_mlp)), t_gate_mlp)
+
+         return x
+
+ class DiT(nn.Module):
+     """
+     Diffusion model with a Transformer backbone.
+     """
+     def __init__(
+         self,
+         input_h=18,
+         input_w=32,
+         patch_size=2,
+         in_channels=16,
+         hidden_size=1024,
+         depth=12,
+         num_heads=16,
+         mlp_ratio=4.0,
+         external_cond_dim=25,
+         max_frames=32,
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = in_channels
+         self.patch_size = patch_size
+         self.num_heads = num_heads
+         self.max_frames = max_frames
+
+         self.x_embedder = PatchEmbed(input_h, input_w, patch_size, in_channels, hidden_size, flatten=False)
+         self.t_embedder = TimestepEmbedder(hidden_size)
+         frame_h, frame_w = self.x_embedder.grid_size
+
+         self.spatial_rotary_emb = RotaryEmbedding(dim=hidden_size // num_heads // 2, freqs_for="pixel", max_freq=256)
+         self.temporal_rotary_emb = RotaryEmbedding(dim=hidden_size // num_heads)
+         self.external_cond = nn.Linear(external_cond_dim, hidden_size) if external_cond_dim > 0 else nn.Identity()
+
+         self.blocks = nn.ModuleList(
+             [
+                 SpatioTemporalDiTBlock(
+                     hidden_size,
+                     num_heads,
+                     mlp_ratio=mlp_ratio,
+                     is_causal=True,
+                     spatial_rotary_emb=self.spatial_rotary_emb,
+                     temporal_rotary_emb=self.temporal_rotary_emb,
+                 )
+                 for _ in range(depth)
+             ]
+         )
+
+         self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
+         self.initialize_weights()
+
+     def initialize_weights(self):
+         # Initialize transformer layers:
+         def _basic_init(module):
+             if isinstance(module, nn.Linear):
+                 torch.nn.init.xavier_uniform_(module.weight)
+                 if module.bias is not None:
+                     nn.init.constant_(module.bias, 0)
+         self.apply(_basic_init)
+
+         # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
+         w = self.x_embedder.proj.weight.data
+         nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+         nn.init.constant_(self.x_embedder.proj.bias, 0)
+
+         # Initialize timestep embedding MLP:
+         nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
+         nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
+
+         # Zero-out adaLN modulation layers in DiT blocks:
+         for block in self.blocks:
+             nn.init.constant_(block.s_adaLN_modulation[-1].weight, 0)
+             nn.init.constant_(block.s_adaLN_modulation[-1].bias, 0)
+             nn.init.constant_(block.t_adaLN_modulation[-1].weight, 0)
+             nn.init.constant_(block.t_adaLN_modulation[-1].bias, 0)
+
+         # Zero-out output layers:
+         nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
+         nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
+         nn.init.constant_(self.final_layer.linear.weight, 0)
+         nn.init.constant_(self.final_layer.linear.bias, 0)
+
+     def unpatchify(self, x):
+         """
+         x: (N, H, W, patch_size**2 * C)
+         imgs: (N, C, H * patch_size, W * patch_size)
+         """
+         c = self.out_channels
+         p = self.x_embedder.patch_size[0]
+         h = x.shape[1]
+         w = x.shape[2]
+
+         x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
+         x = torch.einsum('nhwpqc->nchpwq', x)
+         imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
+         return imgs
+
+     def forward(self, x, t, external_cond=None):
+         """
+         Forward pass of DiT.
+         x: (B, T, C, H, W) tensor of spatial inputs (images or latent representations of images)
+         t: (B, T) tensor of diffusion timesteps
+         """
+         B, T, C, H, W = x.shape
+
+         # add spatial embeddings
+         x = rearrange(x, "b t c h w -> (b t) c h w")
+         x = self.x_embedder(x)  # (B*T, C, H, W) -> (B*T, H/2, W/2, D), C = 16, D = d_model
+         # restore shape
+         x = rearrange(x, "(b t) h w d -> b t h w d", t=T)
+         # embed noise steps
+         t = rearrange(t, "b t -> (b t)")
+         c = self.t_embedder(t)  # (N, D)
+         c = rearrange(c, "(b t) d -> b t d", t=T)
+         if torch.is_tensor(external_cond):
+             c += self.external_cond(external_cond)
+         for block in self.blocks:
+             x = block(x, c)  # (N, T, H, W, D)
+         x = self.final_layer(x, c)  # (N, T, H, W, patch_size ** 2 * out_channels)
+         # unpatchify
+         x = rearrange(x, "b t h w d -> (b t) h w d")
+         x = self.unpatchify(x)  # (N, out_channels, H, W)
+         x = rearrange(x, "(b t) c h w -> b t c h w", t=T)
+
+         return x
+
+ def DiT_S_2():
+     return DiT(
+         patch_size=2,
+         hidden_size=1024,
+         depth=16,
+         num_heads=16,
+     )
+
+ DiT_models = {
+     "DiT-S/2": DiT_S_2
+ }
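As a quick sanity check of the shapes involved (a hypothetical smoke test, not part of the release): the model takes `(B, T, C, H, W)` latents with `C=16` on the default `18x32` grid, a `(B, T)` tensor of per-frame noise levels, and optional `(B, T, 25)` action conditioning; the sampling script interprets the output, which has the same shape as the input, as a v-prediction.

```python
import torch
from dit import DiT_models

model = DiT_models["DiT-S/2"]()
x = torch.randn(1, 4, 16, 18, 32)   # (B, T, C, H, W) latents
t = torch.randint(0, 1000, (1, 4))  # per-frame diffusion timesteps
actions = torch.randn(1, 4, 25)     # external_cond_dim=25
v = model(x, t, external_cond=actions)
assert v.shape == x.shape
```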
open_oasis_master/embeddings.py ADDED
@@ -0,0 +1,103 @@
+ """
+ Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py
+ """
+
+ from typing import Optional
+ import math
+ import torch
+ from torch import nn
+
+ # pylint: disable=unused-import
+ from diffusers.models.embeddings import TimestepEmbedding
+
+
+ class Timesteps(nn.Module):
+     def __init__(
+         self,
+         num_channels: int,
+         flip_sin_to_cos: bool = True,
+         downscale_freq_shift: float = 0,
+     ):
+         super().__init__()
+         self.num_channels = num_channels
+         self.flip_sin_to_cos = flip_sin_to_cos
+         self.downscale_freq_shift = downscale_freq_shift
+
+     def forward(self, timesteps):
+         t_emb = get_timestep_embedding(
+             timesteps,
+             self.num_channels,
+             flip_sin_to_cos=self.flip_sin_to_cos,
+             downscale_freq_shift=self.downscale_freq_shift,
+         )
+         return t_emb
+
+ class Positions2d(nn.Module):
+     def __init__(
+         self,
+         num_channels: int,
+         flip_sin_to_cos: bool = True,
+         downscale_freq_shift: float = 0,
+     ):
+         super().__init__()
+         self.num_channels = num_channels
+         self.flip_sin_to_cos = flip_sin_to_cos
+         self.downscale_freq_shift = downscale_freq_shift
+
+     def forward(self, grid):
+         h_emb = get_timestep_embedding(
+             grid[0],
+             self.num_channels // 2,
+             flip_sin_to_cos=self.flip_sin_to_cos,
+             downscale_freq_shift=self.downscale_freq_shift,
+         )
+         w_emb = get_timestep_embedding(
+             grid[1],
+             self.num_channels // 2,
+             flip_sin_to_cos=self.flip_sin_to_cos,
+             downscale_freq_shift=self.downscale_freq_shift,
+         )
+         emb = torch.cat((h_emb, w_emb), dim=-1)
+         return emb
+
+
+ def get_timestep_embedding(
+     timesteps: torch.Tensor,
+     embedding_dim: int,
+     flip_sin_to_cos: bool = False,
+     downscale_freq_shift: float = 1,
+     scale: float = 1,
+     max_period: int = 10000,
+ ):
+     """
+     This matches the implementation in Denoising Diffusion Probabilistic Models: create sinusoidal timestep embeddings.
+
+     :param timesteps: a 1-D or 2-D Tensor of N indices, one per batch element. These may be fractional.
+     :param embedding_dim: the dimension of the output.
+     :param max_period: controls the minimum frequency of the embeddings.
+     :return: an [N x dim] or [N x M x dim] Tensor of positional embeddings.
+     """
+     if len(timesteps.shape) not in [1, 2]:
+         raise ValueError("Timesteps should be a 1D or 2D tensor")
+
+     half_dim = embedding_dim // 2
+     exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
+     exponent = exponent / (half_dim - downscale_freq_shift)
+
+     emb = torch.exp(exponent)
+     emb = timesteps[..., None].float() * emb
+
+     # scale embeddings
+     emb = scale * emb
+
+     # concat sine and cosine embeddings
+     emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+     # flip sine and cosine embeddings
+     if flip_sin_to_cos:
+         emb = torch.cat([emb[..., half_dim:], emb[..., :half_dim]], dim=-1)
+
+     # zero pad
+     if embedding_dim % 2 == 1:
+         emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+     return emb
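For intuition, `get_timestep_embedding` maps each scalar position to a vector of sinusoids, and `Positions2d` builds a 2D embedding by concatenating per-axis halves. A small sketch of the shapes (arbitrary sizes):

```python
import torch
from embeddings import get_timestep_embedding, Positions2d

emb = get_timestep_embedding(torch.arange(8), embedding_dim=32)
assert emb.shape == (8, 32)  # one 32-dim sinusoidal vector per position

# Positions2d embeds each grid axis separately and concatenates the halves
grid = torch.meshgrid(torch.arange(4), torch.arange(6), indexing="ij")
pos = Positions2d(32)(grid)
assert pos.shape == (4, 6, 32)  # (H, W, D): half from rows, half from columns
```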
open_oasis_master/generate.py ADDED
@@ -0,0 +1,119 @@
+ """
+ References:
+ - Diffusion Forcing: https://github.com/buoyancy99/diffusion-forcing
+ """
+ import torch
+ from dit import DiT_models
+ from vae import VAE_models
+ from torchvision.io import read_video, write_video
+ from utils import one_hot_actions, sigmoid_beta_schedule
+ from tqdm import tqdm
+ from einops import rearrange
+ from torch import autocast
+
+ assert torch.cuda.is_available()
+ device = "cuda:0"
+
+ # load DiT checkpoint
+ ckpt = torch.load("oasis500m.pt")
+ model = DiT_models["DiT-S/2"]()
+ model.load_state_dict(ckpt, strict=False)
+ model = model.to(device).eval()
+
+ # load VAE checkpoint
+ vae_ckpt = torch.load("vit-l-20.pt")
+ vae = VAE_models["vit-l-20-shallow-encoder"]()
+ vae.load_state_dict(vae_ckpt)
+ vae = vae.to(device).eval()
+
+ # sampling params
+ B = 1
+ total_frames = 32
+ max_noise_level = 1000
+ ddim_noise_steps = 100
+ noise_range = torch.linspace(-1, max_noise_level - 1, ddim_noise_steps + 1)
+ noise_abs_max = 20
+ ctx_max_noise_idx = ddim_noise_steps // 10 * 3
+
+ # get input video
+ video_id = "snippy-chartreuse-mastiff-f79998db196d-20220401-224517.chunk_001"
+ mp4_path = f"sample_data/{video_id}.mp4"
+ actions_path = f"sample_data/{video_id}.actions.pt"
+ video = read_video(mp4_path, pts_unit="sec")[0].float() / 255
+ actions = one_hot_actions(torch.load(actions_path))
+ offset = 100
+ video = video[offset:offset + total_frames].unsqueeze(0)
+ actions = actions[offset:offset + total_frames].unsqueeze(0)
+
+ # sampling inputs
+ n_prompt_frames = 1
+ x = video[:, :n_prompt_frames]
+ x = x.to(device)
+ actions = actions.to(device)
+
+ # vae encoding
+ scaling_factor = 0.07843137255
+ x = rearrange(x, "b t h w c -> (b t) c h w")
+ H, W = x.shape[-2:]
+ with torch.no_grad():
+     x = vae.encode(x * 2 - 1).mean * scaling_factor
+ x = rearrange(x, "(b t) (h w) c -> b t c h w", t=n_prompt_frames, h=H // vae.patch_size, w=W // vae.patch_size)
+
+ # get alphas
+ betas = sigmoid_beta_schedule(max_noise_level).to(device)
+ alphas = 1.0 - betas
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
+ alphas_cumprod = rearrange(alphas_cumprod, "T -> T 1 1 1")
+
+ # sampling loop
+ for i in tqdm(range(n_prompt_frames, total_frames)):
+     chunk = torch.randn((B, 1, *x.shape[-3:]), device=device)
+     chunk = torch.clamp(chunk, -noise_abs_max, +noise_abs_max)
+     x = torch.cat([x, chunk], dim=1)
+     start_frame = max(0, i + 1 - model.max_frames)
+
+     for noise_idx in reversed(range(1, ddim_noise_steps + 1)):
+         # set up noise values
+         ctx_noise_idx = min(noise_idx, ctx_max_noise_idx)
+         t_ctx = torch.full((B, i), noise_range[ctx_noise_idx], dtype=torch.long, device=device)
+         t = torch.full((B, 1), noise_range[noise_idx], dtype=torch.long, device=device)
+         t_next = torch.full((B, 1), noise_range[noise_idx - 1], dtype=torch.long, device=device)
+         t_next = torch.where(t_next < 0, t, t_next)
+         t = torch.cat([t_ctx, t], dim=1)
+         t_next = torch.cat([t_ctx, t_next], dim=1)
+
+         # sliding window
+         x_curr = x.clone()
+         x_curr = x_curr[:, start_frame:]
+         t = t[:, start_frame:]
+         t_next = t_next[:, start_frame:]
+
+         # add some noise to the context
+         ctx_noise = torch.randn_like(x_curr[:, :-1])
+         ctx_noise = torch.clamp(ctx_noise, -noise_abs_max, +noise_abs_max)
+         x_curr[:, :-1] = alphas_cumprod[t[:, :-1]].sqrt() * x_curr[:, :-1] + (1 - alphas_cumprod[t[:, :-1]]).sqrt() * ctx_noise
+
+         # get model predictions
+         with torch.no_grad():
+             with autocast("cuda", dtype=torch.half):
+                 v = model(x_curr, t, actions[:, start_frame : i + 1])
+
+         x_start = alphas_cumprod[t].sqrt() * x_curr - (1 - alphas_cumprod[t]).sqrt() * v
+         x_noise = ((1 / alphas_cumprod[t]).sqrt() * x_curr - x_start) / (1 / alphas_cumprod[t] - 1).sqrt()
+
+         # get frame prediction
+         x_pred = alphas_cumprod[t_next].sqrt() * x_start + x_noise * (1 - alphas_cumprod[t_next]).sqrt()
+         x[:, -1:] = x_pred[:, -1:]
+
+ # vae decoding
+ x = rearrange(x, "b t c h w -> (b t) (h w) c")
+ with torch.no_grad():
+     x = (vae.decode(x / scaling_factor) + 1) / 2
+ x = rearrange(x, "(b t) c h w -> b t h w c", t=total_frames)
+
+ # save video
+ x = torch.clamp(x, 0, 1)
+ x = (x * 255).byte()
+ write_video("video.mp4", x[0], fps=20)
+ print("generation saved to video.mp4.")
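The inner loop is a DDIM-style update in the v-parameterization: from the model output `v`, the script recovers clean-latent and noise estimates and re-mixes them at the next (lower) noise level. The same algebra as the loop above, written as a standalone helper for clarity (a sketch; `ac_t` / `ac_next` stand for `alphas_cumprod` gathered at the current and next timesteps):

```python
def ddim_v_step(x_t, v, ac_t, ac_next):
    # v-parameterization: x0 and eps estimates from the model output
    x0_hat = ac_t.sqrt() * x_t - (1 - ac_t).sqrt() * v
    eps_hat = ((1 / ac_t).sqrt() * x_t - x0_hat) / (1 / ac_t - 1).sqrt()
    # deterministic DDIM re-noising to the next noise level
    return ac_next.sqrt() * x0_hat + (1 - ac_next).sqrt() * eps_hat
```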
open_oasis_master/media/arch.png ADDED
open_oasis_master/media/sample_0.gif ADDED

Git LFS Details

  • SHA256: 684d0b42eed5f82d6285dbc46b0c69dbe4661c91fdb92043c3c298c300249574
  • Pointer size: 132 Bytes
  • Size of remote file: 3.15 MB
open_oasis_master/media/sample_1.gif ADDED

Git LFS Details

  • SHA256: d771ac40069b4e7a424d18d7c91c64904e560e5c61cc52f51f67eb6c667c39f9
  • Pointer size: 132 Bytes
  • Size of remote file: 2.95 MB
open_oasis_master/media/thumb.png ADDED
open_oasis_master/requirements.txt ADDED
@@ -0,0 +1,31 @@
+ av==13.1.0
+ certifi==2024.8.30
+ charset-normalizer==3.4.0
+ diffusers==0.31.0
+ einops==0.8.0
+ filelock==3.13.1
+ fsspec==2024.2.0
+ huggingface-hub==0.26.2
+ idna==3.10
+ importlib_metadata==8.5.0
+ Jinja2==3.1.3
+ MarkupSafe==2.1.5
+ mpmath==1.3.0
+ networkx==3.2.1
+ numpy==1.26.3
+ packaging==24.1
+ pillow==10.2.0
+ PyYAML==6.0.2
+ regex==2024.9.11
+ requests==2.32.3
+ safetensors==0.4.5
+ sympy==1.13.1
+ timm==1.0.11
+ torch==2.5.1
+ torchaudio==2.5.1
+ torchvision==0.20.1
+ tqdm==4.66.6
+ triton==3.1.0
+ typing_extensions==4.9.0
+ urllib3==2.2.3
+ zipp==3.20.2
open_oasis_master/rotary_embedding_torch.py ADDED
@@ -0,0 +1,316 @@
+ """
+ Adapted from https://github.com/lucidrains/rotary-embedding-torch/blob/main/rotary_embedding_torch/rotary_embedding_torch.py
+ """
+
+ from __future__ import annotations
+ from math import pi, log
+
+ import torch
+ from torch.nn import Module, ModuleList
+ from torch.amp import autocast
+ from torch import nn, einsum, broadcast_tensors, Tensor
+
+ from einops import rearrange, repeat
+
+ from typing import Literal
+
+ # helper functions
+
+ def exists(val):
+     return val is not None
+
+ def default(val, d):
+     return val if exists(val) else d
+
+ # broadcat, as tortoise-tts was using it
+
+ def broadcat(tensors, dim = -1):
+     broadcasted_tensors = broadcast_tensors(*tensors)
+     return torch.cat(broadcasted_tensors, dim = dim)
+
+ # rotary embedding helper functions
+
+ def rotate_half(x):
+     x = rearrange(x, '... (d r) -> ... d r', r = 2)
+     x1, x2 = x.unbind(dim = -1)
+     x = torch.stack((-x2, x1), dim = -1)
+     return rearrange(x, '... d r -> ... (d r)')
+
+ @autocast('cuda', enabled = False)
+ def apply_rotary_emb(freqs, t, start_index = 0, scale = 1., seq_dim = -2):
+     dtype = t.dtype
+
+     if t.ndim == 3:
+         seq_len = t.shape[seq_dim]
+         freqs = freqs[-seq_len:]
+
+     rot_dim = freqs.shape[-1]
+     end_index = start_index + rot_dim
+
+     assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
+
+     # Split t into three parts: left, middle (to be transformed), and right
+     t_left = t[..., :start_index]
+     t_middle = t[..., start_index:end_index]
+     t_right = t[..., end_index:]
+
+     # Apply rotary embeddings without modifying t in place
+     t_transformed = (t_middle * freqs.cos() * scale) + (rotate_half(t_middle) * freqs.sin() * scale)
+
+     out = torch.cat((t_left, t_transformed, t_right), dim=-1)
+
+     return out.type(dtype)
+
+ # learned rotation helpers
+
+ def apply_learned_rotations(rotations, t, start_index = 0, freq_ranges = None):
+     if exists(freq_ranges):
+         rotations = einsum('..., f -> ... f', rotations, freq_ranges)
+         rotations = rearrange(rotations, '... r f -> ... (r f)')
+
+     rotations = repeat(rotations, '... n -> ... (n r)', r = 2)
+     return apply_rotary_emb(rotations, t, start_index = start_index)
+
+ # classes
+
+ class RotaryEmbedding(Module):
+     def __init__(
+         self,
+         dim,
+         custom_freqs: Tensor | None = None,
+         freqs_for: Literal['lang', 'pixel', 'spacetime', 'constant'] = 'lang',
+         theta = 10000,
+         max_freq = 10,
+         num_freqs = 1,
+         learned_freq = False,
+         use_xpos = False,
+         xpos_scale_base = 512,
+         interpolate_factor = 1.,
+         theta_rescale_factor = 1.,
+         seq_before_head_dim = False,
+         cache_if_possible = True,
+         cache_max_seq_len = 8192
+     ):
+         super().__init__()
+         # proposed by reddit user bloc97, to rescale rotary embeddings to longer sequence length without fine-tuning
+         # has some connection to NTK literature
+         # https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+
+         theta *= theta_rescale_factor ** (dim / (dim - 2))
+
+         self.freqs_for = freqs_for
+
+         if exists(custom_freqs):
+             freqs = custom_freqs
+         elif freqs_for == 'lang':
+             freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
+         elif freqs_for == 'pixel':
+             freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
+         elif freqs_for == 'spacetime':
+             time_freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
+             freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
+         elif freqs_for == 'constant':
+             freqs = torch.ones(num_freqs).float()
+
+         if freqs_for == 'spacetime':
+             self.time_freqs = nn.Parameter(time_freqs, requires_grad = learned_freq)
+         self.freqs = nn.Parameter(freqs, requires_grad = learned_freq)
+
+         self.cache_if_possible = cache_if_possible
+         self.cache_max_seq_len = cache_max_seq_len
+
+         self.register_buffer('cached_freqs', torch.zeros(cache_max_seq_len, dim), persistent = False)
+         self.register_buffer('cached_freqs_seq_len', torch.tensor(0), persistent = False)
+
+         self.learned_freq = learned_freq
+
+         # dummy for device
+
+         self.register_buffer('dummy', torch.tensor(0), persistent = False)
+
+         # default sequence dimension
+
+         self.seq_before_head_dim = seq_before_head_dim
+         self.default_seq_dim = -3 if seq_before_head_dim else -2
+
+         # interpolation factors
+
+         assert interpolate_factor >= 1.
+         self.interpolate_factor = interpolate_factor
+
+         # xpos
+
+         self.use_xpos = use_xpos
+
+         if not use_xpos:
+             return
+
+         scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
+         self.scale_base = xpos_scale_base
+
+         self.register_buffer('scale', scale, persistent = False)
+         self.register_buffer('cached_scales', torch.zeros(cache_max_seq_len, dim), persistent = False)
+         self.register_buffer('cached_scales_seq_len', torch.tensor(0), persistent = False)
+
+         # add apply_rotary_emb as static method
+
+         self.apply_rotary_emb = staticmethod(apply_rotary_emb)
+
+     @property
+     def device(self):
+         return self.dummy.device
+
+     def get_seq_pos(self, seq_len, device, dtype, offset = 0):
+         return (torch.arange(seq_len, device = device, dtype = dtype) + offset) / self.interpolate_factor
+
+     def rotate_queries_or_keys(self, t, freqs, seq_dim = None, offset = 0, scale = None):
+         seq_dim = default(seq_dim, self.default_seq_dim)
+
+         assert not self.use_xpos or exists(scale), 'you must use `.rotate_queries_and_keys` method instead and pass in both queries and keys, for length extrapolatable rotary embeddings'
+
+         device, dtype, seq_len = t.device, t.dtype, t.shape[seq_dim]
+
+         seq = self.get_seq_pos(seq_len, device = device, dtype = dtype, offset = offset)
+
+         seq_freqs = self.forward(seq, freqs, seq_len = seq_len, offset = offset)
+
+         if seq_dim == -3:
+             seq_freqs = rearrange(seq_freqs, 'n d -> n 1 d')
+
+         return apply_rotary_emb(seq_freqs, t, scale = default(scale, 1.), seq_dim = seq_dim)
+
+     def rotate_queries_with_cached_keys(self, q, k, seq_dim = None, offset = 0):
+         dtype, device, seq_dim = q.dtype, q.device, default(seq_dim, self.default_seq_dim)
+
+         q_len, k_len = q.shape[seq_dim], k.shape[seq_dim]
+         assert q_len <= k_len
+
+         q_scale = k_scale = 1.
+
+         if self.use_xpos:
+             seq = self.get_seq_pos(k_len, dtype = dtype, device = device)
+
+             q_scale = self.get_scale(seq[-q_len:]).type(dtype)
+             k_scale = self.get_scale(seq).type(dtype)
+
+         rotated_q = self.rotate_queries_or_keys(q, seq_dim = seq_dim, scale = q_scale, offset = k_len - q_len + offset)
+         rotated_k = self.rotate_queries_or_keys(k, seq_dim = seq_dim, scale = k_scale ** -1)
+
+         rotated_q = rotated_q.type(q.dtype)
+         rotated_k = rotated_k.type(k.dtype)
+
+         return rotated_q, rotated_k
+
+     def rotate_queries_and_keys(self, q, k, freqs, seq_dim = None):
+         seq_dim = default(seq_dim, self.default_seq_dim)
+
+         assert self.use_xpos
+         device, dtype, seq_len = q.device, q.dtype, q.shape[seq_dim]
+
+         seq = self.get_seq_pos(seq_len, dtype = dtype, device = device)
+
+         seq_freqs = self.forward(seq, freqs, seq_len = seq_len)
+         scale = self.get_scale(seq, seq_len = seq_len).to(dtype)
+
+         if seq_dim == -3:
+             seq_freqs = rearrange(seq_freqs, 'n d -> n 1 d')
+             scale = rearrange(scale, 'n d -> n 1 d')
+
+         rotated_q = apply_rotary_emb(seq_freqs, q, scale = scale, seq_dim = seq_dim)
+         rotated_k = apply_rotary_emb(seq_freqs, k, scale = scale ** -1, seq_dim = seq_dim)
+
+         rotated_q = rotated_q.type(q.dtype)
+         rotated_k = rotated_k.type(k.dtype)
+
+         return rotated_q, rotated_k
+
+     def get_scale(
+         self,
+         t: Tensor,
+         seq_len: int | None = None,
+         offset = 0
+     ):
+         assert self.use_xpos
+
+         should_cache = (
+             self.cache_if_possible and
+             exists(seq_len) and
+             (offset + seq_len) <= self.cache_max_seq_len
+         )
+
+         if (
+             should_cache and \
+             exists(self.cached_scales) and \
+             (seq_len + offset) <= self.cached_scales_seq_len.item()
+         ):
+             return self.cached_scales[offset:(offset + seq_len)]
+
+         scale = 1.
+         if self.use_xpos:
+             power = (t - len(t) // 2) / self.scale_base
+             scale = self.scale ** rearrange(power, 'n -> n 1')
+             scale = repeat(scale, 'n d -> n (d r)', r = 2)
+
+         if should_cache and offset == 0:
+             self.cached_scales[:seq_len] = scale.detach()
+             self.cached_scales_seq_len.copy_(seq_len)
+
+         return scale
+
+     def get_axial_freqs(self, *dims):
+         Colon = slice(None)
+         all_freqs = []
+
+         for ind, dim in enumerate(dims):
+             # only allow pixel freqs for last two dimensions
+             use_pixel = (self.freqs_for == 'pixel' or self.freqs_for == 'spacetime') and ind >= len(dims) - 2
+             if use_pixel:
+                 pos = torch.linspace(-1, 1, steps = dim, device = self.device)
+             else:
+                 pos = torch.arange(dim, device = self.device)
+
+             if self.freqs_for == 'spacetime' and not use_pixel:
+                 seq_freqs = self.forward(pos, self.time_freqs, seq_len = dim)
+             else:
+                 seq_freqs = self.forward(pos, self.freqs, seq_len = dim)
+
+             all_axis = [None] * len(dims)
+             all_axis[ind] = Colon
+
+             new_axis_slice = (Ellipsis, *all_axis, Colon)
+             all_freqs.append(seq_freqs[new_axis_slice])
+
+         all_freqs = broadcast_tensors(*all_freqs)
+         return torch.cat(all_freqs, dim = -1)
+
+     @autocast('cuda', enabled = False)
+     def forward(
+         self,
+         t: Tensor,
+         freqs: Tensor,
+         seq_len = None,
+         offset = 0
+     ):
+         should_cache = (
+             self.cache_if_possible and
+             not self.learned_freq and
+             exists(seq_len) and
+             self.freqs_for != 'pixel' and
+             (offset + seq_len) <= self.cache_max_seq_len
+         )
+
+         if (
+             should_cache and \
+             exists(self.cached_freqs) and \
+             (offset + seq_len) <= self.cached_freqs_seq_len.item()
+         ):
+             return self.cached_freqs[offset:(offset + seq_len)].detach()
+
+         freqs = einsum('..., f -> ... f', t.type(freqs.dtype), freqs)
+         freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
+
+         if should_cache and offset == 0:
+             self.cached_freqs[:seq_len] = freqs.detach()
+             self.cached_freqs_seq_len.copy_(seq_len)
+
+         return freqs
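In this repo the class is used in two ways: `rotate_queries_or_keys` for 1-D (temporal) sequences and `get_axial_freqs` plus `apply_rotary_emb` for 2-D (spatial) grids. A small sketch with arbitrary sizes:

```python
import torch
from rotary_embedding_torch import RotaryEmbedding, apply_rotary_emb

# 1-D: rotate a (batch, heads, seq, dim_head) tensor along the sequence axis
rot = RotaryEmbedding(dim=16)
q = torch.randn(2, 4, 10, 16)
q_rot = rot.rotate_queries_or_keys(q, rot.freqs)
assert q_rot.shape == q.shape

# 2-D: axial frequencies for an (H, W) grid, applied to (..., H, W, dim_head)
rot2d = RotaryEmbedding(dim=8, freqs_for="pixel", max_freq=256)
freqs = rot2d.get_axial_freqs(6, 10)  # (6, 10, 16)
k = torch.randn(2, 4, 6, 10, 16)
assert apply_rotary_emb(freqs, k).shape == k.shape
```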
open_oasis_master/sample_data/Player729-f153ac423f61-20210806-224813.chunk_000.actions.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc3ea8894f87e2c2c2387dd32b193f27a8a95009397c32b5fbaf8a6f23608b0c
+ size 230180
open_oasis_master/sample_data/Player729-f153ac423f61-20210806-224813.chunk_000.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9fb1cf3a87be9deca2fec2e946427521a85026ee607cf9281aa87f6df447e4ea
+ size 6818283
open_oasis_master/sample_data/snippy-chartreuse-mastiff-f79998db196d-20220401-224517.chunk_001.actions.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:955929d771293156d3f27d295091a978dcd97fdaa78e3a17395ac90c0403004d
+ size 230308
open_oasis_master/sample_data/snippy-chartreuse-mastiff-f79998db196d-20220401-224517.chunk_001.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:745b0348a014d943f70ccf6ccba17ad260540caba502b312d972235326003ab0
+ size 7109171
open_oasis_master/sample_data/treechop-f153ac423f61-20210916-183423.chunk_000.actions.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:46ae60cc9d3a02df949923c707df4c5cd3f49d279aa6500c81f0ef00c14f7747
+ size 230176
open_oasis_master/sample_data/treechop-f153ac423f61-20210916-183423.chunk_000.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a0ad584df52d7b2636fae5d7a3116f596f25a09ba7d28ff5fc42193105605d92
+ size 8716515
open_oasis_master/utils.py ADDED
@@ -0,0 +1,82 @@
+ """
+ Adapted from https://github.com/buoyancy99/diffusion-forcing/blob/main/algorithms/diffusion_forcing/models/utils.py
+ Action format derived from VPT https://github.com/openai/Video-Pre-Training
+ """
+ import torch
+ from typing import Mapping, Sequence
+
+
+ def sigmoid_beta_schedule(timesteps, start=-3, end=3, tau=1, clamp_min=1e-5):
+     """
+     Sigmoid schedule proposed in https://arxiv.org/abs/2212.11972 (Figure 8).
+     Better for images > 64x64 when used during training.
+     """
+     steps = timesteps + 1
+     t = torch.linspace(0, timesteps, steps, dtype=torch.float32) / timesteps
+     v_start = torch.tensor(start / tau).sigmoid()
+     v_end = torch.tensor(end / tau).sigmoid()
+     alphas_cumprod = (-((t * (end - start) + start) / tau).sigmoid() + v_end) / (v_end - v_start)
+     alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+     betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
+     return torch.clip(betas, 0, 0.999)
+
+
+ ACTION_KEYS = [
+     "inventory",
+     "ESC",
+     "hotbar.1",
+     "hotbar.2",
+     "hotbar.3",
+     "hotbar.4",
+     "hotbar.5",
+     "hotbar.6",
+     "hotbar.7",
+     "hotbar.8",
+     "hotbar.9",
+     "forward",
+     "back",
+     "left",
+     "right",
+     "cameraX",
+     "cameraY",
+     "jump",
+     "sneak",
+     "sprint",
+     "swapHands",
+     "attack",
+     "use",
+     "pickItem",
+     "drop",
+ ]
+
+ def one_hot_actions(actions: Sequence[Mapping[str, int]]) -> torch.Tensor:
+     actions_one_hot = torch.zeros(len(actions), len(ACTION_KEYS))
+     for i, current_actions in enumerate(actions):
+         for j, action_key in enumerate(ACTION_KEYS):
+             if action_key.startswith("camera"):
+                 if action_key == "cameraX":
+                     value = current_actions["camera"][0]
+                 elif action_key == "cameraY":
+                     value = current_actions["camera"][1]
+                 else:
+                     raise ValueError(f"Unknown camera action key: {action_key}")
+                 # NOTE these numbers are specific to the camera quantization used in
+                 # https://github.com/etched-ai/dreamcraft/blob/216e952f795bb3da598639a109bcdba4d2067b69/spark/preprocess_vpt_to_videos_actions.py#L312
+                 # see method `compress_mouse`
+                 max_val = 20
+                 bin_size = 0.5
+                 num_buckets = int(max_val / bin_size)
+                 value = (value - num_buckets) / num_buckets
+                 assert -1 - 1e-3 <= value <= 1 + 1e-3, f"Camera action value must be in [-1, 1], got {value}"
+             else:
+                 value = current_actions[action_key]
+                 assert 0 <= value <= 1, f"Action value must be in [0, 1], got {value}"
+             actions_one_hot[i, j] = value
+
+     return actions_one_hot
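Each element of the input sequence is a VPT-style action mapping: the button keys are 0/1 flags, while `camera` carries a pair of quantized bucket indices. A hypothetical example, assuming the quantization described in the comment above (with `num_buckets = 40`, a bucket index of 40 maps to 0.0, i.e. no mouse movement):

```python
from utils import one_hot_actions, ACTION_KEYS

# one action mapping per frame; all buttons off except "forward"
frame_action = {key: 0 for key in ACTION_KEYS if not key.startswith("camera")}
frame_action["forward"] = 1
frame_action["camera"] = [40, 40]  # quantized camera buckets; 40 -> 0.0

actions = one_hot_actions([frame_action] * 8)
assert actions.shape == (8, len(ACTION_KEYS))  # (T, 25)
```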
open_oasis_master/vae.py ADDED
@@ -0,0 +1,381 @@
+ """
+ References:
+ - VQGAN: https://github.com/CompVis/taming-transformers
+ - MAE: https://github.com/facebookresearch/mae
+ """
+ import functools
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange
+ from timm.models.vision_transformer import Mlp
+ from timm.layers import DropPath
+ from rotary_embedding_torch import RotaryEmbedding, apply_rotary_emb
+ from dit import PatchEmbed
+
+ class DiagonalGaussianDistribution(object):
+     def __init__(self, parameters, deterministic=False, dim=1):
+         self.parameters = parameters
+         self.mean, self.logvar = torch.chunk(parameters, 2, dim=dim)
+         if dim == 1:
+             self.dims = [1, 2, 3]
+         elif dim == 2:
+             self.dims = [1, 2]
+         else:
+             raise NotImplementedError
+         self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+         self.deterministic = deterministic
+         self.std = torch.exp(0.5 * self.logvar)
+         self.var = torch.exp(self.logvar)
+         if self.deterministic:
+             self.var = self.std = torch.zeros_like(self.mean).to(
+                 device=self.parameters.device
+             )
+
+     def sample(self):
+         x = self.mean + self.std * torch.randn(self.mean.shape).to(
+             device=self.parameters.device
+         )
+         return x
+
+     def mode(self):
+         return self.mean
+
+ class Attention(nn.Module):
+     def __init__(
+         self,
+         dim,
+         num_heads,
+         frame_height,
+         frame_width,
+         qkv_bias=False,
+         attn_drop=0.0,
+         proj_drop=0.0,
+         is_causal=False,
+     ):
+         super().__init__()
+         self.num_heads = num_heads
+         head_dim = dim // num_heads
+         self.frame_height = frame_height
+         self.frame_width = frame_width
+
+         self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+         self.attn_drop = attn_drop
+         self.proj = nn.Linear(dim, dim)
+         self.proj_drop = nn.Dropout(proj_drop)
+         self.is_causal = is_causal
+
+         rotary_freqs = RotaryEmbedding(
+             dim=head_dim // 4,
+             freqs_for="pixel",
+             max_freq=frame_height * frame_width,
+         ).get_axial_freqs(frame_height, frame_width)
+         self.register_buffer("rotary_freqs", rotary_freqs, persistent=False)
+
+     def forward(self, x):
+         B, N, C = x.shape
+         assert N == self.frame_height * self.frame_width
+
+         qkv = (
+             self.qkv(x)
+             .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+             .permute(2, 0, 3, 1, 4)
+         )
+         q, k, v = (
+             qkv[0],
+             qkv[1],
+             qkv[2],
+         )  # make torchscript happy (cannot use tensor as tuple)
+
+         if self.rotary_freqs is not None:
+             q = rearrange(q, "b h (H W) d -> b h H W d", H=self.frame_height, W=self.frame_width)
+             k = rearrange(k, "b h (H W) d -> b h H W d", H=self.frame_height, W=self.frame_width)
+             q = apply_rotary_emb(self.rotary_freqs, q)
+             k = apply_rotary_emb(self.rotary_freqs, k)
+             q = rearrange(q, "b h H W d -> b h (H W) d")
+             k = rearrange(k, "b h H W d -> b h (H W) d")
+
+         attn = F.scaled_dot_product_attention(
+             q,
+             k,
+             v,
+             dropout_p=self.attn_drop,
+             is_causal=self.is_causal,
+         )
+         x = attn.transpose(1, 2).reshape(B, N, C)
+
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+
+ class AttentionBlock(nn.Module):
+     def __init__(
+         self,
+         dim,
+         num_heads,
+         frame_height,
+         frame_width,
+         mlp_ratio=4.0,
+         qkv_bias=False,
+         drop=0.0,
+         attn_drop=0.0,
+         attn_causal=False,
+         drop_path=0.0,
+         act_layer=nn.GELU,
+         norm_layer=nn.LayerNorm,
+     ):
+         super().__init__()
+         self.norm1 = norm_layer(dim)
+         self.attn = Attention(
+             dim,
+             num_heads,
+             frame_height,
+             frame_width,
+             qkv_bias=qkv_bias,
+             attn_drop=attn_drop,
+             proj_drop=drop,
+             is_causal=attn_causal,
+         )
+         # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+         self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+         self.norm2 = norm_layer(dim)
+         mlp_hidden_dim = int(dim * mlp_ratio)
+         self.mlp = Mlp(
+             in_features=dim,
+             hidden_features=mlp_hidden_dim,
+             act_layer=act_layer,
+             drop=drop,
+         )
+
+     def forward(self, x):
+         x = x + self.drop_path(self.attn(self.norm1(x)))
+         x = x + self.drop_path(self.mlp(self.norm2(x)))
+         return x
+
+
+ class AutoencoderKL(nn.Module):
+     def __init__(
+         self,
+         latent_dim,
+         input_height=256,
+         input_width=256,
+         patch_size=16,
+         enc_dim=768,
+         enc_depth=6,
+         enc_heads=12,
+         dec_dim=768,
+         dec_depth=6,
+         dec_heads=12,
+         mlp_ratio=4.0,
+         norm_layer=functools.partial(nn.LayerNorm, eps=1e-6),
+         use_variational=True,
+         **kwargs,
+     ):
+         super().__init__()
+         self.input_height = input_height
+         self.input_width = input_width
+         self.patch_size = patch_size
+         self.seq_h = input_height // patch_size
+         self.seq_w = input_width // patch_size
+         self.seq_len = self.seq_h * self.seq_w
+         self.patch_dim = 3 * patch_size**2
+
+         self.latent_dim = latent_dim
+         self.enc_dim = enc_dim
+         self.dec_dim = dec_dim
+
+         # patch
+         self.patch_embed = PatchEmbed(input_height, input_width, patch_size, 3, enc_dim)
+
+         # encoder
+         self.encoder = nn.ModuleList(
+             [
+                 AttentionBlock(
+                     enc_dim,
+                     enc_heads,
+                     self.seq_h,
+                     self.seq_w,
+                     mlp_ratio,
+                     qkv_bias=True,
+                     norm_layer=norm_layer,
+                 )
+                 for i in range(enc_depth)
+             ]
+         )
+         self.enc_norm = norm_layer(enc_dim)
+
+         # bottleneck
+         self.use_variational = use_variational
+         mult = 2 if self.use_variational else 1
+         self.quant_conv = nn.Linear(enc_dim, mult * latent_dim)
+         self.post_quant_conv = nn.Linear(latent_dim, dec_dim)
+
+         # decoder
+         self.decoder = nn.ModuleList(
+             [
+                 AttentionBlock(
+                     dec_dim,
+                     dec_heads,
+                     self.seq_h,
+                     self.seq_w,
+                     mlp_ratio,
+                     qkv_bias=True,
+                     norm_layer=norm_layer,
+                 )
+                 for i in range(dec_depth)
+             ]
+         )
+         self.dec_norm = norm_layer(dec_dim)
+         self.predictor = nn.Linear(dec_dim, self.patch_dim)  # decoder to patch
+
+         # initialize this weight first
+         self.initialize_weights()
+
+     def initialize_weights(self):
+         # initialize nn.Linear and nn.LayerNorm
+         self.apply(self._init_weights)
+
+         # initialize patch_embed like nn.Linear (instead of nn.Conv2d)
+         w = self.patch_embed.proj.weight.data
+         nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             # we use xavier_uniform following official JAX ViT:
+             nn.init.xavier_uniform_(m.weight)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0.0)
+         elif isinstance(m, nn.LayerNorm):
+             nn.init.constant_(m.bias, 0.0)
+             nn.init.constant_(m.weight, 1.0)
+
+     def patchify(self, x):
+         # patchify
+         bsz, _, h, w = x.shape
+         x = x.reshape(
+             bsz,
+             3,
+             self.seq_h,
+             self.patch_size,
+             self.seq_w,
+             self.patch_size,
+         ).permute(
+             [0, 1, 3, 5, 2, 4]
+         )  # [b, c, h, p, w, p] --> [b, c, p, p, h, w]
+         x = x.reshape(
+             bsz, self.patch_dim, self.seq_h, self.seq_w
+         )  # --> [b, cxpxp, h, w]
+         x = x.permute([0, 2, 3, 1]).reshape(
+             bsz, self.seq_len, self.patch_dim
+         )  # --> [b, hxw, cxpxp]
+         return x
+
+     def unpatchify(self, x):
+         bsz = x.shape[0]
+         # unpatchify
+         x = x.reshape(bsz, self.seq_h, self.seq_w, self.patch_dim).permute(
+             [0, 3, 1, 2]
+         )  # [b, h, w, cxpxp] --> [b, cxpxp, h, w]
+         x = x.reshape(
+             bsz,
+             3,
+             self.patch_size,
+             self.patch_size,
+             self.seq_h,
+             self.seq_w,
+         ).permute(
+             [0, 1, 4, 2, 5, 3]
+         )  # [b, c, p, p, h, w] --> [b, c, h, p, w, p]
+         x = x.reshape(
+             bsz,
+             3,
+             self.input_height,
+             self.input_width,
+         )  # [b, c, hxp, wxp]
+         return x
+
+     def encode(self, x):
+         # patchify
+         x = self.patch_embed(x)
+
+         # encoder
+         for blk in self.encoder:
+             x = blk(x)
+         x = self.enc_norm(x)
+
+         # bottleneck
+         moments = self.quant_conv(x)
+         if not self.use_variational:
+             moments = torch.cat((moments, torch.zeros_like(moments)), 2)
+         posterior = DiagonalGaussianDistribution(
+             moments, deterministic=(not self.use_variational), dim=2
+         )
+         return posterior
+
+     def decode(self, z):
+         # bottleneck
+         z = self.post_quant_conv(z)
+
+         # decoder
+         for blk in self.decoder:
+             z = blk(z)
+         z = self.dec_norm(z)
+
+         # predictor
+         z = self.predictor(z)
+
+         # unpatchify
+         dec = self.unpatchify(z)
+         return dec
+
+     def autoencode(self, input, sample_posterior=True):
+         posterior = self.encode(input)
+         if self.use_variational and sample_posterior:
+             z = posterior.sample()
+         else:
+             z = posterior.mode()
+         dec = self.decode(z)
+         return dec, posterior, z
+
+     def get_input(self, batch, k):
+         x = batch[k]
+         if len(x.shape) == 3:
+             x = x[..., None]
+         x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+         return x
+
+     def forward(self, inputs, labels, split="train"):
+         rec, post, latent = self.autoencode(inputs)
+         return rec, post, latent
+
+     def get_last_layer(self):
+         return self.predictor.weight
+
+ def ViT_L_20_Shallow_Encoder(**kwargs):
+     if "latent_dim" in kwargs:
+         latent_dim = kwargs.pop("latent_dim")
+     else:
+         latent_dim = 16
+     return AutoencoderKL(
+         latent_dim=latent_dim,
+         patch_size=20,
+         enc_dim=1024,
+         enc_depth=6,
+         enc_heads=16,
+         dec_dim=1024,
+         dec_depth=12,
+         dec_heads=16,
+         input_height=360,
+         input_width=640,
+         **kwargs,
+     )
+
+ VAE_models = {
+     "vit-l-20-shallow-encoder": ViT_L_20_Shallow_Encoder,
+ }
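Putting it together: with the `vit-l-20-shallow-encoder` config, the encoder maps a 360x640 RGB frame to a sequence of `(360/20) * (640/20) = 576` tokens of 16-dim latents, and the decoder inverts the mapping. A minimal round-trip sketch with random weights (shapes only; real use loads the `vit-l-20.pt` checkpoint as in `generate.py`):

```python
import torch
from vae import VAE_models

vae = VAE_models["vit-l-20-shallow-encoder"]().eval()
img = torch.randn(1, 3, 360, 640)  # (B, 3, H, W), expected in [-1, 1]
with torch.no_grad():
    posterior = vae.encode(img)
    z = posterior.mode()           # (1, 576, 16): (H/20)*(W/20) latent tokens
    rec = vae.decode(z)            # (1, 3, 360, 640)
assert z.shape == (1, 18 * 32, 16) and rec.shape == img.shape
```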