HikariDawn777 committed on
Commit 59b2a81 · 1 Parent(s): aa505db

feat: initial push

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitignore +49 -0
  2. __assets__/0.jpg +0 -0
  3. __assets__/156.jpg +0 -0
  4. __assets__/274.jpg +0 -0
  5. __assets__/375.jpg +0 -0
  6. __assets__/551.jpg +0 -0
  7. __assets__/91.jpg +0 -0
  8. __assets__/ThisThat_logo.png +0 -0
  9. app.py +475 -4
  10. config/accelerate_config.json +18 -0
  11. config/flowformer_config.py +78 -0
  12. config/train_image2video.yaml +78 -0
  13. config/train_image2video_controlnet.yaml +101 -0
  14. curation_pipeline/add_lang_info.py +38 -0
  15. curation_pipeline/match_dataset_v1.py +117 -0
  16. curation_pipeline/match_dataset_v2.py +137 -0
  17. curation_pipeline/prepare_bridge_csv.py +69 -0
  18. curation_pipeline/prepare_bridge_jsonl.py +47 -0
  19. curation_pipeline/prepare_bridge_v1.py +132 -0
  20. curation_pipeline/prepare_bridge_v2.py +139 -0
  21. curation_pipeline/select_frame_with_this_that.py +421 -0
  22. curation_pipeline/tracking_by_keypoint.py +136 -0
  23. data_loader/video_dataset.py +323 -0
  24. data_loader/video_this_that_dataset.py +326 -0
  25. pretrained/PUT_YOUR_WEIGHT_HERE.md +0 -0
  26. requirements.txt +27 -0
  27. scripts/active_learning_select.py +27 -0
  28. scripts/add_point2img.py +51 -0
  29. scripts/check_video.py +19 -0
  30. scripts/clean_bridge_dataset.py +22 -0
  31. scripts/collect_lang.py +31 -0
  32. scripts/combine_results.py +85 -0
  33. scripts/compress_gif.py +52 -0
  34. scripts/compress_videos.py +55 -0
  35. scripts/crop_video_frames.py +22 -0
  36. scripts/extract_test_dataset.py +18 -0
  37. scripts/generate_noise.py +14 -0
  38. scripts/generate_sam.py +56 -0
  39. scripts/generate_sam_this_that.py +108 -0
  40. scripts/generate_traj.py +601 -0
  41. scripts/interpolate_by_repeat.py +55 -0
  42. scripts/length_stats.py +21 -0
  43. scripts/motion_stats.py +75 -0
  44. scripts/process_llama.py +74 -0
  45. scripts/process_sim.py +59 -0
  46. scripts/resize_img.py +17 -0
  47. scripts/resize_video_seq.py +33 -0
  48. scripts/train_test_split.py +23 -0
  49. scripts/visualize_thisthat_point.py +43 -0
  50. svd/diffusion_arch/transformer_temporal.py +381 -0
.gitignore ADDED
@@ -0,0 +1,49 @@
1
+ .ipynb_checkpoints
2
+ .idea
3
+ __pycache__
4
+
5
+ datasets/
6
+ tmp_imgs
7
+ runs/
8
+ runs_last/
9
+ saved_models/
10
+ pre_trained/
11
+ save_log/
12
+ diffusers/
13
+ weights/
14
+ checkpoints/
15
+ validation_videos*
16
+ pretrained/*
17
+ .gradio/*
18
+
19
+ *.pyc
20
+ *.sh
21
+ *.pth
22
+ *.png
23
+ *.jpg
24
+ *.mp4
25
+ *.txt
26
+ *.json
27
+ *.jsonl
28
+ *.zip
29
+ *.mp4
30
+ *.csv
31
+ *.webp
32
+ *.bin
33
+ *.pkl
34
+ *.safetensors
35
+ *.pt
36
+ *.log
37
+ events.*
38
+ *.yml
39
+ *.gif
40
+ *.npy
41
+ *.out
42
+
43
+ !requirements.txt
44
+ !saved_models/*.md
45
+ !LICENSE.txt
46
+ !config/*
47
+ !__assets__/*
48
+ !__assets__/Bridge_example/*
49
+ !pretrained/PUT_YOUR_WEIGHT_HERE.md
__assets__/0.jpg ADDED
__assets__/156.jpg ADDED
__assets__/274.jpg ADDED
__assets__/375.jpg ADDED
__assets__/551.jpg ADDED
__assets__/91.jpg ADDED
__assets__/ThisThat_logo.png ADDED
app.py CHANGED
@@ -1,7 +1,478 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
+ # *************************************************************************
2
+ # Copyright (2023) Bytedance Inc.
3
+ #
4
+ # Copyright (2023) DragDiffusion Authors
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ # *************************************************************************
18
+
19
+ import os, shutil, sys
20
+ import urllib.request
21
+ import argparse
22
+ import imageio
23
+ import math
24
+ import cv2
25
+ import collections
26
+ import numpy as np
27
  import gradio as gr
28
+ from PIL import Image
29
+
30
+ import torch
31
+ from pathlib import Path
32
+ from omegaconf import OmegaConf
33
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
34
+ from accelerate import Accelerator
35
+ from accelerate.utils import ProjectConfiguration
36
+ from diffusers import (
37
+ AutoencoderKLTemporalDecoder,
38
+ DDPMScheduler,
39
+ )
40
+ from diffusers.utils import check_min_version, is_wandb_available, load_image, export_to_video
41
+ from huggingface_hub import hf_hub_download
42
+ from transformers import AutoTokenizer, PretrainedConfig
43
+
44
+
45
+ # Import files from the local folder
46
+ root_path = os.path.abspath('.')
47
+ sys.path.append(root_path)
48
+ from train_code.train_svd import import_pretrained_text_encoder
49
+ from data_loader.video_dataset import tokenize_captions
50
+ from data_loader.video_this_that_dataset import get_thisthat_sam
51
+ from svd.unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
52
+ from svd.pipeline_stable_video_diffusion import StableVideoDiffusionPipeline
53
+ from svd.temporal_controlnet import ControlNetModel
54
+ from svd.pipeline_stable_video_diffusion_controlnet import StableVideoDiffusionControlNetPipeline
55
+ from utils.optical_flow_utils import bivariate_Gaussian
56
+
57
+
58
+ # For the 2D dilation
59
+ blur_kernel = bivariate_Gaussian(99, 10, 10, 0, grid = None, isotropic = True)
60
+
61
+
62
+ # Import
63
+ # LENGTH=480 # length of the square area displaying/editing images
64
+ HEIGHT = 256
65
+ WIDTH = 384
66
+
67
+
68
+ MARKDOWN = \
69
+ """
70
+ ## <p style='text-align: center'> This&That </p>
71
+
72
+ [GitHub](https://github.com/Kiteretsu77/This_and_That_VDM) | [Paper](http://arxiv.org/abs/2407.05530) | [Webpage](https://cfeng16.github.io/this-and-that/)
73
+ This&That is a language-, gesture-, and image-conditioned video generation model for robot planning in robotics scenarios (Bridge-dataset-based for this repo).
74
+
75
+ This demo covers the video diffusion model part.
76
+ Only GestureNet is provided in this Gradio demo; check the full test code for all available pretrained weights.
77
+
78
+ ### Note: By default, gesture points are placed at frame indices [4, 10] for two gesture points, or [4] for one gesture point.
79
+ ### Note: The result currently only supports 256x384 resolution.
80
+ ### Note: Click "Clear All" to restart everything; click "Undo Point" to cancel the last point you placed.
81
+
82
+ If This&That is helpful, please help star the [GitHub Repo](https://github.com/Kiteretsu77/This_and_That_VDM). Thanks!
83
+ """
84
+
85
+
86
+ def store_img(img):
87
+
88
+ # when new image is uploaded, `selected_points` should be empty
89
+ return img, []
90
+
91
+
92
+
93
+ def clear_all():
94
+ return None, \
95
+ gr.Image(value=None, height=HEIGHT, width=WIDTH, interactive=False), \
96
+ None, [] # selected points
97
+
98
+
99
+ def undo_points(original_image):
100
+ img = original_image.copy()
101
+ return img, []
102
+
103
+
104
+ # The user clicks the image to select points, which are then drawn on the image [From https://github.com/Yujun-Shi/DragDiffusion]
105
+ def get_points(img, original_image, sel_pix, evt: gr.SelectData):
106
+
107
+ # collect the selected point
108
+ sel_pix.append(evt.index)
109
+
110
+ if len(sel_pix) > 2:
111
+ raise gr.Error("We only at most support two points")
112
+
113
+ if original_image is None:
114
+ original_image = img.copy()
115
+
116
+ # draw points
117
+ points = []
118
+ for idx, point in enumerate(sel_pix):
119
+ if idx % 2 == 0:
120
+ # draw a red circle at the handle point
121
+ cv2.circle(img, tuple(point), 10, (255, 0, 0), -1)
122
+ else:
123
+ # draw a green circle at the second point
124
+ cv2.circle(img, tuple(point), 10, (0, 255, 0), -1)
125
+ points.append(tuple(point))
126
+ # draw an arrow from handle point to target point
127
+ # if len(points) == 2:
128
+ # cv2.arrowedLine(img, points[0], points[1], (255, 255, 255), 4, tipLength=0.5)
129
+ # points = []
130
+
131
+ return [img if isinstance(img, np.ndarray) else np.array(img), original_image]
132
+
133
+
134
+ def gesturenet_inference(ref_image, prompt, selected_points):
135
+
136
+ # Check some parameters: we must have a prompt and selected points
137
+ if prompt == "" or prompt is None:
138
+ raise gr.Error("Please input text prompt")
139
+ if selected_points == []:
140
+ raise gr.Error("Please click one/two points in the Image")
141
+
142
+ # Prepare the setting
143
+ frame_idxs = [4, 10]
144
+ use_ambiguous_prompt = False
145
+ model_type = "GestureNet"
146
+ huggingface_pretrained_path = "HikariDawn/This-and-That-1.1"
147
+
148
+ print("Text prompt is ", prompt)
149
+
150
+ # Prepare tmp folder
151
+ store_folder_name = "tmp"
152
+ if os.path.exists(store_folder_name):
153
+ shutil.rmtree(store_folder_name)
154
+ os.makedirs(store_folder_name)
155
+
156
+
157
+ # Read the yaml setting files (very important for loading the needed hyperparameters)
158
+ if not os.path.exists(huggingface_pretrained_path):
159
+ yaml_download_path = hf_hub_download(repo_id=huggingface_pretrained_path, subfolder="unet", filename="train_image2video.yaml")
160
+ if model_type == "GestureNet":
161
+ yaml_download_path = hf_hub_download(repo_id=huggingface_pretrained_path, subfolder="gesturenet", filename="train_image2video_gesturenet.yaml")
162
+ else: # If the path is a local path we can concatenate it here
163
+ yaml_download_path = os.path.join(huggingface_pretrained_path, "unet", "train_image2video.yaml")
164
+ if model_type == "GestureNet":
165
+ yaml_download_path = os.path.join(huggingface_pretrained_path, "gesturenet", "train_image2video_gesturenet.yaml")
166
+
167
+ # Load the config
168
+ assert(os.path.exists(yaml_download_path))
169
+ config = OmegaConf.load(yaml_download_path)
170
+
171
+
172
+ ################################################ Prepare vae, unet, image_encoder Same as before #################################################################
173
+ print("Prepare the pretrained model")
174
+ accelerator = Accelerator(
175
+ gradient_accumulation_steps = config["gradient_accumulation_steps"],
176
+ mixed_precision = config["mixed_precision"],
177
+ log_with = config["report_to"],
178
+ project_config = ProjectConfiguration(project_dir=config["output_dir"], logging_dir=Path(config["output_dir"], config["logging_name"])),
179
+ )
180
+ feature_extractor = CLIPImageProcessor.from_pretrained(
181
+ config["pretrained_model_name_or_path"], subfolder="feature_extractor", revision=None
182
+ ) # This instance has no weights; it is just a settings file
183
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
184
+ config["pretrained_model_name_or_path"], subfolder="image_encoder", revision=None, variant="fp16"
185
+ )
186
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
187
+ config["pretrained_model_name_or_path"], subfolder="vae", revision=None, variant="fp16"
188
+ )
189
+ unet = UNetSpatioTemporalConditionModel.from_pretrained(
190
+ huggingface_pretrained_path,
191
+ subfolder = "unet",
192
+ low_cpu_mem_usage = True,
193
+ # variant = "fp16",
194
+ )
195
+
196
+
197
+ # For text ..............................................
198
+ tokenizer = AutoTokenizer.from_pretrained(
199
+ config["pretrained_tokenizer_name_or_path"],
200
+ subfolder = "tokenizer",
201
+ revision = None,
202
+ use_fast = False,
203
+ )
204
+ # Clip Text Encoder
205
+ text_encoder_cls = import_pretrained_text_encoder(config["pretrained_tokenizer_name_or_path"], revision=None)
206
+ text_encoder = text_encoder_cls.from_pretrained(config["pretrained_tokenizer_name_or_path"], subfolder = "text_encoder", revision = None, variant = None)
207
+
208
+
209
+ weight_dtype = torch.float32
210
+ if accelerator.mixed_precision == "fp16":
211
+ weight_dtype = torch.float16
212
+ elif accelerator.mixed_precision == "bf16":
213
+ weight_dtype = torch.bfloat16
214
+
215
+ # Move vae + image_encoder to gpu and cast to weight_dtype
216
+ vae.requires_grad_(False)
217
+ image_encoder.requires_grad_(False)
218
+ unet.requires_grad_(False) # Will switch back at the end
219
+ text_encoder.requires_grad_(False)
220
+
221
+ # Move to accelerator
222
+ vae.to(accelerator.device, dtype=weight_dtype)
223
+ image_encoder.to(accelerator.device, dtype=weight_dtype)
224
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
225
+
226
+ # For GestureNet
227
+ if model_type == "GestureNet":
228
+ unet.to(accelerator.device, dtype=weight_dtype) # There is no need to cast unet in unet training, only needed in controlnet one
229
+
230
+ # Handle the Controlnet first from UNet
231
+ gesturenet = ControlNetModel.from_pretrained(
232
+ huggingface_pretrained_path,
233
+ subfolder = "gesturenet",
234
+ low_cpu_mem_usage = True,
235
+ variant = None,
236
+ )
237
+
238
+ gesturenet.requires_grad_(False)
239
+ gesturenet.to(accelerator.device)
240
+ ##############################################################################################################################################################
241
+
242
+
243
+
244
+
245
+ # Init the pipeline
246
+ pipeline = StableVideoDiffusionControlNetPipeline.from_pretrained(
247
+ config["pretrained_model_name_or_path"], # Still based on regular SVD config
248
+ vae = vae,
249
+ image_encoder = image_encoder,
250
+ unet = unet,
251
+ revision = None, # Set None directly now
252
+ torch_dtype = weight_dtype,
253
+ )
254
+ pipeline = pipeline.to(accelerator.device)
255
+ pipeline.set_progress_bar_config(disable=True)
256
+
257
+
258
+
259
+ ############################## Prepare and Process the condition here ##############################
260
+ org_height, org_width, _ = ref_image.shape
261
+ ref_image_pil = Image.fromarray(ref_image)
262
+ ref_image_pil = ref_image_pil.resize((config["width"], config["height"]))
263
+
264
+
265
+ # Initial the optical flow format we want
266
+ gesture_condition_img = np.zeros((config["video_seq_length"], config["conditioning_channels"], config["height"], config["width"]), dtype=np.float32) # The last image should be empty
267
+
268
+ # Handle the selected points to the condition we want
269
+ for point_idx, point in enumerate(selected_points):
270
+
271
+ frame_idx = frame_idxs[point_idx]
272
+ horizontal, vertical = point
273
+
274
+ # Init the base image
275
+ base_img = np.zeros((org_height, org_width, 3)).astype(np.float32) # Use the original image size
276
+ base_img.fill(255)
277
+
278
+ # Draw square around the target position
279
+ dot_range = 10 # Diameter
280
+ for i in range(-1*dot_range, dot_range+1):
281
+ for j in range(-1*dot_range, dot_range+1):
282
+ dil_vertical, dil_horizontal = vertical + i, horizontal + j
283
+ if (0 <= dil_vertical and dil_vertical < base_img.shape[0]) and (0 <= dil_horizontal and dil_horizontal < base_img.shape[1]):
284
+ if point_idx == 0:
285
+ base_img[dil_vertical][dil_horizontal] = [0, 0, 255] # The first point should be red
286
+ else:
287
+ base_img[dil_vertical][dil_horizontal] = [0, 255, 0] # The second point should be green to distinguish the first point
288
+
289
+ # Dilate
290
+ if config["dilate"]:
291
+ base_img = cv2.filter2D(base_img, -1, blur_kernel)
292
+
293
+
294
+ ##############################################################################################################################
295
+ ### The core pipeline of processing is: Dilate -> Resize -> Range Shift -> Transpose Shape -> Store
296
+
297
+ # Resize frames. Don't use negative values and don't resize after shifting to the [0, 1] range
298
+ base_img = cv2.resize(base_img, (config["width"], config["height"]), interpolation = cv2.INTER_CUBIC)
299
+
300
+ # Channel Transform and Range Shift
301
+ if config["conditioning_channels"] == 3:
302
+ # Map to [0, 1] range
303
+ base_img = base_img / 255.0
304
+
305
+ else:
306
+ raise NotImplementedError()
307
+
308
+ # ReOrganize shape
309
+ base_img = base_img.transpose(2, 0, 1) # hwc -> chw
310
+
311
+ # Write base img based on frame_idx
312
+ gesture_condition_img[frame_idx] = base_img # Only the first frame, the rest is 0 initialized
313
+
314
+
315
+ ####################################################################################################
316
+
317
+ # Use the same tokenize process as the dataset preparation stage
318
+ tokenized_prompt = tokenize_captions(prompt, tokenizer, config, is_train=False).unsqueeze(0).to(accelerator.device) # Use unsqueeze to expand dim
319
+
320
+
321
+
322
+ # Call the pipeline
323
+ with torch.autocast("cuda"):
324
+ frames = pipeline(
325
+ image = ref_image_pil,
326
+ condition_img = gesture_condition_img, # numpy [0,1] range
327
+ controlnet = accelerator.unwrap_model(gesturenet),
328
+ prompt = tokenized_prompt,
329
+ use_text = config["use_text"],
330
+ text_encoder = text_encoder,
331
+ height = config["height"],
332
+ width = config["width"],
333
+ num_frames = config["video_seq_length"],
334
+ decode_chunk_size = 8,
335
+ motion_bucket_id = 200,
336
+ # controlnet_image_index = controlnet_image_index,
337
+ # coordinate_values = coordinate_values,
338
+ num_inference_steps = config["num_inference_steps"],
339
+ max_guidance_scale = config["inference_max_guidance_scale"],
340
+ fps = 7,
341
+ use_instructpix2pix = config["use_instructpix2pix"],
342
+ noise_aug_strength = config["inference_noise_aug_strength"],
343
+ controlnet_conditioning_scale = config["outer_conditioning_scale"],
344
+ inner_conditioning_scale = config["inner_conditioning_scale"],
345
+ guess_mode = config["inference_guess_mode"], # False in inference
346
+ image_guidance_scale = config["image_guidance_scale"],
347
+ ).frames[0]
348
+
349
+ # Save frames
350
+ video_file_path = os.path.join(store_folder_name, "tmp.mp4")
351
+ writer = imageio.get_writer(video_file_path, fps=4)
352
+ for idx, frame in enumerate(frames):
353
+ frame.save(os.path.join(store_folder_name, str(idx)+".png"))
354
+ writer.append_data(cv2.cvtColor(cv2.imread(os.path.join(store_folder_name, str(idx)+".png")), cv2.COLOR_BGR2RGB))
355
+ writer.close()
356
+
357
+
358
+
359
+ # Cleaning process
360
+ del pipeline
361
+ torch.cuda.empty_cache()
362
+
363
+ return gr.update(value=video_file_path, width=config["width"], height=config["height"]) # Return the result as needed
364
+
365
+
366
+
367
+ if __name__ == '__main__':
368
+
369
+
370
+ # Gradio demo part
371
+ with gr.Blocks() as demo:
372
+ # layout definition
373
+ with gr.Row():
374
+ gr.Markdown(MARKDOWN)
375
+
376
+ # UI components for editing real images
377
+ with gr.Row(elem_classes=["container"]):
378
+ selected_points = gr.State([]) # store points
379
+ original_image = gr.State(value=None) # store original input image
380
+ with gr.Row():
381
+ with gr.Column():
382
+ gr.Markdown("""<p style="text-align: center; font-size: 20px">Click two Points</p>""")
383
+ input_image = gr.Image(label="Input Image", height=HEIGHT, width=WIDTH, interactive=False, elem_id="input_img")
384
+ # gr.Image(type="numpy", label="Click Points", height=HEIGHT, width=WIDTH, interactive=False) # for points clicking
385
+ undo_button = gr.Button("Undo point")
386
+
387
+ # Text prompt
388
+ with gr.Row():
389
+ prompt = gr.Textbox(label="Text Prompt")
390
+
391
+
392
+ with gr.Column():
393
+ gr.Markdown("""<p style="text-align: center; font-size: 20px">Results</p>""")
394
+ frames = gr.Video(value=None, label="Generate Video", show_label=True, height=HEIGHT, width=WIDTH)
395
+ with gr.Row():
396
+ run_button = gr.Button("Run")
397
+ clear_all_button = gr.Button("Clear All")
398
+
399
+
400
+
401
+
402
+ # with gr.Tab("Base Model Config"):
403
+ # with gr.Row():
404
+ # local_models_dir = 'local_pretrained_models'
405
+ # local_models_choice = \
406
+ # [os.path.join(local_models_dir,d) for d in os.listdir(local_models_dir) if os.path.isdir(os.path.join(local_models_dir,d))]
407
+ # model_path = gr.Dropdown(value="runwayml/stable-diffusion-v1-5",
408
+ # label="Diffusion Model Path",
409
+ # choices=[
410
+ # "runwayml/stable-diffusion-v1-5",
411
+ # "gsdf/Counterfeit-V2.5",
412
+ # "stablediffusionapi/anything-v5",
413
+ # "SG161222/Realistic_Vision_V2.0",
414
+ # ] + local_models_choice
415
+ # )
416
+ # vae_path = gr.Dropdown(value="default",
417
+ # label="VAE choice",
418
+ # choices=["default",
419
+ # "stabilityai/sd-vae-ft-mse"] + local_models_choice
420
+ # )
421
+
422
+ # Examples
423
+ with gr.Row(elem_classes=["container"]):
424
+ gr.Examples(
425
+ [
426
+ ["__assets__/Bridge_example/Task1_v1_511/im_0.jpg", "take this to there"],
427
+ ["__assets__/Bridge_example/Task2_v2_164/im_0.jpg", "put this to there"],
428
+ ["__assets__/Bridge_example/Task3_v2_490/im_0.jpg", "fold this"],
429
+ ["__assets__/Bridge_example/Task4_v2_119/im_0.jpg", "open this"],
430
+
431
+ # ["__assets__/0.jpg", "take this to there"],
432
+ ["__assets__/91.jpg", "take this to there"],
433
+ ["__assets__/156.jpg", "take this to there"],
434
+ # ["__assets__/274.jpg", "take this to there"],
435
+ ["__assets__/375.jpg", "take this to there"],
436
+ # ["__assets__/551.jpg", "take this to there"],
437
+ ],
438
+ [input_image, prompt, selected_points],
439
+ )
440
+
441
+
442
+
443
+
444
+ ####################################### Event Definition #######################################
445
+
446
+ # Draw the points
447
+ input_image.select(
448
+ get_points,
449
+ [input_image, original_image, selected_points],
450
+ [input_image, original_image],
451
+ )
452
+
453
+ # Clean the points
454
+ undo_button.click(
455
+ undo_points,
456
+ [original_image],
457
+ [input_image, selected_points],
458
+ )
459
+
460
+ run_button.click(
461
+ gesturenet_inference,
462
+ inputs = [
463
+ # vae, unet, gesturenet, image_encoder, text_encoder, tokenizer,
464
+ original_image, prompt, selected_points,
465
+ # frame_idxs,
466
+ # config, accelerator, weight_dtype
467
+ ],
468
+ outputs = [frames]
469
+ )
470
+
471
+ clear_all_button.click(
472
+ clear_all,
473
+ [],
474
+ outputs = [original_image, input_image, prompt, selected_points],
475
+ )
476
 
 
 
477
 
478
+ demo.queue().launch(share=True, debug=True)
 
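For reference, `gesturenet_inference` can also be driven without the Gradio UI. Below is a minimal sketch, assuming the full repository and its dependencies are installed; the image path and click coordinates are placeholders that mirror what the demo passes in (a NumPy RGB image, a text prompt, and a list of one or two `(x, y)` points in original-image coordinates).

```python
# Minimal headless driver for gesturenet_inference (a sketch; paths and points are placeholders).
import numpy as np
from PIL import Image

from app import gesturenet_inference  # assumes this diff's app.py and its local imports are available

ref_image = np.array(Image.open("__assets__/91.jpg").convert("RGB"))  # HxWx3 uint8
prompt = "take this to there"
selected_points = [(120, 80), (260, 150)]  # "this" point, then "that" point (hypothetical coordinates)

# Returns a gr.update(...) whose `value` is the path of the rendered mp4 (tmp/tmp.mp4).
result = gesturenet_inference(ref_image, prompt, selected_points)
print(result)
```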
config/accelerate_config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "compute_environment": "LOCAL_MACHINE",
3
+ "debug": false,
4
+ "distributed_type": "MULTI_GPU",
5
+ "downcast_bf16": "no",
6
+ "gpu_ids": "all",
7
+ "machine_rank": 0,
8
+ "main_training_function": "main",
9
+ "mixed_precision": "fp16",
10
+ "num_machines": 1,
11
+ "num_processes": 8,
12
+ "rdzv_backend": "static",
13
+ "same_network": true,
14
+ "tpu_env": [],
15
+ "tpu_use_cluster": false,
16
+ "tpu_use_sudo": false,
17
+ "use_cpu": false
18
+ }
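This JSON is an Accelerate launcher configuration (single machine, 8 processes, fp16). A minimal sketch of launching a training script with it via the standard `accelerate launch --config_file` flag; `train.py` is a placeholder for whichever training entry point the repo uses (not shown in this 50-file view):

```python
# Sketch: launch a (hypothetical) training entry point with this Accelerate config.
import subprocess

subprocess.run(
    [
        "accelerate", "launch",
        "--config_file", "config/accelerate_config.json",
        "train.py",  # placeholder entry point
    ],
    check=True,
)
```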
config/flowformer_config.py ADDED
@@ -0,0 +1,78 @@
1
+ from yacs.config import CfgNode as CN
2
+ _CN = CN()
3
+
4
+ _CN.name = 'default'
5
+ _CN.suffix ='sintel'
6
+ _CN.gamma = 0.75
7
+ _CN.max_flow = 400
8
+ _CN.batch_size = 6
9
+ _CN.sum_freq = 100
10
+ _CN.val_freq = 100000000
11
+ _CN.image_size = [432, 960]
12
+ _CN.add_noise = False
13
+ _CN.use_smoothl1 = False
14
+ _CN.critical_params = []
15
+
16
+ _CN.transformer = 'percostformer3'
17
+
18
+ ### change the path here
19
+ _CN.model = "pretrained/sintel.pth"
20
+
21
+ _CN.percostformer3 = CN()
22
+ _CN.percostformer3.pe = 'linear'
23
+ _CN.percostformer3.dropout = 0.0
24
+ _CN.percostformer3.droppath = 0.0
25
+ _CN.percostformer3.encoder_latent_dim = 256 # in twins, this is 256
26
+ _CN.percostformer3.query_latent_dim = 64
27
+ _CN.percostformer3.cost_latent_input_dim = 64
28
+ _CN.percostformer3.cost_latent_token_num = 8
29
+ _CN.percostformer3.cost_latent_dim = 128
30
+ _CN.percostformer3.cost_heads_num = 1
31
+ # encoder
32
+ _CN.percostformer3.pretrain = True
33
+ _CN.percostformer3.use_convertor = False
34
+ _CN.percostformer3.del_layers = True
35
+ _CN.percostformer3.encoder_depth = 3
36
+ _CN.percostformer3.expand_factor = 4
37
+ _CN.percostformer3.vertical_encoder_attn = "twins"
38
+ _CN.percostformer3.attn_dim = 128
39
+ _CN.percostformer3.patch_size = 8
40
+ _CN.percostformer3.patch_embed = 'single'
41
+ _CN.percostformer3.cross_attn = "all"
42
+ _CN.percostformer3.gma = "GMA"
43
+ _CN.percostformer3.vert_c_dim = 64
44
+ _CN.percostformer3.cost_encoder_res = True
45
+ _CN.percostformer3.cnet = 'twins'
46
+ _CN.percostformer3.fnet = 'twins'
47
+ _CN.percostformer3.flow_or_pe = "and"
48
+ _CN.percostformer3.use_patch = False # use cost patch rather than local cost as query
49
+ _CN.percostformer3.use_rpe = False
50
+ _CN.percostformer3.detach_local = False
51
+ _CN.percostformer3.no_sc = False
52
+ _CN.percostformer3.r_16 =-1
53
+ _CN.percostformer3.quater_refine = False
54
+ # pretrain config
55
+ _CN.percostformer3.pretrain_mode = False
56
+ _CN.percostformer3.pic_size = [368, 496, 368, 496]
57
+ _CN.percostformer3.mask_ratio = 0.5
58
+ _CN.percostformer3.query_num = 30
59
+ _CN.percostformer3.no_border = True
60
+ _CN.percostformer3.gt_r = 15
61
+ _CN.percostformer3.fix_pe = False
62
+ # decoder
63
+ _CN.percostformer3.decoder_depth = 12
64
+ _CN.percostformer3.critical_params = ['vert_c_dim', 'encoder_depth', 'vertical_encoder_attn', "use_patch", "flow_or_pe", "use_rpe", "dropout", "detach_local", "expand_factor"]
65
+
66
+
67
+ ### TRAINER
68
+ _CN.trainer = CN()
69
+ _CN.trainer.scheduler = 'OneCycleLR'
70
+ _CN.trainer.optimizer = 'adamw'
71
+ _CN.trainer.canonical_lr = 12.5e-5
72
+ _CN.trainer.adamw_decay = 1e-5
73
+ _CN.trainer.clip = 1.0
74
+ _CN.trainer.num_steps = 120000
75
+ _CN.trainer.epsilon = 1e-8
76
+ _CN.trainer.anneal_strategy = 'linear'
77
+ def get_cfg():
78
+ return _CN.clone()
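`get_cfg()` returns a clone of the yacs `CfgNode` defined above, so callers can override fields (for example the optical-flow checkpoint path in `model`) without mutating the module-level defaults. A minimal usage sketch:

```python
# Sketch: obtain and customize the FlowFormer config defined above.
from config.flowformer_config import get_cfg

cfg = get_cfg()                      # clone of the defaults; safe to mutate
cfg.model = "pretrained/sintel.pth"  # optical-flow checkpoint path (the `model` field above)
cfg.percostformer3.decoder_depth = 12

print(cfg.transformer, cfg.trainer.canonical_lr)
```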
config/train_image2video.yaml ADDED
@@ -0,0 +1,78 @@
1
+
2
+ # Model Setting
3
+ pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid # -xt is for 25 frames version
4
+ load_unet_path: # This is usually used to load a pretrained UNet; e.g., you may want to start from one of your previously trained checkpoints
5
+ video_seq_length: 14 # Standardized to 14
6
+ process_fps: 7
7
+ train_noise_aug_strength: 0.1
8
+ scheduler: EDM
9
+ conditioning_dropout_prob: 0.1
10
+
11
+
12
+ # Dataset Setting
13
+ dataset_name: Bridge # WebVid / Bridge
14
+ dataset_path: [../sanity_check/bridge_v1_raw, ../sanity_check/bridge_v2_raw]
15
+ output_dir: checkpoints/img2video
16
+ height: 256 # Ratio that is functional: 256:384 576:1024 320:512 320:576
17
+ width: 384 # It is said that the height and width should be a multiple of 64
18
+ dataloader_num_workers: 4 # Don't set this too large; video diffusion processing is usually slow, so you don't need that many workers for early loading
19
+ flip_aug_prob: 0.45 # Whether we flip the GT and cond vertically
20
+ acceleration_tolerance: 4 # Recommended setting
21
+
22
+
23
+ # Text setting
24
+ use_text: True # If this is True, we will use text value
25
+ pretrained_tokenizer_name_or_path: stabilityai/stable-diffusion-2-1-base # Use SD 2.1
26
+ empty_prompts_proportion: 0.0 # Useless now, we already have CFG in training
27
+ mix_ambiguous: False # Whether we mix ambiguous prompt for "this" and "that"
28
+
29
+
30
+ # Motion setting (useless right now...)
31
+ motion_bucket_id: 200 # Set it for exact value; If this is none, we will use below setting
32
+ dataset_motion_mean: 35.3 # For 14 fps, it is N(35.3, 18.5)
33
+ dataset_motion_std: 18.5 # For 25 fps, it is N(?, ?)
34
+ svd_motion_mean: 165
35
+ svd_motion_std: 22.5
36
+
37
+
38
+ # Training setting
39
+ resume_from_checkpoint: False # latest/False
40
+ num_train_iters: 100000 # Will automatically choose the checkpoints at 99K
41
+ partial_finetune: False # Whether we just tune some params to speed up
42
+ train_batch_size: 1 # This is the batch size per GPU
43
+ checkpointing_steps: 3000
44
+ validation_step: 300
45
+ logging_name: logging
46
+ seed: 42
47
+ validation_img_folder: # Prepare your own validation dataset
48
+ validation_store_folder: validation_results
49
+ checkpoints_total_limit: 15
50
+
51
+ # Noise Strength
52
+ noise_mean: 0.5 # Regular Img2Video: (0.7, 1.6); Text2Video: (0.5, 1.4)
53
+ noise_std: 1.4
54
+
55
+
56
+ # Inference
57
+ num_inference_steps: 25
58
+ inference_noise_aug_strength: 0.1
59
+ inference_max_guidance_scale: 3.0 # Training and testing may use different values depending on the scenario
60
+
61
+
62
+ # Learning Rate and Optimizer
63
+ learning_rate: 1e-5 # Usually this is ok
64
+ scale_lr: False # TODO: Is it needed to scale the learning rate?
65
+ adam_beta1: 0.9
66
+ adam_beta2: 0.999
67
+ use_8bit_adam: True # Need this to save more memory
68
+ adam_weight_decay: 1e-2
69
+ adam_epsilon: 1e-08
70
+ lr_warmup_steps: 500
71
+ lr_decay_scale: 0.5
72
+
73
+
74
+ # Other Setting
75
+ mixed_precision: fp16
76
+ gradient_accumulation_steps: 1
77
+ gradient_checkpointing: 1
78
+ report_to: tensorboard
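This YAML is loaded with OmegaConf, the same way `app.py` loads the copy shipped alongside the Hub checkpoint. A minimal sketch of reading a few of the keys consumed at inference time:

```python
# Sketch: load the training/inference config the same way app.py does.
from omegaconf import OmegaConf

config = OmegaConf.load("config/train_image2video.yaml")

# Keys consumed by the pipelines in this repo (dictionary-style access, as in app.py).
print(config["height"], config["width"])        # 256 384
print(config["video_seq_length"])               # 14
print(config["num_inference_steps"])            # 25
print(config["pretrained_model_name_or_path"])  # stabilityai/stable-video-diffusion-img2vid
```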
config/train_image2video_controlnet.yaml ADDED
@@ -0,0 +1,101 @@
1
+
2
+ # Model Setting
3
+ pretrained_model_name_or_path: stabilityai/stable-video-diffusion-img2vid # stabilityai/pretrained
4
+ load_unet_path: ../saved_weights/v4_VL_paper/checkpoint-99000 # None/specific path; this is the path to the pretrained UNet
5
+ load_controlnet_path: # None/specific path; for a checkpoint loaded from a pretrained ControlNet path
6
+ video_seq_length: 14
7
+ process_fps: 7
8
+ train_noise_aug_strength: 0.1
9
+ scheduler: EDM
10
+ conditioning_dropout_prob: 0.1
11
+
12
+
13
+ # Dataset Setting
14
+ data_loader_type: thisthat # thisthat
15
+ dataset_name: Bridge # Bridge
16
+ dataset_path: [../sanity_check/bridge_v1_TT14, ../sanity_check/bridge_v2_TT14] # ../Bridge_filter_flow, ../Bridge_v2_filter_flow/]
17
+ output_dir: checkpoints/img2video
18
+ height: 256 # Ratio that is functional: 256:384 576:1024 320:448 320:576 512:640 448:640
19
+ width: 384 # It is said that the height and width should be a multiple of 64
20
+ dataloader_num_workers: 4 # For Debug, it only needs 1
21
+ flip_aug_prob: 0.45 # Whether we flip the GT and cond vertically
22
+ # No acceleration_tolerance, since TT dataset already filter those out
23
+
24
+
25
+ # Text setting
26
+ use_text: True # If this is True, we will use text value
27
+ pretrained_tokenizer_name_or_path: stabilityai/stable-diffusion-2-1-base # Use SD 2.1
28
+ empty_prompts_proportion: 0.0
29
+ mix_ambiguous: False # Whether we mix ambiguous prompt for "this" and "that"
30
+
31
+
32
+ # Mask setting
33
+ mask_unet_vae: False # Whether we use mask to map latents to be zero padding
34
+ mask_controlnet_vae: False
35
+ mask_proportion: 0.0
36
+
37
+
38
+ # Condition Setting
39
+ conditioning_channels: 3 # Usually it is 3
40
+ num_points_left: # 1 # For flow: You can only choose one between flow_select_rate and num_points_left; num_points_left should be higher priority
41
+ flow_select_rate: 0.99 # For flow
42
+ threshold_factor: 0.2 # For flow
43
+ dilate: True # Traj must be True for dilate
44
+ inner_conditioning_scale: 1.0 # Conditioning scale for the internal value; the default starts from 1.0
45
+ outer_conditioning_scale: 1.0 # Outer conditioning scale for the whole trainable conditioning copy (a bit interesting: this was accidentally set to 2.0 at one point)
46
+
47
+
48
+ # Motion setting
49
+ motion_bucket_id: 200
50
+ dataset_motion_mean: 25 # For 14 fps, it is N(25, 10)
51
+ dataset_motion_std: 10 # For 25 fps, it is N(18, 7)
52
+ svd_motion_mean: 180
53
+ svd_motion_std: 30
54
+
55
+
56
+
57
+ # Training setting
58
+ resume_from_checkpoint: False # latest/False
59
+ num_train_iters: 30100 # Will automatically choose the checkpoints
60
+ partial_finetune: False # Whether we just tune some params to speed up
61
+ train_batch_size: 1 # This is the batch size per GPU
62
+ checkpointing_steps: 3000
63
+ validation_step: 300
64
+ logging_name: logging
65
+ seed: 42
66
+ validation_img_folder: datasets/validation_TT14
67
+ validation_store_folder: validation_videos
68
+ checkpoints_total_limit: 15
69
+
70
+
71
+ # Noise Strength
72
+ noise_mean: 0.5 # Regular Img2Video: (0.7, 1.6); Text2Video: (0.5, 1.4)
73
+ noise_std: 1.4
74
+
75
+
76
+ # Inference
77
+ num_inference_steps: 25
78
+ use_instructpix2pix: False # Whether we will use the instructPix2Pix mode, which involves 3 inputs; it may need tuning to get a better result in the end.
79
+ inference_noise_aug_strength: 0.1
80
+ inference_max_guidance_scale: 3.0 # Training and testing may use different values depending on the scenario
81
+ inference_guess_mode: False # Whether we use guess mode in the controlnet
82
+ image_guidance_scale: 2.5 # Empirically, 2.5 is the best value. It seems this is not used now
83
+
84
+
85
+ # Learning Rate and Optimizer
86
+ learning_rate: 5e-6 # 5e-6 is the LR we test that is just right
87
+ scale_lr: False # TODO: Is it needed to scale the learning rate?
88
+ adam_beta1: 0.9
89
+ adam_beta2: 0.999
90
+ use_8bit_adam: True # Need this to save more memory
91
+ adam_weight_decay: 1e-2
92
+ adam_epsilon: 1e-08
93
+ lr_warmup_steps: 500
94
+ lr_decay_scale: 0.5
95
+
96
+
97
+ # Other Setting
98
+ mixed_precision: fp16
99
+ gradient_accumulation_steps: 1 # ????
100
+ gradient_checkpointing: 1 # ????
101
+ report_to: tensorboard
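The GestureNet/ControlNet config is loaded the same way; if command-line overrides are wanted, they can be merged on top of the YAML with OmegaConf. A sketch (the override mechanism is an assumption, not the repo's documented interface; the keys shown are defined above):

```python
# Sketch: load the ControlNet/GestureNet config and merge command-line overrides,
# e.g. `python this_script.py learning_rate=1e-5 outer_conditioning_scale=1.0`.
from omegaconf import OmegaConf

base = OmegaConf.load("config/train_image2video_controlnet.yaml")
cli = OmegaConf.from_cli()           # dotlist-style overrides taken from sys.argv
config = OmegaConf.merge(base, cli)

print(config["conditioning_channels"], config["dilate"])
print(config["inference_max_guidance_scale"])
```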
curation_pipeline/add_lang_info.py ADDED
@@ -0,0 +1,38 @@
1
+ '''
2
+ Add the processed lang information
3
+ '''
4
+ import os, sys, shutil
5
+ import json
6
+
7
+
8
+ if __name__ == "__main__":
9
+
10
+ # Main config file path information
11
+ processed_json_file_path = "updated_bridge_v2.json"
12
+
13
+
14
+ # Read the json file
15
+ file = open(processed_json_file_path)
16
+ data = json.load(file)
17
+
18
+
19
+ # Iterate all the folders inside
20
+ start_idx = 0
21
+ for seq_instance in data:
22
+ target_path = seq_instance["images0"]
23
+ print("We are processing ", target_path)
24
+
25
+ processed_lang_txt_path = os.path.join(target_path, "processed_lang.txt")
26
+ if os.path.exists(processed_lang_txt_path):
27
+ os.remove(processed_lang_txt_path)
28
+
29
+ # Write the action + This + That into the sequence.
30
+ processed_lang_txt = open(processed_lang_txt_path, "a")
31
+ processed_lang_txt.write(str(seq_instance["action"])+"\n")
32
+ processed_lang_txt.write(str(seq_instance["this"])+"\n")
33
+ processed_lang_txt.write(str(seq_instance["that"])+"\n")
34
+
35
+
36
+ start_idx += 1
37
+
38
+ print("We have ", start_idx)
curation_pipeline/match_dataset_v1.py ADDED
@@ -0,0 +1,117 @@
1
+ '''
2
+ This file matches the selected frames with the Bridge dataset.
3
+ We need to use some tricks to select the items.
4
+ '''
5
+ import os, sys, shutil
6
+ import cv2
7
+ import numpy as np
8
+
9
+
10
+
11
+
12
+ def compare_img(imageA, imageB):
13
+ # the 'Mean Squared Error' between the two images is the
14
+ # sum of the squared difference between the two images;
15
+ # NOTE: the two images must have the same dimension
16
+ err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
17
+ err /= float(imageA.shape[0] * imageA.shape[1])
18
+
19
+ # return the MSE, the lower the error, the more "similar"
20
+ # the two images are
21
+ return err
22
+
23
+
24
+
25
+ def search_path(dataset_path, target_path, store_txt_path):
26
+
27
+ # We only need to care about the Bridge v1 dataset area
28
+ target_img_path = os.path.join(target_path, "im_0.jpg")
29
+ target_img = cv2.imread(target_img_path)
30
+
31
+ # Iterate all the folders inside
32
+ for scene_name in sorted(os.listdir(dataset_path)):
33
+ # print("We are reading scene", scene_name)
34
+ scene_dir = os.path.join(dataset_path, scene_name)
35
+
36
+ for task_name in os.listdir(scene_dir):
37
+ task_dir = os.path.join(scene_dir, task_name)
38
+
39
+ for time_clock in os.listdir(task_dir):
40
+ if time_clock == "lmdb":
41
+ continue # Skip lmdb folder
42
+
43
+ time_dir = os.path.join(task_dir, time_clock, "raw", "traj_group0")
44
+ if not os.path.exists(time_dir):
45
+ continue
46
+
47
+ for traj_name in os.listdir(time_dir):
48
+ traj_path = os.path.join(time_dir, traj_name)
49
+ if not os.path.isdir(traj_path):
50
+ continue
51
+
52
+ # Directly move policy_out_file_path; just in case there is also valuable information there
53
+ policy_out_file_path = os.path.join(traj_path, "policy_out.pkl")
54
+ if not os.path.exists(policy_out_file_path):
55
+ continue
56
+
57
+ # Check the lang txt file
58
+ lang_txt_file_path = os.path.join(traj_path, "lang.txt")
59
+ if not os.path.exists(lang_txt_file_path):
60
+ continue
61
+
62
+
63
+ # Last thing to locate to the right path
64
+ for img_name in os.listdir(traj_path):
65
+ if img_name != "images0": # Only consider one camera angle
66
+ continue
67
+
68
+ img_folder_path = os.path.join(traj_path, img_name)
69
+ if not os.path.isdir(img_folder_path):
70
+ continue
71
+
72
+
73
+ # Compare two image
74
+ img_path = os.path.join(img_folder_path, "im_0.jpg")
75
+ # print("img_folder_path is ", img_path)
76
+ compare_sample_img = cv2.imread(img_path)
77
+ error = compare_img(target_img, compare_sample_img)
78
+
79
+ if error == 0:
80
+ # Continue to all the rest for at least 5 images
81
+ status = True
82
+ for idx in range (10):
83
+ idx_img_path = os.path.join(img_folder_path, "im_"+str(idx)+".jpg")
84
+ idx_target_img_path = os.path.join(target_path, "im_"+str(idx)+".jpg")
85
+ idx_compare_sample_img = cv2.imread(idx_img_path)
86
+ idx_target_img = cv2.imread(idx_target_img_path)
87
+ error = compare_img(idx_target_img, idx_compare_sample_img)
88
+
89
+ if error != 0:
90
+ status = False
91
+ break
92
+
93
+ if status:
94
+ print("We found one at ", img_path)
95
+ f = open(store_txt_path, "a")
96
+ f.write(target_path + " " + img_folder_path + "\n")
97
+ return True
98
+
99
+ return False
100
+
101
+
102
+ if __name__ == "__main__":
103
+ input_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/datasets_rob/Bridge_v1_test_raw"
104
+ dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v1/berkeley" # 直接从本地新unzip的获取,怕之前的被xuweiyi改动过
105
+ store_txt_path = "match_info.txt"
106
+
107
+ if os.path.exists(store_txt_path):
108
+ os.remove(store_txt_path)
109
+
110
+ for img_name in sorted(os.listdir(input_path)):
111
+ target_path = os.path.join(input_path, img_name)
112
+ print("We are finding for ", target_path)
113
+
114
+ status = search_path(dataset_path, target_path, store_txt_path)
115
+
116
+ if not status:
117
+ print("we cannot find one")
curation_pipeline/match_dataset_v2.py ADDED
@@ -0,0 +1,137 @@
1
+ '''
2
+ This file matches the selected frames with the Bridge dataset.
3
+ We need to use some tricks to select the items.
4
+ '''
5
+ import os, sys, shutil
6
+ import cv2
7
+ import numpy as np
8
+
9
+
10
+
11
+
12
+ def compare_img(imageA, imageB):
13
+ # the 'Mean Squared Error' between the two images is the
14
+ # sum of the squared difference between the two images;
15
+ # NOTE: the two images must have the same dimension
16
+ err = np.sum((imageA.astype("float") - imageB.astype("float")) ** 2)
17
+ err /= float(imageA.shape[0] * imageA.shape[1])
18
+
19
+ # return the MSE, the lower the error, the more "similar"
20
+ # the two images are
21
+ return err
22
+
23
+
24
+
25
+ def search_path(dataset_path, target_path, store_txt_path):
26
+
27
+ # We only needs to care about Bridge v1 dataset area
28
+ target_img_path = os.path.join(target_path, "im_0.jpg")
29
+ if not os.path.exists(target_img_path):
30
+ print("The image we read is False")
31
+ return False
32
+ target_img = cv2.imread(target_img_path)
33
+
34
+ # Iterate all the folders inside
35
+ for scene_name in sorted(os.listdir(dataset_path)):
36
+ scene_dir = os.path.join(dataset_path, scene_name)
37
+
38
+ for task_name in sorted(os.listdir(scene_dir)):
39
+ task_dir = os.path.join(scene_dir, task_name)
40
+
41
+ for order_name in sorted(os.listdir(task_dir)):
42
+ order_dir = os.path.join(task_dir, order_name)
43
+
44
+ for time_clock in sorted(os.listdir(order_dir)):
45
+ if time_clock == "lmdb":
46
+ continue # Skip lmdb folder
47
+
48
+ time_dir = os.path.join(order_dir, time_clock, "raw", "traj_group0")
49
+ if not os.path.exists(time_dir):
50
+ continue
51
+
52
+ for traj_name in sorted(os.listdir(time_dir)):
53
+ traj_path = os.path.join(time_dir, traj_name)
54
+ if not os.path.isdir(traj_path):
55
+ continue
56
+
57
+ # Directly move policy_out_file_path; just in case there is also valuable information there
58
+ policy_out_file_path = os.path.join(traj_path, "policy_out.pkl")
59
+ if not os.path.exists(policy_out_file_path):
60
+ continue
61
+
62
+ # Check the lang txt file
63
+ lang_txt_file_path = os.path.join(traj_path, "lang.txt")
64
+ if not os.path.exists(lang_txt_file_path):
65
+ continue
66
+
67
+
68
+ for img_name in sorted(os.listdir(traj_path)):
69
+ if img_name != "images0": # Only consider one camera angle
70
+ continue
71
+
72
+ img_folder_path = os.path.join(traj_path, img_name)
73
+ if not os.path.isdir(img_folder_path):
74
+ continue
75
+
76
+
77
+ # Compare two image
78
+ img_path = os.path.join(img_folder_path, "im_0.jpg")
79
+ if not os.path.exists(img_path):
80
+ print(img_folder_path + " doesn't even have im_0.jpg")
81
+ continue
82
+ # print("img_folder_path is ", img_path)
83
+ compare_sample_img = cv2.imread(img_path)
84
+ # try:
85
+ # compare_sample_img.shape
86
+ # except Exception:
87
+ # print("The compare_sample_img cannot be red")
88
+ # continue
89
+ error = compare_img(target_img, compare_sample_img)
90
+
91
+ if error == 0:
92
+ # Continue to all the rest for at least 5 images
93
+ status = True
94
+ for idx in range (10):
95
+ idx_img_path = os.path.join(img_folder_path, "im_"+str(idx)+".jpg")
96
+ idx_target_img_path = os.path.join(target_path, "im_"+str(idx)+".jpg")
97
+ if not os.path.exists(idx_img_path):
98
+ print("The idx_img_path long idx we see only at ", idx)
99
+ continue
100
+ if not os.path.exists(idx_target_img_path):
101
+ print("The idx_target_img_path long idx we see only at ", idx)
102
+ continue
103
+ idx_compare_sample_img = cv2.imread(idx_img_path)
104
+ idx_target_img = cv2.imread(idx_target_img_path)
105
+ error = compare_img(idx_target_img, idx_compare_sample_img)
106
+
107
+ if error != 0:
108
+ status = False
109
+ break
110
+
111
+ if status:
112
+ print("We found one at ", img_path)
113
+ f = open(store_txt_path, "a")
114
+ f.write(target_path + " " + img_folder_path + "\n")
115
+ return True
116
+
117
+ return False
118
+
119
+
120
+ if __name__ == "__main__":
121
+ input_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/datasets_rob/Bridge_v2_test_raw"
122
+ dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v2" # 直接从本地新unzip的获取,怕之前的被xuweiyi改动过
123
+ store_txt_path = "match_info_v2_p1.txt"
124
+ start_idx = 0
125
+ end_idx = 500
126
+
127
+ if os.path.exists(store_txt_path):
128
+ os.remove(store_txt_path)
129
+
130
+ for img_name in sorted(os.listdir(input_path))[start_idx:end_idx]:
131
+ target_path = os.path.join(input_path, img_name)
132
+ print("We are finding for ", target_path)
133
+
134
+ status = search_path(dataset_path, target_path, store_txt_path)
135
+
136
+ if not status:
137
+ print("we cannot find one")
curation_pipeline/prepare_bridge_csv.py ADDED
@@ -0,0 +1,69 @@
1
+ '''
2
+ This file prepares the dataset as a CSV file following the format required by Open-Sora
3
+ '''
4
+
5
+ import os, sys, shutil
6
+ import json
7
+ import csv
8
+
9
+ # Import files from the local folder
10
+ root_path = os.path.abspath('.')
11
+ sys.path.append(root_path)
12
+ # from curation_pipeline.prepare_bridge_v1 import read_bridge_v1
13
+ # from curation_pipeline.prepare_bridge_v2 import read_bridge_v2
14
+
15
+
16
+
17
+ def iter_dataset(dataset_path):
18
+ lists = []
19
+ for sub_folder_name in os.listdir(dataset_path):
20
+ sub_folder_path = os.path.join(dataset_path, sub_folder_name)
21
+
22
+ # Check number of frames
23
+ max_length = len(os.listdir(sub_folder_path))
24
+ for check_idx in range(max_length):
25
+ if not os.path.exists(os.path.join(sub_folder_path, 'im_' + str(check_idx) + '.jpg')): # Should be sequentially exists
26
+ break
27
+ num_frames = check_idx
28
+
29
+ # Read the text
30
+ txt_path = os.path.join(sub_folder_path, "lang.txt")
31
+ f = open(txt_path, "r")
32
+ lang_prompt = f.readline()
33
+
34
+ lists.append([sub_folder_path, lang_prompt, num_frames, 480, 640])
35
+ # break
36
+ return lists
37
+
38
+
39
+
40
+ if __name__ == "__main__":
41
+ v1_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v1_raw"
42
+ v2_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/sanity_check/bridge_v2_raw"
43
+ store_name = "Bridge_raw.csv"
44
+
45
+ if os.path.exists(store_name):
46
+ os.remove(store_name)
47
+
48
+
49
+ # Execute
50
+ full_lists = [["path", "text", "num_frames", "height", "width"]]
51
+
52
+ v1_lists = iter_dataset(v1_dataset_path)
53
+ full_lists.extend(v1_lists)
54
+ v2_lists = iter_dataset(v2_dataset_path)
55
+ full_lists.extend(v2_lists)
56
+ print("Full length is ", len(full_lists))
57
+
58
+
59
+ # Store as csv file
60
+ with open(store_name, 'w') as outfile:
61
+ write = csv.writer(outfile)
62
+ write.writerows(full_lists)
63
+
64
+
65
+
66
+ # with open('output.jsonl', 'w') as outfile:
67
+ # for entry in JSON_file:
68
+ # json.dump(entry, outfile)
69
+ # outfile.write('\n')
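The resulting `Bridge_raw.csv` has the header `path,text,num_frames,height,width`, one row per `images0` folder. A small sketch of reading it back with the standard `csv` module:

```python
# Sketch: read back the Open-Sora-style CSV written by prepare_bridge_csv.py.
import csv

with open("Bridge_raw.csv", newline="") as f:
    reader = csv.DictReader(f)  # header: path, text, num_frames, height, width
    for row in reader:
        print(row["path"], row["text"].strip(), row["num_frames"])
```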
curation_pipeline/prepare_bridge_jsonl.py ADDED
@@ -0,0 +1,47 @@
1
+ '''
2
+ This file prepares the dataset as a JSONL file
3
+ '''
4
+
5
+ import os, sys, shutil
6
+ import json
7
+
8
+ # Import files from the local folder
9
+ root_path = os.path.abspath('.')
10
+ sys.path.append(root_path)
11
+ from curation_pipeline.prepare_bridge_v1 import read_bridge_v1
12
+ from curation_pipeline.prepare_bridge_v2 import read_bridge_v2
13
+
14
+
15
+ if __name__ == "__main__":
16
+ v1_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v1/berkeley"
17
+ v2_dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v2"
18
+ store_name = "store.jsonl"
19
+
20
+ if os.path.exists(store_name):
21
+ os.remove(store_name)
22
+
23
+
24
+ # Execute
25
+ full_lists = []
26
+
27
+ v1_lists = read_bridge_v1(v1_dataset_path, "", copyfile=False)
28
+ full_lists.extend(v1_lists)
29
+ v2_lists = read_bridge_v2(v2_dataset_path, "", copyfile=False)
30
+ full_lists.extend(v2_lists)
31
+ print("Full length is ", len(full_lists))
32
+
33
+
34
+ with open(store_name, 'w') as outfile:
35
+ for list_name in full_lists:
36
+ instance = dict()
37
+ instance["file_path"] = list_name
38
+
39
+ json.dump(instance, outfile)
40
+ outfile.write('\n')
41
+
42
+
43
+
44
+ # with open('output.jsonl', 'w') as outfile:
45
+ # for entry in JSON_file:
46
+ # json.dump(entry, outfile)
47
+ # outfile.write('\n')
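Each line of `store.jsonl` is a single JSON object with a `file_path` key pointing at an `images0` folder. A minimal sketch of reading it back:

```python
# Sketch: iterate over the JSONL index written by prepare_bridge_jsonl.py.
import json

with open("store.jsonl") as f:
    for line in f:
        record = json.loads(line)
        print(record["file_path"])  # path to one .../raw/traj_group0/<traj>/images0 folder
```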
curation_pipeline/prepare_bridge_v1.py ADDED
@@ -0,0 +1,132 @@
1
+ '''
2
+ This repository is used to prepare Bridge dataset
3
+ '''
4
+ import os, sys, shutil
5
+
6
+
7
+ def read_bridge_v1(dataset_path, train_store_path, test_store_path, test_dataset_lists, copyfile=True):
8
+ # copyfile is True when we need to copy the file to the target destination
9
+
10
+ start_idx = 0
11
+ target_lists = []
12
+ prefix_len = len(dataset_path) + 1
13
+
14
+ # Iterate all the folders inside
15
+ for scene_name in sorted(os.listdir(dataset_path)):
16
+ print("We are reading scene ", scene_name)
17
+ scene_dir = os.path.join(dataset_path, scene_name)
18
+ for task_name in sorted(os.listdir(scene_dir)):
19
+ task_dir = os.path.join(scene_dir, task_name)
20
+
21
+ for time_clock in sorted(os.listdir(task_dir)):
22
+ if time_clock == "lmdb":
23
+ continue # Skip lmdb folder
24
+
25
+ time_dir = os.path.join(task_dir, time_clock, "raw", "traj_group0")
26
+ if not os.path.exists(time_dir):
27
+ continue
28
+
29
+ for traj_name in sorted(os.listdir(time_dir)):
30
+ traj_path = os.path.join(time_dir, traj_name)
31
+ if not os.path.isdir(traj_path):
32
+ continue
33
+
34
+ # Directly move policy_out_file_path; just in case there is also valuable information there
35
+ policy_out_file_path = os.path.join(traj_path, "policy_out.pkl")
36
+ if not os.path.exists(policy_out_file_path):
37
+ continue
38
+
39
+ # Check the lang txt file
40
+ lang_txt_file_path = os.path.join(traj_path, "lang.txt")
41
+ if not os.path.exists(lang_txt_file_path):
42
+ continue
43
+
44
+
45
+ for img_name in sorted(os.listdir(traj_path)):
46
+ if img_name != "images0": # Only consider one camera angle
47
+ continue
48
+
49
+ img_folder_path = os.path.join(traj_path, img_name)
50
+ if not os.path.isdir(img_folder_path):
51
+ continue
52
+
53
+ ############################################ Main Process ####################################################
54
+
55
+ # # First Sanity check (Make sure the input source is jpg good)
56
+ # length = len(os.listdir(img_folder_path))
57
+ # status = True
58
+ # for check_idx in range(length):
59
+ # if not os.path.exists(os.path.join(img_folder_path, 'im_' + str(check_idx) + '.jpg')): # Should be sequentially exists
60
+ # status = False
61
+ # break
62
+
63
+ # Now we can copy the folder to our destination
64
+ target_lists.append(img_folder_path)
65
+ if copyfile:
66
+ print("img_folder_path[prefix_len:] is ", img_folder_path[prefix_len:])
67
+ if img_folder_path[prefix_len:] in test_dataset_lists:
68
+ # Store to test set
69
+ target_dir = os.path.join(test_store_path, str(start_idx))
70
+ else:
71
+ # This is training set
72
+ target_dir = os.path.join(train_store_path, str(start_idx))
73
+
74
+ print("Copy " + str(img_folder_path) + " to " + str(target_dir))
75
+ shutil.copytree(img_folder_path, target_dir)
76
+
77
+
78
+ # Sanity check
79
+ length = len(os.listdir(target_dir))
80
+ status = True
81
+ for check_idx in range(length):
82
+ if not os.path.exists(os.path.join(target_dir, 'im_' + str(check_idx) + '.jpg')): # Should be sequentially exists
83
+ status = False
84
+ break
85
+
86
+ if not status:
87
+ # If they didn't have sequential files we need, we will remove and begin again without updating start_idx
88
+ print("This file cannot pass the sanity check. We will remove it!")
89
+ shutil.rmtree(target_dir)
90
+ continue
91
+
92
+ # Move other auxiliary files
93
+ shutil.copy(policy_out_file_path, os.path.join(target_dir, "policy_out.pkl"))
94
+ shutil.copy(lang_txt_file_path, os.path.join(target_dir, "lang.txt"))
95
+
96
+ ################################################################################################################
97
+
98
+ # Update the idx
99
+ start_idx += 1
100
+
101
+ print("We have ", start_idx, " number of cases")
102
+
103
+ # Return a list of file path
104
+ return target_lists
105
+
106
+
107
+
108
+ if __name__ == "__main__":
109
+ dataset_path = "/Path/to/Bridge/raw/bridge_data_v1/berkeley" # Until Bridge v1 - berkeley section
110
+ train_store_path = "/Path/to/Bridge/train/bridge_v1_raw"
111
+ test_store_path = "/Path/to/Bridge/train/bridge_v1_test_raw"
112
+ test_dataset_predefined_path = "test_path.txt" # This will be provided by us
113
+
114
+
115
+ # Make dir if needed
116
+ if os.path.exists(train_store_path):
117
+ shutil.rmtree(train_store_path)
118
+ os.makedirs(train_store_path)
119
+ if os.path.exists(test_store_path):
120
+ shutil.rmtree(test_store_path)
121
+ os.makedirs(test_store_path)
122
+
123
+
124
+ # Read Test dataset path
125
+ test_dataset_lists = []
126
+ read_file = open(test_dataset_predefined_path, "r")
127
+ for line in read_file.readlines():
128
+ test_dataset_lists.append(line[:-1])
129
+ print("test_dataset_lists is ", test_dataset_lists)
130
+
131
+
132
+ read_bridge_v1(dataset_path, train_store_path, test_store_path, test_dataset_lists)
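`test_path.txt` is expected to hold one path per line, relative to `dataset_path` (the script strips the dataset prefix via `prefix_len` before the membership check). A small sketch of that split logic with a hypothetical entry:

```python
# Sketch: the train/test split check used above, with a hypothetical test_path.txt entry.
dataset_path = "/Path/to/Bridge/raw/bridge_data_v1/berkeley"
prefix_len = len(dataset_path) + 1

# One line of test_path.txt: a path relative to dataset_path (hypothetical scene/task names).
test_dataset_lists = ["toykitchen1/put_cup_on_plate/2021-05-20_10-00-00/raw/traj_group0/traj0/images0"]

img_folder_path = dataset_path + "/" + test_dataset_lists[0]
is_test = img_folder_path[prefix_len:] in test_dataset_lists
print("goes to test set" if is_test else "goes to train set")
```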
curation_pipeline/prepare_bridge_v2.py ADDED
@@ -0,0 +1,139 @@
1
+ '''
2
+ This repository is used to prepare Bridge dataset
3
+ '''
4
+ import os, sys, shutil
5
+
6
+
7
+
8
+ def read_bridge_v2(dataset_path, train_store_path, test_store_path, test_dataset_lists, copyfile=True):
9
+ # copyfile is True most of the time
10
+
11
+ start_idx = 0
12
+ target_lists = []
13
+ prefix_len = len(dataset_path) + 1
14
+
15
+ # Iterate all the folders inside
16
+ for scene_name in sorted(os.listdir(dataset_path)):
17
+ print("We are reading scene ", scene_name)
18
+ scene_dir = os.path.join(dataset_path, scene_name)
19
+
20
+ for task_name in sorted(os.listdir(scene_dir)):
21
+ task_dir = os.path.join(scene_dir, task_name)
22
+
23
+ for order_name in sorted(os.listdir(task_dir)):
24
+ order_dir = os.path.join(task_dir, order_name)
25
+
26
+ for time_clock in sorted(os.listdir(order_dir)):
27
+ if time_clock == "lmdb":
28
+ continue # Skip lmdb folder
29
+
30
+ time_dir = os.path.join(order_dir, time_clock, "raw", "traj_group0")
31
+ if not os.path.exists(time_dir):
32
+ print("time_dir does not exist for ", time_dir)
33
+ continue
34
+
35
+ for traj_name in sorted(os.listdir(time_dir)):
36
+ traj_path = os.path.join(time_dir, traj_name)
37
+ if not os.path.isdir(traj_path):
38
+ print("traj_path does not exist for ", traj_path)
39
+ continue
40
+
41
+ # Directly move policy_out_file_path; just in case there is also valuable information there
42
+ policy_out_file_path = os.path.join(traj_path, "policy_out.pkl")
43
+ if not os.path.exists(policy_out_file_path):
44
+ continue
45
+
46
+ # Check the lang txt file
47
+ lang_txt_file_path = os.path.join(traj_path, "lang.txt")
48
+ if not os.path.exists(lang_txt_file_path):
49
+ continue
50
+
51
+
52
+ for img_name in sorted(os.listdir(traj_path)):
53
+ if img_name != "images0": # Only consider one camera angle
54
+ continue
55
+
56
+ img_folder_path = os.path.join(traj_path, img_name)
57
+ if not os.path.isdir(img_folder_path):
58
+ print("img_folder_path does not exist for ", img_folder_path)
59
+ continue
60
+
61
+ ############################################ Main Process ####################################################
62
+
63
+ # # First Sanity check (Make sure the input source is jpg good)
64
+ # length = len(os.listdir(img_folder_path))
65
+ # status = True
66
+ # for check_idx in range(length):
67
+ # if not os.path.exists(os.path.join(img_folder_path, 'im_' + str(check_idx) + '.jpg')): # Should be sequentially exists
68
+ # status = False
69
+ # break
70
+
71
+ # Record this folder as a candidate sample
72
+ target_lists.append(img_folder_path)
73
+ if copyfile:
74
+ print("img_folder_path[prefix_len:] is ", img_folder_path[prefix_len:])
75
+ if img_folder_path[prefix_len:] in test_dataset_lists:
76
+ # Store to test set
77
+ target_dir = os.path.join(test_store_path, str(start_idx))
78
+ else:
79
+ # This is training set
80
+ target_dir = os.path.join(train_store_path, str(start_idx))
81
+
82
+ # Now we can copy the folder to our destination
83
+ print("Copy " + str(img_folder_path) + " to " + str(os.path.join(train_store_path, str(start_idx))))
84
+ shutil.copytree(img_folder_path, target_dir)
85
+
86
+ # Sanity check
87
+ length = len(os.listdir(target_dir))
88
+ status = True
89
+ for check_idx in range(length):
90
+ if not os.path.exists(os.path.join(target_dir, 'im_' + str(check_idx) + '.jpg' )): # Frames should exist sequentially
91
+ status = False
92
+ break
93
+
94
+ if not status:
95
+ # If the copied folder is missing sequential frames, remove it and continue without updating start_idx
96
+ print("This file cannot pass the sanity check. We will remove it!")
97
+ shutil.rmtree(target_dir)
98
+ continue
99
+
100
+ # Move other auxiliary files
101
+ shutil.copy(policy_out_file_path, os.path.join(target_dir, "policy_out.pkl"))
102
+ shutil.copy(lang_txt_file_path, os.path.join(target_dir, "lang.txt"))
103
+
104
+ # Update the idx
105
+ start_idx += 1
106
+
107
+ print("We have ", start_idx)
108
+
109
+ # Return a list of file paths
110
+ return target_lists
111
+
112
+
113
+
114
+ if __name__ == "__main__":
115
+ dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v2"
116
+ train_store_path = "../sanity_check/bridge_v2_raw"
117
+ test_store_path = "../sanity_check/bridge_v2_test_raw"
118
+ test_dataset_predefined_path = "test_path_v2.txt"
119
+
120
+
121
+ # Make dir if needed
122
+ if os.path.exists(train_store_path):
123
+ shutil.rmtree(train_store_path)
124
+ os.makedirs(train_store_path)
125
+ if os.path.exists(test_store_path):
126
+ shutil.rmtree(test_store_path)
127
+ os.makedirs(test_store_path)
128
+
129
+ # Read Test dataset path
130
+ test_dataset_lists = []
131
+ read_file = open(test_dataset_predefined_path, "r")
132
+ for line in read_file.readlines():
133
+ test_dataset_lists.append(line[:-1])
134
+ print("test_dataset_lists is ", test_dataset_lists)
135
+
136
+
137
+ read_bridge_v2(dataset_path, train_store_path, test_store_path, test_dataset_lists)
138
+
139
+
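For reference, the nested loops above walk a fixed directory depth (scene / task / order / timestamp / raw / traj_group0 / traj* / images0) and keep only trajectories that also carry policy_out.pkl and lang.txt. A compact glob-based sketch that yields the same image folders is shown below; it is not used by the script itself and is only meant for quickly inspecting a raw download.

    import glob, os

    def iter_bridge_v2_image_folders(dataset_path):
        # Same depth as the nested loops above:
        # scene / task / order / timestamp / raw / traj_group0 / traj* / images0
        pattern = os.path.join(dataset_path, "*", "*", "*", "*",
                               "raw", "traj_group0", "*", "images0")
        for img_folder in sorted(glob.glob(pattern)):
            traj_dir = os.path.dirname(img_folder)
            # Keep only trajectories that also carry the auxiliary files copied above
            if os.path.exists(os.path.join(traj_dir, "policy_out.pkl")) and \
               os.path.exists(os.path.join(traj_dir, "lang.txt")):
                yield img_folder

    # e.g. print(sum(1 for _ in iter_bridge_v2_image_folders("/path/to/bridge_data_v2")))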
curation_pipeline/select_frame_with_this_that.py ADDED
@@ -0,0 +1,421 @@
1
+ '''
2
+ This repository is used to prepare Bridge dataset with this that conditioning
3
+ '''
4
+ import os, sys, shutil
5
+ import pickle
6
+ from ultralytics import YOLO
7
+ from PIL import Image, ImageDraw
8
+ import numpy as np
9
+ import cv2
10
+ import math
11
+ import collections
12
+ from segment_anything import SamAutomaticMaskGenerator, SamPredictor, sam_model_registry
13
+
14
+
15
+ def show_mask(mask, random_color=False):
16
+ if random_color:
17
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
18
+ else:
19
+ color = np.array([30/255, 144/255, 255/255, 0.6])
20
+ h, w = mask.shape[-2:]
21
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
22
+
23
+ return mask_image * 255
24
+
25
+
26
+ def read_center_point(model, img_path, do_visualization, store_path):
27
+
28
+ action_img = Image.open(img_path)
29
+ prediction = model.predict(source=action_img, save=False)[0] # Only 1 frame
30
+
31
+ if not hasattr(prediction, "boxes"):
32
+ print("Detection Fail: We cannot have boxes attribute")
33
+ return None, None # None means detection failed; skip this case
34
+
35
+ # Save at a temporary place for visualization
36
+ if do_visualization:
37
+ prediction.save(filename=store_path)
38
+
39
+
40
+ bounding_boxes = prediction.boxes.xywh
41
+ num, dim = bounding_boxes.shape
42
+ assert(dim == 4)
43
+
44
+ # Collect the center points of all bounding boxes
45
+ edge_point_cord = []
46
+ center_points = []
47
+ for idx in range(num):
48
+ x, y, w, h = bounding_boxes[idx].detach().cpu().numpy()
49
+ center_point = [x, y] # TODO: consider y+(h/4); empirically, shifting down by 25% of the box height usually helps
50
+
51
+ edge_point_cord.extend([ (x+w//2, y+h//2), (x-w//2, y+h//2), (x-w//2, y-h//2), (x+w//2, y-h//2) ])
52
+
53
+
54
+ if w <= 15 or h <= 15: # If a bounding box is too small, we will disregard this case
55
+ return None, None
56
+
57
+ # Calculate the distance between current one and previous points for sanity check
58
+ for point in center_points: # Check all previous points
59
+ give_up_threshold = 90
60
+ if center_point[0] - point[0] >= give_up_threshold:
61
+ print("Two points are too far away and neglect the case")
62
+ return None, None
63
+ if center_point[1] - point[1] >= give_up_threshold:
64
+ print("Two points are too far away and neglect the case")
65
+ return None, None
66
+
67
+ # Append to the list
68
+ center_points.append(center_point)
69
+
70
+
71
+ if len(center_points) == 0 or len(center_points) > 2:
72
+ print("Detection Fail: We cannot detect bounding boxes")
73
+ return None, None
74
+
75
+ # Calculating the average distance among center_points
76
+ if len(center_points) == 2:
77
+ first_box, second_box = center_points
78
+
79
+ center_x = (first_box[0] + second_box[0]) / 2
80
+ center_y = (first_box[1] + second_box[1]) / 2
81
+
82
+ distance = math.sqrt(abs(first_box[0] - second_box[0])**2 + abs(first_box[1] - second_box[1])**2)
83
+
84
+ return [center_x, center_y, distance], edge_point_cord
85
+
86
+ return [*center_points[0], 100], edge_point_cord # With a single box the distance would be 0; we use a higher placeholder value to avoid 2-1-2 box detections across consecutive frames
87
+
88
+
89
+
90
+ def detect_gripper(gripper_detection_model, input_dir, action_start, action_end, do_visualization, store_dir, sample_failure_collect_folder=None):
91
+
92
+ # Process the first point first (it matters more, so we repeat it 3 times); then quickly process the last point
93
+
94
+ # Process the first action frame by iterating over the next three frames and choosing the closest one
95
+ first_center_points = []
96
+ edge_point_cords = []
97
+ for idx in range(3): # Repeat 3 times
98
+ action_start_path = os.path.join(input_dir, "im_"+str(action_start + idx)+".jpg")
99
+ first_center_point, edge_point_cord = read_center_point(gripper_detection_model, action_start_path, do_visualization, os.path.join(store_dir, "contact_first"+str(idx)+".jpg")) # The first frame
100
+
101
+ if idx == 0 and first_center_point is None:
102
+ message = "Cannot find the first contact point!"
103
+
104
+ print("The contact point we cannot detect is at ", action_start_path)
105
+ if sample_failure_collect_folder != "":
106
+ shutil.copyfile(action_start_path, os.path.join(sample_failure_collect_folder, str(len(os.listdir(sample_failure_collect_folder)))+".jpg") )
107
+
108
+ return (None, None, message)
109
+
110
+ if first_center_point is not None:
111
+ first_center_points.append([action_start + idx, first_center_point])
112
+
113
+ # Add edge points
114
+ print(edge_point_cord)
115
+ edge_point_cords.extend(edge_point_cord) # Note: simply extending with every point may not be robust for some edge cases
116
+
117
+
118
+ # Select the candidate whose two detected boxes are closest to each other
119
+ first_center_points.sort(key=lambda x: x[1][2])
120
+ first_center_point = first_center_points[0][1][:2]
121
+ start_idx = first_center_points[0][0]
122
+ print("first_center_point is " + str(first_center_point) + " with idx " + str(start_idx))
123
+ order_idx = [start_idx, action_end]
124
+
125
+
126
+ # Find xmin, ymin, xmax, ymax over all collected edge points; this serves as the bounding box for SAM
127
+ edge_point_cords.sort(key=lambda x: x[0])
128
+ xmin = int(edge_point_cords[0][0])
129
+ xmax = int(edge_point_cords[-1][0])
130
+
131
+ edge_point_cords.sort(key=lambda x: x[1])
132
+ ymin = int(edge_point_cords[0][1])
133
+ ymax = int(edge_point_cords[-1][1])
134
+
135
+ bbox_info = (xmin, xmax, ymin, ymax)
136
+
137
+
138
+ # Process the last action frame
139
+ action_end_path = os.path.join(input_dir, "im_"+str(action_end)+".jpg")
140
+ last_center_point, edge_point_cord = read_center_point(gripper_detection_model, action_end_path, do_visualization, os.path.join(store_dir, "contact_last.jpg")) # The last frame
141
+ if last_center_point is None:
142
+ message = "Cannot find the last contact point!"
143
+
144
+ print("The contact point we cannot detect is at ", action_start_path)
145
+ if sample_failure_collect_folder != "":
146
+ store_name = str(len(os.listdir(sample_failure_collect_folder))) + ".jpg"
147
+ shutil.copyfile(action_end_path, os.path.join(sample_failure_collect_folder, store_name) )
148
+
149
+ return (None, bbox_info, message)
150
+ last_center_point = last_center_point[:2]
151
+
152
+
153
+ # Check whether the two center points are too close; if so, merge them into one point
154
+ merge_threshold = 30
155
+ if math.sqrt((first_center_point[0] - last_center_point[0])**2 + (first_center_point[1] - last_center_point[1])**2) <= merge_threshold:
156
+ print("Merge two points to one!")
157
+ message = "Success!"
158
+ return ([[first_center_point], order_idx], bbox_info, message)
159
+
160
+
161
+ # Return needed information
162
+ message = "Success!"
163
+ return ([[first_center_point, last_center_point], order_idx], bbox_info, message)
164
+
165
+
166
+
167
+
168
+ def visualize_this_that(base_img, bbox_info, this_that_points):
169
+
170
+ # Draw a green dot only for the start point
171
+ for point in this_that_points:
172
+ print("point is ", point)
173
+ target_horizontal, target_vertical = point
174
+ target_horizontal, target_vertical = int(target_horizontal), int(target_vertical)
175
+
176
+ dot_range = 3
177
+ for i in range(-1*dot_range, dot_range+1):
178
+ for j in range(-1*dot_range, dot_range+1):
179
+ dil_vertical, dil_horizontal = target_vertical + i, target_horizontal + j
180
+ if (0 <= dil_vertical and dil_vertical < base_img.shape[0]) and (0 <= dil_horizontal and dil_horizontal < base_img.shape[1]):
181
+ base_img[dil_vertical, dil_horizontal, :] = [0, 128, 0]
182
+ # else:
183
+ # # print("The traj is out of boundary!!!!!!!!!!!!!!!!!!!!! and we won't consider it") # 现在
184
+ # return (False, base_img)
185
+
186
+ # Draw the bounding box
187
+ xmin, xmax, ymin, ymax = bbox_info
188
+ base_img = cv2.rectangle(base_img, (xmin, ymin), (xmax, ymax), color=(0,0,255), thickness=2)
189
+
190
+ return (True, base_img)
191
+
192
+
193
+
194
+ def manage_seq_range(input_dir, store_dir, sample_failure_collect_folder, total_frames_needed,
195
+ max_original_input_tolerate, gripper_detection_model, sam_predictor, do_visualization):
196
+
197
+ # Find valid image lists
198
+ num_frames_input = 0
199
+ for file_name in os.listdir(input_dir):
200
+ if file_name.startswith("im_"):
201
+ num_frames_input += 1
202
+ for idx in range(num_frames_input):
203
+ target_path = os.path.join(input_dir, "im_"+str(idx)+".jpg")
204
+ if not os.path.exists(target_path):
205
+ print("We don't have ", target_path)
206
+ message = "Invalid error" # Make sure that every file in this order is existed, this is quite important
207
+ return (False, message)
208
+
209
+
210
+ if num_frames_input > max_original_input_tolerate:
211
+ message = "The number of frames is too long for constructing the sequence length needed"
212
+ return (False, message)
213
+
214
+ if num_frames_input < total_frames_needed:
215
+ message = "The number of frames is too short for constructing the sequence length needed"
216
+ return (False, message)
217
+
218
+
219
+
220
+ # Prepare this and that based on policy_out.pkl
221
+ policy_out_file_path = os.path.join(input_dir, "policy_out.pkl")
222
+ with open(policy_out_file_path, "rb") as f:
223
+ policy = pickle.load(f)
224
+
225
+ actions_codes = []
226
+ action_start, action_end = None, None
227
+ for idx, item in enumerate(policy):
228
+ action_value = item["actions"][-1]
229
+ if action_start is None and action_value == 0.0:
230
+ action_start = idx
231
+
232
+ if (action_start is not None) and (action_end is None) and (action_value == 1.0):
233
+ action_end = idx # Record the first 1.0 that appears after the first 0.0
234
+ actions_codes.append(action_value)
235
+
236
+ if action_start is None or action_end is None:
237
+ message = "We cannot read an action_start or action_end code!"
238
+ return (False, message) # Both start and end actions are required (they usually come as a pair)
239
+
240
+ print("actions_codes is ", actions_codes)
241
+ print("the start end idx we read is ", action_start, action_end)
242
+
243
+
244
+ # Detect the gripper (should return a list with exactly two x,y coordinate points)
245
+ detection_return_info, bbox_info, detect_message = detect_gripper(
246
+ gripper_detection_model,
247
+ input_dir,
248
+ action_start,
249
+ action_end,
250
+ do_visualization = do_visualization,
251
+ store_dir = store_dir,
252
+ sample_failure_collect_folder = sample_failure_collect_folder,
253
+ )
254
+ if detection_return_info is None:
255
+ return (False, detect_message)
256
+
257
+ detected_point, old_seq_idx = detection_return_info
258
+ print("detected_point is ", detected_point)
259
+
260
+
261
+ # Visualize if needed
262
+ base_img = cv2.imread(os.path.join(input_dir, "im_0.jpg"))
263
+ if do_visualization:
264
+ status, visual_img = visualize_this_that(base_img, bbox_info, detected_point)
265
+ if status:
266
+ cv2.imwrite(os.path.join(store_dir, "visualization.png"), visual_img)
267
+
268
+
269
+
270
+ # SAM process based on bbox_info
271
+ xmin, xmax, ymin, ymax = bbox_info
272
+ sam_predictor.set_image(np.uint8(base_img))
273
+ positive_point_cords = np.array([[ int(detected_point[0][0]), int(detected_point[0][1]) ]])
274
+ positive_point_cords = np.array(positive_point_cords)
275
+ positive_point_labels = np.ones(len(positive_point_cords))
276
+
277
+ # Predict the mask based on the point and bounding box designed
278
+ masks, scores, logits = sam_predictor.predict(
279
+ point_coords = positive_point_cords,
280
+ point_labels = positive_point_labels,
281
+ box = np.array([xmin, ymin, xmax, ymax])[None, :],
282
+ multimask_output = False,
283
+ )
284
+ print(scores)
285
+ for mask_idx, mask in enumerate(masks):
286
+ mask_img = show_mask(mask)
287
+ cv2.imwrite(os.path.join(store_dir, "mask_" + str(mask_idx) + ".png"), mask_img)
288
+
289
+
290
+
291
+ ################################ Move the img ######################################
292
+ # Calculate needed parameters
293
+ division_factor = num_frames_input // total_frames_needed
294
+ remain_frames = (num_frames_input % total_frames_needed) - 1 # -1 for adaptation
295
+
296
+ # Define the gap
297
+ gaps = [division_factor for _ in range(total_frames_needed-1)]
298
+ for idx in range(remain_frames):
299
+ if idx % 2 == 0:
300
+ gaps[idx//2] += 1 # Start to end order
301
+ else:
302
+ gaps[-1*(1+(idx//2))] += 1 # End to start order
303
+
304
+ # Map the gap to the specific orders
305
+ idx_orders = [1] # Starting from 1 (a one-frame shift) should not matter much
306
+ for global_idx, gap in enumerate(gaps):
307
+ idx_orders.append(idx_orders[-1] + gap)
308
+ if idx_orders[-1] >= num_frames_input:
309
+ message = "Invalid error"
310
+ return (False, message)
311
+ # assert(idx_orders[-1] < num_frames_input)
312
+ assert(len(idx_orders) == total_frames_needed)
313
+
314
+
315
+ # Copy the essential files first
316
+ for global_idx, cur_idx in enumerate(idx_orders):
317
+ source_path = os.path.join(input_dir, "im_"+str(cur_idx)+".jpg")
318
+ destination_path = os.path.join(store_dir, "im_"+str(global_idx)+".jpg")
319
+
320
+ if not os.path.exists(source_path): # Theoretically, source_path must exist
321
+ message = "We couldn't find the source path. Theoretically, source_path must exists!" # 有一种可能就是我们丢失了一些地方,在cp或者本来就没有,记得统计数量
322
+ return (False, message)
323
+
324
+ shutil.copyfile(source_path, destination_path)
325
+
326
+ # Map order_idx to the cropped version
327
+ mapped_seq_idx = []
328
+ for old_idx in old_seq_idx:
329
+ tmp = []
330
+ for tmp_idx, new_idx in enumerate(range(len(idx_orders))):
331
+ tmp.append((tmp_idx, abs(old_idx - idx_orders[new_idx])))
332
+ # Sort by the smallest distance
333
+ tmp.sort(key=lambda x: x[1])
334
+ mapped_seq_idx.append(tmp[0][0])
335
+
336
+ print("Before the idx is ", old_seq_idx)
337
+ print("mapped idx is ", mapped_seq_idx)
338
+
339
+
340
+ # Write the information to new destination
341
+ f = open(os.path.join(store_dir, "data.txt"), "a")
342
+ f.write(str(mapped_seq_idx[0]) + " " + str(detected_point[0][0]) + " " + str(detected_point[0][1]) + "\n")
343
+ if len(detected_point) == 2: # Two points excluding the last idx
344
+ f.write(str(mapped_seq_idx[1]) + " " + str(detected_point[1][0]) + " " + str(detected_point[1][1]) + "\n")
345
+ f.close()
346
+
347
+
348
+ # Move lang.txt file
349
+ shutil.copyfile(os.path.join(input_dir, 'lang.txt'), os.path.join(store_dir, 'lang.txt'))
350
+
351
+
352
+ message = "Success!"
353
+ return (True, message)
354
+
355
+
356
+
357
+
358
+ if __name__ == "__main__":
359
+
360
+ # General storage setting
361
+ dataset_path = "../datasets_rob/Bridge_v2_raw"
362
+ destination_path = "../sanity_check/bridge_v2_TT14_longer_tolerance"
363
+ sample_failure_collect_folder = "" # This is to collect cases that fail for active learning
364
+
365
+ total_frames_needed = 14
366
+ max_original_input_tolerate = 56 # 40 for 14 fps; 60 for 25fps;
367
+ do_visualization = True
368
+
369
+
370
+ # YOLO model init
371
+ yolo_pretrained_path = "pretrained/yolov8n_best.pt"
372
+ gripper_detection_model = YOLO("yolov8n.yaml") # build a new model from scratch
373
+ gripper_detection_model = YOLO(yolo_pretrained_path) # load a pretrained model (recommended for training)
374
+
375
+ # SAM model init
376
+ model_type = "vit_h"
377
+ sam_pretrained_path = "pretrained/sam_vit_h_4b8939.pth"
378
+ sam = sam_model_registry[model_type](checkpoint=sam_pretrained_path).to(device="cuda")
379
+ sam_predictor = SamPredictor(sam) # There are many settings available here
380
+
381
+
382
+ # Make dir if needed
383
+ if os.path.exists(destination_path):
384
+ shutil.rmtree(destination_path)
385
+ os.makedirs(destination_path)
386
+
387
+ # Prepare the folder to collect failure cases
388
+ if sample_failure_collect_folder != "":
389
+ if os.path.exists(sample_failure_collect_folder):
390
+ shutil.rmtree(sample_failure_collect_folder)
391
+ os.makedirs(sample_failure_collect_folder)
392
+
393
+
394
+
395
+ # Collect the message
396
+ message_dict = collections.defaultdict(int)
397
+
398
+
399
+ store_idx = 0
400
+ for folder_name in sorted(os.listdir(dataset_path)):
401
+ input_folder_path = os.path.join(dataset_path, folder_name)
402
+ store_folder_path = os.path.join(destination_path, "0"*(6-len(str(store_idx)))+str(store_idx))
403
+ print("We are processing ", input_folder_path)
404
+
405
+ # Prepare store_folder_path folder
406
+ os.makedirs(store_folder_path)
407
+
408
+ status, message = manage_seq_range(input_folder_path, store_folder_path, sample_failure_collect_folder, total_frames_needed, max_original_input_tolerate, gripper_detection_model, sam_predictor, do_visualization)
409
+ if status: # We only update store_idx when this sample is successfully written
410
+ store_idx += 1
411
+ else:
412
+ print("This status failed! Message: " + message)
413
+ shutil.rmtree(store_folder_path)
414
+ # break # For debug
415
+
416
+ # Collect the info into the dict
417
+ message_dict[message] += 1
418
+
419
+ print("We have " + str(store_idx) + " valid dataset")
420
+ print("message_dict info is ", message_dict)
421
+
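The data.txt file written by manage_seq_range holds one point per line in the form "frame_idx x y" (x horizontal, y vertical, in the original frame resolution); the first line is the "this" (contact) point and the optional second line is the "that" (target) point. A small reader sketch is given below; the loaders later in this commit do their own parsing.

    def read_data_txt(data_txt_path):
        """Parse the 'frame_idx x y' lines written by manage_seq_range above."""
        points = []
        with open(data_txt_path, "r") as f:
            for line in f:
                parts = line.split()
                if len(parts) != 3:
                    continue
                # frame index, horizontal (x), vertical (y)
                points.append((int(parts[0]), float(parts[1]), float(parts[2])))
        return points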
curation_pipeline/tracking_by_keypoint.py ADDED
@@ -0,0 +1,136 @@
1
+ import os, shutil, sys
2
+ import argparse
3
+ import gdown
4
+ import cv2
5
+ import numpy as np
6
+ import os
7
+ import sys
8
+ import requests
9
+ import json
10
+ import torchvision
11
+ import torch
12
+ import psutil
13
+ import time
14
+ try:
15
+ from mmcv.cnn import ConvModule
16
+ except:
17
+ os.system("mim install mmcv")
18
+
19
+
20
+ # Import files from the local folder
21
+ root_path = os.path.abspath('.')
22
+ sys.path.append(root_path)
23
+ from track_anything_code.model import TrackingAnything
24
+ from track_anything_code.track_anything_module import get_frames_from_video, download_checkpoint, parse_augment, sam_refine, vos_tracking_video
25
+ from scripts.compress_videos import compress_video
26
+
27
+
28
+
29
+
30
+ if __name__ == "__main__":
31
+ dataset_path = "Bridge_v1_TT14"
32
+ video_name = "combined.mp4"
33
+ verbose = True # If verbose, the per-frame masks are also written out for inspection
34
+
35
+
36
+ ################################################## Model setup ####################################################
37
+ # check and download checkpoints if needed
38
+ sam_checkpoint = "sam_vit_h_4b8939.pth"
39
+ sam_checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
40
+ xmem_checkpoint = "XMem-s012.pth"
41
+ xmem_checkpoint_url = "https://github.com/hkchengrex/XMem/releases/download/v1.0/XMem-s012.pth"
42
+
43
+
44
+ folder ="./pretrained"
45
+ SAM_checkpoint = download_checkpoint(sam_checkpoint_url, folder, sam_checkpoint)
46
+ xmem_checkpoint = download_checkpoint(xmem_checkpoint_url, folder, xmem_checkpoint)
47
+
48
+ # argument
49
+ args = parse_augment()
50
+ args.device = "cuda" # Any GPU is ok
51
+
52
+ # Initialize the Track model
53
+ track_model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, args)
54
+ ###################################################################################################################
55
+
56
+
57
+ # Iterate all files under the folder
58
+ for sub_folder_name in sorted(os.listdir(dataset_path)):
59
+
60
+ ################################################## Setting ####################################################
61
+ sub_folder_path = os.path.join(dataset_path, sub_folder_name)
62
+
63
+ click_state = [[],[]]
64
+ interactive_state = {
65
+ "inference_times": 0,
66
+ "negative_click_times" : 0,
67
+ "positive_click_times": 0,
68
+ "mask_save": args.mask_save,
69
+ "multi_mask": {
70
+ "mask_names": [],
71
+ "masks": []
72
+ },
73
+ "track_end_number": None,
74
+ "resize_ratio": 1
75
+ }
76
+ ###################################################################################################################
77
+
78
+
79
+ video_path = os.path.join(sub_folder_path, video_name)
80
+ if not os.path.exists(video_path):
81
+ print("We cannot find the path of the ", video_path, " and we will compress one")
82
+ status = compress_video(sub_folder_path, video_name)
83
+ if not status:
84
+ print("We still cannot generate a video")
85
+ continue
86
+
87
+ # Read video state
88
+ video_state = {
89
+ "user_name": "",
90
+ "video_name": "",
91
+ "origin_images": None,
92
+ "painted_images": None,
93
+ "masks": None,
94
+ "inpaint_masks": None,
95
+ "logits": None,
96
+ "select_frame_number": 0,
97
+ "fps": 30
98
+ }
99
+ video_state, template_frame = get_frames_from_video(video_path, video_state, track_model)
100
+
101
+
102
+
103
+ ########################################################## Get the sam point based on the data.txt ###########################################################
104
+ data_txt_path = os.path.join(sub_folder_path, "data.txt")
105
+ if not os.path.exists(data_txt_path):
106
+ print("We cannot find data.txt in this folder")
107
+ continue
108
+
109
+ data_file = open(data_txt_path, 'r')
110
+ lines = data_file.readlines()
111
+ frame_idx, horizontal, vertical = lines[0][:-2].split(' ') # Only read the first point
112
+ point_cord = [int(float(horizontal)), int(float(vertical))]
113
+
114
+ # Process by SAM
115
+ track_model.samcontroler.sam_controler.reset_image() # Reset the image to clean history
116
+ painted_image, video_state, interactive_state, operation_log = sam_refine(track_model, video_state, "Positive", click_state, interactive_state, point_cord)
117
+ ################################################################################################################################################################
118
+
119
+
120
+
121
+ ######################################################### Get the tracking output ########################################################################
122
+
123
+ # Track the video for processing
124
+ segment_output_path = os.path.join(sub_folder_path, "segment_output.gif")
125
+ video_state = vos_tracking_video(track_model, segment_output_path, video_state, interactive_state, mask_dropdown=[])[0] # mask_dropdown is empty now
126
+
127
+ # Extract the mask needed by us for further point calculating
128
+ masks = video_state["masks"] # In the range [0, 1]
129
+
130
+ if verbose:
131
+ for idx, mask in enumerate(masks):
132
+ cv2.imwrite(os.path.join(sub_folder_path, "mask"+str(idx)+".png"), mask*255)
133
+
134
+ ##############################################################################################################################################################
135
+
136
+
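The script above stops after writing the per-frame masks out for "further point calculating". A hypothetical follow-up step is sketched below: collapsing each binary mask into a single centroid point per frame. This is only an assumption about the downstream computation; it is not part of this file.

    import numpy as np

    def mask_centroids(masks):
        # Hypothetical post-processing sketch: collapse each binary mask
        # (values in [0, 1]) into a single (horizontal, vertical) point.
        centroids = []
        for mask in masks:
            ys, xs = np.nonzero(mask > 0.5)
            if len(xs) == 0:
                centroids.append(None)          # tracking lost in this frame
            else:
                centroids.append((float(xs.mean()), float(ys.mean())))
        return centroids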
data_loader/video_dataset.py ADDED
@@ -0,0 +1,323 @@
1
+ import os, sys
2
+ import json
3
+ import cv2
4
+ import math
5
+ import shutil
6
+ import numpy as np
7
+ import random
8
+ import collections
9
+ from PIL import Image
10
+ import torch
11
+ from torch.utils.data import Dataset
12
+
13
+ # Import files from the local folder
14
+ root_path = os.path.abspath('.')
15
+ sys.path.append(root_path)
16
+ from utils.img_utils import resize_with_antialiasing, numpy_to_pt
17
+
18
+
19
+
20
+ def get_video_frames(config, video_frame_path, flip = False):
21
+
22
+ video_seq_length = config["video_seq_length"]
23
+
24
+ # Calculate needed parameters
25
+ num_frames_input = 0
26
+ for file_name in os.listdir(video_frame_path):
27
+ if file_name.startswith("im_"):
28
+ num_frames_input += 1
29
+ total_frames_needed = video_seq_length
30
+ division_factor = num_frames_input // total_frames_needed
31
+ remain_frames = (num_frames_input % total_frames_needed) - 1 # -1 for adaptation
32
+
33
+
34
+ # Define the gap
35
+ gaps = [division_factor for _ in range(total_frames_needed-1)]
36
+ for idx in range(remain_frames):
37
+ if idx % 2 == 0:
38
+ gaps[idx//2] += 1 # Start to end order
39
+ else:
40
+ gaps[-1*(1+(idx//2))] += 1 # End to start order
41
+
42
+
43
+ # Find needed file
44
+ needed_img_path = []
45
+ cur_idx = 0
46
+ for gap in gaps:
47
+ img_path = os.path.join(video_frame_path, "im_" + str(cur_idx) + ".jpg")
48
+ needed_img_path.append(img_path)
49
+
50
+ # Update the idx
51
+ cur_idx += gap
52
+ # Append the last one
53
+ img_path = os.path.join(video_frame_path, "im_" + str(cur_idx) + ".jpg")
54
+ needed_img_path.append(img_path)
55
+
56
+
57
+ # Read all img_path based on the order
58
+ video_frames = []
59
+ for img_path in needed_img_path:
60
+ if not os.path.exists(img_path):
61
+ print("We don't have ", img_path)
62
+ frame = cv2.imread(img_path)
63
+
64
+ try:
65
+ frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
66
+ except Exception:
67
+ print("The exception places is ", img_path)
68
+
69
+ # Resize frames
70
+ frame = cv2.resize(frame, (config["width"], config["height"]), interpolation = cv2.INTER_CUBIC)
71
+
72
+ # Flip aug
73
+ if flip:
74
+ frame = np.fliplr(frame)
75
+
76
+ # Collect frames
77
+ video_frames.append(np.expand_dims(frame, axis=0)) # The frame is already RGB, there is no need to convert here.
78
+
79
+
80
+ # Concatenate
81
+ video_frames = np.concatenate(video_frames, axis=0)
82
+ assert(len(video_frames) == video_seq_length)
83
+
84
+ return video_frames
85
+
86
+
87
+
88
+ def tokenize_captions(prompt, tokenizer, config, is_train=True):
89
+ '''
90
+ Tokenize the text prompt with the prepared tokenizer from SD 2.1
91
+ '''
92
+
93
+ captions = []
94
+ if random.random() < config["empty_prompts_proportion"]:
95
+ captions.append("")
96
+ elif isinstance(prompt, str):
97
+ captions.append(prompt)
98
+ elif isinstance(prompt, (list, np.ndarray)):
99
+ # take a random caption if there are multiple
100
+ captions.append(random.choice(prompt) if is_train else prompt[0])
101
+ else:
102
+ raise ValueError(
103
+ f"Caption column should contain either strings or lists of strings."
104
+ )
105
+
106
+ inputs = tokenizer(
107
+ captions, max_length = tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
108
+ )
109
+ return inputs.input_ids[0]
110
+
111
+
112
+
113
+ class Video_Dataset(Dataset):
114
+ '''
115
+ Video Dataset to load sequential frames for training with needed pre-processing
116
+ '''
117
+
118
+ def __init__(self, config, device, normalize=True, tokenizer=None):
119
+
120
+ # Attribute variables
121
+ self.config = config
122
+ self.device = device
123
+ self.normalize = normalize
124
+ self.tokenizer = tokenizer
125
+
126
+ # Obtain values
127
+ self.video_seq_length = config["video_seq_length"]
128
+ self.height = config["height"]
129
+ self.width = config["width"]
130
+
131
+ # Process data
132
+ self.video_lists = []
133
+ stats_analysis = collections.defaultdict(int)
134
+ print("Process all files to check valid datasets....")
135
+ for dataset_path in config["dataset_path"]:
136
+ for video_name in sorted(os.listdir(dataset_path)):
137
+ video_path = os.path.join(dataset_path, video_name)
138
+ all_files = os.listdir(video_path)
139
+
140
+
141
+ valid = True
142
+ # Valid check 1: the number of files should be in sequential order
143
+ num_frames_input = 0
144
+ for file_name in os.listdir(video_path):
145
+ if file_name.startswith("im_"):
146
+ num_frames_input += 1
147
+ for idx in range(num_frames_input):
148
+ img_path = 'im_' + str(idx) + '.jpg'
149
+ if img_path not in all_files: # Frames should exist sequentially
150
+ valid = False
151
+ stats_analysis["incomplete_img"] += 1
152
+ break
153
+
154
+
155
+ # Valid check 1.5: the number of files must be longer than video_seq_length and less than self.config["acceleration_tolerance"]*self.config["video_seq_length"]
156
+ if num_frames_input < self.config["video_seq_length"]:
157
+ stats_analysis["too_little_frames"] += 1
158
+ valid = False
159
+ if num_frames_input > self.config["acceleration_tolerance"] * self.config["video_seq_length"]:
160
+ stats_analysis["too_many_frames"] += 1
161
+ valid = False
162
+
163
+ if not valid: # Early exit here (placed mid-way) to speed things up
164
+ continue
165
+
166
+
167
+ # Valid check 2: language if needed
168
+ if config["use_text"] and not os.path.exists(os.path.join(dataset_path, video_name, "lang.txt")):
169
+ stats_analysis["no_lang_txt"] += 1
170
+ valid = False
171
+
172
+
173
+ # Valid check 3: motion if needed
174
+ if config["motion_bucket_id"] is None:
175
+ flow_path = os.path.join(dataset_path, video_name, "flow.txt")
176
+ if "flow.txt" not in all_files:
177
+ stats_analysis["no_flow_txt"] += 1
178
+ valid = False
179
+ else:
180
+ file = open(flow_path, 'r')
181
+ info = file.readlines()
182
+ if len(info) == 0:
183
+ stats_analysis["no_flow_txt"] += 1
184
+ valid = False
185
+
186
+
187
+ if valid:
188
+ self.video_lists.append(video_path)
189
+ print("stats_analysis is ", stats_analysis)
190
+ print("Valid dataset length is ", len(self.video_lists))
191
+
192
+
193
+ def __len__(self):
194
+ return len(self.video_lists)
195
+
196
+
197
+
198
+ def _get_motion_value(self, sub_folder_path):
199
+ ''' Read the motion value from the prepared flow.txt file; the flow is precomputed to speed things up
200
+ '''
201
+
202
+ # Read the flow.txt
203
+ flow_path = os.path.join(sub_folder_path, 'flow.txt')
204
+ file = open(flow_path, 'r')
205
+ info = file.readlines()
206
+ per_video_movement = float(info[0][:-2])
207
+
208
+ # Map the raw reflected_motion_bucket_id to target range based on the number of images have
209
+ num_frames_input = 0
210
+ for file_name in os.listdir(sub_folder_path): # num_frames_input is the total number of files with name begin with im_
211
+ if file_name.startswith("im_"):
212
+ num_frames_input += 1
213
+
214
+ # Correct the value based on the number of frames relative to video_seq_length
215
+ per_video_movement_correct = per_video_movement * (num_frames_input/self.config["video_seq_length"])
216
+
217
+ # Map from one Normal Distribution to another Normal Distribution
218
+ z = (per_video_movement_correct - self.config["dataset_motion_mean"]) / (self.config["dataset_motion_std"] + 0.001)
219
+ reflected_motion_bucket_id = int((z * self.config["svd_motion_std"]) + self.config["svd_motion_mean"])
220
+
221
+
222
+ print("We map " + str(per_video_movement) + " to " + str(per_video_movement_correct) + " by length " + str(num_frames_input) + " to bucket_id of " + str(reflected_motion_bucket_id))
223
+ return reflected_motion_bucket_id
224
+
225
+
226
+
227
+ def __getitem__(self, idx):
228
+ ''' Get item by idx and pre-process by Resize and Normalize to [0, 1]
229
+ Args:
230
+ idx (int): The index to the file in the directory
231
+ Returns:
232
+ video_frames (torch.float32): The Pytorch tensor format of obtained frames (max: 1.0; min: 0.0)
233
+ reflected_motion_bucket_id (tensor): Motion value if optical flow is provided, else a fixed value from the config
234
+ prompt (tensor): Tokenized text
235
+ '''
236
+
237
+ # Prepare the text if needed:
238
+ if self.config["use_text"]:
239
+ # Read the file
240
+ file_path = os.path.join(self.video_lists[idx], "lang.txt")
241
+ file = open(file_path, 'r')
242
+ prompt = file.readlines()[0] # Only read the first line
243
+
244
+ if self.config["mix_ambiguous"] and os.path.exists(os.path.join(self.video_lists[idx], "processed_text.txt")):
245
+ # If we don't have this txt file, we skip
246
+
247
+ ######################################################## Mix up prompt ########################################################
248
+
249
+ # Read the file
250
+ file_path = os.path.join(self.video_lists[idx], "processed_text.txt")
251
+ file = open(file_path, 'r')
252
+ prompts = [line for line in file.readlines()] # Read all lines (action / this / there)
253
+
254
+ # Get the components
255
+ action = prompts[0][:-1]
256
+ this = prompts[1][:-1]
257
+ there = prompts[2][:-1]
258
+
259
+
260
+ random_value = random.random()
261
+ # If less than 0.4, we don't care, just use the most concrete one
262
+ if random_value >= 0.4 and random_value < 0.6:
263
+ # Mask pick object to "This"
264
+ prompt = action + " this to " + there
265
+ elif random_value >= 0.6 and random_value < 0.8:
266
+ # Mask place position to "There"
267
+ prompt = action + " " + this + " to there"
268
+ elif random_value >= 0.8 and random_value < 1.0:
269
+ # Just be like "this to there"
270
+ prompt = action + " this to there"
271
+
272
+ # print("New prompt is ", prompt)
273
+ ###################################################################################################################################################
274
+
275
+ # else:
276
+ # print("We don't have llama processed prompt at ", self.video_lists[idx])
277
+
278
+ else:
279
+ prompt = ""
280
+
281
+ # Tokenize text prompt
282
+ tokenized_prompt = tokenize_captions(prompt, self.tokenizer, self.config)
283
+
284
+
285
+ # Dataset aug by chance (it is needed to check whether there is any object position words [left|right] in the prompt text)
286
+ flip = False
287
+ if random.random() < self.config["flip_aug_prob"]:
288
+ if self.config["use_text"]:
289
+ if prompt.find("left") == -1 and prompt.find("right") == -1: # Cannot have position word, like left and right (up and down is ok)
290
+ flip = True
291
+ else:
292
+ flip = True
293
+
294
+
295
+ # Read frames for different datasets; Currently, we have WebVid / Bridge
296
+ if self.config["dataset_name"] == "Bridge":
297
+ video_frames = get_video_frames(self.config, self.video_lists[idx], flip=flip)
298
+ else:
299
+ raise NotImplementedError("We don't support this dataset loader")
300
+
301
+
302
+ # Scale [0, 255] -> [-1, 1]
303
+ if self.normalize:
304
+ video_frames = video_frames.astype(np.float32) / 127.5 - 1 # Be careful to cast to float32
305
+
306
+ # Transform to Pytorch Tensor in the range [-1, 1]
307
+ video_frames = numpy_to_pt(video_frames)
308
+ # print("length of input frames has ", len(video_frames))
309
+
310
+
311
+ # Get the motion value based on the optical flow
312
+ if self.config["motion_bucket_id"] is None:
313
+ reflected_motion_bucket_id = self._get_motion_value(self.video_lists[idx])
314
+ else:
315
+ reflected_motion_bucket_id = self.config["motion_bucket_id"]
316
+
317
+
318
+ # The tensor we returned is torch float32. We won't cast here for mixed precision training!
319
+ return {
320
+ "video_frames" : video_frames,
321
+ "reflected_motion_bucket_id" : reflected_motion_bucket_id,
322
+ "prompt": tokenized_prompt,
323
+ }
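For clarity, the gap construction in get_video_frames evenly subsamples video_seq_length frames out of num_frames_input and hands the leftover frames out alternately from the front and the back of the clip. A standalone sketch of the same index computation, with a worked example, is shown below.

    def subsample_indices(num_frames_input, video_seq_length):
        # Same gap construction as get_video_frames above.
        division_factor = num_frames_input // video_seq_length
        remain_frames = (num_frames_input % video_seq_length) - 1   # -1 for adaptation
        gaps = [division_factor] * (video_seq_length - 1)
        for idx in range(remain_frames):
            if idx % 2 == 0:
                gaps[idx // 2] += 1                  # fill from the start
            else:
                gaps[-1 * (1 + idx // 2)] += 1       # fill from the end
        indices, cur_idx = [0], 0
        for gap in gaps:
            cur_idx += gap
            indices.append(cur_idx)
        return indices

    # subsample_indices(30, 14) -> [0, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27]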
data_loader/video_this_that_dataset.py ADDED
@@ -0,0 +1,326 @@
1
+ import os, sys
2
+ import json
3
+ import cv2
4
+ import math
5
+ import shutil
6
+ import numpy as np
7
+ import random
8
+ from PIL import Image
9
+ import torch.nn.functional as F
10
+ import torch
11
+ import os.path as osp
12
+ import time
13
+ from moviepy.editor import VideoFileClip
14
+ from torch.utils.data import Dataset
15
+
16
+ # Import files from the local folder
17
+ root_path = os.path.abspath('.')
18
+ sys.path.append(root_path)
19
+ from utils.img_utils import resize_with_antialiasing, numpy_to_pt
20
+ from utils.optical_flow_utils import flow_to_image, filter_uv, bivariate_Gaussian
21
+ from data_loader.video_dataset import tokenize_captions
22
+
23
+
24
+ # For the 2D dilation
25
+ blur_kernel = bivariate_Gaussian(99, 10, 10, 0, grid = None, isotropic = True)
26
+
27
+
28
+ def get_thisthat_sam(config, intput_dir, store_dir = None, flip = False, verbose=False):
29
+ '''
30
+ Args:
31
+ intput_dir (str): Path to the folder (with data.txt and im_*.jpg frames) to process
32
+ '''
33
+
34
+ # Read file
35
+ file_path = os.path.join(intput_dir, "data.txt")
36
+ file1 = open(file_path, 'r')
37
+ Lines = file1.readlines()
38
+
39
+
40
+ # Initial the optical flow format we want
41
+ thisthat_condition = np.zeros((config["video_seq_length"], config["conditioning_channels"], config["height"], config["width"]), dtype=np.float32) # The last image should be empty
42
+
43
+
44
+ # Init the image
45
+ sample_img = cv2.imread(os.path.join(intput_dir, "im_0.jpg"))
46
+ org_height, org_width, _ = sample_img.shape
47
+
48
+ # Prepare masking
49
+ controlnet_image_index = []
50
+ coordinate_values = []
51
+
52
+ # Iterate all points in the txt file
53
+ for idx in range(len(Lines)):
54
+
55
+ # Read points
56
+ frame_idx, horizontal, vertical = Lines[idx].split(' ')
57
+ frame_idx, vertical, horizontal = int(frame_idx), int(float(vertical)), int(float(horizontal))
58
+
59
+ # Read the mask frame idx
60
+ controlnet_image_index.append(frame_idx)
61
+ coordinate_values.append((vertical, horizontal))
62
+
63
+
64
+ # Init the base image
65
+ base_img = np.zeros((org_height, org_width, 3)).astype(np.float32) # Use the original image size
66
+ base_img.fill(255)
67
+
68
+ # Draw square around the target position
69
+ dot_range = 10 # Diameter
70
+ for i in range(-1*dot_range, dot_range+1):
71
+ for j in range(-1*dot_range, dot_range+1):
72
+ dil_vertical, dil_horizontal = vertical + i, horizontal + j
73
+ if (0 <= dil_vertical and dil_vertical < base_img.shape[0]) and (0 <= dil_horizontal and dil_horizontal < base_img.shape[1]):
74
+ if idx == 0:
75
+ base_img[dil_vertical][dil_horizontal] = [0, 0, 255] # The first point should be red
76
+ else:
77
+ base_img[dil_vertical][dil_horizontal] = [0, 255, 0] # The second point should be green to distinguish the first point
78
+
79
+ # Dilate
80
+ if config["dilate"]:
81
+ base_img = cv2.filter2D(base_img, -1, blur_kernel)
82
+
83
+
84
+ ##############################################################################################################################
85
+ ### The core pipeline of processing is: Dilate -> Resize -> Range Shift -> Transpose Shape -> Store
86
+
87
+ # Resize frames here, while values are still in [0, 255]; don't resize negative or [0, 1] data
88
+ base_img = cv2.resize(base_img, (config["width"], config["height"]), interpolation = cv2.INTER_CUBIC)
89
+
90
+
91
+ # Flip the image for aug if needed
92
+ if flip:
93
+ base_img = np.fliplr(base_img)
94
+
95
+
96
+ # Channel Transform and Range Shift
97
+ if config["conditioning_channels"] == 3:
98
+ # Map to [0, 1] range
99
+ if store_dir is not None and verbose: # For the first frame condition visualization
100
+ cv2.imwrite(os.path.join(store_dir, "condition_TT"+str(idx)+".png"), base_img)
101
+ base_img = base_img / 255.0
102
+
103
+ else:
104
+ raise NotImplementedError()
105
+
106
+
107
+ # ReOrganize shape
108
+ base_img = base_img.transpose(2, 0, 1) # hwc -> chw
109
+
110
+
111
+ # Check the min max value range
112
+ # if verbose:
113
+ # print("{} min, max range value is {} - {}".format(intput_dir, np.min(base_img), np.max(base_img)))
114
+
115
+
116
+ # Write base img based on frame_idx
117
+ thisthat_condition[frame_idx] = base_img # Only the first frame, the rest is 0 initialized
118
+
119
+ ##############################################################################################################################
120
+
121
+
122
+ if config["motion_bucket_id"] is None:
123
+ # Use a fixed motion value based on the stats collected beforehand
124
+ reflected_motion_bucket_id = 200
125
+ else:
126
+ reflected_motion_bucket_id = config["motion_bucket_id"]
127
+
128
+
129
+ # print("Motion Bucket ID is ", reflected_motion_bucket_id)
130
+ return (thisthat_condition, reflected_motion_bucket_id, controlnet_image_index, coordinate_values)
131
+
132
+
133
+
134
+ class Video_ThisThat_Dataset(Dataset):
135
+ '''
136
+ Video Dataset to load sequential frames for training with needed pre-processing and process with optical flow
137
+ '''
138
+
139
+ def __init__(self, config, device, normalize=True, tokenizer=None):
140
+ # Attribute variables
141
+ self.config = config
142
+ self.device = device
143
+ self.normalize = normalize
144
+ self.tokenizer = tokenizer
145
+
146
+ # Obtain values
147
+ self.video_seq_length = config["video_seq_length"]
148
+ self.height = config["height"]
149
+ self.width = config["width"]
150
+
151
+ # Process data
152
+ self.video_lists = []
153
+ for dataset_path in config["dataset_path"]:
154
+ for video_name in sorted(os.listdir(dataset_path)):
155
+ if not os.path.exists(os.path.join(dataset_path, video_name, "data.txt")):
156
+ continue
157
+
158
+ self.video_lists.append(os.path.join(dataset_path, video_name))
159
+ print("length of the dataset is ", len(self.video_lists))
160
+
161
+
162
+
163
+
164
+ def __len__(self):
165
+ return len(self.video_lists)
166
+
167
+
168
+ def _extract_frame_bridge(self, idx, flip=False):
169
+ ''' Extract the frame in video based on the needed fps from already extracted frame
170
+ Args:
171
+ idx (int): The index to the file in the directory
172
+ flip (bool): Bool for whether we will flip
173
+ Returns:
174
+ video_frames (numpy): Extracted video frames in numpy format
175
+ '''
176
+
177
+ # Init the Video Reader
178
+ # The Bridge dataset frames follow the naming pattern im_x.jpg, so we read them by index
179
+ video_frame_path = self.video_lists[idx]
180
+
181
+
182
+ # Find needed file
183
+ needed_img_path = []
184
+ for idx in range(self.video_seq_length):
185
+ img_path = os.path.join(video_frame_path, "im_" + str(idx) + ".jpg")
186
+ needed_img_path.append(img_path)
187
+
188
+
189
+
190
+ # Read all img_path based on the order
191
+ video_frames = []
192
+ for img_path in needed_img_path:
193
+ if not os.path.exists(img_path):
194
+ print("We don't have ", img_path)
195
+ frame = cv2.imread(img_path)
196
+
197
+ try:
198
+ frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
199
+ except Exception:
200
+ print("The exception place is ", img_path)
201
+ # Resize frames
202
+ frame = cv2.resize(frame, (self.width, self.height), interpolation = cv2.INTER_CUBIC)
203
+
204
+ # Flip aug
205
+ if flip:
206
+ frame = np.fliplr(frame)
207
+
208
+ # Collect frames
209
+ video_frames.append(np.expand_dims(frame, axis=0)) # The frame is already RGB, there is no need to convert here.
210
+
211
+
212
+ # Concatenate
213
+ video_frames = np.concatenate(video_frames, axis=0)
214
+ assert(len(video_frames) == self.video_seq_length)
215
+
216
+ # Returns
217
+ return video_frames
218
+
219
+
220
+
221
+
222
+ def __getitem__(self, idx):
223
+ ''' Get item by idx and pre-process by Resize and Normalize to [0, 1]
224
+ Args:
225
+ idx (int): The index to the file in the directory
226
+ Returns:
227
+ return_dict (dict): video_frames (torch.float32) [-1, 1] and controlnet_condition (torch.float32) [0, 1]
228
+ '''
229
+
230
+ # Prepare the text if needed:
231
+ if self.config["use_text"]:
232
+ # Read the file
233
+ file_path = os.path.join(self.video_lists[idx], "lang.txt")
234
+ file = open(file_path, 'r')
235
+ prompt = file.readlines()[0] # Only read the first line
236
+
237
+ if self.config["mix_ambiguous"] and os.path.exists(os.path.join(self.video_lists[idx], "processed_text.txt")):
238
+ # If we don't have this txt file, we skip
239
+
240
+ ######################################################## Mix up prompt ########################################################
241
+
242
+ # Read the file
243
+ file_path = os.path.join(self.video_lists[idx], "processed_text.txt")
244
+ file = open(file_path, 'r')
245
+ prompts = [line for line in file.readlines()] # Read all lines (action / this / there)
246
+
247
+ # Get the components
248
+ action = prompts[0][:-1]
249
+ this = prompts[1][:-1]
250
+ there = prompts[2][:-1]
251
+
252
+
253
+ random_value = random.random()
254
+ # If less than 0.4, we don't care, just use the most concrete one
255
+ if random_value >= 0.4 and random_value < 0.6:
256
+ # Mask pick object to "This"
257
+ prompt = action + " this to " + there
258
+ elif random_value >= 0.6 and random_value < 0.8:
259
+ # Mask place position to "There"
260
+ prompt = action + " " + this + " to there"
261
+ elif random_value >= 0.8 and random_value < 1.0:
262
+ # Just be like "this to there"
263
+ prompt = action + " this to there"
264
+
265
+ # print("New prompt is ", prompt)
266
+ ###################################################################################################################################################
267
+
268
+ # else:
269
+ # print("We don't have llama processed prompt at ", self.video_lists[idx])
270
+
271
+ else:
272
+ prompt = ""
273
+
274
+ # Tokenize text prompt
275
+ tokenized_prompt = tokenize_captions(prompt, self.tokenizer, self.config)
276
+
277
+
278
+
279
+ # Dataset aug by chance (it is needed to check whether there is any object position words [left|right] in the prompt text)
280
+ flip = False
281
+ if random.random() < self.config["flip_aug_prob"]:
282
+ if self.config["use_text"]:
283
+ if prompt.find("left") == -1 and prompt.find("right") == -1: # Cannot have position word, like left and right (up and down is ok)
284
+ flip = True
285
+ else:
286
+ flip = True
287
+
288
+
289
+
290
+ # Read frames for different datasets; currently, we have WebVid / Bridge
291
+ if self.config["dataset_name"] == "Bridge":
292
+ video_frames_raw = self._extract_frame_bridge(idx, flip=flip)
293
+ else:
294
+ raise NotImplementedError("We don't support this dataset loader")
295
+
296
+
297
+ # Scale [0, 255] -> [-1, 1] if needed
298
+ if self.normalize:
299
+ video_frames = video_frames_raw.astype(np.float32) / 127.5 - 1 # Be careful to cast to float32
300
+
301
+ # Transform to Pytorch Tensor in the range [-1, 1]
302
+ video_frames = numpy_to_pt(video_frames)
303
+
304
+
305
+ # Generate the pairs we need
306
+ intput_dir = self.video_lists[idx]
307
+
308
+ # Get the This That point information
309
+ controlnet_condition, reflected_motion_bucket_id, controlnet_image_index, coordinate_values = get_thisthat_sam(self.config, intput_dir, flip=flip)
310
+ controlnet_condition = torch.from_numpy(controlnet_condition)
311
+
312
+ # Cast other value to tensor
313
+ reflected_motion_bucket_id = torch.tensor(reflected_motion_bucket_id, dtype=torch.float32)
314
+ controlnet_image_index = torch.tensor(controlnet_image_index, dtype=torch.int32)
315
+ coordinate_values = torch.tensor(coordinate_values, dtype=torch.int32)
316
+
317
+
318
+ # The tensor we returned is torch float32. We won't cast here for mixed precision training!
319
+ return {"video_frames" : video_frames,
320
+ "controlnet_condition" : controlnet_condition,
321
+ "reflected_motion_bucket_id" : reflected_motion_bucket_id,
322
+ "controlnet_image_index": controlnet_image_index,
323
+ "prompt": tokenized_prompt,
324
+ "coordinate_values": coordinate_values, # Useless now, but I still passed back
325
+ }
326
+
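A minimal sketch of wiring this dataset into a PyTorch DataLoader follows. The config keys are the ones read by Video_ThisThat_Dataset and get_thisthat_sam above; the concrete values and the tokenizer repo id are illustrative assumptions (the actual values come from the training configs in this commit).

    from torch.utils.data import DataLoader
    from transformers import CLIPTokenizer
    from data_loader.video_this_that_dataset import Video_ThisThat_Dataset

    # Illustrative config; keys mirror those read in the dataset/get_thisthat_sam code above
    config = {
        "dataset_name": "Bridge",
        "dataset_path": ["../datasets_rob/Bridge_v2_TT14"],   # illustrative path
        "video_seq_length": 14,
        "height": 256,                                         # illustrative resolution
        "width": 384,
        "conditioning_channels": 3,
        "dilate": True,
        "use_text": True,
        "mix_ambiguous": False,
        "flip_aug_prob": 0.5,
        "empty_prompts_proportion": 0.1,
        "motion_bucket_id": 200,
    }
    # Assumed SD 2.1 tokenizer location (the docstring above only says "tokenizer from SD2.1")
    tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="tokenizer")
    dataset = Video_ThisThat_Dataset(config, device="cuda", tokenizer=tokenizer)
    loader = DataLoader(dataset, batch_size=1, shuffle=True)
    batch = next(iter(loader))   # keys: video_frames, controlnet_condition, prompt, ...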
pretrained/PUT_YOUR_WEIGHT_HERE.md ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,27 @@
1
+ # Libraries without strict version pins
2
+ opencv-python
3
+ transformers
4
+ accelerate
5
+ requests
6
+ moviepy
7
+ omegaconf
8
+ # xformers
9
+ tensorboard
10
+ einops
11
+ yacs
12
+ loguru
13
+ imageio
14
+ pyparsing
15
+ ultralytics
16
+ lpips
17
+ matplotlib
18
+ gradio
19
+ torch==2.0.1
20
+ torchvision
21
+
22
+ # Libraries with strict version pins
23
+ bitsandbytes==0.43.0
24
+ diffusers==0.25.1
25
+ timm==0.4.12
26
+ scipy==1.9.3
27
+ pyiqa==0.1.7
scripts/active_learning_select.py ADDED
@@ -0,0 +1,27 @@
1
+ import os, shutil
2
+ import random
3
+
4
+
5
+ if __name__ == "__main__":
6
+ start_idx = 950
7
+ end_idx = 1020
8
+ select_num = 70
9
+
10
+ label_start_idx = 632
11
+ input_parent_dir = "../Bridge"
12
+ store_dir = "../bridge_select3"
13
+
14
+ if os.path.exists(store_dir):
15
+ shutil.rmtree(store_dir)
16
+ os.makedirs(store_dir)
17
+
18
+ for idx in range(start_idx, end_idx):
19
+ folder_path = os.path.join(input_parent_dir, str(idx))
20
+ select_idx = random.randint(0, len(os.listdir(folder_path)) - 1) # randint is inclusive on both ends
21
+ for file_idx, img_name in enumerate(os.listdir(folder_path)): # file_idx avoids shadowing the outer idx
22
+ if idx == select_idx and img_name != "policy_out.pkl":
23
+ img_path = os.path.join(folder_path, img_name)
24
+ target_path = os.path.join(store_dir, str(label_start_idx) + ".jpg")
25
+ label_start_idx += 1
26
+ shutil.copy(img_path, target_path)
27
+
scripts/add_point2img.py ADDED
@@ -0,0 +1,51 @@
1
+ '''
2
+ This file adds the annotated this/that points onto the first image of each sequence
3
+ '''
4
+
5
+ import os, shutil, sys
+ import cv2
+ import numpy as np
6
+
7
+ if __name__ == "__main__":
8
+ input_folder_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/model_results/Human_Study/Input_Bridge_human_evaluation"
9
+ store_path = "point_highlighted"
10
+
11
+ if os.path.exists(store_path): # Recreate the output folder, not the input folder
12
+ shutil.rmtree(store_path)
13
+ os.makedirs(store_path)
14
+
15
+
16
+ for instance_name in os.listdir(input_folder_path):
17
+
18
+ sub_folder_dir = os.path.join(input_folder_path, instance_name)
19
+
20
+ # Read file
21
+ file_path = os.path.join(sub_folder_dir, "data.txt")
22
+ file1 = open(file_path, 'r')
23
+ Lines = file1.readlines()
24
+
25
+ # Read the first img
26
+ first_img_path = os.path.join(sub_folder_dir, "im_0.jpg")
27
+
28
+
29
+ # Init the image
30
+ base_img = cv2.imread(first_img_path).astype(np.float32) # Use the original image size
31
+
32
+ # Draw the point
33
+ for idx in range(len(Lines)):
34
+ # Read points
35
+ frame_idx, horizontal, vertical = Lines[idx].split(' ')
36
+ frame_idx, vertical, horizontal = int(frame_idx), int(float(vertical)), int(float(horizontal))
37
+
38
+ # Draw square around the target position
39
+ dot_range = 15 # Diameter
40
+ for i in range(-1*dot_range, dot_range+1):
41
+ for j in range(-1*dot_range, dot_range+1):
42
+ dil_vertical, dil_horizontal = vertical + i, horizontal + j
43
+ if (0 <= dil_vertical and dil_vertical < base_img.shape[0]) and (0 <= dil_horizontal and dil_horizontal < base_img.shape[1]):
44
+ if idx == 0:
45
+ base_img[dil_vertical][dil_horizontal] = [0, 0, 255] # The first point should be red
46
+ else:
47
+ base_img[dil_vertical][dil_horizontal] = [0, 255, 0] # The second point should be green to distinguish the first point
48
+
49
+ # Save the highlighted first frame into the output folder (output filename chosen here for illustration)
+ cv2.imwrite(os.path.join(store_path, instance_name + ".png"), base_img)
50
+
51
+
scripts/check_video.py ADDED
@@ -0,0 +1,19 @@
1
+ '''
2
+ This file makes sure that the video files are readable by moviepy, so that the data loader can read them.
3
+ '''
4
+ import os
5
+ from moviepy.editor import VideoFileClip
6
+
7
+ if __name__ == "__main__":
8
+ video_dir = "../webvid_sample"
9
+ delete_abnormal_video = True # Whether you want to delete these abnormal video directly
10
+
11
+ for video_name in sorted(os.listdir(video_dir)):
12
+ video_path = os.path.join(video_dir, video_name)
13
+ try:
14
+ objVideoreader = VideoFileClip(filename=video_path)
15
+ except Exception:
16
+ print("There is an exception of reading: ", video_path)
17
+ if delete_abnormal_video:
18
+ print("We will remove this abnormal video source")
19
+ os.remove(video_path)
scripts/clean_bridge_dataset.py ADDED
@@ -0,0 +1,22 @@
1
+ '''
2
+ Sometimes the Bridge dataset contains corrupted downloads; we need to clean them up
3
+ '''
4
+ import os, shutil
5
+
6
+ # TODO: later, merge this directly into prepare_bridge_dataset
7
+ if __name__ == "__main__":
8
+ dataset_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/Bridge"
9
+
10
+ for sub_folder in sorted(os.listdir(dataset_path)):
11
+ sub_folder_path = os.path.join(dataset_path, sub_folder)
12
+
13
+ img_lists = os.listdir(sub_folder_path)
14
+ if len(img_lists) < 14:
15
+ print("The folder is too short, we will remove them all")
16
+ shutil.rmtree(sub_folder_path)
17
+ continue
18
+ for img_name in img_lists:
19
+ img_path = os.path.join(sub_folder_path, img_name)
20
+ if not img_name.startswith("im_"):
21
+ print("We remove ", img_path)
22
+ os.remove(img_path)
scripts/collect_lang.py ADDED
@@ -0,0 +1,31 @@
1
+ '''
2
+ This file collects every lang.txt into a new directory, which makes it convenient to compress and scp the language annotations for post-processing
3
+ '''
4
+ import os, sys, shutil
5
+
6
+ if __name__ == "__main__":
7
+ parent_dir = "../datasets_rob"
8
+ dataset_paths = ["Bridge_v1_TT14", "Bridge_v2_TT14"]
9
+ store_folder = "../full_text_tmp"
10
+
11
+ # Manage the store folder
12
+ if os.path.exists(store_folder):
13
+ shutil.rmtree(store_folder)
14
+ os.makedirs(store_folder)
15
+
16
+
17
+ for dataset_name in dataset_paths:
18
+ store_path = os.path.join(store_folder, dataset_name)
19
+ if os.path.exists(store_path):
20
+ shutil.rmtree(store_path)
21
+ os.makedirs(store_path)
22
+
23
+ # Iterate all the files
24
+ for sub_folder_name in os.listdir(os.path.join(parent_dir, dataset_name)):
25
+ print("We are processing ", sub_folder_name)
26
+ lang_txt_path = os.path.join(parent_dir, dataset_name, sub_folder_name, "lang.txt")
27
+
28
+ # Store on the new address
29
+ store_file_path = os.path.join(store_path, sub_folder_name)
30
+ os.makedirs(store_file_path)
31
+ shutil.copyfile(lang_txt_path, os.path.join(store_file_path, "lang.txt"))
scripts/combine_results.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ This script combines the generated GIFs that share the same index into a single grid GIF
3
+ '''
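+ # Assumption (inferred from the code below): every folder listed in data_paths
+ # contains the same set of sub-folders, and each sub-folder already holds a
+ # 'combined.gif' (e.g. produced by scripts/compress_gif.py).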
4
+
5
+ import os, shutil, sys
6
+ import imageio
7
+ import math
8
+ import cv2
9
+ from PIL import Image
10
+ import collections
11
+ import numpy as np
12
+
13
+
14
+ if __name__ == "__main__":
15
+
16
+ # Basic setting
17
+ data_paths = [
18
+ "human_evaluation_v3_V_raw_prompt",
19
+ "human_evaluation_v3_VG_raw_prompt_no_sam",
20
+ "human_evaluation_v3_VL_ambiguous_prompt",
21
+
22
+ "../datasets_rob/Bridge_human_evaluation",
23
+
24
+ "human_evaluation_v3_VL_raw_prompt",
25
+ "human_evaluation_v3_VGL_raw_prompt_no_sam",
26
+ "human_evaluation_v3_VGL_ambiguous_prompt_no_sam",
27
+ ]
28
+ store_path = "combined_results_human_evaluation"
29
+ sample_data_path = data_paths[0]
30
+ gif_per_row = 4 # Number of GIF files per row
31
+
32
+
33
+ # Create folder
34
+ if os.path.exists(store_path):
35
+ shutil.rmtree(store_path)
36
+ os.makedirs(store_path)
37
+
38
+
39
+ # Iterate the sample
40
+ for instance_idx, sub_folder_name in enumerate(sorted(os.listdir(sample_data_path))):
41
+ print("we are processing ", sub_folder_name)
42
+
43
+ collected_gif_paths = []
44
+ for data_path in data_paths:
45
+ collected_gif_paths.append(os.path.join(data_path, sub_folder_name, 'combined.gif'))
46
+
47
+ # Merge frames together
48
+ rows = math.ceil(len(collected_gif_paths) / gif_per_row)
49
+ cols = gif_per_row
50
+
51
+ # Read all input GIFs and find maximum dimensions
52
+ gifs = []
53
+ max_width, max_height = 0, 0
54
+ for path in collected_gif_paths:
55
+ gif = imageio.mimread(path)
56
+ max_width = max(max_width, gif[0].shape[1])
57
+ max_height = max(max_height, gif[0].shape[0])
58
+ gifs.append(gif)
59
+
60
+ # Create blank canvas for concatenated GIF
61
+ frames_length = len(gifs[0])
62
+ canvas_width = max_width * cols
63
+ canvas_height = max_height * rows
64
+ canvas = np.zeros((frames_length, canvas_height, canvas_width, 3), dtype=np.uint8)
65
+
66
+
67
+ # push each frame into the canvas placeholder
68
+ gif_index = 0
69
+ for row in range(rows):
70
+ for col in range(cols):
71
+ gif = gifs[gif_index]
72
+ gif_height, gif_width, _ = gif[0].shape
73
+ start_y = row * max_height
74
+ start_x = col * max_width
75
+ for i in range(frames_length):
76
+ canvas[i, start_y:start_y+gif_height, start_x:start_x+gif_width, :] = gif[i]
77
+
78
+ # Update index
79
+ gif_index += 1
80
+ if gif_index == len(collected_gif_paths):
81
+ break
82
+
83
+
84
+ # Write the concatenated GIF
85
+ imageio.mimsave(os.path.join(store_path, sub_folder_name + ".gif"), canvas, duration=0.05, quality=100)
scripts/compress_gif.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, shutil, sys
2
+ import cv2
3
+ import imageio
4
+ import numpy as np
5
+
6
+
7
+ def compress_gif(sub_folder_path):
8
+
9
+ # Check valid length
10
+ all_files = os.listdir(sub_folder_path)
11
+ num_frames_input = 0
12
+ valid = True
13
+ for file_name in os.listdir(sub_folder_path):
14
+ if file_name.startswith("im_"):
15
+ num_frames_input += 1
16
+ for idx in range(num_frames_input):
17
+ img_path = 'im_' + str(idx) + '.jpg'
18
+ if img_path not in all_files: # Should be sequential existing
19
+ valid = False
20
+ break
21
+ if not valid:
22
+ print("We cannot generate a video because the video is not sequential")
23
+ return False
24
+
25
+
26
+ if num_frames_input == 0:
27
+ print("We cannot generate a video because the input length is 0")
28
+ return False
29
+
30
+ img_lists = []
31
+ for idx in range(num_frames_input):
32
+ img_path = os.path.join(sub_folder_path, "im_" + str(idx) + ".jpg")
33
+ img_lists.append(cv2.resize(cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB), (384, 256)))
34
+
35
+ imageio.mimsave(os.path.join(sub_folder_path, 'combined.gif'), np.array(img_lists), duration=0.05, quality=100)
36
+
37
+ return True
38
+
39
+
40
+ if __name__ == "__main__":
41
+ dataset_path = "../datasets_rob/Bridge_human_evaluation" # ../datasets_rob/Bridge_v1_raw
42
+
43
+ for sub_folder_name in sorted(os.listdir(dataset_path)):
44
+ print("We are processing ", sub_folder_name)
45
+ sub_folder_path = os.path.join(dataset_path, sub_folder_name)
46
+
47
+ status = compress_gif(sub_folder_path)
48
+
49
+
50
+
51
+
52
+
scripts/compress_videos.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, shutil, sys
2
+ from moviepy.editor import ImageSequenceClip
3
+
4
+
5
+ def compress_video(sub_folder_path, video_name):
6
+ store_path = os.path.join(sub_folder_path, video_name)
7
+
8
+ if os.path.exists(store_path):
9
+ os.remove(store_path)
10
+
11
+
12
+ # Check valid length
13
+ all_files = os.listdir(sub_folder_path)
14
+ num_frames_input = 0
15
+ valid = True
16
+ for file_name in os.listdir(sub_folder_path):
17
+ if file_name.startswith("im_"):
18
+ num_frames_input += 1
19
+ for idx in range(num_frames_input):
20
+ img_path = 'im_' + str(idx) + '.jpg'
21
+ if img_path not in all_files: # Should be sequential existing
22
+ valid = False
23
+ break
24
+ if not valid:
25
+ print("We cannot generate a video because the video is not sequential")
26
+ return False
27
+
28
+
29
+ if num_frames_input == 0:
30
+ print("We cannot generate a video because the input length is 0")
31
+ return False
32
+
33
+ img_lists = []
34
+ for idx in range(num_frames_input):
35
+ img_path = os.path.join(sub_folder_path, "im_" + str(idx) + ".jpg")
36
+ img_lists.append(img_path)
37
+
38
+ clip = ImageSequenceClip(img_lists, fps=4)
39
+ clip.write_videofile(store_path)
40
+
41
+ return True
42
+
43
+
44
+ if __name__ == "__main__":
45
+ dataset_path = "../datasets_rob/Bridge_v2_raw" # ../datasets_rob/Bridge_v1_raw
46
+
47
+ for sub_folder_name in sorted(os.listdir(dataset_path)):
48
+ sub_folder_path = os.path.join(dataset_path, sub_folder_name)
49
+
50
+ status = compress_video(sub_folder_path, "combined.mp4") # video_name is required; "combined.mp4" is an assumed output filename
51
+
52
+
53
+
54
+
55
+
scripts/crop_video_frames.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ This file crops each folder of video frames down to the needed frame length, for the mass evaluation
3
+ '''
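+ # Each sub-folder is expected to contain frames named 0.png, 1.png, ...;
+ # frames with index >= needed_frame_length are deleted in place.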
4
+ import os, shutil, sys
5
+ import cv2
6
+
7
+
8
+ if __name__ == "__main__":
9
+ input_folder = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/StreamingT2V_results"
10
+ needed_frame_length = 14
11
+
12
+ idx = 0
13
+ for file_name in sorted(os.listdir(input_folder)):
14
+ print("We are processing ", file_name)
15
+ sub_folder_path = os.path.join(input_folder, file_name)
16
+
17
+ for idx in range(len(os.listdir(sub_folder_path))):
18
+ if idx >= needed_frame_length:
19
+ target_path = os.path.join(sub_folder_path, str(idx)+".png")
20
+ os.remove(target_path)
21
+
22
+
scripts/extract_test_dataset.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Extract the test dataset from the txt file
3
+ '''
4
+
5
+ if __name__ == "__main__":
6
+ txt_path = "match_info_v2.txt"
7
+ store_path = "test_path_v2.txt"
8
+ start_idx = len("/nfs/turbo/jjparkcv-turbo-large/boyangwa/raw/bridge_data_v2/")
9
+
10
+ read_file = open(txt_path, "r")
11
+ write_file = open(store_path, "w")
12
+ for line in read_file.readlines():
13
+ test_dataset_path = line.split(' ')[1]
14
+ test_instance = test_dataset_path[start_idx:]
15
+
16
+ write_file.write(test_instance)
17
+
18
+
scripts/generate_noise.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+
5
+ # Set the dimensions of the image
6
+ height = 256
7
+ width = 256
8
+
9
+ # Generate random pixel values
10
+ noise = np.random.rand(height, width, 3) * 255 # Scale to the [0, 255] range for an 8-bit image
11
+
12
+
13
+ for idx in range (4):
14
+ cv2.imwrite("noise"+str(idx)+".png", noise)
scripts/generate_sam.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, shutil
2
+ import cv2
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from segment_anything import SamAutomaticMaskGenerator, SamPredictor, sam_model_registry
6
+
7
+
8
+ def show_anns(anns):
9
+ if len(anns) == 0:
10
+ return
11
+
12
+ sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
13
+ ax = plt.gca()
14
+ ax.set_autoscale_on(True)
15
+
16
+ img = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 3))
17
+ # img[:,:,3] = 0
18
+ for ann in sorted_anns:
19
+ m = ann['segmentation']
20
+ color_mask = np.concatenate([np.random.random(3)])
21
+ img[m] = color_mask
22
+
23
+ return img*255
24
+
25
+
26
+
27
+
28
+ if __name__ == "__main__":
29
+ input_parent_folder = "../Bridge_filter_flow"
30
+
31
+
32
+ # Init SAM for segmentation task
33
+ model_type = "vit_h"
34
+ weight_path = "pretrained/sam_vit_h_4b8939.pth"
35
+
36
+
37
+
38
+ sam = sam_model_registry[model_type](checkpoint=weight_path).to(device="cuda")
39
+ mask_generator = SamAutomaticMaskGenerator(sam) # There are many configurable settings here
40
+
41
+
42
+ for sub_dir_name in sorted(os.listdir(input_parent_folder)):
43
+ print("We are processing ", sub_dir_name)
44
+ ref_img_path = os.path.join(input_parent_folder, sub_dir_name, 'im_0.jpg')
45
+ store_path = os.path.join(input_parent_folder, sub_dir_name, 'sam.png')
46
+
47
+ image = cv2.imread(ref_img_path)
48
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
49
+
50
+ mask = mask_generator.generate(image)
51
+ mask_img = show_anns(mask)
52
+
53
+ cv2.imwrite(store_path, mask_img)
54
+
55
+
56
+
scripts/generate_sam_this_that.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, shutil
2
+ import cv2
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from segment_anything import SamAutomaticMaskGenerator, SamPredictor, sam_model_registry
6
+
7
+
8
+ def show_anns(anns):
9
+ if len(anns) == 0:
10
+ return
11
+
12
+ sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
13
+ ax = plt.gca()
14
+ ax.set_autoscale_on(True)
15
+
16
+ img = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 3))
17
+ # img[:,:,3] = 0
18
+ for ann in sorted_anns:
19
+ m = ann['segmentation']
20
+ color_mask = np.concatenate([np.random.random(3)])
21
+ img[m] = color_mask
22
+
23
+ return img*255
24
+
25
+
26
+ def show_mask(mask, random_color=False):
27
+ if random_color:
28
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
29
+ else:
30
+ color = np.array([30/255, 144/255, 255/255, 0.6])
31
+ h, w = mask.shape[-2:]
32
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
33
+
34
+ return mask_image * 255
35
+
36
+
37
+ def show_points(coords, labels, ax, marker_size=375):
38
+ pos_points = coords[labels==1]
39
+ neg_points = coords[labels==0]
40
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)
41
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1)
42
+
43
+
44
+ if __name__ == "__main__":
45
+ input_parent_folder = "validation_tmp"
46
+
47
+
48
+ # Init SAM for segmentation task
49
+ model_type = "vit_h"
50
+ weight_path = "pretrained/sam_vit_h_4b8939.pth"
51
+
52
+
53
+
54
+ sam = sam_model_registry[model_type](checkpoint=weight_path).to(device="cuda")
55
+ sam_predictor = SamPredictor(sam)
56
+ mask_generator = SamAutomaticMaskGenerator(sam)
57
+
58
+
59
+ # Iterate the folder
60
+ for sub_dir_name in sorted(os.listdir(input_parent_folder)):
61
+ print("We are processing ", sub_dir_name)
62
+ ref_img_path = os.path.join(input_parent_folder, sub_dir_name, 'im_0.jpg')
63
+ data_txt_path = os.path.join(input_parent_folder, sub_dir_name, 'data.txt')
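+ # data.txt is expected to hold one point per line in the form
+ # "<frame_idx> <horizontal> <vertical>", matching the parsing below.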
64
+
65
+
66
+ # Read the image and process
67
+ image = cv2.imread(ref_img_path)
68
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
69
+
70
+
71
+ # Read the positive point
72
+ data_file = open(data_txt_path, 'r')
73
+ lines = data_file.readlines()
74
+ for idx in range(len(lines)):
75
+ frame_idx, horizontal, vertical = lines[idx].split(' ')
76
+ vertical, horizontal = int(float(vertical)), int(float(horizontal))
77
+ positive_point_cords = [[horizontal, vertical]]
78
+
79
+ positive_point_cords = np.array(positive_point_cords)
80
+ positive_point_labels = np.ones(len(positive_point_cords))
81
+ print(positive_point_cords)
82
+
83
+
84
+
85
+ # Set the SAM predictor
86
+ sam_predictor.set_image(np.uint8(image))
87
+ masks, scores, logits = sam_predictor.predict(
88
+ point_coords = positive_point_cords, # Only positive points here
89
+ point_labels = positive_point_labels,
90
+ multimask_output = False,
91
+ )
92
+ # print("Detected mask length is ", len(masks))
93
+
94
+ # Visualize
95
+ mask_img = show_mask(masks[0])
96
+ cv2.imwrite(os.path.join(input_parent_folder, sub_dir_name, "first_contact0.png"), mask_img)
97
+
98
+ break
99
+
100
+
101
+ # SAM all
102
+ sam_all = mask_generator.generate(image)
103
+ all_sam_imgs = show_anns(sam_all)
104
+ cv2.imwrite("sam_all.png", all_sam_imgs)
105
+
106
+
107
+
108
+
scripts/generate_traj.py ADDED
@@ -0,0 +1,601 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import argparse
3
+ import copy
4
+ import os, shutil
5
+ import imageio
6
+ import cv2
7
+ from PIL import Image, ImageDraw
8
+ import os.path as osp
9
+ import random
10
+ import numpy as np
11
+ import torch.multiprocessing as mp
12
+ from multiprocessing import set_start_method
13
+ import math, time, gc
14
+ import torch
15
+ import torch.nn.functional as F
16
+ import matplotlib.pyplot as plt
17
+ from segment_anything import SamAutomaticMaskGenerator, SamPredictor, sam_model_registry
18
+
19
+
20
+ # Import files from the local path
21
+ root_path = os.path.abspath('.')
22
+ sys.path.append(root_path)
23
+ from config.flowformer_config import get_cfg
24
+ from flowformer_code.utils import flow_viz, frame_utils
25
+ from flowformer_code.utils.utils import InputPadder
26
+ from flowformer_code.FlowFormer import build_flowformer
27
+
28
+
29
+
30
+
31
+ TRAIN_SIZE = [432, 960]
32
+
33
+ def show_anns(anns):
34
+ if len(anns) == 0:
35
+ return
36
+
37
+ sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
38
+ ax = plt.gca()
39
+ ax.set_autoscale_on(True)
40
+
41
+ img = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 4))
42
+ img[:,:,3] = 0
43
+ for ann in sorted_anns:
44
+ m = ann['segmentation']
45
+ color_mask = np.concatenate([np.random.random(3), [0.35]])
46
+ img[m] = color_mask
47
+
48
+ return img*255
49
+
50
+
51
+ def show_mask(mask, random_color=False):
52
+ if random_color:
53
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
54
+ else:
55
+ color = np.array([30/255, 144/255, 255/255, 0.6])
56
+ h, w = mask.shape[-2:]
57
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
58
+
59
+ return mask_image * 255
60
+
61
+
62
+ def compute_grid_indices(image_shape, patch_size=TRAIN_SIZE, min_overlap=20):
63
+ if min_overlap >= TRAIN_SIZE[0] or min_overlap >= TRAIN_SIZE[1]:
64
+ raise ValueError(
65
+ f"Overlap should be less than size of patch (got {min_overlap}"
66
+ f"for patch size {patch_size}).")
67
+ if image_shape[0] == TRAIN_SIZE[0]:
68
+ hs = list(range(0, image_shape[0], TRAIN_SIZE[0]))
69
+ else:
70
+ hs = list(range(0, image_shape[0], TRAIN_SIZE[0] - min_overlap))
71
+ if image_shape[1] == TRAIN_SIZE[1]:
72
+ ws = list(range(0, image_shape[1], TRAIN_SIZE[1]))
73
+ else:
74
+ ws = list(range(0, image_shape[1], TRAIN_SIZE[1] - min_overlap))
75
+
76
+ # Make sure the final patch is flush with the image boundary
77
+ hs[-1] = image_shape[0] - patch_size[0]
78
+ ws[-1] = image_shape[1] - patch_size[1]
79
+ return [(h, w) for h in hs for w in ws]
80
+
81
+
82
+
83
+ def compute_flow(model, image1, image2, weights=None):
84
+ print(f"computing flow...")
85
+
86
+ image_size = image1.shape[1:]
87
+
88
+ image1, image2 = image1[None].cuda(), image2[None].cuda()
89
+
90
+ hws = compute_grid_indices(image_size)
91
+ if weights is None: # no tile
92
+ padder = InputPadder(image1.shape)
93
+ image1, image2 = padder.pad(image1, image2)
94
+
95
+ flow_pre, _ = model(image1, image2)
96
+
97
+ flow_pre = padder.unpad(flow_pre)
98
+ flow = flow_pre[0].permute(1, 2, 0).cpu().numpy()
99
+ else: # tile
100
+ flows = 0
101
+ flow_count = 0
102
+
103
+ for idx, (h, w) in enumerate(hws):
104
+ image1_tile = image1[:, :, h:h+TRAIN_SIZE[0], w:w+TRAIN_SIZE[1]]
105
+ image2_tile = image2[:, :, h:h+TRAIN_SIZE[0], w:w+TRAIN_SIZE[1]]
106
+ flow_pre, _ = model(image1_tile, image2_tile)
107
+ padding = (w, image_size[1]-w-TRAIN_SIZE[1], h, image_size[0]-h-TRAIN_SIZE[0], 0, 0)
108
+ flows += F.pad(flow_pre * weights[idx], padding)
109
+ flow_count += F.pad(weights[idx], padding)
110
+
111
+ flow_pre = flows / flow_count
112
+ flow = flow_pre[0].permute(1, 2, 0).cpu().numpy()
113
+
114
+ return flow
115
+
116
+
117
+ def compute_adaptive_image_size(image_size):
118
+ target_size = TRAIN_SIZE
119
+ scale0 = target_size[0] / image_size[0]
120
+ scale1 = target_size[1] / image_size[1]
121
+
122
+ if scale0 > scale1:
123
+ scale = scale0
124
+ else:
125
+ scale = scale1
126
+
127
+ image_size = (int(image_size[1] * scale), int(image_size[0] * scale))
128
+
129
+ return image_size
130
+
131
+
132
+ def prepare_image(viz_root_dir, fn1, fn2, keep_size):
133
+ print(f"preparing image...")
134
+
135
+ image1 = frame_utils.read_gen(fn1)
136
+ image2 = frame_utils.read_gen(fn2)
137
+ image1 = np.array(image1).astype(np.uint8)[..., :3]
138
+ image2 = np.array(image2).astype(np.uint8)[..., :3]
139
+ if not keep_size:
140
+ dsize = compute_adaptive_image_size(image1.shape[0:2])
141
+ image1 = cv2.resize(image1, dsize=dsize, interpolation=cv2.INTER_CUBIC)
142
+ image2 = cv2.resize(image2, dsize=dsize, interpolation=cv2.INTER_CUBIC)
143
+ image1 = torch.from_numpy(image1).permute(2, 0, 1).float()
144
+ image2 = torch.from_numpy(image2).permute(2, 0, 1).float()
145
+
146
+
147
+ dirname = osp.dirname(fn1)
148
+ filename = osp.splitext(osp.basename(fn1))[0]
149
+
150
+ viz_dir = osp.join(viz_root_dir, dirname)
151
+ # if not osp.exists(viz_dir):
152
+ # os.makedirs(viz_dir)
153
+
154
+ viz_fn = osp.join(viz_dir, filename + '.png')
155
+
156
+ return image1, image2, viz_fn
157
+
158
+
159
+ def build_model():
160
+ print(f"building model...")
161
+ cfg = get_cfg()
162
+ model = torch.nn.DataParallel(build_flowformer(cfg))
163
+ model.load_state_dict(torch.load(cfg.model))
164
+
165
+ model.cuda()
166
+ model.eval()
167
+
168
+ return model
169
+
170
+
171
+ def filter_uv(flow, threshold_factor = 0.2):
172
+ u = flow[:,:,0]
173
+ v = flow[:,:,1]
174
+
175
+ rad = np.sqrt(np.square(u) + np.square(v))
176
+ rad_max = np.max(rad)
177
+
178
+ threshold = threshold_factor * rad_max
179
+ flow[:,:,0][rad < threshold] = 0
180
+ flow[:,:,1][rad < threshold] = 0
181
+
182
+ return flow
183
+
184
+
185
+ def visualize_traj(base_img, traj_path, connect_points = True):
186
+ target_vertical, target_horizontal = traj_path[-1]
187
+
188
+ if connect_points and len(traj_path) > 1:
189
+ # Draw a line to connect two point to show motion direction
190
+ start_coordinate = (traj_path[-2][1], traj_path[-2][0])
191
+ end_coordinate = (traj_path[-1][1], traj_path[-1][0])
192
+ pil_img = Image.fromarray(base_img)
193
+
194
+ # Draw the line
195
+ color = 'red'
196
+ draw = ImageDraw.Draw(pil_img)
197
+ draw.line([start_coordinate, end_coordinate], fill = color, width = 3)
198
+
199
+ base_img = np.array(pil_img)
200
+
201
+
202
+ # Draw a green dot only for the start point
203
+ if len(traj_path) == 1:
204
+ dot_range = 3
205
+ for i in range(-1*dot_range, dot_range+1):
206
+ for j in range(-1*dot_range, dot_range+1):
207
+ dil_vertical, dil_horizontal = target_vertical + i, target_horizontal + j
208
+ if (0 <= dil_vertical and dil_vertical < base_img.shape[0]) and (0 <= dil_horizontal and dil_horizontal < base_img.shape[1]):
209
+ base_img[dil_vertical][dil_horizontal] = [0, 128, 0]
210
+ else:
211
+ print("The traj is out of boundary!!!!!!!!!!!!!!!!!!!!! and we won't consider it") # 现在
212
+ return (False, base_img)
213
+
214
+ return (True, base_img)
215
+
216
+
217
+
218
+ def calculate_flow(viz_root_dir, store_dir, img_pairs, optical_flow_model, sam_predictor, SAM_positive_sample_num, SAM_negative_sample_num, mask_generator, traj_visualization, keep_size, verbose=False):
219
+
220
+ # Trajectory prepare
221
+ traj_path = [] # It collects all points traversed in a temporal order
222
+ is_hard_to_track = False # If this is True, it means that at some point the tracking struggled to find the dx and dy movement; in that case, using this trajectory is not recommended
223
+ hard_track_idxs = set()
224
+ traj_image_lists = []
225
+
226
+
227
+ # Iterate all image pairs
228
+ for idx, img_pair in enumerate(img_pairs):
229
+
230
+ fn1, fn2 = img_pair
231
+ print(f"processing {fn1}, {fn2}...")
232
+
233
+ image1, image2, viz_fn = prepare_image(viz_root_dir, fn1, fn2, keep_size) # Be careful: image1 and image2 may have different resolutions if keep_size is False
234
+ # Generate the optical flow and filter those that is small motion
235
+ flow_uv = filter_uv(compute_flow(optical_flow_model, image1, image2, None))
236
+
237
+ # if verbose:
238
+ # Store the visualization of flow_uv
239
+ # flow_img = flow_viz.flow_to_image(flow_uv)
240
+ # cv2.imwrite("optical_flow_" + str(idx+1) + ".png", flow_img[:, :, [2,1,0]])
241
+
242
+ if idx == 0:
243
+ # We will store the first image to memory for further visualization purpose
244
+
245
+ # Base img
246
+ # base_img = np.uint8(np.transpose(image1.numpy(), (1,2,0)))
247
+
248
+ # SAM figure
249
+ # sam_all = mask_generator.generate(image1)
250
+ # base_img = show_anns(sam_all)
251
+ # base_img = np.transpose(base_img, (1,2,0))
252
+
253
+ # Plain white image
254
+ base_img = np.zeros(np.transpose(image1.numpy(), (1,2,0)).shape, dtype=np.uint8)
255
+ base_img.fill(255)
256
+
257
+
258
+
259
+
260
+ # Extract moving points (positive point)
261
+ positive_point_cords = []
262
+ nonzeros = np.nonzero(flow_uv) # [(vertical), (horizontal)]
263
+ if len(nonzeros[0]) < SAM_positive_sample_num:
264
+ # We require the number of points to be more than SAM_positive_sample_num
265
+ return False
266
+ positive_orders = np.random.choice(len(nonzeros[0]), SAM_positive_sample_num, replace=False) # We randomly sample points instead of using all of them in the sam_predictor prediction
267
+ for i in range(len(nonzeros[0])):
268
+ if i in positive_orders:
269
+ positive_point_cords.append([nonzeros[1][i], nonzeros[0][i]]) # According to the documentation, the order should be horizontal first, then vertical
270
+ positive_point_cords = np.array(positive_point_cords)
271
+ positive_point_labels = np.ones(len(positive_point_cords))
272
+
273
+
274
+ # Define negative sample (outside the optical flow choice)
275
+ if SAM_negative_sample_num != 0:
276
+ skip_prob = 2 * SAM_negative_sample_num / (flow_uv.shape[0]*flow_uv.shape[1] - len(nonzeros[0]))
277
+ negative_point_cords = []
278
+ for i in range(flow_uv.shape[0]):
279
+ for j in range(flow_uv.shape[1]):
280
+ if flow_uv[i][j][0] == 0 and flow_uv[i][j][1] == 0: # 0 marks the no-motion zone; low motion was already filtered to zero above
281
+ if random.random() < skip_prob:
282
+ negative_point_cords.append([j, i]) # According to the documentation, the order should be horizontal first, then vertical
283
+ negative_point_cords = np.array(negative_point_cords) # [:SAM_negative_sample_num]
284
+ negative_point_labels = np.zeros(len(negative_point_cords)) # Make sure that it is less than / equals to SAM_negative_sample_num quantity
285
+
286
+
287
+
288
+ ################## Use SAM to filter out what we need (& use negative points) ##################
289
+ if idx == 0: # Only consider the first frame now.
290
+ # With sample coordinate
291
+ sam_predictor.set_image(np.uint8(np.transpose(image1.numpy(), (1,2,0))))
292
+ if SAM_negative_sample_num != 0 and len(negative_point_cords) != 0:
293
+ all_point_cords = np.concatenate((positive_point_cords, negative_point_cords), axis=0)
294
+ all_point_labels = np.concatenate((positive_point_labels, negative_point_labels), axis=0)
295
+ else:
296
+ all_point_cords = positive_point_cords
297
+ all_point_labels = positive_point_labels
298
+
299
+ masks, scores, logits = sam_predictor.predict(
300
+ point_coords=all_point_cords,
301
+ point_labels=all_point_labels,
302
+ multimask_output=False,
303
+ )
304
+ mask = masks[0] # TODO: Make sure we really pick the largest mask here rather than the second largest or others; there may be a bug, since we assume the first mask is the largest one
305
+ # if verbose:
306
+ # cv2.imwrite("mask_"+str(idx+1)+".png", (np.uint8(mask)*255))
307
+ # annotated_img = show_mask(mask)
308
+ # cv2.imwrite("annotated.png", annotated_img)
309
+
310
+
311
+ ################## Choose the one we need as the reference for the future tracking ##################
312
+ # Choose a random point in the mask
313
+ target_zone = np.nonzero(mask) # [(vertical), (horizontal)]
314
+ target_zone = [(target_zone[0][i], target_zone[1][i]) for i in range(len(target_zone[0]))] # Now, the structure is [(vertical, horizontal), ...]
315
+
316
+ repeat_time = 0
317
+ loop2find = True
318
+ while loop2find:
319
+ loop2find = False
320
+ start_point = target_zone[np.random.choice(len(target_zone), 1, replace=False)[0]]
321
+ start_vertical, start_horizontal = start_point
322
+
323
+ repeat_time += 1
324
+ if repeat_time == 100:
325
+ # In some minor case, it may have infinite loop, so we need to manually break if it is looping
326
+ print("We are still hard to find a optimal first point, but we cannot let it loop")
327
+ break
328
+
329
+ # Try to choose a start_point that is more centralized (Not close to the border)
330
+ fast_break = False
331
+ for i in range(-15, 15):
332
+ for j in range(-15, 15):
333
+ dil_vertical, dil_horizontal = start_vertical + i, start_horizontal + j
334
+ if (0 <= dil_vertical and dil_vertical < mask.shape[0]) and (0 <= dil_horizontal and dil_horizontal < mask.shape[1]):
335
+ if mask[dil_vertical][dil_horizontal] == 0:
336
+ print("We need to change to a new position for the start p Since this one is close to the border of the object...........")
337
+ loop2find = True
338
+ fast_break = True
339
+ break
340
+ else:
341
+ # We don't want to consider points that are close to the boundary
342
+ print("We need to change to a new position Since this one is close to the border of the image...........")
343
+ loop2find = True
344
+ fast_break = True
345
+ break
346
+ if fast_break:
347
+ break
348
+ traj_path.append(start_point)
349
+
350
+ status, base_img = visualize_traj(base_img, traj_path)
351
+ if status == False: # If the traj is False, we won't consider it anymore.
352
+ file = open("log.txt", "a")
353
+ file.write("Invalid start point\n")
354
+ return False
355
+
356
+ # Read from the last one in traj
357
+ ref_vertical, ref_horizontal = traj_path[-1][0], traj_path[-1][1]
358
+
359
+
360
+ # Get the average motion vector from the points surrounding the ref_point (8+1 directions), since this gives the most accurate statistics
361
+ horizon_lists, vertical_lists = [], []
362
+ start_range, end_range = -5, 5
363
+
364
+ # Calculate the average motion based on surrounding motion
365
+ search_times = 0
366
+ while len(horizon_lists) == 0: # If we cannot find a direction, we use average value inside this mask, but we will flag it.
367
+ search_times += 1
368
+
369
+ if search_times > 1:
370
+ print("This is hard to track!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! and we have tracked " + str(search_times) + " times")
371
+ # TODO: For out-of-boundary cases where search_times reaches 8-10, cut off the later frames, since they are very inaccurate; re-tracking a new point is also possible but not very meaningful. Decide based on the overall data quantity.
372
+ is_hard_to_track = True
373
+ hard_track_idxs.add(idx)
374
+
375
+ if abs(start_range) >= flow_uv.shape[0]//2:
376
+ file = open("log.txt", "a")
377
+ file.write("This folder has search all space but didn't find any place to track optical flow\n")
378
+ return False # If we have already search for the whole graph but didn't find anything to track, we discard this sample
379
+
380
+ # Search a larger nearby space; enlarging the search range should be the most stable choice
381
+ for i in range(start_range, end_range):
382
+ for j in range(start_range, end_range):
383
+ target_vertical, target_horizontal = ref_vertical + i, ref_horizontal + j
384
+ if 0 <= target_vertical and target_vertical < flow_uv.shape[0] and 0 <= target_horizontal and target_horizontal < flow_uv.shape[1]:
385
+ if flow_uv[target_vertical, target_horizontal, 0] == 0 or flow_uv[target_vertical, target_horizontal, 1] == 0:
386
+ continue # Ignore zero vector to ensure only calculate moving position
387
+ horizon_lists.append(flow_uv[target_vertical, target_horizontal, 0]) # Horizontal motion strength
388
+ vertical_lists.append(flow_uv[target_vertical, target_horizontal, 1]) # Vertical motion strength
389
+
390
+ # If nothing was found, we keep searching over a larger space
391
+ start_range -= 10
392
+ end_range += 10
393
+
394
+ average_dx = sum(horizon_lists)/len(horizon_lists)
395
+ average_dy = sum(vertical_lists)/len(vertical_lists)
396
+ print("average movement is ", (average_dx, average_dy))
397
+ traj_path.append(( int(traj_path[-1][0] + average_dy), int(traj_path[-1][1] + average_dx))) # Append the motion in independent order
398
+
399
+ print(traj_path)
400
+
401
+
402
+ ##################### Visualize the trajectory path (Debug Purpose) #####################
403
+ status, base_img = visualize_traj(base_img, traj_path)
404
+ if status == False: # If the traj is False, we won't consider it anymore.
405
+ return False
406
+
407
+ cv2.imwrite(os.path.join(store_dir, "traj_path.png"), cv2.cvtColor(base_img, cv2.COLOR_BGR2RGB))
408
+
409
+ if traj_visualization:
410
+ status, single_traj_img = visualize_traj(np.uint8(np.transpose(image1.numpy(), (1,2,0))), traj_path[:-1], connect_points=False)
411
+ if status == False: # If the traj is False, we won't consider it anymore.
412
+ return False
413
+
414
+ traj_write_path = os.path.join(store_dir, "traj_"+str(idx)+".png")
415
+ # cv2.imwrite(traj_write_path, cv2.cvtColor(single_traj_img, cv2.COLOR_BGR2RGB))
416
+ traj_image_lists.append(traj_write_path)
417
+
418
+
419
+ # if traj_visualization:
420
+ # images = []
421
+ # for filename in traj_image_lists:
422
+ # images.append(imageio.imread(filename))
423
+ # # os.remove(filename) # Remove when used
424
+ # imageio.mimsave(os.path.join(store_dir, 'traj_motion.gif'), images, duration=0.05)
425
+
426
+
427
+ # TODO: If the trajectory is hard to track, we could aggressively retry a few times and use the length of hard_track_idxs to roughly judge which attempt is best, picking the best of three
428
+ if is_hard_to_track:
429
+ if len(hard_track_idxs) >= len(img_pairs)//3: # If more than a third of the traj is hard to track, we consider discarding this one
430
+ file = open("log.txt", "a")
431
+ file.write("we have a lot of times hard to find dx and dy movement. Under this circumstance, we are not very recommended to use the track\n")
432
+ return False
433
+
434
+
435
+ # Write a file store all position for further utilization
436
+ txt_path = os.path.join(store_dir, "traj_data.txt")
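+ # traj_data.txt stores one "<vertical> <horizontal>" pair per line, in temporal order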
437
+ if os.path.exists(txt_path):
438
+ os.remove(txt_path)
439
+ file = open(txt_path, "a")
440
+ for traj in traj_path:
441
+ file.write(str(traj[0]) + " " + str(traj[1]) + "\n")
442
+ # Save in numpy information
443
+ # with open(os.path.join(store_dir, 'traj_data.npy'), 'wb') as f:
444
+ # np.save(f, flow_uv)
445
+ print("We write ", traj_path)
446
+ return True
447
+
448
+
449
+
450
+ def manage_seq_range(input_dir, store_dir, total_frame_needed):
451
+
452
+ lists = os.listdir(input_dir)
453
+ lists = lists[2:-2]
454
+ num_frames_input = len(lists)
455
+
456
+ if num_frames_input < total_frame_needed:
457
+ print("The number of frames is too short for constructing the sequnece length needed")
458
+ return False
459
+
460
+
461
+ division_factor = num_frames_input // total_frame_needed
462
+ remain_frame = num_frames_input % total_frame_needed
463
+
464
+ gaps = [division_factor for _ in range(total_frame_needed)]
465
+ for idx in range(remain_frame):
466
+ gaps[idx] += 1
467
+
468
+
469
+ cur_idx = 2
470
+ for global_idx, gap in enumerate(gaps):
471
+ source_path = os.path.join(input_dir, "im_"+str(cur_idx)+".jpg")
472
+ destination_path = os.path.join(store_dir, "im_"+str(global_idx)+".jpg")
473
+
474
+ shutil.copyfile(source_path, destination_path)
475
+ cur_idx += gap
476
+
477
+ return True
478
+
479
+
480
+ def generate_pairs(dirname, start_idx, end_idx):
481
+ img_pairs = []
482
+ for idx in range(start_idx, end_idx):
483
+ img1 = osp.join(dirname, f'im_{idx}.jpg')
484
+ img2 = osp.join(dirname, f'im_{idx+1}.jpg')
485
+ # img1 = f'{idx:06}.png'
486
+ # img2 = f'{idx+1:06}.png'
487
+ img_pairs.append((img1, img2))
488
+
489
+ return img_pairs
490
+
491
+
492
+ def process_partial_request(request_list, num_frames, traj_visualization, viz_root_dir):
493
+
494
+
495
+ # Init the optical flow model
496
+ optical_flow_model = build_model()
497
+
498
+ # Init SAM for segmentation task
499
+ model_type = "vit_h"
500
+ weight_path = "pretrained/sam_vit_h_4b8939.pth"
501
+ SAM_positive_sample_num = 20 # How many points we use for the positive sample num ()
502
+ SAM_negative_sample_num = 0 # How many points we use for the negative sample num
503
+
504
+ print("In multi processing, we will build an instance of mask_generator independently")
505
+ sam = sam_model_registry[model_type](checkpoint=weight_path).to(device="cuda")
506
+ mask_generator = SamAutomaticMaskGenerator(sam)
507
+ print("In multi processing, we will build an instance of sam_predictor independently")
508
+ sam_predictor = SamPredictor(sam)
509
+
510
+
511
+ counter = 0
512
+ while True:
513
+ counter += 1
514
+ if counter == 10:
515
+ counter = 0
516
+ gc.collect()
517
+ print("We will sleep here to clear memory")
518
+ time.sleep(5)
519
+ info = request_list[0]
520
+ request_list = request_list[1:]
521
+ if info == None:
522
+ print("This queue ends")
523
+ break
524
+
525
+
526
+ # Process each sub_input_dir and store the information there
527
+ sub_input_dir = info
528
+
529
+
530
+ img_pairs = generate_pairs(sub_input_dir, 0, num_frames-1)
531
+ print(img_pairs)
532
+
533
+ with torch.no_grad():
534
+
535
+ # Calculate the optical flow and return a status to say whther this generated flow is usable
536
+ status = calculate_flow(viz_root_dir, sub_input_dir, img_pairs, optical_flow_model, sam_predictor, SAM_positive_sample_num, SAM_negative_sample_num,
537
+ mask_generator, traj_visualization, keep_size = True)
538
+
539
+ # file = open("log.txt", "a")
540
+ print("The status for folder " + sub_input_dir + " is " + str(status) + "\n")
541
+
542
+ if status == False:
543
+ # If the status is failed, we will remove it afterwords
544
+ print("The status is Failed, so we won't store this one as one promising data")
545
+ else:
546
+ print("We have successfully process one!")
547
+
548
+
549
+ if __name__ == '__main__':
550
+
551
+ # Manage the parameters
552
+ parser = argparse.ArgumentParser()
553
+ parser.add_argument('--input_dir', default = '../validation_flow14/')
554
+ parser.add_argument('--num_workers', type = int, default = 1) # Number of parallel worker processes
555
+ parser.add_argument('--viz_root_dir', default = 'viz_results')
556
+ parser.add_argument('--traj_visualization', default = True) # If this is True, save per-frame trajectory visualizations
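+ # Example invocation (assumed; adjust the paths to your own data):
+ #   python scripts/generate_traj.py --input_dir ../validation_flow14/ --num_workers 1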
557
+
558
+ # list_start = 0
559
+ # list_end = 25000
560
+ num_frames = 14
561
+
562
+ args = parser.parse_args()
563
+ input_dir = args.input_dir
564
+ num_workers = args.num_workers
565
+ viz_root_dir = args.viz_root_dir
566
+ traj_visualization = args.traj_visualization
567
+
568
+
569
+
570
+ store_idx = 0
571
+ dir_list = []
572
+ for sub_input_name in sorted(os.listdir(input_dir)):
573
+ sub_input_dir = os.path.join(input_dir, sub_input_name)
574
+ # sub_store_dir = os.path.join(store_dir, "0"*(7-len(str(store_idx)))+str(store_idx))
575
+ store_idx += 1
576
+ dir_list.append(sub_input_dir)
577
+
578
+ # Truncate the list to the target
579
+ # dir_list = dir_list[list_start:]
580
+
581
+
582
+ # Use multiprocessing to handle to speed up
583
+ num = math.ceil(len(dir_list) / num_workers)
584
+ for idx in range(num_workers):
585
+ # set_start_method('spawn', force=True)
586
+
587
+ request_list = dir_list[:num]
588
+ request_list.append(None)
589
+ dir_list = dir_list[num:]
590
+
591
+
592
+ process_partial_request(request_list, num_frames, traj_visualization, viz_root_dir) # This is for debug purpose
593
+ # p = mp.Process(target=process_partial_request, args=(request_list, num_frames, traj_visualization, viz_root_dir, ))
594
+ # p.start()
595
+
596
+ print("Submitted all jobs!")
597
+ # p.join() # Without this, the multiprocessing run seemed to end on its own for some reason
598
+ print("All task finished!")
599
+
600
+
601
+
scripts/interpolate_by_repeat.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ This file repeats frames so that the sequence reaches the target number of frames needed
3
+ '''
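+ # Worked example: with 7 input frames and total_frames_needed = 14,
+ # division_factor = 2 and remain_frames = -1 (the adjustment loop is skipped),
+ # so every frame is simply repeated twice.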
4
+ import os, shutil, sys
5
+
6
+ if __name__ == "__main__":
7
+ input_path = "/nfs/turbo/coe-jjparkcv/boyangwa/AVDC/AVDC_results"
8
+ store_path = "/nfs/turbo/coe-jjparkcv/boyangwa/AVDC/AVDC_results_interpolated"
9
+ total_frames_needed = 14
10
+
11
+ # Handle the file folder management
12
+ if os.path.exists(store_path):
13
+ shutil.rmtree(store_path)
14
+ os.makedirs(store_path)
15
+
16
+ for video_name in sorted(os.listdir(input_path)):
17
+ sub_input_path = os.path.join(input_path, video_name)
18
+ sub_store_path = os.path.join(store_path, video_name)
19
+
20
+ # Create the store place
21
+ os.makedirs(sub_store_path)
22
+
23
+ # Find valid image lists
24
+ num_frames_input = 0
25
+ for file_name in os.listdir(sub_input_path):
26
+ if file_name.endswith("png"):
27
+ num_frames_input += 1
28
+ print("num_frames_input is ", num_frames_input)
29
+
30
+ # Calculate needed parameters
31
+ division_factor = total_frames_needed // num_frames_input
32
+ remain_frames = (total_frames_needed % num_frames_input) - 1 # -1 for adaptation
33
+
34
+ # Define the gap
35
+ gaps = [division_factor for _ in range(num_frames_input)]
36
+ for idx in range(remain_frames):
37
+ if idx % 2 == 0:
38
+ gaps[idx//2] += 1 # Start to end order
39
+ else:
40
+ gaps[-1*(1+(idx//2))] += 1 # End to start order
41
+
42
+ print("gaps is ", gaps)
43
+
44
+
45
+ # Write to the new folder
46
+ store_idx = 0
47
+ for frame_idx, gap in enumerate(gaps):
48
+ for tmp in range(gap): # Repeat copy gap num of times
49
+ img_path = os.path.join(sub_input_path, str(frame_idx)+".png")
50
+ shutil.copyfile(img_path, os.path.join(sub_store_path, str(store_idx)+".png"))
51
+ store_idx += 1
52
+
53
+
54
+
55
+
scripts/length_stats.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, shutil
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+
5
+
6
+ if __name__ == "__main__":
7
+ input_folder_path = "../Bridge_v2"
8
+
9
+ average_length = []
10
+
11
+ # Iterate each file
12
+ for sub_folder_name in sorted(os.listdir(input_folder_path)):
13
+ sub_folder_path = os.path.join(input_folder_path, sub_folder_name)
14
+
15
+ average_length.append(len(os.listdir(sub_folder_path))) # May be one more than expected, but we keep it
16
+ print("average length of {} is {}".format(sub_folder_name, average_length[-1]))
17
+
18
+ print("average_movement_list is ", average_length)
19
+ n, bins, patches = plt.hist(average_length, bins=100)
20
+ plt.savefig("dataset_length2.png")
21
+
scripts/motion_stats.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, shutil
2
+ import numpy as np
3
+ import math
4
+ from statistics import mean
5
+ import matplotlib.pyplot as plt
6
+
7
+
8
+ if __name__ == "__main__":
9
+ input_folder_paths = ["../datasets_rob/Bridge_v1_raw", "../datasets_rob/Bridge_v2_raw"] # "../datasets_rob/Bridge_v1_raw", "../datasets_rob/Bridge_v2_raw"
10
+ num_frames = 14
11
+ store_name = "movement.png"
12
+
13
+
14
+ average_movement_list = []
15
+ not_valid_num = 0
16
+ not_exists_num = 0
17
+ # Iterate each file
18
+ for input_folder_path in input_folder_paths:
19
+ for sub_folder_name in sorted(os.listdir(input_folder_path)):
20
+ sub_folder_path = os.path.join(input_folder_path, sub_folder_name)
21
+ flow_path = os.path.join(sub_folder_path, 'flow.txt')
22
+
23
+ if not os.path.exists(flow_path):
24
+ not_exists_num += 1
25
+ continue
26
+
27
+
28
+ # Read the movement
29
+ file = open(flow_path, 'r')
30
+ info = file.readlines()
31
+ print(info)
32
+ if len(info) == 0:
33
+ not_valid_num += 1
34
+ continue
35
+ info = info[0][:-2]
36
+ per_video_movement = float(info)
37
+
38
+
39
+ # Calculate the number of frames in this video
40
+ num_frames_input = 0
41
+ valid = True
42
+ for file_name in os.listdir(sub_folder_path): # num_frames_input is the total number of files with name begin with im_
43
+ if file_name.startswith("im_"):
44
+ num_frames_input += 1
45
+ for idx in range(num_frames_input): # Ensure that this number is concurrent
46
+ img_path = os.path.join(sub_folder_path, 'im_' + str(idx) + '.jpg')
47
+ if not os.path.exists(img_path): # Should be sequential existing
48
+ valid = False
49
+ break
50
+ if num_frames_input < 2:
51
+ valid = False
52
+ if not valid:
53
+ not_valid_num += 1
54
+ print("This is not valid path")
55
+ continue
56
+
57
+ average_movement_list.append(per_video_movement * (num_frames_input/num_frames)) # May be one more than expected, but we keep this
58
+ print("average movement of {} is {}".format(sub_folder_name, average_movement_list[-1]))
59
+
60
+ print("not_exists_num is ", not_exists_num)
61
+ print("not_valid_num is ", not_valid_num)
62
+ print("average_movement_list length is ", len(average_movement_list))
63
+
64
+ # Get mean and variance data
65
+ mean_value = mean(average_movement_list)
66
+ std_value = math.sqrt(np.var(average_movement_list))
67
+ print("Mean is ", mean_value)
68
+ print("std_value is ", std_value)
69
+
70
+ # Plot the figure
71
+ n, bins, patches = plt.hist(average_movement_list, bins=100)
72
+ plt.title("Mean" + str(mean_value) + "_STD"+str(std_value))
73
+ plt.savefig(store_name)
74
+
75
+
scripts/process_llama.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ Process the llama file for the next step
3
+ '''
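+ # Each JSONL record is expected to carry "file_path", "input" and "output"
+ # fields, where "output" starts with "action:" and holds one "key: value"
+ # line per step (inferred from the parsing below).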
4
+ import os, shutil, sys
5
+ import json
6
+ import pandas as pd
7
+ import collections
8
+
9
+
10
+ if __name__ == "__main__":
11
+
12
+ # Define important path
13
+ json_path = "../SVD1/v1.jsonl"
14
+ folder_path = "/home/kiteret/Desktop/StableVideoDiffusion/full_text_tmp/"
15
+
16
+
17
+ # Read the json file
18
+ with open(json_path, 'r') as json_file:
19
+ json_list = list(json_file)
20
+
21
+ # Iterate all the json files
22
+ length_stats = collections.defaultdict(int)
23
+ for json_info in json_list:
24
+ json_info = json.loads(json_info)
25
+
26
+
27
+ # Define the path to write
28
+ key_start = len("/home/chfeng/llama3/full_text_tmp/")
29
+ key_end = len("lang.txt")
30
+ sub_path = json_info["file_path"][key_start:int(-1*key_end)]
31
+ new_text_path = os.path.join(folder_path, sub_path, "processed_text.txt")
32
+ if os.path.exists(new_text_path):
33
+ os.remove(new_text_path)
34
+
35
+
36
+ # Sanity check for the case where the input is missing
37
+ if json_info["input"] == "":
38
+ print("It is weird for the input is empty in the LLM process for ", sub_path)
39
+ continue
40
+
41
+
42
+ # Re-Define the content
43
+ outputs = json_info["output"]
44
+ if outputs.find("action:") != 0:
45
+ print("It is weird for no actions: keyword in the outputs for ", sub_path, " with prompt ", outputs)
46
+ continue
47
+
48
+ # Prepare write file
49
+ contents = outputs.split('\n')
50
+ f = open(new_text_path, "a")
51
+
52
+ # Iterate
53
+ effective_length = 0
54
+ for idx, content in enumerate(contents):
55
+ key_word = content.split(":")[1][1:]
56
+ if key_word != "":
57
+ effective_length += 1
58
+ else:
59
+ if idx == 1:
60
+ print("It is abnormal for the this content to be empty ", sub_path, " with prompt ", outputs)
61
+ f.write(key_word + "\n")
62
+ # if effective_length == 2:
63
+ # print("short prompt case is ", sub_path, " with prompt ", outputs)
64
+ if effective_length < 2: # For those only 1 or zero, we won't consider them
65
+ print("The prompt is too short for ", sub_path, " with prompt ", outputs)
66
+ os.remove(new_text_path)
67
+
68
+ length_stats[effective_length] += 1
69
+
70
+ print("length_stats is ", length_stats)
71
+
72
+
73
+
74
+
scripts/process_sim.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ This is a script to process Mark's data.
3
+ '''
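+ # Inferred from the code below: every block of 10 runs (run_0..run_9,
+ # run_10..run_19, ...) is collapsed into one sample: the frames come from the
+ # first run of the block, and the ten lang.txt prompts are merged into a
+ # single lang.txt with the most descriptive one written first.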
4
+ import os, sys, shutil
5
+
6
+ if __name__ == "__main__":
7
+ file_path = "/nfs/turbo/coe-jjparkcv/datasets/isaac-gym-pick-place/full/dataset_v3_proc"
8
+ store_path = "../datasets_rob/sim_raw"
9
+ most_descriptive_prompt_idx = 6 # Index starts from 0
10
+
11
+
12
+ # Folder management
13
+ if os.path.exists(store_path):
14
+ shutil.rmtree(store_path)
15
+ os.makedirs(store_path)
16
+
17
+ # Check length
18
+ file_names = os.listdir(file_path)
19
+ target_length = len(file_names) // 10 # 10 files as a cycle
20
+
21
+
22
+ for idx in range(target_length):
23
+ sub_folder_path = os.path.join(file_path, "run_"+str(10*idx))
24
+ if not os.path.exists(sub_folder_path):
25
+ continue
26
+
27
+ # Prepare the target position
28
+ sub_store_path = os.path.join(store_path, str(idx))
29
+ os.makedirs(sub_store_path)
30
+
31
+ # Find the key prompt to read it
32
+ prompt_content = []
33
+ for tmp_idx in range(10):
34
+ tmp_text_path = os.path.join(file_path, "run_"+str(10*idx + tmp_idx), "lang.txt") # Usually, the 6th is the most concrete version
35
+ if not os.path.exists(tmp_text_path):
36
+ continue
37
+ file = open(tmp_text_path, 'r')
38
+ prompt_content.append(file.readlines()[0])
39
+ file.close()
40
+ print("prompt_content we have num ", len(prompt_content))
41
+
42
+
43
+
44
+ # Copy the image into the target position and copy the data.txt
45
+ for file_name in os.listdir(sub_folder_path):
46
+ if file_name == "lang.txt":
47
+ continue
48
+ shutil.copyfile(os.path.join(sub_folder_path, file_name), os.path.join(sub_store_path, file_name))
49
+
50
+ # Handle the lang.txt
51
+ target_lang_txt_path = os.path.join(sub_store_path, "lang.txt")
52
+ f = open(target_lang_txt_path, "a")
53
+ f.write(prompt_content[most_descriptive_prompt_idx]+"\n")
54
+ for tmp_idx in range(10):
55
+ if tmp_idx == most_descriptive_prompt_idx:
56
+ continue
57
+ f.write(prompt_content[tmp_idx]+"\n")
58
+ f.close()
59
+
scripts/resize_img.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, shutil
2
+ import cv2
3
+
4
+ if __name__ == "__main__":
5
+ input_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/resize"
6
+ output_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/resize_resized"
7
+
8
+ if os.path.exists(output_path):
9
+ shutil.rmtree(output_path)
10
+ os.makedirs(output_path)
11
+
12
+ for img_name in os.listdir(input_path):
13
+ img_path = os.path.join(input_path, img_name)
14
+ img = cv2.imread(img_path)
15
+ img = cv2.resize(img, (384, 256))
16
+ store_path = os.path.join(output_path, img_name)
17
+ cv2.imwrite(store_path, img)
scripts/resize_video_seq.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ This file is designed to resize the video sequence to the target resolution
3
+ '''
4
+ import os, sys, shutil
5
+ import cv2
6
+
7
+ if __name__ == "__main__":
8
+ input_folder = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/model_results/SVD_results"
9
+ store_path = "/nfs/turbo/jjparkcv-turbo-large/boyangwa/model_results/SVD_results_resized"
10
+ target_height, target_width = 256, 384
11
+
12
+ if os.path.exists(store_path):
13
+ shutil.rmtree(store_path)
14
+ os.makedirs(store_path)
15
+
16
+ for video_name in sorted(os.listdir(input_folder)):
17
+ print("We are processing ", video_name)
18
+ sub_video_folder = os.path.join(input_folder, video_name)
19
+ sub_store_folder = os.path.join(store_path, video_name)
20
+ os.makedirs(sub_store_folder)
21
+
22
+ for img_name in os.listdir(sub_video_folder):
23
+ if not img_name.endswith("jpg") and not img_name.endswith("png"):
24
+ continue
25
+
26
+ img_path = os.path.join(sub_video_folder, img_name)
27
+ store_img_path = os.path.join(sub_store_folder, img_name)
28
+ img = cv2.imread(img_path)
29
+
30
+ # Resize
31
+ img = cv2.resize(img, (target_width, target_height))
32
+ cv2.imwrite(store_img_path, img)
33
+
scripts/train_test_split.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, sys, shutil
2
+ import random
3
+
4
+
5
+ if __name__ == "__main__":
6
+ base_dataset_path = "../datasets_rob/Bridge_v1_raw"
7
+ test_store_path = "../datasets_rob/Bridge_v1_test_raw"
8
+ split_ratio = 0.1 # [0, 1] range
9
+
10
+ # Prepare the folder
11
+ if os.path.exists(test_store_path):
12
+ shutil.rmtree(test_store_path)
13
+ os.makedirs(test_store_path)
14
+
15
+ full_img_lists = os.listdir(base_dataset_path)
16
+ random.shuffle(full_img_lists)
17
+ target_test_length = int(len(full_img_lists) * split_ratio)
18
+ test_img_lists = full_img_lists[-1 * target_test_length : ]
19
+
20
+ # Move the lists based on test_img_lists
21
+ for test_img_name in test_img_lists:
22
+ shutil.move(os.path.join(base_dataset_path, test_img_name), os.path.join(test_store_path, test_img_name))
23
+
scripts/visualize_thisthat_point.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ '''
2
+ This script is provided to change the destination point and visualize it.
3
+ '''
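+ # data.txt holds "<frame_idx> <horizontal> <vertical>" per line; the first
+ # point is drawn from the file, while (new_w, new_h) below specify a manually
+ # chosen replacement destination point.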
4
+
5
+ import os, cv2
6
+
7
+
8
+ def draw_dot(ref_img, new_h, new_w):
9
+ # Draw the dot
10
+ dot_range = 3
11
+ for i in range(-1*dot_range, dot_range+1):
12
+ for j in range(-1*dot_range, dot_range+1):
13
+ dil_vertical, dil_horizontal = new_h + i, new_w + j
14
+ if (0 <= dil_vertical and dil_vertical < ref_img.shape[0]) and (0 <= dil_horizontal and dil_horizontal < ref_img.shape[1]):
15
+ ref_img[dil_vertical, dil_horizontal, :] = [0, 128, 0]
16
+
17
+ return ref_img
18
+
19
+
20
+ if __name__ == "__main__":
21
+ instance_path = "datasets/validation_thisthat14/000049/"
22
+ new_w, new_h = 385, 310
23
+ # 256.1850280761719 241.71287155151367
24
+
25
+ # Read the items
26
+ data_path = os.path.join(instance_path, "data.txt")
27
+ ref_img_path = os.path.join(instance_path, "im_0.jpg")
28
+ ref_img = cv2.imread(ref_img_path)
29
+
30
+
31
+ # Read the first point
32
+ file1 = open(data_path, 'r')
33
+ Lines = file1.readlines()
34
+ frame_idx, horizontal, vertical = Lines[0].split(' ')
35
+ ref_img = draw_dot(ref_img, int(float(vertical)), int(float(horizontal)))
36
+
37
+ # Second dot
38
+ ref_img = draw_dot(ref_img, new_h, new_w)
39
+
40
+
41
+
42
+ # Store the image
43
+ cv2.imwrite("visual.png", ref_img)
svd/diffusion_arch/transformer_temporal.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from dataclasses import dataclass
+ from typing import Any, Dict, Optional
+ import torch
+ from torch import nn
+
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.utils import BaseOutput
+ from diffusers.models.attention import BasicTransformerBlock, TemporalBasicTransformerBlock
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+ from diffusers.models.modeling_utils import ModelMixin
+ from diffusers.models.resnet import AlphaBlender
+
+
+ @dataclass
+ class TransformerTemporalModelOutput(BaseOutput):
+     """
+     The output of [`TransformerTemporalModel`].
+
+     Args:
+         sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`):
+             The hidden states output conditioned on `encoder_hidden_states` input.
+     """
+
+     sample: torch.FloatTensor
+
+
+ class TransformerTemporalModel(ModelMixin, ConfigMixin):
+     """
+     A Transformer model for video-like data.
+
+     Parameters:
+         num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+         attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+         in_channels (`int`, *optional*):
+             The number of channels in the input and output (specify if the input is **continuous**).
+         num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+         dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+         cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+         attention_bias (`bool`, *optional*):
+             Configure if the `TransformerBlock` attention should contain a bias parameter.
+         sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+             This is fixed during training since it is used to learn a number of position embeddings.
+         activation_fn (`str`, *optional*, defaults to `"geglu"`):
+             Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
+             activation functions.
+         norm_elementwise_affine (`bool`, *optional*):
+             Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
+         double_self_attention (`bool`, *optional*):
+             Configure if each `TransformerBlock` should contain two self-attention layers.
+         positional_embeddings: (`str`, *optional*):
+             The type of positional embeddings to apply to the sequence input before use.
+         num_positional_embeddings: (`int`, *optional*):
+             The maximum length of the sequence over which to apply positional embeddings.
+     """
+
+     @register_to_config
+     def __init__(
+         self,
+         num_attention_heads: int = 16,
+         attention_head_dim: int = 88,
+         in_channels: Optional[int] = None,
+         out_channels: Optional[int] = None,
+         num_layers: int = 1,
+         dropout: float = 0.0,
+         norm_num_groups: int = 32,
+         cross_attention_dim: Optional[int] = None,
+         attention_bias: bool = False,
+         sample_size: Optional[int] = None,
+         activation_fn: str = "geglu",
+         norm_elementwise_affine: bool = True,
+         double_self_attention: bool = True,
+         positional_embeddings: Optional[str] = None,
+         num_positional_embeddings: Optional[int] = None,
+     ):
+         super().__init__()
+         self.num_attention_heads = num_attention_heads
+         self.attention_head_dim = attention_head_dim
+         inner_dim = num_attention_heads * attention_head_dim
+
+         self.in_channels = in_channels
+
+         self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+         self.proj_in = nn.Linear(in_channels, inner_dim)
+
+         # 3. Define transformer blocks
+         self.transformer_blocks = nn.ModuleList(
+             [
+                 BasicTransformerBlock(
+                     inner_dim,
+                     num_attention_heads,
+                     attention_head_dim,
+                     dropout=dropout,
+                     cross_attention_dim=cross_attention_dim,
+                     activation_fn=activation_fn,
+                     attention_bias=attention_bias,
+                     double_self_attention=double_self_attention,
+                     norm_elementwise_affine=norm_elementwise_affine,
+                     positional_embeddings=positional_embeddings,
+                     num_positional_embeddings=num_positional_embeddings,
+                 )
+                 for d in range(num_layers)
+             ]
+         )
+
+         self.proj_out = nn.Linear(inner_dim, in_channels)
+
+     def forward(
+         self,
+         hidden_states: torch.FloatTensor,
+         encoder_hidden_states: Optional[torch.LongTensor] = None,
+         timestep: Optional[torch.LongTensor] = None,
+         class_labels: torch.LongTensor = None,
+         num_frames: int = 1,
+         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+         return_dict: bool = True,
+     ) -> TransformerTemporalModelOutput:
+         """
+         The [`TransformerTemporalModel`] forward method.
+
+         Args:
+             hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
+                 Input hidden_states.
+             encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                 Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                 self-attention.
+             timestep ( `torch.LongTensor`, *optional*):
+                 Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+             class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                 Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                 `AdaLayerZeroNorm`.
+             num_frames (`int`, *optional*, defaults to 1):
+                 The number of frames to be processed per batch. This is used to reshape the hidden states.
+             cross_attention_kwargs (`dict`, *optional*):
+                 A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                 `self.processor` in
+                 [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of
+                 a plain tuple.
+
+         Returns:
+             [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
+                 If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
+                 returned, otherwise a `tuple` where the first element is the sample tensor.
+         """
+         # 1. Input
+         batch_frames, channel, height, width = hidden_states.shape
+         batch_size = batch_frames // num_frames
+
+         residual = hidden_states
+
+         hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, channel, height, width)
+         hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+
+         hidden_states = self.norm(hidden_states)
+         hidden_states = hidden_states.permute(0, 3, 4, 2, 1).reshape(batch_size * height * width, num_frames, channel)
+
+         hidden_states = self.proj_in(hidden_states)
+
+         # 2. Blocks
+         for block in self.transformer_blocks:
+             hidden_states = block(
+                 hidden_states,
+                 encoder_hidden_states=encoder_hidden_states,
+                 timestep=timestep,
+                 cross_attention_kwargs=cross_attention_kwargs,
+                 class_labels=class_labels,
+             )
+
+         # 3. Output
+         hidden_states = self.proj_out(hidden_states)
+         hidden_states = (
+             hidden_states[None, None, :]
+             .reshape(batch_size, height, width, num_frames, channel)
+             .permute(0, 3, 4, 1, 2)
+             .contiguous()
+         )
+         hidden_states = hidden_states.reshape(batch_frames, channel, height, width)
+
+         output = hidden_states + residual
+
+         if not return_dict:
+             return (output,)
+
+         return TransformerTemporalModelOutput(sample=output)
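# ------------------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the committed file): a dummy forward pass
# through TransformerTemporalModel as defined above. The hyper-parameters below are
# assumptions, chosen so that num_attention_heads * attention_head_dim == in_channels
# and in_channels is divisible by norm_num_groups (32).
# ------------------------------------------------------------------------------------
model = TransformerTemporalModel(num_attention_heads=8, attention_head_dim=40, in_channels=320)
batch_size, num_frames = 1, 4
latents = torch.randn(batch_size * num_frames, 320, 16, 16)  # frames stacked along the batch dimension
out = model(latents, num_frames=num_frames).sample           # temporal attention mixes the 4 frames per pixel
assert out.shape == latents.shape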
+
+
+ class TransformerSpatioTemporalModel(nn.Module):
+     """
+     A Transformer model for video-like data.
+
+     Parameters:
+         num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+         attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+         in_channels (`int`, *optional*):
+             The number of channels in the input and output (specify if the input is **continuous**).
+         out_channels (`int`, *optional*):
+             The number of channels in the output (specify if the input is **continuous**).
+         num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+         cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+     """
+
+     def __init__(
+         self,
+         num_attention_heads: int = 16,
+         attention_head_dim: int = 88,
+         in_channels: int = 320,
+         out_channels: Optional[int] = None,
+         num_layers: int = 1,
+         cross_attention_dim: Optional[int] = None,
+     ):
+         super().__init__()
+         self.num_attention_heads = num_attention_heads
+         self.attention_head_dim = attention_head_dim
+
+         inner_dim = num_attention_heads * attention_head_dim
+         self.inner_dim = inner_dim
+
+         # 2. Define input layers
+         self.in_channels = in_channels
+         self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6)
+         self.proj_in = nn.Linear(in_channels, inner_dim)
+
+         # 3. Define transformer blocks
+         self.transformer_blocks = nn.ModuleList(
+             [
+                 BasicTransformerBlock(
+                     inner_dim,
+                     num_attention_heads,
+                     attention_head_dim,
+                     cross_attention_dim=cross_attention_dim,
+                 )
+                 for d in range(num_layers)
+             ]
+         )
+
+         time_mix_inner_dim = inner_dim
+         self.temporal_transformer_blocks = nn.ModuleList(
+             [
+                 TemporalBasicTransformerBlock(
+                     inner_dim,
+                     time_mix_inner_dim,
+                     num_attention_heads,
+                     attention_head_dim,
+                     cross_attention_dim=cross_attention_dim,
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+
+         time_embed_dim = in_channels * 4
+         self.time_pos_embed = TimestepEmbedding(in_channels, time_embed_dim, out_dim=in_channels)
+         self.time_proj = Timesteps(in_channels, True, 0)
+         self.time_mixer = AlphaBlender(alpha=0.5, merge_strategy="learned_with_images")
+
+         # 4. Define output layers
+         self.out_channels = in_channels if out_channels is None else out_channels
+         # TODO: should use out_channels for continuous projections
+         self.proj_out = nn.Linear(inner_dim, in_channels)
+
+         self.gradient_checkpointing = False
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         image_only_indicator: Optional[torch.Tensor] = None,
+         return_dict: bool = True,
+     ):
+         """
+         Args:
+             hidden_states (`torch.FloatTensor` of shape `(batch size * num frames, channel, height, width)`):
+                 Input hidden_states.
+             num_frames (`int`):
+                 The number of frames to be processed per batch, inferred from the last dimension of
+                 `image_only_indicator`. This is used to reshape the hidden states.
+             encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                 Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                 self-attention.
+             image_only_indicator (`torch.LongTensor` of shape `(batch size, num_frames)`, *optional*):
+                 A tensor indicating whether the input contains only images. 1 indicates that the input contains only
+                 images, 0 indicates that the input contains video frames.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of
+                 a plain tuple.
+
+         Returns:
+             [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`:
+                 If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
+                 returned, otherwise a `tuple` where the first element is the sample tensor.
+         """
+         # 1. Input
+         batch_frames, _, height, width = hidden_states.shape
+         num_frames = image_only_indicator.shape[-1]
+         batch_size = batch_frames // num_frames
+
+         time_context = encoder_hidden_states
+         time_context_first_timestep = time_context[None, :].reshape(
+             batch_size, num_frames, -1, time_context.shape[-1]
+         )[:, 0]  # the cross-attention context for the temporal blocks only uses the first frame
+
+
+         encoder_hidden_states_dim = time_context_first_timestep.shape[1]
+         time_context = time_context_first_timestep[None, :].broadcast_to(
+             height * width, batch_size, encoder_hidden_states_dim, time_context.shape[-1]
+         )
+         time_context = time_context.reshape(height * width * batch_size, encoder_hidden_states_dim, time_context.shape[-1])
+
+         residual = hidden_states
+
+         hidden_states = self.norm(hidden_states)
+         inner_dim = hidden_states.shape[1]
+         hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_frames, height * width, inner_dim)
+         hidden_states = self.proj_in(hidden_states)
+
+         num_frames_emb = torch.arange(num_frames, device=hidden_states.device)
+         num_frames_emb = num_frames_emb.repeat(batch_size, 1)
+         num_frames_emb = num_frames_emb.reshape(-1)
+         t_emb = self.time_proj(num_frames_emb)
+
+         # `Timesteps` does not contain any weights and will always return f32 tensors,
+         # but time_embedding might actually be running in fp16, so we need to cast here.
+         # There might be better ways to encapsulate this.
+         t_emb = t_emb.to(dtype=hidden_states.dtype)
+
+         emb = self.time_pos_embed(t_emb)
+         emb = emb[:, None, :]
+
+         # 2. Blocks
+         for block, temporal_block in zip(self.transformer_blocks, self.temporal_transformer_blocks):
+             if self.training and self.gradient_checkpointing:
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     block,
+                     hidden_states,
+                     None,
+                     encoder_hidden_states,
+                     None,
+                     use_reentrant=False,
+                 )
+             else:
+                 hidden_states = block(
+                     hidden_states,
+                     encoder_hidden_states=encoder_hidden_states,
+                 )
+
+             hidden_states_mix = hidden_states
+             hidden_states_mix = hidden_states_mix + emb
+
+             hidden_states_mix = temporal_block(
+                 hidden_states_mix,
+                 num_frames=num_frames,
+                 encoder_hidden_states=time_context,
+             )
+             hidden_states = self.time_mixer(
+                 x_spatial=hidden_states,
+                 x_temporal=hidden_states_mix,
+                 image_only_indicator=image_only_indicator,
+             )
+
+         # 3. Output
+         hidden_states = self.proj_out(hidden_states)
+         hidden_states = hidden_states.reshape(batch_frames, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+
+         output = hidden_states + residual
+
+         if not return_dict:
+             return (output,)
+
+         return TransformerTemporalModelOutput(sample=output)
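To round out the listing, a similar hedged sketch for `TransformerSpatioTemporalModel`: the context width (`cross_attention_dim=1024`), the single context token per frame, and the 8x8 spatial size are illustrative assumptions; `num_frames` is recovered inside `forward` from the last dimension of `image_only_indicator`, and zeros there mark video frames.

# Illustrative sketch only, not part of the committed module.
model = TransformerSpatioTemporalModel(
    num_attention_heads=8, attention_head_dim=40, in_channels=320, cross_attention_dim=1024
)
batch_size, num_frames = 1, 4
latents = torch.randn(batch_size * num_frames, 320, 8, 8)    # (batch * frames, C, H, W)
context = torch.randn(batch_size * num_frames, 1, 1024)      # one cross-attention token per frame
indicator = torch.zeros(batch_size, num_frames)               # 0 = video frame, 1 = still image
out = model(latents, encoder_hidden_states=context, image_only_indicator=indicator).sample
assert out.shape == latents.shape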