open-sora

Runtime error

App Files Files Community

frankleeeee committed on Apr 24

Commit

5613724

•

1 Parent(s): 68404e4

updated to v1.1

Browse files

Files changed (35) hide show

app.py +299 -54
configs/dit/inference/16x256x256.py +2 -2
configs/dit/inference/1x256x256-class.py +2 -2
configs/dit/inference/1x256x256.py +2 -2
configs/dit/train/16x256x256.py +9 -9
configs/dit/train/1x256x256.py +9 -8
configs/latte/inference/16x256x256-class.py +2 -2
configs/latte/inference/16x256x256.py +2 -2
configs/latte/train/16x256x256.py +8 -8
configs/opensora-v1-1/inference/sample-ref.py +62 -0
configs/opensora-v1-1/inference/sample.py +43 -0
configs/opensora-v1-1/train/benchmark.py +101 -0
configs/opensora-v1-1/train/image.py +65 -0
configs/opensora-v1-1/train/stage1.py +77 -0
configs/opensora-v1-1/train/stage2.py +79 -0
configs/opensora-v1-1/train/stage3.py +79 -0
configs/opensora-v1-1/train/video.py +67 -0
configs/opensora/inference/16x256x256.py +7 -4
configs/opensora/inference/16x512x512.py +3 -3
configs/opensora/inference/64x512x512.py +2 -2
configs/opensora/train/16x256x256-mask.py +60 -0
configs/opensora/train/16x256x256-spee.py +60 -0
configs/opensora/train/16x256x256.py +8 -8
configs/opensora/train/16x512x512.py +9 -9
configs/opensora/train/360x512x512.py +13 -7
configs/opensora/train/64x512x512-sp.py +9 -9
configs/opensora/train/64x512x512.py +8 -8
configs/pixart/inference/16x256x256.py +2 -2
configs/pixart/inference/1x1024MS.py +3 -3
configs/pixart/inference/1x256x256.py +2 -2
configs/pixart/inference/1x512x512.py +9 -3
configs/pixart/train/16x256x256.py +9 -9
configs/pixart/train/1x512x512.py +8 -8
configs/pixart/train/64x512x512.py +9 -8
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -11,25 +11,148 @@ import importlib
 import os
 import subprocess
 import sys
 import spaces
 import torch
 import gradio as gr
-MODEL_TYPES = ["v1-16x256x256", "v1-HQ-16x256x256", "v1-HQ-16x512x512"]
 CONFIG_MAP = {
-    "v1-16x256x256": "configs/opensora/inference/16x256x256.py",
-    "v1-HQ-16x256x256": "configs/opensora/inference/16x256x256.py",
-    "v1-HQ-16x512x512": "configs/opensora/inference/16x512x512.py",
 }
 HF_STDIT_MAP = {
-    "v1-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-16x256x256",
-    "v1-HQ-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x256x256",
-    "v1-HQ-16x512x512": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x512x512",
 }
 def install_dependencies(enable_optimization=False):
     """
     Install the required dependencies for the demo if they are not already installed.
@@ -72,6 +195,9 @@ def install_dependencies(enable_optimization=False):
             )
 def read_config(config_path):
     """
     Read the configuration file.
@@ -81,7 +207,7 @@ def read_config(config_path):
     return Config.fromfile(config_path)
-def build_models(model_type, config):
     """
     Build the models for the given model type and configuration.
     """
@@ -101,8 +227,7 @@ def build_models(model_type, config):
     stdit = AutoModel.from_pretrained(
         HF_STDIT_MAP[model_type],
-        enable_flash_attn=False,
-        enable_layernorm_kernel=False,
         trust_remote_code=True,
     ).cuda()
@@ -115,23 +240,20 @@ def build_models(model_type, config):
     text_encoder.y_embedder = stdit.y_embedder
     # move models to device
-    vae = vae.to(torch.float16).eval()
     text_encoder.t5.model = text_encoder.t5.model.eval()  # t5 must be in fp32
-    stdit = stdit.to(torch.float16).eval()
-    return vae, text_encoder, stdit, scheduler
-def get_latent_size(config, vae):
-    input_size = (config.num_frames, *config.image_size)
-    latent_size = vae.get_latent_size(input_size)
-    return latent_size
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--model-type",
-        default="v1-HQ-16x256x256",
         choices=MODEL_TYPES,
         help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
     )
@@ -168,27 +290,129 @@ torch.jit._state.disable()
 # set up
 install_dependencies(enable_optimization=args.enable_optimization)
 # build model
-vae, text_encoder, stdit, scheduler = build_models(args.model_type, config)
 @spaces.GPU(duration=200)
-def run_inference(prompt_text):
-    from opensora.datasets import save_sample
-    latent_size = get_latent_size(config, vae)
-    samples = scheduler.sample(
-        stdit,
-        text_encoder,
-        z_size=(vae.out_channels, *latent_size),
-        prompts=[prompt_text],
-        device="cuda",
-    )
-    samples = vae.decode(samples.to(torch.float16))
-    filename = f"{args.output}/sample"
-    saved_path = save_sample(samples[0], fps=config.fps, save_path=filename)
-    return saved_path
 def main():
@@ -218,27 +442,48 @@ def main():
         with gr.Row():
             with gr.Column():
-                prompt_text = gr.Textbox(show_label=False, placeholder="Describe your video here", lines=4)
-                submit_button = gr.Button("Generate video")
             with gr.Column():
-                output_video = gr.Video()
-        submit_button.click(fn=run_inference, inputs=[prompt_text], outputs=output_video)
-        gr.Examples(
-            examples=[
-                [
-                    "The video captures the majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.",
-                ],
-            ],
-            fn=run_inference,
-            inputs=[
-                prompt_text,
-            ],
-            outputs=[output_video],
-            cache_examples=True,
-        )
     # launch
     demo.launch(server_port=args.port, server_name=args.host, share=args.share)

 import os
 import subprocess
 import sys
+import re
+import json
+import math
 import spaces
 import torch
 import gradio as gr
+# Model types accepted by --model-type. Every entry must be a key of both
+# CONFIG_MAP and HF_STDIT_MAP, because the chosen value is used to index them.
+MODEL_TYPES = ["v1.1-stage2", "v1.1-stage3"]
+# Inference config file used for each model type.
 CONFIG_MAP = {
+    "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py",
+    "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py",
 }
+# Hugging Face repository holding the STDiT weights for each model type.
 HF_STDIT_MAP = {
+    "v1.1-stage2": "hpcai-tech/OpenSora-STDiT-v2-stage2",
+    "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3",
+}
+# (height, width) in pixels for each resolution label offered in the UI.
+# NOTE(review): "360p" is the only entry that is not ~16:9 (360x480 is 4:3);
+# confirm whether (360, 640) was intended.
+RESOLUTION_MAP = {
+    "144p": (144, 256),
+    "240p": (240, 426),
+    "360p": (360, 480),
+    "480p": (480, 858),
+    "720p": (720, 1280),
+    "1080p": (1080, 1920),
 }
+# ============================
+# Utils
+# ============================
+def collect_references_batch(reference_paths, vae, image_size):
+    """Load and VAE-encode the reference media for every sample of a batch.
+
+    Each entry of ``reference_paths`` is either ``None`` (the sample gets an
+    empty reference list) or a string of one or more paths joined by ";".
+    Every path is read with the "resize_crop" transform at ``image_size``,
+    encoded with ``vae.encode`` and stripped of its leading batch dimension.
+
+    Returns:
+        A list with one list of encoded references per batch entry.
+    """
+    from opensora.datasets.utils import read_from_path
+    def encode_reference(path):
+        pixels = read_from_path(path, image_size, transform_name="resize_crop")
+        latent = vae.encode(pixels.unsqueeze(0).to(vae.device, vae.dtype))
+        return latent.squeeze(0)
+    batch_refs = []
+    for entry in reference_paths:
+        if entry is None:
+            batch_refs.append([])
+        else:
+            batch_refs.append([encode_reference(path) for path in entry.split(";")])
+    # batch_refs: [batch, ref_num, C, T, H, W]
+    return batch_refs
+def process_mask_strategy(mask_strategy):
+    """Parse a mask strategy string into a list of six-field groups.
+
+    Groups are separated by ";" and fields within a group by ",". A group may
+    give between one and six fields; missing trailing fields are filled from
+    the defaults below. All fields stay strings.
+
+    Returns:
+        A list of six-element lists of strings, one per group.
+    """
+    # Default value of each of the six positions; a group that supplies k
+    # fields takes positions k..5 from here.
+    defaults = ["0", "0", "0", "0", "1", "0"]
+    parsed = []
+    for mask in mask_strategy.split(";"):
+        fields = mask.split(",")
+        assert len(fields) >= 1 and len(fields) <= 6, f"Invalid mask strategy: {mask}"
+        parsed.append(fields + defaults[len(fields) :])
+    return parsed
+def apply_mask_strategy(z, refs_x, mask_strategys, loop_i):
+    """Copy reference frames into ``z`` and build the per-frame masks.
+
+    For every sample, the strategy entries whose loop id equals ``loop_i`` are
+    applied: a slice of the selected reference is written into ``z`` in place
+    and the matching mask positions are set to the entry's edit ratio. Mask
+    positions not touched by any entry stay at 1. A ``None`` strategy leaves
+    the sample untouched.
+
+    Returns:
+        The per-sample masks stacked into one tensor; each mask has length
+        ``z.shape[2]``.
+    """
+    # z: [B, C, T, H, W]
+    total_frames = z.shape[2]
+    per_sample_masks = []
+    for sample_idx, strategy in enumerate(mask_strategys):
+        frame_mask = torch.ones(total_frames, dtype=torch.float, device=z.device)
+        entries = [] if strategy is None else process_mask_strategy(strategy)
+        for entry in entries:
+            loop_id, ref_id, ref_start, target_start, length, edit_ratio = entry
+            # Entries for other loops are skipped before their other fields are parsed.
+            if int(loop_id) != loop_i:
+                continue
+            ref_id = int(ref_id)
+            ref_start = int(ref_start)
+            length = int(length)
+            target_start = int(target_start)
+            edit_ratio = float(edit_ratio)
+            ref = refs_x[sample_idx][ref_id]  # [C, T, H, W]
+            # Negative starts count from the end of the respective time axis.
+            if ref_start < 0:
+                ref_start += ref.shape[1]
+            if target_start < 0:
+                target_start += total_frames
+            target_end = target_start + length
+            z[sample_idx, :, target_start:target_end] = ref[:, ref_start : ref_start + length]
+            frame_mask[target_start:target_end] = edit_ratio
+        per_sample_masks.append(frame_mask)
+    return torch.stack(per_sample_masks)
+def process_prompts(prompts, num_loop):
+    """Expand every prompt into one preprocessed text per generation loop.
+
+    A plain prompt is preprocessed once and repeated ``num_loop`` times. A
+    prompt starting with "|0|" is a schedule of the form
+    "|start|text|start|text...": each text is used from its start loop up to
+    the next start (or ``num_loop`` for the last one).
+
+    Returns:
+        A list with one list of ``num_loop`` texts per input prompt.
+    """
+    from opensora.models.text_encoder.t5 import text_preprocessing
+    expanded = []
+    for prompt in prompts:
+        if not prompt.startswith("|0|"):
+            expanded.append([text_preprocessing(prompt)] * num_loop)
+            continue
+        # segments alternates: start loop, text, start loop, text, ...
+        segments = prompt.split("|")[1:]
+        text_list = []
+        for pos in range(0, len(segments), 2):
+            start_loop = int(segments[pos])
+            text = text_preprocessing(segments[pos + 1])
+            if pos + 2 < len(segments):
+                end_loop = int(segments[pos + 2])
+            else:
+                end_loop = num_loop
+            text_list.extend([text] * (end_loop - start_loop))
+        assert len(text_list) == num_loop, f"Prompt loop mismatch: {len(text_list)} != {num_loop}"
+        expanded.append(text_list)
+    return expanded
+def extract_json_from_prompts(prompts):
+    """Separate each prompt's text from an optional trailing JSON payload.
+
+    The payload starts at the first "{" or "[" in the prompt and runs to the
+    end of the string. Prompts without a bracket get an empty dict.
+
+    Returns:
+        A pair ``(texts, infos)``: the text before the payload for every
+        prompt, and the decoded payload (or ``{}``) for every prompt.
+
+    Raises:
+        json.JSONDecodeError: If the text from the first bracket onwards is
+            not valid JSON.
+    """
+    additional_infos = []
+    ret_prompts = []
+    for prompt in prompts:
+        # Cut at the FIRST bracket only. Splitting at every bracket broke
+        # payloads containing nested objects or lists.
+        match = re.search(r"[{\[]", prompt)
+        if match is None:
+            ret_prompts.append(prompt)
+            additional_infos.append({})
+            continue
+        cut = match.start()
+        ret_prompts.append(prompt[:cut])
+        additional_infos.append(json.loads(prompt[cut:]))
+    return ret_prompts, additional_infos
+# ============================
+# Runtime Environment
+# ============================
 def install_dependencies(enable_optimization=False):
     """
     Install the required dependencies for the demo if they are not already installed.
             )
+# ============================
+# Model-related
+# ============================
 def read_config(config_path):
     """
     Read the configuration file.
     return Config.fromfile(config_path)
+def build_models(model_type, config, enable_optimization=False):
     """
     Build the models for the given model type and configuration.
     """
     stdit = AutoModel.from_pretrained(
         HF_STDIT_MAP[model_type],
+        enable_flash_attn=enable_optimization,
         trust_remote_code=True,
     ).cuda()
     text_encoder.y_embedder = stdit.y_embedder
     # move models to device
+    vae = vae.to(torch.bfloat16).eval()
     text_encoder.t5.model = text_encoder.t5.model.eval()  # t5 must be in fp32
+    stdit = stdit.to(torch.bfloat16).eval()
+    # clear cuda
+    torch.cuda.empty_cache()
+    return vae, text_encoder, stdit, scheduler
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--model-type",
+        default="v1.1-stage3",
         choices=MODEL_TYPES,
         help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
     )
 # set up
 install_dependencies(enable_optimization=args.enable_optimization)
+# import after installation
+from opensora.datasets import IMG_FPS, save_sample
+from opensora.utils.misc import to_torch_dtype
+# some global variables
+dtype = to_torch_dtype(config.dtype)
+device = torch.device("cuda")
 # build model
+vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
 @spaces.GPU(duration=200)
+def run_inference(mode, prompt_text, resolution, length, reference_image):
+    """Generate a video from the demo inputs and return the saved file path.
+
+    Args:
+        mode: "Text2Video" or "Image2Video".
+        prompt_text: Text prompt; a trailing JSON payload is stripped and ignored.
+        resolution: Key of RESOLUTION_MAP, e.g. "144p".
+        length: Duration such as "2s"; the trailing "s" is removed before parsing.
+        reference_image: Array accepted by PIL.Image.fromarray; only used for
+            Image2Video.
+
+    Returns:
+        The value returned by save_sample for the concatenated clips, or None
+        when no clip was generated.
+
+    Raises:
+        ValueError: If mode is unknown, or Image2Video is requested without a
+            reference image.
+    """
+    with torch.inference_mode():
+        # ======================
+        # 1. Preparation
+        # ======================
+        # parse the inputs
+        resolution = RESOLUTION_MAP[resolution]
+        # compute number of loops
+        num_seconds = int(length.rstrip('s'))
+        total_number_of_frames = num_seconds * config.fps / config.frame_interval
+        num_loop = math.ceil(total_number_of_frames / config.num_frames)
+        # prepare model args
+        model_args = dict()
+        height = torch.tensor([resolution[0]], device=device, dtype=dtype)
+        width = torch.tensor([resolution[1]], device=device, dtype=dtype)
+        num_frames = torch.tensor([config.num_frames], device=device, dtype=dtype)
+        ar = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
+        if config.num_frames == 1:
+            config.fps = IMG_FPS
+        fps = torch.tensor([config.fps], device=device, dtype=dtype)
+        model_args["height"] = height
+        model_args["width"] = width
+        model_args["num_frames"] = num_frames
+        model_args["ar"] = ar
+        model_args["fps"] = fps
+        # compute latent size
+        input_size = (config.num_frames, *resolution)
+        latent_size = vae.get_latent_size(input_size)
+        # process prompt
+        prompt_raw = [prompt_text]
+        prompt_raw, _ = extract_json_from_prompts(prompt_raw)
+        prompt_loops = process_prompts(prompt_raw, num_loop)
+        video_clips = []
+        # ==============================================
+        # 2. Mask strategy and reference images per mode
+        # ==============================================
+        if mode == "Text2Video":
+            mask_strategy = [None]
+            refs_x = collect_references_batch([None], vae, resolution)
+        elif mode == "Image2Video":
+            if reference_image is None:
+                raise ValueError("Image2Video requires a reference image")
+            mask_strategy = ['0']
+            import tempfile
+            from PIL import Image
+            # collect_references_batch reads from a path, so the image goes to
+            # disk first. A unique temporary file keeps concurrent requests
+            # from overwriting each other's reference image.
+            with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
+                reference_file = tmp.name
+            try:
+                Image.fromarray(reference_image).save(reference_file)
+                refs_x = collect_references_batch([reference_file], vae, resolution)
+            finally:
+                os.remove(reference_file)
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+        # ==========================
+        # 3. Long video generation
+        # ==========================
+        for loop_i in range(num_loop):
+            # 3.1. sample in hidden space
+            batch_prompts = [prompt[loop_i] for prompt in prompt_loops]
+            z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
+            # 3.2. condition every loop after the first on the previous clip
+            if loop_i > 0:
+                ref_x = vae.encode(video_clips[-1])
+                for j, refs in enumerate(refs_x):
+                    if refs is None:
+                        # Bind the new list to refs too, so len(refs) below is valid.
+                        refs = refs_x[j] = []
+                    refs.append(ref_x[j])
+                    if mask_strategy[j] is None:
+                        mask_strategy[j] = ""
+                    else:
+                        mask_strategy[j] += ";"
+                    mask_strategy[
+                        j
+                    ] += f"{loop_i},{len(refs)-1},-{config.condition_frame_length},0,{config.condition_frame_length}"
+            # 3.3. apply mask strategy
+            masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
+            # 3.4. diffusion sampling
+            samples = scheduler.sample(
+                stdit,
+                text_encoder,
+                z=z,
+                prompts=batch_prompts,
+                device=device,
+                additional_args=model_args,
+                mask=masks,  # scheduler must support mask
+            )
+            samples = vae.decode(samples.to(dtype))
+            video_clips.append(samples)
+        if not video_clips:
+            return None
+        # ==================
+        # 4. Save the video
+        # ==================
+        # The first clip is kept whole; every later clip drops its first
+        # condition_frame_length frames before concatenation along dim 1.
+        video_clips_list = [video_clips[0][0]] + [
+            clip[0][:, config.condition_frame_length :] for clip in video_clips[1:]
+        ]
+        video = torch.cat(video_clips_list, dim=1)
+        save_path = f"{args.output}/sample"
+        saved_path = save_sample(video, fps=config.fps // config.frame_interval, save_path=save_path, force_video=True)
+        return saved_path
 def main():
         with gr.Row():
             with gr.Column():
+                mode = gr.Radio(
+                    choices=["Text2Video", "Image2Video"],
+                    value="Text2Video",
+                    label="Usage",
+                    info="Choose your usage scenario",
+                )
+                prompt_text = gr.Textbox(
+                    label="Prompt",
+                    placeholder="Describe your video here",
+                    lines=4,
+                )
+                resolution = gr.Radio(
+                     choices=["144p", "240p", "360p", "480p", "720p", "1080p"],
+                     value="144p",
+                    label="Resolution",
+                )
+                length = gr.Radio(
+                    choices=["2s", "4s", "8s"],
+                    value="2s",
+                    label="Video Length",
+                    info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
+                )
+                reference_image = gr.Image(
+                    label="Reference Image (only used for Image2Video)",
+                )
             with gr.Column():
+                output_video = gr.Video(
+                    label="Output Video",
+                    height="100%"
+                )
+        with gr.Row():
+             submit_button = gr.Button("Generate video")
+        submit_button.click(
+             fn=run_inference,
+             inputs=[mode, prompt_text, resolution, length, reference_image],
+             outputs=output_video
+             )
     # launch
     demo.launch(server_port=args.port, server_name=args.host, share=args.share)

configs/dit/inference/16x256x256.py CHANGED Viewed

@@ -22,10 +22,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_labels.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=20,
     cfg_scale=4.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_labels.txt"
+save_dir = "./samples/samples/"

configs/dit/inference/1x256x256-class.py CHANGED Viewed

@@ -22,10 +22,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/imagenet_id.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=20,
     cfg_scale=4.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/imagenet_id.txt"
+save_dir = "./samples/samples/"

configs/dit/inference/1x256x256.py CHANGED Viewed

@@ -23,10 +23,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/imagenet_labels.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=20,
     cfg_scale=4.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/imagenet_labels.txt"
+save_dir = "./samples/samples/"

configs/dit/train/16x256x256.py CHANGED Viewed

@@ -1,16 +1,16 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
-grad_checkpoint = False
 plugin = "zero2"
 sp_size = 1

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
+grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1

configs/dit/train/1x256x256.py CHANGED Viewed

@@ -1,14 +1,15 @@
-num_frames = 1
-frame_interval = 1
-image_size = (256, 256)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = True
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
 grad_checkpoint = False
 plugin = "zero2"

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=1,
+    frame_interval=1,
+    image_size=(256, 256),
+    transform_name="center",
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = False
 plugin = "zero2"

configs/latte/inference/16x256x256-class.py CHANGED Viewed

@@ -21,10 +21,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_id.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=20,
     cfg_scale=4.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_id.txt"
+save_dir = "./samples/samples/"

configs/latte/inference/16x256x256.py CHANGED Viewed

@@ -22,10 +22,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_labels.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=20,
     cfg_scale=4.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_labels.txt"
+save_dir = "./samples/samples/"

configs/latte/train/16x256x256.py CHANGED Viewed

@@ -1,14 +1,14 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"

configs/opensora-v1-1/inference/sample-ref.py ADDED Viewed

	@@ -0,0 +1,62 @@

+num_frames = 16
+frame_interval = 3
+fps = 24
+image_size = (240, 426)
+multi_resolution = "STDiT2"
+# Condition
+prompt_path = None
+prompt = [
+    "A car driving on the ocean.",
+    'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
+    "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
+]
+loop = 2
+condition_frame_length = 4
+reference_path = [
+    "https://cdn.openai.com/tmp/s/interp/d0.mp4",
+    None,
+    "assets/images/condition/wave.png",
+]
+# valid when reference_path is not None
+# (loop id, ref id, ref start, target start, length, edit ratio)
+mask_strategy = [
+    "0,0,0,0,8,0.3",
+    None,
+    "0",
+]
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    cache_dir=None,  # "/mnt/hdd/cached_models",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    cache_dir=None,  # "/mnt/hdd/cached_models",
+    model_max_length=200,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+    cfg_channel=3,  # or None
+)
+dtype = "bf16"
+# Others
+batch_size = 1
+seed = 42
+save_dir = "./samples/samples/"

configs/opensora-v1-1/inference/sample.py ADDED Viewed

	@@ -0,0 +1,43 @@

+num_frames = 16
+frame_interval = 3
+fps = 24
+image_size = (240, 426)
+multi_resolution = "STDiT2"
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    cache_dir=None,  # "/mnt/hdd/cached_models",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    cache_dir=None,  # "/mnt/hdd/cached_models",
+    model_max_length=200,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+    cfg_channel=3,  # or None
+)
+dtype = "bf16"
+# Condition
+prompt_path = "./assets/texts/t2v_samples.txt"
+prompt = None  # prompt has higher priority than prompt_path
+# Others
+batch_size = 1
+seed = 42
+save_dir = "./samples/samples/"

configs/opensora-v1-1/train/benchmark.py ADDED Viewed

	@@ -0,0 +1,101 @@

+# this file is only for batch size search and is not used for training
+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+# bucket config format:
+# 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching
+# 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI
+# 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size)
+# 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search)
+# 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
+bucket_config = {
+    # == manual search ==
+    # "240p": {128: (1.0, 2)}, # 4.28s/it
+    # "240p": {64: (1.0, 4)},
+    # "240p": {32: (1.0, 8)},  # 4.6s/it
+    # "240p": {16: (1.0, 16)},  # 4.6s/it
+    # "480p": {16: (1.0, 4)},  # 4.6s/it
+    # "720p": {16: (1.0, 2)},  # 5.89s/it
+    # "256": {1: (1.0, 256)},  # 4.5s/it
+    # "512": {1: (1.0, 96)}, # 4.7s/it
+    # "512": {1: (1.0, 128)}, # 6.3s/it
+    # "480p": {1: (1.0, 50)},  # 4.0s/it
+    # "1024": {1: (1.0, 32)},  # 6.8s/it
+    # "1024": {1: (1.0, 20)}, # 4.3s/it
+    # "1080p": {1: (1.0, 16)}, # 8.6s/it
+    # "1080p": {1: (1.0, 8)},  # 4.4s/it
+    # == stage 2 ==
+    # "240p": {
+    #     16: (1.0, (2, 32)),
+    #     32: (1.0, (2, 16)),
+    #     64: (1.0, (2, 8)),
+    #     128: (1.0, (2, 6)),
+    # },
+    # "256": {1: (1.0, (128, 300))},
+    # "512": {1: (0.5, (64, 128))},
+    # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
+    # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)},  # No examples now
+    # "1024": {1: (0.3, (8, 64))},
+    # "1080p": {1: (0.3, (2, 32))},
+    # == stage 3 ==
+    "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))},
+}
+# Define acceleration
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+batch_size = None
+lr = 2e-5
+grad_clip = 1.0

configs/opensora-v1-1/train/image.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+bucket_config = {  # 6s/it
+    "256": {1: (1.0, 256)},
+    "512": {1: (1.0, 80)},
+    "480p": {1: (1.0, 52)},
+    "1024": {1: (1.0, 20)},
+    "1080p": {1: (1.0, 8)},
+}
+# Define acceleration
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+batch_size = 10  # only for logging
+lr = 2e-5
+grad_clip = 1.0

configs/opensora-v1-1/train/stage1.py ADDED Viewed

	@@ -0,0 +1,77 @@

+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+# IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%)
+bucket_config = {  # 1s/it
+    "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)},
+    "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)},
+    "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)},
+    "512": {1: (0.4, 12)},
+    "1024": {1: (0.3, 3)},
+}
+mask_ratios = {
+    "mask_no": 0.75,
+    "mask_quarter_random": 0.025,
+    "mask_quarter_head": 0.025,
+    "mask_quarter_tail": 0.025,
+    "mask_quarter_head_tail": 0.05,
+    "mask_image_random": 0.025,
+    "mask_image_head": 0.025,
+    "mask_image_tail": 0.025,
+    "mask_image_head_tail": 0.05,
+}
+# Define acceleration
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = False
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+batch_size = None
+lr = 2e-5
+grad_clip = 1.0

configs/opensora-v1-1/train/stage2.py ADDED Viewed

	@@ -0,0 +1,79 @@

+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+bucket_config = {  # 7s/it
+    "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)},
+    "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
+    "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
+    "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)},
+    "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
+    "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
+    "1024": {1: (0.3, 20)},
+    "1080p": {1: (0.4, 8)},
+}
+mask_ratios = {
+    "mask_no": 0.75,
+    "mask_quarter_random": 0.025,
+    "mask_quarter_head": 0.025,
+    "mask_quarter_tail": 0.025,
+    "mask_quarter_head_tail": 0.05,
+    "mask_image_random": 0.025,
+    "mask_image_head": 0.025,
+    "mask_image_tail": 0.025,
+    "mask_image_head_tail": 0.05,
+}
+# Define acceleration
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+batch_size = None
+lr = 2e-5
+grad_clip = 1.0

configs/opensora-v1-1/train/stage3.py ADDED Viewed

	@@ -0,0 +1,79 @@

+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+bucket_config = {  # 13s/it
+    "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)},
+    "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)},
+    "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)},
+    "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)},
+    "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)},
+    "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)},
+    "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)},
+    "1024": {1: (0.3, 40)},
+}
+mask_ratios = {
+    "mask_no": 0.75,
+    "mask_quarter_random": 0.025,
+    "mask_quarter_head": 0.025,
+    "mask_quarter_tail": 0.025,
+    "mask_quarter_head_tail": 0.05,
+    "mask_image_random": 0.025,
+    "mask_image_head": 0.025,
+    "mask_image_tail": 0.025,
+    "mask_image_head_tail": 0.05,
+}
+# Define acceleration
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+batch_size = None
+lr = 2e-5
+grad_clip = 1.0

configs/opensora-v1-1/train/video.py ADDED Viewed

	@@ -0,0 +1,67 @@

+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+bucket_config = {  # 6s/it
+    "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
+    "256": {1: (1.0, 256)},
+    "512": {1: (0.5, 80)},
+    "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)},
+    "720p": {16: (0.1, 2), 32: (0.0, None)},  # no examples for this bucket at present
+    "1024": {1: (0.3, 20)},
+    "1080p": {1: (0.3, 8)},
+}
+# Define acceleration
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,  # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+batch_size = 10  # only for logging
+lr = 2e-5
+grad_clip = 1.0

configs/opensora/inference/16x256x256.py CHANGED Viewed

@@ -25,12 +25,15 @@ scheduler = dict(
     type="iddpm",
     num_sampling_steps=100,
     cfg_scale=7.0,
-    cfg_channel=3, # or None
 )
-dtype = "fp16"
 # Others
 batch_size = 1
 seed = 42
-prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"

     type="iddpm",
     num_sampling_steps=100,
     cfg_scale=7.0,
+    cfg_channel=3,  # or None
 )
+dtype = "bf16"
+# Condition
+prompt_path = "./assets/texts/t2v_samples.txt"
+prompt = None  # prompt has higher priority than prompt_path
 # Others
 batch_size = 1
 seed = 42
+save_dir = "./samples/samples/"

configs/opensora/inference/16x512x512.py CHANGED Viewed

@@ -9,7 +9,7 @@ model = dict(
     time_scale=1.0,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
-    from_pretrained="PRETRAINED_MODEL"
 )
 vae = dict(
     type="VideoAutoencoderKL",
@@ -26,10 +26,10 @@ scheduler = dict(
     num_sampling_steps=100,
     cfg_scale=7.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"

     time_scale=1.0,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
     type="VideoAutoencoderKL",
     num_sampling_steps=100,
     cfg_scale=7.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./samples/samples/"

configs/opensora/inference/64x512x512.py CHANGED Viewed

@@ -26,10 +26,10 @@ scheduler = dict(
     num_sampling_steps=100,
     cfg_scale=7.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 1
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=100,
     cfg_scale=7.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 1
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./samples/samples/"

configs/opensora/train/16x256x256-mask.py ADDED Viewed

	@@ -0,0 +1,60 @@

+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+mask_ratios = {
+    "mask_no": 0.7,
+    "mask_random": 0.15,
+    "mask_head": 0.05,
+    "mask_tail": 0.05,
+    "mask_head_tail": 0.05,
+}
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0

configs/opensora/train/16x256x256-spee.py ADDED Viewed

	@@ -0,0 +1,60 @@

+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+mask_ratios = {
+    "mask_no": 0.5,
+    "mask_random": 0.29,
+    "mask_head": 0.07,
+    "mask_tail": 0.07,
+    "mask_head_tail": 0.07,
+}
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm-speed",
+    timestep_respacing="",
+)
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0

configs/opensora/train/16x256x256.py CHANGED Viewed

@@ -1,14 +1,14 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"

configs/opensora/train/16x512x512.py CHANGED Viewed

@@ -1,16 +1,16 @@
-num_frames = 16
-frame_interval = 3
-image_size = (512, 512)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
-grad_checkpoint = False
 plugin = "zero2"
 sp_size = 1

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(512, 512),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
+grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1

configs/opensora/train/360x512x512.py CHANGED Viewed

@@ -1,12 +1,18 @@
-num_frames = 360
-frame_interval = 1
-image_size = (512, 512)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
 num_workers = 4
 # Define acceleration
 dtype = "bf16"

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=360,
+    frame_interval=3,
+    image_size=(512, 512),
+)
+# Define acceleration
 num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
 # Define acceleration
 dtype = "bf16"

configs/opensora/train/64x512x512-sp.py CHANGED Viewed

@@ -1,17 +1,17 @@
-num_frames = 64
-frame_interval = 2
-image_size = (512, 512)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
 grad_checkpoint = True
-plugin = "zero2-seq"
 sp_size = 2
 # Define model

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(512, 512),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
+plugin = "zero2"
 sp_size = 2
 # Define model

configs/opensora/train/64x512x512.py CHANGED Viewed

@@ -1,14 +1,14 @@
-num_frames = 64
-frame_interval = 2
-image_size = (512, 512)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=64,
+    frame_interval=3,
+    image_size=(512, 512),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"

configs/pixart/inference/16x256x256.py CHANGED Viewed

@@ -23,10 +23,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=20,
     cfg_scale=7.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./samples/samples/"

configs/pixart/inference/1x1024MS.py CHANGED Viewed

@@ -1,7 +1,7 @@
 num_frames = 1
 fps = 1
 image_size = (1920, 512)
-multi_resolution = True
 # Define model
 model = dict(
@@ -25,10 +25,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2i_samples.txt"
-save_dir = "./outputs/samples/"

 num_frames = 1
 fps = 1
 image_size = (1920, 512)
+multi_resolution = "PixArtMS"
 # Define model
 model = dict(
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2i_samples.txt"
+save_dir = "./samples/samples/"

configs/pixart/inference/1x256x256.py CHANGED Viewed

@@ -24,10 +24,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2i_samples.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=20,
     cfg_scale=7.0,
 )
+dtype = "bf16"
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2i_samples.txt"
+save_dir = "./samples/samples/"

configs/pixart/inference/1x512x512.py CHANGED Viewed

@@ -24,10 +24,16 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
-dtype = "fp16"
 # Others
 batch_size = 2
 seed = 42
-prompt_path = "./assets/texts/t2i_samples.txt"
-save_dir = "./outputs/samples/"

     num_sampling_steps=20,
     cfg_scale=7.0,
 )
+dtype = "bf16"
+# prompt_path = "./assets/texts/t2i_samples.txt"
+prompt = [
+    "Pirate ship trapped in a cosmic maelstrom nebula.",
+    "A small cactus with a happy face in the Sahara desert.",
+    "A small cactus with a sad face in the Sahara desert.",
+]
 # Others
 batch_size = 2
 seed = 42
+save_dir = "./samples/samples/"

configs/pixart/train/16x256x256.py CHANGED Viewed

@@ -1,16 +1,16 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
-grad_checkpoint = False
 plugin = "zero2"
 sp_size = 1

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
+grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1

configs/pixart/train/1x512x512.py CHANGED Viewed

@@ -1,14 +1,14 @@
-num_frames = 1
-frame_interval = 1
-image_size = (512, 512)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = True
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=1,
+    frame_interval=3,
+    image_size=(512, 512),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"

configs/pixart/train/64x512x512.py CHANGED Viewed

@@ -1,19 +1,20 @@
-num_frames = 64
-frame_interval = 2
-image_size = (512, 512)
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
 # Define acceleration
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1
 # Define model
 model = dict(
     type="PixArt-XL/2",

 # Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=64,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1
 # Define model
 model = dict(
     type="PixArt-XL/2",

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
 xformers
-git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora
 transformers

 xformers
 transformers
+git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora