frankleeeee committed
Commit 9319b27
1 Parent(s): fc91abf
.gitignore ADDED
@@ -0,0 +1 @@
+ outputs/
README copy.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Open Sora
+ emoji: 📚
+ colorFrom: yellow
+ colorTo: indigo
+ sdk: gradio
+ sdk_version: 4.21.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,209 @@
+ #!/usr/bin/env python
+ """
+ This script runs a Gradio app for the Open-Sora model.
+
+ Usage:
+     python app.py --model-type <model-type>
+ """
+
+ import argparse
+ import importlib
+ import os
+ import subprocess
+ import sys
+
+ import gradio as gr
+ import spaces
+ import torch
+
+ MODEL_TYPES = ["v1-16x256x256", "v1-HQ-16x256x256", "v1-HQ-16x512x512"]
+ CONFIG_MAP = {
+     "v1-16x256x256": "configs/opensora/inference/16x256x256.py",
+     "v1-HQ-16x256x256": "configs/opensora/inference/16x256x256.py",
+     "v1-HQ-16x512x512": "configs/opensora/inference/16x512x512.py",
+ }
+ HF_STDIT_MAP = {
+     "v1-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-16x256x256",
+     "v1-HQ-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x256x256",
+     "v1-HQ-16x512x512": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x512x512",
+ }
+
+
+ def install_dependencies():
+     """
+     Install the required dependencies for the demo if they are not already installed.
+     """
+
+     def _is_package_available(name) -> bool:
+         try:
+             importlib.import_module(name)
+             return True
+         except (ImportError, ModuleNotFoundError):
+             return False
+
+     # install flash attention
+     if not _is_package_available("flash_attn"):
+         subprocess.run(
+             f"{sys.executable} -m pip install flash-attn --no-build-isolation",
+             env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},  # extend, not replace, the environment
+             shell=True,
+         )
+
+
+ def read_config(config_path):
+     """
+     Read the configuration file.
+     """
+     from mmengine.config import Config
+
+     return Config.fromfile(config_path)
+
+
+ def build_models(model_type, config):
+     """
+     Build the models for the given model type and configuration.
+     """
+     # build vae
+     from opensora.registry import MODELS, build_module
+
+     vae = build_module(config.vae, MODELS).cuda()
+
+     # build text encoder
+     text_encoder = build_module(config.text_encoder, MODELS)  # T5 must be fp32
+     text_encoder.t5.model = text_encoder.t5.model.cuda()
+
+     # build stdit
+     # we load the model from HuggingFace directly so that we don't need to
+     # handle model download logic in the HuggingFace Space
+     from transformers import AutoModel
+
+     stdit = AutoModel.from_pretrained(
+         HF_STDIT_MAP[model_type],
+         enable_flash_attn=False,
+         enable_layernorm_kernel=False,
+         trust_remote_code=True,
+     ).cuda()
+
+     # build scheduler
+     from opensora.registry import SCHEDULERS
+
+     scheduler = build_module(config.scheduler, SCHEDULERS)
+
+     # hack for classifier-free guidance
+     text_encoder.y_embedder = stdit.y_embedder
+
+     # cast models to the inference dtype and switch to eval mode
+     vae = vae.to(torch.float16).eval()
+     text_encoder.t5.model = text_encoder.t5.model.eval()  # T5 must stay in fp32
+     stdit = stdit.to(torch.float16).eval()
+     return vae, text_encoder, stdit, scheduler
+
+
+ def get_latent_size(config, vae):
+     input_size = (config.num_frames, *config.image_size)
+     latent_size = vae.get_latent_size(input_size)
+     return latent_size
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--model-type",
+         default="v1-HQ-16x512x512",
+         choices=MODEL_TYPES,
+         help=f"The type of model to run for the Gradio app; must be one of {MODEL_TYPES}",
+     )
+     parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")
+     parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio app on")
+     parser.add_argument("--host", default=None, type=str, help="The host to run the Gradio app on")
+     parser.add_argument("--share", action="store_true", help="Whether to create a public link for this Gradio demo")
+     return parser.parse_args()
+
+
+ # ============================
+ # Main Gradio Script
+ # ============================
+ # read config
+ args = parse_args()
+ config = read_config(CONFIG_MAP[args.model_type])
+
+ # make the outputs dir
+ os.makedirs(args.output, exist_ok=True)
+
+ # disable torch JIT as it can cause failures in the Gradio SDK,
+ # which ships torch built against CUDA 11.3
+ torch.jit._state.disable()
+
+ # set up
+ install_dependencies()
+
+ # build models
+ vae, text_encoder, stdit, scheduler = build_models(args.model_type, config)
+
+
+ @spaces.GPU(duration=200)
+ def run_inference(prompt_text):
+     latent_size = get_latent_size(config, vae)
+
+     from opensora.datasets import save_sample
+
+     samples = scheduler.sample(
+         stdit,
+         text_encoder,
+         z_size=(vae.out_channels, *latent_size),
+         prompts=[prompt_text],
+         device="cuda",
+     )
+
+     samples = vae.decode(samples.to(torch.float16))
+     filename = f"{args.output}/sample"
+     saved_path = save_sample(samples[0], fps=config.fps, save_path=filename)
+     return saved_path
+
+
+ # create demo
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             gr.HTML(
+                 """
+                 <div style='text-align: center;'>
+                     <p align="center">
+                         <img src="https://github.com/hpcaitech/Open-Sora/raw/main/assets/readme/icon.png" width="250"/>
+                     </p>
+                     <div style="display: flex; gap: 10px; justify-content: center;">
+                         <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
+                         <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
+                         <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
+                         <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
+                         <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
+                         <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
+                         <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
+                     </div>
+                     <h1 style='margin-top: 5px;'>Open-Sora: Democratizing Efficient Video Production for All</h1>
+                 </div>
+                 """
+             )
+
+     with gr.Row():
+         with gr.Column():
+             prompt_text = gr.Textbox(show_label=False, placeholder="Describe your video here", lines=4)
+             submit_button = gr.Button("Generate video")
+
+         with gr.Column():
+             output_video = gr.Video()
+
+     submit_button.click(fn=run_inference, inputs=[prompt_text], outputs=output_video)
+
+     gr.Examples(
+         examples=[
+             [
+                 "The video captures the majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.",
+             ],
+         ],
+         fn=run_inference,
+         inputs=[
+             prompt_text,
+         ],
+         outputs=[output_video],
+         cache_examples=True,
+     )
+
+ # launch
+ # demo.launch(server_port=args.port, server_name=args.host, share=args.share)
+ demo.launch()
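For orientation before the config files below: they are plain Python modules that `read_config` in app.py loads through mmengine. A minimal sketch of how their values surface at runtime (the printed values match `configs/opensora/inference/16x256x256.py`; attribute access on nested `dict`s is standard mmengine `Config` behavior):

```python
from mmengine.config import Config

# Load the config that CONFIG_MAP pairs with the "v1-16x256x256" model type.
config = Config.fromfile("configs/opensora/inference/16x256x256.py")

print(config.num_frames)      # 16
print(config.image_size)      # (256, 256)
print(config.scheduler.type)  # "iddpm"; nested dicts support attribute access
```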
configs/dit/inference/16x256x256.py ADDED
@@ -0,0 +1,31 @@
+ num_frames = 16
+ fps = 8
+ image_size = (256, 256)
+
+ # Define model
+ model = dict(
+     type="DiT-XL/2",
+     condition="text",
+     from_pretrained="PRETRAINED_MODEL",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="clip",
+     from_pretrained="openai/clip-vit-base-patch32",
+     model_max_length=77,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=4.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/ucf101_labels.txt"
+ save_dir = "./outputs/samples/"
configs/dit/inference/1x256x256-class.py ADDED
@@ -0,0 +1,31 @@
+ num_frames = 1
+ fps = 1
+ image_size = (256, 256)
+
+ # Define model
+ model = dict(
+     type="DiT-XL/2",
+     no_temporal_pos_emb=True,
+     condition="label_1000",
+     from_pretrained="DiT-XL-2-256x256.pt",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="classes",
+     num_classes=1000,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=4.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/imagenet_id.txt"
+ save_dir = "./outputs/samples/"
configs/dit/inference/1x256x256.py ADDED
@@ -0,0 +1,32 @@
+ num_frames = 1
+ fps = 1
+ image_size = (256, 256)
+
+ # Define model
+ model = dict(
+     type="DiT-XL/2",
+     no_temporal_pos_emb=True,
+     condition="text",
+     from_pretrained="PRETRAINED_MODEL",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="clip",
+     from_pretrained="openai/clip-vit-base-patch32",
+     model_max_length=77,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=4.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/imagenet_labels.txt"
+ save_dir = "./outputs/samples/"
configs/dit/train/16x256x256.py ADDED
@@ -0,0 +1,50 @@
+ num_frames = 16
+ frame_interval = 3
+ image_size = (256, 256)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = False
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="DiT-XL/2",
+     from_pretrained="DiT-XL-2-256x256.pt",
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="clip",
+     from_pretrained="openai/clip-vit-base-patch32",
+     model_max_length=77,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 1000
+ load = None
+
+ batch_size = 8
+ lr = 2e-5
+ grad_clip = 1.0
configs/dit/train/1x256x256.py ADDED
@@ -0,0 +1,50 @@
+ num_frames = 1
+ frame_interval = 1
+ image_size = (256, 256)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = True
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = False
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="DiT-XL/2",
+     no_temporal_pos_emb=True,
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="clip",
+     from_pretrained="openai/clip-vit-base-patch32",
+     model_max_length=77,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 1000
+ load = None
+
+ batch_size = 128
+ lr = 1e-4  # according to the DiT repo
+ grad_clip = 1.0
configs/latte/inference/16x256x256-class.py ADDED
@@ -0,0 +1,30 @@
+ num_frames = 16
+ fps = 8
+ image_size = (256, 256)
+
+ # Define model
+ model = dict(
+     type="Latte-XL/2",
+     condition="label_101",
+     from_pretrained="Latte-XL-2-256x256-ucf101.pt",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="classes",
+     num_classes=101,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=4.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/ucf101_id.txt"
+ save_dir = "./outputs/samples/"
configs/latte/inference/16x256x256.py ADDED
@@ -0,0 +1,31 @@
+ num_frames = 16
+ fps = 8
+ image_size = (256, 256)
+
+ # Define model
+ model = dict(
+     type="Latte-XL/2",
+     condition="text",
+     from_pretrained="PRETRAINED_MODEL",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="clip",
+     from_pretrained="openai/clip-vit-base-patch32",
+     model_max_length=77,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=4.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/ucf101_labels.txt"
+ save_dir = "./outputs/samples/"
configs/latte/train/16x256x256.py ADDED
@@ -0,0 +1,49 @@
+ num_frames = 16
+ frame_interval = 3
+ image_size = (256, 256)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = True
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="Latte-XL/2",
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="clip",
+     from_pretrained="openai/clip-vit-base-patch32",
+     model_max_length=77,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 1000
+ load = None
+
+ batch_size = 8
+ lr = 2e-5
+ grad_clip = 1.0
configs/opensora/inference/16x256x256.py ADDED
@@ -0,0 +1,34 @@
+ num_frames = 16
+ fps = 24 // 3
+ image_size = (256, 256)
+
+ # Define model
+ model = dict(
+     type="STDiT-XL/2",
+     space_scale=0.5,
+     time_scale=1.0,
+     enable_flashattn=False,
+     enable_layernorm_kernel=False,
+     from_pretrained="PRETRAINED_MODEL",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="DeepFloyd/t5-v1_1-xxl",
+     model_max_length=120,
+ )
+ scheduler = dict(
+     type="iddpm",
+     num_sampling_steps=100,
+     cfg_scale=7.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/t2v_samples.txt"
+ save_dir = "./outputs/samples/"
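A note on the `fps = 24 // 3` line above: it mirrors `frame_interval = 3` in the training configs further down, so a clip sampled at every third frame of a (presumed) 24 fps source plays back at its original speed. The arithmetic, spelled out:

```python
source_fps = 24     # assumed source frame rate implied by these configs
frame_interval = 3  # sampling stride used at training time
playback_fps = source_fps // frame_interval
assert playback_fps == 8
```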
configs/opensora/inference/16x512x512.py ADDED
@@ -0,0 +1,35 @@
+ num_frames = 16
+ fps = 24 // 3
+ image_size = (512, 512)
+
+ # Define model
+ model = dict(
+     type="STDiT-XL/2",
+     space_scale=1.0,
+     time_scale=1.0,
+     enable_flashattn=False,
+     enable_layernorm_kernel=False,
+     from_pretrained="PRETRAINED_MODEL",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+     micro_batch_size=128,
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="DeepFloyd/t5-v1_1-xxl",
+     model_max_length=120,
+ )
+ scheduler = dict(
+     type="iddpm",
+     num_sampling_steps=100,
+     cfg_scale=7.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/t2v_samples.txt"
+ save_dir = "./outputs/samples/"
configs/opensora/inference/64x512x512.py ADDED
@@ -0,0 +1,35 @@
+ num_frames = 64
+ fps = 24 // 4
+ image_size = (512, 512)
+
+ # Define model
+ model = dict(
+     type="STDiT-XL/2",
+     space_scale=1.0,
+     time_scale=2 / 3,
+     enable_flashattn=False,
+     enable_layernorm_kernel=False,
+     from_pretrained="PRETRAINED_MODEL",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+     micro_batch_size=128,
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="DeepFloyd/t5-v1_1-xxl",
+     model_max_length=120,
+ )
+ scheduler = dict(
+     type="iddpm",
+     num_sampling_steps=50,
+     cfg_scale=7.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 1
+ seed = 42
+ prompt_path = "./assets/texts/t2v_samples.txt"
+ save_dir = "./outputs/samples/"
configs/opensora/train/16x256x256.py ADDED
@@ -0,0 +1,53 @@
+ num_frames = 16
+ frame_interval = 3
+ image_size = (256, 256)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = True
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="STDiT-XL/2",
+     space_scale=0.5,
+     time_scale=1.0,
+     from_pretrained="PixArt-XL-2-512x512.pth",
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+     shardformer=True,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 1000
+ load = None
+
+ batch_size = 8
+ lr = 2e-5
+ grad_clip = 1.0
configs/opensora/train/16x512x512.py ADDED
@@ -0,0 +1,54 @@
+ num_frames = 16
+ frame_interval = 3
+ image_size = (512, 512)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = False
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="STDiT-XL/2",
+     space_scale=1.0,
+     time_scale=1.0,
+     from_pretrained=None,
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+     micro_batch_size=128,
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+     shardformer=True,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 500
+ load = None
+
+ batch_size = 8
+ lr = 2e-5
+ grad_clip = 1.0
configs/opensora/train/360x512x512.py ADDED
@@ -0,0 +1,55 @@
+ num_frames = 360
+ frame_interval = 1
+ image_size = (512, 512)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = True
+ plugin = "zero2-seq"
+ sp_size = 2
+
+ # Define model
+ model = dict(
+     type="STDiT-XL/2",
+     space_scale=1.0,
+     time_scale=2 / 3,
+     from_pretrained=None,
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+     enable_sequence_parallelism=True,  # enable sequence parallelism here
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+     micro_batch_size=128,
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+     shardformer=True,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 250
+ load = None
+
+ batch_size = 1
+ lr = 2e-5
+ grad_clip = 1.0
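This is the first config that turns on sequence parallelism (`plugin="zero2-seq"`, `sp_size=2`, `enable_sequence_parallelism=True`), presumably because 360 frames of activations are too large for a single GPU. A rough back-of-the-envelope sketch, assuming the sequence is split evenly along the frame axis across the sequence-parallel group:

```python
num_frames = 360  # from this config
sp_size = 2       # sequence-parallel group size

frames_per_rank = num_frames // sp_size
print(frames_per_rank)  # 180 frames of activations held per GPU
```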
configs/opensora/train/64x512x512-sp.py ADDED
@@ -0,0 +1,54 @@
+ num_frames = 64
+ frame_interval = 2
+ image_size = (512, 512)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = True
+ plugin = "zero2-seq"
+ sp_size = 2
+
+ # Define model
+ model = dict(
+     type="STDiT-XL/2",
+     space_scale=1.0,
+     time_scale=2 / 3,
+     from_pretrained=None,
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+     enable_sequence_parallelism=True,  # enable sequence parallelism here
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+     shardformer=True,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 1000
+ load = None
+
+ batch_size = 1
+ lr = 2e-5
+ grad_clip = 1.0
configs/opensora/train/64x512x512.py ADDED
@@ -0,0 +1,54 @@
+ num_frames = 64
+ frame_interval = 2
+ image_size = (512, 512)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = True
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="STDiT-XL/2",
+     space_scale=1.0,
+     time_scale=2 / 3,
+     from_pretrained=None,
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+     micro_batch_size=64,
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+     shardformer=True,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 250
+ load = None
+
+ batch_size = 4
+ lr = 2e-5
+ grad_clip = 1.0
configs/pixart/inference/16x256x256.py ADDED
@@ -0,0 +1,32 @@
+ num_frames = 16
+ fps = 8
+ image_size = (256, 256)
+
+ # Define model
+ model = dict(
+     type="PixArt-XL/2",
+     space_scale=0.5,
+     time_scale=1.0,
+     from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=7.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/t2v_samples.txt"
+ save_dir = "./outputs/samples/"
configs/pixart/inference/1x1024MS.py ADDED
@@ -0,0 +1,34 @@
+ num_frames = 1
+ fps = 1
+ image_size = (1920, 512)
+ multi_resolution = True
+
+ # Define model
+ model = dict(
+     type="PixArtMS-XL/2",
+     space_scale=2.0,
+     time_scale=1.0,
+     no_temporal_pos_emb=True,
+     from_pretrained="PixArt-XL-2-1024-MS.pth",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=7.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/t2i_samples.txt"
+ save_dir = "./outputs/samples/"
configs/pixart/inference/1x256x256.py ADDED
@@ -0,0 +1,33 @@
+ num_frames = 1
+ fps = 1
+ image_size = (256, 256)
+
+ # Define model
+ model = dict(
+     type="PixArt-XL/2",
+     space_scale=1.0,
+     time_scale=1.0,
+     no_temporal_pos_emb=True,
+     from_pretrained="PixArt-XL-2-256x256.pth",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=7.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/t2i_samples.txt"
+ save_dir = "./outputs/samples/"
configs/pixart/inference/1x512x512.py ADDED
@@ -0,0 +1,33 @@
+ num_frames = 1
+ fps = 1
+ image_size = (512, 512)
+
+ # Define model
+ model = dict(
+     type="PixArt-XL/2",
+     space_scale=1.0,
+     time_scale=1.0,
+     no_temporal_pos_emb=True,
+     from_pretrained="PixArt-XL-2-512x512.pth",
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+ )
+ scheduler = dict(
+     type="dpm-solver",
+     num_sampling_steps=20,
+     cfg_scale=7.0,
+ )
+ dtype = "fp16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/t2i_samples.txt"
+ save_dir = "./outputs/samples/"
configs/pixart/train/16x256x256.py ADDED
@@ -0,0 +1,53 @@
+ num_frames = 16
+ frame_interval = 3
+ image_size = (256, 256)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = False
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="PixArt-XL/2",
+     space_scale=0.5,
+     time_scale=1.0,
+     from_pretrained="PixArt-XL-2-512x512.pth",
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+     shardformer=True,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 1000
+ load = None
+
+ batch_size = 8
+ lr = 2e-5
+ grad_clip = 1.0
configs/pixart/train/1x512x512.py ADDED
@@ -0,0 +1,54 @@
+ num_frames = 1
+ frame_interval = 1
+ image_size = (512, 512)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = True
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = True
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="PixArt-XL/2",
+     space_scale=1.0,
+     time_scale=1.0,
+     no_temporal_pos_emb=True,
+     from_pretrained="PixArt-XL-2-512x512.pth",
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+     shardformer=True,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 1000
+ load = None
+
+ batch_size = 32
+ lr = 2e-5
+ grad_clip = 1.0
configs/pixart/train/64x512x512.py ADDED
@@ -0,0 +1,54 @@
+ num_frames = 64
+ frame_interval = 2
+ image_size = (512, 512)
+
+ # Define dataset
+ root = None
+ data_path = "CSV_PATH"
+ use_image_transform = False
+ num_workers = 4
+
+ # Define acceleration
+ dtype = "bf16"
+ grad_checkpoint = True
+ plugin = "zero2"
+ sp_size = 1
+
+ # Define model
+ model = dict(
+     type="PixArt-XL/2",
+     space_scale=1.0,
+     time_scale=2 / 3,
+     from_pretrained=None,
+     enable_flashattn=True,
+     enable_layernorm_kernel=True,
+ )
+ vae = dict(
+     type="VideoAutoencoderKL",
+     from_pretrained="stabilityai/sd-vae-ft-ema",
+     micro_batch_size=128,
+ )
+ text_encoder = dict(
+     type="t5",
+     from_pretrained="./pretrained_models/t5_ckpts",
+     model_max_length=120,
+     shardformer=True,
+ )
+ scheduler = dict(
+     type="iddpm",
+     timestep_respacing="",
+ )
+
+ # Others
+ seed = 42
+ outputs = "outputs"
+ wandb = False
+
+ epochs = 1000
+ log_every = 10
+ ckpt_every = 250
+ load = None
+
+ batch_size = 4
+ lr = 2e-5
+ grad_clip = 1.0
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ xformers
+ git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora
+ transformers
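`requirements.txt` pulls `opensora` straight from GitHub via pip's VCS syntax. A hedged sketch of installing these requirements programmatically, mirroring the `subprocess` pattern that `install_dependencies` in app.py uses for flash-attn:

```python
import subprocess
import sys

# Install the pinned requirements with the same interpreter that runs the app.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"],
    check=True,
)
```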