# Module-level configuration values. This file only assigns names; it defines
# no functions or classes and performs no work on import.

# Scalar run settings.
num_workers = 8
dtype = "bf16"
seed = 42
num_eval_timesteps = 10

# Dataset settings.
# NOTE(review): "type" presumably names a registered dataset class and
# "transform_name" a registered transform -- confirm against the consumer.
dataset = dict(
    type="VariableVideoTextDataset",
    transform_name="resize_crop",
)

# Bucket table keyed by resolution label. Each inner dict maps an integer key
# to a 2-tuple whose first element is None for every entry in this file.
# NOTE(review): presumably {resolution: {num_frames: (keep_prob, batch_size)}};
# verify against the code that reads bucket_config before relying on this.
bucket_config = {
    "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)},
    "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)},
    "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)},
    "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)},
    "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)},
    "1080p": {1: (None, 10)},
    "2048": {1: (None, 5)},
}

# Model settings. from_pretrained is None here, so no checkpoint path is
# supplied by this file.
model = dict(
    type="STDiT3-XL/2",
    from_pretrained=None,
    qk_norm=True,
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
)

# VAE settings. from_pretrained is a relative path and local_files_only is
# True.
# NOTE(review): the path is presumably resolved against the working
# directory -- confirm.
vae = dict(
    type="OpenSoraVAE_V1_2",
    from_pretrained="pretrained_models/vae-pipeline",
    micro_frame_size=17,
    micro_batch_size=4,
    local_files_only=True,
)

# Text-encoder settings. local_files_only is True here as well.
# NOTE(review): presumably the weights must already be available locally --
# confirm.
text_encoder = dict(
    type="t5",
    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=300,
    local_files_only=True,
)

# Scheduler settings; only the type is specified.
scheduler = dict(type="rflow")