# ---------------------------------------------------------------------------
# Dataset settings
# ---------------------------------------------------------------------------
# `type` and `transform_name` are plain string identifiers; presumably they are
# resolved by the consuming framework's registry -- confirm against the loader.
dataset = dict(
    type="VariableVideoTextDataset",
    transform_name="resize_crop",
)

# ---------------------------------------------------------------------------
# Bucket configuration
# ---------------------------------------------------------------------------
# Outer keys are resolution names; inner keys are integers (1, 51, 102, 204,
# 408). Each inner value is a 2-tuple whose first element is either a float or
# a pair of floats and whose second element is an int.
# NOTE(review): presumably inner keys are frame counts and the values are
# (keep probability, batch size) -- confirm against the bucket sampler.
# Entry order is preserved exactly as authored, in case the consumer relies on
# dict iteration order.
bucket_config = {
    "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
    "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
    "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
    "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
    "512": {1: (0.1, 141)},
    "480p": {1: (0.1, 89)},
    "720p": {1: (0.05, 36)},
    "1024": {1: (0.05, 36)},
    "1080p": {1: (0.1, 5)},
    "2048": {1: (0.1, 5)},
}

# ---------------------------------------------------------------------------
# Runtime settings
# ---------------------------------------------------------------------------
num_workers = 8
num_bucket_build_workers = 16
dtype = "bf16"
seed = 42
outputs = "outputs"
wandb = False

# ---------------------------------------------------------------------------
# Model settings
# ---------------------------------------------------------------------------
model = dict(
    type="STDiT3-XL/2",
    # NOTE(review): absolute, machine-specific checkpoint path; it must be
    # overridden on any host that does not have this NFS mount.
    from_pretrained="/mnt/nfs-206/zangwei/opensora/outputs/1091-STDiT3-XL-2/epoch0-global_step8500",
    qk_norm=True,
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="OpenSoraVAE_V1_2",
    from_pretrained="pretrained_models/vae-pipeline",
    micro_frame_size=17,
    micro_batch_size=32,
)
text_encoder = dict(
    type="t5",
    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=300,
    shardformer=True,
    local_files_only=True,
)

# ---------------------------------------------------------------------------
# Text-feature saving settings
# ---------------------------------------------------------------------------
save_text_features = True
save_compressed_text_features = True
bin_size = 250
log_time = False