---
# Experiment configuration: MAGE model (modules.mage_model.MAGE) trained on
# the CATER-GEN-v2 dataset.
#
# Each component is described by a `target` (dotted import path) and a
# `params` mapping.
# NOTE(review): the nesting below was reconstructed from the target/params
# convention and the key names — confirm against the code that loads this file.

# Training-loop hyperparameters.
train:
  epoch: 201
  batchsize: 8
  # Written with an explicit mantissa dot so that YAML 1.1 loaders (PyYAML)
  # resolve it as a float instead of the string "5e-5".
  lr: 5.0e-5
  lr_gamma: 0.1
  lr_steps: [30, 40]
  # NOTE(review): both a step schedule (lr_steps / lr_gamma) and `cos` are
  # set; which one takes effect depends on the training script — confirm.
  cos: true
  checkpoint_every: 3000

# Top-level model and its sub-module configurations.
model:
  target: modules.mage_model.MAGE
  params:
    codebook_size: 512
    # Same value as generate_decoder_config.params.frames_length and
    # data.params.frames_length — presumably these must stay in sync.
    frames_length: 10
    # NOTE(review): 16 matches first-stage resolution 128 divided by 8 (the
    # checkpoint path mentions "f8") — confirm this is the latent grid size.
    image_resolution: 16
    vision_width: 512
    dropout: 0.2
    use_cids: false
    randomness: true
    auto_beta: true
    v_kl: 100

    # First-stage autoencoder, restored from the checkpoint in `ckpt_path`.
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        monitor: val/rec_loss
        embed_dim: 4
        ckpt_path: 'models/autoencoders/kl_f8_cater/last_caterv2.ckpt'
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 128
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        # Identity module in place of a loss.
        lossconfig:
          target: torch.nn.Identity

    # Text encoder.
    text_encoder_config:
      target: modules.mage_model.TransformerTextEncoder
      params:
        vocab_size: 50
        context_length: 38
        transformer_width: 512
        transformer_layers: 2
        output_dim: 512
        padding_idx: 0
        dropout: 0.1

    # MAEncoder sub-module.
    ma_config:
      target: modules.mage_model.MAEncoder
      params:
        layers: 1
        d_model: 512

    # Decoder used for generation.
    generate_decoder_config:
      target: modules.mage_model.FlatAxialDecoder
      params:
        in_channels: 512
        out_channels: 4
        model_channels: 512
        frames_length: 10
        layers: 6

# Dataset configuration.
data:
  target: dataload.CATER
  params:
    dataset: 'caterv2'
    # Relative path — resolved against the process working directory unless
    # the loader does otherwise.
    data_root: '../datasets/CATER-GEN-v2'
    frames_length: 10
    sample_speed: [3.0, 6.0]
    randomness: true