experiment:
    project: "titok_generation"
    name: "titok_b64_maskgit"
    max_train_examples: 1_281_167
    save_every: 50_000
    eval_every: 50_000
    generate_every: 5_000
    log_every: 50
    log_grad_norm_every: 1_000
    resume: True
    tokenizer_checkpoint: "tokenizer_titok_b64.bin"
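    # Note: 1_281_167 is the size of the ImageNet-1k training set. The
    # tokenizer_checkpoint points at pretrained TiTok-B64 tokenizer weights,
    # loaded before generator training starts.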

model:
    vq_model:
        codebook_size: 4096
        token_size: 12
        use_l2_norm: True
        commitment_cost: 0.25
        # ViT architecture
        vit_enc_model_size: "base"
        vit_dec_model_size: "base"
        vit_enc_patch_size: 16
        vit_dec_patch_size: 16
        num_latent_tokens: 64
        finetune_decoder: True
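    # The vq_model block describes the TiTok-B64 tokenizer: ViT-Base
    # encoder/decoder with 16x16 patches, compressing each image to 64
    # latent tokens quantized against a 4096-entry codebook of 12-dim,
    # L2-normalized codes.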

    generator:
        model_type: "ViT"
        hidden_size: 768
        num_hidden_layers: 24
        num_attention_heads: 16
        intermediate_size: 3072
        dropout: 0.1
        attn_drop: 0.1
        num_steps: 8
        class_label_dropout: 0.1
        image_seq_len: ${model.vq_model.num_latent_tokens}
        condition_num_classes: 1000

        # sampling hyper-parameters, applied on the fly at generation time
        randomize_temperature: 1.0
        guidance_scale: 4.5
        guidance_decay: "constant"
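    # MaskGIT-style decoding: generation runs num_steps (8) parallel
    # unmask-and-repredict passes over the 64-token sequence instead of
    # decoding one token at a time. class_label_dropout: 0.1 drops the class
    # condition for 10% of training samples, which is what makes
    # classifier-free guidance (guidance_scale: 4.5) possible at sampling
    # time; guidance_decay: "constant" keeps that scale fixed across steps.
    # image_seq_len uses OmegaConf-style ${...} interpolation and resolves
    # to num_latent_tokens, i.e. 64.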

losses:
    label_smoothing: 0.1
    loss_weight_unmasked_token: 0.1
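# As the key names suggest: label smoothing of 0.1 on the token
# cross-entropy, and predictions for tokens that were left unmasked are
# down-weighted by a factor of 0.1 rather than excluded.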

dataset:
    params:
        train_shards_path_or_url: "imagenet_sharded/train/imagenet-train-{0000..0252}.tar"
        eval_shards_path_or_url: "imagenet_sharded/val/imagenet-val-{0000..0009}.tar"
        num_workers_per_gpu: 12
    preprocessing:
        resize_shorter_edge: 256
        crop_size: 256
        random_crop: False
        random_flip: True
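# The {0000..0252} brace range is WebDataset-style shard notation: 253
# training shards and 10 validation shards. Preprocessing resizes the
# shorter edge to 256, then takes a 256x256 center crop (random_crop: False)
# with random horizontal flips.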

optimizer:
    name: adamw
    params:
        learning_rate: 2e-4
        beta1: 0.9
        beta2: 0.96
        weight_decay: 0.03
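# beta2 = 0.96 is lower than the AdamW default of 0.999, a common setting in
# generative-transformer training: a shorter second-moment window lets the
# optimizer adapt faster to changing gradient statistics.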

lr_scheduler:
    scheduler: "cosine"
    params:
        learning_rate: ${optimizer.params.learning_rate}
        warmup_steps: 10_000
        end_lr: 1e-5
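# Warmup for the first 10k steps up to the peak rate (2e-4, pulled in via
# interpolation from the optimizer block), then cosine decay down to 1e-5.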

training:
    gradient_accumulation_steps: 1
    per_gpu_batch_size: 64 # 32 GPUs, total batch size 2048
    mixed_precision: "bf16"
    enable_tf32: True
    enable_wandb: True
    use_ema: True
    seed: 42
    max_train_steps: 500_000
    max_grad_norm: 1.0
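    # Effective batch size: 64 per GPU x 32 GPUs x 1 accumulation step = 2048.
    # At 2048 images per step, 500_000 steps cover ~1.02B samples, roughly
    # 800 epochs over the 1_281_167-image train set.

# Loading sketch (hypothetical filename; assumes OmegaConf, which matches the
# ${...} interpolation syntax used above):
#
#   from omegaconf import OmegaConf
#   cfg = OmegaConf.load("titok_b64_maskgit.yaml")
#   OmegaConf.resolve(cfg)  # materialize ${...} interpolations in place
#   assert cfg.model.generator.image_seq_len == 64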