# Training configuration: a taming.models.vqgan.VQModel that reads the
# `depth` image key, plus the data module feeding it.
# NOTE(review): nesting was reconstructed from a single-line source using the
# repeated target/params pairs; confirm against the code that loads this file.

model:
  base_learning_rate: 4.5e-6
  target: taming.models.vqgan.VQModel
  params:
    embed_dim: 256
    n_embed: 1024
    image_key: depth

    # Settings grouped under ddconfig; one input and one output channel.
    ddconfig:
      double_z: false
      z_channels: 256
      resolution: 256
      in_channels: 1
      out_ch: 1
      ch: 128
      # num_down = len(ch_mult)-1
      ch_mult: [1, 1, 2, 2, 4]
      num_res_blocks: 2
      attn_resolutions: [16]
      dropout: 0.0

    # Loss settings; the target class receives the params below.
    lossconfig:
      target: taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
      params:
        disc_conditional: false
        disc_in_channels: 1
        # presumably the training step at which the discriminator term
        # becomes active — TODO confirm against the loss implementation
        disc_start: 50001
        disc_weight: 0.75
        codebook_weight: 1.0

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 3
    num_workers: 8
    # Train and validation datasets both use size 256, the same value as
    # ddconfig.resolution above.
    train:
      target: taming.data.imagenet.ImageNetTrainWithDepth
      params:
        size: 256
    validation:
      target: taming.data.imagenet.ImageNetValidationWithDepth
      params:
        size: 256