---
# Training configuration with three top-level sections:
#   model     - diffusion engine and its sub-module configs
#   data      - dataset loader settings
#   lightning - checkpointing, logging callbacks and trainer settings
#
# Most components are declared as a `target` (dotted import path) with an
# optional `params` mapping.
#
# NOTE(review): the nesting below was reconstructed from key order and the
# target/params pattern after the file's indentation was lost. Confirm each
# level against the constructors named in `target` before relying on it.

model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: true
    trainkeys: pose
    multiplier: 0.05
    loss_rgb_lambda: 5
    loss_fg_lambda: 10
    loss_bg_lambda: 10
    log_keys:
      - txt

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: false
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
        image_cross_blocks: [0, 2, 4, 6, 8, 10]
        rgb: true
        far: 2
        num_samples: 24
        not_add_context_in_triplane: false
        rgb_predict: true
        add_lora: false
        average: false
        use_prev_weights_imp_sample: true
        stratified: true
        imp_sampling_percent: 0.9

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond
          - is_trainable: false
            # Comma-separated plain scalar: parsed as one string, not a list.
            input_keys: txt,txt_ref
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
              # Left unset here (explicit null).
              modifier_token: null

          # crossattn and vector cond
          - is_trainable: false
            input_keys: txt,txt_ref
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              layer: penultimate
              always_return_pooled: true
              legacy: false
              # Left unset here (explicit null).
              modifier_token: null

          # vector cond
          - is_trainable: false
            input_keys: original_size_as_tuple,original_size_as_tuple_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two

          # vector cond
          - is_trainable: false
            input_keys: crop_coords_top_left,crop_coords_top_left_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two

          # vector cond
          - is_trainable: false
            input_keys: target_size_as_tuple,target_size_as_tuple_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        # Relative path; presumably resolved against the launch directory.
        # TODO confirm.
        ckpt_path: pretrained-models/sdxl_vae.safetensors
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLossImgRef
      params:
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.CubicSampling
          params:
            num_idx: 1000

            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

        # Second sigma sampler, configured with fewer indices (50 vs 1000).
        sigma_sampler_config_ref:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 50

            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFGImgRef
          params:
            scale: 7.5

data:
  target: sgm.data.data_co3d.CustomDataDictLoader
  params:
    batch_size: 1
    num_workers: 4
    category: teddybear
    img_size: 512
    skip: 2
    num_images: 5
    mask_images: true
    single_id: 0
    bbox: true
    addreg: true
    drop_ratio: 0.25
    drop_txt: 0.1
    # Left unset here (explicit null).
    modifier_token: null

lightning:
  modelcheckpoint:
    params:
      # With trainer.max_steps at 1610, this interval is reached once per run.
      every_n_train_steps: 1600
      save_top_k: -1
      save_on_train_epoch_end: false

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        # Larger than trainer.max_steps (1610), so this interval is never
        # reached with the trainer settings in this file.
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: false
        enable_autocast: false
        # Also larger than trainer.max_steps (1610).
        batch_frequency: 5000
        max_images: 8
        increase_log_steps: false
        log_first_step: false
        log_images_kwargs:
          use_ema_scope: false
          N: 1
          n_rows: 2

  trainer:
    # Quoted so the comma-separated value is unambiguously a string.
    devices: '0,1,2,3'
    benchmark: true
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_steps: 1610
    # val_check_interval: 400