---
# SGM-style SDXL training config: DiffusionEngine with pose-conditioned UNet,
# dual CLIP text conditioners, SDXL VAE first stage, and a CO3D data loader.
model:
  base_learning_rate: 0.0001
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: true
    trainkeys: pose
    multiplier: 0.05
    loss_rgb_lambda: 5
    loss_fg_lambda: 10
    loss_bg_lambda: 10
    log_keys:
      - txt

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000
        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: false
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions:
          - 4
          - 2
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 4
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth:
          - 1
          - 2
          - 10
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
        image_cross_blocks:
          - 0
          - 2
          - 4
          - 6
          - 8
          - 10
        rgb: true
        far: 2
        num_samples: 24
        not_add_context_in_triplane: false
        rgb_predict: true
        add_lora: false
        average: false
        # NOTE(review): this key appeared twice in the original (both `true`);
        # duplicate keys are invalid YAML 1.2 — deduplicated here.
        use_prev_weights_imp_sample: true
        stratified: true
        imp_sampling_percent: 0.9

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # OpenAI CLIP text encoder (hidden-layer output).
          - is_trainable: false
            input_keys: txt,txt_ref
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
              modifier_token: null
          # OpenCLIP bigG text encoder (penultimate layer, pooled output kept).
          - is_trainable: false
            input_keys: txt,txt_ref
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              layer: penultimate
              always_return_pooled: true
              legacy: false
              modifier_token: null
          # SDXL micro-conditioning embedders (size / crop / target size).
          - is_trainable: false
            input_keys: original_size_as_tuple,original_size_as_tuple_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - is_trainable: false
            input_keys: crop_coords_top_left,crop_coords_top_left_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - is_trainable: false
            input_keys: target_size_as_tuple,target_size_as_tuple_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        ckpt_path: pretrained-models/sdxl_vae.safetensors
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLossImgRef
      params:
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.CubicSampling
          params:
            num_idx: 1000
            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
        sigma_sampler_config_ref:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 50
            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFGImgRef
          params:
            scale: 7.5

# CO3D dataset loader configuration.
data:
  target: sgm.data.data_co3d.CustomDataDictLoader
  params:
    batch_size: 1
    num_workers: 4
    category: motorcycle
    img_size: 512
    skip: 2
    num_images: 5
    mask_images: true
    single_id: 12
    bbox: true
    addreg: true
    drop_ratio: 0.25
    drop_txt: 0.1
    # Bare (empty) in the original; made the null explicit.
    modifier_token: null
    categoryname: null