---
# Top-level `model` section: engine class plus its scalar hyper-parameters.
model:
  base_learning_rate: 0.0001
  target: sgm.models.diffusion.DiffusionEngine
  params:
    # NOTE(review): presumably the latent scale that goes with the VAE
    # checkpoint named in first_stage_config.ckpt_path -- confirm.
    scale_factor: 0.13025
    disable_first_stage_autocast: true

    # NOTE(review): the meaning of these two is not visible in this file;
    # verify against DiffusionEngine.
    trainkeys: pose
    multiplier: 0.05

    # Loss weights; judging by the names they apply to RGB, foreground and
    # background terms -- TODO confirm.
    loss_rgb_lambda: 5
    loss_fg_lambda: 10
    loss_bg_lambda: 10

    # Single-entry list.
    log_keys: [txt]
    # Denoiser: DiscreteDenoiser with num_idx 1000, configured with the
    # EpsWeighting, EpsScaling and LegacyDDPMDiscretization classes.
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        # Same num_idx value as loss_fn_config.sigma_sampler_config.
        num_idx: 1000
        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
    # UNet backbone settings.
    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        # NOTE(review): 2816 presumably equals the pooled text embedding width
        # plus the size/crop embeddings from conditioner_config -- confirm.
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: false

        # Channel layout: 4 in / 4 out, matching first_stage_config z_channels.
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        # channel_mult and transformer_depth each list three entries.
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers

        # Even indices 0 through 10.
        # NOTE(review): presumably selects which blocks receive the image
        # cross-attention -- verify against UNetModel.
        image_cross_blocks: [0, 2, 4, 6, 8, 10]

        # Project-specific options; their semantics are not visible in this
        # file -- verify against UNetModel before changing.
        rgb: true
        far: 2
        num_samples: 24
        not_add_context_in_triplane: false
        rgb_predict: true
        add_lora: false
        average: false
        use_prev_weights_imp_sample: true
        stratified: true
        imp_sampling_percent: 0.9
      # NOTE(review): this key is a sibling of `target`/`params`, not a member
      # of `params`, and repeats the value already set inside `params` above.
      # It may be a misplaced override; confirm whether anything reads it at
      # this level before removing it.
      use_prev_weights_imp_sample: true
    # Conditioning stack: five embedders, all marked non-trainable. The order
    # of this list is preserved as-is.
    # Each `input_keys` value is ONE comma-joined string, not a YAML list
    # (presumably split by the consumer -- confirm).
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        # 1) FrozenCLIPEmbedder: `hidden` layer output at layer_idx 11.
        - is_trainable: false
          input_keys: 'txt,txt_ref'
          target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
          params:
            layer: hidden
            layer_idx: 11
            modifier_token: '<new1>'

        # 2) FrozenOpenCLIPEmbedder (ViT-bigG-14): penultimate layer, pooled
        #    output always returned.
        - is_trainable: false
          input_keys: 'txt,txt_ref'
          target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
          params:
            arch: ViT-bigG-14
            version: laion2b_s39b_b160k
            layer: penultimate
            always_return_pooled: true
            legacy: false
            modifier_token: '<new1>'

        # 3-5) ConcatTimestepEmbedderND entries (outdim 256 each) for original
        #      size, crop top-left and target size, each with a `_ref` twin.
        - is_trainable: false
          input_keys: 'original_size_as_tuple,original_size_as_tuple_ref'
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - is_trainable: false
          input_keys: 'crop_coords_top_left,crop_coords_top_left_ref'
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256

        - is_trainable: false
          input_keys: 'target_size_as_tuple,target_size_as_tuple_ref'
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
    # First-stage autoencoder (inference wrapper) and its checkpoint path.
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        # Relative path; presumably resolved from the working directory of
        # the launching script -- confirm.
        ckpt_path: pretrained-models/sdxl_vae.safetensors
        embed_dim: 4
        monitor: val/rec_loss

        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          # NOTE(review): differs from data.params.img_size (512); check
          # whether the wrapper actually uses this value.
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          # Explicit empty list (not null).
          attn_resolutions: []
          dropout: 0.0

        # Identity in the loss slot -- presumably because the autoencoder is
        # not trained in this setup; confirm.
        lossconfig:
          target: torch.nn.Identity
    # Loss: StandardDiffusionLossImgRef configured with two sigma samplers.
    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLossImgRef
      params:
        # Main sampler: CubicSampling with num_idx 1000.
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.CubicSampling
          params:
            num_idx: 1000
            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
        # Second sampler: DiscreteSampling with num_idx 50.
        # NOTE(review): the `_ref` suffix presumably pairs with the `*_ref`
        # input keys in conditioner_config -- confirm against the loss class.
        sigma_sampler_config_ref:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 50
            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
    # Sampler: EulerEDMSampler with num_steps 50, the LegacyDDPMDiscretization
    # class, and a VanillaCFGImgRef guider.
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFGImgRef
          params:
            # Guidance scale passed to the guider.
            scale: 7.5
# Top-level `data` section: loader class and its parameters.
data:
  target: sgm.data.data_co3d.CustomDataDictLoader
  params:
    # Loader settings.
    batch_size: 1
    num_workers: 4

    # Subject selection.
    # NOTE(review): `single_id` and `skip` semantics are not visible in this
    # file -- verify against CustomDataDictLoader.
    category: motorcycle
    img_size: 512
    skip: 2
    num_images: 5
    mask_images: true
    single_id: 12
    bbox: true
    addreg: true

    # NOTE(review): by name these look like conditioning drop-out rates --
    # TODO confirm.
    drop_ratio: 0.25
    drop_txt: 0.1

    # Same token string as the `modifier_token` entries in conditioner_config.
    modifier_token: '<new1>'
    # Explicit null.
    categoryname: null