# File size: 5,160 Bytes
# ad7bc89
---
model:
  base_learning_rate: 0.0001
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: true
    trainkeys: pose
    multiplier: 0.05
    loss_rgb_lambda: 5
    loss_fg_lambda: 10
    loss_bg_lambda: 10
    log_keys:
    - txt
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000
        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: false
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions:
        - 4
        - 2
        num_res_blocks: 2
        channel_mult:
        - 1
        - 2
        - 4
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth:
        - 1
        - 2
        - 10
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
        image_cross_blocks:
        - 0
        - 2
        - 4
        - 6
        - 8
        - 10
        rgb: true
        far: 2
        num_samples: 24
        not_add_context_in_triplane: false
        rgb_predict: true
        add_lora: false
        average: false
        use_prev_weights_imp_sample: true
        stratified: true
        imp_sampling_percent: 0.9
      # NOTE(review): this key duplicates `use_prev_weights_imp_sample` already set
      # inside `params` (same value) but sits one level up, as a sibling of `params`
      # under `network_config` — likely an indentation slip; confirm which level the
      # UNetModel config consumer actually reads before removing either copy.
      use_prev_weights_imp_sample: true
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
        - is_trainable: false
          input_keys: txt,txt_ref
          target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
          params:
            layer: hidden
            layer_idx: 11
            modifier_token: <new1>
        - is_trainable: false
          input_keys: txt,txt_ref
          target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
          params:
            arch: ViT-bigG-14
            version: laion2b_s39b_b160k
            layer: penultimate
            always_return_pooled: true
            legacy: false
            modifier_token: <new1>
        - is_trainable: false
          input_keys: original_size_as_tuple,original_size_as_tuple_ref
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
        - is_trainable: false
          input_keys: crop_coords_top_left,crop_coords_top_left_ref
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
        - is_trainable: false
          input_keys: target_size_as_tuple,target_size_as_tuple_ref
          target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
          params:
            outdim: 256
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        ckpt_path: /sensei-fs/tenants/Sensei-AdobeResearchTeam/nupkumar1/custom-pose/pretrained-models/sdxl_vae.safetensors
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLossImgRef
      params:
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.CubicSampling
          params:
            num_idx: 1000
            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
        sigma_sampler_config_ref:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 50
            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFGImgRef
          params:
            scale: 7.5
data:
  target: sgm.data.data_co3d.CustomDataDictLoader
  params:
    batch_size: 1
    num_workers: 4
    category: chair
    img_size: 512
    skip: 2
    num_images: 5
    mask_images: true
    single_id: 191
    bbox: true
    addreg: true
    drop_ratio: 0.25
    drop_txt: 0.1
    modifier_token: <new1>
    categoryname: null