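# Training config for an sgm DiffusionEngine fine-tune (SDXL-style UNet + VAE)
# on the CO3D "teddybear" category; presumably launched via the repo's
# PyTorch Lightning entry point (main.py), given the `lightning:` section below.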
model:
  base_learning_rate: 1.0e-4
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True
    trainkeys: pose
    multiplier: 0.05
    loss_rgb_lambda: 5
    loss_fg_lambda: 10
    loss_bg_lambda: 10
    log_keys:
      - txt

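    # Denoiser: discrete 1000-step eps-prediction denoiser over a legacy DDPM
    # noise schedule.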
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        weighting_config:
          target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

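    # Backbone: SDXL-scale UNet (320 base channels, 2048-dim text context) with
    # image cross-attention at the listed blocks; the rgb/triplane/importance-
    # sampling flags are options specific to this repo's UNetModel.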
    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: False
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10] 
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
        image_cross_blocks: [0, 2, 4, 6, 8, 10]
        rgb: True
        far: 2
        num_samples: 24
        not_add_context_in_triplane: False
        rgb_predict: True
        add_lora: False
        average: False
        use_prev_weights_imp_sample: True
        stratified: True
        imp_sampling_percent: 0.9

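    # Conditioning: the two frozen CLIP text encoders plus the three size/crop
    # embedders familiar from SDXL, each also fed a *_ref key for the reference
    # view; <new1> is presumably the modifier token learned for the new concept.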
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          # crossattn cond
          - is_trainable: False
            input_keys: txt,txt_ref
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
              modifier_token: <new1>
          # crossattn and vector cond
          - is_trainable: False
            input_keys: txt,txt_ref
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              layer: penultimate
              always_return_pooled: True
              legacy: False
              modifier_token: <new1>
          # vector cond
          - is_trainable: False
            input_keys: original_size_as_tuple,original_size_as_tuple_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_keys: crop_coords_top_left,crop_coords_top_left_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two
          # vector cond
          - is_trainable: False
            input_keys: target_size_as_tuple,target_size_as_tuple_ref
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256  # multiplied by two

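    # First stage: SDXL KL autoencoder loaded from a local safetensors
    # checkpoint, used only for encoding/decoding (loss is an Identity).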
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKLInferenceWrapper
      params:
        ckpt_path: pretrained-models/sdxl_vae.safetensors
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

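    # Diffusion loss with image/reference handling: cubic sigma sampling over
    # 1000 steps for the target, a 50-step discrete sampler for the *_ref branch.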
    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLossImgRef
      params:
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.CubicSampling
          params:
            num_idx: 1000
            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
        sigma_sampler_config_ref:
          target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
          params:
            num_idx: 50

            discretization_config:
              target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

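    # Sampler used for logging/inference: 50-step Euler EDM with image-reference
    # classifier-free guidance at scale 7.5.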
    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
      params:
        num_steps: 50

        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

        guider_config:
          target: sgm.modules.diffusionmodules.guiders.VanillaCFGImgRef
          params:
            scale: 7.5

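# Data: CO3D loader for the teddybear category (512px, 5 images per item,
# masked and bbox-cropped); addreg/drop_ratio/drop_txt are repo-specific
# regularization and dropout options.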
data:
  target: sgm.data.data_co3d.CustomDataDictLoader
  params:
    batch_size: 1
    num_workers: 4
    category: teddybear
    img_size: 512
    skip: 2
    num_images: 5
    mask_images: True
    single_id: 0
    bbox: True
    addreg: True
    drop_ratio: 0.25
    drop_txt: 0.1
    modifier_token: <new1>

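# Lightning: checkpoint every 1600 steps (training stops at max_steps 1610),
# periodic image logging, 4 GPUs, no sanity validation.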
lightning:
  modelcheckpoint:
    params:
      every_n_train_steps: 1600
      save_top_k: -1
      save_on_train_epoch_end: False

  callbacks:
    metrics_over_trainsteps_checkpoint:
      params:
        every_n_train_steps: 25000

    image_logger:
      target: main.ImageLogger
      params:
        disabled: False
        enable_autocast: False
        batch_frequency: 5000
        max_images: 8
        increase_log_steps: False
        log_first_step: False
        log_images_kwargs:
          use_ema_scope: False
          N: 1
          n_rows: 2

  trainer:
    devices: 0,1,2,3
    benchmark: True
    num_sanity_val_steps: 0
    accumulate_grad_batches: 1
    max_steps: 1610
    # val_check_interval: 400