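# ViscoNet training config: ControlNet-style latent diffusion for pose- and
# style-conditioned human image generation on DeepFashion In-Shop.
# Latents are 64x64x4 (512x512 pixels at the VAE's 8x downsampling); the
# cross-attention context is 1024-dim, matching the OpenCLIP ViT-H hidden size.
# A minimal loading sketch, assuming the usual ldm helpers and an illustrative path:
#   from omegaconf import OmegaConf
#   from ldm.util import instantiate_from_config
#   config = OmegaConf.load('configs/visconet.yaml')  # hypothetical filename
#   model = instantiate_from_config(config.model)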
model:
  target: visconet.visconet.ViscoNetLDM
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    control_crossattn_key: "styles"
    mask_key: "human_mask"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False

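    # Style projector: maps the per-slot CLIP image hidden states (257x1024 each;
    # see style_emb_shape under dataset) into the 1024-dim context that is fed to
    # cross-attention via control_crossattn_key ("styles") above.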
    control_cond_config:
      target: visconet.modules.ProjectLocalStyle
      #target: visconet.modules.ClipImageEncoder
      # params:
      #    context_dim: 1024

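    # ControlNet branch: a trainable copy of the UNet encoder that consumes the
    # 3-channel pose hint (control_key: "hint") and produces residuals that are
    # added into the locked UNet through zero-initialized convolutions.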
    control_stage_config:
      target: cldm.cldm.ControlNet
      params:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        hint_channels: 3
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

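    # The locked Stable Diffusion UNet, wrapped so ControlNet residuals can be
    # injected; only_mid_control above restricts injection to the middle block.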
    unet_config:
      target: cldm.cldm.ControlledUnetModel
      params:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

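    # First stage: the standard SD KL autoencoder (VAE) mapping 3-channel images
    # to 4-channel latents at 8x downsampling. lossconfig is Identity because the
    # VAE stays frozen during diffusion training.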
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

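    # Text conditioning: frozen OpenCLIP text encoder, taking the penultimate
    # layer's hidden states as the 1024-dim cross-attention context.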
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
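  # Image encoder used to extract the CLIP hidden-state style embeddings,
  # presumably matching the "_hidden" style_postfix used by the dataset below.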
  style_embedding_config:
    target: scripts.image_emb_hidden.ClipImageEncoder
    
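# DeepFashion In-Shop splits. All three share the same layout: padded 512px
# images, OpenPose (with hands) pose maps, SMPL-derived 256px human masks, and
# precomputed per-slot CLIP style embeddings (one 257x1024 tensor per style name).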
dataset:
  train:
    target: visconet.deepfashion.DeepFashionDataset
    params:
      image_root: "/home/soon/datasets/deepfashion_inshop"
      image_dir: img_512_padded
      pose_dir: openpose_hand_512
      style_dir: styles
      style_postfix: _hidden
      mask_dir: smpl_256
      data_files: 
         - data/deepfashion/pairs-train-all.csv
         - data/deepfashion/solo-train-all.csv
      map_file: data/deepfashion/deepfashion_map.csv
      style_emb_shape: 
        - 257
        - 1024
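      # Nine style slots. Note: 'accesories' is spelled as in the precomputed
      # style files, so it is kept verbatim here (assumption: correcting the
      # spelling would break the embedding lookup).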
      style_names:
        - background
        - face
        - hair
        - headwear
        - top
        - outer
        - bottom
        - shoes
        - accesories
  val:
    target: visconet.deepfashion.DeepFashionDataset
    params:
      image_root: "/home/soon/datasets/deepfashion_inshop"
      image_dir: img_512_padded
      pose_dir: openpose_hand_512
      style_dir: styles
      style_postfix: _hidden
      mask_dir: smpl_256
      data_files: 
         - data/deepfashion/solo-test-all.csv
      map_file: data/deepfashion/deepfashion_map.csv
      sample_ratio: 0.1 # evaluate on 10% of the split (name suggests a random subsample)
      style_emb_shape: 
        - 257
        - 1024
      style_names:
        - background
        - face
        - hair
        - headwear
        - top
        - outer
        - bottom
        - shoes
        - accesories      

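  # Test uses the pairs CSV (presumably source-image / target-pose pairs for
  # pose-transfer evaluation), unlike val, which uses solo images only.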
  test:
    target: visconet.deepfashion.DeepFashionDataset
    params:
      image_root: "/home/soon/datasets/deepfashion_inshop"
      image_dir: img_512_padded
      pose_dir: openpose_hand_512
      style_dir: styles
      style_postfix: _hidden
      mask_dir: smpl_256
      data_files: 
         - data/deepfashion/pairs-test-all.csv
      map_file: data/deepfashion/deepfashion_map.csv
      style_emb_shape: 
        - 257
        - 1024
      style_names:
        - background
        - face
        - hair
        - headwear
        - top
        - outer
        - bottom
        - shoes
        - accesories