soonyau committed
Commit 664539f
1 Parent(s): 43557dc
configs/mm_local_style.yaml DELETED
@@ -1,147 +0,0 @@
- model:
-   target: visconet.visconet.ViscoNetLDM
-   params:
-     linear_start: 0.00085
-     linear_end: 0.0120
-     num_timesteps_cond: 1
-     log_every_t: 200
-     timesteps: 1000
-     first_stage_key: "jpg"
-     cond_stage_key: "txt"
-     control_key: "hint"
-     control_crossattn_key: "styles"
-     mask_key: "human_mask"
-     image_size: 64
-     channels: 4
-     cond_stage_trainable: false
-     conditioning_key: crossattn
-     monitor: val/loss_simple_ema
-     scale_factor: 0.18215
-     use_ema: False
-     only_mid_control: False
-
-     control_cond_config:
-       target: visconet.modules.ProjectLocalStyle
-       #target: visconet.modules.ClipImageEncoder
-
-     control_stage_config:
-       target: cldm.cldm.ControlNet
-       params:
-         use_checkpoint: True
-         image_size: 32 # unused
-         in_channels: 4
-         hint_channels: 3
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_head_channels: 64 # need to fix for flash-attn
-         use_spatial_transformer: True
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         legacy: False
-
-     unet_config:
-       target: cldm.cldm.ControlledUnetModel
-       params:
-         use_checkpoint: True
-         image_size: 32 # unused
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_head_channels: 64 # need to fix for flash-attn
-         use_spatial_transformer: True
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         legacy: False
-
-     first_stage_config:
-       target: ldm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           #attn_type: "vanilla-xformers"
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult:
-           - 1
-           - 2
-           - 4
-           - 4
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
-
-     cond_stage_config:
-       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-       params:
-         freeze: True
-         layer: "penultimate"
-
-     style_embedding_config:
-       target: scripts.image_emb_hidden.ClipImageEncoder
-
- dataset:
-   train:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_512
-       style_dir: styles
-       style_postfix: _hidden
-       mask_dir: mask_512_padded
-       data_files:
-       - data/deepfashion/mm-train-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       style_emb_shape:
-       - 257
-       - 1024
-       style_names:
-       - background
-       - face
-       - hair
-       - headwear
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
-   val:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_512
-       style_dir: styles
-       style_postfix: _hidden
-       mask_dir: mask_512_padded
-       data_files:
-       - data/deepfashion/mm-test-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       #sample_ratio: 0.1
-       style_emb_shape:
-       - 257
-       - 1024
-       style_names:
-       - background
-       - face
-       - hair
-       - headwear
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
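All of these deleted configs follow the `target`/`params` convention of the latent-diffusion codebase that ControlNet (and hence ViscoNet) builds on: `target` names a class by dotted path and `params` supplies its constructor kwargs. A minimal sketch of how such a file is typically turned into a live model, assuming the standard `ldm.util.instantiate_from_config` helper is present in this repo as it is in the upstream code:

```python
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config  # resolves `target`, calls it with `params`

# Load the YAML and build ViscoNetLDM; the nested *_config blocks
# (control_stage_config, unet_config, ...) are instantiated the same
# way inside the model's own __init__.
config = OmegaConf.load("configs/mm_local_style.yaml")
model = instantiate_from_config(config.model)
```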
configs/pose_transfer.yaml DELETED
@@ -1,175 +0,0 @@
- model:
-   target: visconet.visconet.ViscoNetLDM
-   params:
-     linear_start: 0.00085
-     linear_end: 0.0120
-     num_timesteps_cond: 1
-     log_every_t: 200
-     timesteps: 1000
-     first_stage_key: "jpg"
-     cond_stage_key: "txt"
-     control_key: "hint"
-     control_crossattn_key: "styles"
-     mask_key: "human_mask"
-     image_size: 64
-     channels: 4
-     cond_stage_trainable: false
-     conditioning_key: crossattn
-     monitor: val/loss_simple_ema
-     scale_factor: 0.18215
-     use_ema: False
-     only_mid_control: False
-
-     control_cond_config:
-       target: visconet.modules.ProjectLocalStyle
-       #target: visconet.modules.ClipImageEncoder
-       # params:
-       #   context_dim: 1024
-
-     control_stage_config:
-       target: cldm.cldm.ControlNet
-       params:
-         use_checkpoint: True
-         image_size: 32 # unused
-         in_channels: 4
-         hint_channels: 3
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_head_channels: 64 # need to fix for flash-attn
-         use_spatial_transformer: True
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         legacy: False
-
-     unet_config:
-       target: cldm.cldm.ControlledUnetModel
-       params:
-         use_checkpoint: True
-         image_size: 32 # unused
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_head_channels: 64 # need to fix for flash-attn
-         use_spatial_transformer: True
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         legacy: False
-
-     first_stage_config:
-       target: ldm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           #attn_type: "vanilla-xformers"
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult:
-           - 1
-           - 2
-           - 4
-           - 4
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
-
-     cond_stage_config:
-       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-       params:
-         freeze: True
-         layer: "penultimate"
-     style_embedding_config:
-       target: scripts.image_emb_hidden.ClipImageEncoder
-
- dataset:
-   train:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_hand_512
-       style_dir: styles
-       style_postfix: _hidden
-       mask_dir: smpl_256
-       data_files:
-       - data/deepfashion/pairs-train-all.csv
-       - data/deepfashion/solo-train-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       style_emb_shape:
-       - 257
-       - 1024
-       style_names:
-       - background
-       - face
-       - hair
-       - headwear
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
-   val:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_hand_512
-       style_dir: styles
-       style_postfix: _hidden
-       mask_dir: smpl_256
-       data_files:
-       - data/deepfashion/solo-test-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       sample_ratio: 0.1
-       style_emb_shape:
-       - 257
-       - 1024
-       style_names:
-       - background
-       - face
-       - hair
-       - headwear
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
-
-   test:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_hand_512
-       style_dir: styles
-       style_postfix: _hidden
-       mask_dir: smpl_256
-       data_files:
-       - data/deepfashion/pairs-test-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       style_emb_shape:
-       - 257
-       - 1024
-       style_names:
-       - background
-       - face
-       - hair
-       - headwear
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
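Unlike `mm_local_style.yaml`, this config defines a `test` split (`pairs-test-all.csv`) alongside `train` and `val`. A sketch of wiring the three dataset blocks into PyTorch loaders, assuming the same `instantiate_from_config` helper; the batch size and worker count here are placeholders, not values from the repo:

```python
from omegaconf import OmegaConf
from torch.utils.data import DataLoader
from ldm.util import instantiate_from_config

cfg = OmegaConf.load("configs/pose_transfer.yaml")
# One DeepFashionDataset per split; only the training loader is shuffled.
loaders = {
    split: DataLoader(instantiate_from_config(cfg.dataset[split]),
                      batch_size=4, shuffle=(split == "train"), num_workers=4)
    for split in ("train", "val", "test")
}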
configs/visconet_v15.yaml DELETED
@@ -1,145 +0,0 @@
- model:
-   target: visconet.visconet.ViscoNetLDM
-   params:
-     linear_start: 0.00085
-     linear_end: 0.0120
-     num_timesteps_cond: 1
-     log_every_t: 200
-     timesteps: 1000
-     first_stage_key: "jpg"
-     cond_stage_key: "txt"
-     control_key: "hint"
-     control_crossattn_key: "styles"
-     mask_key: "human_mask"
-     image_size: 64
-     channels: 4
-     cond_stage_trainable: false
-     conditioning_key: crossattn
-     monitor: val/loss_simple_ema
-     scale_factor: 0.18215
-     use_ema: False
-     only_mid_control: False
-
-     control_cond_config:
-       target: visconet.modules.LinearProj
-       #target: visconet.modules.ClipImageEncoder
-       params:
-         context_dim: 768
-
-     control_stage_config:
-       target: cldm.cldm.ControlNet
-       params:
-         image_size: 32 # unused
-         in_channels: 4
-         hint_channels: 3
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_heads: 8
-         use_spatial_transformer: True
-         transformer_depth: 1
-         context_dim: 768
-         use_checkpoint: True
-         legacy: False
-
-     unet_config:
-       target: cldm.cldm.ControlledUnetModel
-       params:
-         image_size: 32 # unused
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_heads: 8
-         use_spatial_transformer: True
-         transformer_depth: 1
-         context_dim: 768
-         use_checkpoint: True
-         legacy: False
-
-     first_stage_config:
-       target: ldm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult:
-           - 1
-           - 2
-           - 4
-           - 4
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
-
-     cond_stage_config:
-       target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
-
-     style_embedding_config:
-       target: scripts.image_emb.ClipImageEncoder
-       params:
-         context_dim: 768
-
-
- dataset:
-   train:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_512
-       style_dir: styles
-       mask_dir: mask_512_padded
-       data_files:
-       - data/deepfashion/mm-train-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       style_emb_shape:
-       - 1
-       - 768 #control_stage_config.params.context_dim
-       style_names:
-       - face
-       - hair
-       - headwear
-       - background
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
-
-   val:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_512
-       style_dir: styles
-       mask_dir: mask_512_padded
-       data_files:
-       - data/deepfashion/mm-test-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       sample_ratio: 0.1
-       style_emb_shape:
-       - 1
-       - 768 #control_stage_config.params.context_dim
-       style_names:
-       - face
-       - hair
-       - headwear
-       - background
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
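The `_v15` variant targets Stable Diffusion 1.x: `FrozenCLIPEmbedder` (CLIP ViT-L/14) and `context_dim: 768` throughout, with one pooled 768-d CLIP image embedding per `style_names` entry (`style_emb_shape: [1, 768]`). The `visconet.modules.LinearProj` implementation is not part of this diff; below is a purely hypothetical sketch of what such a projection plausibly does, with the class name, shapes, and `emb_dim` all assumed:

```python
import torch
import torch.nn as nn

class LinearProjSketch(nn.Module):
    """Hypothetical stand-in for visconet.modules.LinearProj: projects the
    stacked per-category style embeddings into the UNet cross-attention space."""
    def __init__(self, context_dim: int = 768, emb_dim: int = 768):
        super().__init__()
        self.proj = nn.Linear(emb_dim, context_dim)

    def forward(self, styles: torch.Tensor) -> torch.Tensor:
        # styles: (B, num_styles, 1, emb_dim) per style_emb_shape
        # -> (B, num_styles, context_dim) tokens for cross-attention
        return self.proj(styles.squeeze(2))
```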
configs/visconet_v1_1.yaml DELETED
@@ -1,168 +0,0 @@
- model:
-   target: visconet.visconet.ViscoNetLDM
-   params:
-     linear_start: 0.00085
-     linear_end: 0.0120
-     num_timesteps_cond: 1
-     log_every_t: 200
-     timesteps: 1000
-     first_stage_key: "jpg"
-     cond_stage_key: "txt"
-     control_key: "hint"
-     control_crossattn_key: "styles"
-     mask_key: "human_mask"
-     image_size: 64
-     channels: 4
-     cond_stage_trainable: false
-     conditioning_key: crossattn
-     monitor: val/loss_simple_ema
-     scale_factor: 0.18215
-     use_ema: False
-     only_mid_control: False
-
-     scheduler_config:
-       target: torch.optim.lr_scheduler.ReduceLROnPlateau
-       monitor: val/loss_simple_ema
-       params:
-         mode: min
-         factor: 0.5
-         patience: 3
-         cooldown: 0
-         min_lr: 0.00001
-         threshold: 0.001
-         verbose: True
-
-     control_cond_config:
-       target: visconet.modules.ProjectLocalStyle
-       params:
-         pool_size: 9
-         local_emb_size: 257
-         bias: True
-       #target: visconet.modules.ClipImageEncoder
-
-     control_stage_config:
-       target: cldm.cldm.ControlNet
-       params:
-         use_checkpoint: True
-         image_size: 32 # unused
-         in_channels: 4
-         hint_channels: 3
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_head_channels: 64 # need to fix for flash-attn
-         use_spatial_transformer: True
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         legacy: False
-
-     unet_config:
-       target: cldm.cldm.ControlledUnetModel
-       params:
-         use_checkpoint: True
-         image_size: 32 # unused
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_head_channels: 64 # need to fix for flash-attn
-         use_spatial_transformer: True
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         legacy: False
-
-     first_stage_config:
-       target: ldm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           #attn_type: "vanilla-xformers"
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult:
-           - 1
-           - 2
-           - 4
-           - 4
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
-
-     cond_stage_config:
-       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-       params:
-         freeze: True
-         layer: "penultimate"
-
-     style_embedding_config:
-       target: scripts.image_emb_hidden.ClipImageEncoder
-
- dataset:
-   train:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_hand_default_512
-       style_dir: styles_default
-       style_postfix: _hidden
-       mask_dir: smpl_256
-       data_files:
-       - data/deepfashion/pairs-train-all.csv
-       - data/deepfashion/solo-train-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       style_emb_shape:
-       - 257
-       - 1024
-       crop_shape:
-       - 512
-       - 384
-       style_names:
-       - face
-       - hair
-       - headwear
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
-   val:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_hand_default_512
-       style_dir: styles_default
-       style_postfix: _hidden
-       mask_dir: smpl_256
-       data_files:
-       - data/deepfashion/pairs-test-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       sample_ratio: 1.0
-       style_emb_shape:
-       - 257
-       - 1024
-       crop_shape:
-       - 512
-       - 384
-       style_names:
-       - face
-       - hair
-       - headwear
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
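The `_v1_1` config is the only one with a `scheduler_config`, keying `ReduceLROnPlateau` to the same `val/loss_simple_ema` metric the checkpoint monitor uses. A standalone sketch of the equivalent scheduler behaviour in plain PyTorch; the `AdamW` optimizer, dummy module, and loss value are placeholder assumptions, only the scheduler kwargs come from the config:

```python
import torch

net = torch.nn.Linear(8, 8)                          # placeholder module
opt = torch.optim.AdamW(net.parameters(), lr=1e-4)   # optimizer choice is an assumption
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(
    opt, mode="min", factor=0.5, patience=3,
    cooldown=0, min_lr=0.00001, threshold=0.001, verbose=True)

# After each validation pass, step with the monitored metric:
val_loss = 0.123   # stand-in for val/loss_simple_ema
sched.step(val_loss)  # halves the LR after 3 epochs without a 0.001 improvement
```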
configs/visconet_v21.yaml DELETED
@@ -1,149 +0,0 @@
- model:
-   target: visconet.visconet.ViscoNetLDM
-   params:
-     linear_start: 0.00085
-     linear_end: 0.0120
-     num_timesteps_cond: 1
-     log_every_t: 200
-     timesteps: 1000
-     first_stage_key: "jpg"
-     cond_stage_key: "txt"
-     control_key: "hint"
-     control_crossattn_key: "styles"
-     mask_key: "human_mask"
-     image_size: 64
-     channels: 4
-     cond_stage_trainable: false
-     conditioning_key: crossattn
-     monitor: val/loss_simple_ema
-     scale_factor: 0.18215
-     use_ema: False
-     only_mid_control: False
-
-     control_cond_config:
-       target: visconet.modules.LinearProj
-       #target: visconet.modules.ClipImageEncoder
-       params:
-         context_dim: 1024
-
-     control_stage_config:
-       target: cldm.cldm.ControlNet
-       params:
-         use_checkpoint: True
-         image_size: 32 # unused
-         in_channels: 4
-         hint_channels: 3
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_head_channels: 64 # need to fix for flash-attn
-         use_spatial_transformer: True
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         legacy: False
-
-     unet_config:
-       target: cldm.cldm.ControlledUnetModel
-       params:
-         use_checkpoint: True
-         image_size: 32 # unused
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [ 4, 2, 1 ]
-         num_res_blocks: 2
-         channel_mult: [ 1, 2, 4, 4 ]
-         num_head_channels: 64 # need to fix for flash-attn
-         use_spatial_transformer: True
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         legacy: False
-
-     first_stage_config:
-       target: ldm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           #attn_type: "vanilla-xformers"
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult:
-           - 1
-           - 2
-           - 4
-           - 4
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
-
-     cond_stage_config:
-       target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-       params:
-         freeze: True
-         layer: "penultimate"
-     style_embedding_config:
-       target: scripts.image_emb.ClipImageEncoder
-       params:
-         context_dim: 768
-
- dataset:
-   train:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_512
-       style_dir: styles
-       mask_dir: mask_512_padded
-       data_files:
-       - data/deepfashion/mm-train-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       style_emb_shape:
-       - 1
-       - 768 #control_stage_config.params.context_dim
-       style_names:
-       - face
-       - hair
-       - headwear
-       - background
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
-
-   val:
-     target: visconet.deepfashion.DeepFashionDataset
-     params:
-       image_root: "/home/soon/datasets/deepfashion_inshop"
-       image_dir: img_512_padded
-       pose_dir: openpose_512
-       style_dir: styles
-       mask_dir: mask_512_padded
-       data_files:
-       - data/deepfashion/mm-test-all.csv
-       map_file: data/deepfashion/deepfashion_map.csv
-       sample_ratio: 0.1
-       style_emb_shape:
-       - 1
-       - 768 #control_stage_config.params.context_dim
-       style_names:
-       - face
-       - hair
-       - headwear
-       - background
-       - top
-       - outer
-       - bottom
-       - shoes
-       - accesories
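Note the mixed dimensions in `_v21`: the SD 2.1 backbone cross-attends in 1024-d (`context_dim: 1024`) while the stored style embeddings are pooled 768-d CLIP vectors (`style_emb_shape: [1, 768]`), so `LinearProj(context_dim=1024)` presumably bridges 768 to 1024. A hypothetical sketch of assembling the per-category style stack this dataset config describes; the one-`.npy`-per-category layout and the zero-fill for missing garments are assumptions, not documented behaviour:

```python
import numpy as np

STYLE_NAMES = ["face", "hair", "headwear", "background",
               "top", "outer", "bottom", "shoes", "accesories"]  # spelling as in the config
EMB_SHAPE = (1, 768)  # style_emb_shape

def load_style_stack(style_folder: str) -> np.ndarray:
    """Stack one pooled CLIP embedding per category; zero out missing ones."""
    embs = []
    for name in STYLE_NAMES:
        try:
            emb = np.load(f"{style_folder}/{name}.npy")
        except FileNotFoundError:
            emb = np.zeros(EMB_SHAPE, dtype=np.float32)
        embs.append(emb)
    return np.stack(embs)  # (9, 1, 768)
```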
requirements.txt CHANGED
@@ -2,6 +2,7 @@ pip==20.3
  torch==1.12.1
  torchvision==0.13.1
  numpy==1.23.1
+ git+https://github.com/openai/CLIP.git
  gradio==3.39.0
  albumentations==1.3.0
  imageio==2.9.0
@@ -26,5 +27,4 @@ basicsr==1.4.2
  xformers==0.0.13
  deepface
  #opencv-contrib-python
- --extra-index-url https://download.pytorch.org/whl/cu113
- torch
+ --extra-index-url https://download.pytorch.org/whl/cu113
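The requirements change pins OpenAI's CLIP as a direct GitHub dependency and drops the trailing unpinned `torch` line, which would otherwise compete with the `torch==1.12.1` pin resolved against the cu113 extra index. A minimal smoke test for the resulting environment, assuming `pip install -r requirements.txt` completed in a CUDA 11.3 setup:

```python
import torch
import clip  # provided by git+https://github.com/openai/CLIP.git

print(torch.__version__)  # expect 1.12.1 from the cu113 extra index
model, preprocess = clip.load("ViT-L/14", device="cpu")  # downloads weights on first use
print(model.visual.output_dim)  # 768-d image embeddings, matching the v15 configs
```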