chaojiemao committed on
Commit
ec43f9b
·
verified ·
1 Parent(s): 87e4b76

Rename config/models/ace_0.6b_1024.yaml to config/models/ace_flux_dev.yaml

Browse files
config/models/ace_0.6b_1024.yaml DELETED
@@ -1,132 +0,0 @@
1
- NAME: ACE_0.6B_1024_REFINER
2
- IS_DEFAULT: True
3
- USE_DYNAMIC_MODEL: False
4
- DEFAULT_PARAS:
5
- PARAS:
6
- #
7
- INPUT:
8
- INPUT_IMAGE:
9
- INPUT_MASK:
10
- TASK:
11
- PROMPT: ""
12
- NEGATIVE_PROMPT: ""
13
- OUTPUT_HEIGHT: 1024
14
- OUTPUT_WIDTH: 1024
15
- SAMPLER: ddim
16
- SAMPLE_STEPS: 50
17
- GUIDE_SCALE: 4.5
18
- GUIDE_RESCALE: 0.5
19
- SEED: -1
20
- TAR_INDEX: 0
21
- REFINER_SCALE: 0.2
22
- USE_ACE: True
23
- #REFINER_PROMPT: "High Resolution, Sharpness, Clarity, Detail Enhancement, Noise Reduction, HD, 4k, Image Restoration, HDR"
24
- REFINER_PROMPT: "High Resolution, Sharpness, Clarity, Detail Enhancement, Noise Reduction, HD, 4k, Image Restoration, HDR"
25
- OUTPUT:
26
- LATENT:
27
- IMAGES:
28
- SEED:
29
- MODULES_PARAS:
30
- FIRST_STAGE_MODEL:
31
- FUNCTION:
32
- - NAME: encode
33
- DTYPE: float16
34
- INPUT: ["IMAGE"]
35
- - NAME: decode
36
- DTYPE: float16
37
- INPUT: ["LATENT"]
38
- #
39
- DIFFUSION_MODEL:
40
- FUNCTION:
41
- - NAME: forward
42
- DTYPE: float16
43
- INPUT: ["SAMPLE_STEPS", "SAMPLE", "GUIDE_SCALE"]
44
- #
45
- COND_STAGE_MODEL:
46
- FUNCTION:
47
- - NAME: encode_list_of_list
48
- DTYPE: bfloat16
49
- INPUT: ["PROMPT"]
50
- #
51
- MODEL:
52
- NAME: LatentDiffusionACE
53
- PRETRAINED_MODEL:
54
- IGNORE_KEYS: [ ]
55
- SCALE_FACTOR: 0.18215
56
- SIZE_FACTOR: 8
57
- DECODER_BIAS: 0.5
58
- DEFAULT_N_PROMPT: ""
59
- TEXT_IDENTIFIER: [ '{image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
60
- USE_TEXT_POS_EMBEDDINGS: True
61
- #
62
- DIFFUSION:
63
- NAME: BaseDiffusion
64
- PREDICTION_TYPE: eps
65
- MIN_SNR_GAMMA:
66
- NOISE_SCHEDULER:
67
- NAME: LinearScheduler
68
- NUM_TIMESTEPS: 1000
69
- BETA_MIN: 0.0001
70
- BETA_MAX: 0.02
71
- #
72
- DIFFUSION_MODEL:
73
- NAME: ACE
74
- PRETRAINED_MODEL: hf://scepter-studio/ACE-0.6B-1024px@models/dit/ace_0.6b_1024px.pth
75
- IGNORE_KEYS: [ ]
76
- PATCH_SIZE: 2
77
- IN_CHANNELS: 4
78
- HIDDEN_SIZE: 1152
79
- DEPTH: 28
80
- NUM_HEADS: 16
81
- MLP_RATIO: 4.0
82
- PRED_SIGMA: True
83
- DROP_PATH: 0.0
84
- WINDOW_DIZE: 0
85
- Y_CHANNELS: 4096
86
- MAX_SEQ_LEN: 4096
87
- QK_NORM: True
88
- USE_GRAD_CHECKPOINT: True
89
- ATTENTION_BACKEND: flash_attn
90
- #
91
- FIRST_STAGE_MODEL:
92
- NAME: AutoencoderKL
93
- EMBED_DIM: 4
94
- PRETRAINED_MODEL: hf://scepter-studio/ACE-0.6B-1024px@models/vae/vae.bin
95
- IGNORE_KEYS: []
96
- #
97
- ENCODER:
98
- NAME: Encoder
99
- CH: 128
100
- OUT_CH: 3
101
- NUM_RES_BLOCKS: 2
102
- IN_CHANNELS: 3
103
- ATTN_RESOLUTIONS: [ ]
104
- CH_MULT: [ 1, 2, 4, 4 ]
105
- Z_CHANNELS: 4
106
- DOUBLE_Z: True
107
- DROPOUT: 0.0
108
- RESAMP_WITH_CONV: True
109
- #
110
- DECODER:
111
- NAME: Decoder
112
- CH: 128
113
- OUT_CH: 3
114
- NUM_RES_BLOCKS: 2
115
- IN_CHANNELS: 3
116
- ATTN_RESOLUTIONS: [ ]
117
- CH_MULT: [ 1, 2, 4, 4 ]
118
- Z_CHANNELS: 4
119
- DROPOUT: 0.0
120
- RESAMP_WITH_CONV: True
121
- GIVE_PRE_END: False
122
- TANH_OUT: False
123
- #
124
- COND_STAGE_MODEL:
125
- NAME: T5EmbedderHF
126
- PRETRAINED_MODEL: hf://scepter-studio/ACE-0.6B-1024px@models/text_encoder/t5-v1_1-xxl/
127
- TOKENIZER_PATH: hf://scepter-studio/ACE-0.6B-1024px@models/tokenizer/t5-v1_1-xxl
128
- LENGTH: 120
129
- T5_DTYPE: bfloat16
130
- ADDED_IDENTIFIER: [ '{image}', '{caption}', '{mask}', '{ref_image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
131
- CLEAN: whitespace
132
- USE_GRAD: False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config/models/ace_flux_dev.yaml ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ NAME: ACE_FLUX.1_dev
2
+ IS_DEFAULT: True
3
+ USE_DYNAMIC_MODEL: False
4
+ INFERENCE_TYPE: ACE_FLUX
5
+ MAX_SEQ_LENGTH: 4096
6
+ SRC_MAX_SEQ_LENGTH: 4096
7
+ DEFAULT_PARAS:
8
+ PARAS:
9
+ #
10
+ INPUT:
11
+ INPUT_IMAGE:
12
+ INPUT_MASK:
13
+ TASK:
14
+ PROMPT: ""
15
+ OUTPUT_HEIGHT: 1024
16
+ OUTPUT_WIDTH: 1024
17
+ SAMPLER: flow_euler
18
+ SAMPLE_STEPS: 50
19
+ GUIDE_SCALE: 3.5
20
+ SEED: -1
21
+ TAR_INDEX: 0
22
+ ALIGN: False
23
+ OUTPUT:
24
+ LATENT:
25
+ IMAGES:
26
+ SEED:
27
+ MODULES_PARAS:
28
+ FIRST_STAGE_MODEL:
29
+ FUNCTION:
30
+ - NAME: encode
31
+ DTYPE: bfloat16
32
+ INPUT: [ "IMAGE" ]
33
+ - NAME: decode
34
+ DTYPE: bfloat16
35
+ INPUT: [ "LATENT" ]
36
+ PARAS:
37
+ SCALE_FACTOR: 1.5305
38
+ SHIFT_FACTOR: 0.0609
39
+ SIZE_FACTOR: 8
40
+ DIFFUSION_MODEL:
41
+ FUNCTION:
42
+ - NAME: forward
43
+ DTYPE: bfloat16
44
+ INPUT: [ "SAMPLE_STEPS", "SAMPLE", "GUIDE_SCALE" ]
45
+ COND_STAGE_MODEL:
46
+ FUNCTION:
47
+ - NAME: encode_list_of_list
48
+ DTYPE: bfloat16
49
+ INPUT: [ "PROMPT" ]
50
+ REF_COND_STAGE_MODEL:
51
+ FUNCTION:
52
+ - NAME: encode_list_of_list
53
+ DTYPE: bfloat16
54
+ INPUT: [ "IMAGE" ]
55
+
56
+ #
57
+ MODEL:
58
+ NAME: LatentDiffusionFluxEdit
59
+ PARAMETERIZATION: rf
60
+ PRETRAINED_MODEL:
61
+ IGNORE_KEYS: [ ]
62
+ SIZE_FACTOR: 8
63
+ TEXT_IDENTIFIER: [ '{image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
64
+ IMAGE_TOKEN: '<img>'
65
+ USE_TEXT_POS_EMBEDDINGS: True
66
+ DIFFUSION:
67
+ # NAME DESCRIPTION: TYPE: default: 'DiffusionFluxRF'
68
+ NAME: DiffusionFluxRF
69
+ PREDICTION_TYPE: raw
70
+ # NOISE_SCHEDULER DESCRIPTION: TYPE: default: ''
71
+ NOISE_SCHEDULER:
72
+ # NAME DESCRIPTION: TYPE: default: 'FlowMatchSigmaScheduler'
73
+ NAME: FlowMatchFluxShiftScheduler
74
+ # SHIFT DESCRIPTION: Whether to use timestep shift; default is True. TYPE: bool default: True
75
+ SHIFT: True
76
+ # SIGMOID_SCALE DESCRIPTION: The scale of sigmoid function for sampling timesteps. TYPE: int default: 1
77
+ SIGMOID_SCALE: 1
78
+ # BASE_SHIFT DESCRIPTION: The base shift factor for the timestep. TYPE: float default: 0.5
79
+ BASE_SHIFT: 0.5
80
+ # MAX_SHIFT DESCRIPTION: The max shift factor for the timestep. TYPE: float default: 1.15
81
+ MAX_SHIFT: 1.15
82
+ #
83
+ DIFFUSION_MODEL:
84
+ # NAME DESCRIPTION: TYPE: default: 'Flux'
85
+ NAME: FluxEdit
86
+ PRETRAINED_MODEL:
87
+ DIFFUSERS_LORA_MODEL:
88
+ PRETRAIN_ADAPTER:
89
+ # IN_CHANNELS DESCRIPTION: model's input channels. TYPE: int default: 64
90
+ IN_CHANNELS: 64
91
+ # OUT_CHANNELS DESCRIPTION: model's output channels. TYPE: int default: 64
92
+ OUT_CHANNELS: 64
93
+ # HIDDEN_SIZE DESCRIPTION: model's hidden size. TYPE: int default: 1024
94
+ HIDDEN_SIZE: 3072
95
+ REDUX_DIM: 1152
96
+ # NUM_HEADS DESCRIPTION: number of heads in the transformer. TYPE: int default: 16
97
+ NUM_HEADS: 24
98
+ # AXES_DIM DESCRIPTION: dimensions of the axes of the positional encoding. TYPE: list default: [16, 56, 56]
99
+ AXES_DIM: [ 16, 56, 56 ]
100
+ # THETA DESCRIPTION: theta for positional encoding. TYPE: int default: 10000
101
+ THETA: 10000
102
+ # VEC_IN_DIM DESCRIPTION: dimension of the vector input. TYPE: int default: 768
103
+ VEC_IN_DIM: 768
104
+ # GUIDANCE_EMBED DESCRIPTION: whether to use guidance embedding. TYPE: bool default: False
105
+ GUIDANCE_EMBED: True
106
+ # CONTEXT_IN_DIM DESCRIPTION: dimension of the context input. TYPE: int default: 4096
107
+ CONTEXT_IN_DIM: 4096
108
+ # MLP_RATIO DESCRIPTION: ratio of mlp hidden size to hidden size. TYPE: float default: 4.0
109
+ MLP_RATIO: 4.0
110
+ # QKV_BIAS DESCRIPTION: whether to use bias in qkv projection. TYPE: bool default: True
111
+ QKV_BIAS: True
112
+ # DEPTH DESCRIPTION: number of transformer blocks. TYPE: int default: 19
113
+ DEPTH: 19
114
+ # DEPTH_SINGLE_BLOCKS DESCRIPTION: number of transformer blocks in the single stream block. TYPE: int default: 38
115
+ DEPTH_SINGLE_BLOCKS: 38
116
+ ATTN_BACKEND: flash_attn
117
+
118
+ #
119
+ FIRST_STAGE_MODEL:
120
+ NAME: AutoencoderKLFlux
121
+ EMBED_DIM: 16
122
+ PRETRAINED_MODEL:
123
+ IGNORE_KEYS: [ ]
124
+ BATCH_SIZE: 8
125
+ USE_CONV: False
126
+ SCALE_FACTOR: 0.3611
127
+ SHIFT_FACTOR: 0.1159
128
+ #
129
+ ENCODER:
130
+ NAME: Encoder
131
+ USE_CHECKPOINT: True
132
+ CH: 128
133
+ OUT_CH: 3
134
+ NUM_RES_BLOCKS: 2
135
+ IN_CHANNELS: 3
136
+ ATTN_RESOLUTIONS: [ ]
137
+ CH_MULT: [ 1, 2, 4, 4 ]
138
+ Z_CHANNELS: 16
139
+ DOUBLE_Z: True
140
+ DROPOUT: 0.0
141
+ RESAMP_WITH_CONV: True
142
+ #
143
+ DECODER:
144
+ NAME: Decoder
145
+ USE_CHECKPOINT: True
146
+ CH: 128
147
+ OUT_CH: 3
148
+ NUM_RES_BLOCKS: 2
149
+ IN_CHANNELS: 3
150
+ ATTN_RESOLUTIONS: [ ]
151
+ CH_MULT: [ 1, 2, 4, 4 ]
152
+ Z_CHANNELS: 16
153
+ DROPOUT: 0.0
154
+ RESAMP_WITH_CONV: True
155
+ GIVE_PRE_END: False
156
+ TANH_OUT: False
157
+ #
158
+ COND_STAGE_MODEL:
159
+ # NAME DESCRIPTION: TYPE: default: 'T5PlusClipFluxEmbedder'
160
+ NAME: T5PlusClipFluxEmbedder
161
+ # T5_MODEL DESCRIPTION: TYPE: default: ''
162
+ T5_MODEL:
163
+ # NAME DESCRIPTION: TYPE: default: 'HFEmbedder'
164
+ NAME: HFEmbedder
165
+ # HF_MODEL_CLS DESCRIPTION: Hugging Face class in transformers TYPE: NoneType default: None
166
+ HF_MODEL_CLS: T5EncoderModel
167
+ # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
168
+ MODEL_PATH:
169
+ # HF_TOKENIZER_CLS DESCRIPTION: Hugging Face class in transformers TYPE: NoneType default: None
170
+ HF_TOKENIZER_CLS: T5Tokenizer
171
+ # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
172
+ TOKENIZER_PATH:
173
+ ADDED_IDENTIFIER: [ '<img>','{image}', '{caption}', '{mask}', '{ref_image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
174
+ # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
175
+ MAX_LENGTH: 512
176
+ # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
177
+ OUTPUT_KEY: last_hidden_state
178
+ # D_TYPE DESCRIPTION: dtype TYPE: str default: 'bfloat16'
179
+ D_TYPE: bfloat16
180
+ # BATCH_INFER DESCRIPTION: batch infer TYPE: bool default: False
181
+ BATCH_INFER: False
182
+ CLEAN: whitespace
183
+ # CLIP_MODEL DESCRIPTION: TYPE: default: ''
184
+ CLIP_MODEL:
185
+ # NAME DESCRIPTION: TYPE: default: 'HFEmbedder'
186
+ NAME: HFEmbedder
187
+ # HF_MODEL_CLS DESCRIPTION: Hugging Face class in transformers TYPE: NoneType default: None
188
+ HF_MODEL_CLS: CLIPTextModel
189
+ # MODEL_PATH DESCRIPTION: model folder path TYPE: NoneType default: None
190
+ MODEL_PATH:
191
+ # HF_TOKENIZER_CLS DESCRIPTION: Hugging Face class in transformers TYPE: NoneType default: None
192
+ HF_TOKENIZER_CLS: CLIPTokenizer
193
+ # TOKENIZER_PATH DESCRIPTION: tokenizer folder path TYPE: NoneType default: None
194
+ TOKENIZER_PATH:
195
+ # MAX_LENGTH DESCRIPTION: max length of input TYPE: int default: 77
196
+ MAX_LENGTH: 77
197
+ # OUTPUT_KEY DESCRIPTION: output key TYPE: str default: 'last_hidden_state'
198
+ OUTPUT_KEY: pooler_output
199
+ # D_TYPE DESCRIPTION: dtype TYPE: str default: 'bfloat16'
200
+ D_TYPE: bfloat16
201
+ # BATCH_INFER DESCRIPTION: batch infer TYPE: bool default: False
202
+ BATCH_INFER: True
203
+ CLEAN: whitespace