File size: 6,686 Bytes
ec43f9b
 
 
 
0004c96
270a849
ec43f9b
 
 
 
 
 
 
 
 
 
 
0d206f3
ec43f9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d206f3
ec43f9b
 
 
 
0d206f3
ec43f9b
 
 
 
 
 
 
 
 
 
 
 
0d206f3
 
 
 
 
ec43f9b
 
 
0d206f3
3000b0a
 
ec43f9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d206f3
 
ec43f9b
 
 
 
3000b0a
ec43f9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf6a8e1
ec43f9b
 
 
bf6a8e1
ec43f9b
 
 
3000b0a
ec43f9b
 
 
3000b0a
ec43f9b
 
 
 
 
 
 
 
 
 
 
 
 
bf6a8e1
ec43f9b
 
 
3000b0a
ec43f9b
 
 
3000b0a
ec43f9b
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# Top-level settings for the ACE_FLUX.1_dev configuration.
NAME: ACE_FLUX.1_dev
# Booleans use the canonical lowercase spelling; the parsed values are unchanged.
IS_DEFAULT: true
USE_DYNAMIC_MODEL: false
# NOTE(review): presumably selects the inference class; the lookup is not
# visible in this file -- confirm against the consumer.
INFERENCE_TYPE: ACE_FLUX
# NOTE(review): presumably sequence-length limits for the overall and the
# source inputs respectively -- confirm against the ACE_FLUX inference code.
MAX_SEQ_LENGTH: 3072
SRC_MAX_SEQ_LENGTH: 2048
# Default request values (INPUT), result slots (OUTPUT) and per-module call
# settings (MODULES_PARAS).
DEFAULT_PARAS:
  # No values are defined here; the key parses as null.
  PARAS: null
  INPUT:
    # Image, mask and task have no default (null).
    INPUT_IMAGE: null
    INPUT_MASK: null
    TASK: null
    PROMPT: ''
    OUTPUT_HEIGHT: 1024
    OUTPUT_WIDTH: 1024
    SAMPLER: flow_euler
    SAMPLE_STEPS: 20
    GUIDE_SCALE: 3.5
    # NOTE(review): -1 presumably requests a random seed -- confirm in the consumer.
    SEED: -1
    TAR_INDEX: 0
    ALIGN: false
  # Output slots carry no default value (null).
  OUTPUT:
    LATENT: null
    IMAGES: null
    SEED: null
  # One entry per module; each FUNCTION item lists a function NAME together
  # with the DTYPE and INPUT names configured for it.
  MODULES_PARAS:
    FIRST_STAGE_MODEL:
      FUNCTION:
        - NAME: encode
          DTYPE: bfloat16
          INPUT: ['IMAGE']
        - NAME: decode
          DTYPE: bfloat16
          INPUT: ['LATENT']
      PARAS:
        # NOTE(review): SCALE_FACTOR / SHIFT_FACTOR here (1.5305 / 0.0609) differ
        # from MODEL.FIRST_STAGE_MODEL in this file (0.3611 / 0.1159). Which pair
        # the inference code reads is not visible here -- confirm and align.
        SCALE_FACTOR: 1.5305
        SHIFT_FACTOR: 0.0609
        SIZE_FACTOR: 8
    DIFFUSION_MODEL:
      FUNCTION:
        - NAME: forward
          DTYPE: bfloat16
          INPUT: ['SAMPLE_STEPS', 'SAMPLE', 'GUIDE_SCALE']
    COND_STAGE_MODEL:
      FUNCTION:
        - NAME: encode_list
          DTYPE: bfloat16
          INPUT: ['PROMPT']
#
# Model definition: wrapper settings first, then the sub-model sections.
MODEL:
  NAME: LatentDiffusionACEFlux
  PARAMETERIZATION: rf
  # No checkpoint is set at this level (null); DIFFUSION_MODEL and
  # FIRST_STAGE_MODEL each set their own PRETRAINED_MODEL.
  PRETRAINED_MODEL: null
  IGNORE_KEYS: []
  SIZE_FACTOR: 8
  # Image placeholder strings; every entry also appears in
  # COND_STAGE_MODEL.T5_MODEL.ADDED_IDENTIFIER.
  TEXT_IDENTIFIER:
    - '{image}'
    - '{image1}'
    - '{image2}'
    - '{image3}'
    - '{image4}'
    - '{image5}'
    - '{image6}'
    - '{image7}'
    - '{image8}'
    - '{image9}'
  USE_TEXT_POS_EMBEDDINGS: true
  # Diffusion process and its noise scheduler.
  DIFFUSION:
    # NAME -- documented default: 'DiffusionFluxRF'
    NAME: DiffusionFluxRF
    PREDICTION_TYPE: raw
    # NOISE_SCHEDULER -- documented default: ''
    # Children are indented by the same 2-space step as the rest of the file;
    # the resulting mapping is unchanged.
    NOISE_SCHEDULER:
      NAME: FlowMatchFluxShiftScheduler
      SHIFT: true
      SIGMOID_SCALE: 1
      BASE_SHIFT: 0.5
      MAX_SHIFT: 1.15
  # Diffusion backbone. The "default" noted on each key is the documented
  # default of the option, not necessarily the value used here.
  DIFFUSION_MODEL:
    # Model class name (default: 'Flux').
    NAME: ACEFlux
    # Base weights plus a list of LoRA weight files.
    PRETRAINED_MODEL: hf://black-forest-labs/FLUX.1-dev@flux1-dev.safetensors
    SWIFT_LORA_MODEL:
      - 'hf://scepter-studio/ACE-FLUX.1-dev@ace_flux.1_dev_lora.bin'
    # Model's input channels (int, default: 64).
    IN_CHANNELS: 64
    # Model's hidden size (int, default: 1024).
    HIDDEN_SIZE: 3072
    # Number of heads in the transformer (int, default: 16).
    NUM_HEADS: 24
    # Dimensions of the axes of the positional encoding (list, default: [16, 56, 56]).
    AXES_DIM: [16, 56, 56]
    # Theta for the positional encoding (int, default: 10000).
    THETA: 10000
    # Dimension of the vector input (int, default: 768).
    VEC_IN_DIM: 768
    # Whether to use guidance embedding (bool, default: False).
    GUIDANCE_EMBED: true
    # Dimension of the context input (int, default: 4096).
    CONTEXT_IN_DIM: 4096
    # Ratio of MLP hidden size to hidden size (float, default: 4.0).
    MLP_RATIO: 4.0
    # Whether to use bias in the qkv projection (bool, default: True).
    QKV_BIAS: true
    # Number of transformer blocks (int, default: 19).
    DEPTH: 19
    # Number of transformer blocks in the single-stream part (int, default: 38).
    DEPTH_SINGLE_BLOCKS: 38
    ATTN_BACKEND: pytorch

  #
  # Autoencoder (first stage) with its encoder and decoder sub-sections.
  FIRST_STAGE_MODEL:
    NAME: AutoencoderKLFlux
    EMBED_DIM: 16
    PRETRAINED_MODEL: hf://black-forest-labs/FLUX.1-dev@ae.safetensors
    IGNORE_KEYS: []
    BATCH_SIZE: 8
    USE_CONV: false
    SCALE_FACTOR: 0.3611
    SHIFT_FACTOR: 0.1159
    # ENCODER and DECODER below share CH, OUT_CH, NUM_RES_BLOCKS, IN_CHANNELS,
    # ATTN_RESOLUTIONS, CH_MULT, Z_CHANNELS, DROPOUT and RESAMP_WITH_CONV.
    ENCODER:
      NAME: Encoder
      USE_CHECKPOINT: true
      CH: 128
      OUT_CH: 3
      NUM_RES_BLOCKS: 2
      IN_CHANNELS: 3
      ATTN_RESOLUTIONS: []
      CH_MULT: [1, 2, 4, 4]
      Z_CHANNELS: 16
      # Set only on the encoder.
      DOUBLE_Z: true
      DROPOUT: 0.0
      RESAMP_WITH_CONV: true
    DECODER:
      NAME: Decoder
      USE_CHECKPOINT: true
      CH: 128
      OUT_CH: 3
      NUM_RES_BLOCKS: 2
      IN_CHANNELS: 3
      ATTN_RESOLUTIONS: []
      CH_MULT: [1, 2, 4, 4]
      Z_CHANNELS: 16
      DROPOUT: 0.0
      RESAMP_WITH_CONV: true
      # Set only on the decoder.
      GIVE_PRE_END: false
      TANH_OUT: false
  #
  # Text conditioning: a T5 embedder and a CLIP embedder. The "default" noted
  # on each key is the documented default of the option.
  COND_STAGE_MODEL:
    # Embedder class name (default: 'T5PlusClipFluxEmbedder').
    NAME: T5ACEPlusClipFluxEmbedder
    # T5_MODEL (default: '').
    T5_MODEL:
      # Embedder class name (default: 'HFEmbedder').
      NAME: ACEHFEmbedder
      # Hugging Face class in transformers (default: None).
      HF_MODEL_CLS: T5EncoderModel
      # Model folder path (default: None).
      MODEL_PATH: hf://black-forest-labs/FLUX.1-dev@text_encoder_2/
      # Hugging Face tokenizer class in transformers (default: None).
      HF_TOKENIZER_CLS: T5Tokenizer
      # Tokenizer folder path (default: None).
      TOKENIZER_PATH: hf://black-forest-labs/FLUX.1-dev@tokenizer_2/
      # Extra identifier strings; set for the T5 embedder only.
      ADDED_IDENTIFIER:
        - '<img>'
        - '{image}'
        - '{caption}'
        - '{mask}'
        - '{ref_image}'
        - '{image1}'
        - '{image2}'
        - '{image3}'
        - '{image4}'
        - '{image5}'
        - '{image6}'
        - '{image7}'
        - '{image8}'
        - '{image9}'
      # Max length of input (int, default: 77).
      MAX_LENGTH: 512
      # Output key (str, default: 'last_hidden_state').
      OUTPUT_KEY: last_hidden_state
      # dtype (str, default: 'bfloat16').
      D_TYPE: bfloat16
      # Batch infer (bool, default: False).
      BATCH_INFER: false
      CLEAN: whitespace
    # CLIP_MODEL (default: '').
    CLIP_MODEL:
      # Embedder class name (default: 'HFEmbedder').
      NAME: ACEHFEmbedder
      # Hugging Face class in transformers (default: None).
      HF_MODEL_CLS: CLIPTextModel
      # Model folder path (default: None).
      MODEL_PATH: hf://black-forest-labs/FLUX.1-dev@text_encoder/
      # Hugging Face tokenizer class in transformers (default: None).
      HF_TOKENIZER_CLS: CLIPTokenizer
      # Tokenizer folder path (default: None).
      TOKENIZER_PATH: hf://black-forest-labs/FLUX.1-dev@tokenizer/
      # Max length of input (int, default: 77).
      MAX_LENGTH: 77
      # Output key (str, default: 'last_hidden_state').
      OUTPUT_KEY: pooler_output
      # dtype (str, default: 'bfloat16').
      D_TYPE: bfloat16
      # Batch infer (bool, default: False).
      BATCH_INFER: true
      CLEAN: whitespace