File size: 3,045 Bytes
2a00960
 
62720f9
2a00960
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fbb38c
2a00960
 
 
 
5fbb38c
2a00960
 
 
 
 
 
 
 
 
 
5fbb38c
2a00960
 
 
 
 
 
 
 
 
5fbb38c
2a00960
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5fbb38c
2a00960
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Name of this configuration entry, followed by two boolean flags that are
# both switched off here.
NAME: ACE_0.6B_512
IS_DEFAULT: False
USE_DYNAMIC_MODEL: False
# Default parameters: INPUT fields with their default values, OUTPUT field
# names, and per-module FUNCTION descriptions under MODULES_PARAS.
DEFAULT_PARAS:
  # PARAS has no value (a bare `key:` parses as null).
  PARAS:
  # Input fields. Keys written without a value are null, i.e. no default is
  # given for them in this file.
  INPUT:
    INPUT_IMAGE:
    INPUT_MASK:
    TASK:
    # Both prompts default to the empty string (not null).
    PROMPT: ""
    NEGATIVE_PROMPT: ""
    # 512 x 512 output, matching the "512" suffix in NAME.
    OUTPUT_HEIGHT: 512
    OUTPUT_WIDTH: 512
    SAMPLER: ddim
    SAMPLE_STEPS: 20
    GUIDE_SCALE: 4.5
    GUIDE_RESCALE: 0.5
    # NOTE(review): -1 presumably means "pick a random seed" - confirm
    # against the code that consumes this value.
    SEED: -1
    TAR_INDEX: 0
  # Output field names; none of them carries a value here.
  OUTPUT:
    LATENT:
    IMAGES:
    SEED:
  # For each module: a list of FUNCTION entries, each giving the function
  # NAME, a DTYPE, and the list of INPUT names.
  MODULES_PARAS:
    # encode and decode are both listed with float16.
    FIRST_STAGE_MODEL:
      FUNCTION:
        - NAME: encode
          DTYPE: float16
          INPUT: ["IMAGE"]
        - NAME: decode
          DTYPE: float16
          INPUT: ["LATENT"]
    # Single forward entry, float16.
    DIFFUSION_MODEL:
      FUNCTION:
        - NAME: forward
          DTYPE: float16
          INPUT: ["SAMPLE_STEPS", "SAMPLE", "GUIDE_SCALE"]
    # The only entry listed with bfloat16 (same value as
    # MODEL.COND_STAGE_MODEL.T5_DTYPE further down in this file).
    COND_STAGE_MODEL:
      FUNCTION:
        - NAME: encode_list_of_list
          DTYPE: bfloat16
          INPUT: ["PROMPT"]
#
# Model definition: top-level LatentDiffusionACE settings followed by four
# sub-sections (DIFFUSION, DIFFUSION_MODEL, FIRST_STAGE_MODEL,
# COND_STAGE_MODEL).
MODEL:
  NAME: LatentDiffusionACE
  # No checkpoint is set at this level (null); DIFFUSION_MODEL,
  # FIRST_STAGE_MODEL and COND_STAGE_MODEL each set their own PRETRAINED_MODEL.
  PRETRAINED_MODEL:
  IGNORE_KEYS: [ ]
  # NOTE(review): SCALE_FACTOR / SIZE_FACTOR are presumably the latent scaling
  # constant and the image-to-latent downsampling ratio - confirm in the model
  # code.
  SCALE_FACTOR: 0.18215
  SIZE_FACTOR: 8
  DECODER_BIAS: 0.5
  DEFAULT_N_PROMPT: ""
  # Ten placeholders: '{image}' and '{image1}' .. '{image9}'. Every one of them
  # also appears in COND_STAGE_MODEL.ADDED_IDENTIFIER below.
  TEXT_IDENTIFIER: [ '{image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
  USE_TEXT_POS_EMBEDDINGS: True
  # Diffusion process: eps prediction with a LinearScheduler over 1000
  # timesteps, beta from 0.0001 to 0.02. MIN_SNR_GAMMA is left null.
  DIFFUSION:
    NAME: BaseDiffusion
    PREDICTION_TYPE: eps
    MIN_SNR_GAMMA:
    NOISE_SCHEDULER:
      NAME: LinearScheduler
      NUM_TIMESTEPS: 1000
      BETA_MIN: 0.0001
      BETA_MAX: 0.02
  # Diffusion backbone (NAME: ACE) and its checkpoint location.
  DIFFUSION_MODEL:
    NAME: ACE
    PRETRAINED_MODEL: hf://scepter-studio/ACE-0.6B-512px@models/dit/ace_0.6b_512px.pth
    IGNORE_KEYS: [ ]
    PATCH_SIZE: 2
    # Same value (4) as FIRST_STAGE_MODEL.EMBED_DIM and the Z_CHANNELS entries.
    IN_CHANNELS: 4
    HIDDEN_SIZE: 1152
    DEPTH: 28
    NUM_HEADS: 16
    MLP_RATIO: 4.0
    PRED_SIGMA: True
    DROP_PATH: 0.0
    # NOTE(review): the key is spelled WINDOW_DIZE - possibly a typo of
    # WINDOW_SIZE. Check which key the ACE model code reads before renaming;
    # if it reads WINDOW_SIZE, this line is currently ignored.
    WINDOW_DIZE: 0
    Y_CHANNELS: 4096
    MAX_SEQ_LEN: 1024
    QK_NORM: True
    USE_GRAD_CHECKPOINT: True
    ATTENTION_BACKEND: flash_attn
  # Autoencoder (NAME: AutoencoderKL) with separate ENCODER / DECODER settings.
  FIRST_STAGE_MODEL:
    NAME: AutoencoderKL
    EMBED_DIM: 4
    PRETRAINED_MODEL: hf://scepter-studio/ACE-0.6B-512px@models/vae/vae.bin
    IGNORE_KEYS: []
    # Encoder settings; DOUBLE_Z appears only here, not in DECODER.
    ENCODER:
      NAME: Encoder
      CH: 128
      OUT_CH: 3
      NUM_RES_BLOCKS: 2
      IN_CHANNELS: 3
      ATTN_RESOLUTIONS: [ ]
      CH_MULT: [ 1, 2, 4, 4 ]
      Z_CHANNELS: 4
      DOUBLE_Z: True
      DROPOUT: 0.0
      RESAMP_WITH_CONV: True
    # Decoder settings; every key shared with ENCODER has the same value, and
    # GIVE_PRE_END / TANH_OUT appear only here.
    DECODER:
      NAME: Decoder
      CH: 128
      OUT_CH: 3
      NUM_RES_BLOCKS: 2
      IN_CHANNELS: 3
      ATTN_RESOLUTIONS: [ ]
      CH_MULT: [ 1, 2, 4, 4 ]
      Z_CHANNELS: 4
      DROPOUT: 0.0
      RESAMP_WITH_CONV: True
      GIVE_PRE_END: False
      TANH_OUT: False
  # Text encoder (NAME: T5EmbedderHF); weights and tokenizer are both taken
  # from the same hf://scepter-studio/ACE-0.6B-512px location as the other
  # checkpoints.
  COND_STAGE_MODEL:
    NAME: T5EmbedderHF
    PRETRAINED_MODEL: hf://scepter-studio/ACE-0.6B-512px@models/text_encoder/t5-v1_1-xxl/
    TOKENIZER_PATH: hf://scepter-studio/ACE-0.6B-512px@models/tokenizer/t5-v1_1-xxl
    # NOTE(review): LENGTH is presumably the maximum token length - confirm.
    LENGTH: 120
    T5_DTYPE: bfloat16
    # The ten TEXT_IDENTIFIER placeholders plus '{caption}', '{mask}' and
    # '{ref_image}'.
    ADDED_IDENTIFIER: [ '{image}', '{caption}', '{mask}', '{ref_image}', '{image1}', '{image2}', '{image3}', '{image4}', '{image5}', '{image6}', '{image7}', '{image8}', '{image9}' ]
    CLEAN: whitespace
    USE_GRAD: False