Wawaworker committed
Commit 486e564
1 Parent(s): a7c0391

Create Trainining_Config

Files changed (1):
  1. Trainining_Config +244 -0

Trainining_Config ADDED

# Configure these values.

# 'lora' or 'full'
# lora - trains a small network for a character or style, or both. Quite versatile.
# full - requires lots of VRAM, trains very slowly, and needs a lot of data and concepts.
export MODEL_TYPE='lora'

# SDXL is trained by default, but you will need to enable one of these options for anything else.

# Set this to 'true' if you are training a Stable Diffusion 3 checkpoint.
# Use MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
export STABLE_DIFFUSION_3=false
# Similarly, set this to train PixArt Sigma (1K or 2K) models.
# Use MODEL_NAME="PixArt-alpha/PixArt-Sigma-XL-2-1024-MS"
export PIXART_SIGMA=false
# For older Stable Diffusion 1.x/2.x models, enable this.
# Use MODEL_NAME="stabilityai/stable-diffusion-2-1"
export STABLE_DIFFUSION_LEGACY=false
# For Kwai-Kolors, enable KOLORS.
# Use MODEL_NAME="kwai-kolors/kolors-diffusers"
export KOLORS=false
# For Flux, enable this (full fine-tuning assumes something like 8 GPUs with DeepSpeed configured).
# Use MODEL_NAME="black-forest-labs/FLUX.1-dev"
export FLUX=true

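# Illustrative example (not part of the original config): to train a different base family,
# flip the corresponding flag and swap MODEL_NAME to match, e.g. for Stable Diffusion 3:
#   export FLUX=false
#   export STABLE_DIFFUSION_3=true
#   export MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
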
# ControlNet model training is only supported when MODEL_TYPE='full'.
# See this document for more information: https://github.com/bghira/SimpleTuner/blob/main/documentation/CONTROLNET.md
# DeepFloyd, PixArt, and SD3 do not currently support ControlNet model training.
export CONTROLNET=false

# DoRA is a weight-decomposed variant of LoRA that can improve results, but it runs more slowly at the same rank.
# See: https://arxiv.org/abs/2402.09353
# See: https://github.com/huggingface/peft/pull/1474
export USE_DORA=false

# The BitFit freeze strategy freezes everything in the U-Net except the biases.
# This may help retain the full model's underlying capabilities. Combining it with LoRA is currently untested and not known to work.
#if [[ "$MODEL_TYPE" == "full" ]]; then
#  # When training a full model, we will rely on BitFit to keep the U-Net intact.
#  export USE_BITFIT=true
#elif [[ "$MODEL_TYPE" == "lora" ]]; then
#  # LoRA cannot use BitFit.
#  export USE_BITFIT=false
#elif [[ "$MODEL_TYPE" == "deepfloyd-full" ]]; then
#  export USE_BITFIT=true
#fi

# Restart where we left off. Change this to "checkpoint-1234" to start from a specific checkpoint.
export RESUME_CHECKPOINT="latest"
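# Illustrative example (not part of the original config; assumes checkpoints are written
# under $OUTPUT_DIR as "checkpoint-<step>"): resume from a specific checkpoint instead of
# the most recent one:
#   export RESUME_CHECKPOINT="checkpoint-1234"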

# How often to checkpoint. Depending on your learning rate, you may wish to change this.
# For the default settings with 10 gradient accumulation steps, more frequent checkpoints might be preferable at first.
export CHECKPOINTING_STEPS=100
# This is how many checkpoints we will keep. Two is safe, but three is safer.
export CHECKPOINTING_LIMIT=3

# This is chosen as a relatively conservative learning rate for a 'constant' schedule.
# Adjust higher or lower depending on how burnt your model becomes.
# export LEARNING_RATE=8e-7 #@param {type:"number"}
export LEARNING_RATE=0.0001 #@param {type:"number"}

# Using a Hugging Face Hub model:
export MODEL_NAME="black-forest-labs/FLUX.1-dev"
# Using a local path to a Hugging Face Hub model or saved checkpoint:
#export MODEL_NAME="/datasets/models/pipeline"

# Make DEBUG_EXTRA_ARGS empty to disable wandb.
export DEBUG_EXTRA_ARGS="--report_to=wandb"
export TRACKER_PROJECT_NAME="hvvshimFluxV1"
export TRACKER_RUN_NAME="flux-V1"

# Either a maximum number of steps or a number of epochs can be used, not both.
export MAX_NUM_STEPS=3000
# Will likely overtrain, but that's fine.
export NUM_EPOCHS=0

# A convenient prefix for all of your training paths.
# These may be absolute or relative paths. Here, we are using relative paths.
# The output will just be in a folder called "output/models" by default.
export DATALOADER_CONFIG="config/multidatabackend.json"
export OUTPUT_DIR="output/models"

# Set this to "true" to push your model to the Hugging Face Hub.
export PUSH_TO_HUB="true"
# If PUSH_TO_HUB and PUSH_CHECKPOINTS are both enabled, every saved checkpoint will be pushed to the Hugging Face Hub.
export PUSH_CHECKPOINTS="true"
# This will be the model name for your final Hub upload, e.g. "yourusername/yourmodelname".
# It defaults to the wandb project name, but you can override this here.
export HUB_MODEL_NAME=$TRACKER_PROJECT_NAME
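# Illustrative example (not part of the original config): override the default and upload
# under an explicit repository name of your own:
#   export HUB_MODEL_NAME="yourusername/yourmodelname"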

# By default, images will be resized so their SMALLER EDGE is 1024 pixels, maintaining aspect ratio.
# Setting this value to 768 might result in more reasonable training data sizes for SDXL.
export RESOLUTION=1024
# If you want the training data resized by pixel area (megapixels) rather than edge length,
# set this value to "area" instead of "pixel", and uncomment the next RESOLUTION declaration.
export RESOLUTION_TYPE="pixel"
#export RESOLUTION=1 # 1.0 megapixel training sizes
# If RESOLUTION_TYPE="pixel", the minimum resolution specifies the smaller edge length, measured in pixels. Recommended: 1024.
# If RESOLUTION_TYPE="area", the minimum resolution specifies the total image area, measured in megapixels. Recommended: 1.
export MINIMUM_RESOLUTION=$RESOLUTION
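# Illustrative example (not part of the original config): the same block configured for
# area-based sizing, following the comments above (1.0-megapixel targets, 1 MP minimum):
#   export RESOLUTION_TYPE="area"
#   export RESOLUTION=1
#   export MINIMUM_RESOLUTION=1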

# How many decimal places to round aspect buckets to.
#export ASPECT_BUCKET_ROUNDING=2

# Use this to append an instance prompt to each caption, e.g. for adding trigger words.
# This has not been tested in SDXL.
#export INSTANCE_PROMPT="lotr style "
# If you also supply a user prompt library or `--use_prompt_library`, this will be added to those lists.
export VALIDATION_PROMPT="a portrait of hvvshim man on the moon"
export VALIDATION_GUIDANCE=3.5
# You'll want to set this to 0.7 if you are training a terminal SNR model.
export VALIDATION_GUIDANCE_RESCALE=0.0
# How frequently we will save and run a pipeline for validations.
export VALIDATION_STEPS=100
export VALIDATION_NUM_INFERENCE_STEPS=20
export VALIDATION_NEGATIVE_PROMPT="blurry, cropped, ugly, fat"
export VALIDATION_SEED=42
export VALIDATION_RESOLUTION=$RESOLUTION
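# Illustrative example (not part of the original config): if you were training a terminal
# SNR model, the guidance rescale mentioned above would instead be:
#   export VALIDATION_GUIDANCE_RESCALE=0.7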

# Adjust this for your GPU memory size. This, and resolution, are the biggest VRAM killers.
export TRAIN_BATCH_SIZE=1
# Accumulate your update gradient over many steps, to save VRAM while still having a higher effective batch size:
# effective batch size = ($TRAIN_BATCH_SIZE * $GRADIENT_ACCUMULATION_STEPS).
export GRADIENT_ACCUMULATION_STEPS=1
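# Worked example (not part of the original config): with TRAIN_BATCH_SIZE=1 and
# GRADIENT_ACCUMULATION_STEPS=4, the effective batch size is 1 * 4 = 4 at roughly the
# VRAM cost of a batch size of 1:
#   export GRADIENT_ACCUMULATION_STEPS=4
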
# How many images to encode at once with the VAE. Increasing this also increases VRAM use.
export VAE_BATCH_SIZE=1

# Use any standard scheduler type, e.g. constant, polynomial, constant_with_warmup.
export LR_SCHEDULE="constant_with_warmup"
# A warmup period gives the model, and more importantly the EMA weights, time to adjust to the new data.
# For cosine or sine type schedules, the warmup period defines the interval between peaks or valleys.
# Use a sine schedule to simulate a warmup period, or a cosine schedule to simulate a polynomial start.
export LR_WARMUP_STEPS=$((MAX_NUM_STEPS / 10))
# export LR_WARMUP_STEPS=1000
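# Worked example (not part of the original config): with MAX_NUM_STEPS=3000 as set above,
# the arithmetic expansion $((MAX_NUM_STEPS / 10)) evaluates to 300 warmup steps.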

# Caption dropout probability. Set to 0.1 for 10% of captions dropped out. Set to 0 to disable.
# You may wish to disable dropout if you want to limit your changes strictly to the prompts you show the model.
# You may wish to increase the dropout rate if you want your changes to be adopted more broadly across the model.
export CAPTION_DROPOUT_PROBABILITY=0.1

export METADATA_UPDATE_INTERVAL=65

# How many workers to use for VAE caching.
export MAX_WORKERS=32
# Read and write batch sizes for VAE caching.
export READ_BATCH_SIZE=25
export WRITE_BATCH_SIZE=64
# How many images to process at once (resize, crop, transform) during VAE caching.
export IMAGE_PROCESSING_BATCH_SIZE=32
# When using large batch sizes, you'll need to increase the pool connection limit.
export AWS_MAX_POOL_CONNECTIONS=128
# For very large systems, setting this can reduce the CPU overhead of torch spawning an unnecessarily large number of threads.
export TORCH_NUM_THREADS=8

# If this is set, any images that fail to open will be DELETED to avoid re-checking them every time.
export DELETE_ERRORED_IMAGES=0
# If this is set, any images that are too small for the minimum resolution will be DELETED.
export DELETE_SMALL_IMAGES=0

# ByteDance recommends these be set to "trailing" so that inference and training behave more congruently.
# To follow the original SDXL training strategy, use "leading" instead, though results are generally worse.
export TRAINING_SCHEDULER_TIMESTEP_SPACING="trailing"
export INFERENCE_SCHEDULER_TIMESTEP_SPACING="trailing"

# Removing this option or unsetting it uses vanilla training. Setting it reweights the loss by the position of the timestep in the noise schedule.
# A value of "5" is recommended by the researchers. A value of "20" has the least impact, and "1" the most.
export MIN_SNR_GAMMA=5
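# Rough sketch of why gamma behaves this way (an assumption based on the Min-SNR weighting
# paper, not text from this config): for epsilon-prediction the per-timestep loss weight
# becomes min(SNR(t), gamma) / SNR(t), so low-noise timesteps with SNR above gamma are
# down-weighted while noisier timesteps keep full weight. A larger gamma clips fewer
# timesteps, which is why 20 is the mildest setting and 1 the strongest.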

# Set this to an explicit value of "false" to disable Xformers. Probably required for AMD users.
export USE_XFORMERS=false

# There's basically no reason to unset this. However, to disable it, use an explicit value of "false".
# This saves a lot of memory when enabled.
export USE_GRADIENT_CHECKPOINTING=true

##
# Options below here may require a bit more complicated configuration, so they are not simple variables.
##

# TF32 is great on Ampere or Ada; it is not available on earlier generations.
export ALLOW_TF32=true

# AdamW 8Bit is a robust and lightweight choice. Adafactor might reduce memory consumption, and Dadaptation is slow and experimental.
# AdamW is the default optimizer, but it uses a lot of memory and is slower than AdamW8Bit or Adafactor.
# NOTE: When training a quantised base model, you can't use adamw_bf16. Instead, try adafactor or adamw.
# Choices: adamw, adamw8bit, adafactor, dadaptation, adamw_bf16
export OPTIMIZER="adamw_bf16"

# EMA is a strong regularisation method that uses a lot of extra VRAM to hold two copies of the weights.
# This is worthwhile on large training runs, but not so much for smaller training runs.
# NOTE: EMA is not currently applied to LoRA.
export USE_EMA=false
export EMA_DECAY=0.999

export TRAINER_EXTRA_ARGS="--base_model_precision=int8-quanto"
## Additional trainer flags appended below: keep the VAE loaded, set the LoRA rank and alpha,
## and leave the text encoder precisions unchanged.
export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --keep_vae_loaded"
export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --lora_rank=32"
export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --lora_alpha=32"
export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --text_encoder_1_precision=no_change --text_encoder_2_precision=no_change"
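# For reference (not part of the original config), the accumulated flags above expand to
# a single string:
#   echo "$TRAINER_EXTRA_ARGS"
#   # --base_model_precision=int8-quanto --keep_vae_loaded --lora_rank=32 --lora_alpha=32 --text_encoder_1_precision=no_change --text_encoder_2_precision=no_change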

## For terminal SNR training:
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --prediction_type=v_prediction --rescale_betas_zero_snr"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --training_scheduler_timestep_spacing=trailing --inference_scheduler_timestep_spacing=trailing"
## You may benefit from directing training toward a specific weighted subset of timesteps.
# In this example, we train the final 25% of the timestep schedule with a 3x bias.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=later --timestep_bias_portion=0.25 --timestep_bias_multiplier=3"
# In this example, we train the earliest 25% of the timestep schedule with a 5x bias.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=earlier --timestep_bias_portion=0.25 --timestep_bias_multiplier=5"
# Here, we designate that timesteps 200 to 500 specifically should be prioritised.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=range --timestep_bias_begin=200 --timestep_bias_end=500 --timestep_bias_multiplier=3"

## For experimental min-SNR weighted loss training (5 is the value suggested by the original researchers):
# Not recommended for terminal SNR models.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --snr_gamma=5.0"

# For the Wasabi S3 filesystem backend (experimental):
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --data_backend=aws --aws_bucket_name=test123"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_endpoint_url=https://s3.wasabisys.com"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_access_key=1234567890"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_secret_access_key=0987654321"

# Reproducible training. Set to -1 to disable.
export TRAINING_SEED=42

# Mixed precision is best. You might honestly need to YOLO it in fp16 mode for Google Colab-type setups.
export MIXED_PRECISION="bf16" # Might not be supported on all GPUs. fp32 will be needed for others.
export PURE_BF16=true
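# Illustrative example (not part of the original config; assumes a Colab-style GPU without
# bf16 support): fall back to fp16 mixed precision and turn off pure bf16:
#   export MIXED_PRECISION="fp16"
#   export PURE_BF16=false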

# This has to be changed if you're training with multiple GPUs.
export TRAINING_NUM_PROCESSES=1
export TRAINING_NUM_MACHINES=1
export ACCELERATE_EXTRA_ARGS="" # --multi_gpu or other similar flags for Hugging Face Accelerate
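# Illustrative example (not part of the original config): a single machine with 8 GPUs,
# matching the Flux comment near the top of this file:
#   export TRAINING_NUM_PROCESSES=8
#   export TRAINING_NUM_MACHINES=1
#   export ACCELERATE_EXTRA_ARGS="--multi_gpu"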

# With PyTorch 2.1, you might have pretty good luck here.
# If you're using aspect bucketing, however, each resolution change will trigger a recompile. Seriously, just don't do it.
# Well, then again... PyTorch 2.2 has support for dynamic shapes. Why not?
export TRAINING_DYNAMO_BACKEND='no' # 'no' disables torch compile, e.g. in case of performance issues or lack of support (such as AMD)
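# Illustrative example (not part of the original config; assumes 'inductor', the stock
# torch.compile backend, is available in your PyTorch build):
#   export TRAINING_DYNAMO_BACKEND='inductor'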

export TOKENIZERS_PARALLELISM=false