Spaces:
Running
on
Zero
Running
on
Zero
trainer: | |
target: trainer.TrainerSDTurboSR | |
sd_pipe: | |
target: diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline | |
num_train_steps: 1000 | |
enable_grad_checkpoint: True | |
compile: False | |
vae_split: 8 | |
params: | |
pretrained_model_name_or_path: stabilityai/sd-turbo | |
cache_dir: weights | |
use_safetensors: True | |
torch_dtype: torch.float16 | |
llpips: | |
target: latent_lpips.lpips.LPIPS | |
ckpt_path: weights/vgg16_sdturbo_lpips.pth | |
compile: False | |
params: | |
pretrained: False | |
net: vgg16 | |
lpips: True | |
spatial: False | |
pnet_rand: False | |
pnet_tune: True | |
use_dropout: True | |
eval_mode: True | |
latent: True | |
in_chans: 4 | |
verbose: True | |
model: | |
target: diffusers.models.autoencoders.NoisePredictor | |
ckpt_start_path: ~ # only used for training the intermediate model | |
ckpt_path: ~ # For initializing | |
compile: False | |
params: | |
in_channels: 3 | |
down_block_types: | |
- AttnDownBlock2D | |
- AttnDownBlock2D | |
up_block_types: | |
- AttnUpBlock2D | |
- AttnUpBlock2D | |
block_out_channels: | |
- 256 # 192, 256 | |
- 512 # 384, 512 | |
layers_per_block: | |
- 3 | |
- 3 | |
act_fn: silu | |
latent_channels: 4 | |
norm_num_groups: 32 | |
sample_size: 128 | |
mid_block_add_attention: True | |
resnet_time_scale_shift: default | |
temb_channels: 512 | |
attention_head_dim: 64 | |
freq_shift: 0 | |
flip_sin_to_cos: True | |
double_z: True | |
discriminator: | |
target: diffusers.models.unets.unet_2d_condition_discriminator.UNet2DConditionDiscriminator | |
enable_grad_checkpoint: True | |
compile: False | |
params: | |
sample_size: 64 | |
in_channels: 4 | |
center_input_sample: False | |
flip_sin_to_cos: True | |
freq_shift: 0 | |
down_block_types: | |
- DownBlock2D | |
- CrossAttnDownBlock2D | |
- CrossAttnDownBlock2D | |
mid_block_type: UNetMidBlock2DCrossAttn | |
up_block_types: | |
- CrossAttnUpBlock2D | |
- CrossAttnUpBlock2D | |
- UpBlock2D | |
only_cross_attention: False | |
block_out_channels: | |
- 128 | |
- 256 | |
- 512 | |
layers_per_block: | |
- 1 | |
- 2 | |
- 2 | |
downsample_padding: 1 | |
mid_block_scale_factor: 1 | |
dropout: 0.0 | |
act_fn: silu | |
norm_num_groups: 32 | |
norm_eps: 1e-5 | |
cross_attention_dim: 1024 | |
transformer_layers_per_block: 1 | |
reverse_transformer_layers_per_block: ~ | |
encoder_hid_dim: ~ | |
encoder_hid_dim_type: ~ | |
attention_head_dim: | |
- 8 | |
- 16 | |
- 16 | |
num_attention_heads: ~ | |
dual_cross_attention: False | |
use_linear_projection: False | |
class_embed_type: ~ | |
addition_embed_type: text | |
addition_time_embed_dim: 256 | |
num_class_embeds: ~ | |
upcast_attention: ~ | |
resnet_time_scale_shift: default | |
resnet_skip_time_act: False | |
resnet_out_scale_factor: 1.0 | |
time_embedding_type: positional | |
time_embedding_dim: ~ | |
time_embedding_act_fn: ~ | |
timestep_post_act: ~ | |
time_cond_proj_dim: ~ | |
conv_in_kernel: 3 | |
conv_out_kernel: 3 | |
projection_class_embeddings_input_dim: 2560 | |
attention_type: default | |
class_embeddings_concat: False | |
mid_block_only_cross_attention: ~ | |
cross_attention_norm: ~ | |
addition_embed_type_num_heads: 64 | |
degradation: | |
sf: 4 | |
# the first degradation process | |
resize_prob: [0.2, 0.7, 0.1] # up, down, keep | |
resize_range: [0.15, 1.5] | |
gaussian_noise_prob: 0.5 | |
noise_range: [1, 30] | |
poisson_scale_range: [0.05, 3.0] | |
gray_noise_prob: 0.4 | |
jpeg_range: [30, 95] | |
# the second degradation process | |
second_order_prob: 0.5 | |
second_blur_prob: 0.8 | |
resize_prob2: [0.3, 0.4, 0.3] # up, down, keep | |
resize_range2: [0.3, 1.2] | |
gaussian_noise_prob2: 0.5 | |
noise_range2: [1, 25] | |
poisson_scale_range2: [0.05, 2.5] | |
gray_noise_prob2: 0.4 | |
jpeg_range2: [30, 95] | |
gt_size: 512 | |
resize_back: False | |
use_sharp: False | |
data: | |
train: | |
type: realesrgan | |
params: | |
data_source: | |
source1: | |
root_path: /mnt/sfs-common/zsyue/database/FFHQ | |
image_path: images1024 | |
moment_path: ~ | |
text_path: ~ | |
im_ext: png | |
length: 20000 | |
source2: | |
root_path: /mnt/sfs-common/zsyue/database/LSDIR/train | |
image_path: images | |
moment_path: ~ | |
text_path: ~ | |
im_ext: png | |
max_token_length: 77 # 77 | |
io_backend: | |
type: disk | |
blur_kernel_size: 21 | |
kernel_list: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso'] | |
kernel_prob: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03] | |
sinc_prob: 0.1 | |
blur_sigma: [0.2, 3.0] | |
betag_range: [0.5, 4.0] | |
betap_range: [1, 2.0] | |
blur_kernel_size2: 15 | |
kernel_list2: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso'] | |
kernel_prob2: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03] | |
sinc_prob2: 0.1 | |
blur_sigma2: [0.2, 1.5] | |
betag_range2: [0.5, 4.0] | |
betap_range2: [1, 2.0] | |
final_sinc_prob: 0.8 | |
gt_size: ${degradation.gt_size} | |
use_hflip: True | |
use_rot: False | |
random_crop: True | |
val: | |
type: base | |
params: | |
dir_path: /mnt/sfs-common/zsyue/projects/DifInv/SR/testingdata/imagenet512/lq | |
transform_type: default | |
transform_kwargs: | |
mean: 0.0 | |
std: 1.0 | |
extra_dir_path: /mnt/sfs-common/zsyue/projects/DifInv/SR/testingdata/imagenet512/gt | |
extra_transform_type: default | |
extra_transform_kwargs: | |
mean: 0.0 | |
std: 1.0 | |
im_exts: png | |
length: 16 | |
recursive: False | |
train: | |
# predict started inverser | |
start_mode: True | |
# learning rate | |
lr: 5e-5 # learning rate | |
lr_min: 5e-5 # minimum learning rate | |
lr_schedule: ~ | |
warmup_iterations: 2000 | |
# discriminator | |
lr_dis: 5e-5 # learning rate for discriminator | |
weight_decay_dis: 1e-3 # weight decay for discriminator | |
dis_init_iterations: 10000 # iterations used for updating the discriminator | |
dis_update_freq: 1 | |
# dataloader | |
batch: 64 | |
microbatch: 16 | |
num_workers: 4 | |
prefetch_factor: 2 | |
use_text: True | |
# optimization settings | |
weight_decay: 0 | |
ema_rate: 0.999 | |
iterations: 200000 # total iterations | |
# logging | |
save_freq: 5000 | |
log_freq: [200, 5000] # [training loss, training images, val images] | |
local_logging: True # manually save images | |
tf_logging: False # tensorboard logging | |
# loss | |
loss_type: L2 | |
loss_coef: | |
ldif: 1.0 | |
timesteps: [200, 100] | |
num_inference_steps: 5 | |
# mixed precision | |
use_amp: True | |
use_fsdp: False | |
# random seed | |
seed: 123456 | |
global_seeding: False | |
noise_detach: False | |
validate: | |
batch: 2 | |
use_ema: True | |
log_freq: 4 # logging frequency | |
val_y_channel: True | |