trainer:
  target: trainer.TrainerSDTurboSR

sd_pipe:
  target: diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline
  num_train_steps: 1000
  enable_grad_checkpoint: True
  compile: False
  vae_split: 8
  params:
    pretrained_model_name_or_path: stabilityai/sd-turbo
    cache_dir: weights
    use_safetensors: True
    torch_dtype: torch.float16

llpips:
  target: latent_lpips.lpips.LPIPS
  ckpt_path: weights/vgg16_sdturbo_lpips.pth
  compile: False
  params:
    pretrained: False
    net: vgg16
    lpips: True
    spatial: False
    pnet_rand: False
    pnet_tune: True
    use_dropout: True
    eval_mode: True
    latent: True
    in_chans: 4
    verbose: True

model:
  target: diffusers.models.autoencoders.NoisePredictor
  ckpt_start_path: ~        # only used for training the intermediate model
  ckpt_path: ~              # for initialization
  compile: False
  params:
    in_channels: 3
    down_block_types:
      - AttnDownBlock2D
      - AttnDownBlock2D
    up_block_types:
      - AttnUpBlock2D
      - AttnUpBlock2D
    block_out_channels:
      - 256                 # 192, 256
      - 512                 # 384, 512
    layers_per_block:
      - 3
      - 3
    act_fn: silu
    latent_channels: 4
    norm_num_groups: 32
    sample_size: 128
    mid_block_add_attention: True
    resnet_time_scale_shift: default
    temb_channels: 512
    attention_head_dim: 64
    freq_shift: 0
    flip_sin_to_cos: True
    double_z: True

discriminator:
  target: diffusers.models.unets.unet_2d_condition_discriminator.UNet2DConditionDiscriminator
  enable_grad_checkpoint: True
  compile: False
  params:
    sample_size: 64
    in_channels: 4
    center_input_sample: False
    flip_sin_to_cos: True
    freq_shift: 0
    down_block_types:
      - DownBlock2D
      - CrossAttnDownBlock2D
      - CrossAttnDownBlock2D
    mid_block_type: UNetMidBlock2DCrossAttn
    up_block_types:
      - CrossAttnUpBlock2D
      - CrossAttnUpBlock2D
      - UpBlock2D
    only_cross_attention: False
    block_out_channels:
      - 128
      - 256
      - 512
    layers_per_block:
      - 1
      - 2
      - 2
    downsample_padding: 1
    mid_block_scale_factor: 1
    dropout: 0.0
    act_fn: silu
    norm_num_groups: 32
    norm_eps: 1e-5
    cross_attention_dim: 1024
    transformer_layers_per_block: 1
    reverse_transformer_layers_per_block: ~
    encoder_hid_dim: ~
    encoder_hid_dim_type: ~
    attention_head_dim:
      - 8
      - 16
      - 16
    num_attention_heads: ~
    dual_cross_attention: False
    use_linear_projection: False
    class_embed_type: ~
    addition_embed_type: text
    addition_time_embed_dim: 256
    num_class_embeds: ~
    upcast_attention: ~
    resnet_time_scale_shift: default
    resnet_skip_time_act: False
    resnet_out_scale_factor: 1.0
    time_embedding_type: positional
    time_embedding_dim: ~
    time_embedding_act_fn: ~
    timestep_post_act: ~
    time_cond_proj_dim: ~
    conv_in_kernel: 3
    conv_out_kernel: 3
    projection_class_embeddings_input_dim: 2560
    attention_type: default
    class_embeddings_concat: False
    mid_block_only_cross_attention: ~
    cross_attention_norm: ~
    addition_embed_type_num_heads: 64

degradation:
  sf: 4
  # the first degradation process
  resize_prob: [0.2, 0.7, 0.1]        # up, down, keep
  resize_range: [0.15, 1.5]
  gaussian_noise_prob: 0.5
  noise_range: [1, 30]
  poisson_scale_range: [0.05, 3.0]
  gray_noise_prob: 0.4
  jpeg_range: [30, 95]

  # the second degradation process
  second_order_prob: 0.5
  second_blur_prob: 0.8
  resize_prob2: [0.3, 0.4, 0.3]       # up, down, keep
  resize_range2: [0.3, 1.2]
  gaussian_noise_prob2: 0.5
  noise_range2: [1, 25]
  poisson_scale_range2: [0.05, 2.5]
  gray_noise_prob2: 0.4
  jpeg_range2: [30, 95]

  gt_size: 512
  resize_back: False
  use_sharp: False

data:
  train:
    type: realesrgan
    params:
      data_source:
        source1:
          root_path: /mnt/sfs-common/zsyue/database/FFHQ
          image_path: images1024
          moment_path: ~
          text_path: ~
          im_ext: png
          length: 20000
        source2:
          root_path: /mnt/sfs-common/zsyue/database/LSDIR/train
          image_path: images
          moment_path: ~
          text_path: ~
          im_ext: png
      max_token_length: 77            # 77
      io_backend:
        type: disk
      blur_kernel_size: 21
      kernel_list: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
      kernel_prob: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
      sinc_prob: 0.1
      blur_sigma: [0.2, 3.0]
      betag_range: [0.5, 4.0]
      betap_range: [1, 2.0]
      blur_kernel_size2: 15
      kernel_list2: ['iso', 'aniso', 'generalized_iso', 'generalized_aniso', 'plateau_iso', 'plateau_aniso']
      kernel_prob2: [0.45, 0.25, 0.12, 0.03, 0.12, 0.03]
      sinc_prob2: 0.1
      blur_sigma2: [0.2, 1.5]
      betag_range2: [0.5, 4.0]
      betap_range2: [1, 2.0]
      final_sinc_prob: 0.8
      gt_size: ${degradation.gt_size}
      use_hflip: True
      use_rot: False
      random_crop: True
  val:
    type: base
    params:
      dir_path: /mnt/sfs-common/zsyue/projects/DifInv/SR/testingdata/imagenet512/lq
      transform_type: default
      transform_kwargs:
        mean: 0.0
        std: 1.0
      extra_dir_path: /mnt/sfs-common/zsyue/projects/DifInv/SR/testingdata/imagenet512/gt
      extra_transform_type: default
      extra_transform_kwargs:
        mean: 0.0
        std: 1.0
      im_exts: png
      length: 16
      recursive: False

train:
  # predict the starting state for inversion
  start_mode: True
  # learning rate
  lr: 5e-5                            # initial learning rate
  lr_min: 5e-5                        # minimum learning rate
  lr_schedule: ~
  warmup_iterations: 2000
  # discriminator
  lr_dis: 5e-5                        # learning rate for the discriminator
  weight_decay_dis: 1e-3              # weight decay for the discriminator
  dis_init_iterations: 10000          # iterations used for updating the discriminator
  dis_update_freq: 1
  # dataloader
  batch: 64
  microbatch: 16
  num_workers: 4
  prefetch_factor: 2
  use_text: True
  # optimization settings
  weight_decay: 0
  ema_rate: 0.999
  iterations: 200000                  # total iterations
  # logging
  save_freq: 5000
  log_freq: [200, 5000]               # [training loss, training images, val images]
  local_logging: True                 # manually save images
  tf_logging: False                   # tensorboard logging
  # loss
  loss_type: L2
  loss_coef:
    ldif: 1.0
  timesteps: [200, 100]
  num_inference_steps: 5
  # mixed precision
  use_amp: True
  use_fsdp: False
  # random seed
  seed: 123456
  global_seeding: False
  noise_detach: False

validate:
  batch: 2
  use_ema: True
  log_freq: 4                         # logging frequency
  val_y_channel: True
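
# ------------------------------------------------------------------------------
# Usage sketch (assumption, not part of this repo): a config like this is
# typically loaded with OmegaConf, and each `target`/`params` block is built by
# a generic instantiation helper. The helper names and the config path below
# are illustrative.
#
#   import importlib
#   from omegaconf import OmegaConf
#
#   def get_obj_from_str(path: str):
#       """Resolve a dotted path such as 'trainer.TrainerSDTurboSR' to a class."""
#       module, cls = path.rsplit(".", 1)
#       return getattr(importlib.import_module(module), cls)
#
#   def instantiate_from_config(block):
#       """Build an object from a {target: ..., params: {...}} config block."""
#       return get_obj_from_str(block["target"])(**block.get("params", {}))
#
#   cfg = OmegaConf.load("configs/sd-turbo-sr.yaml")   # hypothetical path
#   cfg = OmegaConf.to_container(cfg, resolve=True)    # resolves ${degradation.gt_size}
#   lpips_metric = instantiate_from_config(cfg["llpips"])
# ------------------------------------------------------------------------------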