num_frames: &num_frames 16
resolution: &resolution [576, 1024]

model:
  base_learning_rate: 1.0e-5
  scale_lr: false
  target: core.models.diffusion.DualStreamMultiViewDiffusionModel
  params:
    use_task_embedding: false
    ray_as_image: false
    apply_condition_mask_in_training_loss: true
    separate_noise_and_condition: true
    condition_padding_with_anchor: false
    use_ray_decoder_loss_high_frequency_isolation: false
    train_with_multi_view_feature_alignment: true
    use_text_cross_attention_condition: false
    linear_start: 0.00085
    linear_end: 0.012
    num_time_steps_cond: 1
    log_every_t: 200
    time_steps: 1000
    data_key_images: combined_images
    data_key_rays: combined_rays
    data_key_text_condition: caption
    cond_stage_trainable: false
    image_size: [72, 128]
    channels: 10
    monitor: global_step
    scale_by_std: false
    scale_factor: 0.18215
    use_dynamic_rescale: true
    base_scale: 0.3
    use_ema: false
    uncond_prob: 0.05
    uncond_type: 'empty_seq'
    use_camera_pose_query_transformer: false
    random_cond: false
    cond_concat: true
    frame_mask: false
    padding: true
    per_frame_auto_encoding: true
    parameterization: "v"
    rescale_betas_zero_snr: true
    use_noise_offset: false

    scheduler_config:
      target: utils.lr_scheduler.LambdaLRScheduler
      interval: 'step'
      frequency: 100
      params:
        start_step: 0
        final_decay_ratio: 0.1
        decay_steps: 100

    bd_noise: false

    unet_config:
      target: core.modules.networks.unet_modules.UNetModel
      params:
        in_channels: 20
        out_channels: 10
        model_channels: 320
        attention_resolutions:
          - 4
          - 2
          - 1
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 4
          - 4
        dropout: 0.1
        num_head_channels: 64
        transformer_depth: 1
        context_dim: 1024
        use_linear: true
        use_checkpoint: true
        temporal_conv: true
        temporal_attention: true
        temporal_selfatt_only: true
        use_relative_position: false
        use_causal_attention: false
        temporal_length: *num_frames
        addition_attention: true
        image_cross_attention: true
        image_cross_attention_scale_learnable: true
        default_fs: 3
        fs_condition: false
        use_spatial_temporal_attention: true
        use_addition_ray_output_head: true
        ray_channels: 6
        use_lora_for_rays_in_output_blocks: false
        use_task_embedding: false
        use_ray_decoder: true
        use_ray_decoder_residual: true
        full_spatial_temporal_attention: true
        enhance_multi_view_correspondence: false
        camera_pose_condition: true
        use_feature_alignment: true

    first_stage_config:
      target: core.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_img_config:
      target: core.modules.encoders.condition.FrozenOpenCLIPImageEmbedderV2
      params:
        freeze: true

    image_proj_model_config:
      target: core.modules.encoders.resampler.Resampler
      params:
        dim: 1024
        depth: 4
        dim_head: 64
        heads: 12
        num_queries: 16
        embedding_dim: 1280
        output_dim: 1024
        ff_mult: 4
        video_length: *num_frames
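
# ---------------------------------------------------------------------------
# Usage sketch (comment only, not parsed as part of the config). Configs in
# this `target:`/`params:` layout are conventionally loaded with OmegaConf and
# built by a latent-diffusion-style `instantiate_from_config` helper, which
# imports `target` and calls it with `params` as keyword arguments. The
# helper's import path below is an assumption, not confirmed by this file:
#
#   from omegaconf import OmegaConf
#   from utils.utils import instantiate_from_config  # assumed location
#
#   config = OmegaConf.load("path/to/this_config.yaml")
#   model = instantiate_from_config(config.model)
#   # roughly equivalent to:
#   # core.models.diffusion.DualStreamMultiViewDiffusionModel(**config.model.params)
#
# The YAML anchors (&num_frames, &resolution) are resolved at load time, so
# `temporal_length` and `video_length` both inherit the value 16.
# ---------------------------------------------------------------------------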