args: latent_channels: 16 mode: inference load: "CogVideoX-2b-sat/transformer" batch_size: 1 input_type: txt input_file: test.txt sampling_num_frames: 13 # Must be 13, 11 or 9 sampling_fps: 8 fp16: True output_dir: outputs/ force_inference: True model: scale_factor: 1.15258426 disable_first_stage_autocast: true log_keys: - txt denoiser_config: target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser params: num_idx: 1000 quantize_c_noise: False weighting_config: target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting scaling_config: target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling discretization_config: target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization params: shift_scale: 3.0 network_config: target: dit_video_concat.DiffusionTransformer params: time_embed_dim: 512 elementwise_affine: True num_frames: 49 time_compressed_rate: 4 latent_width: 90 latent_height: 60 num_layers: 30 patch_size: 2 in_channels: 16 out_channels: 16 hidden_size: 1920 adm_in_channels: 256 num_attention_heads: 30 transformer_args: vocab_size: 1 max_sequence_length: 64 layernorm_order: pre skip_init: false model_parallel_size: 1 is_decoder: false modules: pos_embed_config: target: dit_video_concat.Basic3DPositionEmbeddingMixin params: text_length: 226 height_interpolation: 1.875 width_interpolation: 1.875 patch_embed_config: target: dit_video_concat.ImagePatchEmbeddingMixin params: text_hidden_size: 4096 adaln_layer_config: target: dit_video_concat.AdaLNMixin params: qk_ln: True final_layer_config: target: dit_video_concat.FinalLayerMixin conditioner_config: target: sgm.modules.GeneralConditioner params: emb_models: - is_trainable: false input_key: txt ucg_rate: 0.1 target: sgm.modules.encoders.modules.FrozenT5Embedder params: model_dir: "google/t5-v1_1-xxl" max_length: 226 first_stage_config: target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper params: cp_size: 1 ckpt_path: "CogVideoX-2b-sat/vae/3d-vae.pt" ignore_keys: [ 'loss' ] loss_config: target: torch.nn.Identity regularizer_config: target: vae_modules.regularizers.DiagonalGaussianRegularizer encoder_config: target: vae_modules.cp_enc_dec.ContextParallelEncoder3D params: double_z: true z_channels: 16 resolution: 256 in_channels: 3 out_ch: 3 ch: 128 ch_mult: [ 1, 2, 2, 4 ] attn_resolutions: [ ] num_res_blocks: 3 dropout: 0.0 gather_norm: True decoder_config: target: vae_modules.cp_enc_dec.ContextParallelDecoder3D params: double_z: True z_channels: 16 resolution: 256 in_channels: 3 out_ch: 3 ch: 128 ch_mult: [ 1, 2, 2, 4 ] attn_resolutions: [ ] num_res_blocks: 3 dropout: 0.0 gather_norm: false loss_fn_config: target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss params: offset_noise_level: 0 sigma_sampler_config: target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling params: uniform_sampling: True num_idx: 1000 discretization_config: target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization params: shift_scale: 3.0 sampler_config: target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler params: num_steps: 50 verbose: True discretization_config: target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization params: shift_scale: 3.0 guider_config: target: sgm.modules.diffusionmodules.guiders.DynamicCFG params: scale: 6 exp: 5 num_steps: 50