# Configuration for the latent motion tokenizer.
# Every `_target_` key names the dotted import path of the class/callable to
# build; the sibling keys are its keyword arguments (presumably consumed by
# Hydra's `instantiate` -- confirm against the loading code).
_target_: latent_motion_tokenizer.src.models.latent_motion_tokenizer.LatentMotionTokenizer

# Same value as `vector_quantizer.e_dim` below; presumably the two must stay
# in sync -- TODO confirm.
codebook_dim: 32

# Per-term loss weights (meaning inferred from the key names -- confirm).
commit_loss_w: 1.0
recon_loss_w: 1.0
perceptual_loss_w: 1.0

# Pretrained image encoder, loaded through `from_pretrained`.
image_encoder:
  _target_: transformers.ViTMAEModel.from_pretrained
  pretrained_model_name_or_path: "facebook/vit-mae-large"

m_former:
  _target_: latent_motion_tokenizer.src.models.m_former.MFormer
  add_pooling_layer: false
  config:
    _target_: transformers.ViTConfig
    # Same `query_num` as `decoder.config.query_num`.
    query_num: 8
    # NOTE(review): presumably the hidden size of the image encoder's
    # output features -- verify against the ViT-MAE-large checkpoint.
    input_hidden_size: 1024
    num_patches: 197  # include the [CLS] token
    attention_probs_dropout_prob: 0.0
    hidden_act: "gelu"
    hidden_dropout_prob: 0.0
    hidden_size: 768  # the hidden size of MAE decoder is 512
    initializer_range: 0.02
    intermediate_size: 3072
    # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. stock
    # PyYAML) read a bare `1e-12` as a string rather than a float.
    layer_norm_eps: 1.0e-12
    model_type: "vit"
    num_attention_heads: 12
    num_hidden_layers: 4
    qkv_bias: true

vector_quantizer:
  _target_: latent_motion_tokenizer.src.models.vector_quantizer.VectorQuantizer2
  n_e: 128
  # Same value as the top-level `codebook_dim`.
  e_dim: 32
  beta: 0.25
  remap: null
  sane_index_shape: true

decoder:
  _target_: latent_motion_tokenizer.src.models.latent_motion_decoder.LatentMotionDecoder
  config:
    _target_: transformers.ViTConfig
    query_num: 8
    attention_probs_dropout_prob: 0.0
    hidden_act: "gelu"
    hidden_dropout_prob: 0.0
    hidden_size: 768
    image_size: 224
    initializer_range: 0.02
    intermediate_size: 3072
    # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. stock
    # PyYAML) read a bare `1e-12` as a string rather than a float.
    layer_norm_eps: 1.0e-12
    model_type: "vit"
    num_attention_heads: 12
    num_channels: 3
    num_hidden_layers: 12
    patch_size: 16
    qkv_bias: true
    encoder_stride: 16
    # Equals (image_size / patch_size)^2 = (224 / 16)^2; unlike
    # `m_former.config.num_patches` (197) there is no extra token here.
    num_patches: 196