# Configuration for the LatentMotionTokenizer model and its four sub-modules:
# image encoder, M-Former, vector quantizer and decoder.
# NOTE(review): the `_target_` keys look like Hydra's instantiate convention
# (dotted import path + keyword arguments) - confirm against the loader.
# NOTE(review): nesting follows the position of each `_target_` / `config:`
# key; verify it against the constructors of the target classes.
_target_: latent_motion_tokenizer.src.models.latent_motion_tokenizer.LatentMotionTokenizer
# Same value as vector_quantizer.e_dim; presumably the two must stay in sync.
codebook_dim: 32
# Presumably the weights of the commitment, reconstruction and perceptual
# loss terms - confirm in the model code.
commit_loss_w: 1.0
recon_loss_w: 1.0
perceptual_loss_w: 1.0

image_encoder:
  _target_: transformers.ViTMAEModel.from_pretrained
  pretrained_model_name_or_path: "facebook/vit-mae-large"

m_former:
  _target_: latent_motion_tokenizer.src.models.m_former.MFormer
  add_pooling_layer: false
  config:
    _target_: transformers.ViTConfig
    query_num: 8
    # Presumably the hidden size of the image encoder's output - TODO confirm.
    input_hidden_size: 1024
    # One more than decoder.config.num_patches (196); presumably counts an
    # extra [CLS] token - TODO confirm.
    num_patches: 197
    attention_probs_dropout_prob: 0.0
    hidden_act: "gelu"
    hidden_dropout_prob: 0.0
    hidden_size: 768
    initializer_range: 0.02
    intermediate_size: 3072
    # Written with a decimal point: YAML 1.1 loaders (e.g. PyYAML) resolve a
    # bare `1e-12` as a string, while `1.0e-12` is a float under every loader.
    layer_norm_eps: 1.0e-12
    model_type: "vit"
    num_attention_heads: 12
    num_hidden_layers: 4
    qkv_bias: true

vector_quantizer:
  _target_: latent_motion_tokenizer.src.models.vector_quantizer.VectorQuantizer2
  n_e: 128
  # Same value as the top-level codebook_dim.
  e_dim: 32
  beta: 0.25
  remap: null
  sane_index_shape: true

decoder:
  _target_: latent_motion_tokenizer.src.models.latent_motion_decoder.LatentMotionDecoder
  config:
    _target_: transformers.ViTConfig
    query_num: 8
    attention_probs_dropout_prob: 0.0
    hidden_act: "gelu"
    hidden_dropout_prob: 0.0
    hidden_size: 768
    image_size: 224
    initializer_range: 0.02
    intermediate_size: 3072
    # Written with a decimal point: YAML 1.1 loaders (e.g. PyYAML) resolve a
    # bare `1e-12` as a string, while `1.0e-12` is a float under every loader.
    layer_norm_eps: 1.0e-12
    model_type: "vit"
    num_attention_heads: 12
    num_channels: 3
    num_hidden_layers: 12
    patch_size: 16
    qkv_bias: true
    encoder_stride: 16
    # Equals (image_size / patch_size)^2 = (224 / 16)^2 = 196.
    num_patches: 196