{
  "architectures": [
    "MotionAssociativeMemoryEncoder"
  ],
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu_fast",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": [
    72,
    128
  ],
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-06,
  "model_type": "vivit",
  "num_attention_heads": 8,
  "num_channels": 4,
  "num_frames": 24,
  "num_hidden_layers": 6,
  "qkv_bias": true,
  "query_dims": [
    160,
    320,
    640
  ],
  "torch_dtype": "bfloat16",
  "transformers_version": "4.47.1",
  "tubelet_size": [
    2,
    9,
    16
  ],
  "unet_attention_names": {
    "down_blocks_0_attentions_0_temporal_transformer_blocks_0_attn1": 320,
    "down_blocks_0_attentions_0_transformer_blocks_0_attn1": 320,
    "down_blocks_0_attentions_1_temporal_transformer_blocks_0_attn1": 320,
    "down_blocks_0_attentions_1_transformer_blocks_0_attn1": 320,
    "down_blocks_1_attentions_0_temporal_transformer_blocks_0_attn1": 640,
    "down_blocks_1_attentions_0_transformer_blocks_0_attn1": 640,
    "down_blocks_1_attentions_1_temporal_transformer_blocks_0_attn1": 640,
    "down_blocks_1_attentions_1_transformer_blocks_0_attn1": 640,
    "down_blocks_2_attentions_0_temporal_transformer_blocks_0_attn1": 1280,
    "down_blocks_2_attentions_0_transformer_blocks_0_attn1": 1280,
    "down_blocks_2_attentions_1_temporal_transformer_blocks_0_attn1": 1280,
    "down_blocks_2_attentions_1_transformer_blocks_0_attn1": 1280,
    "mid_block_attentions_0_temporal_transformer_blocks_0_attn1": 1280,
    "mid_block_attentions_0_transformer_blocks_0_attn1": 1280,
    "up_blocks_1_attentions_0_temporal_transformer_blocks_0_attn1": 1280,
    "up_blocks_1_attentions_0_transformer_blocks_0_attn1": 1280,
    "up_blocks_1_attentions_1_temporal_transformer_blocks_0_attn1": 1280,
    "up_blocks_1_attentions_1_transformer_blocks_0_attn1": 1280,
    "up_blocks_1_attentions_2_temporal_transformer_blocks_0_attn1": 1280,
    "up_blocks_1_attentions_2_transformer_blocks_0_attn1": 1280,
    "up_blocks_2_attentions_0_temporal_transformer_blocks_0_attn1": 640,
    "up_blocks_2_attentions_0_transformer_blocks_0_attn1": 640,
    "up_blocks_2_attentions_1_temporal_transformer_blocks_0_attn1": 640,
    "up_blocks_2_attentions_1_transformer_blocks_0_attn1": 640,
    "up_blocks_2_attentions_2_temporal_transformer_blocks_0_attn1": 640,
    "up_blocks_2_attentions_2_transformer_blocks_0_attn1": 640,
    "up_blocks_3_attentions_0_temporal_transformer_blocks_0_attn1": 320,
    "up_blocks_3_attentions_0_transformer_blocks_0_attn1": 320,
    "up_blocks_3_attentions_1_temporal_transformer_blocks_0_attn1": 320,
    "up_blocks_3_attentions_1_transformer_blocks_0_attn1": 320,
    "up_blocks_3_attentions_2_temporal_transformer_blocks_0_attn1": 320,
    "up_blocks_3_attentions_2_transformer_blocks_0_attn1": 320
  },
  "zero_init_theta": false
}