# Model configuration for MotoGPT.
# Each `_target_` key gives the dotted import path of the class or callable
# that the sibling keys are passed to (Hydra-style instantiation — confirm
# against the loader that consumes this file).
# NOTE(review): the nesting below was restored from a single-line form of this
# file; verify the grouping against the constructors named in `_target_`.
_target_: moto_gpt.src.models.moto_gpt.MotoGPT

# Language model, loaded from a pretrained checkpoint.
model_lang:
  _target_: transformers.T5EncoderModel.from_pretrained
  pretrained_model_name_or_path: "t5-base"

# Vision model, loaded from a pretrained checkpoint.
model_vision:
  _target_: moto_gpt.src.models.mae_model.MaeEncoder
  use_obs_feature: true
  pretrained_model_name_or_path: "facebook/vit-mae-large"

# Causal transformer backbone and its nested configuration object.
model_causal_transformer:
  _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Model
  config:
    _target_: moto_gpt.src.models.trajectory_gpt2.GPT2Config
    # NOTE(review): a vocab_size of 1 looks like a placeholder — confirm.
    vocab_size: 1
    n_embd: 768
    n_layer: 12
    n_head: 12
    activation_function: "relu"
    dropout: 0.1
    n_positions: 1024

# Top-level MotoGPT arguments.
# NOTE(review): presumably these are MotoGPT constructor parameters rather
# than GPT2Config fields — confirm against the class signature.
act_dim: 7
# Same value as `n_embd` in the transformer config above.
hidden_size: 768
sequence_length: 2
chunk_size: 3
per_latent_motion_len: 8
latent_motion_codebook_size: 128

# Prediction-head switches.
latent_motion_pred: true
act_pred: false

# Feature dimensions.
# NOTE(review): presumably these must match the output widths of the
# pretrained vision and language models selected above — confirm.
img_feat_dim: 1024
patch_feat_dim: 1024
lang_feat_dim: 768

mask_latent_motion_probability: 0.5

# Freeze flags for the two pretrained models.
freeze_lang: true
freeze_vision: true