{ "model_type": "mllama", "architectures": [ "MllamaForConditionalGeneration" ], "_name_or_path": "meta-llama/Llama-3.2-11B-Vision-Instruct", "torch_dtype": "bfloat16", "transformers_version": "4.36.0", "image_token_index": 128256, "text_config": { "model_type": "mllama_text_model", "hidden_size": 4096, "intermediate_size": 14336, "num_attention_heads": 32, "num_hidden_layers": 40, "num_key_value_heads": 8, "hidden_act": "silu", "max_position_embeddings": 131072, "rms_norm_eps": 1e-05, "vocab_size": 128256, "torch_dtype": "bfloat16" }, "vision_config": { "model_type": "mllama_vision_model", "hidden_size": 1280, "intermediate_size": 5120, "num_hidden_layers": 32, "num_attention_heads": 16, "hidden_act": "gelu", "image_size": 560, "patch_size": 14, "num_channels": 3, "norm_eps": 1e-05, "vision_output_dim": 7680 } }