from transformers.configuration_utils import PretrainedConfig

from typing import List


class VLMConfig(PretrainedConfig):
    """Configuration for a vision-language model that pairs a pretrained text
    decoder with a pretrained image encoder and an attention-based image pooler.

    Stores the hyperparameters of the image tower and the pooler, together with
    the names or paths of the pretrained sub-models to load.
    """

    model_type = "vlm"

    def __init__(
        self,
        text_decoder_name_or_path: str = "",
        image_encoder_name_or_path: str = "",
        image_size: int = 336,
        image_pooler_num_attn_heads: int = 16,
        image_pooler_intermediate_size: int = 3200,
        image_token_id: int = 151646,
        image_encoder_hidden_size: int = 1280,
        image_encoder_patch_size: int = 14,
        image_encoder_num_layers: int = 32,
        image_encoder_num_heads: int = 16,
        image_encoder_pooling: str = "cls",
        num_image_latents: int = 256,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        **kwargs,
    ):
        self.text_decoder_name_or_path = text_decoder_name_or_path
        self.image_encoder_name_or_path = image_encoder_name_or_path

        self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
        self.image_pooler_intermediate_size = image_pooler_intermediate_size
        self.image_token_id = image_token_id
        self.image_size = image_size
        self.image_encoder_hidden_size = image_encoder_hidden_size
        self.image_encoder_patch_size = image_encoder_patch_size
        self.image_encoder_num_layers = image_encoder_num_layers
        self.image_encoder_num_heads = image_encoder_num_heads
        self.image_encoder_pooling = image_encoder_pooling
        self.num_image_latents = num_image_latents

        self.initializer_range = initializer_range
        self.use_cache = use_cache

        super().__init__(**kwargs)
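

if __name__ == "__main__":
    # Usage sketch (not part of the original module): build a config with the
    # defaults above, override a few fields, and round-trip it through the
    # serialization helpers inherited from PretrainedConfig. The two
    # *_name_or_path values are hypothetical placeholders, not real checkpoints.
    config = VLMConfig(
        text_decoder_name_or_path="org/text-decoder",    # placeholder id
        image_encoder_name_or_path="org/image-encoder",  # placeholder id
        image_size=336,
        num_image_latents=256,
    )
    print(config.to_json_string())                    # JSON view of the config
    restored = VLMConfig.from_dict(config.to_dict())  # dict round-trip
    assert restored.num_image_latents == config.num_image_latents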