from transformers import PretrainedConfig, PreTrainedModel
import json


class Idefics2ConnectorConfig(PretrainedConfig):
    r"""
    Configuration for the Idefics2 perceiver-resampler connector.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
    outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_hidden_size (`int`, *optional*, defaults to 1152):
            Dimensionality of the vision-encoder outputs fed into the resampler.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the language-model embeddings the connector projects into.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the perceiver block.
        resampler_n_latents (`int`, *optional*, defaults to 64):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 3):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the RMS normalization layers.
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            Number of key-value heads in the perceiver attention block.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimensionality of the intermediate (MLP) layer.
        integrate_sub_images (`bool`, *optional*):
            Whether sub-image features are merged by the connector.
            NOTE(review): semantics inferred from the name only — not exercised in this file; confirm.
        num_sub_images (`int`, *optional*):
            Number of sub-images per input image.
            NOTE(review): semantics inferred from the name only — confirm against the caller.
    """
    _auto_class = 'AutoConfig'
    model_type = "Idefics2ConnectorConfig"

    def __init__(
        self,
        vision_hidden_size=1152,
        hidden_size=4096,
        hidden_act="silu",
        resampler_n_latents=64,
        resampler_depth=3,
        rms_norm_eps=1e-05,
        resampler_n_heads=16,
        resampler_head_dim=96,
        num_key_value_heads=4,
        attention_dropout=0.0,
        intermediate_size=14336,
        integrate_sub_images=None,
        num_sub_images=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vision_hidden_size = vision_hidden_size
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.rms_norm_eps = rms_norm_eps
        self.resampler_n_heads = resampler_n_heads
        self.num_key_value_heads = num_key_value_heads
        self.resampler_head_dim = resampler_head_dim
        self.attention_dropout = attention_dropout
        self.intermediate_size = intermediate_size
        self.integrate_sub_images = integrate_sub_images
        self.num_sub_images = num_sub_images

        # Grouped-query attention requires at least as many query heads as KV heads.
        if self.num_key_value_heads > self.resampler_n_heads:
            raise ValueError(
                f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
                f" resampler_n_heads={self.resampler_n_heads}"
            )

    # Every __init__ parameter that may legitimately appear in the JSON file.
    _CONFIG_KEYS = (
        "vision_hidden_size",
        "hidden_size",
        "hidden_act",
        "resampler_n_latents",
        "resampler_depth",
        "rms_norm_eps",
        "resampler_n_heads",
        "resampler_head_dim",
        "num_key_value_heads",
        "attention_dropout",
        "intermediate_size",
        "integrate_sub_images",
        "num_sub_images",
    )

    @classmethod
    def from_pretrained(cls, config_path, **kwargs) -> "PretrainedConfig":
        """Build a connector config from a JSON file on disk.

        Unlike the base-class implementation, ``config_path`` is the path to the
        JSON file itself rather than a model directory or hub id.

        Args:
            config_path (`str` or `os.PathLike`): Path to the JSON config file.
            **kwargs: Extra keyword arguments forwarded to the constructor;
                they override values read from the file.

        Returns:
            [`Idefics2ConnectorConfig`]: The loaded configuration.
        """
        with open(config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)
        # Take only the keys present in the file so any omitted key falls back
        # to the constructor default instead of raising KeyError. This also
        # honors keys (resampler_n_heads, resampler_head_dim,
        # num_key_value_heads, attention_dropout, hidden_act) that were
        # previously dropped on load.
        init_kwargs = {k: config_dict[k] for k in cls._CONFIG_KEYS if k in config_dict}
        # Explicit caller overrides win over file contents (previously ignored).
        init_kwargs.update(kwargs)
        # Use `cls`, not the hard-coded class name, so subclasses round-trip.
        return cls(**init_kwargs)