from open_clip import get_model_config
from transformers.configuration_utils import PretrainedConfig

from configuration_phi import PhiConfig


class LlavaConfig(PretrainedConfig):
    """Configuration for a LLaVA-style multimodal model that pairs a Phi
    text backbone with an open_clip SigLIP vision tower."""

    model_type = "llava"
    is_composition = False

    def __init__(
        self,
        text_config=None,
        vision_tower_name="ViT-SO400M-14-SigLIP-384",
        ignore_index=-100,
        image_token_index=50297,
        projector_hidden_act="gelu",
        projector_tokens_num=1,
        vocab_size=51200,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act
        self.projector_tokens_num = projector_tokens_num
        self.vocab_size = vocab_size

        # Resolve the vision tower through the open_clip model registry and
        # record its embedding dimension for the multimodal projector.
        self.vision_tower_name = vision_tower_name
        vision_config = get_model_config(vision_tower_name)
        self.vision_embed_dim = vision_config["embed_dim"]

        # If the text config is given as a dict, materialize it as a PhiConfig
        # and let its vocab_size override the default above.
        self.text_config = text_config
        if isinstance(self.text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "phi")
            self.text_config = PhiConfig(**text_config)
            self.vocab_size = self.text_config.vocab_size

        super().__init__(**kwargs)
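

if __name__ == "__main__":
    # Usage sketch (illustrative, not part of the original configuration
    # logic). It assumes `open_clip` is installed so the SigLIP-384 entry
    # exists in its model registry and that `configuration_phi` is importable
    # from this repo; the text_config values below are placeholders, not the
    # settings of any released checkpoint.
    config = LlavaConfig(
        text_config={"model_type": "phi", "vocab_size": 51200},
        vision_tower_name="ViT-SO400M-14-SigLIP-384",
    )
    print(config.vision_embed_dim)        # embedding dim reported by open_clip
    print(config.text_config.model_type)  # "phi"
    print(config.vocab_size)              # taken from the text config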