# coding=utf-8
""" mPLUGOwl3 model configuration"""

from transformers.utils import logging
from transformers.models.siglip.configuration_siglip import SiglipVisionConfig

from .configuration_hyper_qwen2 import HyperQwen2Config

logger = logging.get_logger(__name__)


class mPLUGOwl3Config(HyperQwen2Config):
    model_type = "mplugowl3"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default SigLIP vision-tower hyperparameters (SigLIP-SO400M: 384px images, 14px patches).
    default_vision_config = {
        "hidden_size": 1152,
        "image_size": 384,
        "intermediate_size": 4304,
        "model_type": "siglip_vision_model",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14
    }

    def __init__(
        self,
        use_cache=True,
        vision_config=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Set after super().__init__ so any `use_cache` default applied by the parent
        # config cannot overwrite the value passed here.
        self.use_cache = use_cache

        # Vision tower: same settings as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit,
        # with tgt_sizes added.
        if vision_config is None:
            self.vision_config = SiglipVisionConfig(**self.default_vision_config)
            logger.info("vision_config is None, using default vision config")
        elif isinstance(vision_config, dict):
            self.vision_config = SiglipVisionConfig(**vision_config)
        elif isinstance(vision_config, SiglipVisionConfig):
            self.vision_config = vision_config
        else:
            raise TypeError(
                f"vision_config must be None, a dict, or a SiglipVisionConfig, "
                f"got {type(vision_config)}"
            )

        # Mirror the vision-tower geometry on the top-level config for convenience.
        self.image_size = self.vision_config.image_size
        self.patch_size = self.vision_config.patch_size
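

# Usage sketch, assuming this module is importable (for instance when the checkpoint
# is loaded via AutoConfig.from_pretrained(..., trust_remote_code=True), which lets
# the relative HyperQwen2Config import resolve):
#
#     config = mPLUGOwl3Config()                  # default SigLIP vision tower
#     config.image_size, config.patch_size        # -> (384, 14)
#
#     # The vision tower accepts a dict or a SiglipVisionConfig; the 448px value
#     # below is illustrative, not an official checkpoint setting.
#     vision = dict(mPLUGOwl3Config.default_vision_config, image_size=448)
#     config = mPLUGOwl3Config(vision_config=vision)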