Image-to-Text
Transformers
Safetensors
English
vlm
feature-extraction
image-captioning
visual-question-answering
custom_code
uform-gen2-dpo / configuration_uform_gen.py
VoVoR's picture
Add: DPO model card init
0556e7f
from transformers.configuration_utils import PretrainedConfig
from typing import List
class VLMConfig(PretrainedConfig):
    """Configuration container for the ``"vlm"`` model type.

    The constructor records every explicit argument as an instance attribute
    of the same name and then hands any remaining keyword arguments to
    ``PretrainedConfig.__init__``.

    NOTE(review): the parameter descriptions below are inferred from the
    argument names only; the code that reads these attributes lives outside
    this file, so confirm against the model implementation.

    Args:
        text_decoder_name_or_path: Presumably the name or path of the text
            decoder. Defaults to ``""``.
        image_encoder_name_or_path: Presumably the name or path of the image
            encoder. Defaults to ``""``.
        image_size: Presumably the input image side length. Defaults to 336.
        image_pooler_num_attn_heads: Presumably the attention-head count of
            the image pooler. Defaults to 16.
        image_pooler_intermediate_size: Presumably the intermediate width of
            the image pooler. Defaults to 3200.
        image_token_id: Presumably the vocabulary id of the image placeholder
            token. Defaults to 151646.
        image_encoder_hidden_size: Presumably the hidden width of the image
            encoder. Defaults to 1280.
        image_encoder_patch_size: Presumably the patch side length used by
            the image encoder. Defaults to 14.
        image_encoder_num_layers: Presumably the number of image-encoder
            layers. Defaults to 32.
        image_encoder_num_heads: Presumably the attention-head count of the
            image encoder. Defaults to 16.
        image_encoder_pooling: Presumably the pooling mode of the image
            encoder. Defaults to ``"cls"``.
        num_image_latents: Presumably the number of latent image tokens.
            Defaults to 256.
        initializer_range: Presumably the scale used for weight
            initialisation. Defaults to 0.02.
        use_cache: Stored as-is; presumably toggles decoder caching.
            Defaults to ``True``.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    # Class-level identifier string for this configuration type.
    model_type = "vlm"

    def __init__(
        self,
        text_decoder_name_or_path: str = "",
        image_encoder_name_or_path: str = "",
        image_size: int = 336,
        image_pooler_num_attn_heads: int = 16,
        image_pooler_intermediate_size: int = 3200,
        image_token_id: int = 151646,
        image_encoder_hidden_size: int = 1280,
        image_encoder_patch_size: int = 14,
        image_encoder_num_layers: int = 32,
        image_encoder_num_heads: int = 16,
        image_encoder_pooling: str = "cls",
        num_image_latents: int = 256,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        **kwargs,
    ):
        # Sub-model identifiers.
        self.text_decoder_name_or_path = text_decoder_name_or_path
        self.image_encoder_name_or_path = image_encoder_name_or_path

        # Image pooler settings and the image token id.
        self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
        self.image_pooler_intermediate_size = image_pooler_intermediate_size
        self.image_token_id = image_token_id

        # Image input and image-encoder settings.
        self.image_size = image_size
        self.image_encoder_hidden_size = image_encoder_hidden_size
        self.image_encoder_patch_size = image_encoder_patch_size
        self.image_encoder_num_layers = image_encoder_num_layers
        self.image_encoder_num_heads = image_encoder_num_heads
        self.image_encoder_pooling = image_encoder_pooling
        self.num_image_latents = num_image_latents

        # General settings.
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        # The base-class constructor runs last and receives only the extra
        # keyword arguments, after all fields above have been assigned.
        super().__init__(**kwargs)