from transformers.models.clip.configuration_clip import CLIPVisionConfig


class AestheticsPredictorConfig(CLIPVisionConfig):
    """Configuration for an aesthetics predictor built on a CLIP vision tower.

    Inherits all hyperparameters from ``CLIPVisionConfig``; the defaults
    below mirror the CLIP ViT-B/32 vision encoder.
    """

    model_type = "aesthetics_predictor"

    def __init__(
        self,
        hidden_size: int = 768,
        intermediate_size: int = 3072,
        projection_dim: int = 512,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 12,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 32,
        hidden_act: str = "quick_gelu",
        layer_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        initializer_range: float = 0.02,
        initializer_factor: float = 1.0,
        **kwargs,
    ):
        # Forward everything by keyword so this stays correct even if the
        # CLIPVisionConfig signature ever reorders its parameters.
        super().__init__(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            projection_dim=projection_dim,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_channels=num_channels,
            image_size=image_size,
            patch_size=patch_size,
            hidden_act=hidden_act,
            layer_norm_eps=layer_norm_eps,
            attention_dropout=attention_dropout,
            initializer_range=initializer_range,
            initializer_factor=initializer_factor,
            **kwargs,
        )
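

# A minimal usage sketch (illustrative, not part of the original class): the
# defaults above reproduce the CLIP ViT-B/32 vision tower, and any field can
# be overridden per checkpoint. Registering the config with AutoConfig lets
# the custom "aesthetics_predictor" model_type resolve through the standard
# transformers auto classes; the ViT-L/14 overrides below follow the
# published CLIP vision configs.
if __name__ == "__main__":
    from transformers import AutoConfig

    AutoConfig.register("aesthetics_predictor", AestheticsPredictorConfig)

    # Default configuration (ViT-B/32-style vision encoder).
    config = AestheticsPredictorConfig()
    print(config.model_type)   # aesthetics_predictor
    print(config.hidden_size)  # 768

    # ViT-L/14-style overrides for a larger backbone.
    large = AestheticsPredictorConfig(
        hidden_size=1024,
        intermediate_size=4096,
        projection_dim=768,
        num_hidden_layers=24,
        num_attention_heads=16,
        patch_size=14,
    )
    print(large.num_hidden_layers)  # 24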