# MaskClip model configuration.
#
# NOTE(review): this file was restored to block style from a copy whose
# line breaks and indentation had been collapsed. The nesting below is
# reconstructed from key semantics (repeated `type` / `norm_cfg` keys,
# backbone vs. head parameters) -- confirm against the consuming code.

# Parent config this file builds on (presumably resolved by the project's
# own config loader -- confirm).
_base_: "default.yml"

# Defaults list containing only this file's own content.
defaults:
  - _self_

model:
  type: MaskClip
  clip_model: CLIP-ViT-B-16-laion2B-s34B-b88K

  # Image encoder settings.
  backbone:
    img_size: 448
    patch_size: 16
    patch_bias: False
    in_channels: 3
    embed_dims: 768
    num_layers: 12
    num_heads: 12
    mlp_ratio: 4
    out_indices: -1
    qkv_bias: True
    drop_rate: 0.0
    attn_drop_rate: 0.0
    drop_path_rate: 0.0
    with_cls_token: True
    output_cls_token: False
    norm_cfg:
      type: 'LN'
      # NOTE(review): `1e-6` has no decimal point; strict YAML 1.1 loaders
      # (e.g. plain PyYAML) read it as a string, while OmegaConf reads a
      # float. Value kept as-is -- confirm which loader consumes this file.
      eps: 1e-6
    act_cfg:
      type: 'GELU'
    patch_norm: False
    pre_norm: True
    final_norm: True
    return_qkv: True
    interpolate_mode: 'bicubic'
    num_fcs: 2
    norm_eval: False
    # Checkpoint path for the backbone weights (placement under `backbone`
    # is inferred -- confirm).
    pretrained: 'clip-dinoiser/checkpoints/ViT-16-laion_clip_backbone.pth'

  # Segmentation head settings.
  decode_head:
    type: MaskClipHead
    # Same value as backbone.embed_dims above; presumably the two must
    # agree -- confirm.
    in_channels: 768
    channels: 0
    text_channels: 512
    in_index: -1
    norm_cfg:
      type: 'SyncBN'
      requires_grad: False
    align_corners: False
    visual_projs_path: 'clip-dinoiser/checkpoints/ViT-16-laion_clip_proj.pth'
    model_prefix: 'hf-hub:laion'
    use_templates: True