---
# ALBEF image-text retrieval training configuration.
# Shared dimensions are anchored (&name) here and aliased (*name) below so the
# vision, text, and multimodal towers stay in sync.
hidden_size: &hidden_size 768
vocab_size: &vocab_size 30522
type_vocab_size: &type_vocab_size 2
max_position_embeddings: &max_position_embeddings 512
pad_token_id: &pad_token_id 0
embed_size: &embed_size 256

# Run / distributed settings.
seed: 42
world_size: 1
device: "cuda"
dist_url: "env://"
output_path: "./examples/albef/outputs/retrieval_output.pt"

# Dataset and dataloader settings.
datamodule_args:
  train_files: ["./examples/albef/data_files/coco_train.json"]
  test_files: ["./examples/albef/data_files/coco_test.json"]
  image_root: "./examples/albef/data_files/coco"
  batch_size: 32
  num_workers: 8

# ViT image encoder.
vision_encoder_args:
  hidden_size: *hidden_size
  image_size: 384
  patch_size: 16
  num_hidden_layers: 12
  num_attention_heads: 12
  mlp_dim: 3072
  dropout: 0.0
  attention_dropout: 0.0
  # NOTE(review): YAML 1.1 loaders (e.g. PyYAML) parse "1e-6" as a *string*,
  # not a float (1.1 requires "1.0e-6"); presumably the consumer coerces it —
  # confirm before "fixing" the spelling, since retyping would change behavior
  # under a YAML 1.2 loader.
  layer_norm_eps: 1e-6

# BERT-style text encoder.
text_encoder_args:
  vocab_size: *vocab_size
  hidden_size: *hidden_size
  type_vocab_size: *type_vocab_size
  max_position_embeddings: *max_position_embeddings
  pad_token_id: *pad_token_id
  num_hidden_layers: 6
  num_attention_heads: 12
  intermediate_size: 3072
  layer_norm_eps: 1e-12  # NOTE(review): same string-vs-float caveat as above.
  dropout: 0.0

# Cross-attention fusion encoder.
multimodal_encoder_args:
  hidden_size: *hidden_size
  num_hidden_layers: 6
  num_attention_heads: 12
  intermediate_size: 3072
  layer_norm_eps: 1e-12  # NOTE(review): same string-vs-float caveat as above.

# Projection from encoder hidden size to the shared embedding space.
projection_args:
  in_features: *hidden_size
  out_features: *embed_size

# Contrastive similarity head (momentum queue + temperature).
similarity_args:
  embed_size: *embed_size
  queue_size: 65536
  temp: 0.07

# Optimization / schedule.
training_args:
  log_every_n_steps: 100
  alpha: 0.4
  weight_decay: 0.02
  lr: 1e-5      # NOTE(review): same string-vs-float caveat as above.
  min_lr: 1e-6  # NOTE(review): same string-vs-float caveat as above.
  max_epochs: 5
  step_size: 100
  warmup_steps: 1
  checkpoint_root: "./examples/albef/checkpoints"

# Evaluation settings.
eval_args:
  log_every_n_steps: 100
  k_test: 256