# albef-vqa/configs/vqa.yaml
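# Shared dimensions, defined once as YAML anchors and reused below via aliases.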
hidden_size: &hidden_size 768
vocab_size: &vocab_size 30522
type_vocab_size: &type_vocab_size 2
max_position_embeddings: &max_position_embeddings 512
pad_token_id: &pad_token_id 0
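# Run and distributed-training settings.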
seed: 42
world_size: 1
device: "cuda"
dist_url: "env://"
output_root: "./examples/albef/outputs"
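# Data module: VQA and Visual Genome QA annotation files plus the image roots.
# The val annotations are listed under train_files, presumably so training uses
# train+val while evaluation runs on the separate test split.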
datamodule_args:
  train_files: ["./examples/albef/data_files/vqa_train.json", "./examples/albef/data_files/vg_qa.json", "./examples/albef/data_files/vqa_val.json"]
  test_files: ["./examples/albef/data_files/vqa_test.json"]
  answer_list: "./examples/albef/data_files/answer_list.json"
  vqa_root: "./examples/albef/data_files/coco"
  vg_root: "./examples/albef/data_files/visual_genome"
  batch_size: 32
  num_workers: 8
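# Vision encoder: ViT-B/16-sized transformer (768-dim, 12 layers, 12 heads)
# over 384x384 images split into 16x16 patches.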
vision_encoder_args:
  hidden_size: *hidden_size
  image_size: 384
  patch_size: 16
  num_hidden_layers: 12
  num_attention_heads: 12
  mlp_dim: 3072
  dropout: 0.0
  attention_dropout: 0.0
  layer_norm_eps: 1e-6
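# Text encoder: BERT-base-style configuration but only 6 layers; ALBEF splits a
# 12-layer BERT into a text half and a multimodal fusion half.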
text_encoder_args:
  vocab_size: *vocab_size
  hidden_size: *hidden_size
  type_vocab_size: *type_vocab_size
  max_position_embeddings: *max_position_embeddings
  pad_token_id: *pad_token_id
  num_hidden_layers: 6
  num_attention_heads: 12
  intermediate_size: 3072
  layer_norm_eps: 1e-12
  dropout: 0.0
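# Multimodal encoder: 6 fusion layers that cross-attend to the image features.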
multimodal_encoder_args:
  hidden_size: *hidden_size
  num_hidden_layers: 6
  num_attention_heads: 12
  intermediate_size: 3072
  layer_norm_eps: 1e-12
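# Token / position / type embeddings for the text branch, sharing the sizes above.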
text_embeddings_args:
  hidden_size: *hidden_size
  vocab_size: *vocab_size
  pad_token_id: *pad_token_id
  max_position_embeddings: *max_position_embeddings
  type_vocab_size: *type_vocab_size
  layer_norm_eps: 1e-12
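# Prediction head projecting hidden states to the vocabulary, presumably used by
# the answer decoder to score answer tokens.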
prediction_head_args:
  hidden_size: *hidden_size
  vocab_size: *vocab_size
  layer_norm_eps: 1e-12
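# Optimizer and schedule settings; alpha is presumably ALBEF's momentum-distillation
# weight, and warmup_steps / min_lr suggest a warmup-then-decay learning-rate schedule.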
training_args:
  log_every_n_steps: 100
  alpha: 0.4
  weight_decay: 0.02
  lr: 2e-5
  min_lr: 1e-6
  max_epochs: 8
  step_size: 100
  warmup_steps: 4
  checkpoint_root: "./examples/albef/checkpoints"
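# Evaluation settings; k_test is presumably the number of candidate answers
# ranked per question at inference time.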
eval_args:
  log_every_n_steps: 100
  k_test: 128