# Inherits defaults from the shared BERT base config (12 layers / hidden size 192).
_BASE_: "base_model_bert_l12_h192.yaml"
# Answer vocabulary shared across tasks; loaded from a pickled CLIP-tokenized answer set.
SHARED_TARGETS:
  -
    NAME: 'VQA_Answer'
    SHARED_TARGETS_CFG:
      FILE_PATH: 'open_source_dataset/VQA_Answers_CLIP_with_endoftext.pkl'
      DISTRIBUTED: True
# Single fine-tuning task: VQA as single-label classification over the shared answer set.
TASKS:
  -
    NAME: vqa
    DATASETS:
      TRAIN: 'VQADataset'
      VAL: 'VQADataset'
      DATASET_NAME: 'VQA'
      TASK_TYPE: 'vqa'
      TARGET_SET: ['VQA_Answer']
    DATALOADER:
      TRAIN_BATCH_SIZE: 256
      TEST_BATCH_SIZE: 128
      NUM_WORKERS: 4
      FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
      ANNO_FOLDER: 'open_source_dataset/VQA'
      SEQ_PER_SAMPLE: 1
      MAX_FEAT_NUM: 51
      SAMPLING_WEIGHT: 1.0
      TRANSFORM: 'clip_transforms'
      DO_AS_GEN: True
      SINGLE_CLASS: True
    MODEL:
      # VOCAB_SIZE: 49409 # include <BOS>/<EOS>
      # NOTE(review): 'MLPClassifer' looks like a misspelling of "MLPClassifier",
      # but it must match the name registered in the model registry — verify
      # against the predictor registration before renaming.
      PREDICTOR: 'MLPClassifer'
      # MM_PREDICTOR:
      LABELS_NUM: 3129
      # PREDICT: 'first_one'
      # PRED_DROPOUT: 0.5
      MAX_SEQ_LEN: 23
      # QUERY_EMBED:
      #   NAME: QueryBaseEmbedding
      #   DIM: 512
      #   QUERY_SIZE: 10 # more than 1 is ok
      #   ACTIVATION: 'none'
      #   USE_NORM: True
      #   DROPOUT: 0.1
      #   POSITION: 'none' # must be none now
      #   TYPE_VOCAB_SIZE: -1 # must < 0
      LOSSES:
        # not single class
        # NAMES: ['BCEWithLogits']
        # LOSS_WEIGHT: 0.05
        # for single class
        NAMES: ['CrossEntropy']
        LOSS_WEIGHT: 0.1
      INFERENCE:
        VOCAB: 'CLIP'
        NAME: 'VQAEvaler'
        ID_KEY: 'question_id'
        VALUE: 'answer'
        VAL_ANNFILE: 'open_source_dataset/VQA/val_target.pkl'
        TEST_ANNFILE: ''
        GENERATION_MODE: False
######################################### Engine #########################################
ENGINE:
  NAME: 'UnifiedTrainer'
######################################### Scheduled sampling #########################################
SCHEDULED_SAMPLING:
  START_EPOCH: 0
  INC_EVERY_EPOCH: 5
  INC_PROB: 0.05
  MAX_PROB: 0.25
# Global dataloader settings (distinct from the per-task DATALOADER above).
DATALOADER:
  USE_WEIGHTED_SAMPLER: True
  UNIFIED_DATASET: True
######################################### MODEL #########################################
MODEL:
  TEMP_NAME: logit_scale_downstream
  # VOCAB_SIZE: 49409 # include <BOS>/<EOS>
  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
  ENCODER: 'UnifiedBertEncoder'
  # ENCODER_DIM: 512
  # DECODER: 'UnifiedTransformerDecoder'
  # DECODER_DIM: 512
  BertParamsInit: True
  # WEIGHTS: open_source_dataset/our_model/cc3m_encoder_decoder_warm1w_150k_retrivetask_gatherfeature_caption_mlm/model_Epoch_90000_Iter_0089999.pth
  CLS_TOKEN: True
  # PREDICTOR: 'BasePredictor'
  # PRED_DROPOUT: 0.5
  # MAX_SEQ_LEN: 20
  # #################################### Token embedding ####################################
  # TOKEN_EMBED:
  #   NAME: 'TokenBaseEmbedding'
  #   DIM: 512
  #   ACTIVATION: 'none'
  #   USE_NORM: True
  #   DROPOUT: 0.1
  #   POSITION: 'NNEmbeddingEncoding'
  #   POSITION_MAX_LEN: 512
  #   TYPE_VOCAB_SIZE: 2
  # #################################### Visual embedding ####################################
  # VISUAL_EMBED:
  #   NAME: 'VisualPatchEmbedding'
  #   IN_DIM: 3
  #   OUT_DIM: 512
  #   ACTIVATION: 'none'
  #   USE_NORM: True
  #   DROPOUT: 0.0
  #   PATCH_SIZE: 16
  ####################################### BERT ############################################
  BERT:
    DROP_PATH_PROB: 0.05
    # HIDDEN_SIZE: 512
    HIDDEN_SIZE: 192
    # Dropouts written as canonical floats (bare "0." parses the same but is non-standard).
    HIDDEN_DROPOUT_PROB: 0.0
    HIDDEN_ACT: "gelu"
    NUM_ATTENTION_HEADS: 8
    INTERMEDIATE_SIZE: 2048
    INTERMEDIATE_DROP: 0.0
    FFN_DROPOUT_PROB: 0.0
    ATTENTION_PROBS_DROPOUT_PROB: 0.0
    NUM_HIDDEN_LAYERS: 6
    NUM_GENERATION_LAYERS: 6
####################################### Optimizer #######################################
SOLVER:
  NAME: 'AdamW'
  # EPOCH: 1
  MAX_ITER: 30000
  CHECKPOINT_PERIOD: 5000
  CHECKPOINT_MAX_SAVE: 5
  EVAL_PERIOD: 1000
  BASE_LR: 0.00005
  BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_BIAS: 0.0
  MOMENTUM: 0.9
  DAMPENING: 0.0
  NESTEROV: 0.0
  BETAS: [0.9, 0.999]
  # Written in decimal: PyYAML's YAML 1.1 resolver parses plain "1e-8"
  # as a string (its float regex requires a decimal point), which would
  # hand AdamW a str instead of a float. Matches MIN_LR's style below.
  EPS: 0.00000001
  GRAD_CLIP: 5.0
  GRAD_CLIP_TYPE: 'norm'
  ACCUM_ITER: 0
  AMP_FP16: True
  APEX_FP16: False # dangerous
  # Rename checkpoint entries when loading: map pretrain task heads to downstream names.
  CHECKPOINT_MAPPING:
    # -
    #   ORIGIN: cc3m_caption
    #   DEST: mscoco
    -
      ORIGIN: cc3m_retrieve
      DEST: flickr30k
  CHECKPOINT_MAP: True
  ####################################### lr scheduler #######################################
  LR_SCHEDULER:
    NAME: 'WarmupCosine'
    WARMUP: 1000
    MIN_LR: 0.00000001
  # ####################################### losses #######################################
  # LOSSES:
  #   NAMES: ['LabelSmoothing']
  #   LABELSMOOTHING: 0.1
  ####################################### decode strategy #######################################
  # DECODE_STRATEGY:
  #   NAME: 'BeamSearcher'
  #   BEAM_SIZE: 2
  ####################################### evaluation #######################################
  INFERENCE:
    VOCAB: 'CLIP'
    ITER_BASED: True
find_unused_parameters: true