# Configuration for a single VQA task ('vqa') trained with the unified trainer.
# Values here are overrides layered on the file named by `_BASE_`
# (assumes the config loader honours `_BASE_` inheritance -- confirm).
# Commented-out keys are alternatives kept for reference; they are inactive.
_BASE_: "base_model_bert_l12_h192.yaml"

# Answer vocabulary referenced by the task's TARGET_SET below.
SHARED_TARGETS:
  - NAME: 'VQA_Answer'
    SHARED_TARGETS_CFG:
      FILE_PATH: 'open_source_dataset/VQA_Answers_CLIP_with_endoftext.pkl'
      DISTRIBUTED: True

TASKS:
  - NAME: vqa
    DATASETS:
      TRAIN: 'VQADataset'
      VAL: 'VQADataset'
      DATASET_NAME: 'VQA'
      TASK_TYPE: 'vqa'
      TARGET_SET: ['VQA_Answer']
    DATALOADER:
      TRAIN_BATCH_SIZE: 256
      TEST_BATCH_SIZE: 128
      NUM_WORKERS: 4
      FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
      ANNO_FOLDER: 'open_source_dataset/VQA'
      SEQ_PER_SAMPLE: 1
      MAX_FEAT_NUM: 51
      SAMPLING_WEIGHT: 1.0
      TRANSFORM: 'clip_transforms'
      DO_AS_GEN: True
      # Pairs with the 'CrossEntropy' loss selected under LOSSES below.
      SINGLE_CLASS: True
    MODEL:
      # NOTE(review): the trailing "include /" looks truncated (special-token
      # names seem to be missing) -- restore from the upstream config.
      # VOCAB_SIZE: 49409  # include /
      PREDICTOR: 'MLPClassifer'
      # MM_PREDICTOR:
      #   LABELS_NUM: 3129
      #   PREDICT: 'first_one'
      # PRED_DROPOUT: 0.5
      MAX_SEQ_LEN: 23
      # QUERY_EMBED:
      #   NAME: QueryBaseEmbedding
      #   DIM: 512
      #   QUERY_SIZE: 10  # more than 1 is ok
      #   ACTIVATION: 'none'
      #   USE_NORM: True
      #   DROPOUT: 0.1
      #   POSITION: 'none'  # must be none now
      #   TYPE_VOCAB_SIZE: -1  # must < 0
    LOSSES:
      # not single class
      # NAMES: ['BCEWithLogits']
      # LOSS_WEIGHT: 0.05
      # for single class
      NAMES: ['CrossEntropy']
      LOSS_WEIGHT: 0.1
    INFERENCE:
      VOCAB: 'CLIP'
      NAME: 'VQAEvaler'
      ID_KEY: 'question_id'
      VALUE: 'answer'
      VAL_ANNFILE: 'open_source_dataset/VQA/val_target.pkl'
      TEST_ANNFILE: ''
      GENERATION_MODE: False

######################################### Engine #########################################
ENGINE:
  NAME: 'UnifiedTrainer'

######################################### Scheduled sampling #########################################
SCHEDULED_SAMPLING:
  START_EPOCH: 0
  INC_EVERY_EPOCH: 5
  INC_PROB: 0.05
  MAX_PROB: 0.25

DATALOADER:
  USE_WEIGHTED_SAMPLER: True
  UNIFIED_DATASET: True

######################################### MODEL #########################################
MODEL:
  TEMP_NAME: logit_scale_downstream
  # NOTE(review): the trailing "include /" looks truncated -- see the same
  # comment in the task-level MODEL section.
  # VOCAB_SIZE: 49409  # include /
  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
  ENCODER: 'UnifiedBertEncoder'
  # ENCODER_DIM: 512
  # DECODER: 'UnifiedTransformerDecoder'
  # DECODER_DIM: 512
  BertParamsInit: True
  # WEIGHTS: open_source_dataset/our_model/cc3m_encoder_decoder_warm1w_150k_retrivetask_gatherfeature_caption_mlm/model_Epoch_90000_Iter_0089999.pth
  CLS_TOKEN: True
  # PREDICTOR: 'BasePredictor'
  # PRED_DROPOUT: 0.5
  # MAX_SEQ_LEN: 20

  # #################################### Token embedding ####################################
  # TOKEN_EMBED:
  #   NAME: 'TokenBaseEmbedding'
  #   DIM: 512
  #   ACTIVATION: 'none'
  #   USE_NORM: True
  #   DROPOUT: 0.1
  #   POSITION: 'NNEmbeddingEncoding'
  #   POSITION_MAX_LEN: 512
  #   TYPE_VOCAB_SIZE: 2

  # #################################### Visual embedding ####################################
  # VISUAL_EMBED:
  #   NAME: 'VisualPatchEmbedding'
  #   IN_DIM: 3
  #   OUT_DIM: 512
  #   ACTIVATION: 'none'
  #   USE_NORM: True
  #   DROPOUT: 0.0
  #   PATCH_SIZE: 16

  ####################################### BERT ############################################
  BERT:
    DROP_PATH_PROB: 0.05
    # HIDDEN_SIZE: 512
    HIDDEN_SIZE: 192
    HIDDEN_DROPOUT_PROB: 0.
    HIDDEN_ACT: "gelu"
    NUM_ATTENTION_HEADS: 8
    INTERMEDIATE_SIZE: 2048
    INTERMEDIATE_DROP: 0.
    FFN_DROPOUT_PROB: 0.
    ATTENTION_PROBS_DROPOUT_PROB: 0.
    NUM_HIDDEN_LAYERS: 6
    NUM_GENERATION_LAYERS: 6

####################################### Optimizer #######################################
SOLVER:
  NAME: 'AdamW'
  # EPOCH: 1
  MAX_ITER: 30000
  CHECKPOINT_PERIOD: 5000
  CHECKPOINT_MAX_SAVE: 5
  EVAL_PERIOD: 1000
  BASE_LR: 0.00005
  BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_BIAS: 0.0
  MOMENTUM: 0.9
  DAMPENING: 0.0
  NESTEROV: 0.0
  BETAS: [0.9, 0.999]
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-8` as a string, not a float. Same numeric value.
  EPS: 1.0e-8
  GRAD_CLIP: 5.0
  GRAD_CLIP_TYPE: 'norm'
  ACCUM_ITER: 0
  AMP_FP16: True
  APEX_FP16: False  # dangerous

  # NOTE(review): the only task defined in this file is named 'vqa', yet the
  # active mapping targets 'flickr30k' -- possibly carried over from another
  # config. Confirm the intended ORIGIN/DEST pair.
  CHECKPOINT_MAPPING:
    # - ORIGIN: cc3m_caption
    #   DEST: mscoco
    - ORIGIN: cc3m_retrieve
      DEST: flickr30k
  CHECKPOINT_MAP: True

####################################### lr scheduler #######################################
LR_SCHEDULER:
  NAME: 'WarmupCosine'
  WARMUP: 1000
  MIN_LR: 0.00000001

# ####################################### losses #######################################
# LOSSES:
#   NAMES: ['LabelSmoothing']
#   LABELSMOOTHING: 0.1

####################################### decode strategy #######################################
# DECODE_STRATEGY:
#   NAME: 'BeamSearcher'
#   BEAM_SIZE: 2

####################################### evaluation #######################################
INFERENCE:
  VOCAB: 'CLIP'
  ITER_BASED: True

find_unused_parameters: true