# herrius's picture
# Upload 259 files
# commit 32b542e
_BASE_: "base_model_bert_l12_h192.yaml"
SHARED_TARGETS:
-
NAME: 'VQA_Answer'
SHARED_TARGETS_CFG:
FILE_PATH: 'open_source_dataset/VQA_Answers_CLIP_with_endoftext.pkl'
DISTRIBUTED: true
TASKS:
-
NAME: vqa
DATASETS:
TRAIN: 'VQADataset'
VAL: 'VQADataset'
DATASET_NAME: 'VQA'
TASK_TYPE: 'vqa'
TARGET_SET: ['VQA_Answer']
DATALOADER:
TRAIN_BATCH_SIZE: 256
TEST_BATCH_SIZE: 128
NUM_WORKERS: 4
FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
ANNO_FOLDER: 'open_source_dataset/VQA'
SEQ_PER_SAMPLE: 1
MAX_FEAT_NUM: 51
SAMPLING_WEIGHT: 1.0
TRANSFORM: 'clip_transforms'
DO_AS_GEN: true
SINGLE_CLASS: true
MODEL:
# VOCAB_SIZE: 49409 # include <BOS>/<EOS>
PREDICTOR: 'MLPClassifer'
# MM_PREDICTOR:
# LABELS_NUM: 3129
# PREDICT: 'first_one'
# PRED_DROPOUT: 0.5
MAX_SEQ_LEN: 23
# QUERY_EMBED:
# NAME: QueryBaseEmbedding
# DIM: 512
# QUERY_SIZE: 10 # more than 1 is ok
# ACTIVATION: 'none'
# USE_NORM: True
# DROPOUT: 0.1
# POSITION: 'none' # must be none now
# TYPE_VOCAB_SIZE: -1 # must < 0
LOSSES:
# not single class
# NAMES: ['BCEWithLogits']
# LOSS_WEIGHT: 0.05
# for single class
NAMES: ['CrossEntropy']
LOSS_WEIGHT: 0.1
INFERENCE:
VOCAB: 'CLIP'
NAME: 'VQAEvaler'
ID_KEY: 'question_id'
VALUE: 'answer'
VAL_ANNFILE: 'open_source_dataset/VQA/val_target.pkl'
TEST_ANNFILE: ''
GENERATION_MODE: false
######################################### Engine #########################################
ENGINE:
NAME: 'UnifiedTrainer'
######################################### Scheduled sampling #########################################
SCHEDULED_SAMPLING:
START_EPOCH: 0
INC_EVERY_EPOCH: 5
INC_PROB: 0.05
MAX_PROB: 0.25
DATALOADER:
USE_WEIGHTED_SAMPLER: true
UNIFIED_DATASET: true
######################################### MODEL #########################################
MODEL:
TEMP_NAME: logit_scale_downstream
# VOCAB_SIZE: 49409 # include <BOS>/<EOS>
META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
ENCODER: 'UnifiedBertEncoder'
# ENCODER_DIM: 512
# DECODER: 'UnifiedTransformerDecoder'
# DECODER_DIM: 512
BertParamsInit: true
# WEIGHTS: open_source_dataset/our_model/cc3m_encoder_decoder_warm1w_150k_retrivetask_gatherfeature_caption_mlm/model_Epoch_90000_Iter_0089999.pth
CLS_TOKEN: true
# PREDICTOR: 'BasePredictor'
# PRED_DROPOUT: 0.5
# MAX_SEQ_LEN: 20
# #################################### Token embedding ####################################
# TOKEN_EMBED:
# NAME: 'TokenBaseEmbedding'
# DIM: 512
# ACTIVATION: 'none'
# USE_NORM: True
# DROPOUT: 0.1
# POSITION: 'NNEmbeddingEncoding'
# POSITION_MAX_LEN: 512
# TYPE_VOCAB_SIZE: 2
# #################################### Visual embedding ####################################
# VISUAL_EMBED:
# NAME: 'VisualPatchEmbedding'
# IN_DIM: 3
# OUT_DIM: 512
# ACTIVATION: 'none'
# USE_NORM: True
# DROPOUT: 0.0
# PATCH_SIZE: 16
####################################### BERT ############################################
BERT:
DROP_PATH_PROB: 0.05
# HIDDEN_SIZE: 512
HIDDEN_SIZE: 192
HIDDEN_DROPOUT_PROB: 0.0
HIDDEN_ACT: "gelu"
NUM_ATTENTION_HEADS: 8
INTERMEDIATE_SIZE: 2048
INTERMEDIATE_DROP: 0.0
FFN_DROPOUT_PROB: 0.0
ATTENTION_PROBS_DROPOUT_PROB: 0.0
NUM_HIDDEN_LAYERS: 6
NUM_GENERATION_LAYERS: 6
####################################### Optimizer #######################################
SOLVER:
NAME: 'AdamW'
# EPOCH: 1
MAX_ITER: 30000
CHECKPOINT_PERIOD: 5000
CHECKPOINT_MAX_SAVE: 5
EVAL_PERIOD: 1000
BASE_LR: 0.00005
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY: 0.01
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_BIAS: 0.0
MOMENTUM: 0.9
DAMPENING: 0.0
NESTEROV: 0.0
BETAS: [0.9, 0.999]
EPS: 1.0e-8  # mantissa needs a dot: YAML 1.1 loaders (PyYAML) read bare '1e-8' as a string
GRAD_CLIP: 5.0
GRAD_CLIP_TYPE: 'norm'
ACCUM_ITER: 0
AMP_FP16: true
APEX_FP16: false # dangerous
CHECKPOINT_MAPPING:
# -
# ORIGIN: cc3m_caption
# DEST: mscoco
-
ORIGIN: cc3m_retrieve
DEST: flickr30k
CHECKPOINT_MAP: true
####################################### lr scheduler #######################################
LR_SCHEDULER:
NAME: 'WarmupCosine'
WARMUP: 1000
MIN_LR: 0.00000001
# ####################################### losses #######################################
# LOSSES:
# NAMES: ['LabelSmoothing']
# LABELSMOOTHING: 0.1
####################################### decode strategy #######################################
# DECODE_STRATEGY:
# NAME: 'BeamSearcher'
# BEAM_SIZE: 2
####################################### evaluation #######################################
INFERENCE:
VOCAB: 'CLIP'
ITER_BASED: true
find_unused_parameters: true