# Hosting-page residue kept for provenance (not part of the configuration):
#   herrius's picture / Upload 259 files / 32b542e
# Parent config file referenced by this one. Presumably the keys below
# override values inherited from it -- confirm against the config loader.
_BASE_: "../base_model_bert_l12_h768.yaml"
# Per-task settings. Each list entry groups the dataset, data-loading, model,
# loss and evaluation options of one task; this file declares a single entry.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped (every key sat at column 0, which made NAME / MODEL /
# DATALOADER / INFERENCE duplicate top-level keys). Confirm the grouping
# against the base config before relying on it.
TASKS:
  - NAME: msvd_retrieval
    DATASETS:
      TRAIN: 'MSVDDataset'
      TEST: 'MSVDDataset'
      TASK_TYPE: 'video_retrieval'
      DATASET_NAME: 'MSVDDataset'
      # TARGET_SET: ['Vocab_Word']
    DATALOADER:
      TRAIN_BATCH_SIZE: 6
      TEST_BATCH_SIZE: 64
      NUM_WORKERS: 6
      FEATS_FOLDER: 'open_source_dataset/msvd_dataset/YouTubeClips'
      ANNO_FOLDER: 'open_source_dataset/msvd_dataset/new_annotations'
      STRIDE: 32
      FRAMES_PER_CLIP: 4
      S3_PATH: 's3://msvd/YouTubeClips/'
      TIMESFORMER_AUG: true
      SAMPLING_WEIGHT: 0.5
    MODEL:
      MAX_SEQ_LEN: 77
      EVAL_MAX_SEQ_LEN: 21
      TEMP_NAME: logit_scale_caption
    LOSSES:
      NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
      LABELSMOOTHING: 0.1
      # NAMES: ['CrossEntropy', 'Accuracy']
      LOSS_WEIGHT: 1.0
      REDUCTION: 'mean'
    INFERENCE:
      NAME: 'RetrievalEvaler'
      GENERATION_MODE: false
# Training engine selection. NAME is nested here so it no longer collides
# with the other NAME keys in this file.
ENGINE:
  NAME: 'UnifiedTrainer'
# Global model settings (separate from the per-task MODEL mapping in TASKS).
# NOTE(review): nesting reconstructed from a flattened copy; BERT is assumed
# to own only the three keys directly beneath it -- confirm against the base
# config.
MODEL:
  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
  ENCODER: 'UnifiedBertEncoder'
  SHARE_LAYERNORM: true
  BERT:
    NORMALIZE_DECISION: "BERTPre"
    DROP_PATH_PROB: 0.1
    DROP_PATH_PROB_FIXED: true
  MODEL_EMA: false
  MODEL_EMA_DECAY: 0.9999
  MAEParamsInit: true
  POSEMBEDFIX: true
  IMG_INPUT_SIZE: 224
  PATCH_SIZE: 16
  # POSEMBED_SCALE: !!python/object/apply:eval ["160/224"]
  # Key spellings below are kept exactly as found; they must match whatever
  # names the consuming code looks up.
  CHECKPOINT_FILETER: false
  OLD_CHECKPONT: true
  LAYER_SCALE: true
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare "1e-3" as a string rather than a float.
  LAYER_SCALE_INIT: 1.0e-3
# Global data-loading settings (separate from the per-task DATALOADER mapping
# in TASKS).
DATALOADER:
  USE_WEIGHTED_SAMPLER: true
  UNIFIED_DATASET: true
  NUM_WORKERS: 6
  PADDING_TO_MAX: false  # True for debugging or token moe with distributed moe
####################################### Optimizer #######################################
# Optimizer, schedule length, regularisation and mixed-precision settings.
# NOTE(review): nesting reconstructed from a flattened copy; every key up to
# the lr-scheduler banner is assumed to belong to SOLVER.
SOLVER:
  NAME: 'Adam'
  TORCH_OPTIMIZER: true
  PARAMS_SEPERATE: true
  # PARAMS_GROUP: True
  # EPOCH: 1
  MAX_ITER: 450000
  CHECKPOINT_PERIOD: 50000
  # NOTE(review): EVAL_PERIOD exceeds MAX_ITER, so a period-based evaluation
  # would presumably never fire during training -- confirm this is intended.
  EVAL_PERIOD: 500000
  BASE_LR: 0.001
  BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY: 0.05
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_BIAS: 0.0
  WEIGHT_DECAY_EMBEDDING: 0.0
  MOMENTUM: 0.9
  DAMPENING: 0.0
  NESTEROV: 0.0
  BETAS: [0.9, 0.95]
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare "1e-6" as a string rather than a float.
  EPS: 1.0e-6
  GRAD_CLIP: 0.1
  GRAD_CLIP_TYPE: 'norm'
  ACCUM_ITER: 0
  AMP_FP16: true
  APEX_FP16: false  # dangerous
  WRITE_PERIOD: 50
  # Key spelling kept exactly as found; it must match the consumer's lookup.
  MIN_LOSS_SCLE: 2048.0
  # BF16: False # True
  # ZEROSTAGE: 2
  LOSS_SCALE_WINDOW: 200
####################################### lr scheduler #######################################
# Learning-rate schedule. WARMUP is presumably expressed in iterations, like
# SOLVER.MAX_ITER -- confirm against the scheduler implementation.
LR_SCHEDULER:
  NAME: 'WarmupCosine'
  WARMUP: 20000
  MIN_LR: 0.000001
####################################### evaluation #######################################
# Global evaluation settings (separate from the per-task INFERENCE mapping in
# TASKS).
INFERENCE:
  VOCAB: 'CLIP'
  ITER_BASED: true

# Kept at the top level: its lower-case name sets it apart from the section
# keys. NOTE(review): presumably forwarded to the distributed-training
# wrapper -- confirm where it is read.
find_unused_parameters: true
# ENCODERS:
#   - NAME: VisualEncoder
#     TYPE: VisualEncoder
#     DROP_PATH_PROB: 0.0
#     HIDDEN_SIZE: 192
#     HIDDEN_DROPOUT_PROB: 0.
#     HIDDEN_ACT: "gelu"
#     NUM_ATTENTION_HEADS: 3
#     INTERMEDIATE_SIZE: 768
#     INTERMEDIATE_DROP: 0.
#     FFN_DROPOUT_PROB: 0.
#     ATTENTION_PROBS_DROPOUT_PROB: 0.
#     NUM_HIDDEN_LAYERS: 6
#     NUM_GENERATION_LAYERS: 0
#     DROP_PATH_PROB_FIXED: True
#   - NAME: TextEncoder
#     TYPE: TextEncoder
#     DROP_PATH_PROB: 0.0
#     HIDDEN_SIZE: 192
#     HIDDEN_DROPOUT_PROB: 0.
#     HIDDEN_ACT: "gelu"
#     NUM_ATTENTION_HEADS: 3
#     INTERMEDIATE_SIZE: 768
#     INTERMEDIATE_DROP: 0.
#     FFN_DROPOUT_PROB: 0.
#     ATTENTION_PROBS_DROPOUT_PROB: 0.
#     NUM_HIDDEN_LAYERS: 6
#     NUM_GENERATION_LAYERS: 0
#     DROP_PATH_PROB_FIXED: True