# Hosting-page residue, not part of the configuration:
# herrius's picture
# Upload 259 files
# 32b542e
# NOTE(review): presumably the parent config whose values this file overrides — confirm against the config loader.
_BASE_: "base_model_bert_l12_h768.yaml"
# Named target sets. Each entry pairs a NAME with a .pkl file path and a DISTRIBUTED flag.
# Task entries in this file refer to these names through DATASETS.TARGET_SET.
SHARED_TARGETS:
-
  NAME: 'ImageNet22k'
  SHARED_TARGETS_CFG:
    FILE_PATH: 'open_source_dataset/imagenet_22k_class_name_CLIP_with_endoftext.pkl'
    DISTRIBUTED: True
-
  NAME: 'Vocab_Word'
  SHARED_TARGETS_CFG:
    FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
    DISTRIBUTED: True
-
  NAME: 'MomentsInTime'
  SHARED_TARGETS_CFG:
    FILE_PATH: 'open_source_dataset/MiT_class_name_CLIP_with_endoftext.pkl'
    DISTRIBUTED: False
-
  NAME: 'Kinetics700'
  SHARED_TARGETS_CFG:
    FILE_PATH: 'open_source_dataset/k700_class_name_CLIP_with_endoftext.pkl'
    DISTRIBUTED: False
TASKS:
# Image classification on ImageNet22k against the 'ImageNet22k' target set.
# Only a TRAIN dataset is declared and the entry has no INFERENCE section.
-
  NAME: imagenet22k
  DATASETS:
    TRAIN: 'ImageNet22KDataset'
    TASK_TYPE: 'image_classification'
    DATASET_NAME: 'ImageNet22k'
    TARGET_SET: ['ImageNet22k']
  DATALOADER:
    TRAIN_BATCH_SIZE: 720
    # TEST_BATCH_SIZE: 2
    NUM_WORKERS: 2
    FEATS_FOLDER: 'open_source_dataset/imagenet22k'
    S3_PATH: 'cluster2:s3://imagenet22k'
    ANNO_FOLDER: 'open_source_dataset/'
    SAMPLING_WEIGHT: 2.486
    # Mixup / CutMix augmentation settings for this task.
    MIXUP: 0.8
    CUTMIX: 1.0
    MIXUP_PROB: 1.0
    MIXUP_SWITCH_PROB: 0.5
    MIXUP_MODE: 'batch'
    MIXUP_LABEL_SMOOTHING: 0.1
  MODEL:
    MAX_SEQ_LEN: -1
    LABELS_NUM: 21842
    TEMP_NAME: logit_scale_img_cls
  LOSSES:
    NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
# Video classification on 'K700' against the 'Kinetics700' target set.
# The entry is named "retrieve" but its TASK_TYPE is 'video_classification'.
-
  NAME: K700_retrieve
  DATASETS:
    TRAIN: 'VideoDataSet'
    TASK_TYPE: 'video_classification'
    DATASET_NAME: 'K700'
    TARGET_SET: ['Kinetics700']
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 24
    NUM_WORKERS: 2
    FEATS_FOLDER: 'open_source_dataset/K700'
    ANNO_FOLDER: 'open_source_dataset/K700'
    S3_PATH: 's3://K700/'
    FRAMES_PER_CLIP: 4
    STRIDE: 32
    FILE_EXTENSION: ''
    ANNO_FILE: 'annotation.json'
    TIMESFORMER_AUG: True
    SAMPLING_WEIGHT: 0.76
  MODEL:
    MAX_SEQ_LEN: -1
    TEMP_NAME: logit_scale_video_cls
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.1
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Video classification on 'MiT' against the 'MomentsInTime' target set.
-
  NAME: MomentsInTime
  DATASETS:
    TRAIN: 'VideoDataSet'
    TASK_TYPE: 'video_classification'
    DATASET_NAME: 'MiT'
    TARGET_SET: ['MomentsInTime']
  DATALOADER:
    TRAIN_BATCH_SIZE: 112
    TEST_BATCH_SIZE: 8
    NUM_WORKERS: 2
    FEATS_FOLDER: 'open_source_dataset/MomentsInTime'
    ANNO_FOLDER: 'open_source_dataset/MomentsInTime'
    S3_PATH: 's3://MomentsInTime/'
    FRAMES_PER_CLIP: 3
    STRIDE: 32
    FILE_EXTENSION: ''
    ANNO_FILE: 'annotation.json'
    TIMESFORMER_AUG: True
    SAMPLING_WEIGHT: 0.44
  MODEL:
    MAX_SEQ_LEN: -1
    TEMP_NAME: logit_scale_video_cls
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.1
  INFERENCE:
    NAME: 'MiTEvaler'
    ID_KEY: 'video_name'
    VALUE: 'label'
    VAL_ANNFILE: 'open_source_dataset/MomentsInTime/annotation.json'
    # No test annotation file is configured (empty string).
    TEST_ANNFILE: ''
    GENERATION_MODE: False
    NUM_VIEWS: 1
# Masked-language-model pre-training ('text_mlm') on 'BooksWiki' against the 'Vocab_Word' target set.
-
  NAME: bookswiki_pretrain
  DATASETS:
    TRAIN: 'GeneralCorpusDataset'
    TASK_TYPE: 'text_mlm'
    DATASET_NAME: 'BooksWiki'
    TARGET_SET: ['Vocab_Word']
    VERSION: 'v2'
  DATALOADER:
    TRAIN_BATCH_SIZE: 512
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    ANNO_FOLDER: 'open_source_dataset/text_corpus'  # 'open_source_dataset/bert_pretrain_data/bookswiki'
    # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    # SEQ_PER_SAMPLE used to appear twice in this mapping (1, then 128). Duplicate keys are
    # invalid YAML and last-wins loaders kept 128, so only that value is retained here.
    SEQ_PER_SAMPLE: 128
    MIN_SEQ_PER_SAMPLE: 128
    APPEND_EOS: True
    ONE_STREAM: False
    SAMPLING_WEIGHT: 2.75
    RANDOM_MASK: True
  MODEL:
    MAX_SEQ_LEN: 128
    TEMP_NAME: logit_scale_text_mlm
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image captioning on 'YFCC' against the 'Vocab_Word' target set.
-
  NAME: yfcc_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'YFCC'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 300
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
    ANNO_FOLDER: 'open_source_dataset/yfcc'
    ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
    FEATS_FOLDER: 'open_source_dataset/yfcc/'
    S3_PATH: 'cluster2:s3://yfcc/'
    SEQ_PER_SAMPLE: 1
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: True
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.5840
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image captioning on 'CC12M' against the 'Vocab_Word' target set.
-
  NAME: cc12m_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'CC12M'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 300
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 's3://cc12m/'
    # NOTE(review): local folders are spelled 'c12m' while the S3 paths use 'cc12m' — confirm the on-disk name.
    ANNO_FOLDER: 'open_source_dataset/c12m/'
    ANNO_FILENAME: 'train_available.json'
    FEATS_FOLDER: 'open_source_dataset/c12m/'
    S3_PATH: 's3://cc12m/'
    SEQ_PER_SAMPLE: 1
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.5057
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image captioning on 'CC3M' against the 'Vocab_Word' target set.
-
  NAME: cc3m_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'CC3M'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 300
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 's3://cc3m/'
    ANNO_FOLDER: 'open_source_dataset/cc3m/'
    ANNO_FILENAME: 'train_spacy.json'
    FEATS_FOLDER: 'open_source_dataset/cc3m/'
    S3_PATH: 's3://cc3m/'
    SEQ_PER_SAMPLE: 1
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.26295
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image captioning on 'VG' against the 'Vocab_Word' target set.
-
  NAME: vg_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'VG'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 300
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
    ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
    S3_PATH: 's3://visual_genome/images'
    ANNO_FILENAME: 'vg_captions_128filter.json'
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.1766
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 30
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    # NOTE(review): generation is enabled here, but unlike mscoco_caption this entry declares
    # no DECODE_STRATEGY and no evaluator NAME — confirm that is intended.
    GENERATION_MODE: True
# Image captioning on 'MSCOCO' against the 'Vocab_Word' target set.
# This entry declares a decode strategy and an evaluator with validation/test annotation files.
-
  NAME: mscoco_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    # VAL: 'ImageTextPairDataset'
    # TEST: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'MSCOCO'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 300
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 1
    FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
    ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
    S3_PATH: 's3://coco/'
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.1144
    TRANSFORM: 'clip_transforms'
    RANDOM_MASK: True
  MODEL:
    MAX_SEQ_LEN: 50
    EVAL_MAX_SEQ_LEN: 21
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  DECODE_STRATEGY:
    NAME: 'CaptionBeamSearcherV3'
    BEAM_SIZE: 2
    # LEN_PENALTY: 1.0
  INFERENCE:
    NAME: 'COCOEvaler'
    VOCAB: 'CLIP'
    ID_KEY: 'image_id'
    VALUE: 'caption'
    VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
    TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
    GENERATION_MODE: True
# Image captioning on 'SBU' against the 'Vocab_Word' target set.
-
  NAME: sbu_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'SBU'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 300
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 1
    S3_ANNO_FOLDER: 's3://SBU/annotations'
    ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
    # NOTE(review): file is 'subcaption.json' while the folders are 'sbucaption' — confirm the real filename.
    ANNO_FILENAME: 'subcaption.json'
    FEATS_FOLDER: 'open_source_dataset/sbucaption/'
    S3_PATH: 's3://SBU/images'
    SEQ_PER_SAMPLE: 1
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.1383
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image-text retrieval on 'YFCC'. No TARGET_SET is declared for this entry.
-
  NAME: yfcc_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'YFCC'
  DATALOADER:
    TRAIN_BATCH_SIZE: 512
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
    ANNO_FOLDER: 'open_source_dataset/yfcc'
    ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
    FEATS_FOLDER: 'open_source_dataset/yfcc/'
    S3_PATH: 'cluster2:s3://yfcc/'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: True
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.5840
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image-text retrieval on 'CC12M'. No TARGET_SET is declared for this entry.
-
  NAME: cc12m_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'CC12M'
  DATALOADER:
    TRAIN_BATCH_SIZE: 512
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 's3://cc12m/'
    # NOTE(review): local folders are spelled 'c12m' while the S3 paths use 'cc12m' — confirm the on-disk name.
    ANNO_FOLDER: 'open_source_dataset/c12m/'
    ANNO_FILENAME: 'train_available.json'
    FEATS_FOLDER: 'open_source_dataset/c12m/'
    S3_PATH: 's3://cc12m/'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.5057
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image-text retrieval on 'CC3M'. No TARGET_SET is declared for this entry.
-
  NAME: cc3m_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'CC3M'
  DATALOADER:
    TRAIN_BATCH_SIZE: 512
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 's3://cc3m/'
    ANNO_FOLDER: 'open_source_dataset/cc3m/'
    ANNO_FILENAME: 'train_spacy.json'
    FEATS_FOLDER: 'open_source_dataset/cc3m/'
    S3_PATH: 's3://cc3m/'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.26295
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image-text retrieval on 'VG'. No TARGET_SET is declared for this entry.
-
  NAME: vg_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'VG'
  DATALOADER:
    TRAIN_BATCH_SIZE: 512
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
    ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
    S3_PATH: 's3://visual_genome/images'
    ANNO_FILENAME: 'vg_captions_128filter.json'
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.1766
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 30
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Image-text retrieval on 'MSCOCO'. No TARGET_SET is declared for this entry.
-
  NAME: mscoco_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    # TEST: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'MSCOCO'
  DATALOADER:
    TRAIN_BATCH_SIZE: 512
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 1
    FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
    ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
    S3_PATH: 's3://coco/'
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.1144
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    ID_KEY: 'image_id'
    VALUE: 'caption'
    NAME: 'RetrievalEvaler'
    # NOTE(review): this is the MSCOCO task, yet both annotation files live under 'flickr30k/' —
    # confirm these are the intended evaluation files.
    VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
    TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
    GENERATION_MODE: False
# Image-text retrieval on 'SBU'. No TARGET_SET is declared for this entry.
-
  NAME: sbu_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'SBU'
  DATALOADER:
    TRAIN_BATCH_SIZE: 512
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 1
    S3_ANNO_FOLDER: 's3://SBU/annotations'
    ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
    # NOTE(review): file is 'subcaption.json' while the folders are 'sbucaption' — confirm the real filename.
    ANNO_FILENAME: 'subcaption.json'
    FEATS_FOLDER: 'open_source_dataset/sbucaption/'
    S3_PATH: 's3://SBU/images'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 0.1383
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False
# Names the training engine used for this run.
ENGINE:
  NAME: 'UnifiedTrainer'
# Model-wide settings (architecture, encoder, EMA, input/patch size, layer scale).
# NOTE(review): indentation was lost in this copy, so which of the keys following `BERT:` belong to
# the BERT sub-mapping cannot be determined from this file alone — restore the nesting from the
# upstream config before use. The keys are left exactly as found.
MODEL:
META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
ENCODER: 'UnifiedBertEncoder'
SHARE_LAYERNORM: True
BERT:
NORMALIZE_DECISION: "BERTPre"
DROP_PATH_PROB: 0.1
DROP_PATH_PROB_FIXED: True
UNIFY_QKV: True
# EMA is switched off; the decay value below is still set.
MODEL_EMA: False
MODEL_EMA_DECAY: 0.9999
MAEParamsInit: True
POSEMBEDFIX: True
IMG_INPUT_SIZE: 160
PATCH_SIZE: 16
LAYER_SCALE: True
# NOTE(review): `1e-3` has no decimal point; some YAML 1.1 loaders read such a scalar as a string
# rather than a float — confirm the consumer converts it.
LAYER_SCALE_INIT: 1e-3
# Global data-loading settings (the per-task DATALOADER sections sit inside each TASKS entry).
DATALOADER:
  USE_WEIGHTED_SAMPLER: True
  UNIFIED_DATASET: True
  NUM_WORKERS: 32
  PADDING_TO_MAX: False  # True for debugging or token moe with distributed moe
####################################### Optimizer #######################################
# Optimizer, training-length, checkpointing and mixed-precision settings.
SOLVER:
  NAME: 'Adam'
  TORCH_OPTIMIZER: True
  # NOTE(review): 'SEPERATE' spelling kept as found; presumably the consumer looks the key up by
  # this exact name — confirm before renaming.
  PARAMS_SEPERATE: True
  # PARAMS_GROUP: True
  # EPOCH: 1
  MAX_ITER: 400000
  CHECKPOINT_PERIOD: 5000
  # NOTE(review): larger than MAX_ITER; if this is counted in iterations, periodic evaluation
  # never fires during the run — confirm that is intended.
  EVAL_PERIOD: 10000000
  BASE_LR: 0.001
  BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY: 0.2
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_BIAS: 0.0
  WEIGHT_DECAY_EMBEDDING: 0.0
  MOMENTUM: 0.9
  DAMPENING: 0.0
  NESTEROV: 0.0
  BETAS: [0.9, 0.95]
  # Written with a decimal point so YAML 1.1 loaders resolve a float instead of the string '1e-6'.
  EPS: 1.0e-6
  GRAD_CLIP: 0.1
  GRAD_CLIP_TYPE: 'norm'
  ACCUM_ITER: 0
  AMP_FP16: True
  APEX_FP16: False  # dangerous
  WRITE_PERIOD: 50
  # NOTE(review): 'SCLE' spelling kept as found; presumably matched by name in the consumer — confirm.
  MIN_LOSS_SCLE: 2048.0
  # BF16: False # True
  # ZEROSTAGE: 2
  LOSS_SCALE_WINDOW: 200
####################################### lr scheduler #######################################
# Learning-rate schedule: 'WarmupCosine' with a warmup setting and a minimum learning rate.
LR_SCHEDULER:
  NAME: 'WarmupCosine'
  WARMUP: 10000
  MIN_LR: 0.000001
####################################### evaluation #######################################
# Global evaluation settings (the per-task INFERENCE sections sit inside each TASKS entry).
INFERENCE:
  VOCAB: 'CLIP'
  ITER_BASED: True
# NOTE(review): presumably forwarded to the distributed-training wrapper's option of the same name — confirm in the trainer.
find_unused_parameters: true
# ENCODERS:
# -
# NAME: VisualEncoder
# TYPE: VisualEncoder
# DROP_PATH_PROB: 0.0
# HIDDEN_SIZE: 192
# HIDDEN_DROPOUT_PROB: 0.
# HIDDEN_ACT: "gelu"
# NUM_ATTENTION_HEADS: 3
# INTERMEDIATE_SIZE: 768
# INTERMEDIATE_DROP: 0.
# FFN_DROPOUT_PROB: 0.
# ATTENTION_PROBS_DROPOUT_PROB: 0.
# NUM_HIDDEN_LAYERS: 6
# NUM_GENERATION_LAYERS: 0
# DROP_PATH_PROB_FIXED: True
# -
# NAME: TextEncoder
# TYPE: TextEncoder
# DROP_PATH_PROB: 0.0
# HIDDEN_SIZE: 192
# HIDDEN_DROPOUT_PROB: 0.
# HIDDEN_ACT: "gelu"
# NUM_ATTENTION_HEADS: 3
# INTERMEDIATE_SIZE: 768
# INTERMEDIATE_DROP: 0.
# FFN_DROPOUT_PROB: 0.
# ATTENTION_PROBS_DROPOUT_PROB: 0.
# NUM_HIDDEN_LAYERS: 6
# NUM_GENERATION_LAYERS: 0
# DROP_PATH_PROB_FIXED: True