_BASE_: "base_model_bert_l12_h192.yaml"

SHARED_TARGETS:
# - | |
# NAME: 'ImageNet1k' | |
# SHARED_TARGETS_CFG: | |
# FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl' | |
# DISTRIBUTED: False | |
  -
    NAME: 'Vocab_Word'
    SHARED_TARGETS_CFG:
      FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
      DISTRIBUTED: True
# - | |
# NAME: 'Kinetics400' | |
# SHARED_TARGETS_CFG: | |
# FILE_PATH: 'open_source_dataset/k400_class_name_CLIP_with_endoftext.pkl' | |
# DISTRIBUTED: False | |
TASKS:
# - | |
# NAME: imagenet | |
# DATASETS: | |
# TRAIN: 'ImageNetDataset' | |
# VAL: 'ImageNetDataset' | |
# TASK_TYPE: 'image_classification' | |
# DATASET_NAME: 'ImageNet1k' | |
# TARGET_SET: ['ImageNet1k'] | |
# DATALOADER: | |
# TRAIN_BATCH_SIZE: 720 | |
# # TEST_BATCH_SIZE: 2 | |
# NUM_WORKERS: 4 | |
# FEATS_FOLDER: 'cluster2:s3://imagenet' | |
# ANNO_FOLDER: 'open_source_dataset/imagenet/meta' | |
# SAMPLING_WEIGHT: 2.5 | |
# CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl' | |
# MIXUP: 0.8 | |
# CUTMIX: 1.0 | |
# MIXUP_PROB: 1.0 | |
# MIXUP_SWITCH_PROB: 0.5 | |
# MIXUP_MODE: 'batch' | |
# MIXUP_LABEL_SMOOTHING: 0.1 | |
# MODEL: | |
# MAX_SEQ_LEN: -1 | |
# LABELS_NUM: 1000 | |
# TEMP_NAME: logit_scale_img_cls | |
# LOSSES: | |
# NAMES: ['SoftTargetCrossEntropy', 'Accuracy'] | |
# LOSS_WEIGHT: 1.0 | |
# REDUCTION: 'mean' | |
# # LOSS_FP32: True | |
# INFERENCE: | |
# NAME: 'ImageNetEvaler' | |
# ID_KEY: 'image_id' | |
# VALUE: 'cls_logits' | |
# VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt' | |
# TEST_ANNFILE: '' | |
# GENERATION_MODE: False | |
# - | |
# NAME: K400_retrieve | |
# DATASETS: | |
# TRAIN: 'VideoDataSet' | |
# VAL: 'VideoDataSet' | |
# TASK_TYPE: 'video_classification' | |
# DATASET_NAME: 'K400' | |
# TARGET_SET: ['Kinetics400'] | |
# DATALOADER: | |
# TRAIN_BATCH_SIZE: 12 # 256 | |
# TEST_BATCH_SIZE: 4 # debug | |
# NUM_WORKERS: 4 # debug 4 | |
# FEATS_FOLDER: 'open_source_dataset/K400_official' | |
# ANNO_FOLDER: 'open_source_dataset/K400_official' | |
# S3_PATH: 's3://K400/' | |
# FRAMES_PER_CLIP: 8 | |
# STRIDE: 32 | |
# FILE_EXTENSION: '' | |
# ANNO_FILE: 'annotation.json' | |
# TIMESFORMER_AUG: True | |
# SAMPLING_WEIGHT: 1.0 | |
# MODEL: | |
# MAX_SEQ_LEN: -1 | |
# TEMP_NAME: logit_scale_video_cls | |
# LOSSES: | |
# NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy'] | |
# LABELSMOOTHING: 0.1 | |
# LOSS_WEIGHT: 1.0 | |
# INFERENCE: | |
# NAME: 'MiTEvaler' | |
# ID_KEY: 'video_name' | |
# VALUE: 'label' | |
# VAL_ANNFILE: 'open_source_dataset/K400_official/annotation.json' | |
# TEST_ANNFILE: '' | |
# GENERATION_MODE: False | |
# NUM_VIEWS: 1 | |
# - | |
# NAME: bookswiki_pretrain | |
# DATASETS: | |
# TRAIN: 'GeneralCorpusDataset' | |
# TASK_TYPE: 'text_mlm' | |
# DATASET_NAME: 'BooksWiki' | |
# TARGET_SET: ['Vocab_Word'] | |
# VERSION: 'v2' | |
# DATALOADER: | |
# TRAIN_BATCH_SIZE: 512 | |
# TEST_BATCH_SIZE: 32 | |
# NUM_WORKERS: 2 | |
# ANNO_FOLDER: 'open_source_dataset/text_corpus' # 'open_source_dataset/bert_pretrain_data/bookswiki' | |
# # ANNO_FOLDER: 'open_source_dataset/bert_pretrain_data/bookswiki' | |
# SEQ_PER_SAMPLE: 1 | |
# SAMPLER: NodeDistributed | |
# CACHE_MODE: True | |
# SEQ_PER_SAMPLE: 128 | |
# MIN_SEQ_PER_SAMPLE: 128 | |
# APPEND_EOS: True | |
# ONE_STREAM: False | |
# SAMPLING_WEIGHT: 3.5 | |
# RANDOM_MASK: True | |
# MODEL: | |
# MAX_SEQ_LEN: 128 | |
# TEMP_NAME: logit_scale_text_mlm | |
# LOSSES: | |
# NAMES: ['CrossEntropy', 'Accuracy'] | |
# LOSS_WEIGHT: 0.33333 | |
# REDUCTION: 'mean' | |
# INFERENCE: | |
# VOCAB: 'CLIP' | |
# GENERATION_MODE: False | |
# - | |
# NAME: mscoco_retrieve | |
# DATASETS: | |
# TRAIN: 'ImageTextPairDataset' | |
# TEST: 'ImageTextPairDataset' | |
# TASK_TYPE: 'image_retrieval' | |
# DATASET_NAME: 'MSCOCO' | |
# DATALOADER: | |
# TRAIN_BATCH_SIZE: 100 | |
# TEST_BATCH_SIZE: 32 | |
# NUM_WORKERS: 1 | |
# FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin' | |
# ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations' | |
# S3_PATH: 's3://coco/' | |
# SEQ_PER_SAMPLE: 1 | |
# CACHE_MODE: True | |
# CIRCULAR_CACHE_MODE: False | |
# ZIP_MODE: False | |
# CACHE_ORIGIN_IMAGE: False | |
# RANDOM_CAPTION: False | |
# AS_NUMPY_AS_POSSIBLE: False | |
# SAMPLING_WEIGHT: 1.0 | |
# TRANSFORM: 'clip_transforms' | |
# MODEL: | |
# MAX_SEQ_LEN: 50 | |
# TEMP_NAME: logit_scale_retrieve | |
# LOSSES: | |
# NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy'] | |
# LABELSMOOTHING: 0.1 | |
# LOSS_WEIGHT: 1.0 | |
# REDUCTION: 'mean' | |
# INFERENCE: | |
# VOCAB: 'CLIP' | |
# ID_KEY: 'image_id' | |
# VALUE: 'caption' | |
# NAME: 'RetrievalEvaler' | |
# VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline' | |
# TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline' | |
# GENERATION_MODE: False | |
########## Image Captioning ########### | |
# - | |
# NAME: cc12m_caption | |
# DATASETS: | |
# TRAIN: 'ImageTextPairDataset' | |
# TASK_TYPE: 'image_caption' | |
# DATASET_NAME: 'CC12M' | |
# TARGET_SET: ['Vocab_Word'] | |
# DATALOADER: | |
# TRAIN_BATCH_SIZE: 300 | |
# TEST_BATCH_SIZE: 32 | |
# NUM_WORKERS: 2 | |
# S3_ANNO_FOLDER: 's3://cc12m/' | |
# ANNO_FOLDER: 'open_source_dataset/c12m/' | |
# ANNO_FILENAME: 'train_available.json' | |
# FEATS_FOLDER: 'open_source_dataset/c12m/' | |
# S3_PATH: 's3://cc12m/' | |
# SEQ_PER_SAMPLE: 1 | |
# SAMPLER: NodeDistributed | |
# CACHE_MODE: True | |
# CIRCULAR_CACHE_MODE: False | |
# ZIP_MODE: False | |
# CACHE_ORIGIN_IMAGE: False | |
# RANDOM_CAPTION: False | |
# AS_NUMPY_AS_POSSIBLE: False | |
# SAMPLING_WEIGHT: 1.6889 | |
# TRANSFORM: 'clip_transforms' | |
# MODEL: | |
# MAX_SEQ_LEN: 50 | |
# TEMP_NAME: logit_scale_caption | |
# LOSSES: | |
# NAMES: ['CrossEntropy', 'Accuracy'] | |
# LOSS_WEIGHT: 0.33333 | |
# REDUCTION: 'mean' | |
# INFERENCE: | |
# VOCAB: 'CLIP' | |
# GENERATION_MODE: False | |
# - | |
# NAME: cc3m_caption | |
# DATASETS: | |
# TRAIN: 'ImageTextPairDataset' | |
# TASK_TYPE: 'image_caption' | |
# DATASET_NAME: 'CC3M' | |
# TARGET_SET: ['Vocab_Word'] | |
# DATALOADER: | |
# TRAIN_BATCH_SIZE: 300 | |
# TEST_BATCH_SIZE: 32 | |
# NUM_WORKERS: 2 | |
# ANNO_FOLDER: 's3://cc3m/' | |
# ANNO_FILENAME: 'train_spacy.json' | |
# FEATS_FOLDER: 'open_source_dataset/cc3m/' | |
# S3_PATH: 's3://cc3m/' | |
# SEQ_PER_SAMPLE: 1 | |
# SAMPLER: NodeDistributed | |
# CACHE_MODE: True | |
# CIRCULAR_CACHE_MODE: False | |
# ZIP_MODE: False | |
# CACHE_ORIGIN_IMAGE: False | |
# RANDOM_CAPTION: False | |
# AS_NUMPY_AS_POSSIBLE: False | |
# SAMPLING_WEIGHT: 0.8780 | |
# TRANSFORM: 'clip_transforms' | |
# MODEL: | |
# MAX_SEQ_LEN: 50 | |
# TEMP_NAME: logit_scale_caption | |
# LOSSES: | |
# NAMES: ['CrossEntropy', 'Accuracy'] | |
# LOSS_WEIGHT: 0.33333 | |
# REDUCTION: 'mean' | |
# INFERENCE: | |
# VOCAB: 'CLIP' | |
# GENERATION_MODE: False | |
# - | |
# NAME: vg_caption | |
# DATASETS: | |
# TRAIN: 'ImageTextPairDataset' | |
# TASK_TYPE: 'image_caption' | |
# DATASET_NAME: 'VG' | |
# TARGET_SET: ['Vocab_Word'] | |
# DATALOADER: | |
# TRAIN_BATCH_SIZE: 300 | |
# TEST_BATCH_SIZE: 32 | |
# NUM_WORKERS: 2 | |
# FEATS_FOLDER: 'open_source_dataset/visual_genome/images' | |
# ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations' | |
# S3_PATH: 's3://visual_genome/images' | |
# ANNO_FILENAME: 'vg_captions_128filter.json' | |
# SEQ_PER_SAMPLE: 1 | |
# CACHE_MODE: True | |
# CIRCULAR_CACHE_MODE: False | |
# ZIP_MODE: False | |
# CACHE_ORIGIN_IMAGE: False | |
# RANDOM_CAPTION: False | |
# AS_NUMPY_AS_POSSIBLE: False | |
# SAMPLING_WEIGHT: 0.5895 | |
# TRANSFORM: 'clip_transforms' | |
# MODEL: | |
# MAX_SEQ_LEN: 30 | |
# TEMP_NAME: logit_scale_caption | |
# LOSSES: | |
# NAMES: ['CrossEntropy', 'Accuracy'] | |
# LOSS_WEIGHT: 0.33333 | |
# REDUCTION: 'mean' | |
# INFERENCE: | |
# VOCAB: 'CLIP' | |
# GENERATION_MODE: True | |
  -
    NAME: mscoco_caption
    DATASETS:
      TRAIN: 'ImageTextPairDataset'
      # VAL: 'ImageTextPairDataset'
      TEST: 'ImageTextPairDataset'
      TASK_TYPE: 'image_caption'
      DATASET_NAME: 'MSCOCO'
      TARGET_SET: ['Vocab_Word']
    DATALOADER:
      TRAIN_BATCH_SIZE: 32
      TEST_BATCH_SIZE: 2
      NUM_WORKERS: 4
      FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
      ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
      S3_PATH: 's3://coco/'
      SEQ_PER_SAMPLE: 1
      CACHE_MODE: True
      CIRCULAR_CACHE_MODE: False
      ZIP_MODE: False
      CACHE_ORIGIN_IMAGE: False
      RANDOM_CAPTION: False
      AS_NUMPY_AS_POSSIBLE: False
      SAMPLING_WEIGHT: 0.3817
      TRANSFORM: 'clip_transforms'
      RANDOM_MASK: True
    MODEL:
      MAX_SEQ_LEN: 50
      EVAL_MAX_SEQ_LEN: 21
      TEMP_NAME: logit_scale_caption
      LOSSES:
        NAMES: ['CrossEntropy', 'Accuracy']
        LOSS_WEIGHT: 0.33333
        REDUCTION: 'mean'
      DECODE_STRATEGY:
        NAME: 'CaptionBeamSearcherV3'
        BEAM_SIZE: 2
        # LEN_PENALTY: 2.0
    INFERENCE:
      NAME: 'COCOEvaler'
      VOCAB: 'CLIP'
      ID_KEY: 'image_id'
      VALUE: 'caption'
      VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
      TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
      GENERATION_MODE: True
# - | |
# NAME: sbu_caption | |
# DATASETS: | |
# TRAIN: 'ImageTextPairDataset' | |
# TASK_TYPE: 'image_caption' | |
# DATASET_NAME: 'SBU' | |
# TARGET_SET: ['Vocab_Word'] | |
# DATALOADER: | |
# TRAIN_BATCH_SIZE: 300 | |
# TEST_BATCH_SIZE: 32 | |
# NUM_WORKERS: 1 | |
# S3_ANNO_FOLDER: 's3://SBU/annotations' | |
# ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations' | |
# ANNO_FILENAME: 'subcaption.json' | |
# FEATS_FOLDER: 'open_source_dataset/sbucaption/' | |
# S3_PATH: 's3://SBU/images' | |
# SEQ_PER_SAMPLE: 1 | |
# SAMPLER: NodeDistributed | |
# CACHE_MODE: True | |
# CIRCULAR_CACHE_MODE: False | |
# ZIP_MODE: False | |
# CACHE_ORIGIN_IMAGE: False | |
# RANDOM_CAPTION: False | |
# AS_NUMPY_AS_POSSIBLE: False | |
# SAMPLING_WEIGHT: 0.4618 | |
# TRANSFORM: 'clip_transforms' | |
# MODEL: | |
# MAX_SEQ_LEN: 50 | |
# TEMP_NAME: logit_scale_caption | |
# LOSSES: | |
# NAMES: ['CrossEntropy', 'Accuracy'] | |
# LOSS_WEIGHT: 0.33333 | |
# REDUCTION: 'mean' | |
# INFERENCE: | |
# VOCAB: 'CLIP' | |
# GENERATION_MODE: False | |
ENGINE:
  NAME: 'UnifiedTrainer'

MODEL:
  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
  ENCODER: 'UnifiedBertEncoder'
  IN_TUNING: True # use IN1k instead of 22k
  SHARE_LAYERNORM: True
  BERT:
    NORMALIZE_DECISION: "BERTPre"
    DROP_PATH_PROB: 0.0
    DROP_PATH_PROB_FIXED: True

  MODEL_EMA: False
  MODEL_EMA_DECAY: 0.9999

  MAEParamsInit: True
  POSEMBEDFIX: True

  IMG_INPUT_SIZE: 224
  PATCH_SIZE: 16

  LAYER_SCALE: True
  LAYER_SCALE_INIT: 1e-3
  LAYER_SCALE_FP32: True
  GATE_FP32: False
  TAG_TRANSFORM_FP32: False

DATALOADER:
  USE_WEIGHTED_SAMPLER: True
  UNIFIED_DATASET: True
  NUM_WORKERS: 32

  STRATEGY: 'turn'
  PADDING_TO_MAX: False # True for debugging or token moe with distributed moe
####################################### Optimizer #######################################
SOLVER:
  NAME: 'Adam'
  TORCH_OPTIMIZER: True
  PARAMS_SEPERATE: True # NOTE(review): key is spelled "SEPERATE"; must match what the trainer reads — confirm before renaming
  # PARAMS_GROUP: True
  # EPOCH: 1
  MAX_ITER: 150000
  CHECKPOINT_PERIOD: 5000
  EVAL_PERIOD: 500000
  BASE_LR: 0.001
  BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY: 0.05
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_BIAS: 0.0
  WEIGHT_DECAY_EMBEDDING: 0.0
  MOMENTUM: 0.9
  DAMPENING: 0.0
  NESTEROV: 0.0
  BETAS: [0.9, 0.95]
  EPS: 1e-6
  GRAD_CLIP: 0.1
  GRAD_CLIP_TYPE: 'norm'
  ACCUM_ITER: 0
  AMP_FP16: True
  APEX_FP16: False # dangerous
  WRITE_PERIOD: 50
  MIN_LOSS_SCLE: 2048.0 # NOTE(review): likely typo for MIN_LOSS_SCALE — verify against the solver's expected config keys
  # BF16: False # True
  # ZEROSTAGE: 2
  LOSS_SCALE_WINDOW: 200
  FORCE_SOFTMAX_FP16: True
  FORCE_LN_FP16: True
  FORCE_NORM_FP16: True
  # FORCE_TEMP_FP16: True
  FORCE_EMBED_FP16: True
####################################### lr scheduler #######################################
LR_SCHEDULER:
  NAME: 'WarmupCosine'
  WARMUP: 5000
  MIN_LR: 0.000001

####################################### evaluation #######################################
INFERENCE:
  VOCAB: 'CLIP'
  ITER_BASED: True

find_unused_parameters: true
# ENCODERS: | |
# - | |
# NAME: VisualEncoder | |
# TYPE: VisualEncoder | |
# DROP_PATH_PROB: 0.0 | |
# HIDDEN_SIZE: 192 | |
# HIDDEN_DROPOUT_PROB: 0. | |
# HIDDEN_ACT: "gelu" | |
# NUM_ATTENTION_HEADS: 3 | |
# INTERMEDIATE_SIZE: 768 | |
# INTERMEDIATE_DROP: 0. | |
# FFN_DROPOUT_PROB: 0. | |
# ATTENTION_PROBS_DROPOUT_PROB: 0. | |
# NUM_HIDDEN_LAYERS: 6 | |
# NUM_GENERATION_LAYERS: 0 | |
# DROP_PATH_PROB_FIXED: True | |
# - | |
# NAME: TextEncoder | |
# TYPE: TextEncoder | |
# DROP_PATH_PROB: 0.0 | |
# HIDDEN_SIZE: 192 | |
# HIDDEN_DROPOUT_PROB: 0. | |
# HIDDEN_ACT: "gelu" | |
# NUM_ATTENTION_HEADS: 3 | |
# INTERMEDIATE_SIZE: 768 | |
# INTERMEDIATE_DROP: 0. | |
# FFN_DROPOUT_PROB: 0. | |
# ATTENTION_PROBS_DROPOUT_PROB: 0. | |
# NUM_HIDDEN_LAYERS: 6 | |
# NUM_GENERATION_LAYERS: 0 | |
# DROP_PATH_PROB_FIXED: True | |
MOE:
  MOE: True
  MOE_TYPE: 'attribute'
  TAG_Transform: True
  ATTRIBUTE_LENGTH: 8
  EP_WORLD_SIZE: 1 # tag moe only
  NUM_EXPERTS: 8
  TOP_K: 2
  CAPACITY_FACTOR: 3.0
  EVAL_MIN_CAPACITY: 4.0
  MIN_CAPACITY: 4
  NOISY_GATE_POLICY: 'vmoe'
  MOE_PARAM_GROUP: True
  MOE_EXPERT_TYPE: 'FFN,SA'
  SA_LINEAR_OUT_MOE: True
  MOE_EXPERT_LOCATION: 'all' # 'odd'
  # MOE_LAYER_START_IDX: 3
  # MOE_LAYER_END_IDX: 21
  # MOE_LAYER_START_IDX: 18
  # MOE_LAYER_END_IDX: 12
  BATCH_PRIO: True
  USE_TUTEL: True
  FFN_SHARE_GATE_DECISION: True