# Parent config reference. NOTE(review): presumably resolved by the config
# loader (not by YAML itself) relative to this file - confirm.
_BASE_: "base_model_bert_l12_h192.yaml"

# Named target sets. Task entries in this file select one by NAME through
# their DATASETS.TARGET_SET list.
SHARED_TARGETS:
- NAME: 'ImageNet1k'
  SHARED_TARGETS_CFG:
    FILE_PATH: 'open_source_dataset/imagenet_class_name_CLIP_with_endoftext.pkl'
    DISTRIBUTED: False

- NAME: 'Vocab_Word'
  SHARED_TARGETS_CFG:
    FILE_PATH: 'open_source_dataset/vocabulary_CLIP_with_endoftext.pkl'
    DISTRIBUTED: True

# One list entry per task. Each entry carries its own DATASETS, DATALOADER,
# MODEL, LOSSES and INFERENCE sub-sections.
TASKS:

# Image classification on ImageNet1k; targets come from the 'ImageNet1k'
# shared target set.
- NAME: imagenet
  DATASETS:
    TRAIN: 'ImageNetDataset'
    TASK_TYPE: 'image_classification'
    DATASET_NAME: 'ImageNet1k'
    TARGET_SET: ['ImageNet1k']
  DATALOADER:
    TRAIN_BATCH_SIZE: 4
    NUM_WORKERS: 4
    FEATS_FOLDER: 'open_source_dataset/imagenet'
    S3_PATH: 'cluster2:s3://imagenet'
    ANNO_FOLDER: 'open_source_dataset/imagenet/meta'
    SAMPLING_WEIGHT: 1.0
    CLASS_NAME_FILE: 'open_source_dataset/imagenet_class_name.pkl'
    # Mixup / CutMix augmentation settings.
    MIXUP: 0.8
    CUTMIX: 1.0
    MIXUP_PROB: 1.0
    MIXUP_SWITCH_PROB: 0.5
    MIXUP_MODE: 'batch'
    MIXUP_LABEL_SMOOTHING: 0.1
  MODEL:
    MAX_SEQ_LEN: -1
    LABELS_NUM: 1000
    TEMP_NAME: logit_scale_img_cls
  LOSSES:
    NAMES: ['SoftTargetCrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    NAME: 'ImageNetEvaler'
    ID_KEY: 'image_id'
    VALUE: 'cls_logits'
    VAL_ANNFILE: 'open_source_dataset/imagenet/meta/val.txt'
    TEST_ANNFILE: ''
    GENERATION_MODE: False

# Image captioning on MSCOCO; targets come from the 'Vocab_Word' shared
# target set. Generation is evaluated with beam search (BEAM_SIZE 2).
- NAME: mscoco_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'MSCOCO'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 4
    FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
    ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
    S3_PATH: 's3://coco/'
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
    RANDOM_MASK: True
  MODEL:
    MAX_SEQ_LEN: 50
    EVAL_MAX_SEQ_LEN: 21
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 0.33333
    REDUCTION: 'mean'
  DECODE_STRATEGY:
    NAME: 'CaptionBeamSearcherV3'
    BEAM_SIZE: 2
  INFERENCE:
    NAME: 'COCOEvaler'
    VOCAB: 'CLIP'
    ID_KEY: 'image_id'
    VALUE: 'caption'
    VAL_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_val5k.json'
    TEST_ANNFILE: 'open_source_dataset/mscoco_dataset/new_annotations/captions_test5k.json'
    GENERATION_MODE: True

# Image captioning on YFCC; targets come from the 'Vocab_Word' shared
# target set. No evaluator is configured for this entry.
- NAME: yfcc_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'YFCC'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
    ANNO_FOLDER: 'open_source_dataset/yfcc'
    ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
    FEATS_FOLDER: 'open_source_dataset/yfcc/'
    S3_PATH: 'cluster2:s3://yfcc/'
    SEQ_PER_SAMPLE: 1
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: True
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image captioning on CC12M; targets come from the 'Vocab_Word' shared
# target set. No evaluator is configured for this entry.
- NAME: cc12m_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'CC12M'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 's3://cc12m/'
    # NOTE(review): local folders are spelled 'c12m' while the S3 paths use
    # 'cc12m' - confirm the directory name on disk.
    ANNO_FOLDER: 'open_source_dataset/c12m/'
    ANNO_FILENAME: 'train_available.json'
    FEATS_FOLDER: 'open_source_dataset/c12m/'
    S3_PATH: 's3://cc12m/'
    SEQ_PER_SAMPLE: 1
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image captioning on CC3M; targets come from the 'Vocab_Word' shared
# target set. No evaluator is configured for this entry.
- NAME: cc3m_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'CC3M'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 's3://cc3m/'
    ANNO_FOLDER: 'open_source_dataset/cc3m/'
    ANNO_FILENAME: 'train_spacy.json'
    FEATS_FOLDER: 'open_source_dataset/cc3m/'
    S3_PATH: 's3://cc3m/'
    SEQ_PER_SAMPLE: 1
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image captioning on SBU; targets come from the 'Vocab_Word' shared
# target set. No evaluator is configured for this entry.
- NAME: sbu_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'SBU'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 1
    S3_ANNO_FOLDER: 's3://SBU/annotations'
    ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
    ANNO_FILENAME: 'subcaption.json'
    FEATS_FOLDER: 'open_source_dataset/sbucaption/'
    S3_PATH: 's3://SBU/images'
    SEQ_PER_SAMPLE: 1
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image captioning on VG; targets come from the 'Vocab_Word' shared target
# set. MAX_SEQ_LEN is 30 here, shorter than the 50 used by the other
# caption pre-training entries in this file.
- NAME: vg_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    DATASET_NAME: 'VG'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
    ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
    S3_PATH: 's3://visual_genome/images'
    ANNO_FILENAME: 'vg_captions_128filter.json'
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 30
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['CrossEntropy', 'Accuracy']
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    # NOTE(review): GENERATION_MODE is True although no evaluator NAME or
    # annotation files are set for this entry - confirm this is intended.
    GENERATION_MODE: True

# Image-text retrieval on MSCOCO (no TARGET_SET; trained with
# LabelSmoothingCrossEntropy).
- NAME: mscoco_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'MSCOCO'
  DATALOADER:
    TRAIN_BATCH_SIZE: 100
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 1
    FEATS_FOLDER: 'open_source_dataset/mscoco_dataset/coco_origin'
    ANNO_FOLDER: 'open_source_dataset/mscoco_dataset/new_annotations'
    S3_PATH: 's3://coco/'
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    ID_KEY: 'image_id'
    VALUE: 'caption'
    NAME: 'RetrievalEvaler'
    # NOTE(review): these annotation files live under flickr30k/ although
    # this entry's DATASET_NAME is 'MSCOCO' - confirm the paths.
    VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
    TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
    GENERATION_MODE: False

# Image-text retrieval on YFCC (no TARGET_SET; LOSS_WEIGHT 0.5).
- NAME: yfcc_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'YFCC'
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 'cluster2:s3://yfcc'
    ANNO_FOLDER: 'open_source_dataset/yfcc'
    ANNO_FILENAME: 'yfcc100m_subset_available_untokenized.json'
    FEATS_FOLDER: 'open_source_dataset/yfcc/'
    S3_PATH: 'cluster2:s3://yfcc/'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: True
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image-text retrieval on CC12M (no TARGET_SET; LOSS_WEIGHT 0.5).
- NAME: cc12m_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'CC12M'
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 's3://cc12m/'
    # NOTE(review): local folders are spelled 'c12m' while the S3 paths use
    # 'cc12m' - confirm the directory name on disk.
    ANNO_FOLDER: 'open_source_dataset/c12m/'
    ANNO_FILENAME: 'train_available.json'
    FEATS_FOLDER: 'open_source_dataset/c12m/'
    S3_PATH: 's3://cc12m/'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image-text retrieval on CC3M (no TARGET_SET; LOSS_WEIGHT 0.5).
- NAME: cc3m_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'CC3M'
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    S3_ANNO_FOLDER: 's3://cc3m/'
    ANNO_FOLDER: 'open_source_dataset/cc3m/'
    ANNO_FILENAME: 'train_spacy.json'
    FEATS_FOLDER: 'open_source_dataset/cc3m/'
    S3_PATH: 's3://cc3m/'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image-text retrieval on VG (no TARGET_SET; LOSS_WEIGHT 0.5,
# MAX_SEQ_LEN 30).
- NAME: vg_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'VG'
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 2
    FEATS_FOLDER: 'open_source_dataset/visual_genome/images'
    ANNO_FOLDER: 'open_source_dataset/visual_genome/annotations'
    S3_PATH: 's3://visual_genome/images'
    ANNO_FILENAME: 'vg_captions_128filter.json'
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 30
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image-text retrieval on SBU (no TARGET_SET; LOSS_WEIGHT 0.5).
- NAME: sbu_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    DATASET_NAME: 'SBU'
  DATALOADER:
    TRAIN_BATCH_SIZE: 64
    TEST_BATCH_SIZE: 32
    NUM_WORKERS: 1
    S3_ANNO_FOLDER: 's3://SBU/annotations'
    ANNO_FOLDER: 'open_source_dataset/sbucaption/annotations'
    ANNO_FILENAME: 'subcaption.json'
    FEATS_FOLDER: 'open_source_dataset/sbucaption/'
    S3_PATH: 's3://SBU/images'
    SAMPLER: NodeDistributed
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 50
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 0.5
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    GENERATION_MODE: False

# Image-text retrieval on FLICKR with both TRAIN and TEST datasets and a
# RetrievalEvaler. Only the loss is listed in LOSSES.NAMES (no 'Accuracy').
- NAME: flickr30k_retrieve
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_retrieval'
    TEST: 'ImageTextPairDataset'
    DATASET_NAME: 'FLICKR'
  DATALOADER:
    TRAIN_BATCH_SIZE: 128
    TEST_BATCH_SIZE: 128
    NUM_WORKERS: 2
    FEATS_FOLDER: 'open_source_dataset/flickr30k_images/flickr30k_images/flickr30k_images'
    ANNO_FOLDER: 'open_source_dataset/flickr30k'
    S3_PATH: "s3://open_dataset/flickr30k/flickr30k_images"
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
  MODEL:
    MAX_SEQ_LEN: 77
    TEMP_NAME: logit_scale_retrieve
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  INFERENCE:
    VOCAB: 'CLIP'
    ID_KEY: 'image_id'
    VALUE: 'caption'
    NAME: 'RetrievalEvaler'
    VAL_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_val_set0_2014.jsonline'
    TEST_ANNFILE: 'open_source_dataset/flickr30k/all_data_final_test_set0_2014.jsonline'
    GENERATION_MODE: False

# Image captioning on FLICKR with both TRAIN and TEST datasets; targets come
# from the 'Vocab_Word' shared target set. Generation is evaluated with beam
# search (BEAM_SIZE 2) and a COCOEvaler.
- NAME: flickr30k_caption
  DATASETS:
    TRAIN: 'ImageTextPairDataset'
    TASK_TYPE: 'image_caption'
    TEST: 'ImageTextPairDataset'
    DATASET_NAME: 'FLICKR'
    TARGET_SET: ['Vocab_Word']
  DATALOADER:
    TRAIN_BATCH_SIZE: 32
    TEST_BATCH_SIZE: 8
    NUM_WORKERS: 4
    FEATS_FOLDER: 'open_source_dataset/flickr30k_images/flickr30k_images/flickr30k_images'
    ANNO_FOLDER: 'open_source_dataset/flickr30k'
    S3_PATH: "s3://open_dataset/flickr30k/flickr30k_images"
    SEQ_PER_SAMPLE: 1
    CACHE_MODE: True
    CIRCULAR_CACHE_MODE: False
    ZIP_MODE: False
    CACHE_ORIGIN_IMAGE: False
    RANDOM_CAPTION: False
    AS_NUMPY_AS_POSSIBLE: False
    SAMPLING_WEIGHT: 1.0
    TRANSFORM: 'clip_transforms'
    # NOTE(review): this key directly follows TRANSFORM in the source, so it
    # is kept inside DATALOADER. Its value differs from DATASETS.TASK_TYPE
    # ('image_caption') - confirm both the placement and the value.
    TASK_TYPE: caption
  MODEL:
    MAX_SEQ_LEN: 21
    TEMP_NAME: logit_scale_caption
  LOSSES:
    NAMES: ['LabelSmoothingCrossEntropy', 'Accuracy']
    LABELSMOOTHING: 0.1
    LOSS_WEIGHT: 1.0
    REDUCTION: 'mean'
  DECODE_STRATEGY:
    NAME: 'CaptionBeamSearcherV3'
    BEAM_SIZE: 2
  INFERENCE:
    NAME: 'COCOEvaler'
    VOCAB: 'CLIP'
    ID_KEY: 'image_id'
    VALUE: 'caption'
    VAL_ANNFILE: 'open_source_dataset/flickr30k/captions_val.json'
    TEST_ANNFILE: 'open_source_dataset/flickr30k/captions_test.json'
    GENERATION_MODE: True

# Training engine selected by name.
ENGINE:
  NAME: 'UnifiedTrainer'

# Global model settings (distinct from the per-task MODEL sub-sections).
# NOTE(review): the source lost its indentation; the extent of the BERT
# sub-mapping below (through UNIFY_QKV) is reconstructed - confirm against
# the config schema.
MODEL:
  META_ARCHITECTURE: 'MultiTaskTransformerEncoder'
  ENCODER: 'UnifiedBertEncoder'

  IN_TUNING: True
  SHARE_LAYERNORM: True
  BERT:
    NORMALIZE_DECISION: "BERTPre"
    DROP_PATH_PROB: 0.1
    # NOTE(review): the _BASE_ file name mentions 'l12' while this sets a
    # single hidden layer - confirm this override is intentional.
    NUM_HIDDEN_LAYERS: 1
    DROP_PATH_PROB_FIXED: True

    UNIFY_QKV: True

  MODEL_EMA: False
  MODEL_EMA_DECAY: 0.9999

  MAEParamsInit: True
  POSEMBEDFIX: True

  IMG_INPUT_SIZE: 224
  PATCH_SIZE: 16

  LAYER_SCALE: True
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # resolve a bare '1e-3' as a string rather than a float.
  LAYER_SCALE_INIT: 1.0e-3

# Global data-loading settings (distinct from the per-task DATALOADER
# sub-sections).
DATALOADER:
  USE_WEIGHTED_SAMPLER: True
  UNIFIED_DATASET: True
  NUM_WORKERS: 16

  PADDING_TO_MAX: False

# Optimizer and training-loop settings.
SOLVER:
  NAME: 'Adam'
  TORCH_OPTIMIZER: True
  # NOTE(review): 'SEPERATE' looks like a misspelling of 'SEPARATE'; the key
  # is left unchanged because the consumer must agree on the name.
  PARAMS_SEPERATE: True

  MAX_ITER: 150000
  CHECKPOINT_PERIOD: 5000
  # NOTE(review): EVAL_PERIOD (500000) is larger than MAX_ITER (150000);
  # presumably periodic evaluation never triggers in this run - confirm.
  EVAL_PERIOD: 500000
  BASE_LR: 0.001
  BIAS_LR_FACTOR: 1.0
  WEIGHT_DECAY: 0.05
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_BIAS: 0.0
  WEIGHT_DECAY_EMBEDDING: 0.0
  MOMENTUM: 0.9
  DAMPENING: 0.0
  NESTEROV: 0.0
  BETAS: [0.9, 0.95]
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # resolve a bare '1e-6' as a string rather than a float.
  EPS: 1.0e-6
  GRAD_CLIP: 0.1
  GRAD_CLIP_TYPE: 'norm'
  ACCUM_ITER: 0
  AMP_FP16: True
  APEX_FP16: False

  WRITE_PERIOD: 50
  # NOTE(review): 'SCLE' looks like a misspelling of 'SCALE'; the key is
  # left unchanged because the consumer must agree on the name.
  MIN_LOSS_SCLE: 2048.0

  LOSS_SCALE_WINDOW: 200

# Learning-rate schedule: 'WarmupCosine' with WARMUP 5000 and a floor of
# MIN_LR.
LR_SCHEDULER:
  NAME: 'WarmupCosine'
  WARMUP: 5000
  MIN_LR: 0.000001

# Global inference settings (distinct from the per-task INFERENCE
# sub-sections).
INFERENCE:
  VOCAB: 'CLIP'
  ITER_BASED: True

# NOTE(review): presumably forwarded to the distributed data-parallel
# wrapper - confirm where this key is consumed.
find_unused_parameters: true