# Page-scrape residue ("Spaces: Sleeping" status banner); not part of the configuration.
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
# Define Test/Trainer/Saving
PIPELINE: XDecoderPipeline
TRAINER: xdecoder
SAVE_DIR: './output'
base_path: "./"

# Resume logic
RESUME: false
WEIGHT: false
RESUME_FROM: ''
EVAL_AT_START: false
SAVE_CHECKPOINT: true

# Logging and Debug
WANDB: false
LOG_EVERY: 100
FIND_UNUSED_PARAMETERS: false

# Speed up training
FP16: false
# Kept quoted so the value stays a string instead of being parsed as an integer.
PORT: '36873'
# misc
# NOTE(review): indentation was lost in this copy. JOINT, KEY_DATASET,
# SAMPLE_PROB and MIXING_LEVEL are assumed to be children of LOADER, while
# RANDOM_SEED and STANDARD_TEXT_FOR_EVAL are assumed to be top-level keys.
# Confirm against the consuming code.
LOADER:
  JOINT: true
  KEY_DATASET: ""
  # Sampling probability proportional to data size.
  # Use "equal" for each batch from all datasets.
  SAMPLE_PROB: "prop"
  # Number of different datasets for batch mixing on each GPU.
  MIXING_LEVEL: 1

RANDOM_SEED: 2024
STANDARD_TEXT_FOR_EVAL: false
##################
# Task settings
##################
VERBOSE: true

# NOTE(review): indentation was lost in this copy; the hierarchy below
# (TEXT / BACKBONE.FOCAL / ENCODER / DECODER and its sub-sections) was
# reconstructed from key semantics. Without nesting, keys such as NAME,
# ENABLED, GROUNDING and CLASS_WEIGHT would be duplicates in one mapping.
# Confirm against the consuming code.
MODEL:
  NAME: seem_model_v1
  HEAD: xdecoder_head
  MASK_ON: false
  KEYPOINT_ON: false
  LOAD_PROPOSALS: false
  DIM_PROJ: 512

  # Text (language) encoder.
  TEXT:
    ARCH: vlpencoder
    NAME: transformer
    TOKENIZER: clip
    CONTEXT_LENGTH: 77  # 256 # 77
    WIDTH: 512  # 768 # 512
    HEADS: 8
    LAYERS: 12  # 6
    # Key spelling ("AUTOGRESSIVE") is kept verbatim: it is read by name.
    AUTOGRESSIVE: true

  # Image backbone.
  BACKBONE:
    NAME: focal  # focal_dw # focal
    PRETRAINED: ''
    LOAD_PRETRAINED: false
    FOCAL:
      PRETRAIN_IMG_SIZE: 224
      PATCH_SIZE: 4
      EMBED_DIM: 192  # 96 # 192
      DEPTHS: [2, 2, 18, 2]  # [2, 2, 6, 2] # [2, 2, 18, 2]
      FOCAL_LEVELS: [4, 4, 4, 4]  # [3, 3, 3, 3] # [4, 4, 4, 4]
      FOCAL_WINDOWS: [3, 3, 3, 3]
      DROP_PATH_RATE: 0.3
      MLP_RATIO: 4.0
      DROP_RATE: 0.0
      PATCH_NORM: true
      USE_CONV_EMBED: true
      SCALING_MODULATOR: true
      USE_CHECKPOINT: false
      USE_POSTLN: true
      USE_POSTLN_IN_MODULATION: false
      USE_LAYERSCALE: true
      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
      OUT_INDICES: [0, 1, 2, 3]

  # Pixel encoder.
  ENCODER:
    NAME: transformer_encoder_fpn
    IGNORE_VALUE: 255
    NUM_CLASSES: 16
    BINARY_CLASSES: false
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 512
    MASK_DIM: 512
    NORM: "GN"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6

  # Decoder: per-task switches first, then loss weights and dimensions.
  DECODER:
    NAME: seem_v1
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    MASK:
      ENABLED: true
    DETECTION: false
    SPATIAL:
      ENABLED: true
      MAX_ITER: 1
    GROUNDING:
      ENABLED: true
      MAX_LEN: 10
      TEXT_WEIGHT: 2.0
      CLASS_WEIGHT: 0.5
    RETRIEVAL:
      ENABLED: false
    LVIS:
      ENABLED: false
      THRES: 0.7
    OPENIMAGE:
      ENABLED: false
      NEGATIVE_SAMPLES: 5
      GROUNDING:
        ENABLED: false
        MAX_LEN: 5
    CAPTION:
      ENABLED: false
      PHRASE_PROB: 0.5
      SIM_THRES: 0.95
    DEEP_SUPERVISION: true
    NO_OBJECT_WEIGHT: 0.1
    GCLASS_WEIGHT: 0.4
    GMASK_WEIGHT: 1.0
    GDICE_WEIGHT: 1.0
    SCLASS_WEIGHT: 0.4
    SMASK_WEIGHT: 1.0
    SDICE_WEIGHT: 1.0
    OCLASS_WEIGHT: 0.4
    OMASK_WEIGHT: 1.0
    ODICE_WEIGHT: 1.0
    CLASS_WEIGHT: 2.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BBOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    CAPTION_WEIGHT: 2.0
    COST_SPATIAL:
      CLASS_WEIGHT: 5.0
      MASK_WEIGHT: 2.0
      DICE_WEIGHT: 2.0
    HIDDEN_DIM: 512
    NUM_OBJECT_QUERIES: 101
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    MAX_SPATIAL_LEN: [512, 512, 512, 512]
    # ENC_LAYERS: 0
    PRE_NORM: false
    ENFORCE_INPUT_PROJ: false
    SIZE_DIVISIBILITY: 32
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    # 9 decoder layers, add one for the loss on learnable query.
    DEC_LAYERS: 10
    TOP_GROUNDING_LAYERS: 10
    TOP_CAPTION_LAYERS: 10
    TOP_SPATIAL_LAYERS: 10
    TOP_OPENIMAGE_LAYERS: 10
    TEST:
      SEMANTIC_ON: false
      INSTANCE_ON: false
      PANOPTIC_ON: false
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.8
      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: true
# Spatial sampler
# NOTE(review): indentation was lost in this copy; the per-shape sub-sections
# (CIRCLE / SCRIBBLE / POINT / POLYGON / EVAL) were reconstructed from key
# semantics. Confirm against the consuming code.
STROKE_SAMPLER:
  MAX_CANDIDATE: 1
  CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25]  # for training only
  CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"]
  DILATION: 3
  CIRCLE:
    NUM_STROKES: 5
    STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small']
    STROKE_PROB: [0.33, 0.33, 0.33]
  SCRIBBLE:
    NUM_STROKES: 5
    STROKE_PRESET: ['rand_curve', 'rand_curve_small']
    STROKE_PROB: [0.5, 0.5]
  POINT:
    NUM_POINTS: 20
  POLYGON:
    MAX_POINTS: 9
  EVAL:
    MODE: 'best'  # best/random/best_random
    NEGATIVE: false
    MAX_ITER: 1
    IOU_ITER: 1
    GROUNDING: true
# Multi-modal Architecture, order matters
# NOTE(review): indentation was lost in this copy; the grouping under
# VARIABLE / SELF_ATTENTION / CROSS_ATTENTION / DUPLICATION was reconstructed
# from key semantics (queries / tokens / memories repeat in each group).
# SPATIAL_MEMORIES and QUERY_NUMBER are assumed to belong to ATTENTION_ARCH.
# Confirm against the consuming code.
ATTENTION_ARCH:
  VARIABLE:
    queries: ['object', 'grounding', 'spatial']
    tokens: ['grounding', 'spatial']
    memories: ['spatial']
  SELF_ATTENTION:
    queries:
      object: ['queries_object']
      grounding: ['queries_grounding', 'tokens_grounding']
      spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial']
    tokens:
      grounding: ['queries_grounding', 'tokens_grounding']
      spatial: ['tokens_spatial']
    memories:
      spatial: ['memories_spatial']
  CROSS_ATTENTION:
    queries:
      object: true
      grounding: true
      spatial: true
    memories:
      spatial: true
    tokens:
      grounding: false
      spatial: false
  MASKING: ['tokens_spatial', 'tokens_grounding']
  DUPLICATION:
    queries:
      grounding: 'queries_object'
      spatial: 'queries_object'
  SPATIAL_MEMORIES: 32
  QUERY_NUMBER: 3
# Registered dataset names used for training and testing.
# NOTE(review): indentation was lost in this copy; all keys below are assumed
# to be children of DATASETS. Confirm against the consuming code.
DATASETS:
  TRAIN:
    # Add your registered training datasets here.
    - 'biomed_BiomedParseData-Demo_demo'
  TEST:
    # Add your registered test datasets here.
    - 'biomed_BiomedParseData-Demo_demo'
  CLASS_CONCAT: false
  SIZE_DIVISIBILITY: 32
  PROPOSAL_FILES_TRAIN: []
# Per-channel pixel normalization statistics (three values each).
# NOTE(review): nesting under INPUT is reconstructed; a second, different
# PIXEL_MEAN / PIXEL_STD pair appears later in the BioMed section. Confirm
# which one the consumer reads.
INPUT:
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
# Training batch settings.
# NOTE(review): nesting under TRAIN is reconstructed from key semantics.
TRAIN:
  ASPECT_RATIO_GROUPING: true
  BATCH_SIZE_TOTAL: 4
  BATCH_SIZE_PER_GPU: 4
  SHUFFLE: true
# Evaluation settings.
# NOTE(review): nesting under TEST (including AUG.ENABLED) is reconstructed
# from key semantics. Confirm against the consuming code.
TEST:
  DETECTIONS_PER_IMAGE: 100
  NAME: coco_eval
  IOU_TYPE: ['bbox', 'segm']
  USE_MULTISCALE: false
  BATCH_SIZE_TOTAL: 4
  MODEL_FILE: ''
  AUG:
    ENABLED: false
# Data loader settings.
# NOTE(review): nesting under DATALOADER is reconstructed from key semantics.
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: false
  NUM_WORKERS: 8
  LOAD_PROPOSALS: false
  SAMPLER_TRAIN: "TrainingSampler"
  ASPECT_RATIO_GROUPING: true
# Dataset-specific input pipeline settings for the "biomed" dataset.
# NOTE(review): indentation was lost in this copy; the INPUT / INPUT.CROP /
# DATASET grouping was reconstructed from key semantics. Confirm against the
# dataset mapper that reads this section.
BioMed:
  INPUT:
    PIXEL_MEAN: [64.284, 59.293, 59.962]
    PIXEL_STD: [62.484, 60.865, 59.835]
    DATASET_MAPPER_NAME: "biomed_interactive"
    MIN_SIZE_TRAIN: 900
    MAX_SIZE_TRAIN: 1100
    MIN_SIZE_TRAIN_SAMPLING: 'choice'
    MIN_SIZE_TEST: 900
    MAX_SIZE_TEST: 1100
    IMAGE_SIZE: 1024
    MIN_SCALE: 0.9
    MAX_SCALE: 1.1
    IGNORE_VALUE: 255
    COLOR_AUG_SSD: false
    SIZE_DIVISIBILITY: 32
    # Quoted so "none" stays a string.
    RANDOM_FLIP: "none"
    RANDOM_ROTATE: false
    MASK_FORMAT: "polygon"
    MIN_AREA: 30
    FORMAT: "RGB"
    SPATIAL: true
    CROP:
      ENABLED: true
  DATASET:
    DATASET: "biomed"
# Detectron2 training config for optimizer and lr scheduler
# NOTE(review): indentation was lost in this copy; the LR_MULTIPLIER /
# FIX_PARAM / CLIP_GRADIENTS grouping was reconstructed from key semantics,
# and MAX_NUM_EPOCHS is assumed to belong to SOLVER. Confirm against the
# consuming code.
SOLVER:
  BASE_LR: 0.0001
  STEPS: [0.88889, 0.96296]
  MAX_ITER: 1
  GAMMA: 0.1
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 10
  WARMUP_METHOD: "linear"
  WEIGHT_DECAY: 0.05
  OPTIMIZER: "ADAMW"
  LR_SCHEDULER_NAME: "WarmupMultiStepLR"
  # Per-module learning-rate multipliers.
  LR_MULTIPLIER:
    backbone: 0.1
    lang_encoder: 0.1
  # Per-module flags; presumably these freeze the named modules. Verify.
  FIX_PARAM:
    backbone: true
    lang_encoder: true
    pixel_decoder: true
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_EMBED: 0.0
  CLIP_GRADIENTS:
    ENABLED: true
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 5.0  # 0.01
    NORM_TYPE: 2.0
  MAX_NUM_EPOCHS: 50