# Web file-viewer residue (not part of the configuration):
# Spaces status: Sleeping; file size: 4,537 bytes; revision edebe10.
# Define Test/Trainer/Saving
PIPELINE: XDecoderPipeline
TRAINER: xdecoder
SAVE_DIR: '../../data/output/test'
base_path: "./"

# Resume logic
RESUME: false
WEIGHT: false
RESUME_FROM: ''
EVAL_AT_START: false

# Logging and debug
# Booleans are written in canonical lowercase so every YAML schema
# (1.1 and 1.2) reads them the same way.
WANDB: false
LOG_EVERY: 100
FIND_UNUSED_PARAMETERS: false

# Speed up training
FP16: false
# Quoted on purpose: the value is loaded as a string, not an integer.
PORT: '36873'
# misc
LOADER:
  JOINT: false
  KEY_DATASET: 'coco'

# NOTE(review): kept at the top level rather than under LOADER; the flattened
# copy this was restored from does not show which was intended -- confirm
# against the code that reads this key.
STANDARD_TEXT_FOR_EVAL: false
##################
# Task settings
##################
VERBOSE: true

# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped. Keys that repeat (NAME, ENABLED, GROUNDING,
# CLASS_WEIGHT, MASK_WEIGHT, DICE_WEIGHT, MAX_LEN) can only coexist as members
# of separate sub-mappings; confirm the grouping against the consumer.
MODEL:
  NAME: seem_model_demo
  HEAD: xdecoder_head
  DIM_PROJ: 512

  TEXT:
    ARCH: vlpencoder
    NAME: transformer
    TOKENIZER: clip
    CONTEXT_LENGTH: 77  # 77
    WIDTH: 512
    HEADS: 8
    LAYERS: 12  # 6
    # Key spelling is kept exactly as found; presumably the consumer looks up
    # this exact name, so do not "correct" it without checking.
    AUTOGRESSIVE: true

  BACKBONE:
    NAME: focal
    PRETRAINED: ''
    LOAD_PRETRAINED: false
    FOCAL:
      PRETRAIN_IMG_SIZE: 224
      PATCH_SIZE: 4
      EMBED_DIM: 192
      DEPTHS: [2, 2, 18, 2]
      FOCAL_LEVELS: [4, 4, 4, 4]
      FOCAL_WINDOWS: [3, 3, 3, 3]
      DROP_PATH_RATE: 0.3
      MLP_RATIO: 4.0
      DROP_RATE: 0.0
      PATCH_NORM: true
      USE_CONV_EMBED: true
      SCALING_MODULATOR: true
      USE_CHECKPOINT: false
      USE_POSTLN: true
      USE_POSTLN_IN_MODULATION: false
      USE_LAYERSCALE: true
      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
      OUT_INDICES: [0, 1, 2, 3]

  ENCODER:
    NAME: transformer_encoder_fpn
    IGNORE_VALUE: 255
    NUM_CLASSES: 16
    BINARY_CLASSES: false
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 512
    MASK_DIM: 512
    NORM: "GN"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6

  DECODER:
    NAME: seem_demo
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"

    # Per-task switches and their task-specific options.
    MASK:
      ENABLED: false
    DETECTION: false
    SPATIAL:
      ENABLED: true
      MAX_ITER: 1
    GROUNDING:
      ENABLED: true
      MAX_LEN: 5
      TEXT_WEIGHT: 2.0
      CLASS_WEIGHT: 0.5
    VISUAL:
      ENABLED: false
    AUDIO:
      ENABLED: false
    RETRIEVAL:
      ENABLED: false
    LVIS:
      ENABLED: true
      THRES: 0.7
    OPENIMAGE:
      ENABLED: false
      NEGATIVE_SAMPLES: 5
      # Second GROUNDING mapping; nested here because DECODER already has one.
      GROUNDING:
        ENABLED: false
        MAX_LEN: 5
    CAPTION:
      ENABLED: false
      PHRASE_PROB: 0.5
      SIM_THRES: 0.95

    # Weights
    DEEP_SUPERVISION: true
    NO_OBJECT_WEIGHT: 0.1
    GCLASS_WEIGHT: 0.4
    GMASK_WEIGHT: 1.0
    GDICE_WEIGHT: 1.0
    SCLASS_WEIGHT: 0.4
    SMASK_WEIGHT: 1.0
    SDICE_WEIGHT: 1.0
    OCLASS_WEIGHT: 0.4
    OMASK_WEIGHT: 1.0
    ODICE_WEIGHT: 1.0
    CLASS_WEIGHT: 2.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BBOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    CAPTION_WEIGHT: 2.0
    COST_SPATIAL:
      CLASS_WEIGHT: 5.0
      MASK_WEIGHT: 2.0
      DICE_WEIGHT: 2.0

    HIDDEN_DIM: 512
    NUM_OBJECT_QUERIES: 101
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    MAX_SPATIAL_LEN: [512, 512, 512, 512]
    # ENC_LAYERS: 0
    PRE_NORM: false
    ENFORCE_INPUT_PROJ: false
    SIZE_DIVISIBILITY: 32
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
    TOP_GROUNDING_LAYERS: 10
    TOP_CAPTION_LAYERS: 10
    TOP_SPATIAL_LAYERS: 10
    TOP_OPENIMAGE_LAYERS: 10

    TEST:
      SEMANTIC_ON: true
      INSTANCE_ON: true
      PANOPTIC_ON: true
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.4
      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
      DETECTIONS_PER_IMAGE: 100
# Multi-modal Architecture, order matters
# NOTE(review): nesting reconstructed from a copy with stripped indentation;
# `queries` / `tokens` repeat once per section, so each pair is grouped under
# the section header that precedes it. Confirm against the consumer.
ATTENTION_ARCH:
  VARIABLE:
    queries: ['object']
    tokens: ['grounding', 'spatial', 'visual', 'audio']

  # For each variable, the list of variables named on the right-hand side.
  SELF_ATTENTION:
    queries:
      object: ['queries_object', 'tokens_grounding', 'tokens_spatial', 'tokens_visual', 'tokens_audio']
    tokens:
      grounding: ['queries_object', 'tokens_grounding']
      spatial: ['tokens_spatial']
      visual: ['tokens_visual']
      audio: ['queries_object', 'tokens_audio']

  # Per-variable on/off switch.
  CROSS_ATTENTION:
    queries:
      object: true
    tokens:
      grounding: false
      spatial: false
      visual: false
      audio: false

  MASKING: ['tokens_spatial', 'tokens_grounding', 'tokens_visual', 'tokens_audio']

  DUPLICATION:
    queries:
      grounding: 'queries_object'
      spatial: 'queries_object'

  SPATIAL_MEMORIES: 32
# Per-channel pixel statistics (three values each).
INPUT:
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]

# Alternative INPUT values, left commented out:
# INPUT:
#   PIXEL_MEAN: [64.284, 59.293, 59.962]
#   PIXEL_STD: [62.484, 60.865, 59.835]