apoorvkh committed on
Commit
dc861cc
1 Parent(s): 5f14eda

Adding Viper models

Browse files
viper/glip/checkpoints/glip_large_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcb0178264b3193901a11560aa7f31b05821a5bf15225eeb0eeebbedaaa27791
3
+ size 6896153761
viper/glip/configs/glip_Swin_L.yaml ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_large_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ CONV_BODY: "SWINT-FPN-RETINANET"
9
+ OUT_CHANNELS: 256
10
+
11
+ SWINT:
12
+ EMBED_DIM: 192
13
+ DEPTHS: (2, 2, 18, 2)
14
+ NUM_HEADS: (6, 12, 24, 48)
15
+ WINDOW_SIZE: 12
16
+ OUT_CHANNELS: (192, 384, 768, 1536)
17
+ DROP_PATH_RATE: 0.4
18
+
19
+ LANGUAGE_BACKBONE:
20
+ FREEZE: False
21
+ MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
22
+ MASK_SPECIAL: False
23
+
24
+ RPN:
25
+ USE_FPN: True
26
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
27
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
28
+ ASPECT_RATIOS: (1.0,)
29
+ SCALES_PER_OCTAVE: 1
30
+
31
+ DYHEAD:
32
+ CHANNELS: 256
33
+ NUM_CONVS: 8
34
+ USE_GN: True
35
+ USE_DYRELU: True
36
+ USE_DFCONV: True
37
+ USE_DYFUSE: True
38
+ TOPK: 9 # topk for selecting candidate positive samples from each level
39
+ SCORE_AGG: "MEAN"
40
+ LOG_SCALE: 0.0
41
+
42
+ USE_CHECKPOINT: True
43
+ FUSE_CONFIG:
44
+ USE_FUSED_FEATURES_DOT_PRODUCT: True
45
+ EARLY_FUSE_ON: True
46
+ TYPE: "MHA-B"
47
+ USE_CLASSIFICATION_LOSS: False
48
+ USE_TOKEN_LOSS: False
49
+ USE_CONTRASTIVE_ALIGN_LOSS: False
50
+ CONTRASTIVE_HIDDEN_DIM: 64
51
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
52
+ USE_LAYER_SCALE: True
53
+ CLAMP_MIN_FOR_UNDERFLOW: True
54
+ CLAMP_MAX_FOR_OVERFLOW: True
55
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
56
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
57
+ CLAMP_DOT_PRODUCT: True
58
+
59
+ DATASETS:
60
+
61
+ TRAIN: ("mixed_train_no_coco",) # Placeholder dataset for now; to be updated in the next version
62
+ TEST: ("coco_2017_val", )
63
+
64
+ ONE_HOT: False
65
+ FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
66
+ MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
67
+ OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
68
+ VG_COPY: 3 # 0.4 * 3 = ~1.2M
69
+ IN_COPY: 2 # 0.67 * 2 = ~1.33M
70
+ OI_COPY: 1 # 2M * 1 = 2M
71
+
72
+ DISABLE_SHUFFLE: False
73
+ ADD_DET_PROMPT: False
74
+ RANDOM_SAMPLE_NEG: 85
75
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
76
+ FURTHER_SCREEN: True
77
+ CAPTION_CONF: 0.5
78
+ CAPTION_NMS: -1.0
79
+ CAPTION_MIN_BOX: 1
80
+
81
+ SEPARATION_TOKENS: ". "
82
+
83
+ PACK_RANDOM_CAPTION_NUMBER: 20
84
+ NO_RANDOM_PACK_PROBABILITY: 0.4
85
+ RANDOM_PACK_PROB: 0.5
86
+ CAPTION_FORMAT_VERSION: "v2"
87
+
88
+ INPUT:
89
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
90
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
91
+ MIN_SIZE_TRAIN: 800
92
+ MAX_SIZE_TRAIN: 1333
93
+ MIN_SIZE_TEST: 800
94
+ MAX_SIZE_TEST: 1333
95
+
96
+ AUGMENT:
97
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
98
+
99
+ DATALOADER:
100
+ SIZE_DIVISIBILITY: 32
101
+
102
+ SOLVER:
103
+ OPTIMIZER: ADAMW
104
+ BASE_LR: 0.0001
105
+ LANG_LR: 0.00001
106
+ WEIGHT_DECAY: 0.01
107
+ WEIGHT_DECAY_SCHEDULE: True
108
+ STEPS: (0.67, 0.89)
109
+ MAX_ITER: 1000000
110
+ IMS_PER_BATCH: 64
111
+ WARMUP_ITERS: 2000
112
+ WARMUP_FACTOR: 0.001
113
+
114
+ FIND_UNUSED_PARAMETERS: False
115
+
116
+ CLIP_GRADIENTS:
117
+ ENABLED: True
118
+ CLIP_TYPE: "full_model"
119
+ CLIP_VALUE: 1.0
120
+ NORM_TYPE: 2.0
viper/xvlm/retrieval_mscoco_checkpoint_9.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa979bde985e2b91ce58e33a385ead52cc8edc249cc68e4f83e7bf3878effd1d
3
+ size 869714238