XiangpengYang committed
Commit 5602c9a · 0 Parent(s)

first commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +3 -0
  3. LICENSE.md +21 -0
  4. __pycache__/ptp_utils_null_text_inversion.cpython-310.pyc +0 -0
  5. __pycache__/ptp_utils_null_text_inversion.cpython-38.pyc +0 -0
  6. __pycache__/utils.cpython-310.pyc +0 -0
  7. __pycache__/xformers.cpython-310.pyc +0 -0
  8. annotator/__pycache__/util.cpython-310.pyc +0 -0
  9. annotator/__pycache__/util.cpython-39.pyc +0 -0
  10. annotator/canny/__init__.py +6 -0
  11. annotator/canny/__pycache__/__init__.cpython-39.pyc +0 -0
  12. annotator/dwpose/__init__.py +71 -0
  13. annotator/dwpose/__pycache__/__init__.cpython-310.pyc +0 -0
  14. annotator/dwpose/__pycache__/__init__.cpython-38.pyc +0 -0
  15. annotator/dwpose/__pycache__/onnxdet.cpython-310.pyc +0 -0
  16. annotator/dwpose/__pycache__/onnxdet.cpython-38.pyc +0 -0
  17. annotator/dwpose/__pycache__/onnxpose.cpython-310.pyc +0 -0
  18. annotator/dwpose/__pycache__/onnxpose.cpython-38.pyc +0 -0
  19. annotator/dwpose/__pycache__/util.cpython-310.pyc +0 -0
  20. annotator/dwpose/__pycache__/util.cpython-38.pyc +0 -0
  21. annotator/dwpose/__pycache__/wholebody.cpython-310.pyc +0 -0
  22. annotator/dwpose/__pycache__/wholebody.cpython-38.pyc +0 -0
  23. annotator/dwpose/dwpose_config/dwpose-l_384x288.py +257 -0
  24. annotator/dwpose/onnxdet.py +125 -0
  25. annotator/dwpose/onnxpose.py +360 -0
  26. annotator/dwpose/util.py +297 -0
  27. annotator/dwpose/wholebody.py +142 -0
  28. annotator/dwpose/yolox_config/yolox_l_8xb8-300e_coco.py +245 -0
  29. annotator/hed/__init__.py +132 -0
  30. annotator/hed/__pycache__/__init__.cpython-39.pyc +0 -0
  31. annotator/midas/__init__.py +38 -0
  32. annotator/midas/__pycache__/__init__.cpython-310.pyc +0 -0
  33. annotator/midas/__pycache__/api.cpython-310.pyc +0 -0
  34. annotator/midas/api.py +169 -0
  35. annotator/midas/midas/__init__.py +0 -0
  36. annotator/midas/midas/__pycache__/__init__.cpython-310.pyc +0 -0
  37. annotator/midas/midas/__pycache__/base_model.cpython-310.pyc +0 -0
  38. annotator/midas/midas/__pycache__/blocks.cpython-310.pyc +0 -0
  39. annotator/midas/midas/__pycache__/dpt_depth.cpython-310.pyc +0 -0
  40. annotator/midas/midas/__pycache__/midas_net.cpython-310.pyc +0 -0
  41. annotator/midas/midas/__pycache__/midas_net_custom.cpython-310.pyc +0 -0
  42. annotator/midas/midas/__pycache__/transforms.cpython-310.pyc +0 -0
  43. annotator/midas/midas/__pycache__/vit.cpython-310.pyc +0 -0
  44. annotator/midas/midas/base_model.py +16 -0
  45. annotator/midas/midas/blocks.py +342 -0
  46. annotator/midas/midas/dpt_depth.py +109 -0
  47. annotator/midas/midas/midas_net.py +76 -0
  48. annotator/midas/midas/midas_net_custom.py +128 -0
  49. annotator/midas/midas/transforms.py +234 -0
  50. annotator/midas/midas/vit.py +491 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
+ annotator/ckpts/**
+ result/**
+ trash/**
LICENSE.md ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 ST-Modulator authors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
__pycache__/ptp_utils_null_text_inversion.cpython-310.pyc ADDED
Binary file (10 kB).
__pycache__/ptp_utils_null_text_inversion.cpython-38.pyc ADDED
Binary file (9.33 kB).
__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.01 kB).
__pycache__/xformers.cpython-310.pyc ADDED
Binary file (359 Bytes).
annotator/__pycache__/util.cpython-310.pyc ADDED
Binary file (2.07 kB).
annotator/__pycache__/util.cpython-39.pyc ADDED
Binary file (1.89 kB).
annotator/canny/__init__.py ADDED
@@ -0,0 +1,6 @@
+ import cv2
+
+
+ class CannyDetector:
+     def __call__(self, img, low_threshold, high_threshold):
+         return cv2.Canny(img, low_threshold, high_threshold)
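For reference, a minimal usage sketch of the CannyDetector added above (illustrative only, not part of the commit; the file name and threshold values are made up):

    import cv2
    from annotator.canny import CannyDetector

    apply_canny = CannyDetector()
    img = cv2.imread("example.png")  # hypothetical H x W x 3 BGR frame
    edge_map = apply_canny(img, low_threshold=100, high_threshold=200)  # single-channel edge map
    cv2.imwrite("example_canny.png", edge_map)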
annotator/canny/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (504 Bytes).
annotator/dwpose/__init__.py ADDED
@@ -0,0 +1,71 @@
+ # Openpose
+ # Original from CMU https://github.com/CMU-Perceptual-Computing-Lab/openpose
+ # 2nd Edited by https://github.com/Hzzone/pytorch-openpose
+ # 3rd Edited by ControlNet
+ # 4th Edited by ControlNet (added face and correct hands)
+
+ import os
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+ import torch
+ import numpy as np
+ from . import util
+ from .wholebody import Wholebody
+
+ def draw_pose(pose, H, W, draw_body=True, draw_hand=True, draw_face=True):
+     bodies = pose['bodies']
+     faces = pose['faces']
+     hands = pose['hands']
+     candidate = bodies['candidate']
+     subset = bodies['subset']
+     canvas = np.zeros(shape=(H, W, 3), dtype=np.uint8)
+
+     if draw_body:
+         canvas = util.draw_bodypose(canvas, candidate, subset)
+
+     if draw_hand:
+         canvas = util.draw_handpose(canvas, hands)
+
+     if draw_face:
+         canvas = util.draw_facepose(canvas, faces)
+
+     return canvas
+
+
+ class DWposeDetector:
+     def __init__(self):
+
+         self.pose_estimation = Wholebody()
+
+     def __call__(self, oriImg, hand=False, face=False):
+         oriImg = oriImg.copy()
+         H, W, C = oriImg.shape
+         with torch.no_grad():
+             candidate, subset = self.pose_estimation(oriImg)
+             nums, keys, locs = candidate.shape
+             candidate[..., 0] /= float(W)
+             candidate[..., 1] /= float(H)
+             body = candidate[:, :18].copy()
+             body = body.reshape(nums * 18, locs)
+             score = subset[:, :18]
+             for i in range(len(score)):
+                 for j in range(len(score[i])):
+                     if score[i][j] > 0.3:
+                         score[i][j] = int(18 * i + j)
+                     else:
+                         score[i][j] = -1
+
+             un_visible = subset < 0.3
+             candidate[un_visible] = -1
+
+             foot = candidate[:, 18:24]
+
+             faces = candidate[:, 24:92]
+
+             hands = candidate[:, 92:113]
+             hands = np.vstack([hands, candidate[:, 113:]])
+
+             bodies = dict(candidate=body, subset=score)
+             pose = dict(bodies=bodies, hands=hands, faces=faces)
+
+             return draw_pose(pose, H, W, draw_body=True, draw_hand=hand, draw_face=face)
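A minimal usage sketch for the DWposeDetector defined above (illustrative, not part of the commit; it assumes the ONNX checkpoints referenced in annotator/dwpose/wholebody.py have already been placed under annotator/ckpts/):

    import cv2
    from annotator.dwpose import DWposeDetector

    detector = DWposeDetector()  # builds the Wholebody() ONNX pipeline (YOLOX detector + DWPose estimator)
    frame = cv2.imread("frame.png")  # hypothetical H x W x 3 uint8 image
    pose_map = detector(frame, hand=True, face=False)  # rendered pose canvas with the same H x W
    cv2.imwrite("frame_pose.png", pose_map)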
annotator/dwpose/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2.01 kB).
annotator/dwpose/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (1.91 kB).
annotator/dwpose/__pycache__/onnxdet.cpython-310.pyc ADDED
Binary file (4.18 kB).
annotator/dwpose/__pycache__/onnxdet.cpython-38.pyc ADDED
Binary file (4.21 kB).
annotator/dwpose/__pycache__/onnxpose.cpython-310.pyc ADDED
Binary file (10.3 kB).
annotator/dwpose/__pycache__/onnxpose.cpython-38.pyc ADDED
Binary file (10.3 kB).
annotator/dwpose/__pycache__/util.cpython-310.pyc ADDED
Binary file (7.73 kB).
annotator/dwpose/__pycache__/util.cpython-38.pyc ADDED
Binary file (7.57 kB).
annotator/dwpose/__pycache__/wholebody.cpython-310.pyc ADDED
Binary file (1.74 kB).
annotator/dwpose/__pycache__/wholebody.cpython-38.pyc ADDED
Binary file (2.77 kB).
annotator/dwpose/dwpose_config/dwpose-l_384x288.py ADDED
@@ -0,0 +1,257 @@
1
+ # runtime
2
+ max_epochs = 270
3
+ stage2_num_epochs = 30
4
+ base_lr = 4e-3
5
+
6
+ train_cfg = dict(max_epochs=max_epochs, val_interval=10)
7
+ randomness = dict(seed=21)
8
+
9
+ # optimizer
10
+ optim_wrapper = dict(
11
+ type='OptimWrapper',
12
+ optimizer=dict(type='AdamW', lr=base_lr, weight_decay=0.05),
13
+ paramwise_cfg=dict(
14
+ norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True))
15
+
16
+ # learning rate
17
+ param_scheduler = [
18
+ dict(
19
+ type='LinearLR',
20
+ start_factor=1.0e-5,
21
+ by_epoch=False,
22
+ begin=0,
23
+ end=1000),
24
+ dict(
25
+ # use cosine lr from 150 to 300 epoch
26
+ type='CosineAnnealingLR',
27
+ eta_min=base_lr * 0.05,
28
+ begin=max_epochs // 2,
29
+ end=max_epochs,
30
+ T_max=max_epochs // 2,
31
+ by_epoch=True,
32
+ convert_to_iter_based=True),
33
+ ]
34
+
35
+ # automatically scaling LR based on the actual training batch size
36
+ auto_scale_lr = dict(base_batch_size=512)
37
+
38
+ # codec settings
39
+ codec = dict(
40
+ type='SimCCLabel',
41
+ input_size=(288, 384),
42
+ sigma=(6., 6.93),
43
+ simcc_split_ratio=2.0,
44
+ normalize=False,
45
+ use_dark=False)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='TopdownPoseEstimator',
50
+ data_preprocessor=dict(
51
+ type='PoseDataPreprocessor',
52
+ mean=[123.675, 116.28, 103.53],
53
+ std=[58.395, 57.12, 57.375],
54
+ bgr_to_rgb=True),
55
+ backbone=dict(
56
+ _scope_='mmdet',
57
+ type='CSPNeXt',
58
+ arch='P5',
59
+ expand_ratio=0.5,
60
+ deepen_factor=1.,
61
+ widen_factor=1.,
62
+ out_indices=(4, ),
63
+ channel_attention=True,
64
+ norm_cfg=dict(type='SyncBN'),
65
+ act_cfg=dict(type='SiLU'),
66
+ init_cfg=dict(
67
+ type='Pretrained',
68
+ prefix='backbone.',
69
+ checkpoint='https://download.openmmlab.com/mmpose/v1/projects/'
70
+ 'rtmpose/cspnext-l_udp-aic-coco_210e-256x192-273b7631_20230130.pth' # noqa
71
+ )),
72
+ head=dict(
73
+ type='RTMCCHead',
74
+ in_channels=1024,
75
+ out_channels=133,
76
+ input_size=codec['input_size'],
77
+ in_featuremap_size=(9, 12),
78
+ simcc_split_ratio=codec['simcc_split_ratio'],
79
+ final_layer_kernel_size=7,
80
+ gau_cfg=dict(
81
+ hidden_dims=256,
82
+ s=128,
83
+ expansion_factor=2,
84
+ dropout_rate=0.,
85
+ drop_path=0.,
86
+ act_fn='SiLU',
87
+ use_rel_bias=False,
88
+ pos_enc=False),
89
+ loss=dict(
90
+ type='KLDiscretLoss',
91
+ use_target_weight=True,
92
+ beta=10.,
93
+ label_softmax=True),
94
+ decoder=codec),
95
+ test_cfg=dict(flip_test=True, ))
96
+
97
+ # base dataset settings
98
+ dataset_type = 'CocoWholeBodyDataset'
99
+ data_mode = 'topdown'
100
+ data_root = '/data/'
101
+
102
+ backend_args = dict(backend='local')
103
+ # backend_args = dict(
104
+ # backend='petrel',
105
+ # path_mapping=dict({
106
+ # f'{data_root}': 's3://openmmlab/datasets/detection/coco/',
107
+ # f'{data_root}': 's3://openmmlab/datasets/detection/coco/'
108
+ # }))
109
+
110
+ # pipelines
111
+ train_pipeline = [
112
+ dict(type='LoadImage', backend_args=backend_args),
113
+ dict(type='GetBBoxCenterScale'),
114
+ dict(type='RandomFlip', direction='horizontal'),
115
+ dict(type='RandomHalfBody'),
116
+ dict(
117
+ type='RandomBBoxTransform', scale_factor=[0.6, 1.4], rotate_factor=80),
118
+ dict(type='TopdownAffine', input_size=codec['input_size']),
119
+ dict(type='mmdet.YOLOXHSVRandomAug'),
120
+ dict(
121
+ type='Albumentation',
122
+ transforms=[
123
+ dict(type='Blur', p=0.1),
124
+ dict(type='MedianBlur', p=0.1),
125
+ dict(
126
+ type='CoarseDropout',
127
+ max_holes=1,
128
+ max_height=0.4,
129
+ max_width=0.4,
130
+ min_holes=1,
131
+ min_height=0.2,
132
+ min_width=0.2,
133
+ p=1.0),
134
+ ]),
135
+ dict(type='GenerateTarget', encoder=codec),
136
+ dict(type='PackPoseInputs')
137
+ ]
138
+ val_pipeline = [
139
+ dict(type='LoadImage', backend_args=backend_args),
140
+ dict(type='GetBBoxCenterScale'),
141
+ dict(type='TopdownAffine', input_size=codec['input_size']),
142
+ dict(type='PackPoseInputs')
143
+ ]
144
+
145
+ train_pipeline_stage2 = [
146
+ dict(type='LoadImage', backend_args=backend_args),
147
+ dict(type='GetBBoxCenterScale'),
148
+ dict(type='RandomFlip', direction='horizontal'),
149
+ dict(type='RandomHalfBody'),
150
+ dict(
151
+ type='RandomBBoxTransform',
152
+ shift_factor=0.,
153
+ scale_factor=[0.75, 1.25],
154
+ rotate_factor=60),
155
+ dict(type='TopdownAffine', input_size=codec['input_size']),
156
+ dict(type='mmdet.YOLOXHSVRandomAug'),
157
+ dict(
158
+ type='Albumentation',
159
+ transforms=[
160
+ dict(type='Blur', p=0.1),
161
+ dict(type='MedianBlur', p=0.1),
162
+ dict(
163
+ type='CoarseDropout',
164
+ max_holes=1,
165
+ max_height=0.4,
166
+ max_width=0.4,
167
+ min_holes=1,
168
+ min_height=0.2,
169
+ min_width=0.2,
170
+ p=0.5),
171
+ ]),
172
+ dict(type='GenerateTarget', encoder=codec),
173
+ dict(type='PackPoseInputs')
174
+ ]
175
+
176
+ datasets = []
177
+ dataset_coco=dict(
178
+ type=dataset_type,
179
+ data_root=data_root,
180
+ data_mode=data_mode,
181
+ ann_file='coco/annotations/coco_wholebody_train_v1.0.json',
182
+ data_prefix=dict(img='coco/train2017/'),
183
+ pipeline=[],
184
+ )
185
+ datasets.append(dataset_coco)
186
+
187
+ scene = ['Magic_show', 'Entertainment', 'ConductMusic', 'Online_class',
188
+ 'TalkShow', 'Speech', 'Fitness', 'Interview', 'Olympic', 'TVShow',
189
+ 'Singing', 'SignLanguage', 'Movie', 'LiveVlog', 'VideoConference']
190
+
191
+ for i in range(len(scene)):
192
+ datasets.append(
193
+ dict(
194
+ type=dataset_type,
195
+ data_root=data_root,
196
+ data_mode=data_mode,
197
+ ann_file='UBody/annotations/'+scene[i]+'/keypoint_annotation.json',
198
+ data_prefix=dict(img='UBody/images/'+scene[i]+'/'),
199
+ pipeline=[],
200
+ )
201
+ )
202
+
203
+ # data loaders
204
+ train_dataloader = dict(
205
+ batch_size=32,
206
+ num_workers=10,
207
+ persistent_workers=True,
208
+ sampler=dict(type='DefaultSampler', shuffle=True),
209
+ dataset=dict(
210
+ type='CombinedDataset',
211
+ metainfo=dict(from_file='configs/_base_/datasets/coco_wholebody.py'),
212
+ datasets=datasets,
213
+ pipeline=train_pipeline,
214
+ test_mode=False,
215
+ ))
216
+ val_dataloader = dict(
217
+ batch_size=32,
218
+ num_workers=10,
219
+ persistent_workers=True,
220
+ drop_last=False,
221
+ sampler=dict(type='DefaultSampler', shuffle=False, round_up=False),
222
+ dataset=dict(
223
+ type=dataset_type,
224
+ data_root=data_root,
225
+ data_mode=data_mode,
226
+ ann_file='coco/annotations/coco_wholebody_val_v1.0.json',
227
+ bbox_file=f'{data_root}coco/person_detection_results/'
228
+ 'COCO_val2017_detections_AP_H_56_person.json',
229
+ data_prefix=dict(img='coco/val2017/'),
230
+ test_mode=True,
231
+ pipeline=val_pipeline,
232
+ ))
233
+ test_dataloader = val_dataloader
234
+
235
+ # hooks
236
+ default_hooks = dict(
237
+ checkpoint=dict(
238
+ save_best='coco-wholebody/AP', rule='greater', max_keep_ckpts=1))
239
+
240
+ custom_hooks = [
241
+ dict(
242
+ type='EMAHook',
243
+ ema_type='ExpMomentumEMA',
244
+ momentum=0.0002,
245
+ update_buffers=True,
246
+ priority=49),
247
+ dict(
248
+ type='mmdet.PipelineSwitchHook',
249
+ switch_epoch=max_epochs - stage2_num_epochs,
250
+ switch_pipeline=train_pipeline_stage2)
251
+ ]
252
+
253
+ # evaluators
254
+ val_evaluator = dict(
255
+ type='CocoWholeBodyMetric',
256
+ ann_file=data_root + 'coco/annotations/coco_wholebody_val_v1.0.json')
257
+ test_evaluator = val_evaluator
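This config is the pose-estimator config referenced by the commented-out mmpose code path in annotator/dwpose/wholebody.py. A sketch of how such a config file can be loaded and inspected with mmengine (assumes mmengine is installed; not part of the commit):

    from mmengine.config import Config

    cfg = Config.fromfile("annotator/dwpose/dwpose_config/dwpose-l_384x288.py")
    print(cfg.codec["input_size"])            # (288, 384): model input size (w, h)
    print(cfg.model["head"]["out_channels"])  # 133 whole-body keypoints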
annotator/dwpose/onnxdet.py ADDED
@@ -0,0 +1,125 @@
+ import cv2
+ import numpy as np
+
+ import onnxruntime
+
+ def nms(boxes, scores, nms_thr):
+     """Single class NMS implemented in Numpy."""
+     x1 = boxes[:, 0]
+     y1 = boxes[:, 1]
+     x2 = boxes[:, 2]
+     y2 = boxes[:, 3]
+
+     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+     order = scores.argsort()[::-1]
+
+     keep = []
+     while order.size > 0:
+         i = order[0]
+         keep.append(i)
+         xx1 = np.maximum(x1[i], x1[order[1:]])
+         yy1 = np.maximum(y1[i], y1[order[1:]])
+         xx2 = np.minimum(x2[i], x2[order[1:]])
+         yy2 = np.minimum(y2[i], y2[order[1:]])
+
+         w = np.maximum(0.0, xx2 - xx1 + 1)
+         h = np.maximum(0.0, yy2 - yy1 + 1)
+         inter = w * h
+         ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+         inds = np.where(ovr <= nms_thr)[0]
+         order = order[inds + 1]
+
+     return keep
+
+ def multiclass_nms(boxes, scores, nms_thr, score_thr):
+     """Multiclass NMS implemented in Numpy. Class-aware version."""
+     final_dets = []
+     num_classes = scores.shape[1]
+     for cls_ind in range(num_classes):
+         cls_scores = scores[:, cls_ind]
+         valid_score_mask = cls_scores > score_thr
+         if valid_score_mask.sum() == 0:
+             continue
+         else:
+             valid_scores = cls_scores[valid_score_mask]
+             valid_boxes = boxes[valid_score_mask]
+             keep = nms(valid_boxes, valid_scores, nms_thr)
+             if len(keep) > 0:
+                 cls_inds = np.ones((len(keep), 1)) * cls_ind
+                 dets = np.concatenate(
+                     [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
+                 )
+                 final_dets.append(dets)
+     if len(final_dets) == 0:
+         return None
+     return np.concatenate(final_dets, 0)
+
+ def demo_postprocess(outputs, img_size, p6=False):
+     grids = []
+     expanded_strides = []
+     strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
+
+     hsizes = [img_size[0] // stride for stride in strides]
+     wsizes = [img_size[1] // stride for stride in strides]
+
+     for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+         xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+         grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+         grids.append(grid)
+         shape = grid.shape[:2]
+         expanded_strides.append(np.full((*shape, 1), stride))
+
+     grids = np.concatenate(grids, 1)
+     expanded_strides = np.concatenate(expanded_strides, 1)
+     outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
+     outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
+
+     return outputs
+
+ def preprocess(img, input_size, swap=(2, 0, 1)):
+     if len(img.shape) == 3:
+         padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+     else:
+         padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+     r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+     resized_img = cv2.resize(
+         img,
+         (int(img.shape[1] * r), int(img.shape[0] * r)),
+         interpolation=cv2.INTER_LINEAR,
+     ).astype(np.uint8)
+     padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+     padded_img = padded_img.transpose(swap)
+     padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+     return padded_img, r
+
+ def inference_detector(session, oriImg):
+     input_shape = (640, 640)
+     img, ratio = preprocess(oriImg, input_shape)
+
+     ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
+     output = session.run(None, ort_inputs)
+     predictions = demo_postprocess(output[0], input_shape)[0]
+
+     boxes = predictions[:, :4]
+     scores = predictions[:, 4:5] * predictions[:, 5:]
+
+     boxes_xyxy = np.ones_like(boxes)
+     boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.
+     boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.
+     boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.
+     boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.
+     boxes_xyxy /= ratio
+     dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
+     if dets is not None:
+         final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
+         isscore = final_scores > 0.3
+         iscat = final_cls_inds == 0
+         isbbox = [i and j for (i, j) in zip(isscore, iscat)]
+         final_boxes = final_boxes[isbbox]
+     else:
+         final_boxes = np.array([])
+
+     return final_boxes
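A sketch of how inference_detector is driven (this mirrors what Wholebody.__init__ sets up later in this commit; the checkpoint path is the one hard-coded in annotator/dwpose/wholebody.py and must exist locally):

    import cv2
    import onnxruntime as ort
    from annotator.dwpose.onnxdet import inference_detector

    session = ort.InferenceSession("annotator/ckpts/yolox_l.onnx",
                                   providers=["CPUExecutionProvider"])
    img = cv2.imread("frame.png")  # hypothetical BGR image
    person_boxes = inference_detector(session, img)  # (N, 4) xyxy boxes kept for class 0 ("person") with score > 0.3
    print(person_boxes.shape)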
annotator/dwpose/onnxpose.py ADDED
@@ -0,0 +1,360 @@
1
+ from typing import List, Tuple
2
+
3
+ import cv2
4
+ import numpy as np
5
+ import onnxruntime as ort
6
+
7
+ def preprocess(
8
+ img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
9
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
10
+ """Do preprocessing for RTMPose model inference.
11
+
12
+ Args:
13
+ img (np.ndarray): Input image in shape.
14
+ input_size (tuple): Input image size in shape (w, h).
15
+
16
+ Returns:
17
+ tuple:
18
+ - resized_img (np.ndarray): Preprocessed image.
19
+ - center (np.ndarray): Center of image.
20
+ - scale (np.ndarray): Scale of image.
21
+ """
22
+ # get shape of image
23
+ img_shape = img.shape[:2]
24
+ out_img, out_center, out_scale = [], [], []
25
+ if len(out_bbox) == 0:
26
+ out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
27
+ for i in range(len(out_bbox)):
28
+ x0 = out_bbox[i][0]
29
+ y0 = out_bbox[i][1]
30
+ x1 = out_bbox[i][2]
31
+ y1 = out_bbox[i][3]
32
+ bbox = np.array([x0, y0, x1, y1])
33
+
34
+ # get center and scale
35
+ center, scale = bbox_xyxy2cs(bbox, padding=1.25)
36
+
37
+ # do affine transformation
38
+ resized_img, scale = top_down_affine(input_size, scale, center, img)
39
+
40
+ # normalize image
41
+ mean = np.array([123.675, 116.28, 103.53])
42
+ std = np.array([58.395, 57.12, 57.375])
43
+ resized_img = (resized_img - mean) / std
44
+
45
+ out_img.append(resized_img)
46
+ out_center.append(center)
47
+ out_scale.append(scale)
48
+
49
+ return out_img, out_center, out_scale
50
+
51
+
52
+ def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
53
+ """Inference RTMPose model.
54
+
55
+ Args:
56
+ sess (ort.InferenceSession): ONNXRuntime session.
57
+ img (np.ndarray): Input image in shape.
58
+
59
+ Returns:
60
+ outputs (np.ndarray): Output of RTMPose model.
61
+ """
62
+ all_out = []
63
+ # build input
64
+ for i in range(len(img)):
65
+ input = [img[i].transpose(2, 0, 1)]
66
+
67
+ # build output
68
+ sess_input = {sess.get_inputs()[0].name: input}
69
+ sess_output = []
70
+ for out in sess.get_outputs():
71
+ sess_output.append(out.name)
72
+
73
+ # run model
74
+ outputs = sess.run(sess_output, sess_input)
75
+ all_out.append(outputs)
76
+
77
+ return all_out
78
+
79
+
80
+ def postprocess(outputs: List[np.ndarray],
81
+ model_input_size: Tuple[int, int],
82
+ center: Tuple[int, int],
83
+ scale: Tuple[int, int],
84
+ simcc_split_ratio: float = 2.0
85
+ ) -> Tuple[np.ndarray, np.ndarray]:
86
+ """Postprocess for RTMPose model output.
87
+
88
+ Args:
89
+ outputs (np.ndarray): Output of RTMPose model.
90
+ model_input_size (tuple): RTMPose model Input image size.
91
+ center (tuple): Center of bbox in shape (x, y).
92
+ scale (tuple): Scale of bbox in shape (w, h).
93
+ simcc_split_ratio (float): Split ratio of simcc.
94
+
95
+ Returns:
96
+ tuple:
97
+ - keypoints (np.ndarray): Rescaled keypoints.
98
+ - scores (np.ndarray): Model predict scores.
99
+ """
100
+ all_key = []
101
+ all_score = []
102
+ for i in range(len(outputs)):
103
+ # use simcc to decode
104
+ simcc_x, simcc_y = outputs[i]
105
+ keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)
106
+
107
+ # rescale keypoints
108
+ keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
109
+ all_key.append(keypoints[0])
110
+ all_score.append(scores[0])
111
+
112
+ return np.array(all_key), np.array(all_score)
113
+
114
+
115
+ def bbox_xyxy2cs(bbox: np.ndarray,
116
+ padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
117
+ """Transform the bbox format from (x,y,w,h) into (center, scale)
118
+
119
+ Args:
120
+ bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
121
+ as (left, top, right, bottom)
122
+ padding (float): BBox padding factor that will be multilied to scale.
123
+ Default: 1.0
124
+
125
+ Returns:
126
+ tuple: A tuple containing center and scale.
127
+ - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
128
+ (n, 2)
129
+ - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
130
+ (n, 2)
131
+ """
132
+ # convert single bbox from (4, ) to (1, 4)
133
+ dim = bbox.ndim
134
+ if dim == 1:
135
+ bbox = bbox[None, :]
136
+
137
+ # get bbox center and scale
138
+ x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
139
+ center = np.hstack([x1 + x2, y1 + y2]) * 0.5
140
+ scale = np.hstack([x2 - x1, y2 - y1]) * padding
141
+
142
+ if dim == 1:
143
+ center = center[0]
144
+ scale = scale[0]
145
+
146
+ return center, scale
147
+
148
+
149
+ def _fix_aspect_ratio(bbox_scale: np.ndarray,
150
+ aspect_ratio: float) -> np.ndarray:
151
+ """Extend the scale to match the given aspect ratio.
152
+
153
+ Args:
154
+ scale (np.ndarray): The image scale (w, h) in shape (2, )
155
+ aspect_ratio (float): The ratio of ``w/h``
156
+
157
+ Returns:
158
+ np.ndarray: The reshaped image scale in (2, )
159
+ """
160
+ w, h = np.hsplit(bbox_scale, [1])
161
+ bbox_scale = np.where(w > h * aspect_ratio,
162
+ np.hstack([w, w / aspect_ratio]),
163
+ np.hstack([h * aspect_ratio, h]))
164
+ return bbox_scale
165
+
166
+
167
+ def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
168
+ """Rotate a point by an angle.
169
+
170
+ Args:
171
+ pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
172
+ angle_rad (float): rotation angle in radian
173
+
174
+ Returns:
175
+ np.ndarray: Rotated point in shape (2, )
176
+ """
177
+ sn, cs = np.sin(angle_rad), np.cos(angle_rad)
178
+ rot_mat = np.array([[cs, -sn], [sn, cs]])
179
+ return rot_mat @ pt
180
+
181
+
182
+ def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
183
+ """To calculate the affine matrix, three pairs of points are required. This
184
+ function is used to get the 3rd point, given 2D points a & b.
185
+
186
+ The 3rd point is defined by rotating vector `a - b` by 90 degrees
187
+ anticlockwise, using b as the rotation center.
188
+
189
+ Args:
190
+ a (np.ndarray): The 1st point (x,y) in shape (2, )
191
+ b (np.ndarray): The 2nd point (x,y) in shape (2, )
192
+
193
+ Returns:
194
+ np.ndarray: The 3rd point.
195
+ """
196
+ direction = a - b
197
+ c = b + np.r_[-direction[1], direction[0]]
198
+ return c
199
+
200
+
201
+ def get_warp_matrix(center: np.ndarray,
202
+ scale: np.ndarray,
203
+ rot: float,
204
+ output_size: Tuple[int, int],
205
+ shift: Tuple[float, float] = (0., 0.),
206
+ inv: bool = False) -> np.ndarray:
207
+ """Calculate the affine transformation matrix that can warp the bbox area
208
+ in the input image to the output size.
209
+
210
+ Args:
211
+ center (np.ndarray[2, ]): Center of the bounding box (x, y).
212
+ scale (np.ndarray[2, ]): Scale of the bounding box
213
+ wrt [width, height].
214
+ rot (float): Rotation angle (degree).
215
+ output_size (np.ndarray[2, ] | list(2,)): Size of the
216
+ destination heatmaps.
217
+ shift (0-100%): Shift translation ratio wrt the width/height.
218
+ Default (0., 0.).
219
+ inv (bool): Option to inverse the affine transform direction.
220
+ (inv=False: src->dst or inv=True: dst->src)
221
+
222
+ Returns:
223
+ np.ndarray: A 2x3 transformation matrix
224
+ """
225
+ shift = np.array(shift)
226
+ src_w = scale[0]
227
+ dst_w = output_size[0]
228
+ dst_h = output_size[1]
229
+
230
+ # compute transformation matrix
231
+ rot_rad = np.deg2rad(rot)
232
+ src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
233
+ dst_dir = np.array([0., dst_w * -0.5])
234
+
235
+ # get four corners of the src rectangle in the original image
236
+ src = np.zeros((3, 2), dtype=np.float32)
237
+ src[0, :] = center + scale * shift
238
+ src[1, :] = center + src_dir + scale * shift
239
+ src[2, :] = _get_3rd_point(src[0, :], src[1, :])
240
+
241
+ # get four corners of the dst rectangle in the input image
242
+ dst = np.zeros((3, 2), dtype=np.float32)
243
+ dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
244
+ dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
245
+ dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
246
+
247
+ if inv:
248
+ warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
249
+ else:
250
+ warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
251
+
252
+ return warp_mat
253
+
254
+
255
+ def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
256
+ img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
257
+ """Get the bbox image as the model input by affine transform.
258
+
259
+ Args:
260
+ input_size (dict): The input size of the model.
261
+ bbox_scale (dict): The bbox scale of the img.
262
+ bbox_center (dict): The bbox center of the img.
263
+ img (np.ndarray): The original image.
264
+
265
+ Returns:
266
+ tuple: A tuple containing center and scale.
267
+ - np.ndarray[float32]: img after affine transform.
268
+ - np.ndarray[float32]: bbox scale after affine transform.
269
+ """
270
+ w, h = input_size
271
+ warp_size = (int(w), int(h))
272
+
273
+ # reshape bbox to fixed aspect ratio
274
+ bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
275
+
276
+ # get the affine matrix
277
+ center = bbox_center
278
+ scale = bbox_scale
279
+ rot = 0
280
+ warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
281
+
282
+ # do affine transform
283
+ img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
284
+
285
+ return img, bbox_scale
286
+
287
+
288
+ def get_simcc_maximum(simcc_x: np.ndarray,
289
+ simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
290
+ """Get maximum response location and value from simcc representations.
291
+
292
+ Note:
293
+ instance number: N
294
+ num_keypoints: K
295
+ heatmap height: H
296
+ heatmap width: W
297
+
298
+ Args:
299
+ simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
300
+ simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
301
+
302
+ Returns:
303
+ tuple:
304
+ - locs (np.ndarray): locations of maximum heatmap responses in shape
305
+ (K, 2) or (N, K, 2)
306
+ - vals (np.ndarray): values of maximum heatmap responses in shape
307
+ (K,) or (N, K)
308
+ """
309
+ N, K, Wx = simcc_x.shape
310
+ simcc_x = simcc_x.reshape(N * K, -1)
311
+ simcc_y = simcc_y.reshape(N * K, -1)
312
+
313
+ # get maximum value locations
314
+ x_locs = np.argmax(simcc_x, axis=1)
315
+ y_locs = np.argmax(simcc_y, axis=1)
316
+ locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
317
+ max_val_x = np.amax(simcc_x, axis=1)
318
+ max_val_y = np.amax(simcc_y, axis=1)
319
+
320
+ # get maximum value across x and y axis
321
+ mask = max_val_x > max_val_y
322
+ max_val_x[mask] = max_val_y[mask]
323
+ vals = max_val_x
324
+ locs[vals <= 0.] = -1
325
+
326
+ # reshape
327
+ locs = locs.reshape(N, K, 2)
328
+ vals = vals.reshape(N, K)
329
+
330
+ return locs, vals
331
+
332
+
333
+ def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
334
+ simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
335
+ """Modulate simcc distribution with Gaussian.
336
+
337
+ Args:
338
+ simcc_x (np.ndarray[K, Wx]): model predicted simcc in x.
339
+ simcc_y (np.ndarray[K, Wy]): model predicted simcc in y.
340
+ simcc_split_ratio (int): The split ratio of simcc.
341
+
342
+ Returns:
343
+ tuple: A tuple containing center and scale.
344
+ - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
345
+ - np.ndarray[float32]: scores in shape (K,) or (n, K)
346
+ """
347
+ keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
348
+ keypoints /= simcc_split_ratio
349
+
350
+ return keypoints, scores
351
+
352
+
353
+ def inference_pose(session, out_bbox, oriImg):
354
+ h, w = session.get_inputs()[0].shape[2:]
355
+ model_input_size = (w, h)
356
+ resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
357
+ outputs = inference(session, resized_img)
358
+ keypoints, scores = postprocess(outputs, model_input_size, center, scale)
359
+
360
+ return keypoints, scores
annotator/dwpose/util.py ADDED
@@ -0,0 +1,297 @@
1
+ import math
2
+ import numpy as np
3
+ import matplotlib
4
+ import cv2
5
+
6
+
7
+ eps = 0.01
8
+
9
+
10
+ def smart_resize(x, s):
11
+ Ht, Wt = s
12
+ if x.ndim == 2:
13
+ Ho, Wo = x.shape
14
+ Co = 1
15
+ else:
16
+ Ho, Wo, Co = x.shape
17
+ if Co == 3 or Co == 1:
18
+ k = float(Ht + Wt) / float(Ho + Wo)
19
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
20
+ else:
21
+ return np.stack([smart_resize(x[:, :, i], s) for i in range(Co)], axis=2)
22
+
23
+
24
+ def smart_resize_k(x, fx, fy):
25
+ if x.ndim == 2:
26
+ Ho, Wo = x.shape
27
+ Co = 1
28
+ else:
29
+ Ho, Wo, Co = x.shape
30
+ Ht, Wt = Ho * fy, Wo * fx
31
+ if Co == 3 or Co == 1:
32
+ k = float(Ht + Wt) / float(Ho + Wo)
33
+ return cv2.resize(x, (int(Wt), int(Ht)), interpolation=cv2.INTER_AREA if k < 1 else cv2.INTER_LANCZOS4)
34
+ else:
35
+ return np.stack([smart_resize_k(x[:, :, i], fx, fy) for i in range(Co)], axis=2)
36
+
37
+
38
+ def padRightDownCorner(img, stride, padValue):
39
+ h = img.shape[0]
40
+ w = img.shape[1]
41
+
42
+ pad = 4 * [None]
43
+ pad[0] = 0 # up
44
+ pad[1] = 0 # left
45
+ pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
46
+ pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
47
+
48
+ img_padded = img
49
+ pad_up = np.tile(img_padded[0:1, :, :]*0 + padValue, (pad[0], 1, 1))
50
+ img_padded = np.concatenate((pad_up, img_padded), axis=0)
51
+ pad_left = np.tile(img_padded[:, 0:1, :]*0 + padValue, (1, pad[1], 1))
52
+ img_padded = np.concatenate((pad_left, img_padded), axis=1)
53
+ pad_down = np.tile(img_padded[-2:-1, :, :]*0 + padValue, (pad[2], 1, 1))
54
+ img_padded = np.concatenate((img_padded, pad_down), axis=0)
55
+ pad_right = np.tile(img_padded[:, -2:-1, :]*0 + padValue, (1, pad[3], 1))
56
+ img_padded = np.concatenate((img_padded, pad_right), axis=1)
57
+
58
+ return img_padded, pad
59
+
60
+
61
+ def transfer(model, model_weights):
62
+ transfered_model_weights = {}
63
+ for weights_name in model.state_dict().keys():
64
+ transfered_model_weights[weights_name] = model_weights['.'.join(weights_name.split('.')[1:])]
65
+ return transfered_model_weights
66
+
67
+
68
+ def draw_bodypose(canvas, candidate, subset):
69
+ H, W, C = canvas.shape
70
+ candidate = np.array(candidate)
71
+ subset = np.array(subset)
72
+
73
+ stickwidth = 4
74
+
75
+ limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], \
76
+ [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], \
77
+ [1, 16], [16, 18], [3, 17], [6, 18]]
78
+
79
+ colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \
80
+ [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \
81
+ [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]
82
+
83
+ for i in range(17):
84
+ for n in range(len(subset)):
85
+ index = subset[n][np.array(limbSeq[i]) - 1]
86
+ if -1 in index:
87
+ continue
88
+ Y = candidate[index.astype(int), 0] * float(W)
89
+ X = candidate[index.astype(int), 1] * float(H)
90
+ mX = np.mean(X)
91
+ mY = np.mean(Y)
92
+ length = ((X[0] - X[1]) ** 2 + (Y[0] - Y[1]) ** 2) ** 0.5
93
+ angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1]))
94
+ polygon = cv2.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stickwidth), int(angle), 0, 360, 1)
95
+ cv2.fillConvexPoly(canvas, polygon, colors[i])
96
+
97
+ canvas = (canvas * 0.6).astype(np.uint8)
98
+
99
+ for i in range(18):
100
+ for n in range(len(subset)):
101
+ index = int(subset[n][i])
102
+ if index == -1:
103
+ continue
104
+ x, y = candidate[index][0:2]
105
+ x = int(x * W)
106
+ y = int(y * H)
107
+ cv2.circle(canvas, (int(x), int(y)), 4, colors[i], thickness=-1)
108
+
109
+ return canvas
110
+
111
+
112
+ def draw_handpose(canvas, all_hand_peaks):
113
+ H, W, C = canvas.shape
114
+
115
+ edges = [[0, 1], [1, 2], [2, 3], [3, 4], [0, 5], [5, 6], [6, 7], [7, 8], [0, 9], [9, 10], \
116
+ [10, 11], [11, 12], [0, 13], [13, 14], [14, 15], [15, 16], [0, 17], [17, 18], [18, 19], [19, 20]]
117
+
118
+ for peaks in all_hand_peaks:
119
+ peaks = np.array(peaks)
120
+
121
+ for ie, e in enumerate(edges):
122
+ x1, y1 = peaks[e[0]]
123
+ x2, y2 = peaks[e[1]]
124
+ x1 = int(x1 * W)
125
+ y1 = int(y1 * H)
126
+ x2 = int(x2 * W)
127
+ y2 = int(y2 * H)
128
+ if x1 > eps and y1 > eps and x2 > eps and y2 > eps:
129
+ cv2.line(canvas, (x1, y1), (x2, y2), matplotlib.colors.hsv_to_rgb([ie / float(len(edges)), 1.0, 1.0]) * 255, thickness=2)
130
+
131
+ for i, keyponit in enumerate(peaks):
132
+ x, y = keyponit
133
+ x = int(x * W)
134
+ y = int(y * H)
135
+ if x > eps and y > eps:
136
+ cv2.circle(canvas, (x, y), 4, (0, 0, 255), thickness=-1)
137
+ return canvas
138
+
139
+
140
+ def draw_facepose(canvas, all_lmks):
141
+ H, W, C = canvas.shape
142
+ for lmks in all_lmks:
143
+ lmks = np.array(lmks)
144
+ for lmk in lmks:
145
+ x, y = lmk
146
+ x = int(x * W)
147
+ y = int(y * H)
148
+ if x > eps and y > eps:
149
+ cv2.circle(canvas, (x, y), 3, (255, 255, 255), thickness=-1)
150
+ return canvas
151
+
152
+
153
+ # detect hand according to body pose keypoints
154
+ # please refer to https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/src/openpose/hand/handDetector.cpp
155
+ def handDetect(candidate, subset, oriImg):
156
+ # right hand: wrist 4, elbow 3, shoulder 2
157
+ # left hand: wrist 7, elbow 6, shoulder 5
158
+ ratioWristElbow = 0.33
159
+ detect_result = []
160
+ image_height, image_width = oriImg.shape[0:2]
161
+ for person in subset.astype(int):
162
+ # if any of three not detected
163
+ has_left = np.sum(person[[5, 6, 7]] == -1) == 0
164
+ has_right = np.sum(person[[2, 3, 4]] == -1) == 0
165
+ if not (has_left or has_right):
166
+ continue
167
+ hands = []
168
+ #left hand
169
+ if has_left:
170
+ left_shoulder_index, left_elbow_index, left_wrist_index = person[[5, 6, 7]]
171
+ x1, y1 = candidate[left_shoulder_index][:2]
172
+ x2, y2 = candidate[left_elbow_index][:2]
173
+ x3, y3 = candidate[left_wrist_index][:2]
174
+ hands.append([x1, y1, x2, y2, x3, y3, True])
175
+ # right hand
176
+ if has_right:
177
+ right_shoulder_index, right_elbow_index, right_wrist_index = person[[2, 3, 4]]
178
+ x1, y1 = candidate[right_shoulder_index][:2]
179
+ x2, y2 = candidate[right_elbow_index][:2]
180
+ x3, y3 = candidate[right_wrist_index][:2]
181
+ hands.append([x1, y1, x2, y2, x3, y3, False])
182
+
183
+ for x1, y1, x2, y2, x3, y3, is_left in hands:
184
+ # pos_hand = pos_wrist + ratio * (pos_wrist - pos_elbox) = (1 + ratio) * pos_wrist - ratio * pos_elbox
185
+ # handRectangle.x = posePtr[wrist*3] + ratioWristElbow * (posePtr[wrist*3] - posePtr[elbow*3]);
186
+ # handRectangle.y = posePtr[wrist*3+1] + ratioWristElbow * (posePtr[wrist*3+1] - posePtr[elbow*3+1]);
187
+ # const auto distanceWristElbow = getDistance(poseKeypoints, person, wrist, elbow);
188
+ # const auto distanceElbowShoulder = getDistance(poseKeypoints, person, elbow, shoulder);
189
+ # handRectangle.width = 1.5f * fastMax(distanceWristElbow, 0.9f * distanceElbowShoulder);
190
+ x = x3 + ratioWristElbow * (x3 - x2)
191
+ y = y3 + ratioWristElbow * (y3 - y2)
192
+ distanceWristElbow = math.sqrt((x3 - x2) ** 2 + (y3 - y2) ** 2)
193
+ distanceElbowShoulder = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
194
+ width = 1.5 * max(distanceWristElbow, 0.9 * distanceElbowShoulder)
195
+ # x-y refers to the center --> offset to topLeft point
196
+ # handRectangle.x -= handRectangle.width / 2.f;
197
+ # handRectangle.y -= handRectangle.height / 2.f;
198
+ x -= width / 2
199
+ y -= width / 2 # width = height
200
+ # overflow the image
201
+ if x < 0: x = 0
202
+ if y < 0: y = 0
203
+ width1 = width
204
+ width2 = width
205
+ if x + width > image_width: width1 = image_width - x
206
+ if y + width > image_height: width2 = image_height - y
207
+ width = min(width1, width2)
208
+ # the max hand box value is 20 pixels
209
+ if width >= 20:
210
+ detect_result.append([int(x), int(y), int(width), is_left])
211
+
212
+ '''
213
+ return value: [[x, y, w, True if left hand else False]].
214
+ width=height since the network require squared input.
215
+ x, y is the coordinate of top left
216
+ '''
217
+ return detect_result
218
+
219
+
220
+ # Written by Lvmin
221
+ def faceDetect(candidate, subset, oriImg):
222
+ # left right eye ear 14 15 16 17
223
+ detect_result = []
224
+ image_height, image_width = oriImg.shape[0:2]
225
+ for person in subset.astype(int):
226
+ has_head = person[0] > -1
227
+ if not has_head:
228
+ continue
229
+
230
+ has_left_eye = person[14] > -1
231
+ has_right_eye = person[15] > -1
232
+ has_left_ear = person[16] > -1
233
+ has_right_ear = person[17] > -1
234
+
235
+ if not (has_left_eye or has_right_eye or has_left_ear or has_right_ear):
236
+ continue
237
+
238
+ head, left_eye, right_eye, left_ear, right_ear = person[[0, 14, 15, 16, 17]]
239
+
240
+ width = 0.0
241
+ x0, y0 = candidate[head][:2]
242
+
243
+ if has_left_eye:
244
+ x1, y1 = candidate[left_eye][:2]
245
+ d = max(abs(x0 - x1), abs(y0 - y1))
246
+ width = max(width, d * 3.0)
247
+
248
+ if has_right_eye:
249
+ x1, y1 = candidate[right_eye][:2]
250
+ d = max(abs(x0 - x1), abs(y0 - y1))
251
+ width = max(width, d * 3.0)
252
+
253
+ if has_left_ear:
254
+ x1, y1 = candidate[left_ear][:2]
255
+ d = max(abs(x0 - x1), abs(y0 - y1))
256
+ width = max(width, d * 1.5)
257
+
258
+ if has_right_ear:
259
+ x1, y1 = candidate[right_ear][:2]
260
+ d = max(abs(x0 - x1), abs(y0 - y1))
261
+ width = max(width, d * 1.5)
262
+
263
+ x, y = x0, y0
264
+
265
+ x -= width
266
+ y -= width
267
+
268
+ if x < 0:
269
+ x = 0
270
+
271
+ if y < 0:
272
+ y = 0
273
+
274
+ width1 = width * 2
275
+ width2 = width * 2
276
+
277
+ if x + width > image_width:
278
+ width1 = image_width - x
279
+
280
+ if y + width > image_height:
281
+ width2 = image_height - y
282
+
283
+ width = min(width1, width2)
284
+
285
+ if width >= 20:
286
+ detect_result.append([int(x), int(y), int(width)])
287
+
288
+ return detect_result
289
+
290
+
291
+ # get max index of 2d array
292
+ def npmax(array):
293
+ arrayindex = array.argmax(1)
294
+ arrayvalue = array.max(1)
295
+ i = arrayvalue.argmax()
296
+ j = arrayindex[i]
297
+ return i, j
annotator/dwpose/wholebody.py ADDED
@@ -0,0 +1,142 @@
+ import cv2
+ import numpy as np
+
+ import onnxruntime as ort
+ from .onnxdet import inference_detector
+ from .onnxpose import inference_pose
+
+ class Wholebody:
+     def __init__(self):
+         device = 'cuda:0'
+         providers = ['CPUExecutionProvider'
+                      ] if device == 'cpu' else ['CUDAExecutionProvider']
+         onnx_det = 'annotator/ckpts/yolox_l.onnx'
+         onnx_pose = 'annotator/ckpts/dw-ll_ucoco_384.onnx'
+
+         self.session_det = ort.InferenceSession(path_or_bytes=onnx_det, providers=providers)
+         self.session_pose = ort.InferenceSession(path_or_bytes=onnx_pose, providers=providers)
+
+     def __call__(self, oriImg):
+         det_result = inference_detector(self.session_det, oriImg)
+         keypoints, scores = inference_pose(self.session_pose, det_result, oriImg)
+
+         keypoints_info = np.concatenate(
+             (keypoints, scores[..., None]), axis=-1)
+         # compute neck joint
+         neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
+         # neck score when visualizing pred
+         neck[:, 2:4] = np.logical_and(
+             keypoints_info[:, 5, 2:4] > 0.3,
+             keypoints_info[:, 6, 2:4] > 0.3).astype(int)
+         new_keypoints_info = np.insert(
+             keypoints_info, 17, neck, axis=1)
+         mmpose_idx = [
+             17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3
+         ]
+         openpose_idx = [
+             1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17
+         ]
+         new_keypoints_info[:, openpose_idx] = \
+             new_keypoints_info[:, mmpose_idx]
+         keypoints_info = new_keypoints_info
+
+         keypoints, scores = keypoints_info[
+             ..., :2], keypoints_info[..., 2]
+
+         return keypoints, scores
+
+
+
+
+ # # Copyright (c) OpenMMLab. All rights reserved.
+ # import numpy as np
+ # from . import util
+ # import cv2
+ # import mmcv
+ # import torch
+ # import matplotlib.pyplot as plt
+ # from mmpose.apis import inference_topdown
+ # from mmpose.apis import init_model as init_pose_estimator
+ # from mmpose.evaluation.functional import nms
+ # from mmpose.utils import adapt_mmdet_pipeline
+ # from mmpose.structures import merge_data_samples
+
+ # from mmdet.apis import inference_detector, init_detector
+
+
+ # class Wholebody:
+ #     def __init__(self):
+ #         device = 'cuda:0'
+ #         det_config = 'annotator/dwpose/yolox_config/yolox_l_8xb8-300e_coco.py'
+ #         det_ckpt = 'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_l_8x8_300e_coco/yolox_l_8x8_300e_coco_20211126_140236-d3bd2b23.pth'
+ #         pose_config = 'annotator/dwpose/dwpose_config/dwpose-l_384x288.py'
+ #         pose_ckpt = 'annotator/ckpts/dw-ll_ucoco_384.pth'
+
+ #         # build detector
+ #         self.detector = init_detector(det_config, det_ckpt, device=device)
+ #         self.detector.cfg = adapt_mmdet_pipeline(self.detector.cfg)
+
+ #         # build pose estimator
+ #         self.pose_estimator = init_pose_estimator(
+ #             pose_config,
+ #             pose_ckpt,
+ #             device=device)
+
+ #     def __call__(self, oriImg):
+ #         # predict bbox
+ #         det_result = inference_detector(self.detector, oriImg)
+ #         pred_instance = det_result.pred_instances.cpu().numpy()
+ #         bboxes = np.concatenate(
+ #             (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1)
+ #         bboxes = bboxes[np.logical_and(pred_instance.labels == 0,
+ #                                        pred_instance.scores > 0.3)]
+ #         # # max value
+ #         # if len(bboxes) > 0:
+ #         #     bboxes = bboxes[0].reshape(1,-1)
+ #         bboxes = bboxes[nms(bboxes, 0.3), :4]
+
+ #         # predict keypoints
+ #         if len(bboxes) == 0:
+ #             pose_results = inference_topdown(self.pose_estimator, oriImg)
+ #         else:
+ #             pose_results = inference_topdown(self.pose_estimator, oriImg, bboxes)
+ #         preds = merge_data_samples(pose_results)
+ #         preds = preds.pred_instances
+
+ #         # preds = pose_results[0].pred_instances
+ #         keypoints = preds.get('transformed_keypoints',
+ #                               preds.keypoints)
+ #         if 'keypoint_scores' in preds:
+ #             scores = preds.keypoint_scores
+ #         else:
+ #             scores = np.ones(keypoints.shape[:-1])
+
+ #         if 'keypoints_visible' in preds:
+ #             visible = preds.keypoints_visible
+ #         else:
+ #             visible = np.ones(keypoints.shape[:-1])
+ #         keypoints_info = np.concatenate(
+ #             (keypoints, scores[..., None], visible[..., None]),
+ #             axis=-1)
+ #         # compute neck joint
+ #         neck = np.mean(keypoints_info[:, [5, 6]], axis=1)
+ #         # neck score when visualizing pred
+ #         neck[:, 2:4] = np.logical_and(
+ #             keypoints_info[:, 5, 2:4] > 0.3,
+ #             keypoints_info[:, 6, 2:4] > 0.3).astype(int)
+ #         new_keypoints_info = np.insert(
+ #             keypoints_info, 17, neck, axis=1)
+ #         mmpose_idx = [
+ #             17, 6, 8, 10, 7, 9, 12, 14, 16, 13, 15, 2, 1, 4, 3
+ #         ]
+ #         openpose_idx = [
+ #             1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17
+ #         ]
+ #         new_keypoints_info[:, openpose_idx] = \
+ #             new_keypoints_info[:, mmpose_idx]
+ #         keypoints_info = new_keypoints_info
+
+ #         keypoints, scores, visible = keypoints_info[
+ #             ..., :2], keypoints_info[..., 2], keypoints_info[..., 3]
+
+ #         return keypoints, scores
annotator/dwpose/yolox_config/yolox_l_8xb8-300e_coco.py ADDED
@@ -0,0 +1,245 @@
1
+ img_scale = (640, 640) # width, height
2
+
3
+ # model settings
4
+ model = dict(
5
+ type='YOLOX',
6
+ data_preprocessor=dict(
7
+ type='DetDataPreprocessor',
8
+ pad_size_divisor=32,
9
+ batch_augments=[
10
+ dict(
11
+ type='BatchSyncRandomResize',
12
+ random_size_range=(480, 800),
13
+ size_divisor=32,
14
+ interval=10)
15
+ ]),
16
+ backbone=dict(
17
+ type='CSPDarknet',
18
+ deepen_factor=1.0,
19
+ widen_factor=1.0,
20
+ out_indices=(2, 3, 4),
21
+ use_depthwise=False,
22
+ spp_kernal_sizes=(5, 9, 13),
23
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
24
+ act_cfg=dict(type='Swish'),
25
+ ),
26
+ neck=dict(
27
+ type='YOLOXPAFPN',
28
+ in_channels=[256, 512, 1024],
29
+ out_channels=256,
30
+ num_csp_blocks=3,
31
+ use_depthwise=False,
32
+ upsample_cfg=dict(scale_factor=2, mode='nearest'),
33
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
34
+ act_cfg=dict(type='Swish')),
35
+ bbox_head=dict(
36
+ type='YOLOXHead',
37
+ num_classes=80,
38
+ in_channels=256,
39
+ feat_channels=256,
40
+ stacked_convs=2,
41
+ strides=(8, 16, 32),
42
+ use_depthwise=False,
43
+ norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
44
+ act_cfg=dict(type='Swish'),
45
+ loss_cls=dict(
46
+ type='CrossEntropyLoss',
47
+ use_sigmoid=True,
48
+ reduction='sum',
49
+ loss_weight=1.0),
50
+ loss_bbox=dict(
51
+ type='IoULoss',
52
+ mode='square',
53
+ eps=1e-16,
54
+ reduction='sum',
55
+ loss_weight=5.0),
56
+ loss_obj=dict(
57
+ type='CrossEntropyLoss',
58
+ use_sigmoid=True,
59
+ reduction='sum',
60
+ loss_weight=1.0),
61
+ loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0)),
62
+ train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
63
+ # In order to align the source code, the threshold of the val phase is
64
+ # 0.01, and the threshold of the test phase is 0.001.
65
+ test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))
66
+
67
+ # dataset settings
68
+ data_root = 'data/coco/'
69
+ dataset_type = 'CocoDataset'
70
+
71
+ # Example to use different file client
72
+ # Method 1: simply set the data root and let the file I/O module
73
+ # automatically infer from prefix (not support LMDB and Memcache yet)
74
+
75
+ # data_root = 's3://openmmlab/datasets/detection/coco/'
76
+
77
+ # Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
78
+ # backend_args = dict(
79
+ # backend='petrel',
80
+ # path_mapping=dict({
81
+ # './data/': 's3://openmmlab/datasets/detection/',
82
+ # 'data/': 's3://openmmlab/datasets/detection/'
83
+ # }))
84
+ backend_args = None
85
+
86
+ train_pipeline = [
87
+ dict(type='Mosaic', img_scale=img_scale, pad_val=114.0),
88
+ dict(
89
+ type='RandomAffine',
90
+ scaling_ratio_range=(0.1, 2),
91
+ # img_scale is (width, height)
92
+ border=(-img_scale[0] // 2, -img_scale[1] // 2)),
93
+ dict(
94
+ type='MixUp',
95
+ img_scale=img_scale,
96
+ ratio_range=(0.8, 1.6),
97
+ pad_val=114.0),
98
+ dict(type='YOLOXHSVRandomAug'),
99
+ dict(type='RandomFlip', prob=0.5),
100
+ # According to the official implementation, multi-scale
101
+ # training is not considered here but in the
102
+ # 'mmdet/models/detectors/yolox.py'.
103
+ # Resize and Pad are for the last 15 epochs when Mosaic,
104
+ # RandomAffine, and MixUp are closed by YOLOXModeSwitchHook.
105
+ dict(type='Resize', scale=img_scale, keep_ratio=True),
106
+ dict(
107
+ type='Pad',
108
+ pad_to_square=True,
109
+ # If the image is three-channel, the pad value needs
110
+ # to be set separately for each channel.
111
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
112
+ dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
113
+ dict(type='PackDetInputs')
114
+ ]
115
+
116
+ train_dataset = dict(
117
+ # use MultiImageMixDataset wrapper to support mosaic and mixup
118
+ type='MultiImageMixDataset',
119
+ dataset=dict(
120
+ type=dataset_type,
121
+ data_root=data_root,
122
+ ann_file='annotations/instances_train2017.json',
123
+ data_prefix=dict(img='train2017/'),
124
+ pipeline=[
125
+ dict(type='LoadImageFromFile', backend_args=backend_args),
126
+ dict(type='LoadAnnotations', with_bbox=True)
127
+ ],
128
+ filter_cfg=dict(filter_empty_gt=False, min_size=32),
129
+ backend_args=backend_args),
130
+ pipeline=train_pipeline)
131
+
132
+ test_pipeline = [
133
+ dict(type='LoadImageFromFile', backend_args=backend_args),
134
+ dict(type='Resize', scale=img_scale, keep_ratio=True),
135
+ dict(
136
+ type='Pad',
137
+ pad_to_square=True,
138
+ pad_val=dict(img=(114.0, 114.0, 114.0))),
139
+ dict(type='LoadAnnotations', with_bbox=True),
140
+ dict(
141
+ type='PackDetInputs',
142
+ meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
143
+ 'scale_factor'))
144
+ ]
145
+
146
+ train_dataloader = dict(
147
+ batch_size=8,
148
+ num_workers=4,
149
+ persistent_workers=True,
150
+ sampler=dict(type='DefaultSampler', shuffle=True),
151
+ dataset=train_dataset)
152
+ val_dataloader = dict(
153
+ batch_size=8,
154
+ num_workers=4,
155
+ persistent_workers=True,
156
+ drop_last=False,
157
+ sampler=dict(type='DefaultSampler', shuffle=False),
158
+ dataset=dict(
159
+ type=dataset_type,
160
+ data_root=data_root,
161
+ ann_file='annotations/instances_val2017.json',
162
+ data_prefix=dict(img='val2017/'),
163
+ test_mode=True,
164
+ pipeline=test_pipeline,
165
+ backend_args=backend_args))
166
+ test_dataloader = val_dataloader
167
+
168
+ val_evaluator = dict(
169
+ type='CocoMetric',
170
+ ann_file=data_root + 'annotations/instances_val2017.json',
171
+ metric='bbox',
172
+ backend_args=backend_args)
173
+ test_evaluator = val_evaluator
174
+
175
+ # training settings
176
+ max_epochs = 300
177
+ num_last_epochs = 15
178
+ interval = 10
179
+
180
+ train_cfg = dict(max_epochs=max_epochs, val_interval=interval)
181
+
182
+ # optimizer
183
+ # default 8 gpu
184
+ base_lr = 0.01
185
+ optim_wrapper = dict(
186
+ type='OptimWrapper',
187
+ optimizer=dict(
188
+ type='SGD', lr=base_lr, momentum=0.9, weight_decay=5e-4,
189
+ nesterov=True),
190
+ paramwise_cfg=dict(norm_decay_mult=0., bias_decay_mult=0.))
191
+
192
+ # learning rate
193
+ param_scheduler = [
194
+ dict(
195
+ # use quadratic formula to warm up 5 epochs
196
+ # and lr is updated by iteration
197
+ # TODO: fix default scope in get function
198
+ type='mmdet.QuadraticWarmupLR',
199
+ by_epoch=True,
200
+ begin=0,
201
+ end=5,
202
+ convert_to_iter_based=True),
203
+ dict(
204
+ # use cosine lr from 5 to 285 epoch
205
+ type='CosineAnnealingLR',
206
+ eta_min=base_lr * 0.05,
207
+ begin=5,
208
+ T_max=max_epochs - num_last_epochs,
209
+ end=max_epochs - num_last_epochs,
210
+ by_epoch=True,
211
+ convert_to_iter_based=True),
212
+ dict(
213
+ # use fixed lr during last 15 epochs
214
+ type='ConstantLR',
215
+ by_epoch=True,
216
+ factor=1,
217
+ begin=max_epochs - num_last_epochs,
218
+ end=max_epochs,
219
+ )
220
+ ]
221
+
222
+ default_hooks = dict(
223
+ checkpoint=dict(
224
+ interval=interval,
225
+ max_keep_ckpts=3 # only keep latest 3 checkpoints
226
+ ))
227
+
228
+ custom_hooks = [
229
+ dict(
230
+ type='YOLOXModeSwitchHook',
231
+ num_last_epochs=num_last_epochs,
232
+ priority=48),
233
+ dict(type='SyncNormHook', priority=48),
234
+ dict(
235
+ type='EMAHook',
236
+ ema_type='ExpMomentumEMA',
237
+ momentum=0.0001,
238
+ update_buffers=True,
239
+ priority=49)
240
+ ]
241
+
242
+ # NOTE: `auto_scale_lr` is for automatically scaling LR,
243
+ # USER SHOULD NOT CHANGE ITS VALUES.
244
+ # base_batch_size = (8 GPUs) x (8 samples per GPU)
245
+ auto_scale_lr = dict(base_batch_size=64)
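A quick way to sanity-check a config like the one above is to load it with mmengine's Config API. This is a minimal sketch, assuming mmengine/mmdetection are installed and that the path follows this repository's annotator/dwpose/yolox_config/ layout:

# Sketch: load the YOLOX training config and inspect a few of the fields defined above.
from mmengine.config import Config

cfg = Config.fromfile('annotator/dwpose/yolox_config/yolox_l_8xb8-300e_coco.py')
print(cfg.train_cfg)                      # {'max_epochs': 300, 'val_interval': 10}
print(cfg.optim_wrapper.optimizer.lr)     # 0.01
print(cfg.auto_scale_lr.base_batch_size)  # 64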
annotator/hed/__init__.py ADDED
@@ -0,0 +1,132 @@
1
+ import numpy as np
2
+ import cv2
3
+ import os
4
+ import torch
5
+ from einops import rearrange
6
+ from annotator.util import annotator_ckpts_path
7
+
8
+
9
+ class Network(torch.nn.Module):
10
+ def __init__(self, model_path):
11
+ super().__init__()
12
+
13
+ self.netVggOne = torch.nn.Sequential(
14
+ torch.nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1),
15
+ torch.nn.ReLU(inplace=False),
16
+ torch.nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
17
+ torch.nn.ReLU(inplace=False)
18
+ )
19
+
20
+ self.netVggTwo = torch.nn.Sequential(
21
+ torch.nn.MaxPool2d(kernel_size=2, stride=2),
22
+ torch.nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
23
+ torch.nn.ReLU(inplace=False),
24
+ torch.nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
25
+ torch.nn.ReLU(inplace=False)
26
+ )
27
+
28
+ self.netVggThr = torch.nn.Sequential(
29
+ torch.nn.MaxPool2d(kernel_size=2, stride=2),
30
+ torch.nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
31
+ torch.nn.ReLU(inplace=False),
32
+ torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
33
+ torch.nn.ReLU(inplace=False),
34
+ torch.nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
35
+ torch.nn.ReLU(inplace=False)
36
+ )
37
+
38
+ self.netVggFou = torch.nn.Sequential(
39
+ torch.nn.MaxPool2d(kernel_size=2, stride=2),
40
+ torch.nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1),
41
+ torch.nn.ReLU(inplace=False),
42
+ torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
43
+ torch.nn.ReLU(inplace=False),
44
+ torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
45
+ torch.nn.ReLU(inplace=False)
46
+ )
47
+
48
+ self.netVggFiv = torch.nn.Sequential(
49
+ torch.nn.MaxPool2d(kernel_size=2, stride=2),
50
+ torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
51
+ torch.nn.ReLU(inplace=False),
52
+ torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
53
+ torch.nn.ReLU(inplace=False),
54
+ torch.nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, stride=1, padding=1),
55
+ torch.nn.ReLU(inplace=False)
56
+ )
57
+
58
+ self.netScoreOne = torch.nn.Conv2d(in_channels=64, out_channels=1, kernel_size=1, stride=1, padding=0)
59
+ self.netScoreTwo = torch.nn.Conv2d(in_channels=128, out_channels=1, kernel_size=1, stride=1, padding=0)
60
+ self.netScoreThr = torch.nn.Conv2d(in_channels=256, out_channels=1, kernel_size=1, stride=1, padding=0)
61
+ self.netScoreFou = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
62
+ self.netScoreFiv = torch.nn.Conv2d(in_channels=512, out_channels=1, kernel_size=1, stride=1, padding=0)
63
+
64
+ self.netCombine = torch.nn.Sequential(
65
+ torch.nn.Conv2d(in_channels=5, out_channels=1, kernel_size=1, stride=1, padding=0),
66
+ torch.nn.Sigmoid()
67
+ )
68
+
69
+ self.load_state_dict({strKey.replace('module', 'net'): tenWeight for strKey, tenWeight in torch.load(model_path).items()})
70
+
71
+ def forward(self, tenInput):
72
+ tenInput = tenInput * 255.0
73
+ tenInput = tenInput - torch.tensor(data=[104.00698793, 116.66876762, 122.67891434], dtype=tenInput.dtype, device=tenInput.device).view(1, 3, 1, 1)
74
+
75
+ tenVggOne = self.netVggOne(tenInput)
76
+ tenVggTwo = self.netVggTwo(tenVggOne)
77
+ tenVggThr = self.netVggThr(tenVggTwo)
78
+ tenVggFou = self.netVggFou(tenVggThr)
79
+ tenVggFiv = self.netVggFiv(tenVggFou)
80
+
81
+ tenScoreOne = self.netScoreOne(tenVggOne)
82
+ tenScoreTwo = self.netScoreTwo(tenVggTwo)
83
+ tenScoreThr = self.netScoreThr(tenVggThr)
84
+ tenScoreFou = self.netScoreFou(tenVggFou)
85
+ tenScoreFiv = self.netScoreFiv(tenVggFiv)
86
+
87
+ tenScoreOne = torch.nn.functional.interpolate(input=tenScoreOne, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
88
+ tenScoreTwo = torch.nn.functional.interpolate(input=tenScoreTwo, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
89
+ tenScoreThr = torch.nn.functional.interpolate(input=tenScoreThr, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
90
+ tenScoreFou = torch.nn.functional.interpolate(input=tenScoreFou, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
91
+ tenScoreFiv = torch.nn.functional.interpolate(input=tenScoreFiv, size=(tenInput.shape[2], tenInput.shape[3]), mode='bilinear', align_corners=False)
92
+
93
+ return self.netCombine(torch.cat([ tenScoreOne, tenScoreTwo, tenScoreThr, tenScoreFou, tenScoreFiv ], 1))
94
+
95
+
96
+ class HEDdetector:
97
+ def __init__(self):
98
+ remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth"
99
+ modelpath = os.path.join(annotator_ckpts_path, "network-bsds500.pth")
100
+ if not os.path.exists(modelpath):
101
+ from basicsr.utils.download_util import load_file_from_url
102
+ load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
103
+ self.netNetwork = Network(modelpath).cuda().eval()
104
+
105
+ def __call__(self, input_image):
106
+ assert input_image.ndim == 3
107
+ input_image = input_image[:, :, ::-1].copy()
108
+ with torch.no_grad():
109
+ image_hed = torch.from_numpy(input_image).float().cuda()
110
+ image_hed = image_hed / 255.0
111
+ image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
112
+ edge = self.netNetwork(image_hed)[0]
113
+ edge = (edge.cpu().numpy() * 255.0).clip(0, 255).astype(np.uint8)
114
+ return edge[0]
115
+
116
+
117
+ def nms(x, t, s):
118
+ x = cv2.GaussianBlur(x.astype(np.float32), (0, 0), s)
119
+
120
+ f1 = np.array([[0, 0, 0], [1, 1, 1], [0, 0, 0]], dtype=np.uint8)
121
+ f2 = np.array([[0, 1, 0], [0, 1, 0], [0, 1, 0]], dtype=np.uint8)
122
+ f3 = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.uint8)
123
+ f4 = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=np.uint8)
124
+
125
+ y = np.zeros_like(x)
126
+
127
+ for f in [f1, f2, f3, f4]:
128
+ np.putmask(y, cv2.dilate(x, kernel=f) == x, x)
129
+
130
+ z = np.zeros_like(y, dtype=np.uint8)
131
+ z[y > t] = 255
132
+ return z
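For reference, a minimal usage sketch of the HED annotator defined above. Assumptions: a CUDA device is available, the network-bsds500.pth checkpoint can be downloaded via basicsr on first use, and 'example.jpg' is a hypothetical input image.

import cv2
from annotator.hed import HEDdetector, nms

image = cv2.imread('example.jpg')        # hypothetical H x W x 3 uint8 image
apply_hed = HEDdetector()                # downloads the checkpoint on first use
edge_soft = apply_hed(image)             # H x W uint8 soft edge map
edge_thin = nms(edge_soft, 127, 3.0)     # Gaussian blur + directional NMS + threshold at 127
cv2.imwrite('hed_edges.png', edge_thin)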
annotator/hed/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (4.59 kB).
 
annotator/midas/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+
5
+ from einops import rearrange
6
+ from .api import MiDaSInference
7
+
8
+
9
+ class MidasDetector:
10
+ def __init__(self):
11
+ self.model = MiDaSInference(model_type="dpt_hybrid").cuda()
12
+
13
+ def __call__(self, input_image, a=np.pi * 2.0, bg_th=0.1):
14
+ assert input_image.ndim == 3
15
+ image_depth = input_image
16
+ with torch.no_grad():
17
+ image_depth = torch.from_numpy(image_depth).float().cuda()
18
+ image_depth = image_depth / 127.5 - 1.0
19
+ image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
20
+ depth = self.model(image_depth)[0]
21
+
22
+ depth_pt = depth.clone()
23
+ depth_pt -= torch.min(depth_pt)
24
+ depth_pt /= torch.max(depth_pt)
25
+ depth_pt = depth_pt.cpu().numpy()
26
+ depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
27
+
28
+ depth_np = depth.cpu().numpy()
29
+ x = cv2.Sobel(depth_np, cv2.CV_32F, 1, 0, ksize=3)
30
+ y = cv2.Sobel(depth_np, cv2.CV_32F, 0, 1, ksize=3)
31
+ z = np.ones_like(x) * a
32
+ x[depth_pt < bg_th] = 0
33
+ y[depth_pt < bg_th] = 0
34
+ normal = np.stack([x, y, z], axis=2)
35
+ normal /= np.sum(normal ** 2.0, axis=2, keepdims=True) ** 0.5
36
+ normal_image = (normal * 127.5 + 127.5).clip(0, 255).astype(np.uint8)
37
+
38
+ return depth_image, normal_image
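A minimal usage sketch for the depth/normal annotator above. Assumptions: a CUDA device, the dpt_hybrid checkpoint is fetched on first use, and the input is resized so that height and width are multiples of 32, as the calling pipelines are expected to do before invoking the annotator.

import cv2
from annotator.midas import MidasDetector

image = cv2.imread('example.jpg')            # hypothetical input path
image = cv2.resize(image, (512, 384))        # dsize is (W, H); both multiples of 32 for the DPT backbone
apply_midas = MidasDetector()
depth_map, normal_map = apply_midas(image)   # (384, 512) uint8 and (384, 512, 3) uint8
cv2.imwrite('depth.png', depth_map)
cv2.imwrite('normal.png', normal_map)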
annotator/midas/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.57 kB).
 
annotator/midas/__pycache__/api.cpython-310.pyc ADDED
Binary file (3.69 kB).
 
annotator/midas/api.py ADDED
@@ -0,0 +1,169 @@
1
+ # based on https://github.com/isl-org/MiDaS
2
+
3
+ import cv2
4
+ import os
5
+ import torch
6
+ import torch.nn as nn
7
+ from torchvision.transforms import Compose
8
+
9
+ from .midas.dpt_depth import DPTDepthModel
10
+ from .midas.midas_net import MidasNet
11
+ from .midas.midas_net_custom import MidasNet_small
12
+ from .midas.transforms import Resize, NormalizeImage, PrepareForNet
13
+ from annotator.util import annotator_ckpts_path
14
+
15
+
16
+ ISL_PATHS = {
17
+ "dpt_large": os.path.join(annotator_ckpts_path, "dpt_large-midas-2f21e586.pt"),
18
+ "dpt_hybrid": os.path.join(annotator_ckpts_path, "dpt_hybrid-midas-501f0c75.pt"),
19
+ "midas_v21": "",
20
+ "midas_v21_small": "",
21
+ }
22
+
23
+ remote_model_path = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt"
24
+
25
+
26
+ def disabled_train(self, mode=True):
27
+ """Overwrite model.train with this function to make sure train/eval mode
28
+ does not change anymore."""
29
+ return self
30
+
31
+
32
+ def load_midas_transform(model_type):
33
+ # https://github.com/isl-org/MiDaS/blob/master/run.py
34
+ # load transform only
35
+ if model_type == "dpt_large": # DPT-Large
36
+ net_w, net_h = 384, 384
37
+ resize_mode = "minimal"
38
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
39
+
40
+ elif model_type == "dpt_hybrid": # DPT-Hybrid
41
+ net_w, net_h = 384, 384
42
+ resize_mode = "minimal"
43
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
44
+
45
+ elif model_type == "midas_v21":
46
+ net_w, net_h = 384, 384
47
+ resize_mode = "upper_bound"
48
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
49
+
50
+ elif model_type == "midas_v21_small":
51
+ net_w, net_h = 256, 256
52
+ resize_mode = "upper_bound"
53
+ normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
54
+
55
+ else:
56
+ assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
57
+
58
+ transform = Compose(
59
+ [
60
+ Resize(
61
+ net_w,
62
+ net_h,
63
+ resize_target=None,
64
+ keep_aspect_ratio=True,
65
+ ensure_multiple_of=32,
66
+ resize_method=resize_mode,
67
+ image_interpolation_method=cv2.INTER_CUBIC,
68
+ ),
69
+ normalization,
70
+ PrepareForNet(),
71
+ ]
72
+ )
73
+
74
+ return transform
75
+
76
+
77
+ def load_model(model_type):
78
+ # https://github.com/isl-org/MiDaS/blob/master/run.py
79
+ # load network
80
+ model_path = ISL_PATHS[model_type]
81
+ if model_type == "dpt_large": # DPT-Large
82
+ model = DPTDepthModel(
83
+ path=model_path,
84
+ backbone="vitl16_384",
85
+ non_negative=True,
86
+ )
87
+ net_w, net_h = 384, 384
88
+ resize_mode = "minimal"
89
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
90
+
91
+ elif model_type == "dpt_hybrid": # DPT-Hybrid
92
+ if not os.path.exists(model_path):
93
+ from basicsr.utils.download_util import load_file_from_url
94
+ load_file_from_url(remote_model_path, model_dir=annotator_ckpts_path)
95
+
96
+ model = DPTDepthModel(
97
+ path=model_path,
98
+ backbone="vitb_rn50_384",
99
+ non_negative=True,
100
+ )
101
+ net_w, net_h = 384, 384
102
+ resize_mode = "minimal"
103
+ normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
104
+
105
+ elif model_type == "midas_v21":
106
+ model = MidasNet(model_path, non_negative=True)
107
+ net_w, net_h = 384, 384
108
+ resize_mode = "upper_bound"
109
+ normalization = NormalizeImage(
110
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
111
+ )
112
+
113
+ elif model_type == "midas_v21_small":
114
+ model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
115
+ non_negative=True, blocks={'expand': True})
116
+ net_w, net_h = 256, 256
117
+ resize_mode = "upper_bound"
118
+ normalization = NormalizeImage(
119
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
120
+ )
121
+
122
+ else:
123
+ print(f"model_type '{model_type}' not implemented, use: --model_type large")
124
+ assert False
125
+
126
+ transform = Compose(
127
+ [
128
+ Resize(
129
+ net_w,
130
+ net_h,
131
+ resize_target=None,
132
+ keep_aspect_ratio=True,
133
+ ensure_multiple_of=32,
134
+ resize_method=resize_mode,
135
+ image_interpolation_method=cv2.INTER_CUBIC,
136
+ ),
137
+ normalization,
138
+ PrepareForNet(),
139
+ ]
140
+ )
141
+
142
+ return model.eval(), transform
143
+
144
+
145
+ class MiDaSInference(nn.Module):
146
+ MODEL_TYPES_TORCH_HUB = [
147
+ "DPT_Large",
148
+ "DPT_Hybrid",
149
+ "MiDaS_small"
150
+ ]
151
+ MODEL_TYPES_ISL = [
152
+ "dpt_large",
153
+ "dpt_hybrid",
154
+ "midas_v21",
155
+ "midas_v21_small",
156
+ ]
157
+
158
+ def __init__(self, model_type):
159
+ super().__init__()
160
+ assert (model_type in self.MODEL_TYPES_ISL)
161
+ model, _ = load_model(model_type)
162
+ self.model = model
163
+ self.model.train = disabled_train
164
+
165
+ def forward(self, x):
166
+ with torch.no_grad():
167
+ prediction = self.model(x)
168
+ return prediction
169
+
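The lower-level API above can also be driven directly. A minimal sketch under the same assumptions (CUDA device, downloadable dpt_hybrid checkpoint, a timm version compatible with this vendored MiDaS code), using a dummy image:

import numpy as np
import torch
from annotator.midas.api import MiDaSInference, load_midas_transform

model = MiDaSInference(model_type='dpt_hybrid').cuda()
transform = load_midas_transform('dpt_hybrid')

img = np.random.rand(384, 384, 3).astype(np.float32)    # dummy H x W x 3 image in [0, 1]
sample = transform({'image': img})                       # Resize -> NormalizeImage -> PrepareForNet
x = torch.from_numpy(sample['image']).unsqueeze(0).cuda()
depth = model(x)                                         # (1, 384, 384); no_grad is applied inside forward
print(depth.shape)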
annotator/midas/midas/__init__.py ADDED
File without changes
annotator/midas/midas/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (155 Bytes).
 
annotator/midas/midas/__pycache__/base_model.cpython-310.pyc ADDED
Binary file (683 Bytes).
 
annotator/midas/midas/__pycache__/blocks.cpython-310.pyc ADDED
Binary file (7.2 kB).
 
annotator/midas/midas/__pycache__/dpt_depth.cpython-310.pyc ADDED
Binary file (2.91 kB).
 
annotator/midas/midas/__pycache__/midas_net.cpython-310.pyc ADDED
Binary file (2.59 kB).
 
annotator/midas/midas/__pycache__/midas_net_custom.cpython-310.pyc ADDED
Binary file (3.71 kB).
 
annotator/midas/midas/__pycache__/transforms.cpython-310.pyc ADDED
Binary file (5.67 kB).
 
annotator/midas/midas/__pycache__/vit.cpython-310.pyc ADDED
Binary file (9.36 kB).
 
annotator/midas/midas/base_model.py ADDED
@@ -0,0 +1,16 @@
1
+ import torch
2
+
3
+
4
+ class BaseModel(torch.nn.Module):
5
+ def load(self, path):
6
+ """Load model from file.
7
+
8
+ Args:
9
+ path (str): file path
10
+ """
11
+ parameters = torch.load(path, map_location=torch.device('cpu'))
12
+
13
+ if "optimizer" in parameters:
14
+ parameters = parameters["model"]
15
+
16
+ self.load_state_dict(parameters)
annotator/midas/midas/blocks.py ADDED
@@ -0,0 +1,342 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .vit import (
5
+ _make_pretrained_vitb_rn50_384,
6
+ _make_pretrained_vitl16_384,
7
+ _make_pretrained_vitb16_384,
8
+ forward_vit,
9
+ )
10
+
11
+ def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
12
+ if backbone == "vitl16_384":
13
+ pretrained = _make_pretrained_vitl16_384(
14
+ use_pretrained, hooks=hooks, use_readout=use_readout
15
+ )
16
+ scratch = _make_scratch(
17
+ [256, 512, 1024, 1024], features, groups=groups, expand=expand
18
+ ) # ViT-L/16 - 85.0% Top1 (backbone)
19
+ elif backbone == "vitb_rn50_384":
20
+ pretrained = _make_pretrained_vitb_rn50_384(
21
+ use_pretrained,
22
+ hooks=hooks,
23
+ use_vit_only=use_vit_only,
24
+ use_readout=use_readout,
25
+ )
26
+ scratch = _make_scratch(
27
+ [256, 512, 768, 768], features, groups=groups, expand=expand
28
+ ) # ViT-H/16 - 85.0% Top1 (backbone)
29
+ elif backbone == "vitb16_384":
30
+ pretrained = _make_pretrained_vitb16_384(
31
+ use_pretrained, hooks=hooks, use_readout=use_readout
32
+ )
33
+ scratch = _make_scratch(
34
+ [96, 192, 384, 768], features, groups=groups, expand=expand
35
+ ) # ViT-B/16 - 84.6% Top1 (backbone)
36
+ elif backbone == "resnext101_wsl":
37
+ pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
38
+ scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3
39
+ elif backbone == "efficientnet_lite3":
40
+ pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
41
+ scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3
42
+ else:
43
+ print(f"Backbone '{backbone}' not implemented")
44
+ assert False
45
+
46
+ return pretrained, scratch
47
+
48
+
49
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
50
+ scratch = nn.Module()
51
+
52
+ out_shape1 = out_shape
53
+ out_shape2 = out_shape
54
+ out_shape3 = out_shape
55
+ out_shape4 = out_shape
56
+ if expand==True:
57
+ out_shape1 = out_shape
58
+ out_shape2 = out_shape*2
59
+ out_shape3 = out_shape*4
60
+ out_shape4 = out_shape*8
61
+
62
+ scratch.layer1_rn = nn.Conv2d(
63
+ in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
64
+ )
65
+ scratch.layer2_rn = nn.Conv2d(
66
+ in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
67
+ )
68
+ scratch.layer3_rn = nn.Conv2d(
69
+ in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
70
+ )
71
+ scratch.layer4_rn = nn.Conv2d(
72
+ in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
73
+ )
74
+
75
+ return scratch
76
+
77
+
78
+ def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
79
+ efficientnet = torch.hub.load(
80
+ "rwightman/gen-efficientnet-pytorch",
81
+ "tf_efficientnet_lite3",
82
+ pretrained=use_pretrained,
83
+ exportable=exportable
84
+ )
85
+ return _make_efficientnet_backbone(efficientnet)
86
+
87
+
88
+ def _make_efficientnet_backbone(effnet):
89
+ pretrained = nn.Module()
90
+
91
+ pretrained.layer1 = nn.Sequential(
92
+ effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
93
+ )
94
+ pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
95
+ pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
96
+ pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
97
+
98
+ return pretrained
99
+
100
+
101
+ def _make_resnet_backbone(resnet):
102
+ pretrained = nn.Module()
103
+ pretrained.layer1 = nn.Sequential(
104
+ resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
105
+ )
106
+
107
+ pretrained.layer2 = resnet.layer2
108
+ pretrained.layer3 = resnet.layer3
109
+ pretrained.layer4 = resnet.layer4
110
+
111
+ return pretrained
112
+
113
+
114
+ def _make_pretrained_resnext101_wsl(use_pretrained):
115
+ resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
116
+ return _make_resnet_backbone(resnet)
117
+
118
+
119
+
120
+ class Interpolate(nn.Module):
121
+ """Interpolation module.
122
+ """
123
+
124
+ def __init__(self, scale_factor, mode, align_corners=False):
125
+ """Init.
126
+
127
+ Args:
128
+ scale_factor (float): scaling
129
+ mode (str): interpolation mode
130
+ """
131
+ super(Interpolate, self).__init__()
132
+
133
+ self.interp = nn.functional.interpolate
134
+ self.scale_factor = scale_factor
135
+ self.mode = mode
136
+ self.align_corners = align_corners
137
+
138
+ def forward(self, x):
139
+ """Forward pass.
140
+
141
+ Args:
142
+ x (tensor): input
143
+
144
+ Returns:
145
+ tensor: interpolated data
146
+ """
147
+
148
+ x = self.interp(
149
+ x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
150
+ )
151
+
152
+ return x
153
+
154
+
155
+ class ResidualConvUnit(nn.Module):
156
+ """Residual convolution module.
157
+ """
158
+
159
+ def __init__(self, features):
160
+ """Init.
161
+
162
+ Args:
163
+ features (int): number of features
164
+ """
165
+ super().__init__()
166
+
167
+ self.conv1 = nn.Conv2d(
168
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
169
+ )
170
+
171
+ self.conv2 = nn.Conv2d(
172
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
173
+ )
174
+
175
+ self.relu = nn.ReLU(inplace=True)
176
+
177
+ def forward(self, x):
178
+ """Forward pass.
179
+
180
+ Args:
181
+ x (tensor): input
182
+
183
+ Returns:
184
+ tensor: output
185
+ """
186
+ out = self.relu(x)
187
+ out = self.conv1(out)
188
+ out = self.relu(out)
189
+ out = self.conv2(out)
190
+
191
+ return out + x
192
+
193
+
194
+ class FeatureFusionBlock(nn.Module):
195
+ """Feature fusion block.
196
+ """
197
+
198
+ def __init__(self, features):
199
+ """Init.
200
+
201
+ Args:
202
+ features (int): number of features
203
+ """
204
+ super(FeatureFusionBlock, self).__init__()
205
+
206
+ self.resConfUnit1 = ResidualConvUnit(features)
207
+ self.resConfUnit2 = ResidualConvUnit(features)
208
+
209
+ def forward(self, *xs):
210
+ """Forward pass.
211
+
212
+ Returns:
213
+ tensor: output
214
+ """
215
+ output = xs[0]
216
+
217
+ if len(xs) == 2:
218
+ output += self.resConfUnit1(xs[1])
219
+
220
+ output = self.resConfUnit2(output)
221
+
222
+ output = nn.functional.interpolate(
223
+ output, scale_factor=2, mode="bilinear", align_corners=True
224
+ )
225
+
226
+ return output
227
+
228
+
229
+
230
+
231
+ class ResidualConvUnit_custom(nn.Module):
232
+ """Residual convolution module.
233
+ """
234
+
235
+ def __init__(self, features, activation, bn):
236
+ """Init.
237
+
238
+ Args:
239
+ features (int): number of features
240
+ """
241
+ super().__init__()
242
+
243
+ self.bn = bn
244
+
245
+ self.groups=1
246
+
247
+ self.conv1 = nn.Conv2d(
248
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
249
+ )
250
+
251
+ self.conv2 = nn.Conv2d(
252
+ features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
253
+ )
254
+
255
+ if self.bn==True:
256
+ self.bn1 = nn.BatchNorm2d(features)
257
+ self.bn2 = nn.BatchNorm2d(features)
258
+
259
+ self.activation = activation
260
+
261
+ self.skip_add = nn.quantized.FloatFunctional()
262
+
263
+ def forward(self, x):
264
+ """Forward pass.
265
+
266
+ Args:
267
+ x (tensor): input
268
+
269
+ Returns:
270
+ tensor: output
271
+ """
272
+
273
+ out = self.activation(x)
274
+ out = self.conv1(out)
275
+ if self.bn==True:
276
+ out = self.bn1(out)
277
+
278
+ out = self.activation(out)
279
+ out = self.conv2(out)
280
+ if self.bn==True:
281
+ out = self.bn2(out)
282
+
283
+ if self.groups > 1:
284
+ out = self.conv_merge(out)
285
+
286
+ return self.skip_add.add(out, x)
287
+
288
+ # return out + x
289
+
290
+
291
+ class FeatureFusionBlock_custom(nn.Module):
292
+ """Feature fusion block.
293
+ """
294
+
295
+ def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
296
+ """Init.
297
+
298
+ Args:
299
+ features (int): number of features
300
+ """
301
+ super(FeatureFusionBlock_custom, self).__init__()
302
+
303
+ self.deconv = deconv
304
+ self.align_corners = align_corners
305
+
306
+ self.groups=1
307
+
308
+ self.expand = expand
309
+ out_features = features
310
+ if self.expand==True:
311
+ out_features = features//2
312
+
313
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
314
+
315
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
316
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
317
+
318
+ self.skip_add = nn.quantized.FloatFunctional()
319
+
320
+ def forward(self, *xs):
321
+ """Forward pass.
322
+
323
+ Returns:
324
+ tensor: output
325
+ """
326
+ output = xs[0]
327
+
328
+ if len(xs) == 2:
329
+ res = self.resConfUnit1(xs[1])
330
+ output = self.skip_add.add(output, res)
331
+ # output += res
332
+
333
+ output = self.resConfUnit2(output)
334
+
335
+ output = nn.functional.interpolate(
336
+ output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
337
+ )
338
+
339
+ output = self.out_conv(output)
340
+
341
+ return output
342
+
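The decoder pieces above can be exercised in isolation. A minimal shape-check sketch with dummy tensors (importing blocks pulls in .vit and therefore needs timm installed):

import torch
import torch.nn as nn
from annotator.midas.midas.blocks import _make_scratch, FeatureFusionBlock_custom

scratch = _make_scratch([256, 512, 768, 768], 256)        # 3x3 convs mapping encoder channels to 256
fusion = FeatureFusionBlock_custom(256, nn.ReLU(False), bn=False, align_corners=True)

feat = torch.randn(1, 768, 12, 12)                         # dummy deepest encoder feature
layer4_rn = scratch.layer4_rn(feat)                        # (1, 256, 12, 12)
out = fusion(layer4_rn)                                    # residual unit + 2x upsample -> (1, 256, 24, 24)
print(out.shape)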
annotator/midas/midas/dpt_depth.py ADDED
@@ -0,0 +1,109 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from .base_model import BaseModel
6
+ from .blocks import (
7
+ FeatureFusionBlock,
8
+ FeatureFusionBlock_custom,
9
+ Interpolate,
10
+ _make_encoder,
11
+ forward_vit,
12
+ )
13
+
14
+
15
+ def _make_fusion_block(features, use_bn):
16
+ return FeatureFusionBlock_custom(
17
+ features,
18
+ nn.ReLU(False),
19
+ deconv=False,
20
+ bn=use_bn,
21
+ expand=False,
22
+ align_corners=True,
23
+ )
24
+
25
+
26
+ class DPT(BaseModel):
27
+ def __init__(
28
+ self,
29
+ head,
30
+ features=256,
31
+ backbone="vitb_rn50_384",
32
+ readout="project",
33
+ channels_last=False,
34
+ use_bn=False,
35
+ ):
36
+
37
+ super(DPT, self).__init__()
38
+
39
+ self.channels_last = channels_last
40
+
41
+ hooks = {
42
+ "vitb_rn50_384": [0, 1, 8, 11],
43
+ "vitb16_384": [2, 5, 8, 11],
44
+ "vitl16_384": [5, 11, 17, 23],
45
+ }
46
+
47
+ # Instantiate backbone and reassemble blocks
48
+ self.pretrained, self.scratch = _make_encoder(
49
+ backbone,
50
+ features,
51
+ False, # Set to true if you want to train from scratch, uses ImageNet weights
52
+ groups=1,
53
+ expand=False,
54
+ exportable=False,
55
+ hooks=hooks[backbone],
56
+ use_readout=readout,
57
+ )
58
+
59
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
60
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
61
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
62
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
63
+
64
+ self.scratch.output_conv = head
65
+
66
+
67
+ def forward(self, x):
68
+ if self.channels_last == True:
69
+ x.contiguous(memory_format=torch.channels_last)
70
+
71
+ layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
72
+
73
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
74
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
75
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
76
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
77
+
78
+ path_4 = self.scratch.refinenet4(layer_4_rn)
79
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
80
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
81
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
82
+
83
+ out = self.scratch.output_conv(path_1)
84
+
85
+ return out
86
+
87
+
88
+ class DPTDepthModel(DPT):
89
+ def __init__(self, path=None, non_negative=True, **kwargs):
90
+ features = kwargs["features"] if "features" in kwargs else 256
91
+
92
+ head = nn.Sequential(
93
+ nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
94
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
95
+ nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
96
+ nn.ReLU(True),
97
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
98
+ nn.ReLU(True) if non_negative else nn.Identity(),
99
+ nn.Identity(),
100
+ )
101
+
102
+ super().__init__(head, **kwargs)
103
+
104
+ if path is not None:
105
+ self.load(path)
106
+
107
+ def forward(self, x):
108
+ return super().forward(x).squeeze(dim=1)
109
+
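A minimal shape check for the model above. Assumptions: a timm version compatible with this vendored MiDaS code; path=None builds the architecture with random weights, so the output is not a meaningful depth map.

import torch
from annotator.midas.midas.dpt_depth import DPTDepthModel

model = DPTDepthModel(path=None, backbone='vitb_rn50_384', non_negative=True).eval()
x = torch.randn(1, 3, 384, 384)          # dummy batch; H and W must suit the ViT-hybrid backbone
with torch.no_grad():
    depth = model(x)
print(depth.shape)                       # torch.Size([1, 384, 384])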
annotator/midas/midas/midas_net.py ADDED
@@ -0,0 +1,76 @@
1
+ """MidashNet: Network for monocular depth estimation trained by mixing several datasets.
2
+ This file contains code that is adapted from
3
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from .base_model import BaseModel
9
+ from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
10
+
11
+
12
+ class MidasNet(BaseModel):
13
+ """Network for monocular depth estimation.
14
+ """
15
+
16
+ def __init__(self, path=None, features=256, non_negative=True):
17
+ """Init.
18
+
19
+ Args:
20
+ path (str, optional): Path to saved model. Defaults to None.
21
+ features (int, optional): Number of features. Defaults to 256.
22
+ backbone (str, optional): Backbone network for encoder. Defaults to resnet50
23
+ """
24
+ print("Loading weights: ", path)
25
+
26
+ super(MidasNet, self).__init__()
27
+
28
+ use_pretrained = False if path is None else True
29
+
30
+ self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)
31
+
32
+ self.scratch.refinenet4 = FeatureFusionBlock(features)
33
+ self.scratch.refinenet3 = FeatureFusionBlock(features)
34
+ self.scratch.refinenet2 = FeatureFusionBlock(features)
35
+ self.scratch.refinenet1 = FeatureFusionBlock(features)
36
+
37
+ self.scratch.output_conv = nn.Sequential(
38
+ nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
39
+ Interpolate(scale_factor=2, mode="bilinear"),
40
+ nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
41
+ nn.ReLU(True),
42
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
43
+ nn.ReLU(True) if non_negative else nn.Identity(),
44
+ )
45
+
46
+ if path:
47
+ self.load(path)
48
+
49
+ def forward(self, x):
50
+ """Forward pass.
51
+
52
+ Args:
53
+ x (tensor): input data (image)
54
+
55
+ Returns:
56
+ tensor: depth
57
+ """
58
+
59
+ layer_1 = self.pretrained.layer1(x)
60
+ layer_2 = self.pretrained.layer2(layer_1)
61
+ layer_3 = self.pretrained.layer3(layer_2)
62
+ layer_4 = self.pretrained.layer4(layer_3)
63
+
64
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
65
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
66
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
67
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
68
+
69
+ path_4 = self.scratch.refinenet4(layer_4_rn)
70
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
71
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
72
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
73
+
74
+ out = self.scratch.output_conv(path_1)
75
+
76
+ return torch.squeeze(out, dim=1)
annotator/midas/midas/midas_net_custom.py ADDED
@@ -0,0 +1,128 @@
1
+ """MidashNet: Network for monocular depth estimation trained by mixing several datasets.
2
+ This file contains code that is adapted from
3
+ https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+
8
+ from .base_model import BaseModel
9
+ from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder
10
+
11
+
12
+ class MidasNet_small(BaseModel):
13
+ """Network for monocular depth estimation.
14
+ """
15
+
16
+ def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
17
+ blocks={'expand': True}):
18
+ """Init.
19
+
20
+ Args:
21
+ path (str, optional): Path to saved model. Defaults to None.
22
+ features (int, optional): Number of features. Defaults to 256.
23
+ backbone (str, optional): Backbone network for encoder. Defaults to resnet50
24
+ """
25
+ print("Loading weights: ", path)
26
+
27
+ super(MidasNet_small, self).__init__()
28
+
29
+ use_pretrained = False if path else True
30
+
31
+ self.channels_last = channels_last
32
+ self.blocks = blocks
33
+ self.backbone = backbone
34
+
35
+ self.groups = 1
36
+
37
+ features1=features
38
+ features2=features
39
+ features3=features
40
+ features4=features
41
+ self.expand = False
42
+ if "expand" in self.blocks and self.blocks['expand'] == True:
43
+ self.expand = True
44
+ features1=features
45
+ features2=features*2
46
+ features3=features*4
47
+ features4=features*8
48
+
49
+ self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)
50
+
51
+ self.scratch.activation = nn.ReLU(False)
52
+
53
+ self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
54
+ self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
55
+ self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
56
+ self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)
57
+
58
+
59
+ self.scratch.output_conv = nn.Sequential(
60
+ nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
61
+ Interpolate(scale_factor=2, mode="bilinear"),
62
+ nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
63
+ self.scratch.activation,
64
+ nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
65
+ nn.ReLU(True) if non_negative else nn.Identity(),
66
+ nn.Identity(),
67
+ )
68
+
69
+ if path:
70
+ self.load(path)
71
+
72
+
73
+ def forward(self, x):
74
+ """Forward pass.
75
+
76
+ Args:
77
+ x (tensor): input data (image)
78
+
79
+ Returns:
80
+ tensor: depth
81
+ """
82
+ if self.channels_last==True:
83
+ print("self.channels_last = ", self.channels_last)
84
+ x.contiguous(memory_format=torch.channels_last)
85
+
86
+
87
+ layer_1 = self.pretrained.layer1(x)
88
+ layer_2 = self.pretrained.layer2(layer_1)
89
+ layer_3 = self.pretrained.layer3(layer_2)
90
+ layer_4 = self.pretrained.layer4(layer_3)
91
+
92
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
93
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
94
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
95
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
96
+
97
+
98
+ path_4 = self.scratch.refinenet4(layer_4_rn)
99
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
100
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
101
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
102
+
103
+ out = self.scratch.output_conv(path_1)
104
+
105
+ return torch.squeeze(out, dim=1)
106
+
107
+
108
+
109
+ def fuse_model(m):
110
+ prev_previous_type = nn.Identity()
111
+ prev_previous_name = ''
112
+ previous_type = nn.Identity()
113
+ previous_name = ''
114
+ for name, module in m.named_modules():
115
+ if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU:
116
+ # print("FUSED ", prev_previous_name, previous_name, name)
117
+ torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True)
118
+ elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
119
+ # print("FUSED ", prev_previous_name, previous_name)
120
+ torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True)
121
+ # elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
122
+ # print("FUSED ", previous_name, name)
123
+ # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)
124
+
125
+ prev_previous_type = previous_type
126
+ prev_previous_name = previous_name
127
+ previous_type = type(module)
128
+ previous_name = name
annotator/midas/midas/transforms.py ADDED
@@ -0,0 +1,234 @@
1
+ import numpy as np
2
+ import cv2
3
+ import math
4
+
5
+
6
+ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
7
+ """Rezise the sample to ensure the given size. Keeps aspect ratio.
8
+
9
+ Args:
10
+ sample (dict): sample
11
+ size (tuple): image size
12
+
13
+ Returns:
14
+ tuple: new size
15
+ """
16
+ shape = list(sample["disparity"].shape)
17
+
18
+ if shape[0] >= size[0] and shape[1] >= size[1]:
19
+ return sample
20
+
21
+ scale = [0, 0]
22
+ scale[0] = size[0] / shape[0]
23
+ scale[1] = size[1] / shape[1]
24
+
25
+ scale = max(scale)
26
+
27
+ shape[0] = math.ceil(scale * shape[0])
28
+ shape[1] = math.ceil(scale * shape[1])
29
+
30
+ # resize
31
+ sample["image"] = cv2.resize(
32
+ sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
33
+ )
34
+
35
+ sample["disparity"] = cv2.resize(
36
+ sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
37
+ )
38
+ sample["mask"] = cv2.resize(
39
+ sample["mask"].astype(np.float32),
40
+ tuple(shape[::-1]),
41
+ interpolation=cv2.INTER_NEAREST,
42
+ )
43
+ sample["mask"] = sample["mask"].astype(bool)
44
+
45
+ return tuple(shape)
46
+
47
+
48
+ class Resize(object):
49
+ """Resize sample to given size (width, height).
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ width,
55
+ height,
56
+ resize_target=True,
57
+ keep_aspect_ratio=False,
58
+ ensure_multiple_of=1,
59
+ resize_method="lower_bound",
60
+ image_interpolation_method=cv2.INTER_AREA,
61
+ ):
62
+ """Init.
63
+
64
+ Args:
65
+ width (int): desired output width
66
+ height (int): desired output height
67
+ resize_target (bool, optional):
68
+ True: Resize the full sample (image, mask, target).
69
+ False: Resize image only.
70
+ Defaults to True.
71
+ keep_aspect_ratio (bool, optional):
72
+ True: Keep the aspect ratio of the input sample.
73
+ Output sample might not have the given width and height, and
74
+ resize behaviour depends on the parameter 'resize_method'.
75
+ Defaults to False.
76
+ ensure_multiple_of (int, optional):
77
+ Output width and height is constrained to be multiple of this parameter.
78
+ Defaults to 1.
79
+ resize_method (str, optional):
80
+ "lower_bound": Output will be at least as large as the given size.
81
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
82
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
83
+ Defaults to "lower_bound".
84
+ """
85
+ self.__width = width
86
+ self.__height = height
87
+
88
+ self.__resize_target = resize_target
89
+ self.__keep_aspect_ratio = keep_aspect_ratio
90
+ self.__multiple_of = ensure_multiple_of
91
+ self.__resize_method = resize_method
92
+ self.__image_interpolation_method = image_interpolation_method
93
+
94
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
95
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
96
+
97
+ if max_val is not None and y > max_val:
98
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
99
+
100
+ if y < min_val:
101
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
102
+
103
+ return y
104
+
105
+ def get_size(self, width, height):
106
+ # determine new height and width
107
+ scale_height = self.__height / height
108
+ scale_width = self.__width / width
109
+
110
+ if self.__keep_aspect_ratio:
111
+ if self.__resize_method == "lower_bound":
112
+ # scale such that output size is lower bound
113
+ if scale_width > scale_height:
114
+ # fit width
115
+ scale_height = scale_width
116
+ else:
117
+ # fit height
118
+ scale_width = scale_height
119
+ elif self.__resize_method == "upper_bound":
120
+ # scale such that output size is upper bound
121
+ if scale_width < scale_height:
122
+ # fit width
123
+ scale_height = scale_width
124
+ else:
125
+ # fit height
126
+ scale_width = scale_height
127
+ elif self.__resize_method == "minimal":
128
+ # scale as little as possible
129
+ if abs(1 - scale_width) < abs(1 - scale_height):
130
+ # fit width
131
+ scale_height = scale_width
132
+ else:
133
+ # fit height
134
+ scale_width = scale_height
135
+ else:
136
+ raise ValueError(
137
+ f"resize_method {self.__resize_method} not implemented"
138
+ )
139
+
140
+ if self.__resize_method == "lower_bound":
141
+ new_height = self.constrain_to_multiple_of(
142
+ scale_height * height, min_val=self.__height
143
+ )
144
+ new_width = self.constrain_to_multiple_of(
145
+ scale_width * width, min_val=self.__width
146
+ )
147
+ elif self.__resize_method == "upper_bound":
148
+ new_height = self.constrain_to_multiple_of(
149
+ scale_height * height, max_val=self.__height
150
+ )
151
+ new_width = self.constrain_to_multiple_of(
152
+ scale_width * width, max_val=self.__width
153
+ )
154
+ elif self.__resize_method == "minimal":
155
+ new_height = self.constrain_to_multiple_of(scale_height * height)
156
+ new_width = self.constrain_to_multiple_of(scale_width * width)
157
+ else:
158
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
159
+
160
+ return (new_width, new_height)
161
+
162
+ def __call__(self, sample):
163
+ width, height = self.get_size(
164
+ sample["image"].shape[1], sample["image"].shape[0]
165
+ )
166
+
167
+ # resize sample
168
+ sample["image"] = cv2.resize(
169
+ sample["image"],
170
+ (width, height),
171
+ interpolation=self.__image_interpolation_method,
172
+ )
173
+
174
+ if self.__resize_target:
175
+ if "disparity" in sample:
176
+ sample["disparity"] = cv2.resize(
177
+ sample["disparity"],
178
+ (width, height),
179
+ interpolation=cv2.INTER_NEAREST,
180
+ )
181
+
182
+ if "depth" in sample:
183
+ sample["depth"] = cv2.resize(
184
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
185
+ )
186
+
187
+ sample["mask"] = cv2.resize(
188
+ sample["mask"].astype(np.float32),
189
+ (width, height),
190
+ interpolation=cv2.INTER_NEAREST,
191
+ )
192
+ sample["mask"] = sample["mask"].astype(bool)
193
+
194
+ return sample
195
+
196
+
197
+ class NormalizeImage(object):
198
+ """Normlize image by given mean and std.
199
+ """
200
+
201
+ def __init__(self, mean, std):
202
+ self.__mean = mean
203
+ self.__std = std
204
+
205
+ def __call__(self, sample):
206
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
207
+
208
+ return sample
209
+
210
+
211
+ class PrepareForNet(object):
212
+ """Prepare sample for usage as network input.
213
+ """
214
+
215
+ def __init__(self):
216
+ pass
217
+
218
+ def __call__(self, sample):
219
+ image = np.transpose(sample["image"], (2, 0, 1))
220
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
221
+
222
+ if "mask" in sample:
223
+ sample["mask"] = sample["mask"].astype(np.float32)
224
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
225
+
226
+ if "disparity" in sample:
227
+ disparity = sample["disparity"].astype(np.float32)
228
+ sample["disparity"] = np.ascontiguousarray(disparity)
229
+
230
+ if "depth" in sample:
231
+ depth = sample["depth"].astype(np.float32)
232
+ sample["depth"] = np.ascontiguousarray(depth)
233
+
234
+ return sample
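A minimal sketch of the preprocessing pipeline these transforms are composed into elsewhere in this repo, using dummy data; the sizes and normalization below mirror the dpt_hybrid settings in annotator/midas/api.py:

import numpy as np
from torchvision.transforms import Compose
from annotator.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet

transform = Compose([
    Resize(384, 384, resize_target=None, keep_aspect_ratio=True,
           ensure_multiple_of=32, resize_method='minimal'),
    NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    PrepareForNet(),
])

sample = {'image': np.random.rand(480, 640, 3).astype(np.float32)}  # H x W x 3 in [0, 1]
out = transform(sample)
print(out['image'].shape)   # (3, 384, 512): CHW float32, sides constrained to multiples of 32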
annotator/midas/midas/vit.py ADDED
@@ -0,0 +1,491 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import timm
4
+ import types
5
+ import math
6
+ import torch.nn.functional as F
7
+
8
+
9
+ class Slice(nn.Module):
10
+ def __init__(self, start_index=1):
11
+ super(Slice, self).__init__()
12
+ self.start_index = start_index
13
+
14
+ def forward(self, x):
15
+ return x[:, self.start_index :]
16
+
17
+
18
+ class AddReadout(nn.Module):
19
+ def __init__(self, start_index=1):
20
+ super(AddReadout, self).__init__()
21
+ self.start_index = start_index
22
+
23
+ def forward(self, x):
24
+ if self.start_index == 2:
25
+ readout = (x[:, 0] + x[:, 1]) / 2
26
+ else:
27
+ readout = x[:, 0]
28
+ return x[:, self.start_index :] + readout.unsqueeze(1)
29
+
30
+
31
+ class ProjectReadout(nn.Module):
32
+ def __init__(self, in_features, start_index=1):
33
+ super(ProjectReadout, self).__init__()
34
+ self.start_index = start_index
35
+
36
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
37
+
38
+ def forward(self, x):
39
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
40
+ features = torch.cat((x[:, self.start_index :], readout), -1)
41
+
42
+ return self.project(features)
43
+
44
+
45
+ class Transpose(nn.Module):
46
+ def __init__(self, dim0, dim1):
47
+ super(Transpose, self).__init__()
48
+ self.dim0 = dim0
49
+ self.dim1 = dim1
50
+
51
+ def forward(self, x):
52
+ x = x.transpose(self.dim0, self.dim1)
53
+ return x
54
+
55
+
56
+ def forward_vit(pretrained, x):
57
+ b, c, h, w = x.shape
58
+
59
+ glob = pretrained.model.forward_flex(x)
60
+
61
+ layer_1 = pretrained.activations["1"]
62
+ layer_2 = pretrained.activations["2"]
63
+ layer_3 = pretrained.activations["3"]
64
+ layer_4 = pretrained.activations["4"]
65
+
66
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
67
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
68
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
69
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
70
+
71
+ unflatten = nn.Sequential(
72
+ nn.Unflatten(
73
+ 2,
74
+ torch.Size(
75
+ [
76
+ h // pretrained.model.patch_size[1],
77
+ w // pretrained.model.patch_size[0],
78
+ ]
79
+ ),
80
+ )
81
+ )
82
+
83
+ if layer_1.ndim == 3:
84
+ layer_1 = unflatten(layer_1)
85
+ if layer_2.ndim == 3:
86
+ layer_2 = unflatten(layer_2)
87
+ if layer_3.ndim == 3:
88
+ layer_3 = unflatten(layer_3)
89
+ if layer_4.ndim == 3:
90
+ layer_4 = unflatten(layer_4)
91
+
92
+ layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
93
+ layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
94
+ layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
95
+ layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
96
+
97
+ return layer_1, layer_2, layer_3, layer_4
98
+
99
+
100
+ def _resize_pos_embed(self, posemb, gs_h, gs_w):
101
+ posemb_tok, posemb_grid = (
102
+ posemb[:, : self.start_index],
103
+ posemb[0, self.start_index :],
104
+ )
105
+
106
+ gs_old = int(math.sqrt(len(posemb_grid)))
107
+
108
+ posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
109
+ posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
110
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
111
+
112
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
113
+
114
+ return posemb
115
+
116
+
117
+ def forward_flex(self, x):
118
+ b, c, h, w = x.shape
119
+
120
+ pos_embed = self._resize_pos_embed(
121
+ self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
122
+ )
123
+
124
+ B = x.shape[0]
125
+
126
+ if hasattr(self.patch_embed, "backbone"):
127
+ x = self.patch_embed.backbone(x)
128
+ if isinstance(x, (list, tuple)):
129
+ x = x[-1] # last feature if backbone outputs list/tuple of features
130
+
131
+ x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
132
+
133
+ if getattr(self, "dist_token", None) is not None:
134
+ cls_tokens = self.cls_token.expand(
135
+ B, -1, -1
136
+ ) # stole cls_tokens impl from Phil Wang, thanks
137
+ dist_token = self.dist_token.expand(B, -1, -1)
138
+ x = torch.cat((cls_tokens, dist_token, x), dim=1)
139
+ else:
140
+ cls_tokens = self.cls_token.expand(
141
+ B, -1, -1
142
+ ) # stole cls_tokens impl from Phil Wang, thanks
143
+ x = torch.cat((cls_tokens, x), dim=1)
144
+
145
+ x = x + pos_embed
146
+ x = self.pos_drop(x)
147
+
148
+ for blk in self.blocks:
149
+ x = blk(x)
150
+
151
+ x = self.norm(x)
152
+
153
+ return x
154
+
155
+
156
+ activations = {}
157
+
158
+
159
+ def get_activation(name):
160
+ def hook(model, input, output):
161
+ activations[name] = output
162
+
163
+ return hook
164
+
165
+
166
+ def get_readout_oper(vit_features, features, use_readout, start_index=1):
167
+ if use_readout == "ignore":
168
+ readout_oper = [Slice(start_index)] * len(features)
169
+ elif use_readout == "add":
170
+ readout_oper = [AddReadout(start_index)] * len(features)
171
+ elif use_readout == "project":
172
+ readout_oper = [
173
+ ProjectReadout(vit_features, start_index) for out_feat in features
174
+ ]
175
+ else:
176
+ assert (
177
+ False
178
+ ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
179
+
180
+ return readout_oper
181
+
182
+
183
+ def _make_vit_b16_backbone(
184
+ model,
185
+ features=[96, 192, 384, 768],
186
+ size=[384, 384],
187
+ hooks=[2, 5, 8, 11],
188
+ vit_features=768,
189
+ use_readout="ignore",
190
+ start_index=1,
191
+ ):
192
+ pretrained = nn.Module()
193
+
194
+ pretrained.model = model
195
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
196
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
197
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
198
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
199
+
200
+ pretrained.activations = activations
201
+
202
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
203
+
204
+ # 32, 48, 136, 384
205
+ pretrained.act_postprocess1 = nn.Sequential(
206
+ readout_oper[0],
207
+ Transpose(1, 2),
208
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
209
+ nn.Conv2d(
210
+ in_channels=vit_features,
211
+ out_channels=features[0],
212
+ kernel_size=1,
213
+ stride=1,
214
+ padding=0,
215
+ ),
216
+ nn.ConvTranspose2d(
217
+ in_channels=features[0],
218
+ out_channels=features[0],
219
+ kernel_size=4,
220
+ stride=4,
221
+ padding=0,
222
+ bias=True,
223
+ dilation=1,
224
+ groups=1,
225
+ ),
226
+ )
227
+
228
+ pretrained.act_postprocess2 = nn.Sequential(
229
+ readout_oper[1],
230
+ Transpose(1, 2),
231
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
232
+ nn.Conv2d(
233
+ in_channels=vit_features,
234
+ out_channels=features[1],
235
+ kernel_size=1,
236
+ stride=1,
237
+ padding=0,
238
+ ),
239
+ nn.ConvTranspose2d(
240
+ in_channels=features[1],
241
+ out_channels=features[1],
242
+ kernel_size=2,
243
+ stride=2,
244
+ padding=0,
245
+ bias=True,
246
+ dilation=1,
247
+ groups=1,
248
+ ),
249
+ )
250
+
251
+ pretrained.act_postprocess3 = nn.Sequential(
252
+ readout_oper[2],
253
+ Transpose(1, 2),
254
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
255
+ nn.Conv2d(
256
+ in_channels=vit_features,
257
+ out_channels=features[2],
258
+ kernel_size=1,
259
+ stride=1,
260
+ padding=0,
261
+ ),
262
+ )
263
+
264
+ pretrained.act_postprocess4 = nn.Sequential(
265
+ readout_oper[3],
266
+ Transpose(1, 2),
267
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
268
+ nn.Conv2d(
269
+ in_channels=vit_features,
270
+ out_channels=features[3],
271
+ kernel_size=1,
272
+ stride=1,
273
+ padding=0,
274
+ ),
275
+ nn.Conv2d(
276
+ in_channels=features[3],
277
+ out_channels=features[3],
278
+ kernel_size=3,
279
+ stride=2,
280
+ padding=1,
281
+ ),
282
+ )
283
+
284
+ pretrained.model.start_index = start_index
285
+ pretrained.model.patch_size = [16, 16]
286
+
287
+ # We inject this function into the VisionTransformer instances so that
288
+ # we can use it with interpolated position embeddings without modifying the library source.
289
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
290
+ pretrained.model._resize_pos_embed = types.MethodType(
291
+ _resize_pos_embed, pretrained.model
292
+ )
293
+
294
+ return pretrained
295
+
296
+
297
+ def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
298
+ model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
299
+
300
+ hooks = [5, 11, 17, 23] if hooks == None else hooks
301
+ return _make_vit_b16_backbone(
302
+ model,
303
+ features=[256, 512, 1024, 1024],
304
+ hooks=hooks,
305
+ vit_features=1024,
306
+ use_readout=use_readout,
307
+ )
308
+
309
+
310
+ def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
311
+ model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
312
+
313
+ hooks = [2, 5, 8, 11] if hooks == None else hooks
314
+ return _make_vit_b16_backbone(
315
+ model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
316
+ )
317
+
318
+
319
+ def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
320
+ model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
321
+
322
+ hooks = [2, 5, 8, 11] if hooks == None else hooks
323
+ return _make_vit_b16_backbone(
324
+ model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
325
+ )
326
+
327
+
328
+ def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
329
+ model = timm.create_model(
330
+ "vit_deit_base_distilled_patch16_384", pretrained=pretrained
331
+ )
332
+
333
+ hooks = [2, 5, 8, 11] if hooks == None else hooks
334
+ return _make_vit_b16_backbone(
335
+ model,
336
+ features=[96, 192, 384, 768],
337
+ hooks=hooks,
338
+ use_readout=use_readout,
339
+ start_index=2,
340
+ )
341
+
342
+
343
+ def _make_vit_b_rn50_backbone(
344
+ model,
345
+ features=[256, 512, 768, 768],
346
+ size=[384, 384],
347
+ hooks=[0, 1, 8, 11],
348
+ vit_features=768,
349
+ use_vit_only=False,
350
+ use_readout="ignore",
351
+ start_index=1,
352
+ ):
353
+ pretrained = nn.Module()
354
+
355
+    pretrained.model = model
+
+    if use_vit_only:
+        pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
+        pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
+    else:
+        pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
+            get_activation("1")
+        )
+        pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
+            get_activation("2")
+        )
+
+    pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
+    pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
+
+    pretrained.activations = activations
+
+    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
+
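+    # Pure-ViT path: project the 1/16-resolution token maps to the target widths and
+    # upsample them (x4 and x2) for the first two stages. Hybrid path: the ResNet stage
+    # outputs are already spatial feature maps, so they pass through unchanged.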
+    if use_vit_only:
+        pretrained.act_postprocess1 = nn.Sequential(
+            readout_oper[0],
+            Transpose(1, 2),
+            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+            nn.Conv2d(
+                in_channels=vit_features,
+                out_channels=features[0],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.ConvTranspose2d(
+                in_channels=features[0],
+                out_channels=features[0],
+                kernel_size=4,
+                stride=4,
+                padding=0,
+                bias=True,
+                dilation=1,
+                groups=1,
+            ),
+        )
+
+        pretrained.act_postprocess2 = nn.Sequential(
+            readout_oper[1],
+            Transpose(1, 2),
+            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+            nn.Conv2d(
+                in_channels=vit_features,
+                out_channels=features[1],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.ConvTranspose2d(
+                in_channels=features[1],
+                out_channels=features[1],
+                kernel_size=2,
+                stride=2,
+                padding=0,
+                bias=True,
+                dilation=1,
+                groups=1,
+            ),
+        )
+    else:
+        pretrained.act_postprocess1 = nn.Sequential(
+            nn.Identity(), nn.Identity(), nn.Identity()
+        )
+        pretrained.act_postprocess2 = nn.Sequential(
+            nn.Identity(), nn.Identity(), nn.Identity()
+        )
+
+    pretrained.act_postprocess3 = nn.Sequential(
+        readout_oper[2],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[2],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+    )
+
+    pretrained.act_postprocess4 = nn.Sequential(
+        readout_oper[3],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[3],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.Conv2d(
+            in_channels=features[3],
+            out_channels=features[3],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        ),
+    )
+
+    pretrained.model.start_index = start_index
+    pretrained.model.patch_size = [16, 16]
+
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
+    pretrained.model._resize_pos_embed = types.MethodType(
+        _resize_pos_embed, pretrained.model
+    )
+
+    return pretrained
+
+
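+# Hybrid ResNet-50 + ViT-B/16 checkpoint at 384x384. Illustrative call only (not present
+# in this file), roughly:
+#
+#     backbone = _make_pretrained_vitb_rn50_384(True, use_readout="project", use_vit_only=False)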
+def _make_pretrained_vitb_rn50_384(
+    pretrained, use_readout="ignore", hooks=None, use_vit_only=False
+):
+    model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
+
+    hooks = [0, 1, 8, 11] if hooks is None else hooks
+    return _make_vit_b_rn50_backbone(
+        model,
+        features=[256, 512, 768, 768],
+        size=[384, 384],
+        hooks=hooks,
+        use_vit_only=use_vit_only,
+        use_readout=use_readout,
+    )