# Scrape artifact (not source): file size 6,391 bytes, revision b793f0c.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import numpy as np
import torch
import torch.nn as nn
from .models.data_processor import DataProcessor
from .models.mean_vfe import MeanVFE
from .models.spconv_backbone_voxelnext import VoxelResBackBone8xVoxelNeXt
from .models.voxelnext_head import VoxelNeXtHead

from .utils.image_projection import _proj_voxel_image
from segment_anything import SamPredictor, sam_model_registry

class VoxelNeXt(nn.Module):
    """Inference-only VoxelNeXt detector assembled from a config object.

    Wires together the point-cloud data processor, mean voxel feature
    encoder, sparse 3D backbone and detection head. All settings are read
    from ``model_cfg`` (attribute access for required keys, ``.get`` with a
    default for optional ones).
    """

    def __init__(self, model_cfg):
        super().__init__()

        # Required config keys (raise if absent).
        pc_range_np = np.array(model_cfg.POINT_CLOUD_RANGE, dtype=np.float32)

        # training=False: this processor is only used for inference.
        self.data_processor = DataProcessor(
            model_cfg.DATA_PROCESSOR,
            point_cloud_range=pc_range_np,
            training=False,
            num_point_features=len(model_cfg.USED_FEATURE_LIST),
        )

        # Optional config keys with nuScenes-style defaults.
        in_channels = model_cfg.get('INPUT_CHANNELS', 5)
        voxel_grid = np.array(model_cfg.get('GRID_SIZE', [1440, 1440, 40]))

        names = model_cfg.get('CLASS_NAMES')
        head_kernel = model_cfg.get('KERNEL_SIZE_HEAD', 1)
        self.point_cloud_range = torch.Tensor(
            model_cfg.get('POINT_CLOUD_RANGE', [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]))
        self.voxel_size = torch.Tensor(model_cfg.get('VOXEL_SIZE', [0.075, 0.075, 0.2]))
        names_each_head = model_cfg.get('CLASS_NAMES_EACH_HEAD')
        separate_head_cfg = model_cfg.get('SEPARATE_HEAD_CFG')
        post_processing = model_cfg.get('POST_PROCESSING')

        self.voxelization = MeanVFE()
        self.backbone_3d = VoxelResBackBone8xVoxelNeXt(in_channels, voxel_grid)
        self.dense_head = VoxelNeXtHead(
            names, self.point_cloud_range, self.voxel_size, head_kernel,
            names_each_head, separate_head_cfg, post_processing,
        )


class Model(nn.Module):
    """Fuses a SAM 2D segmenter with a VoxelNeXt 3D detector.

    Given an image, its point cloud and a 2D prompt point, produces a 2D
    segmentation mask (from SAM) and the best-matching 3D detection box
    (from VoxelNeXt), associated by projecting predicted voxel centers into
    the image plane.
    """

    def __init__(self, model_cfg, device="cuda"):
        super().__init__()

        sam_type = model_cfg.get('SAM_TYPE', "vit_b")
        sam_checkpoint = model_cfg.get('SAM_CHECKPOINT', "/data/sam_vit_b_01ec64.pth")

        sam = sam_model_registry[sam_type](checkpoint=sam_checkpoint).to(device=device)
        self.sam_predictor = SamPredictor(sam)

        voxelnext_checkpoint = model_cfg.get('VOXELNEXT_CHECKPOINT', "/data/voxelnext_nuscenes_kernel1.pth")
        # Fix: map_location so the checkpoint also loads on CPU-only hosts or
        # when it was saved from a different CUDA device than `device`.
        model_dict = torch.load(voxelnext_checkpoint, map_location=device)
        self.voxelnext = VoxelNeXt(model_cfg).to(device=device)
        self.voxelnext.load_state_dict(model_dict)
        # Backbone outputs cached per image_id. Grows without bound: callers
        # should reuse an id only for the same point cloud.
        self.point_features = {}
        self.device = device

    def image_embedding(self, image):
        """Precompute the SAM image embedding (required before predict())."""
        self.sam_predictor.set_image(image)

    def point_embedding(self, data_dict, image_id):
        """Run the VoxelNeXt pipeline on a raw point dict.

        Args:
            data_dict: dataset sample dict consumed by the data processor /
                voxelization (must contain the raw points).
            image_id: cache key; repeated calls with the same id reuse the
                cached 3D backbone output.

        Returns:
            Tuple of (pred_dicts, voxel_coords): detection-head predictions,
            and for each prediction its source voxel coordinate rescaled from
            feature-map resolution back to the input voxel grid.
        """
        if image_id in self.point_features:
            # Cache hit: skip preprocessing and the 3D backbone entirely.
            # (The original code re-voxelized here and then discarded the
            # result; hoisting the check avoids that redundant work.)
            data_dict = self.point_features[image_id]
        else:
            data_dict = self.voxelnext.data_processor.forward(
                data_dict=data_dict
            )
            data_dict['voxels'] = torch.Tensor(data_dict['voxels']).to(self.device)
            data_dict['voxel_num_points'] = torch.Tensor(data_dict['voxel_num_points']).to(self.device)
            data_dict['voxel_coords'] = torch.Tensor(data_dict['voxel_coords']).to(self.device)

            data_dict = self.voxelnext.voxelization(data_dict)
            n_voxels = data_dict['voxel_coords'].shape[0]
            device = data_dict['voxel_coords'].device
            dtype = data_dict['voxel_coords'].dtype
            # Prepend an all-zero batch-index column (single sample, batch_size=1).
            data_dict['voxel_coords'] = torch.cat(
                [torch.zeros((n_voxels, 1), device=device, dtype=dtype),
                 data_dict['voxel_coords']], dim=1)
            data_dict['batch_size'] = 1

            data_dict = self.voxelnext.backbone_3d(data_dict)
            self.point_features[image_id] = data_dict

        pred_dicts = self.voxelnext.dense_head(data_dict)

        # Map each predicted box back to the voxel it came from, rescaled to
        # input-voxel resolution via the head's feature_map_stride.
        voxel_coords = data_dict['out_voxels'][pred_dicts[0]['voxel_ids'].squeeze(-1)] * self.voxelnext.dense_head.feature_map_stride

        return pred_dicts, voxel_coords

    def generate_3D_box(self, lidar2img_rt, mask, voxel_coords, pred_dicts, quality_score=0.1):
        """Select the 3D box that best matches a 2D segmentation mask.

        Projects each prediction's voxel into the image; boxes whose voxel
        lands inside `mask` (and whose score exceeds `quality_score`) are
        candidates, of which the highest-scoring one is returned. If no voxel
        falls inside the mask, falls back to the box whose projection is
        closest to the mask centroid.

        Args:
            lidar2img_rt: lidar-to-image projection matrix.
            mask: 2D boolean mask (H, W), indexed as mask[y, x].
            voxel_coords: per-prediction voxel coordinates (from point_embedding).
            pred_dicts: detection-head output (uses pred_scores / pred_boxes).
            quality_score: minimum score for a box to be considered.

        Returns:
            The selected box tensor, or None if no box clears quality_score.
        """
        device = voxel_coords.device
        points_image, depth = _proj_voxel_image(
            voxel_coords, lidar2img_rt,
            self.voxelnext.voxel_size.to(device),
            self.voxelnext.point_cloud_range.to(device))
        points = points_image.permute(1, 0).int().cpu().numpy()
        selected_voxels = torch.zeros_like(depth).squeeze(0)

        # Mark predictions whose projected voxel falls inside the 2D mask.
        for i in range(points.shape[0]):
            point = points[i]
            if point[0] < 0 or point[1] < 0 or point[0] >= mask.shape[1] or point[1] >= mask.shape[0]:
                continue  # projection lands outside the image
            if mask[point[1], point[0]]:
                selected_voxels[i] = 1

        mask_extra = (pred_dicts[0]['pred_scores'] > quality_score)
        if mask_extra.sum() == 0:
            print("no high quality 3D box related.")
            return None

        selected_voxels *= mask_extra
        if selected_voxels.sum() > 0:
            # Best-scoring box among those projecting into the mask.
            selected_box_id = pred_dicts[0]['pred_scores'][selected_voxels.bool()].argmax()
            selected_box = pred_dicts[0]['pred_boxes'][selected_voxels.bool()][selected_box_id]
        else:
            # Fallback: box whose projection is nearest the mask centroid.
            # NOTE(review): torch.meshgrid without indexing= warns on newer
            # torch; default 'ij' matches the (row, col) use below.
            grid_x, grid_y = torch.meshgrid(torch.arange(mask.shape[0]), torch.arange(mask.shape[1]))
            mask_x, mask_y = grid_x[mask], grid_y[mask]
            mask_center = torch.Tensor([mask_y.float().mean(), mask_x.float().mean()]).to(
                pred_dicts[0]['pred_boxes'].device).unsqueeze(1)

            dist = ((points_image - mask_center) ** 2).sum(0)
            selected_id = dist[mask_extra].argmin()
            selected_box = pred_dicts[0]['pred_boxes'][mask_extra][selected_id]
        return selected_box

    def forward(self, image, point_dict, prompt_point, lidar2img_rt, image_id, quality_score=0.1):
        """End-to-end: prompt point -> (2D mask, matching 3D box or None)."""
        self.image_embedding(image)
        pred_dicts, voxel_coords = self.point_embedding(point_dict, image_id)

        # NOTE(review): predict() defaults to multimask output; masks[0] is
        # taken without consulting `scores` — confirm this is intended.
        masks, scores, _ = self.sam_predictor.predict(point_coords=prompt_point, point_labels=np.array([1]))
        mask = masks[0]

        box3d = self.generate_3D_box(lidar2img_rt, mask, voxel_coords, pred_dicts, quality_score=quality_score)
        return mask, box3d


if __name__ == '__main__':
    # Demo driver: load configs, build the dataset and model, and embed the
    # points of the first sample.
    cfg_dataset = 'nuscenes_dataset.yaml'
    cfg_model = 'config.yaml'

    # NOTE(review): cfg_from_yaml_file, cfg and NuScenesDataset are not
    # imported anywhere in this file — they must be imported from the
    # surrounding project's config/dataset modules for this demo to run.
    dataset_cfg = cfg_from_yaml_file(cfg_dataset, cfg)
    model_cfg = cfg_from_yaml_file(cfg_model, cfg)

    nuscenes_dataset = NuScenesDataset(dataset_cfg)
    model = Model(model_cfg)

    index = 0
    data_dict = nuscenes_dataset._get_points(index)
    # Fix: point_embedding requires an image_id (cache key) — the original
    # call omitted it and raised TypeError. The sample index serves as the id.
    model.point_embedding(data_dict, image_id=index)