diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..15375bc76cd26e5b75ac16d9664b74f9bb52491d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +GenAD-main/assets/demo.gif filter=lfs diff=lfs merge=lfs -text diff --git a/GenAD-main/LICENSE b/GenAD-main/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/GenAD-main/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/GenAD-main/README.md b/GenAD-main/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5debb2e25e51bb8d501f4b6830a0e7264dca5d5f
--- /dev/null
+++ b/GenAD-main/README.md
@@ -0,0 +1,127 @@
+# GenAD: Generative End-to-End Autonomous Driving
+
+### [Paper](https://arxiv.org/pdf/2402.11502)
+
+> GenAD: Generative End-to-End Autonomous Driving
+
+> [Wenzhao Zheng](https://wzzheng.net/)\*, Ruiqi Song\*, [Xianda Guo](https://scholar.google.com/citations?user=jPvOqgYAAAAJ)\* $\dagger$, Chenming Zhang, [Long Chen](https://scholar.google.com/citations?user=jzvXnkcAAAAJ)$\dagger$
+
+\* Equal contributions $\dagger$ Corresponding authors
+
+**GenAD casts autonomous driving as a generative modeling problem.**
+
+## News
+
+- **[2024/5/2]** Training and evaluation code release.
+- **[2024/2/18]** Paper released on [arXiv](https://arxiv.org/pdf/2402.11502).
+
+## Demo
+
+![demo](./assets/demo.gif)
+
+## Overview
+
+![comparison](./assets/comparison.png)
+
+**Comparisons of the proposed generative end-to-end autonomous driving framework with the conventional pipeline.** Most existing methods follow a serial design of perception, prediction, and planning. They usually ignore the high-level interactions between the ego car and other agents and the structural prior of realistic trajectories. We model autonomous driving as a future generation problem and conduct motion prediction and ego planning simultaneously in a structural latent trajectory space.
+
+## Results
+
+![results](./assets/results.png)
+
+## Code
+### Dataset
+
+Download the nuScenes V1.0 full dataset and the CAN bus expansion data [HERE](https://www.nuscenes.org/download), then prepare the nuScenes data as follows.
+
+**Download CAN bus expansion**
+
+```
+# download 'can_bus.zip'
+unzip can_bus.zip
+# move can_bus to data dir
+```
+
+**Prepare nuScenes data**
+
+*We generate custom annotation files, which are different from mmdet3d's.*
+
+Generate the train and val annotation files:
+
+```
+python tools/data_converter/genad_nuscenes_converter.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag genad_nuscenes --version v1.0 --canbus ./data
+```
+
+Running the above command generates `genad_nuscenes_infos_temporal_{train,val}.pkl`.
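+
+As a quick sanity check after conversion, you can inspect the generated info files. The snippet below is a minimal sketch; it assumes the default `./data/nuscenes` output directory and an mmdet3d-style pickle layout (a dict with `infos` and `metadata` keys), which may differ for your version of the converter.
+
+```python
+# Minimal sanity check for the generated annotation files (assumed layout).
+import pickle
+
+for split in ("train", "val"):
+    path = f"./data/nuscenes/genad_nuscenes_infos_temporal_{split}.pkl"
+    with open(path, "rb") as f:
+        data = pickle.load(f)
+    # mmdet3d-style converters usually store a dict with 'infos' and 'metadata';
+    # fall back to treating the loaded object itself as the sample list otherwise.
+    infos = data.get("infos", data) if isinstance(data, dict) else data
+    print(f"{split}: {len(infos)} samples")
+```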
+
+**Folder structure**
+
+```
+GenAD
+├── projects/
+├── tools/
+├── configs/
+├── ckpts/
+│   ├── resnet50-19c8e357.pth
+├── data/
+│   ├── can_bus/
+│   ├── nuscenes/
+│   │   ├── maps/
+│   │   ├── samples/
+│   │   ├── sweeps/
+│   │   ├── v1.0-test/
+│   │   ├── v1.0-trainval/
+│   │   ├── genad_nuscenes_infos_train.pkl
+│   │   ├── genad_nuscenes_infos_val.pkl
+```
+
+### Installation
+
+Detailed package versions can be found in [requirements.txt](requirements.txt).
+
+- [Installation](docs/install.md)
+
+### Getting Started
+
+**Datasets**
+
+https://drive.google.com/drive/folders/1gy7Ux-bk0sge77CsGgeEzPF9ImVn-WgJ?usp=drive_link
+
+**Checkpoints**
+
+https://drive.google.com/drive/folders/1nlAWJlvSHwqnTjEwlfiE99YJVRFKmqF9?usp=drive_link
+
+Train GenAD with 8 GPUs:
+
+```shell
+cd /path/to/GenAD
+conda activate genad
+python -m torch.distributed.run --nproc_per_node=8 --master_port=2333 tools/train.py projects/configs/GenAD/GenAD_config.py --launcher pytorch --deterministic --work-dir /path/to/save/outputs
+```
+
+Evaluate GenAD with 1 GPU:
+
+```shell
+cd /path/to/GenAD
+conda activate genad
+CUDA_VISIBLE_DEVICES=0 python tools/test.py projects/configs/VAD/GenAD_config.py /path/to/ckpt.pth --launcher none --eval bbox --tmpdir outputs
+```
+
+## Related Projects
+
+Our code is based on [VAD](https://github.com/hustvl/VAD) and [UniAD](https://github.com/OpenDriveLab/UniAD).
+
+## Citation
+
+If you find this project helpful, please consider citing the following paper:
+```
+@article{zheng2024genad,
+  title={GenAD: Generative End-to-End Autonomous Driving},
+  author={Zheng, Wenzhao and Song, Ruiqi and Guo, Xianda and Zhang, Chenming and Chen, Long},
+  journal={arXiv preprint arXiv:2402.11502},
+  year={2024}
+}
+```
diff --git a/GenAD-main/assets/comparison.png b/GenAD-main/assets/comparison.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ca4e903f20f851936a60f2f6698c917707ef08f
Binary files /dev/null and b/GenAD-main/assets/comparison.png differ
diff --git a/GenAD-main/assets/demo.gif b/GenAD-main/assets/demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..77b3b69525fb24bdd4bfe9709cef6029bec257e1
--- /dev/null
+++ b/GenAD-main/assets/demo.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bac1568c537632d144945191bb77987e00fd822faf2348c415e732b13a041ce
+size 67527613
diff --git a/GenAD-main/assets/framework.png b/GenAD-main/assets/framework.png
new file mode 100644
index 0000000000000000000000000000000000000000..cb14aca6353ba7051ef36d4fb2e28611b695d338
Binary files /dev/null and b/GenAD-main/assets/framework.png differ
diff --git a/GenAD-main/assets/results.png b/GenAD-main/assets/results.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9a94fb50a6c4d52d778c5401de041e7e59abd6e
Binary files /dev/null and b/GenAD-main/assets/results.png differ
diff --git a/GenAD-main/docs/install.md b/GenAD-main/docs/install.md
new file mode 100644
index 0000000000000000000000000000000000000000..b8270b5ec261c8c9cda9dab178d5dd4a2a35ab93
--- /dev/null
+++ b/GenAD-main/docs/install.md
@@ -0,0 +1,66 @@
+# Installation
+
+Detailed package versions can be found in [requirements.txt](../requirements.txt).
+
+**a. Create a conda virtual environment and activate it.**
+```shell
+conda create -n genad python=3.8 -y
+conda activate genad
+```
+
+**b. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/).**
+```shell
+pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+# Recommended torch>=1.9
+```
+
+**c. Install gcc>=5 in the conda env (optional).**
+```shell
+conda install -c omgarcia gcc-5  # gcc-6.2
+```
+
+**d. Install mmcv-full.**
+```shell
+pip install mmcv-full==1.4.0
+# pip install mmcv-full==1.4.0 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
+```
+
+**e. Install mmdet and mmseg.**
+```shell
+pip install mmdet==2.14.0
+pip install mmsegmentation==0.14.1
+```
+
+**f. Install timm.**
+```shell
+pip install timm
+```
+
+**g. Install mmdet3d.**
+```shell
+conda activate genad
+git clone https://github.com/open-mmlab/mmdetection3d.git
+cd /path/to/mmdetection3d
+git checkout -f v0.17.1
+python setup.py develop
+```
+
+**h. Install nuscenes-devkit.**
+```shell
+pip install nuscenes-devkit==1.1.9
+```
+
+**i. Clone GenAD.**
+```shell
+git clone https://github.com/wzzheng/GenAD.git
+```
+
+**j. Prepare pretrained models.**
+```shell
+cd /path/to/GenAD
+mkdir ckpts
+cd ckpts
+wget https://download.pytorch.org/models/resnet50-19c8e357.pth
+```
diff --git a/GenAD-main/docs/visualization.md b/GenAD-main/docs/visualization.md
new file mode 100644
index 0000000000000000000000000000000000000000..4fb56f6f320d7cac73faa8d961d22f94701d5a3c
--- /dev/null
+++ b/GenAD-main/docs/visualization.md
@@ -0,0 +1,10 @@
+# Visualization
+
+We provide a script [here](../tools/analysis_tools/visualization.py) to visualize the prediction results as a video.
+
+```shell
+cd /path/to/GenAD/
+conda activate genad
+python tools/analysis_tools/visualization.py --result-path /path/to/inference/results --save-path /path/to/save/visualization/results
+```
diff --git a/GenAD-main/projects/__init__.py b/GenAD-main/projects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/GenAD-main/projects/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c9dde3c4d79f578fac0a631a6d128f256867e63
Binary files /dev/null and b/GenAD-main/projects/__pycache__/__init__.cpython-38.pyc differ
diff --git a/GenAD-main/projects/configs/VAD/GenAD_config.py b/GenAD-main/projects/configs/VAD/GenAD_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea4858cb90d165bd0e45759937226671450acccb
--- /dev/null
+++ b/GenAD-main/projects/configs/VAD/GenAD_config.py
@@ -0,0 +1,443 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+num_classes = len(class_names)
+
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing', 'boundary']
+map_num_vec = 100
+map_fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
+map_fixed_ptsnum_per_pred_line = 20 +map_eval_use_same_gt_sample_num_flag = True +map_num_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 100 +bev_w_ = 100 +queue_length = 3 # each sequence contains `queue_length` frames. +total_epochs = 60 + +model = dict( + type='VAD', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='VADHead', + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + tot_epoch=total_epochs, + use_traj_lr_warmup=False, + query_thresh=0.0, + query_use_fix_pad=False, + ego_his_encoder=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + agent_dim = 300, + ego_agent_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + ego_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + use_pe=True, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=300, + num_classes=num_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + map_num_vec=map_num_vec, + map_num_classes=map_num_classes, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + map_num_pts_per_gt_vec=map_fixed_ptsnum_per_gt_line, + map_query_embed_type='instance_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v2', + map_dir_interval=1, + map_code_size=2, + map_code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='VADPerceptionTransformer', + map_num_vec=map_num_vec, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + 
num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=3, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + map_decoder=dict( + type='MapDetectionTransformerDecoder', + num_layers=3, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='CustomNMSFreeCoder', + post_center_range=[-20, -35, -10.0, 20, 35, 10.0], + pc_range=point_cloud_range, + max_num=100, + voxel_size=voxel_size, + num_classes=num_classes), + map_bbox_coder=dict( + type='MapNMSFreeCoder', + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=map_num_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_traj=dict(type='L1Loss', loss_weight=0.2), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.2), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_pts=dict(type='PtsL1Loss', loss_weight=1.0), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=0.005), + loss_plan_reg=dict(type='L1Loss', loss_weight=1.0), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=1.0), + loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=1.0), + loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=0.5), + loss_vae_gen=dict(type='ProbabilisticLoss', loss_weight=1.0), + loss_diff_gen=dict(type='DiffusionLoss', loss_weight=0.5)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 
+ iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range), + map_assigner=dict( + type='MapHungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', weight=1.0), + pc_range=point_cloud_range)))) + +dataset_type = 'VADCustomNuScenesDataset' +data_root = 'xxx/nuscenes/' +file_client_args = dict(backend='disk') + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='CustomObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='CustomObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.4]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='CustomDefaultFormatBundle3D', class_names=class_names, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego_his_trajs', + 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', 'ego_lcf_feat', 'gt_attr_labels']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='CustomObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='CustomObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + # dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.4]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='CustomDefaultFormatBundle3D', class_names=class_names, with_label=False, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['points', 'gt_bboxes_3d', 'gt_labels_3d', 'img', 'fut_valid_flag', + 'ego_his_trajs', 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', + 'ego_lcf_feat', 'gt_attr_labels'])]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'genad_nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + queue_length=queue_length, + map_classes=map_classes, + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + custom_eval_version='vad_nusc_detection_cvpr_2019'), + val=dict(type=dataset_type, + data_root=data_root, + pc_range=point_cloud_range, + ann_file=data_root + 'genad_nuscenes_infos_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1, + map_classes=map_classes, + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + use_pkl_result=True, + custom_eval_version='vad_nusc_detection_cvpr_2019'), + test=dict(type=dataset_type, + data_root=data_root, + pc_range=point_cloud_range, + ann_file=data_root + 'genad_nuscenes_infos_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1, + map_classes=map_classes, + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + use_pkl_result=True, + custom_eval_version='vad_nusc_detection_cvpr_2019'), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) + +evaluation = dict(interval=total_epochs, pipeline=test_pipeline, metric='bbox', map_metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# fp16 = dict(loss_scale=512.) 
+find_unused_parameters = True +checkpoint_config = dict(interval=1, max_keep_ckpts=total_epochs) + + +custom_hooks = [dict(type='CustomSetEpochInfoHook')] \ No newline at end of file diff --git a/GenAD-main/projects/configs/_base_/datasets/coco_instance.py b/GenAD-main/projects/configs/_base_/datasets/coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..f6ea4f4562a8118275a444879a884717b55caa15 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,48 @@ +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/GenAD-main/projects/configs/_base_/datasets/kitti-3d-3class.py b/GenAD-main/projects/configs/_base_/datasets/kitti-3d-3class.py new file mode 100644 index 0000000000000000000000000000000000000000..1822af4209432eb45e105112a165668fac87b6c5 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/kitti-3d-3class.py @@ -0,0 +1,140 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/kitti-3d-car.py b/GenAD-main/projects/configs/_base_/datasets/kitti-3d-car.py new file mode 100644 index 0000000000000000000000000000000000000000..1e81226e2dfdb0e4e802daa8bf0c9f9d19adb125 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/kitti-3d-car.py @@ -0,0 +1,138 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/lyft-3d.py b/GenAD-main/projects/configs/_base_/datasets/lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..71baff04c5b5345ab3d7340607c3496a8befc5fa --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/nuim_instance.py b/GenAD-main/projects/configs/_base_/datasets/nuim_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..82fce56bf6f2ad2578a0426e71fc13c2feb8bf97 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/nuim_instance.py @@ -0,0 +1,59 @@ +dataset_type = 'CocoDataset' +data_root = 'data/nuimages/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1280, 720), (1920, 1080)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-train.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/GenAD-main/projects/configs/_base_/datasets/nus-3d.py b/GenAD-main/projects/configs/_base_/datasets/nus-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..154817175df8de5768c1d56bc35efaa0da99415c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/nus-3d.py @@ -0,0 +1,142 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/nus-mono3d.py b/GenAD-main/projects/configs/_base_/datasets/nus-mono3d.py new file mode 100644 index 0000000000000000000000000000000000000000..1363a94ce4fbb3b1014e61dd52bc36408f119ce1 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/nus-mono3d.py @@ -0,0 +1,100 @@ +dataset_type = 'CustomNuScenesMonoDataset' +data_root = 'data/nuscenes/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=True, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + box_type_3d='Camera'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera')) +evaluation = dict(interval=2) diff --git a/GenAD-main/projects/configs/_base_/datasets/range100_lyft-3d.py b/GenAD-main/projects/configs/_base_/datasets/range100_lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..efa63ea3f0d351198d609785d971c19d96532844 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/range100_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-100, -100, -5, 100, 100, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/s3dis-3d-5class.py b/GenAD-main/projects/configs/_base_/datasets/s3dis-3d-5class.py new file mode 100644 index 0000000000000000000000000000000000000000..2422766fa351ee5cf7f0cd5ee5ab61b88e1d0300 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/s3dis-3d-5class.py @@ -0,0 +1,114 @@ +# dataset settings +dataset_type = 'S3DISDataset' +data_root = './data/s3dis/' +class_names = ('table', 'chair', 'sofa', 'bookcase', 'board') +train_area = [1, 2, 3, 4, 6] +test_area = 5 + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + # following ScanNet dataset the rotation range is 5 degrees + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{i}.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + box_type_3d='Depth') for i in train_area + ], + separate_eval=False)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py b/GenAD-main/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py new file mode 100644 index 0000000000000000000000000000000000000000..39bf5568e01d1a781c1b712e7c20b823e7c90141 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py @@ -0,0 +1,139 @@ +# dataset settings +dataset_type = 'S3DISSegDataset' +data_root = './data/s3dis/' +class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', + 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') +num_points = 4096 +train_area = [1, 2, 3, 4, 6] +test_area = 5 +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.0, + ignore_index=len(class_names), + use_normalized_coord=True, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +# we need to load gt seg_mask! +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + # train on area 1, 2, 3, 4, 6 + # test on area 5 + train=dict( + type=dataset_type, + data_root=data_root, + ann_files=[ + data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area + ], + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=[ + data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy' + for i in train_area + ]), + val=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names), + scene_idxs=data_root + + f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/scannet-3d-18class.py b/GenAD-main/projects/configs/_base_/datasets/scannet-3d-18class.py new file mode 100644 index 0000000000000000000000000000000000000000..93da1e5870561363fb3686e8288ccf561ca72cd2 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/scannet-3d-18class.py @@ -0,0 +1,128 @@ +# dataset settings +dataset_type = 'ScanNetDataset' +data_root = './data/scannet/' +class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_mask_3d=True, + with_seg_3d=True), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='PointSegClassMapping', + valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, + 36, 39), + max_cat_id=40), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', + 'pts_instance_mask' + ]) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + 
scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/scannet_seg-3d-20class.py b/GenAD-main/projects/configs/_base_/datasets/scannet_seg-3d-20class.py new file mode 100644 index 0000000000000000000000000000000000000000..cf73b09c8afa9317fa7077f5f67b1fae3306c1b7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/scannet_seg-3d-20class.py @@ -0,0 +1,132 @@ +# dataset settings +dataset_type = 'ScanNetSegDataset' +data_root = './data/scannet/' +class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', + 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', + 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', + 'bathtub', 'otherfurniture') +num_points = 8192 +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.5, + ignore_index=len(class_names), + use_normalized_coord=False, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + 
type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +# we need to load gt seg_mask! +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names)), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/sunrgbd-3d-10class.py b/GenAD-main/projects/configs/_base_/datasets/sunrgbd-3d-10class.py new file mode 100644 index 0000000000000000000000000000000000000000..7121b75bbf0679c55f706ed07294eb2fa3495cc0 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/sunrgbd-3d-10class.py @@ -0,0 +1,107 @@ +dataset_type = 'SUNRGBDDataset' +data_root = 'data/sunrgbd/' +class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='LoadAnnotations3D'), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.523599, 0.523599], + scale_ratio_range=[0.85, 1.15], + shift_height=True), + dict(type='PointSample', num_points=20000), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + 
sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict(type='PointSample', num_points=20000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=16, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-3class.py b/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-3class.py new file mode 100644 index 0000000000000000000000000000000000000000..920ac154d68cb07669642300fafd52d179be5392 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-3class.py @@ -0,0 +1,145 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'LidarWaymoDataset' +data_root = 'data/waymo-full/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-car.py b/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-car.py new file mode 100644 index 0000000000000000000000000000000000000000..02e262721b29ede7e29d0d0046eba243f2c82249 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-car.py @@ -0,0 +1,143 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep 
its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/default_runtime.py b/GenAD-main/projects/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..4e85b69abed5f51238da4f183163066073664350 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/default_runtime.py @@ -0,0 +1,18 @@ +checkpoint_config = dict(interval=1) +# yapf:disable push +# By default we use textlogger hook and tensorboard +# For more loggers see +# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = None +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/GenAD-main/projects/configs/_base_/models/3dssd.py b/GenAD-main/projects/configs/_base_/models/3dssd.py new file mode 100644 index 0000000000000000000000000000000000000000..55344c7ddff660dc0306542d94260efad39f8df7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/3dssd.py @@ -0,0 +1,77 @@ +model = dict( + type='SSD3DNet', + backbone=dict( + type='PointNet2SAMSG', + in_channels=4, + num_points=(4096, 512, (256, 256)), + radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), + num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), + sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), + ((64, 64, 128), (64, 64, 128), (64, 96, 128)), + ((128, 128, 256), (128, 192, 256), (128, 256, 256))), + aggregation_channels=(64, 128, 256), + fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), + fps_sample_range_lists=((-1), (-1), (512, -1)), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + bbox_head=dict( + type='SSD3DHead', + in_channels=256, + vote_module_cfg=dict( + in_channels=256, + num_points=256, + gt_per_seed=1, + conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + with_res_feat=False, + vote_xyz_range=(3.0, 3.0, 2.0)), + 
vote_aggregation_cfg=dict( + type='PointSAModuleMSG', + num_point=256, + radii=(4.8, 6.4), + sample_nums=(16, 32), + mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + use_xyz=True, + normalize_xyz=False, + bias=True), + pred_layer_cfg=dict( + in_channels=1536, + shared_conv_channels=(512, 128), + cls_conv_channels=(128, ), + reg_conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + objectness_loss=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + corner_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), + test_cfg=dict( + nms_cfg=dict(type='nms', iou_thr=0.1), + sample_mod='spec', + score_thr=0.0, + per_class_proposal=True, + max_output_num=100)) diff --git a/GenAD-main/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/GenAD-main/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..fb9e0a8f06d3f597e90156efc9f30264c678fe85 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,200 @@ +# model settings +model = dict( + type='CascadeRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + 
type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/GenAD-main/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py b/GenAD-main/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..efdce59c6d59c6564c6558a7a800852fe14314d7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.1, 0.1, 0.2] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), + pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + 
sparse_shape=[41, 1024, 1024], + output_channels=128, + order=('conv', 'norm', 'act'), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, + 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + layer_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([256, 256]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[1024, 1024, 40], + voxel_size=voxel_size, + out_size_factor=8, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/GenAD-main/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py b/GenAD-main/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..311d76373bd261ed8827409be68db0e577b38327 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.2, 0.2, 8] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=(0.2, 0.2, 8), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + legacy=False), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), + pts_backbone=dict( + type='SECOND', + in_channels=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + out_channels=[128, 128, 
128], + upsample_strides=[0.5, 1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([128, 128, 128]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=4, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + out_size_factor=4, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=4, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/GenAD-main/projects/configs/_base_/models/fcos3d.py b/GenAD-main/projects/configs/_base_/models/fcos3d.py new file mode 100644 index 0000000000000000000000000000000000000000..92ea90760519d6205d75af6a39f927503de89aad --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/fcos3d.py @@ -0,0 +1,74 @@ +model = dict( + type='FCOSMono3D', + pretrained='open-mmlab://detectron2/resnet101_caffe', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSMono3DHead', + num_classes=10, + in_channels=256, + stacked_convs=2, + feat_channels=256, + use_direction_classifier=True, + diff_rad_by_sin=True, + pred_attrs=True, + pred_velo=True, + dir_offset=0.7854, # pi/4 + strides=[8, 16, 32, 64, 128], + group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo + cls_branch=(256, ), + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + () # velo + ), + dir_branch=(256, ), + attr_branch=(256, ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_on_bbox=True, + 
centerness_on_reg=True, + center_sampling=True, + conv_bias=True, + dcn_on_last_conv=True), + train_cfg=dict( + allowed_border=0, + code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.8, + score_thr=0.05, + min_bbox_size=0, + max_per_img=200)) diff --git a/GenAD-main/projects/configs/_base_/models/groupfree3d.py b/GenAD-main/projects/configs/_base_/models/groupfree3d.py new file mode 100644 index 0000000000000000000000000000000000000000..077d049662fe16b91639af4a5923a4e8e540148d --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/groupfree3d.py @@ -0,0 +1,71 @@ +model = dict( + type='GroupFree3DNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=3, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 288)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='GroupFree3DHead', + in_channels=288, + num_decoder_layers=6, + num_proposal=256, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='GroupFree3DMHA', + embed_dims=288, + num_heads=8, + attn_drop=0.1, + dropout_layer=dict(type='Dropout', drop_prob=0.1)), + ffn_cfgs=dict( + embed_dims=288, + feedforward_channels=2048, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', + 'norm')), + pred_layer_cfg=dict( + in_channels=288, shared_conv_channels=(288, 288), bias=True), + sampling_objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=8.0), + objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(sample_mod='kps'), + test_cfg=dict( + sample_mod='kps', + nms_thr=0.25, + score_thr=0.0, + per_class_proposal=True, + prediction_stages='last')) diff --git a/GenAD-main/projects/configs/_base_/models/h3dnet.py b/GenAD-main/projects/configs/_base_/models/h3dnet.py new file mode 100644 index 0000000000000000000000000000000000000000..760566744f6484cde261f87f0d95a1182786779c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/h3dnet.py @@ -0,0 +1,341 @@ +primitive_z_cfg = dict( + type='PrimitiveHead', + num_dims=2, + num_classes=18, + primitive_mode='z', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + 
num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_xy_cfg = dict( + type='PrimitiveHead', + num_dims=1, + num_classes=18, + primitive_mode='xy', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_line_cfg = dict( + type='PrimitiveHead', + num_dims=0, + num_classes=18, + primitive_mode='line', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +model = dict( + type='H3DNet', + backbone=dict( + type='MultiBackbone', + num_streams=4, + suffixes=['net0', 'net1', 'net2', 'net3'], + 
conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), + act_cfg=dict(type='ReLU'), + backbones=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True))), + rpn_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + roi_head=dict( + type='H3DRoIHead', + primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], + bbox_head=dict( + type='H3DBboxHead', + gt_per_seed=3, + num_proposal=256, + suface_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 6, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 6, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + line_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 12, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 12, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + primitive_refine_channels=[128, 128, 128], + upper_thresh=100.0, + surface_thresh=0.5, + line_thresh=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + cues_objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + cues_semantic_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + proposal_objectness_loss=dict( + type='CrossEntropyLoss', 
+ class_weight=[0.2, 0.8], + reduction='none', + loss_weight=5.0), + primitive_center_loss=dict( + type='MSELoss', reduction='none', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + rpn_proposal=dict(use_nms=False), + rcnn=dict( + pos_distance_thr=0.3, + neg_distance_thr=0.6, + sample_mod='vote', + far_threshold=0.6, + near_threshold=0.3, + mask_surface_threshold=0.3, + label_surface_threshold=0.3, + mask_line_threshold=0.3, + label_line_threshold=0.3)), + test_cfg=dict( + rpn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True, + use_nms=False), + rcnn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py new file mode 100644 index 0000000000000000000000000000000000000000..87c7fe0c6145f0cceadafd7f51c98f209538796d --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-80, -80, -5, 80, 80, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), + pts_middle_encoder=dict(output_shape=[640, 640]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..e153f6c6e69171d29f79b627dd6d152a842d0db2 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py @@ -0,0 +1,96 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
+voxel_size = [0.25, 0.25, 8] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=64, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + voxel_size=voxel_size, + max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=4, + feat_channels=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='FPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + act_cfg=dict(type='ReLU'), + in_channels=[64, 128, 256], + out_channels=256, + start_level=0, + num_outs=3), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=10, + in_channels=256, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-50, -50, -1.8, 50, 50, -1.8]], + scales=[1, 2, 4], + sizes=[ + [0.8660, 2.5981, 1.], # 1.5/sqrt(3) + [0.5774, 1.7321, 1.], # 1/sqrt(3) + [1., 1., 1.], + [0.4, 0.4, 1], + ], + custom_values=[0, 0], + rotations=[0, 1.57], + reshape_out=True), + assigner_per_size=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=dict( + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_num=500))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py new file mode 100644 index 0000000000000000000000000000000000000000..9cd200f3e4c0dfb7da1823263b22bbcd63d77d63 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
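The header comment above stresses that the point cloud range, voxel size and grid-shaped keys must stay consistent. A small illustrative helper (not part of any config file) showing the arithmetic that ties them together for the values used in this diff:

def bev_grid_size(point_cloud_range, voxel_size):
    # Number of BEV cells implied by a range/voxel pair, returned as [ny, nx],
    # which is the order PointPillarsScatter's output_shape uses.
    x_min, y_min, _, x_max, y_max, _ = point_cloud_range
    nx = round((x_max - x_min) / voxel_size[0])
    ny = round((y_max - y_min) / voxel_size[1])
    return [ny, nx]

# nuScenes base above: (50 - (-50)) / 0.25 = 400 cells per axis -> output_shape=[400, 400]
assert bev_grid_size([-50, -50, -5, 50, 50, 3], [0.25, 0.25, 8]) == [400, 400]
# Lyft override earlier in the diff: (80 - (-80)) / 0.25 = 640 -> output_shape=[640, 640]
assert bev_grid_size([-80, -80, -5, 80, 80, 3], [0.25, 0.25, 8]) == [640, 640]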
+model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-100, -100, -5, 100, 100, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), + pts_middle_encoder=dict(output_shape=[800, 800]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py new file mode 100644 index 0000000000000000000000000000000000000000..85076d0798bc49e1564d6eabe177d1ae92be0aef --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py @@ -0,0 +1,93 @@ +voxel_size = [0.16, 0.16, 4] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=32, # max_points_per_voxel + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=4, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), + middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -1.78, 70.4, 39.68, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py 
b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..14873ead474761d96b8487d48765bf2486277bed --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py @@ -0,0 +1,108 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +voxel_size = [0.32, 0.32, 6] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + voxel_size=voxel_size, + max_voxels=(32000, 32000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[1, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], + [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], + [-74.88, -74.88, 0, 74.88, 74.88, 0]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 1.81, 1.77], # cyclist + [0.84, 0.91, 1.74] # pedestrian + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_kitti.py b/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_kitti.py new file mode 100644 index 
0000000000000000000000000000000000000000..6bf18abe1df08680cc2bb86dfb7b445af4d63ec8 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_kitti.py @@ -0,0 +1,89 @@ +voxel_size = [0.05, 0.05, 0.1] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=5, + point_cloud_range=[0, -40, -3, 70.4, 40, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000)), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_waymo.py b/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..eb9bd3ae5cd6c94e56aa9d88765746853ca58f3e --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_waymo.py @@ -0,0 +1,100 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
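One relationship worth noting across the SECOND/PointPillars configs above: SECONDFPN concatenates its upsampled outputs, so the detection head's in_channels equals the sum of the neck's out_channels. A quick illustrative check against the values in this diff:

# hv_pointpillars_secfpn_kitti / _waymo: neck out_channels=[128, 128, 128]
assert sum([128, 128, 128]) == 384   # matches bbox_head in_channels=384
# hv_second_secfpn_kitti (and the Waymo variant below): neck out_channels=[256, 256]
assert sum([256, 256]) == 512        # matches bbox_head in_channels=512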
+voxel_size = [0.08, 0.08, 0.1] +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=10, + point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], + voxel_size=voxel_size, + max_voxels=(80000, 90000)), + voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[61, 1280, 1920], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=384, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], + [-76.8, -51.2, 0, 76.8, 51.2, 0], + [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 0.91, 1.74], # pedestrian + [0.84, 1.81, 1.77] # cyclist + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1) + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500)) diff --git a/GenAD-main/projects/configs/_base_/models/imvotenet_image.py b/GenAD-main/projects/configs/_base_/models/imvotenet_image.py new file mode 100644 index 0000000000000000000000000000000000000000..981f8bc9be90a3c2d0ff1edfef3cb3ce91d20d41 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/imvotenet_image.py @@ -0,0 +1,108 @@ +model = dict( + type='ImVoteNet', + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + img_rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + 
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + img_roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=10, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + + # model training and testing settings + train_cfg=dict( + img_rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + img_rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + img_rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/GenAD-main/projects/configs/_base_/models/mask_rcnn_r50_fpn.py b/GenAD-main/projects/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d5e32b0427cf29b7240b26c7f506c283ae6c04 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,124 @@ +# model settings +model = dict( + type='MaskRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + 
reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/GenAD-main/projects/configs/_base_/models/paconv_cuda_ssg.py b/GenAD-main/projects/configs/_base_/models/paconv_cuda_ssg.py new file mode 100644 index 0000000000000000000000000000000000000000..f513bd4a2f94964f70dba926ef03b427a795e417 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/paconv_cuda_ssg.py @@ -0,0 +1,7 @@ +_base_ = './paconv_ssg.py' + +model = dict( + backbone=dict( + sa_cfg=dict( + type='PAConvCUDASAModule', + scorenet_cfg=dict(mlp_channels=[8, 16, 16])))) diff --git a/GenAD-main/projects/configs/_base_/models/paconv_ssg.py b/GenAD-main/projects/configs/_base_/models/paconv_ssg.py new file mode 100644 index 0000000000000000000000000000000000000000..1d4f1ed39373b40e0871bc97dafaf664ff68594d --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/paconv_ssg.py @@ -0,0 +1,49 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=9, # [xyz, rgb, normalized_xyz] + num_points=(1024, 256, 64, 16), + radius=(None, None, None, None), # use kNN instead of ball query + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + norm_cfg=dict(type='BN2d', momentum=0.1), + sa_cfg=dict( + type='PAConvSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False, + paconv_num_kernels=[16, 16, 16], + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[16, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False))), + decode_head=dict( + type='PAConvHead', + # PAConv model's decoder takes skip connections from beckbone + # different from PointNet++, it also concats input features in the last + # level of decoder, leading to `128 + 6` as the channel number + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128 + 6, 128, 128, 128)), + 
channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # correlation loss to regularize PAConv's kernel weights + loss_regularization=dict( + type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/GenAD-main/projects/configs/_base_/models/parta2.py b/GenAD-main/projects/configs/_base_/models/parta2.py new file mode 100644 index 0000000000000000000000000000000000000000..6c5ae9a66372c404923b21f5ee37dfcacd7347ec --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/parta2.py @@ -0,0 +1,201 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] + +model = dict( + type='PartA2', + voxel_layer=dict( + max_num_points=5, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseUNet', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + rpn_head=dict( + type='PartA2RPNHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78]], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + assigner_per_size=True, + assign_per_class=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + roi_head=dict( + type='PartAggregationROIHead', + num_classes=3, + semantic_head=dict( + type='PointwiseSemanticHead', + in_channels=16, + extra_width=0.2, + seg_score_thr=0.3, + num_classes=3, + loss_seg=dict( + type='FocalLoss', + use_sigmoid=True, + reduction='sum', + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_part=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + seg_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='max')), + part_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='avg')), + bbox_head=dict( + type='PartA2BboxHead', + num_classes=3, + seg_in_channels=16, + part_in_channels=4, + seg_conv_channels=[64, 64], + part_conv_channels=[64, 64], + merge_conv_channels=[128, 128], + down_conv_channels=[128, 256], + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + shared_fc_channels=[256, 512, 512, 512], + cls_channels=[256, 256], + reg_channels=[256, 256], + dropout_ratio=0.1, + roi_feat_size=14, + with_corner_loss=True, + 
loss_bbox=dict( + type='SmoothL1Loss', + beta=1.0 / 9.0, + reduction='sum', + loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1) + ], + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=9000, + nms_post=512, + max_num=512, + nms_thr=0.8, + score_thr=0, + use_rotate_nms=False), + rcnn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1) + ], + sampler=dict( + type='IoUNegPiecewiseSampler', + num=128, + pos_fraction=0.55, + neg_piece_fractions=[0.8, 0.2], + neg_iou_piece_thrs=[0.55, 0.1], + neg_pos_ub=-1, + add_gt_as_proposals=False, + return_iou=True), + cls_pos_thr=0.75, + cls_neg_thr=0.25)), + test_cfg=dict( + rpn=dict( + nms_pre=1024, + nms_post=100, + max_num=100, + nms_thr=0.7, + score_thr=0, + use_rotate_nms=True), + rcnn=dict( + use_rotate_nms=True, + use_raw_score=True, + nms_thr=0.01, + score_thr=0.1))) diff --git a/GenAD-main/projects/configs/_base_/models/pointnet2_msg.py b/GenAD-main/projects/configs/_base_/models/pointnet2_msg.py new file mode 100644 index 0000000000000000000000000000000000000000..222ab885557984125eb52a934f443870e6c6918d --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/pointnet2_msg.py @@ -0,0 +1,28 @@ +_base_ = './pointnet2_ssg.py' + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='PointNet2SAMSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)), + num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), + sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, + 128)), + ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), + (256, 384, 512))), + aggregation_channels=(None, None, None, None), + fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), + fps_sample_range_lists=((-1), (-1), (-1), (-1)), + dilated_group=(False, False, False, False), + out_indices=(0, 1, 2, 3), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128), + (128, 128, 128, 128)))) diff --git a/GenAD-main/projects/configs/_base_/models/pointnet2_ssg.py b/GenAD-main/projects/configs/_base_/models/pointnet2_ssg.py new file mode 100644 index 
0000000000000000000000000000000000000000..58b4c243ded042612abb1c15c9c175f5e932af38 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/pointnet2_ssg.py @@ -0,0 +1,35 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radius=(0.1, 0.2, 0.4, 0.8), + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + type='PointNet2Head', + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128, 128, 128, 128)), + channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/GenAD-main/projects/configs/_base_/models/votenet.py b/GenAD-main/projects/configs/_base_/models/votenet.py new file mode 100644 index 0000000000000000000000000000000000000000..129339dc9eaa3f74c0547a39fa527c14be03743c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/votenet.py @@ -0,0 +1,73 @@ +model = dict( + type='VoteNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + test_cfg=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True)) diff --git a/GenAD-main/projects/configs/_base_/schedules/cosine.py 
b/GenAD-main/projects/configs/_base_/schedules/cosine.py new file mode 100644 index 0000000000000000000000000000000000000000..69cb7df87d23846ea7b64fb6d882679e315e55cf --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/cosine.py @@ -0,0 +1,20 @@ +# This schedule is mainly used by models with dynamic voxelization +# optimizer +lr = 0.003 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is changed during training + weight_decay=0.001) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) + +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + min_lr_ratio=1e-5) + +momentum_config = None + +runner = dict(type='EpochBasedRunner', max_epochs=40) diff --git a/GenAD-main/projects/configs/_base_/schedules/cyclic_20e.py b/GenAD-main/projects/configs/_base_/schedules/cyclic_20e.py new file mode 100644 index 0000000000000000000000000000000000000000..704740ee5676515213fd30839f5e116c0b4ebfc7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/cyclic_20e.py @@ -0,0 +1,24 @@ +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 20 epochs by default, we set evaluation +# interval to be 20. Please change the interval accordingly if you do not +# use a default schedule. +# optimizer +# This schedule is mainly used by models on nuScenes dataset +optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='cyclic', + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=(0.85 / 0.95, 1), + cyclic_times=1, + step_ratio_up=0.4, +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/GenAD-main/projects/configs/_base_/schedules/cyclic_40e.py b/GenAD-main/projects/configs/_base_/schedules/cyclic_40e.py new file mode 100644 index 0000000000000000000000000000000000000000..4a711acf4f31cca94ea7a10d035282a45f648c9c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/cyclic_40e.py @@ -0,0 +1,31 @@ +# The schedule is usually used by models trained on KITTI dataset + +# The learning rate set in the cyclic schedule is the initial learning rate +# rather than the max learning rate. Since the target_ratio is (10, 1e-4), +# the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4 +lr = 0.0018 +# The optimizer follows the setting in SECOND.Pytorch, but here we use +# the official AdamW optimizer implemented by PyTorch.
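To make the comment above concrete, a short worked example (illustrative only) of what target_ratio=(10, 1e-4) does to the initial learning rate over one cycle:

lr_init = 0.0018
peak_lr = lr_init * 10      # 0.018, reached after the ramp-up phase (step_ratio_up=0.4)
final_lr = lr_init * 1e-4   # 1.8e-07 by the end of training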
+optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +# We use cyclic learning rate and momentum schedule following SECOND.Pytorch +# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa +# We implement them in mmcv, for more details, please refer to +# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa +# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa +lr_config = dict( + policy='cyclic', + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=(0.85 / 0.95, 1), + cyclic_times=1, + step_ratio_up=0.4, +) +# Although the max_epochs is 40, this schedule is usually used we +# RepeatDataset with repeat ratio N, thus the actual max epoch +# number could be Nx40 +runner = dict(type='EpochBasedRunner', max_epochs=40) diff --git a/GenAD-main/projects/configs/_base_/schedules/mmdet_schedule_1x.py b/GenAD-main/projects/configs/_base_/schedules/mmdet_schedule_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..13b3783cbbe93b6c32bc415dc50f633dffa4aec7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/mmdet_schedule_1x.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/GenAD-main/projects/configs/_base_/schedules/schedule_2x.py b/GenAD-main/projects/configs/_base_/schedules/schedule_2x.py new file mode 100644 index 0000000000000000000000000000000000000000..afde799d9de1e9c03587b54458938b63b1f7de41 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/schedule_2x.py @@ -0,0 +1,14 @@ +# optimizer +# This schedule is mainly used by models on nuScenes dataset +optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[20, 23]) +momentum_config = None +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/GenAD-main/projects/configs/_base_/schedules/schedule_3x.py b/GenAD-main/projects/configs/_base_/schedules/schedule_3x.py new file mode 100644 index 0000000000000000000000000000000000000000..115cd26b760e749b3ccdd50a6f4d201ea38f824e --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/schedule_3x.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used by models on indoor dataset, +# e.g., VoteNet on SUNRGBD and ScanNet +lr = 0.008 # max learning rate +optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict(policy='step', warmup=None, step=[24, 32]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/GenAD-main/projects/configs/_base_/schedules/seg_cosine_150e.py b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_150e.py new file mode 100644 index 
0000000000000000000000000000000000000000..04b44e51de071dc9158e31fe7c51420326f0493c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_150e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=150) diff --git a/GenAD-main/projects/configs/_base_/schedules/seg_cosine_200e.py b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_200e.py new file mode 100644 index 0000000000000000000000000000000000000000..6a49484c8b37d3c44b7a2979a3173af6a407b967 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_200e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on ScanNet dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/GenAD-main/projects/configs/_base_/schedules/seg_cosine_50e.py b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..975a8f9ff8e5140b0f1707490c282998666c71ef --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_50e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=50) diff --git a/GenAD-main/projects/configs/datasets/custom_lyft-3d.py b/GenAD-main/projects/configs/datasets/custom_lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..5a95d898c91e463b731a08f7c52b8186e99da83a --- /dev/null +++ b/GenAD-main/projects/configs/datasets/custom_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'CustomLyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=True) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
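As the comment above explains, interval=24 matches the default 24-epoch schedule (see _base_/schedules/schedule_2x.py earlier in this diff, max_epochs=24), so validation runs exactly once, after the final epoch. An illustrative alternative for non-default schedules, kept commented out so it does not conflict with the setting below:

# evaluation = dict(interval=4, pipeline=eval_pipeline)  # e.g. validate every 4 epochs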
+evaluation = dict(interval=24, pipeline=eval_pipeline) \ No newline at end of file diff --git a/GenAD-main/projects/configs/datasets/custom_nus-3d.py b/GenAD-main/projects/configs/datasets/custom_nus-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..af81f9b20d182222d0b69fc26fe32c1e66905a16 --- /dev/null +++ b/GenAD-main/projects/configs/datasets/custom_nus-3d.py @@ -0,0 +1,141 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset_eval_modified' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/datasets/custom_waymo-3d.py b/GenAD-main/projects/configs/datasets/custom_waymo-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..4100e13546badb06e69fd0b1ed20158de8acf893 --- /dev/null +++ b/GenAD-main/projects/configs/datasets/custom_waymo-3d.py @@ -0,0 +1,112 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'CustomWaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=False, use_camera=True) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1920, 1280), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + + +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=test_pipeline) \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/VAD.py b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD.py new file mode 100644 index 0000000000000000000000000000000000000000..d4876b135aedb12f4d508acfc171a24031510c31 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD.py @@ -0,0 +1,668 @@ +import time +import copy + +import torch +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmcv.runner import force_fp32, auto_fp16 +from scipy.optimize import linear_sum_assignment +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector + +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +from projects.mmdet3d_plugin.VAD.planner.metric_stp3 import PlanningMetric + + +@DETECTORS.register_module() +class VAD(MVXTwoStageDetector): + """VAD model. + """ + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False, + fut_ts=6, + fut_mode=6 + ): + + super(VAD, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + self.fut_ts = fut_ts + self.fut_mode = fut_mode + self.valid_fut_ts = pts_bbox_head['valid_fut_ts'] + + # temporal + self.video_test_mode = video_test_mode + self.prev_frame_info = { + 'prev_bev': None, + 'scene_token': None, + 'prev_pos': 0, + 'prev_angle': 0, + } + + self.planning_metric = None + + def extract_img_feat(self, img, img_metas, len_queue=None): + """Extract features of images.""" + B = img.size(0) + if img is not None: + + # input_shape = img.shape[-2:] + # # update real input shape of each single img + # for img_meta in img_metas: + # img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + if len_queue is not None: + img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) + else: + img_feats_reshaped.append(img_feat.view(B, 
int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=('img'), out_fp32=True) + def extract_feat(self, img, img_metas=None, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) + + return img_feats + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + map_gt_bboxes_3d, + map_gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + prev_bev=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_masks=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None): + """Forward function' + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + prev_bev (torch.Tensor, optional): BEV features of previous frame. + Returns: + dict: Losses of each branch. + """ + + outs = self.pts_bbox_head(pts_feats, img_metas, prev_bev, + ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat, + gt_labels_3d=gt_labels_3d, gt_attr_labels=gt_attr_labels, + ego_fut_trajs=ego_fut_trajs) + loss_inputs = [ + gt_bboxes_3d, gt_labels_3d, map_gt_bboxes_3d, map_gt_labels_3d, + outs, ego_fut_trajs, ego_fut_masks, ego_fut_cmd, gt_attr_labels, + ] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def obtain_history_bev(self, imgs_queue, img_metas_list): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. 
+ """ + self.eval() + + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) + img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + # img_feats = self.extract_feat(img=img, img_metas=img_metas) + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev = self.pts_bbox_head( + img_feats, img_metas, prev_bev, only_bev=True) + self.train() + return prev_bev + + # @auto_fp16(apply_to=('img', 'points')) + @force_fp32(apply_to=('img','points','prev_bev')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + map_gt_bboxes_3d=None, + map_gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_masks=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + len_queue = img.size(1) + prev_img = img[:, :-1, ...] + img = img[:, -1, ...] 
+ + prev_img_metas = copy.deepcopy(img_metas) + # prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) + # import pdb;pdb.set_trace() + prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) if len_queue > 1 else None + + img_metas = [each[len_queue-1] for each in img_metas] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, + map_gt_bboxes_3d, map_gt_labels_3d, img_metas, + gt_bboxes_ignore, map_gt_bboxes_ignore, prev_bev, + ego_his_trajs=ego_his_trajs, ego_fut_trajs=ego_fut_trajs, + ego_fut_masks=ego_fut_masks, ego_fut_cmd=ego_fut_cmd, + ego_lcf_feat=ego_lcf_feat, gt_attr_labels=gt_attr_labels) + + losses.update(losses_pts) + return losses + + def forward_test( + self, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + img=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + **kwargs + ): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. + tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + if self.prev_frame_info['prev_bev'] is not None: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + else: + img_metas[0][0]['can_bus'][-1] = 0 + img_metas[0][0]['can_bus'][:3] = 0 + + new_prev_bev, bbox_results = self.simple_test( + img_metas=img_metas[0], + img=img[0], + prev_bev=self.prev_frame_info['prev_bev'], + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + ego_his_trajs=ego_his_trajs[0], + ego_fut_trajs=ego_fut_trajs[0], + ego_fut_cmd=ego_fut_cmd[0], + ego_lcf_feat=ego_lcf_feat[0], + gt_attr_labels=gt_attr_labels, + **kwargs + ) + # During inference, we save the BEV features and ego motion of each timestamp. 
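The cached pose from the previous frame turns the absolute CAN bus readings into frame-to-frame ego motion, which is what the temporal BEV alignment consumes. A toy illustration of that subtraction (the 18-element `can_bus` layout and all numbers are assumptions made for the example, not values from the code):

# Illustrative only: can_bus[:3] holds an absolute translation and can_bus[-1]
# an absolute heading; subtracting the cached previous pose leaves the delta.
import numpy as np

can_bus = np.zeros(18)
can_bus[:3] = [102.0, 54.5, 0.0]   # current absolute position (x, y, z)
can_bus[-1] = 91.0                 # current absolute heading

prev_pos = np.array([100.0, 54.0, 0.0])
prev_angle = 90.0

can_bus[:3] -= prev_pos            # relative translation since the last frame
can_bus[-1] -= prev_angle          # relative rotation since the last frame
print(can_bus[:3], can_bus[-1])    # [2.  0.5 0. ] 1.0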
+ self.prev_frame_info['prev_pos'] = tmp_pos + self.prev_frame_info['prev_angle'] = tmp_angle + self.prev_frame_info['prev_bev'] = new_prev_bev + + return bbox_results + + def simple_test( + self, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + img=None, + prev_bev=None, + points=None, + fut_valid_flag=None, + rescale=False, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + **kwargs + ): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + bbox_list = [dict() for i in range(len(img_metas))] + new_prev_bev, bbox_pts, metric_dict = self.simple_test_pts( + img_feats, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + prev_bev, + fut_valid_flag=fut_valid_flag, + rescale=rescale, + start=None, + ego_his_trajs=ego_his_trajs, + ego_fut_trajs=ego_fut_trajs, + ego_fut_cmd=ego_fut_cmd, + ego_lcf_feat=ego_lcf_feat, + gt_attr_labels=gt_attr_labels, + ) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + result_dict['metric_results'] = metric_dict + + return new_prev_bev, bbox_list + + def simple_test_pts( + self, + x, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + prev_bev=None, + fut_valid_flag=None, + rescale=False, + start=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + ): + """Test function""" + mapped_class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', + 'trailer', 'barrier', 'motorcycle', 'bicycle', + 'pedestrian', 'traffic_cone' + ] + + + outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev, + ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat) + bbox_list = self.pts_bbox_head.get_bboxes(outs, img_metas, rescale=rescale) + + bbox_results = [] + for i, (bboxes, scores, labels, trajs, map_bboxes, \ + map_scores, map_labels, map_pts) in enumerate(bbox_list): + bbox_result = bbox3d2result(bboxes, scores, labels) + bbox_result['trajs_3d'] = trajs.cpu() + map_bbox_result = self.map_pred2result(map_bboxes, map_scores, map_labels, map_pts) + bbox_result.update(map_bbox_result) + bbox_result['ego_fut_preds'] = outs['ego_fut_preds'][i].cpu() + bbox_result['ego_fut_cmd'] = ego_fut_cmd.cpu() + bbox_results.append(bbox_result) + + assert len(bbox_results) == 1, 'only support batch_size=1 now' + score_threshold = 0.6 + with torch.no_grad(): + c_bbox_results = copy.deepcopy(bbox_results) + + bbox_result = c_bbox_results[0] + gt_bbox = gt_bboxes_3d[0][0] + gt_label = gt_labels_3d[0][0].to('cpu') + gt_attr_label = gt_attr_labels[0][0].to('cpu') + fut_valid_flag = bool(fut_valid_flag[0][0]) + # filter pred bbox by score_threshold + mask = bbox_result['scores_3d'] > score_threshold + bbox_result['boxes_3d'] = bbox_result['boxes_3d'][mask] + bbox_result['scores_3d'] = bbox_result['scores_3d'][mask] + bbox_result['labels_3d'] = bbox_result['labels_3d'][mask] + bbox_result['trajs_3d'] = bbox_result['trajs_3d'][mask] + + matched_bbox_result = self.assign_pred_to_gt_vip3d( + bbox_result, gt_bbox, gt_label) + + metric_dict = self.compute_motion_metric_vip3d( + gt_bbox, gt_label, gt_attr_label, bbox_result, + matched_bbox_result, mapped_class_names) + + # ego planning metric + assert ego_fut_trajs.shape[0] == 1, 'only support batch_size=1 for testing' + ego_fut_preds = bbox_result['ego_fut_preds'] + ego_fut_trajs = ego_fut_trajs[0, 0] + ego_fut_cmd = ego_fut_cmd[0, 0, 0] + ego_fut_cmd_idx = torch.nonzero(ego_fut_cmd)[0, 0] + ego_fut_pred = ego_fut_preds[ego_fut_cmd_idx] + 
ego_fut_pred = ego_fut_pred.cumsum(dim=-2) + ego_fut_trajs = ego_fut_trajs.cumsum(dim=-2) + + metric_dict_planner_stp3 = self.compute_planner_metric_stp3( + pred_ego_fut_trajs = ego_fut_pred[None], + gt_ego_fut_trajs = ego_fut_trajs[None], + gt_agent_boxes = gt_bbox, + gt_agent_feats = gt_attr_label.unsqueeze(0), + fut_valid_flag = fut_valid_flag + ) + metric_dict.update(metric_dict_planner_stp3) + + return outs['bev_embed'], bbox_results, metric_dict + + def map_pred2result(self, bboxes, scores, labels, pts, attrs=None): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor): Bounding boxes with shape of (n, 5). + labels (torch.Tensor): Labels with shape of (n, ). + scores (torch.Tensor): Scores with shape of (n, ). + attrs (torch.Tensor, optional): Attributes with shape of (n, ). \ + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + """ + result_dict = dict( + map_boxes_3d=bboxes.to('cpu'), + map_scores_3d=scores.cpu(), + map_labels_3d=labels.cpu(), + map_pts_3d=pts.to('cpu')) + + if attrs is not None: + result_dict['map_attrs_3d'] = attrs.cpu() + + return result_dict + + def assign_pred_to_gt_vip3d( + self, + bbox_result, + gt_bbox, + gt_label, + match_dis_thresh=2.0 + ): + """Assign pred boxs to gt boxs according to object center preds in lcf. + Args: + bbox_result (dict): Predictions. + 'boxes_3d': (LiDARInstance3DBoxes) + 'scores_3d': (Tensor), [num_pred_bbox] + 'labels_3d': (Tensor), [num_pred_bbox] + 'trajs_3d': (Tensor), [fut_ts*2] + gt_bboxs (LiDARInstance3DBoxes): GT Bboxs. + gt_label (Tensor): GT labels for gt_bbox, [num_gt_bbox]. + match_dis_thresh (float): dis thresh for determine a positive sample for a gt bbox. + + Returns: + matched_bbox_result (np.array): assigned pred index for each gt box [num_gt_bbox]. + """ + dynamic_list = [0,1,3,4,6,7,8] + matched_bbox_result = torch.ones( + (len(gt_bbox)), dtype=torch.long) * -1 # -1: not assigned + gt_centers = gt_bbox.center[:, :2] + pred_centers = bbox_result['boxes_3d'].center[:, :2] + dist = torch.linalg.norm(pred_centers[:, None, :] - gt_centers[None, :, :], dim=-1) + pred_not_dyn = [label not in dynamic_list for label in bbox_result['labels_3d']] + gt_not_dyn = [label not in dynamic_list for label in gt_label] + dist[pred_not_dyn] = 1e6 + dist[:, gt_not_dyn] = 1e6 + dist[dist > match_dis_thresh] = 1e6 + + r_list, c_list = linear_sum_assignment(dist) + + for i in range(len(r_list)): + if dist[r_list[i], c_list[i]] <= match_dis_thresh: + matched_bbox_result[c_list[i]] = r_list[i] + + return matched_bbox_result + + def compute_motion_metric_vip3d( + self, + gt_bbox: object, + gt_label: object, + gt_attr_label: object, + pred_bbox: object, + matched_bbox_result: object, + mapped_class_names: object, + match_dis_thresh: object = 2.0, + ) -> object: + """Compute EPA metric for one sample. + Args: + gt_bboxs (LiDARInstance3DBoxes): GT Bboxs. + gt_label (Tensor): GT labels for gt_bbox, [num_gt_bbox]. + pred_bbox (dict): Predictions. + 'boxes_3d': (LiDARInstance3DBoxes) + 'scores_3d': (Tensor), [num_pred_bbox] + 'labels_3d': (Tensor), [num_pred_bbox] + 'trajs_3d': (Tensor), [fut_ts*2] + matched_bbox_result (np.array): assigned pred index for each gt box [num_gt_bbox]. + match_dis_thresh (float): dis thresh for determine a positive sample for a gt bbox. 
+ + Returns: + EPA_dict (dict): EPA metric dict of each cared class. + """ + motion_cls_names = ['car', 'pedestrian'] + motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit', + 'fp', 'ADE', 'FDE', 'MR'] + + metric_dict = {} + for met in motion_metric_names: + for cls in motion_cls_names: + metric_dict[met+'_'+cls] = 0.0 + + + + + # ignore_list = ['construction_vehicle', 'barrier', + # 'traffic_cone', 'motorcycle', 'bicycle'] + veh_list = [0, 1, 2, 3, 4, 6, 7] + ignore_list = ['barrier', 'traffic_cone'] + + for i in range(pred_bbox['labels_3d'].shape[0]): + pred_bbox['labels_3d'][i] = 0 if pred_bbox['labels_3d'][i] in veh_list else pred_bbox['labels_3d'][i] + box_name = mapped_class_names[pred_bbox['labels_3d'][i]] + if box_name in ignore_list: + continue + if i not in matched_bbox_result: + metric_dict['fp_'+box_name] += 1 + + for i in range(gt_label.shape[0]): + gt_label[i] = 0 if gt_label[i] in veh_list else gt_label[i] + box_name = mapped_class_names[gt_label[i]] + if box_name in ignore_list: + continue + gt_fut_masks = gt_attr_label[i][self.fut_ts*2:self.fut_ts*3] + num_valid_ts = sum(gt_fut_masks==1) + if num_valid_ts == self.fut_ts: + metric_dict['gt_'+box_name] += 1 + if matched_bbox_result[i] >= 0 and num_valid_ts > 0: + metric_dict['cnt_ade_'+box_name] += 1 + m_pred_idx = matched_bbox_result[i] + gt_fut_trajs = gt_attr_label[i][:self.fut_ts*2].reshape(-1, 2) + gt_fut_trajs = gt_fut_trajs[:num_valid_ts] + pred_fut_trajs = pred_bbox['trajs_3d'][m_pred_idx].reshape(self.fut_mode, self.fut_ts, 2) + pred_fut_trajs = pred_fut_trajs[:, :num_valid_ts, :] + gt_fut_trajs = gt_fut_trajs.cumsum(dim=-2) + pred_fut_trajs = pred_fut_trajs.cumsum(dim=-2) + gt_fut_trajs = gt_fut_trajs + gt_bbox[i].center[0, :2] + pred_fut_trajs = pred_fut_trajs + pred_bbox['boxes_3d'][int(m_pred_idx)].center[0, :2] + + dist = torch.linalg.norm(gt_fut_trajs[None, :, :] - pred_fut_trajs, dim=-1) + ade = dist.sum(-1) / num_valid_ts + ade = ade.min() + + metric_dict['ADE_'+box_name] += ade + if num_valid_ts == self.fut_ts: + fde = dist[:, -1].min() + metric_dict['cnt_fde_'+box_name] += 1 + metric_dict['FDE_'+box_name] += fde + if fde <= match_dis_thresh: + metric_dict['hit_'+box_name] += 1 + else: + metric_dict['MR_'+box_name] += 1 + + return metric_dict + + ### same planning metric as stp3 + def compute_planner_metric_stp3( + self, + pred_ego_fut_trajs, + gt_ego_fut_trajs, + gt_agent_boxes, + gt_agent_feats, + fut_valid_flag + ): + """Compute planner metric for one sample same as stp3.""" + metric_dict = { + 'plan_L2_1s':0, + 'plan_L2_2s':0, + 'plan_L2_3s':0, + 'plan_obj_col_1s':0, + 'plan_obj_col_2s':0, + 'plan_obj_col_3s':0, + 'plan_obj_box_col_1s':0, + 'plan_obj_box_col_2s':0, + 'plan_obj_box_col_3s':0, + } + metric_dict['fut_valid_flag'] = fut_valid_flag + future_second = 3 + assert pred_ego_fut_trajs.shape[0] == 1, 'only support bs=1' + if self.planning_metric is None: + self.planning_metric = PlanningMetric() + segmentation, pedestrian = self.planning_metric.get_label( + gt_agent_boxes, gt_agent_feats) + occupancy = torch.logical_or(segmentation, pedestrian) + + for i in range(future_second): + if fut_valid_flag: + cur_time = (i+1)*2 + traj_L2 = self.planning_metric.compute_L2( + pred_ego_fut_trajs[0, :cur_time].detach().to(gt_ego_fut_trajs.device), + gt_ego_fut_trajs[0, :cur_time] + ) + traj_L2_stp3 = self.planning_metric.compute_L2_stp3( + pred_ego_fut_trajs[0, :cur_time].detach().to(gt_ego_fut_trajs.device), + gt_ego_fut_trajs[0, :cur_time] + ) + obj_coll, obj_box_coll = 
self.planning_metric.evaluate_coll( + pred_ego_fut_trajs[:, :cur_time].detach(), + gt_ego_fut_trajs[:, :cur_time], + occupancy) + metric_dict['plan_L2_{}s'.format(i+1)] = traj_L2 + metric_dict['plan_L2_stp3_{}s'.format(i+1)] = traj_L2_stp3 + metric_dict['plan_obj_col_{}s'.format(i+1)] = obj_coll.mean().item() + metric_dict['plan_obj_col_stp3_{}s'.format(i + 1)] = obj_coll[-1].item() + metric_dict['plan_obj_box_col_{}s'.format(i+1)] = obj_box_coll.mean().item() + metric_dict['plan_obj_box_col_stp3_{}s'.format(i + 1)] = obj_box_coll[-1].item() + # if (i == 0): + # metric_dict['plan_1'] = obj_box_coll[0].item() + # metric_dict['plan_2'] = obj_box_coll[1].item() + # if (i == 1): + # metric_dict['plan_3'] = obj_box_coll[2].item() + # metric_dict['plan_4'] = obj_box_coll[3].item() + # if (i == 2): + # metric_dict['plan_5'] = obj_box_coll[4].item() + # metric_dict['plan_6'] = obj_box_coll[5].item() + else: + metric_dict['plan_L2_{}s'.format(i+1)] = 0.0 + metric_dict['plan_L2_stp3_{}s'.format(i + 1)] = 0.0 + metric_dict['plan_obj_col_{}s'.format(i+1)] = 0.0 + metric_dict['plan_obj_box_col_{}s'.format(i+1)] = 0.0 + + return metric_dict + + def set_epoch(self, epoch): + self.pts_bbox_head.epoch = epoch \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_head.py b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9d30933e35171927e860806db204407bc70838 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_head.py @@ -0,0 +1,2156 @@ +import copy +from math import pi, cos, sin + +import torch +import numpy as np +import torch.nn as nn +import matplotlib.pyplot as plt +import torch.nn.functional as F +from mmdet.models import HEADS, build_loss +from mmdet.models.dense_heads import DETRHead +from mmcv.runner import force_fp32, auto_fp16 +from mmcv.utils import TORCH_VERSION, digit_version +from mmdet.core import build_assigner, build_sampler +from mmdet3d.core.bbox.coders import build_bbox_coder +from mmdet.models.utils.transformer import inverse_sigmoid +from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh +from mmcv.cnn import Linear, bias_init_with_prob, xavier_init +from mmdet.core import (multi_apply, multi_apply, reduce_mean) +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence + +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from projects.mmdet3d_plugin.VAD.utils.traj_lr_warmup import get_traj_warmup_loss_weight +from projects.mmdet3d_plugin.VAD.utils.map_utils import ( + normalize_2d_pts, normalize_2d_bbox, denormalize_2d_pts, denormalize_2d_bbox +) + +from projects.mmdet3d_plugin.VAD.generator import DistributionModule, PredictModel +from projects.mmdet3d_plugin.VAD.generator import FuturePrediction + + +class MLP(nn.Module): + def __init__(self, in_channels, hidden_unit, verbose=False): + super(MLP, self).__init__() + self.mlp = nn.Sequential( + nn.Linear(in_channels, hidden_unit), + nn.LayerNorm(hidden_unit), + nn.ReLU() + ) + + def forward(self, x): + x = self.mlp(x) + return x + + +class LaneNet(nn.Module): + def __init__(self, in_channels, hidden_unit, num_subgraph_layers): + super(LaneNet, self).__init__() + self.num_subgraph_layers = num_subgraph_layers + self.layer_seq = nn.Sequential() + for i in range(num_subgraph_layers): + self.layer_seq.add_module( + f'lmlp_{i}', MLP(in_channels, hidden_unit)) + in_channels = hidden_unit * 2 + + def forward(self, pts_lane_feats): + ''' + Extract lane_feature from vectorized lane 
representation + + Args: + pts_lane_feats: [batch size, max_pnum, pts, D] + + Returns: + inst_lane_feats: [batch size, max_pnum, D] + ''' + x = pts_lane_feats + for name, layer in self.layer_seq.named_modules(): + if isinstance(layer, MLP): + # x [bs,max_lane_num,9,dim] + x = layer(x) + x_max = torch.max(x, -2)[0] + x_max = x_max.unsqueeze(2).repeat(1, 1, x.shape[2], 1) + x = torch.cat([x, x_max], dim=-1) + x_max = torch.max(x, -2)[0] + return x_max + + +@HEADS.register_module() +class VADHead(DETRHead): + """Head of VAD model. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. + """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + fut_ts=6, + fut_mode=6, + loss_traj=dict(type='L1Loss', loss_weight=0.25), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.8), + map_bbox_coder=None, + map_num_query=900, + map_num_classes=3, + map_num_vec=20, + map_num_pts_per_vec=2, + map_num_pts_per_gt_vec=2, + map_query_embed_type='all_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v0', + map_dir_interval=1, + map_code_size=None, + map_code_weights=None, + loss_map_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_map_pts=dict( + type='ChamferDistance', loss_src_weight=1.0, loss_dst_weight=1.0 + ), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=2.0), + loss_vae_gen=dict(type='ProbabilisticLoss', loss_weight=1.0), + tot_epoch=None, + use_traj_lr_warmup=False, + motion_decoder=None, + motion_map_decoder=None, + use_pe=False, + motion_det_score=None, + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + ego_his_encoder=None, + ego_fut_mode=3, + loss_plan_reg=dict(type='L1Loss', loss_weight=0.25), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=0.1), + loss_plan_col=dict(type='PlanAgentDisLoss', loss_weight=0.1), + loss_plan_dir=dict(type='PlanMapThetaLoss', loss_weight=0.1), + ego_agent_decoder=None, + ego_map_decoder=None, + query_thresh=None, + query_use_fix_pad=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + agent_dim=300, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + self.fut_ts = fut_ts + self.fut_mode = fut_mode + self.tot_epoch = tot_epoch + self.use_traj_lr_warmup = use_traj_lr_warmup + self.motion_decoder = motion_decoder + self.motion_map_decoder = motion_map_decoder + self.use_pe = use_pe + self.motion_det_score = motion_det_score + self.map_thresh = map_thresh + self.dis_thresh = dis_thresh + self.pe_normalization = pe_normalization + self.ego_his_encoder = ego_his_encoder + self.ego_fut_mode = ego_fut_mode + self.ego_agent_decoder = ego_agent_decoder + self.ego_map_decoder = ego_map_decoder + self.query_thresh = query_thresh + self.query_use_fix_pad = query_use_fix_pad + self.ego_lcf_feat_idx = ego_lcf_feat_idx + self.valid_fut_ts = valid_fut_ts + self.agent_dim = agent_dim + self.with_cur = True + + if loss_traj_cls['use_sigmoid'] == True: + 
self.traj_num_cls = 1 + else: + self.traj_num_cls = 2 + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + if map_code_size is not None: + self.map_code_size = map_code_size + else: + self.map_code_size = 10 + if map_code_weights is not None: + self.map_code_weights = map_code_weights + else: + self.map_code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + + self.map_bbox_coder = build_bbox_coder(map_bbox_coder) + self.map_query_embed_type = map_query_embed_type + self.map_transform_method = map_transform_method + self.map_gt_shift_pts_pattern = map_gt_shift_pts_pattern + map_num_query = map_num_vec * map_num_pts_per_vec + self.map_num_query = map_num_query + self.map_num_classes = map_num_classes + self.map_num_vec = map_num_vec + self.map_num_pts_per_vec = map_num_pts_per_vec + self.map_num_pts_per_gt_vec = map_num_pts_per_gt_vec + self.map_dir_interval = map_dir_interval + + if loss_map_cls['use_sigmoid'] == True: + self.map_cls_out_channels = map_num_classes + else: + self.map_cls_out_channels = map_num_classes + 1 + + self.map_bg_cls_weight = 0 + map_class_weight = loss_map_cls.get('class_weight', None) + if map_class_weight is not None and (self.__class__ is VADHead): + assert isinstance(map_class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(map_class_weight)}.' + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + map_bg_cls_weight = loss_map_cls.get('bg_cls_weight', map_class_weight) + assert isinstance(map_bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(map_bg_cls_weight)}.' + map_class_weight = torch.ones(map_num_classes + 1) * map_class_weight + # set background class as the last indice + map_class_weight[map_num_classes] = map_bg_cls_weight + loss_map_cls.update({'class_weight': map_class_weight}) + if 'bg_cls_weight' in loss_map_cls: + loss_map_cls.pop('bg_cls_weight') + self.map_bg_cls_weight = map_bg_cls_weight + + self.traj_bg_cls_weight = 0 + + super(VADHead, self).__init__(*args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + self.map_code_weights = nn.Parameter(torch.tensor( + self.map_code_weights, requires_grad=False), requires_grad=False) + + if kwargs['train_cfg'] is not None: + assert 'map_assigner' in kwargs['train_cfg'], 'map assigner should be provided ' \ + 'when train_cfg is set.' + map_assigner = kwargs['train_cfg']['map_assigner'] + assert loss_map_cls['loss_weight'] == map_assigner['cls_cost']['weight'], \ + 'The classification weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_bbox['loss_weight'] == map_assigner['reg_cost'][ + 'weight'], 'The regression L1 weight for loss and matcher ' \ + 'should be exactly the same.' 
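A worked example of the map classification class-weight handling built earlier in this constructor (values mirror the defaults in the head's signature: `map_num_classes=3`, `class_weight=1.0`, `bg_cls_weight=0.1`; the snippet is illustrative only): the weight vector gets one extra entry for the implicit background class, placed at the last index.

# Illustrative only: reproduce the class_weight vector assembled for loss_map_cls.
import torch

map_num_classes = 3
class_weight = 1.0
map_bg_cls_weight = 0.1

weights = torch.ones(map_num_classes + 1) * class_weight
weights[map_num_classes] = map_bg_cls_weight   # background is the last index
print(weights)  # tensor([1.0000, 1.0000, 1.0000, 0.1000])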
+ assert loss_map_iou['loss_weight'] == map_assigner['iou_cost']['weight'], \ + 'The regression iou weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_pts['loss_weight'] == map_assigner['pts_cost']['weight'], \ + 'The regression l1 weight for map pts loss and matcher should be' \ + 'exactly the same.' + + self.map_assigner = build_assigner(map_assigner) + # DETR sampling=False, so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.map_sampler = build_sampler(sampler_cfg, context=self) + + self.loss_traj = build_loss(loss_traj) + self.loss_traj_cls = build_loss(loss_traj_cls) + self.loss_map_bbox = build_loss(loss_map_bbox) + self.loss_map_cls = build_loss(loss_map_cls) + self.loss_map_iou = build_loss(loss_map_iou) + self.loss_map_pts = build_loss(loss_map_pts) + self.loss_map_dir = build_loss(loss_map_dir) + self.loss_plan_reg = build_loss(loss_plan_reg) + self.loss_plan_bound = build_loss(loss_plan_bound) + self.loss_plan_col = build_loss(loss_plan_col) + self.loss_plan_dir = build_loss(loss_plan_dir) + self.loss_vae_gen = build_loss(loss_vae_gen) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + cls_branch = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + traj_branch = [] + if self.with_cur: + traj_in_dim = self.embed_dims * 4 + else: + traj_in_dim = self.embed_dims * 2 + for _ in range(self.num_reg_fcs): + traj_branch.append(Linear(traj_in_dim, traj_in_dim)) + traj_branch.append(nn.ReLU()) + traj_branch.append(Linear(traj_in_dim, 2)) + traj_branch = nn.Sequential(*traj_branch) + + traj_cls_branch = [] + # for _ in range(self.num_reg_fcs): + traj_cls_branch.append(Linear(self.embed_dims * 14, self.embed_dims * 2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims * 2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims * 2, self.embed_dims * 2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims * 2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims * 2, self.traj_num_cls)) + traj_cls_branch = nn.Sequential(*traj_cls_branch) + + map_cls_branch = [] + for _ in range(self.num_reg_fcs): + map_cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_cls_branch.append(nn.LayerNorm(self.embed_dims)) + map_cls_branch.append(nn.ReLU(inplace=True)) + map_cls_branch.append(Linear(self.embed_dims, self.map_cls_out_channels)) + map_cls_branch = nn.Sequential(*map_cls_branch) + + map_reg_branch = [] + for _ in range(self.num_reg_fcs): + map_reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_reg_branch.append(nn.ReLU()) + map_reg_branch.append(Linear(self.embed_dims, self.map_code_size)) + map_reg_branch = nn.Sequential(*map_reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
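The prediction branches defined above are duplicated with the local `_get_clones` helper so that, when box refinement is enabled, every decoder layer owns its own parameters instead of sharing one head. A minimal sketch of that pattern (sizes are toy assumptions; `embed_dims` normally comes from the base DETR head config):

# Illustrative only: deep-copy a branch once per decoder layer so the clones
# do not share parameters (this is what with_box_refine relies on).
import copy
import torch.nn as nn

def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

embed_dims, num_classes, num_layers = 256, 10, 3  # toy sizes
cls_branch = nn.Sequential(
    nn.Linear(embed_dims, embed_dims),
    nn.LayerNorm(embed_dims),
    nn.ReLU(inplace=True),
    nn.Linear(embed_dims, num_classes))

cls_branches = _get_clones(cls_branch, num_layers)
assert all(b is not cls_branch for b in cls_branches)               # independent modules
assert cls_branches[0][0].weight is not cls_branches[1][0].weight   # no weight sharing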
+ num_decoder_layers = 1 + num_map_decoder_layers = 1 + if self.transformer.decoder is not None: + num_decoder_layers = self.transformer.decoder.num_layers + if self.transformer.map_decoder is not None: + num_map_decoder_layers = self.transformer.map_decoder.num_layers + num_motion_decoder_layers = 1 + num_pred = (num_decoder_layers + 1) if \ + self.as_two_stage else num_decoder_layers + motion_num_pred = (num_motion_decoder_layers + 1) if \ + self.as_two_stage else num_motion_decoder_layers + map_num_pred = (num_map_decoder_layers + 1) if \ + self.as_two_stage else num_map_decoder_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(cls_branch, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + self.traj_branches = _get_clones(traj_branch, motion_num_pred) + self.traj_cls_branches = _get_clones(traj_cls_branch, motion_num_pred) + self.map_cls_branches = _get_clones(map_cls_branch, map_num_pred) + self.map_reg_branches = _get_clones(map_reg_branch, map_num_pred) + else: + self.cls_branches = nn.ModuleList( + [cls_branch for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + self.traj_branches = nn.ModuleList( + [traj_branch for _ in range(motion_num_pred)]) + self.traj_cls_branches = nn.ModuleList( + [traj_cls_branch for _ in range(motion_num_pred)]) + self.map_cls_branches = nn.ModuleList( + [map_cls_branch for _ in range(map_num_pred)]) + self.map_reg_branches = nn.ModuleList( + [map_reg_branch for _ in range(map_num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + if self.map_query_embed_type == 'all_pts': + self.map_query_embedding = nn.Embedding(self.map_num_query, + self.embed_dims * 2) + elif self.map_query_embed_type == 'instance_pts': + self.map_query_embedding = None + self.map_instance_embedding = nn.Embedding(self.map_num_vec, self.embed_dims * 2) + self.map_pts_embedding = nn.Embedding(self.map_num_pts_per_vec, self.embed_dims * 2) + + if self.motion_decoder is not None: + self.motion_decoder = build_transformer_layer_sequence(self.motion_decoder) + self.motion_mode_query = nn.Embedding(self.fut_mode, self.embed_dims) + self.motion_mode_query.weight.requires_grad = True + if self.use_pe: + self.pos_mlp_sa = nn.Linear(2, self.embed_dims) + else: + raise NotImplementedError('Not implement yet') + + if self.motion_map_decoder is not None: + self.lane_encoder = LaneNet(256, 128, 3) + self.motion_map_decoder = build_transformer_layer_sequence(self.motion_map_decoder) + if self.use_pe: + self.pos_mlp = nn.Linear(2, self.embed_dims) + + if self.ego_his_encoder is not None: + self.ego_his_encoder = LaneNet(2, self.embed_dims // 2, 3) + else: + self.ego_query = nn.Embedding(1, self.embed_dims) + + if self.ego_agent_decoder is not None: + self.ego_agent_decoder = build_transformer_layer_sequence(self.ego_agent_decoder) + if self.use_pe: + self.ego_agent_pos_mlp = nn.Linear(2, self.embed_dims) + + if self.ego_map_decoder is not None: + self.ego_map_decoder = build_transformer_layer_sequence(self.ego_map_decoder) + if self.use_pe: + self.ego_map_pos_mlp = nn.Linear(2, self.embed_dims) + + ego_fut_decoder = [] + ego_fut_dec_in_dim = self.embed_dims * 2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims * 2 + if self.with_cur: + ego_fut_dec_in_dim = int(ego_fut_dec_in_dim * 2) + for _ in range(self.num_reg_fcs): + 
ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, ego_fut_dec_in_dim)) + ego_fut_decoder.append(nn.ReLU()) + ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, self.ego_fut_mode * 2)) + self.ego_fut_decoder = nn.Sequential(*ego_fut_decoder) + + self.agent_fus_mlp = nn.Sequential( + nn.Linear(self.fut_mode * 2 * self.embed_dims, self.embed_dims, bias=True), + nn.LayerNorm(self.embed_dims), + nn.ReLU(), + nn.Linear(self.embed_dims, self.embed_dims, bias=True)) + + ######################################################### + self.ego_coord_mlp = nn.Linear(2, 2) + + self.layer_dim = 4 + self.state_gru = nn.GRU(input_size=32, hidden_size=512, num_layers=self.layer_dim) + + self.ego_gru = nn.GRU(512, 512, 4) + self.motion_gru = nn.GRU(512, 512, 4) + + # motion head + + traj_branch_ar = [] + for _ in range(self.num_reg_fcs): + traj_branch_ar.append(Linear(self.embed_dims * 2, self.embed_dims * 2)) + traj_branch_ar.append(nn.ReLU()) + traj_branch_ar.append(Linear(self.embed_dims * 2, 2)) + traj_branch_ar = nn.Sequential(*traj_branch_ar) + + traj_cls_branch_ar = [] + for _ in range(self.num_reg_fcs): + traj_cls_branch_ar.append(Linear(self.embed_dims * 2, self.embed_dims * 2)) + traj_cls_branch_ar.append(nn.LayerNorm(self.embed_dims * 2)) + traj_cls_branch_ar.append(nn.ReLU(inplace=True)) + traj_cls_branch_ar.append(Linear(self.embed_dims * 2, self.traj_num_cls)) + traj_cls_branch_ar = nn.Sequential(*traj_cls_branch_ar) + + if self.with_box_refine: + self.traj_branches_ar = _get_clones(traj_branch_ar, motion_num_pred) + self.traj_cls_branches_ar = _get_clones(traj_cls_branch_ar, motion_num_pred) + else: + self.traj_branches_ar = nn.ModuleList( + [traj_branch_ar for _ in range(motion_num_pred)]) + self.traj_cls_branches_ar = nn.ModuleList( + [traj_cls_branch_ar for _ in range(motion_num_pred)]) + + # planning head + ego_fut_decoder_ar = [] + ego_fut_dec_in_dim_ar = self.embed_dims * 2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims * 2 + for _ in range(self.num_reg_fcs): + ego_fut_decoder_ar.append(Linear(ego_fut_dec_in_dim_ar, ego_fut_dec_in_dim_ar)) + ego_fut_decoder_ar.append(nn.ReLU()) + ego_fut_decoder_ar.append(Linear(ego_fut_dec_in_dim_ar, self.ego_fut_mode * 2)) + self.ego_fut_decoder_ar = nn.Sequential(*ego_fut_decoder_ar) + + self.ar = True + + # generator motion & planning + self.present_distribution_in_channels = 512 + self.future_distribution_in_channels = 524 + self.now_pred_in_channels = 64 + self.PROBABILISTIC = True + self.latent_dim = 32 + self.MIN_LOG_SIGMA = -5.0 + self.MAX_LOG_SIGMA = 5.0 + self.FUTURE_DIM = 6 + self.N_GRU_BLOCKS = 3 + self.N_RES_LAYERS = 3 + + self.present_distribution = DistributionModule( + self.present_distribution_in_channels, + self.latent_dim, + min_log_sigma=self.MIN_LOG_SIGMA, + max_log_sigma=self.MAX_LOG_SIGMA, + ) + + # future_distribution_in_channels = (self.future_pred_in_channels + # + 4 * self.FUTURE_DIM + # ) + self.future_distribution = DistributionModule( + self.future_distribution_in_channels, + self.latent_dim, + min_log_sigma=self.MIN_LOG_SIGMA, + max_log_sigma=self.MAX_LOG_SIGMA, + ) + + # Future prediction + self.future_prediction = FuturePrediction( + in_channels=self.present_distribution_in_channels, + latent_dim=self.latent_dim, + n_gru_blocks=self.N_GRU_BLOCKS, + n_res_layers=self.N_RES_LAYERS, + ) + + self.predict_model = PredictModel( + in_channels=self.latent_dim, + out_channels=self.embed_dims * 2, + hidden_channels=self.latent_dim * 4, + num_layers=self.layer_dim + ) + + def 
init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_map_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.map_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_traj_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.traj_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + # for m in self.map_reg_branches: + # constant_init(m[-1], 0, bias=0) + # nn.init.constant_(self.map_reg_branches[0][-1].bias.data[2:], 0.) + if self.motion_decoder is not None: + for p in self.motion_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + nn.init.orthogonal_(self.motion_mode_query.weight) + if self.use_pe: + xavier_init(self.pos_mlp_sa, distribution='uniform', bias=0.) + if self.motion_map_decoder is not None: + for p in self.motion_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for p in self.lane_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.use_pe: + xavier_init(self.pos_mlp, distribution='uniform', bias=0.) + if self.ego_his_encoder is not None: + for p in self.ego_his_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_agent_decoder is not None: + for p in self.ego_agent_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_map_decoder is not None: + for p in self.ego_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + # @auto_fp16(apply_to=('mlvl_feats')) + + # @auto_fp16(apply_to=('mlvl_feats')) + @force_fp32(apply_to=('mlvl_feats', 'prev_bev')) + def forward(self, + mlvl_feats, + img_metas, + prev_bev=None, + only_bev=False, + ego_his_trajs=None, + ego_lcf_feat=None, + gt_labels_3d=None, + gt_attr_labels=None, + ego_fut_trajs=None, + ): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
+ """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + + if self.map_query_embed_type == 'all_pts': + map_query_embeds = self.map_query_embedding.weight.to(dtype) + elif self.map_query_embed_type == 'instance_pts': + map_pts_embeds = self.map_pts_embedding.weight.unsqueeze(0) + map_instance_embeds = self.map_instance_embedding.weight.unsqueeze(1) + map_query_embeds = (map_pts_embeds + map_instance_embeds).flatten(0, 1).to(dtype) + + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + map_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + map_reg_branches=self.map_reg_branches if self.with_box_refine else None, # noqa:E501 + map_cls_branches=self.map_cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + # bev_embed: bev features + # hs: agent_query + # init_reference: reference points init + # inter_references: reference points processing + # map_hs: map_query + # map_init_reference: reference points init + # map_inter_references: reference points processing + + bev_embed, hs, init_reference, inter_references, \ + map_hs, map_init_reference, map_inter_references = outputs + + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + outputs_coords_bev = [] + outputs_trajs = [] + outputs_trajs_classes = [] + + map_hs = map_hs.permute(0, 2, 1, 3) + map_outputs_classes = [] + map_outputs_coords = [] + map_outputs_pts_coords = [] + map_outputs_coords_bev = [] + + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] = tmp[..., 0:2] + reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + outputs_coords_bev.append(tmp[..., 0:2].clone().detach()) + tmp[..., 4:5] = tmp[..., 4:5] + reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + for lvl in range(map_hs.shape[0]): + if lvl == 0: + reference = map_init_reference + else: + reference = map_inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + map_outputs_class = 
self.map_cls_branches[lvl]( + map_hs[lvl].view(bs, self.map_num_vec, self.map_num_pts_per_vec, -1).mean(2) + ) + tmp = self.map_reg_branches[lvl](map_hs[lvl]) + # TODO: check the shape of reference + assert reference.shape[-1] == 2 + tmp[..., 0:2] += reference[..., 0:2] + tmp = tmp.sigmoid() # cx,cy,w,h + map_outputs_coord, map_outputs_pts_coord = self.map_transform_box(tmp) + map_outputs_coords_bev.append(map_outputs_pts_coord.clone().detach()) + map_outputs_classes.append(map_outputs_class) + map_outputs_coords.append(map_outputs_coord) + map_outputs_pts_coords.append(map_outputs_pts_coord) + + # motion prediction + + # motion query + if self.motion_decoder is not None: + batch_size, num_agent = outputs_coords_bev[-1].shape[:2] + # motion_query + motion_query = hs[-1].permute(1, 0, 2) # [A, B, D] + mode_query = self.motion_mode_query.weight # [fut_mode, D] + # [M, B, D], M=A*fut_mode + motion_query = (motion_query[:, None, :, :] + mode_query[None, :, None, :]).flatten(0, 1) + if self.use_pe: + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_pos = self.pos_mlp_sa(motion_coords) # [B, A, D] + motion_pos = motion_pos.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + motion_pos = motion_pos.permute(1, 0, 2) # [M, B, D] + else: + motion_pos = None + + if self.motion_det_score is not None: + motion_score = outputs_classes[-1] + max_motion_score = motion_score.max(dim=-1)[0] + invalid_motion_idx = max_motion_score < self.motion_det_score # [B, A] + invalid_motion_idx = invalid_motion_idx.unsqueeze(2).repeat(1, 1, self.fut_mode).flatten(1, 2) + else: + invalid_motion_idx = None + + # ego query + # batch = batch_size + if self.ego_his_encoder is not None: + ego_his_feats = self.ego_his_encoder(ego_his_trajs) # [B, 1, dim] + else: + ego_his_feats = self.ego_query.weight.unsqueeze(0).repeat(batch_size, 1, 1) + # ego <-> agent Interaction + ego_query = ego_his_feats.permute(1, 0, 2) + ego_pos = torch.zeros((batch_size, 1, 2), device=ego_query.device).permute(1, 0, 2) + ego_pos_emb = self.ego_agent_pos_mlp(ego_pos) + + motion_query = torch.cat([motion_query, ego_query], dim=0) + motion_pos = torch.cat([motion_pos, ego_pos_emb], dim=0) + + motion_hs = self.motion_decoder( + query=motion_query, + key=motion_query, + value=motion_query, + query_pos=motion_pos, + key_pos=motion_pos, + key_padding_mask=invalid_motion_idx) + + if self.motion_map_decoder is not None: + # map preprocess + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_coords = motion_coords.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + + # ego_coords = torch.Tensor(1, 1, 2).cuda(1) + ego_coords = torch.zeros([batch_size, 1, 2], device=motion_hs.device) + ego_coords_embd = self.ego_coord_mlp(ego_coords) + # ego_coords_embd = torch.zeros([batch_size, 1, 2], device=motion_hs.device) + motion_coords = torch.cat([motion_coords, ego_coords_embd], dim=1) + + map_query = map_hs[-1].view(batch_size, self.map_num_vec, self.map_num_pts_per_vec, -1) + map_query = self.lane_encoder(map_query) # [B, P, pts, D] -> [B, P, D] + map_score = map_outputs_classes[-1] + map_pos = map_outputs_coords_bev[-1] + map_query, map_pos, key_padding_mask = self.select_and_pad_pred_map( + motion_coords, map_query, map_score, map_pos, + map_thresh=self.map_thresh, dis_thresh=self.dis_thresh, + pe_normalization=self.pe_normalization, use_fix_pad=True) + map_query = map_query.permute(1, 0, 2) # [P, B*M, D] + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + # position encoding + if self.use_pe: + 
(num_query, batch) = ca_motion_query.shape[:2] + motion_pos = torch.zeros((num_query, batch, 2), device=motion_hs.device) + motion_pos = self.pos_mlp(motion_pos) + map_pos = map_pos.permute(1, 0, 2) + map_pos = self.pos_mlp(map_pos) + else: + motion_pos, map_pos = None, None + + ca_motion_query = self.motion_map_decoder( + query=ca_motion_query, + key=map_query, + value=map_query, + query_pos=motion_pos, + key_pos=map_pos, + key_padding_mask=key_padding_mask) + else: + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + ######################################## + # generator for planning & motion + current_states = torch.cat((motion_hs.permute(1, 0, 2), + ca_motion_query.reshape(batch_size, -1, self.embed_dims)), dim=2) + distribution_comp = {} + # states = torch.randn((2, 1, 64, 200, 200), device=motion_hs.device) + # future_distribution_inputs = torch.randn((2, 5, 6, 200, 200), device=motion_hs.device) + noise = None + if self.training: + future_distribution_inputs = self.get_future_labels(gt_labels_3d, gt_attr_labels, + ego_fut_trajs, motion_hs.device) + else: + future_distribution_inputs = None + + # 1. model CVA distribution for state + if self.fut_ts > 0: + # present_state = states[:, :1].contiguous() + if self.PROBABILISTIC: + # Do probabilistic computation + sample, output_distribution = self.distribution_forward( + current_states, future_distribution_inputs, noise + ) + distribution_comp = {**distribution_comp, **output_distribution} + + # 2. predict future state from distribution + hidden_states = current_states + states_hs, future_states_hs = \ + self.future_states_predict(batch_size, sample, hidden_states, current_states) + + ego_query_hs = \ + states_hs[:, :, self.agent_dim * self.fut_mode, :].unsqueeze(1).permute(0, 2, 1, 3) + motion_query_hs = states_hs[:, :, 0:self.agent_dim * self.fut_mode, :] + motion_query_hs = \ + motion_query_hs.reshape(self.fut_ts, batch_size, -1, self.fut_ts, motion_query_hs.shape[-1]) + ego_fut_trajs_list = [] + motion_fut_trajs_list = [] + for i in range(self.fut_ts): + outputs_ego_trajs = self.ego_fut_decoder(ego_query_hs[i]).reshape(batch_size, self.ego_fut_mode, 2) + ego_fut_trajs_list.append(outputs_ego_trajs) + outputs_agent_trajs = self.traj_branches[0](motion_query_hs[i]) + motion_fut_trajs_list.append(outputs_agent_trajs) + + ego_trajs = torch.stack(ego_fut_trajs_list, dim=2) + agent_trajs = torch.stack(motion_fut_trajs_list, dim=3).reshape(batch_size, 1, self.agent_dim, + self.fut_mode, -1) + + motion_cls_hs = torch.cat((future_states_hs[:, :, 0:self.agent_dim * self.fut_mode, :]. + reshape(batch_size, self.agent_dim, self.fut_mode, -1), + current_states[:, 0:self.agent_dim * self.fut_mode, :]. 
+ reshape(batch_size, self.agent_dim, self.fut_mode, -1)), dim=-1) + + # outputs_traj_class = self.traj_cls_branches[0](motion_query_hs) + + # outputs_traj = self.traj_branches[0](motion_hs) + # outputs_trajs.append(outputs_traj) + outputs_traj_class = self.traj_cls_branches[0](motion_cls_hs) + outputs_trajs_classes.append(outputs_traj_class.squeeze(-1)) + + map_outputs_classes = torch.stack(map_outputs_classes) + map_outputs_coords = torch.stack(map_outputs_coords) + map_outputs_pts_coords = torch.stack(map_outputs_pts_coords) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + outputs_trajs = agent_trajs.permute(1, 0, 2, 3, 4) + outputs_trajs_classes = torch.stack(outputs_trajs_classes) + + # print(future_states.shape) + + # Ego prediction + # ego_feats [1, 1, 512] + # outputs_ego_trajs = self.ego_fut_decoder(ego_feats) + # outputs_ego_trajs = outputs_ego_trajs.reshape(outputs_ego_trajs.shape[0], + # self.ego_fut_mode, self.fut_ts, 2) + + outs = { + 'bev_embed': bev_embed, # torch.Size([10000, 1, 256]) + 'all_cls_scores': outputs_classes, # torch.Size([3, 1, 300, 10]) + 'all_bbox_preds': outputs_coords, # torch.Size([3, 1, 300, 10]) + 'all_traj_preds': outputs_trajs.repeat(outputs_coords.shape[0], 1, 1, 1, 1), + # torch.Size([3, 1, 300, 6, 12]) + 'all_traj_cls_scores': outputs_trajs_classes.repeat(outputs_coords.shape[0], 1, 1, 1), + # torch.Size([3, 1, 300, 6]) + 'map_all_cls_scores': map_outputs_classes, # torch.Size([3, 1, 100, 3]) map_outputs_classes + 'map_all_bbox_preds': map_outputs_coords, # torch.Size([3, 1, 100, 4]) map_outputs_coords + 'map_all_pts_preds': map_outputs_pts_coords, # torch.Size([3, 1, 100, 20, 2]) + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + 'map_enc_cls_scores': None, + 'map_enc_bbox_preds': None, + 'map_enc_pts_preds': None, + 'ego_fut_preds': ego_trajs, # torch.Size([1, 3, 6, 2]) + 'loss_vae_gen': distribution_comp, + } + + return outs + + def map_transform_box(self, pts, y_first=False): + """ + Converting the points set into bounding box. + + Args: + pts: the input points sets (fields), each points + set (fields) is represented as 2n scalar. + y_first: if y_fisrt=True, the point set is represented as + [y1, x1, y2, x2 ... yn, xn], otherwise the point set is + represented as [x1, y1, x2, y2 ... xn, yn]. + Returns: + The bbox [cx, cy, w, h] transformed from points. + """ + pts_reshape = pts.view(pts.shape[0], self.map_num_vec, + self.map_num_pts_per_vec, 2) + pts_y = pts_reshape[:, :, :, 0] if y_first else pts_reshape[:, :, :, 1] + pts_x = pts_reshape[:, :, :, 1] if y_first else pts_reshape[:, :, :, 0] + if self.map_transform_method == 'minmax': + # import pdb;pdb.set_trace() + + xmin = pts_x.min(dim=2, keepdim=True)[0] + xmax = pts_x.max(dim=2, keepdim=True)[0] + ymin = pts_y.min(dim=2, keepdim=True)[0] + ymax = pts_y.max(dim=2, keepdim=True)[0] + bbox = torch.cat([xmin, ymin, xmax, ymax], dim=2) + bbox = bbox_xyxy_to_cxcywh(bbox) + else: + raise NotImplementedError + return bbox, pts_reshape + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_attr_labels, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. 
+ bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 10]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 9) in [x,y,z,w,l,h,yaw,vx,vy] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_fut_trajs = gt_attr_labels[:, :self.fut_ts * 2] + gt_fut_masks = gt_attr_labels[:, self.fut_ts * 2:self.fut_ts * 3] + gt_bbox_c = gt_bboxes.shape[-1] + num_gt_bbox, gt_traj_c = gt_fut_trajs.shape + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_bbox_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # trajs targets + traj_targets = torch.zeros((num_bboxes, gt_traj_c), dtype=torch.float32, device=bbox_pred.device) + traj_weights = torch.zeros_like(traj_targets) + traj_targets[pos_inds] = gt_fut_trajs[sampling_result.pos_assigned_gt_inds] + traj_weights[pos_inds] = 1.0 + + # Filter out invalid fut trajs + traj_masks = torch.zeros_like(traj_targets) # [num_bboxes, fut_ts*2] + gt_fut_masks = gt_fut_masks.unsqueeze(-1).repeat(1, 1, 2).view(num_gt_bbox, -1) # [num_gt_bbox, fut_ts*2] + traj_masks[pos_inds] = gt_fut_masks[sampling_result.pos_assigned_gt_inds] + traj_weights = traj_weights * traj_masks + + # Extra future timestamp mask for controlling pred horizon + fut_ts_mask = torch.zeros((num_bboxes, self.fut_ts, 2), + dtype=torch.float32, device=bbox_pred.device) + fut_ts_mask[:, :self.valid_fut_ts, :] = 1.0 + fut_ts_mask = fut_ts_mask.view(num_bboxes, -1) + traj_weights = traj_weights * fut_ts_mask + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + + return ( + labels, label_weights, bbox_targets, bbox_weights, traj_targets, + traj_weights, traj_masks.view(-1, self.fut_ts, 2)[..., 0], + pos_inds, neg_inds + ) + + def _map_get_target_single(self, + cls_score, + bbox_pred, + pts_pred, + gt_labels, + gt_bboxes, + gt_shifts_pts, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. 
+ gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + assign_result, order_index = self.map_assigner.assign(bbox_pred, cls_score, pts_pred, + gt_bboxes, gt_labels, gt_shifts_pts, + gt_bboxes_ignore) + + sampling_result = self.map_sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.map_num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + # pts targets + if order_index is None: + assigned_shift = gt_labels[sampling_result.pos_assigned_gt_inds] + else: + assigned_shift = order_index[sampling_result.pos_inds, sampling_result.pos_assigned_gt_inds] + pts_targets = pts_pred.new_zeros((pts_pred.size(0), + pts_pred.size(1), pts_pred.size(2))) + pts_weights = torch.zeros_like(pts_targets) + pts_weights[pos_inds] = 1.0 + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + pts_targets[pos_inds] = gt_shifts_pts[sampling_result.pos_assigned_gt_inds, assigned_shift, :, :] + return (labels, label_weights, bbox_targets, bbox_weights, + pts_targets, pts_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. 
+ - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, traj_targets_list, traj_weights_list, + gt_fut_masks_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_attr_labels_list, gt_bboxes_ignore_list + ) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, num_total_pos, num_total_neg) + + def map_get_targets(self, + cls_scores_list, + bbox_preds_list, + pts_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + pos_inds_list, neg_inds_list) = multi_apply( + self._map_get_target_single, cls_scores_list, bbox_preds_list, pts_preds_list, + gt_labels_list, gt_bboxes_list, gt_shifts_pts_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) + + def loss_planning(self, + ego_fut_preds, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + lane_preds, + lane_score_preds, + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds): + """"Loss function for ego vehicle planning. 
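+        The planning objective combines an L1 imitation term with map-boundary,
+        agent-collision and lane-direction constraints. ``ego_fut_cmd`` and
+        ``ego_fut_masks`` weight the L1 term, and only the command-selected
+        trajectory mode is passed to the constraint losses.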
+ Args: + ego_fut_preds (Tensor): [B, ego_fut_mode, fut_ts, 2] + ego_fut_gt (Tensor): [B, fut_ts, 2] + ego_fut_masks (Tensor): [B, fut_ts] + ego_fut_cmd (Tensor): [B, ego_fut_mode] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + agent_preds (Tensor): [B, num_agent, 2] + agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2] + agent_score_preds (Tensor): [B, num_agent, 10] + agent_fut_cls_scores (Tensor): [B, num_agent, fut_mode] + Returns: + loss_plan_reg (Tensor): planning reg loss. + loss_plan_bound (Tensor): planning map boundary constraint loss. + loss_plan_col (Tensor): planning col constraint loss. + loss_plan_dir (Tensor): planning directional constraint loss. + """ + + ego_fut_gt = ego_fut_gt.unsqueeze(1).repeat(1, self.ego_fut_mode, 1, 1) + loss_plan_l1_weight = ego_fut_cmd[..., None, None] * ego_fut_masks[:, None, :, None] + loss_plan_l1_weight = loss_plan_l1_weight.repeat(1, 1, 1, 2) + + loss_plan_l1 = self.loss_plan_reg( + ego_fut_preds, + ego_fut_gt, + loss_plan_l1_weight + ) + + loss_plan_bound = self.loss_plan_bound( + ego_fut_preds[ego_fut_cmd == 1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + loss_plan_col = self.loss_plan_col( + ego_fut_preds[ego_fut_cmd == 1], + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds, + weight=ego_fut_masks[:, :, None].repeat(1, 1, 2) + ) + + loss_plan_dir = self.loss_plan_dir( + ego_fut_preds[ego_fut_cmd == 1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_plan_l1 = torch.nan_to_num(loss_plan_l1) + loss_plan_bound = torch.nan_to_num(loss_plan_bound) + loss_plan_col = torch.nan_to_num(loss_plan_col) + loss_plan_dir = torch.nan_to_num(loss_plan_dir) + + loss_plan_dict = dict() + loss_plan_dict['loss_plan_reg'] = loss_plan_l1 + loss_plan_dict['loss_plan_bound'] = loss_plan_bound + loss_plan_dict['loss_plan_col'] = loss_plan_col + loss_plan_dict['loss_plan_dir'] = loss_plan_dir + + return loss_plan_dict + + def loss_single(self, + cls_scores, + bbox_preds, + traj_preds, + traj_cls_preds, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
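+        Note:
+            This head returns the losses as the tuple
+            ``(loss_cls, loss_bbox, loss_traj, loss_traj_cls)``; they are
+            gathered into a loss dict by :func:`loss`.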
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_attr_labels_list, gt_bboxes_ignore_list) + + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + traj_targets = torch.cat(traj_targets_list, 0) + traj_weights = torch.cat(traj_weights_list, 0) + gt_fut_masks = torch.cat(gt_fut_masks_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], + normalized_bbox_targets[isnotnan, :10], + bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + + # traj regression loss + best_traj_preds = self.get_best_fut_preds( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), gt_fut_masks) + + neg_inds = (bbox_weights[:, 0] == 0) + traj_labels = self.get_traj_cls_target( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), + gt_fut_masks, neg_inds) + + loss_traj = self.loss_traj( + best_traj_preds[isnotnan], + traj_targets[isnotnan], + traj_weights[isnotnan], + avg_factor=num_total_pos) + + if self.use_traj_lr_warmup: + loss_scale_factor = get_traj_warmup_loss_weight(self.epoch, self.tot_epoch) + loss_traj = loss_scale_factor * loss_traj + + # traj classification loss + traj_cls_scores = traj_cls_preds.reshape(-1, self.fut_mode) + # construct weighted avg_factor to match with the official DETR repo + traj_cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.traj_bg_cls_weight + if self.sync_cls_avg_factor: + traj_cls_avg_factor = reduce_mean( + traj_cls_scores.new_tensor([traj_cls_avg_factor])) + + traj_cls_avg_factor = max(traj_cls_avg_factor, 1) + loss_traj_cls = self.loss_traj_cls( + traj_cls_scores, traj_labels, label_weights, avg_factor=traj_cls_avg_factor + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_traj = torch.nan_to_num(loss_traj) + loss_traj_cls = torch.nan_to_num(loss_traj_cls) + + return loss_cls, loss_bbox, loss_traj, loss_traj_cls + + def get_best_fut_preds(self, + traj_preds, + 
traj_targets, + gt_fut_masks): + """"Choose best preds among all modes. + Args: + traj_preds (Tensor): MultiModal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2). + traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2). + gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts). + pred_box_centers (Tensor): Pred box centers with shape (num_box_preds, 2). + gt_box_centers (Tensor): Ground truth box centers with shape (num_box_preds, 2). + + Returns: + best_traj_preds (Tensor): best traj preds (min displacement error with gt) + with shape (num_box_preds, fut_ts*2). + """ + + cum_traj_preds = traj_preds.cumsum(dim=-2) + cum_traj_targets = traj_targets.cumsum(dim=-2) + + # Get min pred mode indices. + # (num_box_preds, fut_mode, fut_ts) + dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1) + dist = dist * gt_fut_masks[:, None, :] + dist = dist[..., -1] + dist[torch.isnan(dist)] = dist[torch.isnan(dist)] * 0 + min_mode_idxs = torch.argmin(dist, dim=-1).tolist() + box_idxs = torch.arange(traj_preds.shape[0]).tolist() + best_traj_preds = traj_preds[box_idxs, min_mode_idxs, :, :].reshape(-1, self.fut_ts * 2) + + return best_traj_preds + + def get_traj_cls_target(self, + traj_preds, + traj_targets, + gt_fut_masks, + neg_inds): + """"Get Trajectory mode classification target. + Args: + traj_preds (Tensor): MultiModal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2). + traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2). + gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts). + neg_inds (Tensor): Negtive indices with shape (num_box_preds,) + + Returns: + traj_labels (Tensor): traj cls labels (num_box_preds,). + """ + + cum_traj_preds = traj_preds.cumsum(dim=-2) + cum_traj_targets = traj_targets.cumsum(dim=-2) + + # Get min pred mode indices. + # (num_box_preds, fut_mode, fut_ts) + dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1) + dist = dist * gt_fut_masks[:, None, :] + dist = dist[..., -1] + dist[torch.isnan(dist)] = dist[torch.isnan(dist)] * 0 + traj_labels = torch.argmin(dist, dim=-1) + traj_labels[neg_inds] = self.fut_mode + + return traj_labels + + def map_loss_single(self, + cls_scores, + bbox_preds, + pts_preds, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_pts_list (list[Tensor]): Ground truth pts for each image + with shape (num_gts, fixed_num, 2) in [x,y] format. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
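+        Note:
+            The losses are returned as the tuple
+            ``(loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir)`` and are
+            gathered into a loss dict by :func:`loss`.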
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + pts_preds_list = [pts_preds[i] for i in range(num_imgs)] + + cls_reg_targets = self.map_get_targets(cls_scores_list, bbox_preds_list, pts_preds_list, + gt_bboxes_list, gt_labels_list, gt_shifts_pts_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + pts_targets = torch.cat(pts_targets_list, 0) + pts_weights = torch.cat(pts_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.map_cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.map_bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_map_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_2d_bbox(bbox_targets, self.pc_range) + # normalized_bbox_targets = bbox_targets + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.map_code_weights + + loss_bbox = self.loss_map_bbox( + bbox_preds[isnotnan, :4], + normalized_bbox_targets[isnotnan, :4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + # regression pts CD loss + # num_samples, num_order, num_pts, num_coords + normalized_pts_targets = normalize_2d_pts(pts_targets, self.pc_range) + + # num_samples, num_pts, num_coords + pts_preds = pts_preds.reshape(-1, pts_preds.size(-2), pts_preds.size(-1)) + if self.map_num_pts_per_vec != self.map_num_pts_per_gt_vec: + pts_preds = pts_preds.permute(0, 2, 1) + pts_preds = F.interpolate(pts_preds, size=(self.map_num_pts_per_gt_vec), mode='linear', + align_corners=True) + pts_preds = pts_preds.permute(0, 2, 1).contiguous() + + loss_pts = self.loss_map_pts( + pts_preds[isnotnan, :, :], + normalized_pts_targets[isnotnan, :, :], + pts_weights[isnotnan, :, :], + avg_factor=num_total_pos) + + dir_weights = pts_weights[:, :-self.map_dir_interval, 0] + denormed_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) + denormed_pts_preds_dir = denormed_pts_preds[:, self.map_dir_interval:, :] - \ + denormed_pts_preds[:, :-self.map_dir_interval, :] + pts_targets_dir = pts_targets[:, self.map_dir_interval:, :] - pts_targets[:, :-self.map_dir_interval, :] + + loss_dir = self.loss_map_dir( + denormed_pts_preds_dir[isnotnan, :, :], + pts_targets_dir[isnotnan, :, :], + dir_weights[isnotnan, :], + avg_factor=num_total_pos) + + bboxes = denormalize_2d_bbox(bbox_preds, self.pc_range) + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_map_iou( + bboxes[isnotnan, :4], + bbox_targets[isnotnan, :4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + if digit_version(TORCH_VERSION) >= 
digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_iou = torch.nan_to_num(loss_iou) + loss_pts = torch.nan_to_num(loss_pts) + loss_dir = torch.nan_to_num(loss_dir) + + return loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir + + def distribution_loss(self, output): + kl_loss = self.loss_vae_gen(output) + return kl_loss + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + map_gt_bboxes_list, + map_gt_labels_list, + preds_dicts, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + gt_attr_labels, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
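+
+        # Loss layout: per-decoder-layer detection/motion losses (loss_cls,
+        # loss_bbox, loss_traj, loss_traj_cls), per-decoder-layer map losses
+        # (loss_map_cls/bbox/iou/pts/dir), a planning loss computed on the
+        # last layer's outputs (loss_plan_reg/bound/col/dir), and the
+        # distribution (VAE) term loss_vae_gen.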
+ + map_gt_vecs_list = copy.deepcopy(map_gt_bboxes_list) + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + all_traj_preds = preds_dicts['all_traj_preds'] + all_traj_cls_scores = preds_dicts['all_traj_cls_scores'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + map_all_cls_scores = preds_dicts['map_all_cls_scores'] + map_all_bbox_preds = preds_dicts['map_all_bbox_preds'] + map_all_pts_preds = preds_dicts['map_all_pts_preds'] + map_enc_cls_scores = preds_dicts['map_enc_cls_scores'] + map_enc_bbox_preds = preds_dicts['map_enc_bbox_preds'] + map_enc_pts_preds = preds_dicts['map_enc_pts_preds'] + ego_fut_preds = preds_dicts['ego_fut_preds'] + distribution_pred = preds_dicts['loss_vae_gen'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_attr_labels_list = [gt_attr_labels for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox, loss_traj, loss_traj_cls = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, all_traj_preds, + all_traj_cls_scores, all_gt_bboxes_list, all_gt_labels_list, + all_gt_attr_labels_list, all_gt_bboxes_ignore_list) + + num_dec_layers = len(map_all_cls_scores) + device = map_gt_labels_list[0].device + + map_gt_bboxes_list = [ + map_gt_bboxes.bbox.to(device) for map_gt_bboxes in map_gt_vecs_list] + map_gt_pts_list = [ + map_gt_bboxes.fixed_num_sampled_points.to(device) for map_gt_bboxes in map_gt_vecs_list] + if self.map_gt_shift_pts_pattern == 'v0': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v1': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v1.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v2': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v2.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v3': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v3.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v4': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v4.to(device) for gt_bboxes in map_gt_vecs_list] + else: + raise NotImplementedError + map_all_gt_bboxes_list = [map_gt_bboxes_list for _ in range(num_dec_layers)] + map_all_gt_labels_list = [map_gt_labels_list for _ in range(num_dec_layers)] + map_all_gt_pts_list = [map_gt_pts_list for _ in range(num_dec_layers)] + map_all_gt_shifts_pts_list = [map_gt_shifts_pts_list for _ in range(num_dec_layers)] + map_all_gt_bboxes_ignore_list = [ + map_gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + map_losses_cls, map_losses_bbox, map_losses_iou, \ + map_losses_pts, map_losses_dir = multi_apply( + self.map_loss_single, map_all_cls_scores, map_all_bbox_preds, + map_all_pts_preds, map_all_gt_bboxes_list, map_all_gt_labels_list, + map_all_gt_shifts_pts_list, map_all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + 
loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_traj'] = loss_traj[-1] + loss_dict['loss_traj_cls'] = loss_traj_cls[-1] + # loss from the last decoder layer + loss_dict['loss_map_cls'] = map_losses_cls[-1] + loss_dict['loss_map_bbox'] = map_losses_bbox[-1] + loss_dict['loss_map_iou'] = map_losses_iou[-1] + loss_dict['loss_map_pts'] = map_losses_pts[-1] + loss_dict['loss_map_dir'] = map_losses_dir[-1] + + # Planning Loss + ego_fut_gt = ego_fut_gt.squeeze(1) + ego_fut_masks = ego_fut_masks.squeeze(1).squeeze(1) + ego_fut_cmd = ego_fut_cmd.squeeze(1).squeeze(1) + + batch, num_agent = all_traj_preds[-1].shape[:2] + agent_fut_preds = all_traj_preds[-1].view(batch, num_agent, self.fut_mode, self.fut_ts, 2) + agent_fut_cls_preds = all_traj_cls_scores[-1].view(batch, num_agent, self.fut_mode) + loss_plan_input = [ego_fut_preds, ego_fut_gt, ego_fut_masks, ego_fut_cmd, + map_all_pts_preds[-1], map_all_cls_scores[-1].sigmoid(), + all_bbox_preds[-1][..., 0:2], agent_fut_preds, + all_cls_scores[-1].sigmoid(), agent_fut_cls_preds.sigmoid()] + + loss_planning_dict = self.loss_planning(*loss_plan_input) + loss_dict['loss_plan_reg'] = loss_planning_dict['loss_plan_reg'] + loss_dict['loss_plan_bound'] = loss_planning_dict['loss_plan_bound'] + loss_dict['loss_plan_col'] = loss_planning_dict['loss_plan_col'] + loss_dict['loss_plan_dir'] = loss_planning_dict['loss_plan_dir'] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + # loss from other decoder layers + num_dec_layer = 0 + for map_loss_cls_i, map_loss_bbox_i, map_loss_iou_i, map_loss_pts_i, map_loss_dir_i in zip( + map_losses_cls[:-1], + map_losses_bbox[:-1], + map_losses_iou[:-1], + map_losses_pts[:-1], + map_losses_dir[:-1] + ): + loss_dict[f'd{num_dec_layer}.loss_map_cls'] = map_loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_map_bbox'] = map_loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_map_iou'] = map_loss_iou_i + loss_dict[f'd{num_dec_layer}.loss_map_pts'] = map_loss_pts_i + loss_dict[f'd{num_dec_layer}.loss_map_dir'] = map_loss_dir_i + num_dec_layer += 1 + + # loss of proposal generated from encode feature map. 
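+        # Note: this head sets 'enc_cls_scores' / 'map_enc_cls_scores' to None
+        # in its output dict, so the two encoder-proposal branches below are
+        # effectively no-ops for this configuration.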
+ if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, + gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + if map_enc_cls_scores is not None: + map_binary_labels_list = [ + torch.zeros_like(map_gt_labels_list[i]) + for i in range(len(map_all_gt_labels_list)) + ] + # TODO bug here, but we dont care enc_loss now + map_enc_loss_cls, map_enc_loss_bbox, map_enc_loss_iou, \ + map_enc_loss_pts, map_enc_loss_dir = \ + self.map_loss_single( + map_enc_cls_scores, map_enc_bbox_preds, + map_enc_pts_preds, map_gt_bboxes_list, + map_binary_labels_list, map_gt_pts_list, + map_gt_bboxes_ignore + ) + loss_dict['enc_loss_map_cls'] = map_enc_loss_cls + loss_dict['enc_loss_map_bbox'] = map_enc_loss_bbox + loss_dict['enc_loss_map_iou'] = map_enc_loss_iou + loss_dict['enc_loss_map_pts'] = map_enc_loss_pts + loss_dict['enc_loss_map_dir'] = map_enc_loss_dir + + loss_dict['loss_vae_gen'] = self.loss_vae_gen(distribution_pred) + + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. + """ + + det_preds_dicts = self.bbox_coder.decode(preds_dicts) + # map_bboxes: xmin, ymin, xmax, ymax + map_preds_dicts = self.map_bbox_coder.decode(preds_dicts) + + num_samples = len(det_preds_dicts) + assert len(det_preds_dicts) == len(map_preds_dicts), \ + 'len(preds_dict) should be equal to len(map_preds_dicts)' + ret_list = [] + for i in range(num_samples): + preds = det_preds_dicts[i] + bboxes = preds['bboxes'] + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + trajs = preds['trajs'] + + map_preds = map_preds_dicts[i] + map_bboxes = map_preds['map_bboxes'] + map_scores = map_preds['map_scores'] + map_labels = map_preds['map_labels'] + map_pts = map_preds['map_pts'] + + ret_list.append([bboxes, scores, labels, trajs, map_bboxes, + map_scores, map_labels, map_pts]) + + return ret_list + + def select_and_pad_pred_map( + self, + motion_pos, + map_query, + map_score, + map_pos, + map_thresh=0.5, + dis_thresh=None, + pe_normalization=True, + use_fix_pad=False + ): + """select_and_pad_pred_map. + Args: + motion_pos: [B, A, 2] + map_query: [B, P, D]. + map_score: [B, P, 3]. + map_pos: [B, P, pts, 2]. + map_thresh: map confidence threshold for filtering low-confidence preds + dis_thresh: distance threshold for masking far maps for each agent in cross-attn + use_fix_pad: always pad one lane instance for each batch + Returns: + selected_map_query: [B*A, P1(+1), D], P1 is the max inst num after filter and pad. 
+ selected_map_pos: [B*A, P1(+1), 2] + selected_padding_mask: [B*A, P1(+1)] + """ + + if dis_thresh is None: + raise NotImplementedError('Not implement yet') + + # use the most close pts pos in each map inst as the inst's pos + batch, num_map = map_pos.shape[:2] + map_dis = torch.sqrt(map_pos[..., 0] ** 2 + map_pos[..., 1] ** 2) + min_map_pos_idx = map_dis.argmin(dim=-1).flatten() # [B*P] + min_map_pos = map_pos.flatten(0, 1) # [B*P, pts, 2] + min_map_pos = min_map_pos[range(min_map_pos.shape[0]), min_map_pos_idx] # [B*P, 2] + min_map_pos = min_map_pos.view(batch, num_map, 2) # [B, P, 2] + + # select & pad map vectors for different batch using map_thresh + map_score = map_score.sigmoid() + map_max_score = map_score.max(dim=-1)[0] + map_idx = map_max_score > map_thresh + batch_max_pnum = 0 + for i in range(map_score.shape[0]): + pnum = map_idx[i].sum() + if pnum > batch_max_pnum: + batch_max_pnum = pnum + + selected_map_query, selected_map_pos, selected_padding_mask = [], [], [] + for i in range(map_score.shape[0]): + dim = map_query.shape[-1] + valid_pnum = map_idx[i].sum() + valid_map_query = map_query[i, map_idx[i]] + valid_map_pos = min_map_pos[i, map_idx[i]] + pad_pnum = batch_max_pnum - valid_pnum + padding_mask = torch.tensor([False], device=map_score.device).repeat(batch_max_pnum) + if pad_pnum != 0: + valid_map_query = torch.cat([valid_map_query, torch.zeros((pad_pnum, dim), device=map_score.device)], + dim=0) + valid_map_pos = torch.cat([valid_map_pos, torch.zeros((pad_pnum, 2), device=map_score.device)], dim=0) + padding_mask[valid_pnum:] = True + selected_map_query.append(valid_map_query) + selected_map_pos.append(valid_map_pos) + selected_padding_mask.append(padding_mask) + + selected_map_query = torch.stack(selected_map_query, dim=0) + selected_map_pos = torch.stack(selected_map_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + # generate different pe for map vectors for each agent + num_agent = motion_pos.shape[1] + selected_map_query = selected_map_query.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, D] + selected_map_pos = selected_map_pos.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, 2] + selected_padding_mask = selected_padding_mask.unsqueeze(1).repeat(1, num_agent, 1) # [B, A, max_P] + # move lane to per-car coords system + selected_map_dist = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + if pe_normalization: + selected_map_pos = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + + # filter far map inst for each agent + map_dis = torch.sqrt(selected_map_dist[..., 0] ** 2 + selected_map_dist[..., 1] ** 2) + valid_map_inst = (map_dis <= dis_thresh) # [B, A, max_P] + invalid_map_inst = (valid_map_inst == False) + selected_padding_mask = selected_padding_mask + invalid_map_inst + + selected_map_query = selected_map_query.flatten(0, 1) + selected_map_pos = selected_map_pos.flatten(0, 1) + selected_padding_mask = selected_padding_mask.flatten(0, 1) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_map_query.shape[-1] + if use_fix_pad: + pad_map_query = torch.zeros((num_batch, 1, feat_dim), device=selected_map_query.device) + pad_map_pos = torch.ones((num_batch, 1, 2), device=selected_map_pos.device) + pad_lane_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_map_query = torch.cat([selected_map_query, pad_map_query], dim=1) + selected_map_pos = torch.cat([selected_map_pos, pad_map_pos], dim=1) + 
selected_padding_mask = torch.cat([selected_padding_mask, pad_lane_mask], dim=1) + + return selected_map_query, selected_map_pos, selected_padding_mask + + def select_and_pad_query( + self, + query, + query_pos, + query_score, + score_thresh=0.5, + use_fix_pad=True + ): + """select_and_pad_query. + Args: + query: [B, Q, D]. + query_pos: [B, Q, 2] + query_score: [B, Q, C]. + score_thresh: confidence threshold for filtering low-confidence query + use_fix_pad: always pad one query instance for each batch + Returns: + selected_query: [B, Q', D] + selected_query_pos: [B, Q', 2] + selected_padding_mask: [B, Q'] + """ + + # select & pad query for different batch using score_thresh + query_score = query_score.sigmoid() + query_score = query_score.max(dim=-1)[0] + query_idx = query_score > score_thresh + batch_max_qnum = 0 + for i in range(query_score.shape[0]): + qnum = query_idx[i].sum() + if qnum > batch_max_qnum: + batch_max_qnum = qnum + + selected_query, selected_query_pos, selected_padding_mask = [], [], [] + for i in range(query_score.shape[0]): + dim = query.shape[-1] + valid_qnum = query_idx[i].sum() + valid_query = query[i, query_idx[i]] + valid_query_pos = query_pos[i, query_idx[i]] + pad_qnum = batch_max_qnum - valid_qnum + padding_mask = torch.tensor([False], device=query_score.device).repeat(batch_max_qnum) + if pad_qnum != 0: + valid_query = torch.cat([valid_query, torch.zeros((pad_qnum, dim), device=query_score.device)], dim=0) + valid_query_pos = torch.cat([valid_query_pos, torch.zeros((pad_qnum, 2), device=query_score.device)], + dim=0) + padding_mask[valid_qnum:] = True + selected_query.append(valid_query) + selected_query_pos.append(valid_query_pos) + selected_padding_mask.append(padding_mask) + + selected_query = torch.stack(selected_query, dim=0) + selected_query_pos = torch.stack(selected_query_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_query.shape[-1] + if use_fix_pad: + pad_query = torch.zeros((num_batch, 1, feat_dim), device=selected_query.device) + pad_query_pos = torch.ones((num_batch, 1, 2), device=selected_query_pos.device) + pad_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_query = torch.cat([selected_query, pad_query], dim=1) + selected_query_pos = torch.cat([selected_query_pos, pad_query_pos], dim=1) + selected_padding_mask = torch.cat([selected_padding_mask, pad_mask], dim=1) + + return selected_query, selected_query_pos, selected_padding_mask + + def distribution_forward(self, present_features, future_distribution_inputs=None, noise=None): + """ + Parameters + ---------- + present_features: 5-D output from dynamics module with shape (b, 1, c, h, w) + future_distribution_inputs: 5-D tensor containing labels shape (b, s, cfg.PROB_FUTURE_DIM, h, w) + noise: a sample from a (0, 1) gaussian with shape (b, s, latent_dim). 
If None, will sample in function + + Returns + ------- + sample: sample taken from present/future distribution, broadcast to shape (b, s, latent_dim, h, w) + present_distribution_mu: shape (b, s, latent_dim) + present_distribution_log_sigma: shape (b, s, latent_dim) + future_distribution_mu: shape (b, s, latent_dim) + future_distribution_log_sigma: shape (b, s, latent_dim) + """ + + b = present_features.shape[0] + c = present_features.shape[1] + present_mu, present_log_sigma = self.present_distribution(present_features) + + future_mu, future_log_sigma = None, None + if future_distribution_inputs is not None: + # Concatenate future labels to z_t + # future_features = future_distribution_inputs[:, 1:].contiguous().view(b, 1, -1, h, w) + future_features = torch.cat([present_features, future_distribution_inputs], dim=2) + future_mu, future_log_sigma = self.future_distribution(future_features) + + if noise is None: + if self.training: + noise = torch.randn_like(present_mu) + else: + noise = torch.randn_like(present_mu) + # print('################################') + # print('noise: ', noise) + # print('################################') + if self.training: + mu = future_mu + sigma = torch.exp(future_log_sigma) + else: + mu = present_mu + sigma = torch.exp(present_log_sigma) + sample = mu + sigma * noise + + # Spatially broadcast sample to the dimensions of present_features + sample = sample.permute(0, 2, 1).expand(b, self.latent_dim, c) + + output_distribution = { + 'present_mu': present_mu, + 'present_log_sigma': present_log_sigma, + 'future_mu': future_mu, + 'future_log_sigma': future_log_sigma, + } + + return sample, output_distribution + + def get_future_labels(self, gt_labels_3d, gt_attr_labels, ego_fut_trajs, device): + + agent_dim = 300 + veh_list = [0, 1, 3, 4] + mapped_class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', + 'trailer', 'barrier', 'motorcycle', 'bicycle', + 'pedestrian', 'traffic_cone' + ] + ignore_list = ['construction_vehicle', 'barrier', + 'traffic_cone', 'motorcycle', 'bicycle'] + + batch_size = len(gt_labels_3d) + + # gt_label = gt_labels_3d[0] + # gt_attr_label = gt_attr_labels[0] + + gt_fut_trajs_bz_list = [] + + for bz in range(batch_size): + gt_fut_trajs_list = [] + gt_label = gt_labels_3d[bz] + gt_attr_label = gt_attr_labels[bz] + for i in range(gt_label.shape[0]): + gt_label[i] = 0 if gt_label[i] in veh_list else gt_label[i] + box_name = mapped_class_names[gt_label[i]] + if box_name in ignore_list: + continue + gt_fut_masks = gt_attr_label[i][self.fut_ts * 2:self.fut_ts * 3] + num_valid_ts = sum(gt_fut_masks == 1) + gt_fut_traj = gt_attr_label[i][:self.fut_ts * 2].reshape(-1, 2) + gt_fut_traj = gt_fut_traj[:num_valid_ts] + if gt_fut_traj.shape[0] == 0: + gt_fut_traj = torch.zeros([self.fut_ts - gt_fut_traj.shape[0], 2], device=device) + if gt_fut_traj.shape[0] < self.fut_ts: + gt_fut_traj = torch.cat( + (gt_fut_traj, torch.zeros([self.fut_ts - gt_fut_traj.shape[0], 2], device=device)), 0) + gt_fut_trajs_list.append(gt_fut_traj) + + if len(gt_fut_trajs_list) != 0 & len(gt_fut_trajs_list) < agent_dim: + gt_fut_trajs = torch.cat( + (torch.stack(gt_fut_trajs_list), + torch.zeros([agent_dim - len(gt_fut_trajs_list), self.fut_ts, 2], device=device)), 0) + else: + gt_fut_trajs = torch.zeros([agent_dim, self.fut_ts, 2], device=device) + + gt_fut_trajs_bz_list.append(gt_fut_trajs) + + if len(gt_fut_trajs_bz_list) != 0: + gt_trajs = torch.cat((torch.stack(gt_fut_trajs_bz_list).repeat(1, 6, 1, 1), ego_fut_trajs), dim=1) + else: + gt_trajs = ego_fut_trajs + # 
future_states = gt_trajs.reshape(batch_size, gt_trajs.shape[1], -1) + + # [bz, a, t, 2] + return gt_trajs.reshape(batch_size, gt_trajs.shape[1], -1) + + def future_states_predict(self, batch_size, sample, hidden_states, current_states): + + future_prediction_input = sample.unsqueeze(0).expand(self.fut_ts, -1, -1, -1) + # + # future_states = self.future_prediction(future_prediction_input, hidden_state) + future_prediction_input = future_prediction_input.reshape(self.fut_ts, -1, self.latent_dim) + + hidden_state = hidden_states.reshape(self.layer_dim, -1, int(self.embed_dims / 2)) + # future_states, future_hidden = self.state_gru(future_prediction_input, hidden_state) + future_states = self.predict_model(future_prediction_input, hidden_state) + + current_states_hs = current_states.unsqueeze(0).repeat(6, 1, 1, 1) + future_states_hs = future_states.reshape(self.fut_ts, batch_size, -1, future_states.shape[2]) + + if self.with_cur: + states_hs = torch.cat((current_states_hs, future_states_hs), dim=-1) + else: + states_hs = future_states_hs + + return states_hs, future_states_hs + + + + + + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_transformer.py b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c8ffae5f11233f04ac7bbaecae775517c502e72e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_transformer.py @@ -0,0 +1,489 @@ +import torch +import numpy as np +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmcv.utils import ext_loader +from torch.nn.init import normal_ +from mmcv.runner.base_module import BaseModule +from mmdet.models.utils.builder import TRANSFORMER +from torchvision.transforms.functional import rotate +from mmcv.cnn.bricks.registry import TRANSFORMER_LAYER_SEQUENCE +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence + +from projects.mmdet3d_plugin.VAD.modules.decoder import CustomMSDeformableAttention +from projects.mmdet3d_plugin.VAD.modules.temporal_self_attention import TemporalSelfAttention +from projects.mmdet3d_plugin.VAD.modules.spatial_cross_attention import MSDeformableAttention3D + + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class MapDetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(MapDetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. 
+ reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 2 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + # new_reference_points[..., 2:3] = tmp[ + # ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@TRANSFORMER.register_module() +class VADPerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
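+        embed_dims (int): Embedding dimension of the queries and BEV
+            features. Default: 256.
+        rotate_prev_bev (bool): Whether to rotate the previous frame's BEV
+            features by the CAN-bus rotation angle before temporal fusion.
+            Default: True.
+        use_can_bus (bool): Whether to add the embedded CAN-bus signals to
+            the BEV queries. Default: True.
+        map_num_vec (int): Number of map vectors. Default: 50.
+        map_num_pts_per_vec (int): Number of points per map vector.
+            Default: 10.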
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + map_decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + map_num_vec=50, + map_num_pts_per_vec=10, + **kwargs): + super(VADPerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + if decoder is not None: + self.decoder = build_transformer_layer_sequence(decoder) + else: + self.decoder = None + if map_decoder is not None: + self.map_decoder = build_transformer_layer_sequence(map_decoder) + else: + self.map_decoder = None + + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + self.two_stage_num_proposals = two_stage_num_proposals + self.rotate_center = rotate_center + self.map_num_vec = map_num_vec + self.map_num_pts_per_vec = map_num_pts_per_vec + self.init_layers() + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.map_reference_points = nn.Linear(self.embed_dims, 2) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.map_reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + # TODO apply fp16 to this module cause grad_norm NAN + # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
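+
+        Args:
+            mlvl_feats (list(Tensor)): Multi-level image features, each with
+                shape [bs, num_cams, embed_dims, h, w].
+            bev_queries (Tensor): BEV queries with shape
+                (bev_h*bev_w, embed_dims).
+            bev_pos (Tensor): BEV positional encoding with shape
+                (bs, embed_dims, bev_h, bev_w).
+            prev_bev (Tensor, optional): BEV features of the previous frame
+                used for temporal fusion. Default: None.
+        Returns:
+            Tensor: BEV features with shape (bs, bev_h*bev_w, embed_dims).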
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + # TODO apply fp16 to this module cause grad_norm NAN + # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + map_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + map_reg_branches=None, + map_cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. 
+ bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + map_query_pos, map_query = torch.split( + map_query_embed, self.embed_dims, dim=1) + map_query_pos = map_query_pos.unsqueeze(0).expand(bs, -1, -1) + map_query = map_query.unsqueeze(0).expand(bs, -1, -1) + map_reference_points = self.map_reference_points(map_query_pos) + map_reference_points = map_reference_points.sigmoid() + map_init_reference_out = map_reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + map_query = map_query.permute(1, 0, 2) + map_query_pos = map_query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + if self.decoder is not None: + # [L, Q, B, D], [L, B, Q, D] + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + inter_references_out = inter_references + else: + inter_states = query.unsqueeze(0) + inter_references_out = reference_points.unsqueeze(0) + + if self.map_decoder is not None: + # [L, Q, B, D], [L, B, Q, D] + map_inter_states, map_inter_references = self.map_decoder( + query=map_query, + key=None, + value=bev_embed, + query_pos=map_query_pos, + reference_points=map_reference_points, + reg_branches=map_reg_branches, + cls_branches=map_cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=map_query.device), + level_start_index=torch.tensor([0], device=map_query.device), + **kwargs) + map_inter_references_out = map_inter_references 
+ else: + map_inter_states = map_query.unsqueeze(0) + map_inter_references_out = map_reference_points.unsqueeze(0) + + return ( + bev_embed, inter_states, init_reference_out, inter_references_out, + map_inter_states, map_init_reference_out, map_inter_references_out) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class CustomTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(CustomTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + key_padding_mask=None, + *args, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + intermediate = [] + for lid, layer in enumerate(self.layers): + query = layer( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + key_padding_mask=key_padding_mask, + *args, + **kwargs) + + if self.return_intermediate: + intermediate.append(query) + + if self.return_intermediate: + return torch.stack(intermediate) + + return query \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d8488aa61f6ec0230f4f17772698cf1d3c062d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/__init__.py @@ -0,0 +1,11 @@ +from .modules import * +from .runner import * +from .hooks import * + +from .VAD import VAD +# from .VAD_head_v2 import VADHead +from .VAD_head import VADHead +from .VAD_transformer import VADPerceptionTransformer, \ + CustomTransformerDecoder, MapDetectionTransformerDecoder + +from .generator import * \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d7efcdfb0646a7a40210462c84cd1618dc2cc82 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD_head.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD_head.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36415fa1a21a02127430dd5d165e7cd9ae6d869c Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD_head.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04d7958b4d87926ac3ef9fbc4f6e89d1a467c409 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/__init__.cpython-38.pyc differ diff --git 
a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..15dff22b7478a0f30151d376d41f3dc46e88ba7d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__init__.py @@ -0,0 +1,3 @@ +from .train import custom_train_model +from .mmdet_train import custom_train_detector +# from .test import custom_multi_gpu_test \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c610e7f1e0c12eb1bce9408ec700f64a02641e9b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/mmdet_train.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/mmdet_train.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..945c209515ec3d19c99302ecaebf8ddfd8de7c15 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/mmdet_train.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/test.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/test.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42c7bb316260bb47844d913be0ff89407a70832c Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/test.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/train.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/train.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14f0e9a4285dbeab77adb9ba7962313c09c9e2b2 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/train.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/mmdet_train.py b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/mmdet_train.py new file mode 100644 index 0000000000000000000000000000000000000000..449d49dc4795b3f5f93b275ba58c74357b85bb5d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/mmdet_train.py @@ -0,0 +1,195 @@ +import random +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, build_optimizer, + build_runner, get_dist_info) +from mmcv.utils import build_from_cfg + +from mmdet.core import EvalHook + +from mmdet.datasets import (build_dataset, + replace_ImageToTensor) +from mmdet.utils import get_root_logger +import time +import os.path as osp +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook +from projects.mmdet3d_plugin.datasets.builder import custom_build_dataset +def custom_train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + #assert len(dataset)==1s + if 'imgs_per_gpu' in cfg.data: + 
logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + if eval_model is not None: + eval_model = MMDistributedDataParallel( + eval_model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + if eval_model is not None: + eval_model = MMDataParallel( + eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + if eval_model is not None: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + eval_model=eval_model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + else: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # register profiler hook + #trace_config = dict(type='tb_trace', dir_name='work_dir') + #profiler_config = dict(on_trace_ready=trace_config) + #runner.register_profiler_hook(profiler_config) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 
1) + if val_samples_per_gpu > 1: + assert False + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/test.py b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/test.py new file mode 100644 index 0000000000000000000000000000000000000000..fc09efb0687246cb539b4d9f3c56a53e90b6a453 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/test.py @@ -0,0 +1,159 @@ +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results + + +import mmcv +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. 
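For reference, custom_encode_mask_results above compresses each per-class semantic mask with COCO run-length encoding before results are gathered across GPUs. A small self-contained check of that call on a toy mask (the mask contents and sizes are made up):

import numpy as np
import pycocotools.mask as mask_util

# Toy 4x4 binary mask; custom_encode_mask_results applies the same encode call per class.
mask = np.zeros((4, 4), dtype=np.uint8)
mask[1:3, 1:3] = 1

# The COCO RLE encoder expects a Fortran-ordered H x W x N uint8 array.
rle = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F', dtype='uint8'))[0]
print(rle['size'])            # [4, 4]
print(mask_util.area([rle]))  # [4] -- four foreground pixels

# Round trip back to a dense mask to confirm the encoding is lossless.
restored = mask_util.decode([rle])[:, :, 0]
assert np.array_equal(restored, mask)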
+ tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. + """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' + # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if mask_results is None: + return {'bbox_results': bbox_results} + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + collect_results_cpu(result_part, size) \ No 
newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/train.py b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f44c9d8372e9fc429c62c3fa304497f5c051e6af --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/train.py @@ -0,0 +1,61 @@ +from .mmdet_train import custom_train_detector +from mmseg.apis import train_segmentor +from mmdet.apis import train_detector + +def custom_train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + assert False + else: + custom_train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + eval_model=eval_model, + meta=meta) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1082f134a1f594370163d1eee2dd28b65488d1a --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__init__.py @@ -0,0 +1,6 @@ +from .distributions import DistributionModule, PredictModel, DistributionDecoder1DV2, PredictModelHidden +from .layers import Bottleneck, SpatialGRU +from .state_prediction import FuturePrediction +# from .diffusion_model import DDIMScheduler +# from .diffusion_states_estimate import DDIMDepthEstimateRes, EmbeddingDimForward, EmbeddingDimReverse, \ +# DiffusionHeadMotion, DiffusionHeadPlan, AutoRegMotionPredict, AutoRegEgoPredict, AutoRegMotionPredictAll, AutoRegEgoPredictAll diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e435fdea401be719fae45ad3cf7fa41a36f0c2c7 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/distributions.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/distributions.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01c94943dbbbeb3d283818df00764b2cd0910c9e Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/distributions.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/layers.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/layers.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ea2c8df9de5b9fc768893b97fc23c77f22b704d7 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/layers.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/state_prediction.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/state_prediction.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5960f5b61b81837598ca541032292732e4193044 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/state_prediction.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/distributions.py b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..e0eda3c6359d2236cb439f1b702a4de1ac7ce783 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/distributions.py @@ -0,0 +1,182 @@ + +import torch +import torch.nn as nn + +from mmdet.models import LOSSES + +from .layers import Bottleneck + + +class DistributionModule(nn.Module): + """ + A convolutional net that parametrises a diagonal Gaussian distribution. + """ + + def __init__( + self, in_channels, latent_dim, min_log_sigma, max_log_sigma): + super().__init__() + self.compress_dim = in_channels // 2 + self.latent_dim = latent_dim + self.min_log_sigma = min_log_sigma + self.max_log_sigma = max_log_sigma + + # self.encoder = DistributionEncoder2D( + # in_channels, + # self.compress_dim, + # ) + + self.encoder = DistributionEncoder1DV2( + in_channels, + self.compress_dim, + ) + + self.last_conv = nn.Sequential( + nn.AdaptiveAvgPool1d(1), nn.Conv1d(self.compress_dim, out_channels=2 * self.latent_dim, kernel_size=1) + ) + + def forward(self, s_t): + encoding = self.encoder(s_t.permute(0, 2, 1)) + mu_log_sigma = self.last_conv(encoding).permute(0, 2, 1) + mu = mu_log_sigma[:, :, :self.latent_dim] + log_sigma = mu_log_sigma[:, :, self.latent_dim:] + + # clip the log_sigma value for numerical stability + log_sigma = torch.clamp(log_sigma, self.min_log_sigma, self.max_log_sigma) + return mu, log_sigma + +class DistributionEncoder2D(nn.Module): + """Encodes s_t or (s_t, y_{t+1}, ..., y_{t+H}). + """ + def __init__(self, in_channels, out_channels): + super().__init__() + + self.model = nn.Sequential( + Bottleneck(in_channels, out_channels=out_channels, downsample=True), + Bottleneck(out_channels, out_channels=out_channels, downsample=True), + Bottleneck(out_channels, out_channels=out_channels, downsample=True), + Bottleneck(out_channels, out_channels=out_channels, downsample=True), + ) + + def forward(self, s_t): + return self.model(s_t) + +class DistributionEncoder1D(nn.Module): + """Encodes s_t or (s_t, y_{t+1}, ..., y_{t+H}). + """ + def __init__(self, in_channels, out_channels): + super().__init__() + + self.model = nn.Sequential( + nn.Conv1d(in_channels, out_channels=in_channels*2, kernel_size=1, stride=1), + nn.Conv1d(in_channels*2, out_channels=in_channels*2, kernel_size=1, stride=1), + nn.Conv1d(in_channels*2, out_channels=in_channels, kernel_size=1, stride=1), + nn.Conv1d(in_channels, out_channels=out_channels, kernel_size=1, stride=1), + ) + + def forward(self, s_t): + return self.model(s_t) + +class DistributionEncoder1DV2(nn.Module): + """Encodes s_t or (s_t, y_{t+1}, ..., y_{t+H}). 
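DistributionModule above only produces mu and log_sigma (clamped for numerical stability); drawing a latent sample happens elsewhere in the model. A reparameterised sample from such a diagonal Gaussian would typically look like the sketch below; the shapes and clamp range are illustrative stand-ins, not values read from the repo.

import torch

torch.manual_seed(0)
bs, latent_dim = 2, 32

# Stand-ins for the DistributionModule outputs, shape (batch, 1, latent_dim).
mu = torch.randn(bs, 1, latent_dim)
log_sigma = torch.randn(bs, 1, latent_dim).clamp(-5.0, 5.0)  # mirrors the clamp above

# Reparameterisation trick: sample = mu + sigma * eps keeps gradients flowing to mu/log_sigma.
eps = torch.randn_like(mu)
sample = mu + torch.exp(log_sigma) * eps
print(sample.shape)  # torch.Size([2, 1, 32])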
+ """ + def __init__(self, in_channels, out_channels): + super().__init__() + + self.conv1 = nn.Conv1d(in_channels, out_channels=in_channels * 2, kernel_size=1, stride=1) + self.conv2 = nn.Conv1d(in_channels * 2, out_channels=in_channels * 2, kernel_size=1, stride=1) + self.conv3 = nn.Conv1d(in_channels * 2, out_channels=out_channels, kernel_size=1, stride=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, s_t): + s_t = self.relu(self.conv1(s_t)) + s_t = self.relu(self.conv2(s_t)) + s_t = self.conv3(s_t) + + return s_t + +class DistributionDecoder1DV2(nn.Module): + """Decodes sample to future states. + """ + def __init__(self, in_channels, out_channels): + super().__init__() + + self.conv1 = nn.Conv1d(in_channels, out_channels=in_channels * 8, kernel_size=1, stride=1) + self.conv2 = nn.Conv1d(in_channels * 8, out_channels=in_channels * 8, kernel_size=1, stride=1) + self.conv3 = nn.Conv1d(in_channels * 8, out_channels=out_channels, kernel_size=1, stride=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, f_t): + f_t = self.relu(self.conv1(f_t)) + f_t = self.relu(self.conv2(f_t)) + f_t = self.conv3(f_t) + + return f_t + +class PredictModel(nn.Module): + """predict future states with rnn. + """ + def __init__(self, in_channels, out_channels, hidden_channels, num_layers): + super().__init__() + self.gru = nn.GRU(input_size=in_channels, hidden_size=hidden_channels, num_layers=num_layers) + self.linear1 = nn.Linear(hidden_channels, hidden_channels*2) + self.linear2 = nn.Linear(hidden_channels*2, hidden_channels*4) + self.linear3 = nn.Linear(hidden_channels*4, out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x , h): + x, h = self.gru(x, h) + x = self.relu(self.linear1(x)) + x = self.relu(self.linear2(x)) + x = self.linear3(x) + return x + + +class PredictModelHidden(nn.Module): + """predict future states with rnn. 
+ """ + def __init__(self, in_channels, out_channels, hidden_channels, num_layers): + super().__init__() + self.gru = nn.GRU(input_size=in_channels, hidden_size=hidden_channels, num_layers=num_layers) + self.linear1 = nn.Linear(hidden_channels, hidden_channels*2) + self.linear2 = nn.Linear(hidden_channels*2, hidden_channels*4) + self.linear3 = nn.Linear(hidden_channels*4, out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x, h = self.gru(x) + x = self.relu(self.linear1(x)) + x = self.relu(self.linear2(x)) + x = self.linear3(x) + return x + + + + +@LOSSES.register_module() +class ProbabilisticLoss(nn.Module): + def __init__(self, loss_weight=1.0): + super().__init__() + self.loss_weight = loss_weight + + def forward(self, output): + present_mu = output['present_mu'] + present_log_sigma = output['present_log_sigma'] + future_mu = output['future_mu'] + future_log_sigma = output['future_log_sigma'] + + var_future = torch.exp(2 * future_log_sigma) + var_present = torch.exp(2 * present_log_sigma) + kl_div = ( + present_log_sigma - future_log_sigma - 0.5 + (var_future + (future_mu - present_mu) ** 2) / ( + 2 * var_present) + ) + + kl_loss = torch.mean(torch.sum(kl_div, dim=-1)) * self.loss_weight + + return kl_loss + + + + + + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/layers.py b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..068369ca950891321a0113d4f8680ad90a8bc23c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/layers.py @@ -0,0 +1,235 @@ + +from collections import OrderedDict + +import torch +import torch.nn as nn + +from functools import partial + +class Bottleneck(nn.Module): + """ + Defines a bottleneck module with a residual connection + """ + + def __init__( + self, + in_channels, + out_channels=None, + kernel_size=3, + dilation=1, + groups=1, + upsample=False, + downsample=False, + dropout=0.0, + ): + super().__init__() + self._downsample = downsample + bottleneck_channels = int(in_channels / 2) + out_channels = out_channels or in_channels + padding_size = ((kernel_size - 1) * dilation + 1) // 2 + + # Define the main conv operation + assert dilation == 1 + if upsample: + assert not downsample, 'downsample and upsample not possible simultaneously.' 
+ bottleneck_conv = nn.ConvTranspose2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=1, + stride=2, + output_padding=padding_size, + padding=padding_size, + groups=groups, + ) + elif downsample: + bottleneck_conv = nn.Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=dilation, + stride=2, + padding=padding_size, + groups=groups, + ) + else: + bottleneck_conv = nn.Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=dilation, + padding=padding_size, + groups=groups, + ) + + self.layers = nn.Sequential( + OrderedDict( + [ + # First projection with 1x1 kernel + ('conv_down_project', nn.Conv2d(in_channels, bottleneck_channels, kernel_size=1, bias=False)), + ('abn_down_project', nn.Sequential(nn.BatchNorm2d(bottleneck_channels), + nn.ReLU(inplace=True))), + # Second conv block + ('conv', bottleneck_conv), + ('abn', nn.Sequential(nn.BatchNorm2d(bottleneck_channels), nn.ReLU(inplace=True))), + # Final projection with 1x1 kernel + ('conv_up_project', nn.Conv2d(bottleneck_channels, out_channels, kernel_size=1, bias=False)), + ('abn_up_project', nn.Sequential(nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True))), + # Regulariser + ('dropout', nn.Dropout2d(p=dropout)), + ] + ) + ) + + if out_channels == in_channels and not downsample and not upsample: + self.projection = None + else: + projection = OrderedDict() + if upsample: + projection.update({'upsample_skip_proj': Interpolate(scale_factor=2)}) + elif downsample: + projection.update({'upsample_skip_proj': nn.MaxPool2d(kernel_size=2, stride=2)}) + projection.update( + { + 'conv_skip_proj': nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False), + 'bn_skip_proj': nn.BatchNorm2d(out_channels), + } + ) + self.projection = nn.Sequential(projection) + + # pylint: disable=arguments-differ + def forward(self, *args): + (x,) = args + x_residual = self.layers(x) + if self.projection is not None: + if self._downsample: + # pad h/w dimensions if they are odd to prevent shape mismatch with residual layer + x = nn.functional.pad(x, (0, x.shape[-1] % 2, 0, x.shape[-2] % 2), value=0) + return x_residual + self.projection(x) + return x_residual + x + +class ConvBlock(nn.Module): + """2D convolution followed by + - an optional normalisation (batch norm or instance norm) + - an optional activation (ReLU, LeakyReLU, or tanh) + """ + + def __init__( + self, + in_channels, + out_channels=None, + kernel_size=3, + stride=1, + norm='bn', + activation='relu', + bias=False, + transpose=False, + ): + super().__init__() + out_channels = out_channels or in_channels + padding = int((kernel_size - 1) / 2) + self.conv = nn.Conv2d if not transpose else partial(nn.ConvTranspose2d, output_padding=1) + self.conv = self.conv(in_channels, out_channels, kernel_size, stride, padding=padding, bias=bias) + + if norm == 'bn': + self.norm = nn.BatchNorm2d(out_channels) + elif norm == 'in': + self.norm = nn.InstanceNorm2d(out_channels) + elif norm == 'none': + self.norm = None + else: + raise ValueError('Invalid norm {}'.format(norm)) + + if activation == 'relu': + self.activation = nn.ReLU(inplace=True) + elif activation == 'lrelu': + self.activation = nn.LeakyReLU(0.1, inplace=True) + elif activation == 'elu': + self.activation = nn.ELU(inplace=True) + elif activation == 'tanh': + self.activation = nn.Tanh(inplace=True) + elif activation == 'none': + self.activation = None + else: + raise ValueError('Invalid activation 
{}'.format(activation)) + + def forward(self, x): + x = self.conv(x) + + if self.norm: + x = self.norm(x) + if self.activation: + x = self.activation(x) + return x + + +class SpatialGRU(nn.Module): + """A GRU cell that takes an input tensor [BxTxCxHxW] and an optional previous state and passes a + convolutional gated recurrent unit over the data""" + + def __init__(self, input_size, hidden_size, gru_bias_init=0.0, norm='bn', activation='relu'): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.gru_bias_init = gru_bias_init + + self.conv_update = nn.Conv2d(input_size + hidden_size, hidden_size, kernel_size=3, bias=True, padding=1) + self.conv_reset = nn.Conv2d(input_size + hidden_size, hidden_size, kernel_size=3, bias=True, padding=1) + + self.conv_state_tilde = ConvBlock( + input_size + hidden_size, hidden_size, kernel_size=3, bias=False, norm=norm, activation=activation + ) + + def forward(self, x, state=None, flow=None, mode='bilinear'): + # pylint: disable=unused-argument, arguments-differ + # Check size + assert len(x.size()) == 5, 'Input tensor must be BxTxCxHxW.' + b, timesteps, c, h, w = x.size() + assert c == self.input_size, f'feature sizes must match, got input {c} for layer with size {self.input_size}' + + # recurrent layers + rnn_output = [] + rnn_state = torch.zeros(b, self.hidden_size, h, w, device=x.device) if state is None else state + for t in range(timesteps): + x_t = x[:, t] + # if flow is not None: + # rnn_state = warp_features(rnn_state, flow[:, t], mode=mode) + + # propagate rnn state + rnn_state = self.gru_cell(x_t, rnn_state) + rnn_output.append(rnn_state) + + # reshape rnn output to batch tensor + return torch.stack(rnn_output, dim=1) + + def gru_cell(self, x, state): + # Compute gates + x_and_state = torch.cat([x, state], dim=1) + update_gate = self.conv_update(x_and_state) + reset_gate = self.conv_reset(x_and_state) + # Add bias to initialise gate as close to identity function + update_gate = torch.sigmoid(update_gate + self.gru_bias_init) + reset_gate = torch.sigmoid(reset_gate + self.gru_bias_init) + + # Compute proposal state, activation is defined in norm_act_config (can be tanh, ReLU etc) + state_tilde = self.conv_state_tilde(torch.cat([x, (1.0 - reset_gate) * state], dim=1)) + + output = (1.0 - update_gate) * state + update_gate * state_tilde + return output + + +class Interpolate(nn.Module): + def __init__(self, scale_factor: int = 2): + super().__init__() + self._interpolate = nn.functional.interpolate + self._scale_factor = scale_factor + + # pylint: disable=arguments-differ + def forward(self, x): + return self._interpolate(x, scale_factor=self._scale_factor, mode='bilinear', align_corners=False) + + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/state_prediction.py b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/state_prediction.py new file mode 100644 index 0000000000000000000000000000000000000000..bf9e94c65658ddcc5c37335972a9e586b7f7b09c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/state_prediction.py @@ -0,0 +1,37 @@ + +import torch + +from .layers import Bottleneck +from .layers import SpatialGRU + + +class FuturePrediction(torch.nn.Module): + def __init__(self, in_channels, latent_dim, n_gru_blocks=3, n_res_layers=3): + super().__init__() + self.n_gru_blocks = n_gru_blocks + + # Convolutional recurrent model with z_t as an initial hidden state and inputs the sample + # from the probabilistic model. 
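SpatialGRU above replaces the usual fully connected GRU gates with 3x3 convolutions over BEV feature maps. The stripped-down cell below keeps the same gating arithmetic as gru_cell; channel sizes are illustrative and the candidate-state ConvBlock is reduced to a plain conv + tanh, so this is a sketch rather than the module itself.

import torch
import torch.nn as nn

class TinyConvGRUCell(nn.Module):
    # Minimal sketch of SpatialGRU.gru_cell: conv gates + convex combination of states.
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.conv_update = nn.Conv2d(input_size + hidden_size, hidden_size, 3, padding=1)
        self.conv_reset = nn.Conv2d(input_size + hidden_size, hidden_size, 3, padding=1)
        self.conv_state_tilde = nn.Conv2d(input_size + hidden_size, hidden_size, 3, padding=1)

    def forward(self, x, state):
        x_and_state = torch.cat([x, state], dim=1)
        update_gate = torch.sigmoid(self.conv_update(x_and_state))
        reset_gate = torch.sigmoid(self.conv_reset(x_and_state))
        state_tilde = torch.tanh(
            self.conv_state_tilde(torch.cat([x, (1.0 - reset_gate) * state], dim=1)))
        return (1.0 - update_gate) * state + update_gate * state_tilde

cell = TinyConvGRUCell(input_size=8, hidden_size=16)
x = torch.randn(2, 8, 10, 10)
state = torch.zeros(2, 16, 10, 10)
print(cell(x, state).shape)  # torch.Size([2, 16, 10, 10])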
The architecture of the model is: + # [Spatial GRU - [Bottleneck] x n_res_layers] x n_gru_blocks + self.spatial_grus = [] + self.res_blocks = [] + + for i in range(self.n_gru_blocks): + gru_in_channels = latent_dim if i == 0 else in_channels + self.spatial_grus.append(SpatialGRU(gru_in_channels, in_channels)) + self.res_blocks.append(torch.nn.Sequential(*[Bottleneck(in_channels) + for _ in range(n_res_layers)])) + + self.spatial_grus = torch.nn.ModuleList(self.spatial_grus) + self.res_blocks = torch.nn.ModuleList(self.res_blocks) + + def forward(self, x, hidden_state): + # x has shape (b, n_future, c, h, w), hidden_state (b, c, h, w) + for i in range(self.n_gru_blocks): + x = self.spatial_grus[i](x, hidden_state, flow=None) + b, n_future, c, h, w = x.shape + + x = self.res_blocks[i](x.view(b * n_future, c, h, w)) + x = x.view(b, n_future, c, h, w) + + return x diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..081ed0325719dcdefec0f2003d38d35e21362cb7 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__init__.py @@ -0,0 +1 @@ +from .custom_hooks import TransferWeight, CustomSetEpochInfoHook \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65652b63ff5f4240d9ee7537cdff876a005c6cc8 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/custom_hooks.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/custom_hooks.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ef022782f399c448b4d218531d5fcd54e30fe80 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/custom_hooks.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/custom_hooks.py b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/custom_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..93ce7a27a3ae798aa37b83f1c4a11f08fa0edb93 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/custom_hooks.py @@ -0,0 +1,26 @@ +from mmcv.runner.hooks.hook import HOOKS, Hook +from projects.mmdet3d_plugin.models.utils import run_time +from mmcv.parallel import is_module_wrapper + + +@HOOKS.register_module() +class TransferWeight(Hook): + + def __init__(self, every_n_inters=1): + self.every_n_inters=every_n_inters + + def after_train_iter(self, runner): + if self.every_n_inner_iters(runner, self.every_n_inters): + runner.eval_model.load_state_dict(runner.model.state_dict()) + +@HOOKS.register_module() +class CustomSetEpochInfoHook(Hook): + """Set runner's epoch information to the model.""" + + def before_train_epoch(self, runner): + epoch = runner.epoch + model = runner.model + if is_module_wrapper(model): + model = model.module + model.set_epoch(epoch) + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8d49802f796a43b6d323b6c7a62b0377867cc057 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__init__.py @@ -0,0 +1,5 @@ +from 
.transformer import PerceptionTransformer +from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D +from .temporal_self_attention import TemporalSelfAttention +from .encoder import BEVFormerEncoder, BEVFormerLayer +from .decoder import DetectionTransformerDecoder \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c447eb7e44e7fe9980801db9dee872db4c179040 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/custom_base_transformer_layer.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/custom_base_transformer_layer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..059ae5c20ab7afcd09d460091de7fd9985ab4803 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/custom_base_transformer_layer.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/decoder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/decoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1aec994047d2e23bc767110b139636cfd35ea57 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/decoder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/encoder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/encoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae19d0fcc772be7be8b44e9d88162f7aced9cdf3 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/encoder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/multi_scale_deformable_attn_function.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/multi_scale_deformable_attn_function.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5579a07e4ca4d8f5a009f5714375f72354da2b3b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/multi_scale_deformable_attn_function.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/spatial_cross_attention.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/spatial_cross_attention.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..428200cbd28f549c7f2ec49dd1f44bda898d3f6e Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/spatial_cross_attention.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/temporal_self_attention.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/temporal_self_attention.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8da0fd1a7f4ac3a01b80576e2a2f4b8b07610c58 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/temporal_self_attention.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/transformer.cpython-38.pyc 
b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/transformer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65c35b82974c48e4a57b27484bdbe5b16c322ee9 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/transformer.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/custom_base_transformer_layer.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/custom_base_transformer_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..0a040177285ceccf5b291c718f3cdae4587da9a2 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/custom_base_transformer_layer.py @@ -0,0 +1,254 @@ +import copy +import warnings + +import torch +import torch.nn as nn + +from mmcv import ConfigDict, deprecated_api_warning +from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv-full`` if you need this module. ') +from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention + + +@TRANSFORMER_LAYER.register_module() +class MyCustomBaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). 
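Layers such as MyCustomBaseTransformerLayer below are normally built from mmcv-style config dicts rather than positional arguments. An illustrative layer spec showing how attn_cfgs, ffn_cfgs and operation_order fit together; the attention types, channel widths and drop rates here are assumptions, not values copied from the repo's configs:

# Illustrative mmcv-style layer config; values are assumptions, not GenAD defaults.
layer_cfg = dict(
    type='MyCustomBaseTransformerLayer',
    attn_cfgs=[
        dict(type='TemporalSelfAttention', embed_dims=256),
        dict(type='SpatialCrossAttention', embed_dims=256),
    ],
    ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=512,
                  num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True)),
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'),
    norm_cfg=dict(type='LN'),
    batch_first=True,
)

# The constructor derives the number of attention modules from operation_order
# and requires it to match len(attn_cfgs).
num_attn = (layer_cfg['operation_order'].count('self_attn')
            + layer_cfg['operation_order'].count('cross_attn'))
assert num_attn == len(layer_cfg['attn_cfgs'])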
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=True, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index])) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. 
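A small standalone sketch of how the constructor below normalises its arguments: a single attn_cfgs dict is replicated once per attention entry in operation_order, and pre-norm is detected from the first element. The attention config used here is illustrative only.

import copy

operation_order = ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
attn_cfgs = dict(type='MultiheadAttention', embed_dims=256, num_heads=8)  # illustrative

num_attn = operation_order.count('self_attn') + operation_order.count('cross_attn')
if isinstance(attn_cfgs, dict):
    attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]

pre_norm = operation_order[0] == 'norm'  # True only for pre-norm layers
print(num_attn, len(attn_cfgs), pre_norm)  # 2 2 False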
+ Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/decoder.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..7d982baf1adab4d3b8ac41e1a6dc7622512f16f3 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/decoder.py @@ -0,0 +1,339 @@ +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import mmcv +import cv2 as cv +import copy +import warnings +from matplotlib import pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import 
MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(DetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 3 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + new_reference_points[..., 2:3] = tmp[ + ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@ATTENTION.register_module() +class CustomMSDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. 
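The iterative box refinement in DetectionTransformerDecoder above updates each reference point in logit space: the regression branch predicts an offset, it is added to inverse_sigmoid of the current point, and the result is squashed back to [0, 1] and detached before the next layer. A one-step numeric sketch; the reference point, the 10-channel regression layout and the offsets are made-up values for illustration:

import torch

def inverse_sigmoid(x, eps=1e-5):
    # Same helper as above.
    x = x.clamp(min=0, max=1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))

reference_points = torch.tensor([[[0.40, 0.60, 0.50]]])  # (bs, num_query, 3): x, y, z
tmp = torch.zeros(1, 1, 10)                               # regression output (x, y at :2, z at 4:5)
tmp[..., :2] = 0.25                                       # predicted xy offset in logit space
tmp[..., 4:5] = -0.10                                     # predicted z offset in logit space

new_ref = torch.zeros_like(reference_points)
new_ref[..., :2] = (tmp[..., :2] + inverse_sigmoid(reference_points[..., :2])).sigmoid()
new_ref[..., 2:3] = (tmp[..., 4:5] + inverse_sigmoid(reference_points[..., 2:3])).sigmoid()
reference_points = new_ref.detach()  # stop gradients between decoder layers
print(reference_points)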
+ num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. 
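The two linear heads built in __init__ below predict, per query, one 2-D sampling offset and one attention weight for every (head, level, point) combination. A shape-level sketch of that bookkeeping, mirroring the reshaping done in the forward pass; batch, query count and num_levels are toy values:

import torch
import torch.nn as nn

embed_dims, num_heads, num_levels, num_points = 256, 8, 1, 4
bs, num_query = 2, 300

sampling_offsets = nn.Linear(embed_dims, num_heads * num_levels * num_points * 2)
attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points)

query = torch.randn(bs, num_query, embed_dims)
offsets = sampling_offsets(query).view(bs, num_query, num_heads, num_levels, num_points, 2)
weights = attention_weights(query).view(bs, num_query, num_heads, num_levels * num_points)
weights = weights.softmax(-1).view(bs, num_query, num_heads, num_levels, num_points)
print(offsets.shape, weights.shape)
# torch.Size([2, 300, 8, 1, 4, 2]) torch.Size([2, 300, 8, 1, 4])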
If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/encoder.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/encoder.py new file mode 100644 index 
0000000000000000000000000000000000000000..27b34c3ed041d84282ade8015b98c28b379253f8 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/encoder.py @@ -0,0 +1,396 @@ +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from .custom_base_transformer_layer import MyCustomBaseTransformerLayer +import copy +import warnings +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmcv.runner import force_fp32, auto_fp16 +import numpy as np +import torch +import cv2 as cv +import mmcv +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class BEVFormerEncoder(TransformerLayerSequence): + + """ + Attention with both self and cross + Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', + **kwargs): + + super(BEVFormerEncoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + self.num_points_in_pillar = num_points_in_pillar + self.pc_range = pc_range + self.fp16_enabled = False + + @staticmethod + def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): + """Get the reference points used in SCA and TSA. + Args: + H, W: spatial shape of bev. + Z: hight of pillar. + D: sample D points uniformly from each pillar. + device (obj:`device`): The device where + reference_points should be. + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). + """ + + # reference points in 3D space, used in spatial cross-attention (SCA) + if dim == '3d': + zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, + device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z + xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, + device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W + ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, + device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H + ref_3d = torch.stack((xs, ys, zs), -1) + ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) + ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) + return ref_3d + + # reference points on 2D bev plane, used in temporal self-attention (TSA). + elif dim == '2d': + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=dtype, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=dtype, device=device) + ) + ref_y = ref_y.reshape(-1)[None] / H + ref_x = ref_x.reshape(-1)[None] / W + ref_2d = torch.stack((ref_x, ref_y), -1) + ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) + return ref_2d + + # This function must use fp32!!! 
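+    # Illustrative outline of the projection performed below: each normalized
+    # reference point (x, y, z) in [0, 1] is rescaled to metric lidar
+    # coordinates with pc_range, lifted to homogeneous form and projected into
+    # every camera with its lidar2img matrix,
+    #     p_cam = lidar2img @ [x_m, y_m, z_m, 1]^T
+    #     u, v  = p_cam[0] / p_cam[2], p_cam[1] / p_cam[2]
+    # after which (u, v) are normalized by the image width/height. Points with
+    # depth p_cam[2] <= eps or with (u, v) outside (0, 1) are masked out via
+    # bev_mask.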
+ @force_fp32(apply_to=('reference_points', 'img_metas')) + def point_sampling(self, reference_points, pc_range, img_metas): + + lidar2img = [] + for img_meta in img_metas: + lidar2img.append(img_meta['lidar2img']) + lidar2img = np.asarray(lidar2img) + lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4) + reference_points = reference_points.clone() + + reference_points[..., 0:1] = reference_points[..., 0:1] * \ + (pc_range[3] - pc_range[0]) + pc_range[0] + reference_points[..., 1:2] = reference_points[..., 1:2] * \ + (pc_range[4] - pc_range[1]) + pc_range[1] + reference_points[..., 2:3] = reference_points[..., 2:3] * \ + (pc_range[5] - pc_range[2]) + pc_range[2] + + reference_points = torch.cat( + (reference_points, torch.ones_like(reference_points[..., :1])), -1) + + reference_points = reference_points.permute(1, 0, 2, 3) + D, B, num_query = reference_points.size()[:3] + num_cam = lidar2img.size(1) + + reference_points = reference_points.view( + D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) + + lidar2img = lidar2img.view( + 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) + + reference_points_cam = torch.matmul(lidar2img.to(torch.float32), + reference_points.to(torch.float32)).squeeze(-1) + eps = 1e-5 + + bev_mask = (reference_points_cam[..., 2:3] > eps) + reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( + reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) + + reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] + reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] + + bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) + & (reference_points_cam[..., 1:2] < 1.0) + & (reference_points_cam[..., 0:1] < 1.0) + & (reference_points_cam[..., 0:1] > 0.0)) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + bev_mask = torch.nan_to_num(bev_mask) + else: + bev_mask = bev_mask.new_tensor( + np.nan_to_num(bev_mask.cpu().numpy())) + + reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) + bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) + + return reference_points_cam, bev_mask + + @auto_fp16() + def forward(self, + bev_query, + key, + value, + *args, + bev_h=None, + bev_w=None, + bev_pos=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + prev_bev=None, + shift=0., + **kwargs): + """Forward function for `TransformerDecoder`. + Args: + bev_query (Tensor): Input BEV query with shape + `(num_query, bs, embed_dims)`. + key & value (Tensor): Input multi-cameta features with shape + (num_cam, num_value, bs, embed_dims) + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
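+            Note (summary of the implementation below): when ``prev_bev`` is
+            provided, the previous BEV and the current BEV queries are stacked
+            along the batch dimension (bs * 2) together with the shifted and
+            the original 2D reference points, so that the temporal
+            self-attention in every layer can attend to both frames.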
+ """ + + output = bev_query + intermediate = [] + + ref_3d = self.get_reference_points( + bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + ref_2d = self.get_reference_points( + bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + + reference_points_cam, bev_mask = self.point_sampling( + ref_3d, self.pc_range, kwargs['img_metas']) + + # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. + shift_ref_2d = ref_2d # .clone() + shift_ref_2d += shift[:, None, None, :] + + # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) + bev_query = bev_query.permute(1, 0, 2) + bev_pos = bev_pos.permute(1, 0, 2) + bs, len_bev, num_bev_level, _ = ref_2d.shape + if prev_bev is not None: + prev_bev = prev_bev.permute(1, 0, 2) + prev_bev = torch.stack( + [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) + hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + else: + hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + + for lid, layer in enumerate(self.layers): + output = layer( + bev_query, + key, + value, + *args, + bev_pos=bev_pos, + ref_2d=hybird_ref_2d, + ref_3d=ref_3d, + bev_h=bev_h, + bev_w=bev_w, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + reference_points_cam=reference_points_cam, + bev_mask=bev_mask, + prev_bev=prev_bev, + **kwargs) + + bev_query = output + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output + + +@TRANSFORMER_LAYER.register_module() +class BEVFormerLayer(MyCustomBaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. 
+ """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(BEVFormerLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + self.fp16_enabled = False + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) + + def forward(self, + query, + key=None, + value=None, + bev_pos=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ref_2d=None, + ref_3d=None, + bev_h=None, + bev_w=None, + reference_points_cam=None, + mask=None, + spatial_shapes=None, + level_start_index=None, + prev_bev=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + # temporal self attention + if layer == 'self_attn': + + query = self.attentions[attn_index]( + query, + prev_bev, + prev_bev, + identity if self.pre_norm else None, + query_pos=bev_pos, + key_pos=bev_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + reference_points=ref_2d, + spatial_shapes=torch.tensor( + [[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + # spaital cross attention + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + reference_points=ref_3d, + reference_points_cam=reference_points_cam, + mask=mask, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/multi_scale_deformable_attn_function.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/multi_scale_deformable_attn_function.py new file mode 100644 index 0000000000000000000000000000000000000000..613dd7c41dd61bc4765ceac7bc45c8e9add07fe6 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/multi_scale_deformable_attn_function.py @@ -0,0 +1,157 @@ +import torch +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.autograd.function import Function, once_differentiable +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction_fp16(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. 
+ + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +class MultiScaleDeformableAttnFunction_fp32(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. 
+ """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/spatial_cross_attention.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/spatial_cross_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..3362ea053f28b0297d2830ca337722f8978e7b71 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/spatial_cross_attention.py @@ -0,0 +1,393 @@ + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import build_attention +import math +from mmcv.runner import force_fp32, auto_fp16 + +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class SpatialCrossAttention(BaseModule): + """An attention module used in BEVFormer. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_cams (int): The number of cameras + dropout (float): A Dropout layer on `inp_residual`. + Default: 0.. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + deformable_attention: (dict): The config for the deformable attention used in SCA. + """ + + def __init__(self, + embed_dims=256, + num_cams=6, + pc_range=None, + dropout=0.1, + init_cfg=None, + batch_first=False, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=256, + num_levels=4), + **kwargs + ): + super(SpatialCrossAttention, self).__init__(init_cfg) + + self.init_cfg = init_cfg + self.dropout = nn.Dropout(dropout) + self.pc_range = pc_range + self.fp16_enabled = False + self.deformable_attention = build_attention(deformable_attention) + self.embed_dims = embed_dims + self.num_cams = num_cams + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.batch_first = batch_first + self.init_weight() + + def init_weight(self): + """Default initialization for Parameters of Module.""" + xavier_init(self.output_proj, distribution='uniform', bias=0.) + + @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) + def forward(self, + query, + key, + value, + residual=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + reference_points_cam=None, + bev_mask=None, + level_start_index=None, + flag='encoder', + **kwargs): + """Forward Function of Detr3DCrossAtten. 
+ Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. (B, N, C, H, W) + residual (Tensor): The tensor used for addition, with the + same shape as `x`. Default None. If None, `x` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, 4), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different level. With shape (num_levels, 2), + last dimension represent (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + + if residual is None: + inp_residual = query + slots = torch.zeros_like(query) + if query_pos is not None: + query = query + query_pos + + bs, num_query, _ = query.size() + + D = reference_points_cam.size(3) + indexes = [] + for i, mask_per_img in enumerate(bev_mask): + index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) + indexes.append(index_query_per_img) + max_len = max([len(each) for each in indexes]) + + # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. + queries_rebatch = query.new_zeros( + [bs, self.num_cams, max_len, self.embed_dims]) + reference_points_rebatch = reference_points_cam.new_zeros( + [bs, self.num_cams, max_len, D, 2]) + + for j in range(bs): + for i, reference_points_per_img in enumerate(reference_points_cam): + index_query_per_img = indexes[i] + queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] + reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] + + num_cams, l, bs, embed_dims = key.shape + + key = key.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + value = value.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + + queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, + reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, + level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) + for j in range(bs): + for i, index_query_per_img in enumerate(indexes): + slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] + + count = bev_mask.sum(-1) > 0 + count = count.permute(1, 2, 0).sum(-1) + count = torch.clamp(count, min=1.0) + slots = slots / count[..., None] + slots = self.output_proj(slots) + + return self.dropout(slots) + inp_residual + + +@ATTENTION.register_module() +class MSDeformableAttention3D(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. 
+ `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=8, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.batch_first = batch_first + self.output_proj = None + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + ( bs, num_query, embed_dims). + key (Tensor): The key tensor with shape + `(bs, num_key, embed_dims)`. 
+ value (Tensor): The value tensor with shape + `(bs, num_key, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + + if reference_points.shape[-1] == 2: + """ + For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. + After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. + For each referent point, we sample `num_points` sampling points. + For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. 
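+            Illustrative numbers (assuming the defaults in this file, i.e.
+            8 predicted offsets per head and level and 4 Z anchors from
+            `num_points_in_pillar`): each projected reference point receives
+            8 // 4 = 2 sampling points, which matches the reshape performed
+            below (num_all_points // num_Z_anchors).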
+ """ + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + + bs, num_query, num_Z_anchors, xy = reference_points.shape + reference_points = reference_points[:, :, None, None, None, :, :] + sampling_offsets = sampling_offsets / \ + offset_normalizer[None, None, None, :, None, :] + bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape + sampling_offsets = sampling_offsets.view( + bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) + sampling_locations = reference_points + sampling_offsets + bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape + assert num_all_points == num_points * num_Z_anchors + + sampling_locations = sampling_locations.view( + bs, num_query, num_heads, num_levels, num_all_points, xy) + + elif reference_points.shape[-1] == 4: + assert False + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + + # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 + # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points + # + + if torch.cuda.is_available() and value.is_cuda: + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + if not self.batch_first: + output = output.permute(1, 0, 2) + + return output diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/temporal_self_attention.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/temporal_self_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..f5151ad7078fb4a93ba45d597f8a936f64bc6ba4 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/temporal_self_attention.py @@ -0,0 +1,266 @@ +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import ATTENTION +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class TemporalSelfAttention(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. 
+ dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to True. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. + the length of BEV queue is 2. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + num_bev_queue=2, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.num_bev_queue = num_bev_queue + self.sampling_offsets = nn.Linear( + embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, + num_bev_queue*num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) + + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. 
If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + assert self.batch_first + bs, len_bev, c = query.shape + value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) + + # value = torch.cat([query, query], 0) + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + bs, num_query, embed_dims = query.shape + _, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + assert self.num_bev_queue == 2 + + query = torch.cat([value[:bs], query], -1) + value = self.value_proj(value) + + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + + value = value.reshape(bs*self.num_bev_queue, + num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query) + sampling_offsets = sampling_offsets.view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_bev_queue, + self.num_levels, + self.num_points) + + attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() + sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = 
MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + # output shape (bs*num_bev_queue, num_query, embed_dims) + # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) + output = output.permute(1, 2, 0) + + # fuse history value and current value + # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) + output = output.view(num_query, embed_dims, bs, self.num_bev_queue) + output = output.mean(-1) + + # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) + output = output.permute(2, 0, 1) + + output = self.output_proj(output) + + if not self.batch_first: + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/transformer.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..55f0a15009e5bf3ac806feda0d9adefd7e16dea3 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/transformer.py @@ -0,0 +1,283 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from mmcv.runner.base_module import BaseModule + +from mmdet.models.utils.builder import TRANSFORMER +from torch.nn.init import normal_ +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.runner.base_module import BaseModule +from torchvision.transforms.functional import rotate +from .temporal_self_attention import TemporalSelfAttention +from .spatial_cross_attention import MSDeformableAttention3D +from .decoder import CustomMSDeformableAttention +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from mmcv.runner import force_fp32, auto_fp16 + + +@TRANSFORMER.register_module() +class PerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
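+        Example (illustrative config; the nested encoder/decoder settings are
+        placeholders):
+            transformer=dict(
+                type='PerceptionTransformer',
+                embed_dims=256,
+                num_cams=6,
+                encoder=dict(type='BEVFormerEncoder', ...),
+                decoder=dict(type='DetectionTransformerDecoder', ...))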
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + **kwargs): + super(PerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.init_layers() + self.rotate_center = rotate_center + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. + bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. 
+ reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + + inter_references_out = inter_references + + return bev_embed, inter_states, init_reference_out, inter_references_out diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/planner/__pycache__/metric_stp3.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/planner/__pycache__/metric_stp3.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f21912ce8b3dcef93b1ba06b2be744e63605d8 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/planner/__pycache__/metric_stp3.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/planner/metric_stp3.py b/GenAD-main/projects/mmdet3d_plugin/VAD/planner/metric_stp3.py new file mode 100644 index 0000000000000000000000000000000000000000..da3ac1a9b4969130759d84091e0251b4d78370b2 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/planner/metric_stp3.py @@ -0,0 +1,351 @@ +''' +calculate planner metric same as stp3 +''' +import numpy as np +import torch +import cv2 +import copy +import matplotlib.pyplot as plt +from projects.mmdet3d_plugin.core.evaluation.metric_motion import get_ade, get_fde +from skimage.draw import polygon +from nuscenes.utils.data_classes import Box +from scipy.spatial.transform import Rotation as R + +ego_width, ego_length = 1.85, 4.084 + +class PlanningMetric(): + def __init__(self): + super().__init__() + self.X_BOUND 
= [-50.0, 50.0, 0.5] # Forward + self.Y_BOUND = [-50.0, 50.0, 0.5] # Sides + self.Z_BOUND = [-10.0, 10.0, 20.0] # Height + dx, bx, _ = self.gen_dx_bx(self.X_BOUND, self.Y_BOUND, self.Z_BOUND) + self.dx, self.bx = dx[:2], bx[:2] + + bev_resolution, bev_start_position, bev_dimension = self.calculate_birds_eye_view_parameters( + self.X_BOUND, self.Y_BOUND, self.Z_BOUND + ) + self.bev_resolution = bev_resolution.numpy() + self.bev_start_position = bev_start_position.numpy() + self.bev_dimension = bev_dimension.numpy() + + self.W = ego_width + self.H = ego_length + + self.category_index = { + 'human':[2,3,4,5,6,7,8], + 'vehicle':[14,15,16,17,18,19,20,21,22,23] + } + + # self.n_future = n_future + + # self.add_state("obj_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") + # self.add_state("obj_box_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") + # self.add_state("L2", default=torch.zeros(self.n_future),dist_reduce_fx="sum") + # self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") + + def gen_dx_bx(self, xbound, ybound, zbound): + dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) + bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) + nx = torch.LongTensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) + + return dx, bx, nx + + def calculate_birds_eye_view_parameters(self, x_bounds, y_bounds, z_bounds): + """ + Parameters + ---------- + x_bounds: Forward direction in the ego-car. + y_bounds: Sides + z_bounds: Height + + Returns + ------- + bev_resolution: Bird's-eye view bev_resolution + bev_start_position Bird's-eye view first element + bev_dimension Bird's-eye view tensor spatial dimension + """ + bev_resolution = torch.tensor([row[2] for row in [x_bounds, y_bounds, z_bounds]]) + bev_start_position = torch.tensor([row[0] + row[2] / 2.0 for row in [x_bounds, y_bounds, z_bounds]]) + bev_dimension = torch.tensor([(row[1] - row[0]) / row[2] for row in [x_bounds, y_bounds, z_bounds]], + dtype=torch.long) + + return bev_resolution, bev_start_position, bev_dimension + + def get_label( + self, + gt_agent_boxes, + gt_agent_feats + ): + segmentation_np, pedestrian_np = self.get_birds_eye_view_label(gt_agent_boxes,gt_agent_feats) + segmentation = torch.from_numpy(segmentation_np).long().unsqueeze(0) + pedestrian = torch.from_numpy(pedestrian_np).long().unsqueeze(0) + + return segmentation, pedestrian + + def get_birds_eye_view_label( + self, + gt_agent_boxes, + gt_agent_feats + ): + ''' + gt_agent_boxes (LiDARInstance3DBoxes): list of GT Bboxs. 
+ dim 9 = (x,y,z)+(w,l,h)+yaw+(vx,vy) + gt_agent_feats: (B, A, 34) + dim 34 = fut_traj(6*2) + fut_mask(6) + goal(1) + lcf_feat(9) + fut_yaw(6) + lcf_feat (x, y, yaw, vx, vy, width, length, height, type) + ego_lcf_feats: (B, 9) + dim 8 = (vx, vy, ax, ay, w, length, width, vel, steer) + ''' + T = 6 + segmentation = np.zeros((T,self.bev_dimension[0], self.bev_dimension[1])) + pedestrian = np.zeros((T,self.bev_dimension[0], self.bev_dimension[1])) + agent_num = gt_agent_feats.shape[1] + + gt_agent_boxes = gt_agent_boxes.tensor.cpu().numpy() #(N, 9) + gt_agent_feats = gt_agent_feats.cpu().numpy() + + gt_agent_fut_trajs = gt_agent_feats[..., :T*2].reshape(-1, 6, 2) + gt_agent_fut_mask = gt_agent_feats[..., T*2:T*3].reshape(-1, 6) + # gt_agent_lcf_feat = gt_agent_feats[..., T*3+1:T*3+10].reshape(-1, 9) + gt_agent_fut_yaw = gt_agent_feats[..., T*3+10:T*4+10].reshape(-1, 6, 1) + gt_agent_fut_trajs = np.cumsum(gt_agent_fut_trajs, axis=1) + gt_agent_fut_yaw = np.cumsum(gt_agent_fut_yaw, axis=1) + + gt_agent_boxes[:,6:7] = -1*(gt_agent_boxes[:,6:7] + np.pi/2) # NOTE: convert yaw to lidar frame + gt_agent_fut_trajs = gt_agent_fut_trajs + gt_agent_boxes[:, np.newaxis, 0:2] + gt_agent_fut_yaw = gt_agent_fut_yaw + gt_agent_boxes[:, np.newaxis, 6:7] + + for t in range(T): + for i in range(agent_num): + if gt_agent_fut_mask[i][t] == 1: + # Filter out all non vehicle instances + category_index = int(gt_agent_feats[0,i][27]) + agent_length, agent_width = gt_agent_boxes[i][4], gt_agent_boxes[i][3] + x_a = gt_agent_fut_trajs[i, t, 0] + y_a = gt_agent_fut_trajs[i, t, 1] + yaw_a = gt_agent_fut_yaw[i, t, 0] + param = [x_a,y_a,yaw_a,agent_length, agent_width] + if (category_index in self.category_index['vehicle']): + poly_region = self._get_poly_region_in_image(param) + cv2.fillPoly(segmentation[t], [poly_region], 1.0) + if (category_index in self.category_index['human']): + poly_region = self._get_poly_region_in_image(param) + cv2.fillPoly(pedestrian[t], [poly_region], 1.0) + + # vis for debug + # plt.figure('debug') + # for i in range(T): + # plt.subplot(2,T,i+1) + # plt.imshow(segmentation[i]) + # plt.subplot(2,T,i+1+T) + # plt.imshow(pedestrian[i]) + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/car_ped_occ.jpg') + # plt.close() + + return segmentation, pedestrian + + def _get_poly_region_in_image(self,param): + lidar2cv_rot = np.array([[1,0], [0,-1]]) + x_a,y_a,yaw_a,agent_length, agent_width = param + trans_a = np.array([[x_a,y_a]]).T + rot_mat_a = np.array([[np.cos(yaw_a), -np.sin(yaw_a)], + [np.sin(yaw_a), np.cos(yaw_a)]]) + agent_corner = np.array([ + [agent_length/2, -agent_length/2, -agent_length/2, agent_length/2], + [agent_width/2, agent_width/2, -agent_width/2, -agent_width/2]]) #(2,4) + agent_corner_lidar = np.matmul(rot_mat_a, agent_corner) + trans_a #(2,4) + # convert to cv frame + agent_corner_cv2 = (np.matmul(lidar2cv_rot, agent_corner_lidar) \ + - self.bev_start_position[:2,None] + self.bev_resolution[:2,None] / 2.0).T / self.bev_resolution[:2] #(4,2) + agent_corner_cv2 = np.round(agent_corner_cv2).astype(np.int32) + + return agent_corner_cv2 + + + def evaluate_single_coll(self, traj, segmentation, input_gt): + ''' + traj: torch.Tensor (n_future, 2) + 自车lidar系为轨迹参考系 + ^ y + | + | + 0-------> + x + segmentation: torch.Tensor (n_future, 200, 200) + ''' + pts = np.array([ + [-self.H / 2. + 0.5, self.W / 2.], + [self.H / 2. + 0.5, self.W / 2.], + [self.H / 2. + 0.5, -self.W / 2.], + [-self.H / 2. 
+ 0.5, -self.W / 2.], + ]) + pts = (pts - self.bx.cpu().numpy()) / (self.dx.cpu().numpy()) + pts[:, [0, 1]] = pts[:, [1, 0]] + rr, cc = polygon(pts[:,1], pts[:,0]) + rc = np.concatenate([rr[:,None], cc[:,None]], axis=-1) + + n_future, _ = traj.shape + trajs = traj.view(n_future, 1, 2) + # 轨迹坐标系转换为: + # ^ x + # | + # | + # 0-------> y + trajs_ = copy.deepcopy(trajs) + trajs_[:,:,[0,1]] = trajs_[:,:,[1,0]] # can also change original tensor + trajs_ = trajs_ / self.dx.to(trajs.device) + trajs_ = trajs_.cpu().numpy() + rc # (n_future, 32, 2) + + r = (self.bev_dimension[0] - trajs_[:,:,0]).astype(np.int32) + r = np.clip(r, 0, self.bev_dimension[0] - 1) + + c = trajs_[:,:,1].astype(np.int32) + c = np.clip(c, 0, self.bev_dimension[1] - 1) + + collision = np.full(n_future, False) + for t in range(n_future): + rr = r[t] + cc = c[t] + I = np.logical_and( + np.logical_and(rr >= 0, rr < self.bev_dimension[0]), + np.logical_and(cc >= 0, cc < self.bev_dimension[1]), + ) + collision[t] = np.any(segmentation[t, rr[I], cc[I]].cpu().numpy()) + + # vis for debug + # obs_occ = copy.deepcopy(segmentation) + # ego_occ = torch.zeros_like(obs_occ) + # for t in range(n_future): + # rr = r[t] + # cc = c[t] + # I = np.logical_and( + # np.logical_and(rr >= 0, rr < self.bev_dimension[0]), + # np.logical_and(cc >= 0, cc < self.bev_dimension[1]), + # ) + # ego_occ[t, rr[I], cc[I]]=1 + + # plt.figure() + # for i in range(6): + # plt.subplot(2,6,i+1) + # plt.imshow(obs_occ[i]) + # plt.subplot(2,6,i+7) + # plt.imshow(ego_occ[i]) + # if input_gt: + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/occ_metric_stp3_gt.jpg') + # else: + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/occ_metric_stp3_pred.jpg') + # plt.close() + + return torch.from_numpy(collision).to(device=traj.device) + + def evaluate_coll( + self, + trajs, + gt_trajs, + segmentation + ): + ''' + trajs: torch.Tensor (B, n_future, 2) + 自车lidar系为轨迹参考系 + ^ y + | + | + 0-------> + x + gt_trajs: torch.Tensor (B, n_future, 2) + segmentation: torch.Tensor (B, n_future, 200, 200) + + ''' + B, n_future, _ = trajs.shape + # trajs = trajs * torch.tensor([-1, 1], device=trajs.device) + # gt_trajs = gt_trajs * torch.tensor([-1, 1], device=gt_trajs.device) + + obj_coll_sum = torch.zeros(n_future, device=segmentation.device) + obj_box_coll_sum = torch.zeros(n_future, device=segmentation.device) + + for i in range(B): + gt_box_coll = self.evaluate_single_coll(gt_trajs[i], segmentation[i], input_gt=True) + + xx, yy = trajs[i,:,0], trajs[i, :, 1] + # lidar系下的轨迹转换到图片坐标系下 + xi = ((-self.bx[0]/2 - yy) / self.dx[0]).long() + yi = ((-self.bx[1]/2 + xx) / self.dx[1]).long() + + m1 = torch.logical_and( + torch.logical_and(xi >= 0, xi < self.bev_dimension[0]), + torch.logical_and(yi >= 0, yi < self.bev_dimension[1]), + ).to(gt_box_coll.device) + m1 = torch.logical_and(m1, torch.logical_not(gt_box_coll)) + + ti = torch.arange(n_future) + obj_coll_sum[ti[m1]] += segmentation[i, ti[m1], xi[m1], yi[m1]].long() + + m2 = torch.logical_not(gt_box_coll) + box_coll = self.evaluate_single_coll(trajs[i], segmentation[i], input_gt=False).to(ti.device) + obj_box_coll_sum[ti[m2]] += (box_coll[ti[m2]]).long() + + return obj_coll_sum, obj_box_coll_sum + + def compute_L2(self, trajs, gt_trajs): + ''' + trajs: torch.Tensor (n_future, 2) + gt_trajs: torch.Tensor (n_future, 2) + ''' + # return torch.sqrt(((trajs[:, :, :2] - gt_trajs[:, :, :2]) ** 2).sum(dim=-1)) + pred_len = trajs.shape[0] + ade = float( + sum( + torch.sqrt( + (trajs[i, 0] - gt_trajs[i, 0]) ** 2 + + (trajs[i, 1] - 
gt_trajs[i, 1]) ** 2 + ) + for i in range(pred_len) + ) + / pred_len + ) + + return ade + + def compute_L2_stp3(self, trajs, gt_trajs): + ''' + trajs: torch.Tensor (n_future, 2) + gt_trajs: torch.Tensor (n_future, 2) + ''' + # return torch.sqrt(((trajs[:, :, :2] - gt_trajs[:, :, :2]) ** 2).sum(dim=-1)) + pred_len = trajs.shape[0] + ade = float( + torch.sqrt( + (trajs[-1, 0] - gt_trajs[-1, 0]) ** 2 + + (trajs[-1, 1] - gt_trajs[-1, 1]) ** 2 + ) + ) + return ade + + # def update(self, trajs, gt_trajs, segmentation): + # ''' + # trajs: torch.Tensor (B, n_future, 3) + # gt_trajs: torch.Tensor (B, n_future, 3) + # segmentation: torch.Tensor (B, n_future, 200, 200) + # ''' + # assert trajs.shape == gt_trajs.shape + # L2 = self.compute_L2(trajs, gt_trajs) + # obj_coll_sum, obj_box_coll_sum = self.evaluate_coll(trajs[:,:,:2], gt_trajs[:,:,:2], segmentation) + + # if torch.isnan(L2).max().item(): + # debug = 1 + # else: + # self.obj_col += obj_coll_sum + # self.obj_box_col += obj_box_coll_sum + # self.L2 += L2.sum(dim=0) + # if torch.isnan(self.L2).max().item(): + # debug=1 + # self.total +=len(trajs) + + + # def compute(self): + # return { + # 'obj_col': self.obj_col / self.total, + # 'obj_box_col': self.obj_box_col / self.total, + # 'L2' : self.L2 / self.total + # } \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03f906ce601e2dfac207af680774086067808830 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__init__.py @@ -0,0 +1 @@ +from .epoch_based_runner import EpochBasedRunner_video \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de53c43bbb77b246753a5b2322a4360cac37eadb Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/epoch_based_runner.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/epoch_based_runner.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a40300a965582c2e82141e7bf5a063b433658ff Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/epoch_based_runner.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/runner/epoch_based_runner.py b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/epoch_based_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c4c62bc576fb066f5b76ee433759339dd155cd --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/epoch_based_runner.py @@ -0,0 +1,91 @@ +import os.path as osp +import torch +import mmcv +from mmcv.runner.base_runner import BaseRunner +from mmcv.runner.epoch_based_runner import EpochBasedRunner +from mmcv.runner.builder import RUNNERS +from mmcv.runner.checkpoint import save_checkpoint +from mmcv.runner.utils import get_host_info +from pprint import pprint +from mmcv.parallel.data_container import DataContainer + + +@RUNNERS.register_module() +class EpochBasedRunner_video(EpochBasedRunner): + + ''' + # basic logic + + input_sequence = [a, b, c] # given a sequence of samples + + prev_bev = None + for each in input_sequcene[:-1] + prev_bev = 
eval_model(each, prev_bev)) # inference only. + + model(input_sequcene[-1], prev_bev) # train the last sample. + ''' + + def __init__(self, + model, + eval_model=None, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], + max_iters=None, + max_epochs=None): + super().__init__(model, + batch_processor, + optimizer, + work_dir, + logger, + meta, + max_iters, + max_epochs) + keys.append('img_metas') + self.keys = keys + self.eval_model = eval_model + self.eval_model.eval() + + def run_iter(self, data_batch, train_mode, **kwargs): + if self.batch_processor is not None: + assert False + # outputs = self.batch_processor( + # self.model, data_batch, train_mode=train_mode, **kwargs) + elif train_mode: + + num_samples = data_batch['img'].data[0].size(1) + data_list = [] + prev_bev = None + for i in range(num_samples): + data = {} + for key in self.keys: + if key not in ['img_metas', 'img', 'points']: + data[key] = data_batch[key] + else: + if key == 'img': + data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True) + elif key == 'img_metas': + data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only) + else: + assert False + data_list.append(data) + with torch.no_grad(): + for i in range(num_samples-1): + if i>0: data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs) + + data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs) + else: + assert False + # outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + + if not isinstance(outputs, dict): + raise TypeError('"batch_processor()" or "model.train_step()"' + 'and "model.val_step()" must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/CD_loss.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/CD_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..ed18fc58734de15957235443621d26c7d7785dcd --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/CD_loss.py @@ -0,0 +1,718 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn +from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss + +from mmdet.models.builder import LOSSES +from mmdet.models import weighted_loss +import mmcv +import torch.nn.functional as F +from mmdet.core.bbox.match_costs.builder import MATCH_COST +import functools + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + +@mmcv.jit(derivate=True, coderize=True) +def custom_weight_dir_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. 
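+    Note: unlike mmdet's stock reducer, `avg_factor` is required here; with
+    `reduction='mean'` the (optionally weighted) loss is summed over all
+    elements and divided by `avg_factor`, while `reduction='none'` returns the
+    weighted loss unchanged.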
+ + Args: + loss (Tensor): num_sample, num_dir + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + raise ValueError('avg_factor should not be none for OrderedPtsL1Loss') + # loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # import pdb;pdb.set_trace() + # loss = loss.permute(1,0,2,3).contiguous() + loss = loss.sum() + loss = loss / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +@mmcv.jit(derivate=True, coderize=True) +def custom_weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): num_sample, num_order, num_pts, num_coords + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + raise ValueError('avg_factor should not be none for OrderedPtsL1Loss') + # loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # import pdb;pdb.set_trace() + loss = loss.permute(1,0,2,3).contiguous() + loss = loss.sum((1,2,3)) + loss = loss / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +def custom_weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = custom_weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper + + +def custom_weighted_dir_loss(loss_func): + """Create a weighted version of a given loss function. 
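+    Note: this wrapper reduces with `custom_weight_reduce_loss`, which requires
+    `avg_factor`; the doctest below is inherited from mmdet's `weighted_loss`
+    and only illustrates the call signature (here, calls without `avg_factor`
+    raise a ValueError).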
+ + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = custom_weight_dir_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper + +@mmcv.jit(derivate=True, coderize=True) +@custom_weighted_loss +def ordered_pts_smooth_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_order, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + pred = pred.unsqueeze(1).repeat(1, target.size(1),1,1) + assert pred.size() == target.size() + loss =smooth_l1_loss(pred,target, reduction='none') + # import pdb;pdb.set_trace() + return loss + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def pts_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + +@mmcv.jit(derivate=True, coderize=True) +@custom_weighted_loss +def ordered_pts_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_order, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + pred = pred.unsqueeze(1).repeat(1, target.size(1),1,1) + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + +@mmcv.jit(derivate=True, coderize=True) +@custom_weighted_dir_loss +def pts_dir_cos_loss(pred, target): + """ Dir cosine similiarity loss + pred (torch.Tensor): shape [num_samples, num_dir, num_coords] + target (torch.Tensor): shape [num_samples, num_dir, num_coords] + + """ + if target.numel() == 0: + return pred.sum() * 0 + # import pdb;pdb.set_trace() + num_samples, num_dir, num_coords = pred.shape + loss_func = torch.nn.CosineEmbeddingLoss(reduction='none') + tgt_param = target.new_ones((num_samples, num_dir)) + tgt_param = tgt_param.flatten(0) + loss = loss_func(pred.flatten(0,1), target.flatten(0,1), tgt_param) + loss = loss.view(num_samples, num_dir) + return loss + +@LOSSES.register_module() +class OrderedPtsSmoothL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. 
+ Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(OrderedPtsSmoothL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * ordered_pts_smooth_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@LOSSES.register_module() +class PtsDirCosLoss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(PtsDirCosLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_dir = self.loss_weight * pts_dir_cos_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_dir + + + +@LOSSES.register_module() +class PtsL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(PtsL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
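+        Example (a minimal illustrative sketch; the shapes and random values
+        below are assumptions, not requirements of the loss):
+
+            >>> import torch
+            >>> criterion = PtsL1Loss(loss_weight=1.0)
+            >>> pred = torch.rand(5, 20, 2)      # [num_samples, num_pts, 2]
+            >>> target = torch.rand(5, 20, 2)
+            >>> weight = torch.ones_like(target)
+            >>> loss = criterion(pred, target, weight, avg_factor=5)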
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * pts_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + +@LOSSES.register_module() +class OrderedPtsL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(OrderedPtsL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * ordered_pts_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + + + +@MATCH_COST.register_module() +class OrderedPtsSmoothL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_orders, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1).unsqueeze(1).repeat(1,num_gts*num_orders,1) + gt_bboxes = gt_bboxes.flatten(2).view(num_gts*num_orders,-1).unsqueeze(0).repeat(bbox_pred.size(0),1,1) + # import pdb;pdb.set_trace() + bbox_cost = smooth_l1_loss(bbox_pred, gt_bboxes, reduction='none').sum(-1) + # bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class PtsL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. 
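+            Note: this cost unpacks `gt_bboxes` as [num_gt, num_pts, 2]; unlike
+                the ordered variants above, no order dimension is expected.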
+ Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1) + gt_bboxes = gt_bboxes.view(num_gts,-1) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class OrderedPtsL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_orders, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1) + gt_bboxes = gt_bboxes.flatten(2).view(num_gts*num_orders,-1) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class MyChamferDistanceCost: + def __init__(self, loss_src_weight=1., loss_dst_weight=1.): + # assert mode in ['smooth_l1', 'l1', 'l2'] + # self.mode = mode + self.loss_src_weight = loss_src_weight + self.loss_dst_weight = loss_dst_weight + + def __call__(self, src, dst,src_weight=1.0,dst_weight=1.0,): + """ + pred_pts (Tensor): normed coordinate(x,y), shape (num_q, num_pts_M, 2) + gt_pts (Tensor): normed coordinate(x,y), shape (num_gt, num_pts_N, 2) + """ + # criterion_mode = self.mode + # if criterion_mode == 'smooth_l1': + # criterion = smooth_l1_loss + # elif criterion_mode == 'l1': + # criterion = l1_loss + # elif criterion_mode == 'l2': + # criterion = mse_loss + # else: + # raise NotImplementedError + # import pdb;pdb.set_trace() + src_expand = src.unsqueeze(1).repeat(1,dst.shape[0],1,1) + dst_expand = dst.unsqueeze(0).repeat(src.shape[0],1,1,1) + # src_expand = src.unsqueeze(2).unsqueeze(1).repeat(1,dst.shape[0], 1, dst.shape[1], 1) + # dst_expand = dst.unsqueeze(1).unsqueeze(0).repeat(src.shape[0],1, src.shape[1], 1, 1) + distance = torch.cdist(src_expand, dst_expand) + src2dst_distance = torch.min(distance, dim=3)[0] # (num_q, num_gt, num_pts_N) + dst2src_distance = torch.min(distance, dim=2)[0] # (num_q, num_gt, num_pts_M) + loss_src = (src2dst_distance * src_weight).mean(-1) + loss_dst = (dst2src_distance * dst_weight).mean(-1) + loss = loss_src*self.loss_src_weight + loss_dst * self.loss_dst_weight + return loss + +@mmcv.jit(derivate=True, coderize=True) +def chamfer_distance(src, + dst, + src_weight=1.0, + dst_weight=1.0, + # criterion_mode='l1', + reduction='mean', + avg_factor=None): + """Calculate Chamfer Distance of two sets. + + Args: + src (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + dst (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor or float): Weight of source loss. + dst_weight (torch.Tensor or float): Weight of destination loss. + criterion_mode (str): Criterion mode to calculate distance. + The valid modes are smooth_l1, l1 or l2. + reduction (str): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + + Returns: + tuple: Source and Destination loss with the corresponding indices. 
+ + - loss_src (torch.Tensor): The min distance \ + from source to destination. + - loss_dst (torch.Tensor): The min distance \ + from destination to source. + - indices1 (torch.Tensor): Index the min distance point \ + for each point in source to destination. + - indices2 (torch.Tensor): Index the min distance point \ + for each point in destination to source. + """ + + # if criterion_mode == 'smooth_l1': + # criterion = smooth_l1_loss + # elif criterion_mode == 'l1': + # criterion = l1_loss + # elif criterion_mode == 'l2': + # criterion = mse_loss + # else: + # raise NotImplementedError + + # src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1) + # dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1) + # import pdb;pdb.set_trace() + distance = torch.cdist(src, dst) + src2dst_distance, indices1 = torch.min(distance, dim=2) # (B,N) + dst2src_distance, indices2 = torch.min(distance, dim=1) # (B,M) + # import pdb;pdb.set_trace() + #TODO this may be wrong for misaligned src_weight, now[N,fixed_num] + # should be [N], then view + loss_src = (src2dst_distance * src_weight) + loss_dst = (dst2src_distance * dst_weight) + if avg_factor is None: + reduction_enum = F._Reduction.get_enum(reduction) + if reduction_enum == 0: + raise ValueError('MyCDLoss can not be used with reduction=`none`') + elif reduction_enum == 1: + loss_src = loss_src.mean(-1).mean() + loss_dst = loss_dst.mean(-1).mean() + elif reduction_enum == 2: + loss_src = loss_src.mean(-1).sum() + loss_dst = loss_dst.mean(-1).sum() + else: + raise NotImplementedError + else: + if reduction == 'mean': + eps = torch.finfo(torch.float32).eps + loss_src = loss_src.mean(-1).sum() / (avg_factor + eps) + loss_dst = loss_dst.mean(-1).sum() / (avg_factor + eps) + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + + return loss_src, loss_dst, indices1, indices2 + + +@LOSSES.register_module() +class MyChamferDistance(nn.Module): + """Calculate Chamfer Distance of two sets. + + Args: + mode (str): Criterion mode to calculate distance. + The valid modes are smooth_l1, l1 or l2. + reduction (str): Method to reduce losses. + The valid reduction method are none, sum or mean. + loss_src_weight (float): Weight of loss_source. + loss_dst_weight (float): Weight of loss_target. + """ + + def __init__(self, + # mode='l1', + reduction='mean', + loss_src_weight=1.0, + loss_dst_weight=1.0): + super(MyChamferDistance, self).__init__() + + # assert mode in ['smooth_l1', 'l1', 'l2'] + assert reduction in ['none', 'sum', 'mean'] + # self.mode = mode + self.reduction = reduction + self.loss_src_weight = loss_src_weight + self.loss_dst_weight = loss_dst_weight + + def forward(self, + source, + target, + src_weight=1.0, + dst_weight=1.0, + avg_factor=None, + reduction_override=None, + return_indices=False, + **kwargs): + """Forward function of loss calculation. + + Args: + source (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + target (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor | float, optional): + Weight of source loss. Defaults to 1.0. + dst_weight (torch.Tensor | float, optional): + Weight of destination loss. Defaults to 1.0. + reduction_override (str, optional): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + Defaults to None. + return_indices (bool, optional): Whether to return indices. + Defaults to False. 
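+            avg_factor (int, optional): Average factor used to average the
+                loss when reduction is 'mean'. Defaults to None.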
+ + Returns: + tuple[torch.Tensor]: If ``return_indices=True``, return losses of \ + source and target with their corresponding indices in the \ + order of ``(loss_source, loss_target, indices1, indices2)``. \ + If ``return_indices=False``, return \ + ``(loss_source, loss_target)``. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + loss_source, loss_target, indices1, indices2 = chamfer_distance( + source, target, src_weight, dst_weight, reduction, + avg_factor=avg_factor) + + loss_source *= self.loss_src_weight + loss_target *= self.loss_dst_weight + + loss_pts = loss_source + loss_target + + if return_indices: + return loss_pts, indices1, indices2 + else: + return loss_pts diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..126bf07bf3860d0dd817c6d41eafc299002ff956 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__init__.py @@ -0,0 +1,7 @@ +from .map_utils import normalize_2d_bbox, normalize_2d_pts, denormalize_2d_bbox, denormalize_2d_pts +from .CD_loss import ( + MyChamferDistance, MyChamferDistanceCost, + OrderedPtsL1Cost, PtsL1Cost, OrderedPtsSmoothL1Cost, + OrderedPtsL1Loss, PtsL1Loss, PtsDirCosLoss +) +from .plan_loss import PlanMapBoundLoss, PlanCollisionLoss, PlanMapDirectionLoss \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/CD_loss.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/CD_loss.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e5c3040741e07c8aa1e17aaf254c5d23a41196a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/CD_loss.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12f06749954c89fa2ee28f3f8f761f25913fa3bf Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/map_utils.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/map_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbefcfd6662b2c0302c0735ef8c54678062bf221 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/map_utils.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/plan_loss.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/plan_loss.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3483eb02a7fd5dd9d0359895086a2560de004df Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/plan_loss.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/traj_lr_warmup.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/traj_lr_warmup.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f82af54fc90b31e5f2bb513dcbe690e6c55480ec Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/traj_lr_warmup.cpython-38.pyc differ diff --git 
a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/map_utils.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/map_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a4884c1d235b5e22a5666a0f60be1487309225d0 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/map_utils.py @@ -0,0 +1,41 @@ +from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh, bbox_cxcywh_to_xyxy + +def normalize_2d_bbox(bboxes, pc_range): + + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + cxcywh_bboxes = bbox_xyxy_to_cxcywh(bboxes) + cxcywh_bboxes[...,0:1] = cxcywh_bboxes[..., 0:1] - pc_range[0] + cxcywh_bboxes[...,1:2] = cxcywh_bboxes[...,1:2] - pc_range[1] + factor = bboxes.new_tensor([patch_w, patch_h,patch_w,patch_h]) + + normalized_bboxes = cxcywh_bboxes / factor + return normalized_bboxes + +def normalize_2d_pts(pts, pc_range): + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + new_pts = pts.clone() + new_pts[...,0:1] = pts[..., 0:1] - pc_range[0] + new_pts[...,1:2] = pts[...,1:2] - pc_range[1] + factor = pts.new_tensor([patch_w, patch_h]) + normalized_pts = new_pts / factor + return normalized_pts + +def denormalize_2d_bbox(bboxes, pc_range): + + bboxes = bbox_cxcywh_to_xyxy(bboxes) + bboxes[..., 0::2] = (bboxes[..., 0::2]*(pc_range[3] - + pc_range[0]) + pc_range[0]) + bboxes[..., 1::2] = (bboxes[..., 1::2]*(pc_range[4] - + pc_range[1]) + pc_range[1]) + + return bboxes + +def denormalize_2d_pts(pts, pc_range): + new_pts = pts.clone() + new_pts[...,0:1] = (pts[..., 0:1]*(pc_range[3] - + pc_range[0]) + pc_range[0]) + new_pts[...,1:2] = (pts[...,1:2]*(pc_range[4] - + pc_range[1]) + pc_range[1]) + return new_pts \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/plan_loss.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/plan_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..dd32e1642bf4990d403eb78d1b196d1b1ab95039 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/plan_loss.py @@ -0,0 +1,447 @@ +import math +import mmcv +import torch +from torch import nn as nn +from mmdet.models import weighted_loss +from mmdet.models.builder import LOSSES + + +@LOSSES.register_module() +class PlanMapBoundLoss(nn.Module): + """Planning constraint to push ego vehicle away from the lane boundary. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + map_thresh (float, optional): confidence threshold to filter map predictions. + lane_bound_cls_idx (float, optional): lane_boundary class index. + dis_thresh (float, optional): distance threshold between ego vehicle and lane bound. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + map_thresh=0.5, + lane_bound_cls_idx=2, + dis_thresh=1.0, + point_cloud_range=[-15.0, -30.0, -2.0, 15.0, 30.0, 2.0], + perception_detach=False + ): + super(PlanMapBoundLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.map_thresh = map_thresh + self.lane_bound_cls_idx = lane_bound_cls_idx + self.dis_thresh = dis_thresh + self.pc_range = point_cloud_range + self.perception_detach = perception_detach + + def forward(self, + ego_fut_preds, + lane_preds, + lane_score_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. 
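+        Filters map predictions down to confident lane-boundary vectors,
+        denormalizes them to metric coordinates via `point_cloud_range`, and
+        applies a hinge penalty to planned waypoints that lie within
+        `dis_thresh` of the nearest boundary point; timesteps from the first
+        detected boundary crossing onward contribute no loss.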
+ + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if self.perception_detach: + lane_preds = lane_preds.detach() + lane_score_preds = lane_score_preds.detach() + + # filter lane element according to confidence score and class + not_lane_bound_mask = lane_score_preds[..., self.lane_bound_cls_idx] < self.map_thresh + # denormalize map pts + lane_bound_preds = lane_preds.clone() + lane_bound_preds[...,0:1] = (lane_bound_preds[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + lane_bound_preds[...,1:2] = (lane_bound_preds[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + # pad not-lane-boundary cls and low confidence preds + lane_bound_preds[not_lane_bound_mask] = 1e6 + + loss_bbox = self.loss_weight * plan_map_bound_loss(ego_fut_preds, lane_bound_preds, + weight=weight, dis_thresh=self.dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def plan_map_bound_loss(pred, target, dis_thresh=1.0): + """Planning map bound constraint (L1 distance). + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): lane_bound_preds, [B, num_vec, num_pts, 2]. 
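+        dis_thresh (float, optional): hinge distance threshold; waypoints
+            farther than this from the nearest boundary point incur zero loss,
+            closer ones are penalized by (dis_thresh - distance). Defaults to 1.0.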
+ weight (torch.Tensor): [B, fut_ts] + + Returns: + torch.Tensor: Calculated loss [B, fut_ts] + """ + pred = pred.cumsum(dim=-2) + ego_traj_starts = pred[:, :-1, :] + ego_traj_ends = pred + B, T, _ = ego_traj_ends.size() + padding_zeros = torch.zeros((B, 1, 2), dtype=pred.dtype, device=pred.device) # initial position + ego_traj_starts = torch.cat((padding_zeros, ego_traj_starts), dim=1) + _, V, P, _ = target.size() + ego_traj_expanded = ego_traj_ends.unsqueeze(2).unsqueeze(3) # [B, T, 1, 1, 2] + maps_expanded = target.unsqueeze(1) # [1, 1, M, P, 2] + dist = torch.linalg.norm(ego_traj_expanded - maps_expanded, dim=-1) # [B, T, M, P] + dist = dist.min(dim=-1, keepdim=False)[0] + min_inst_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + bd_target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) + min_bd_insts = bd_target[batch_idxs, ts_idxs, min_inst_idxs] # [B, T, P, 2] + bd_inst_starts = min_bd_insts[:, :, :-1, :].flatten(0, 2) + bd_inst_ends = min_bd_insts[:, :, 1:, :].flatten(0, 2) + ego_traj_starts = ego_traj_starts.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) + ego_traj_ends = ego_traj_ends.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) + + intersect_mask = segments_intersect(ego_traj_starts, ego_traj_ends, + bd_inst_starts, bd_inst_ends) + intersect_mask = intersect_mask.reshape(B, T, P-1) + intersect_mask = intersect_mask.any(dim=-1) + intersect_idx = (intersect_mask == True).nonzero() + + target = target.view(target.shape[0], -1, target.shape[-1]) + # [B, fut_ts, num_vec*num_pts] + dist = torch.linalg.norm(pred[:, :, None, :] - target[:, None, :, :], dim=-1) + min_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + min_dist = dist[batch_idxs, ts_idxs, min_idxs] + loss = min_dist + safe_idx = loss > dis_thresh + unsafe_idx = loss <= dis_thresh + loss[safe_idx] = 0 + loss[unsafe_idx] = dis_thresh - loss[unsafe_idx] + + for i in range(len(intersect_idx)): + loss[intersect_idx[i, 0], intersect_idx[i, 1]:] = 0 + + return loss + + +def segments_intersect(line1_start, line1_end, line2_start, line2_end): + # Calculating the differences + dx1 = line1_end[:, 0] - line1_start[:, 0] + dy1 = line1_end[:, 1] - line1_start[:, 1] + dx2 = line2_end[:, 0] - line2_start[:, 0] + dy2 = line2_end[:, 1] - line2_start[:, 1] + + # Calculating determinants + det = dx1 * dy2 - dx2 * dy1 + det_mask = det != 0 + + # Checking if lines are parallel or coincident + parallel_mask = torch.logical_not(det_mask) + + # Calculating intersection parameters + t1 = ((line2_start[:, 0] - line1_start[:, 0]) * dy2 + - (line2_start[:, 1] - line1_start[:, 1]) * dx2) / det + t2 = ((line2_start[:, 0] - line1_start[:, 0]) * dy1 + - (line2_start[:, 1] - line1_start[:, 1]) * dx1) / det + + # Checking intersection conditions + intersect_mask = torch.logical_and( + torch.logical_and(t1 >= 0, t1 <= 1), + torch.logical_and(t2 >= 0, t2 <= 1) + ) + + # Handling parallel or coincident lines + intersect_mask[parallel_mask] = False + + return intersect_mask + + +@LOSSES.register_module() +class PlanCollisionLoss(nn.Module): + """Planning constraint to push ego vehicle away from other agents. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. 
+ agent_thresh (float, optional): confidence threshold to filter agent predictions. + x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. + y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + agent_thresh=0.5, + x_dis_thresh=1.5, + y_dis_thresh=3.0, + point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] + ): + super(PlanCollisionLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.agent_thresh = agent_thresh + self.x_dis_thresh = x_dis_thresh + self.y_dis_thresh = y_dis_thresh + self.pc_range = point_cloud_range + + def forward(self, + ego_fut_preds, + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + agent_preds (Tensor): [B, num_agent, 2] + agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2] + agent_fut_cls_preds (Tensor): [B, num_agent, fut_mode] + agent_score_preds (Tensor): [B, num_agent, 10] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + # filter agent element according to confidence score + agent_max_score_preds, agent_max_score_idxs = agent_score_preds.max(dim=-1) + not_valid_agent_mask = agent_max_score_preds < self.agent_thresh + # filter low confidence preds + agent_fut_preds[not_valid_agent_mask] = 1e6 + # filter not vehicle preds + not_veh_pred_mask = agent_max_score_idxs > 4 # veh idxs are 0-4 + agent_fut_preds[not_veh_pred_mask] = 1e6 + # only use best mode pred + best_mode_idxs = torch.argmax(agent_fut_cls_preds, dim=-1).tolist() + batch_idxs = [[i] for i in range(agent_fut_cls_preds.shape[0])] + agent_num_idxs = [[i for i in range(agent_fut_cls_preds.shape[1])] for j in range(agent_fut_cls_preds.shape[0])] + agent_fut_preds = agent_fut_preds[batch_idxs, agent_num_idxs, best_mode_idxs] + + loss_bbox = self.loss_weight * plan_col_loss(ego_fut_preds, agent_preds, + agent_fut_preds=agent_fut_preds, weight=weight, + x_dis_thresh=self.x_dis_thresh, + y_dis_thresh=self.y_dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def plan_col_loss( + pred, + target, + agent_fut_preds, + x_dis_thresh=1.5, + y_dis_thresh=3.0, + dis_thresh=3.0 +): + """Planning ego-agent collsion constraint. + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): agent_preds, [B, num_agent, 2]. + agent_fut_preds (Tensor): [B, num_agent, fut_ts, 2]. + weight (torch.Tensor): [B, fut_ts, 2]. + x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. + y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. + dis_thresh (float, optional): distance threshold to filter distant agents. 
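+        Note: `agent_fut_preds` already holds the single best mode selected by
+            the caller, so the element-wise loss assembled below has shape
+            [B, fut_ts, 2] (hinge penalties along x and y).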
+ + Returns: + torch.Tensor: Calculated loss [B, fut_mode, fut_ts, 2] + """ + pred = pred.cumsum(dim=-2) + agent_fut_preds = agent_fut_preds.cumsum(dim=-2) + target = target[:, :, None, :] + agent_fut_preds + # filter distant agents from ego vehicle + dist = torch.linalg.norm(pred[:, None, :, :] - target, dim=-1) + dist_mask = dist > dis_thresh + target[dist_mask] = 1e6 + + # [B, num_agent, fut_ts] + x_dist = torch.abs(pred[:, None, :, 0] - target[..., 0]) + y_dist = torch.abs(pred[:, None, :, 1] - target[..., 1]) + x_min_idxs = torch.argmin(x_dist, dim=1).tolist() + y_min_idxs = torch.argmin(y_dist, dim=1).tolist() + batch_idxs = [[i] for i in range(y_dist.shape[0])] + ts_idxs = [[i for i in range(y_dist.shape[-1])] for j in range(y_dist.shape[0])] + + # [B, fut_ts] + x_min_dist = x_dist[batch_idxs, x_min_idxs, ts_idxs] + y_min_dist = y_dist[batch_idxs, y_min_idxs, ts_idxs] + x_loss = x_min_dist + safe_idx = x_loss > x_dis_thresh + unsafe_idx = x_loss <= x_dis_thresh + x_loss[safe_idx] = 0 + x_loss[unsafe_idx] = x_dis_thresh - x_loss[unsafe_idx] + y_loss = y_min_dist + safe_idx = y_loss > y_dis_thresh + unsafe_idx = y_loss <= y_dis_thresh + y_loss[safe_idx] = 0 + y_loss[unsafe_idx] = y_dis_thresh - y_loss[unsafe_idx] + loss = torch.cat([x_loss.unsqueeze(-1), y_loss.unsqueeze(-1)], dim=-1) + + return loss + + +@LOSSES.register_module() +class PlanMapDirectionLoss(nn.Module): + """Planning loss to force the ego heading angle consistent with lane direction. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + theta_thresh (float, optional): angle diff thresh between ego and lane. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + map_thresh=0.5, + dis_thresh=2.0, + lane_div_cls_idx=0, + point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] + ): + super(PlanMapDirectionLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.map_thresh = map_thresh + self.dis_thresh = dis_thresh + self.lane_div_cls_idx = lane_div_cls_idx + self.pc_range = point_cloud_range + + def forward(self, + ego_fut_preds, + lane_preds, + lane_score_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
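+        Example (an illustrative sketch; tensor shapes and random values are
+        assumed for demonstration and are not prescribed by this loss):
+
+            >>> import torch
+            >>> loss_fn = PlanMapDirectionLoss(loss_weight=1.0)
+            >>> ego_fut_preds = torch.rand(1, 6, 2)      # per-step ego offsets
+            >>> lane_preds = torch.rand(1, 4, 10, 2)     # normalized map points
+            >>> lane_score_preds = torch.rand(1, 4, 3)   # map class scores
+            >>> loss = loss_fn(ego_fut_preds, lane_preds, lane_score_preds)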
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + # filter lane element according to confidence score and class + not_lane_div_mask = lane_score_preds[..., self.lane_div_cls_idx] < self.map_thresh + # denormalize map pts + lane_div_preds = lane_preds.clone() + lane_div_preds[...,0:1] = (lane_div_preds[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + lane_div_preds[...,1:2] = (lane_div_preds[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + # pad not-lane-divider cls and low confidence preds + lane_div_preds[not_lane_div_mask] = 1e6 + + loss_bbox = self.loss_weight * plan_map_dir_loss(ego_fut_preds, lane_div_preds, + weight=weight, dis_thresh=self.dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def plan_map_dir_loss(pred, target, dis_thresh=2.0): + """Planning ego-map directional loss. + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): lane_div_preds, [B, num_vec, num_pts, 2]. + weight (torch.Tensor): [B, fut_ts] + + Returns: + torch.Tensor: Calculated loss [B, fut_ts] + """ + num_map_pts = target.shape[2] + pred = pred.cumsum(dim=-2) + traj_dis = torch.linalg.norm(pred[:, -1, :] - pred[:, 0, :], dim=-1) + static_mask = traj_dis < 1.0 + target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) + + # find the closest map instance for ego at each timestamp + dist = torch.linalg.norm(pred[:, :, None, None, :] - target, dim=-1) + dist = dist.min(dim=-1, keepdim=False)[0] + min_inst_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + target_map_inst = target[batch_idxs, ts_idxs, min_inst_idxs] # [B, fut_ts, num_pts, 2] + + # calculate distance + dist = torch.linalg.norm(pred[:, :, None, :] - target_map_inst, dim=-1) + min_pts_idxs = torch.argmin(dist, dim=-1) + min_pts_next_idxs = min_pts_idxs.clone() + is_end_point = (min_pts_next_idxs == num_map_pts-1) + not_end_point = (min_pts_next_idxs != num_map_pts-1) + min_pts_next_idxs[is_end_point] = num_map_pts - 2 + min_pts_next_idxs[not_end_point] = min_pts_next_idxs[not_end_point] + 1 + min_pts_idxs = min_pts_idxs.tolist() + min_pts_next_idxs = min_pts_next_idxs.tolist() + traj_yaw = torch.atan2(torch.diff(pred[..., 1]), torch.diff(pred[..., 0])) # [B, fut_ts-1] + # last ts yaw assume same as previous + traj_yaw = torch.cat([traj_yaw, traj_yaw[:, [-1]]], dim=-1) # [B, fut_ts] + min_pts = target_map_inst[batch_idxs, ts_idxs, min_pts_idxs] + dist = torch.linalg.norm(min_pts - pred, dim=-1) + dist_mask = dist > dis_thresh + min_pts = min_pts.unsqueeze(2) + min_pts_next = target_map_inst[batch_idxs, ts_idxs, min_pts_next_idxs].unsqueeze(2) + map_pts = torch.cat([min_pts, min_pts_next], dim=2) + lane_yaw = torch.atan2(torch.diff(map_pts[..., 1]).squeeze(-1), torch.diff(map_pts[..., 0]).squeeze(-1)) # [B, fut_ts] + yaw_diff = traj_yaw - lane_yaw + yaw_diff[yaw_diff > math.pi] = yaw_diff[yaw_diff > math.pi] - math.pi + yaw_diff[yaw_diff > math.pi/2] = yaw_diff[yaw_diff > math.pi/2] - math.pi + yaw_diff[yaw_diff < -math.pi] = yaw_diff[yaw_diff < -math.pi] + math.pi + yaw_diff[yaw_diff < -math.pi/2] = yaw_diff[yaw_diff < -math.pi/2] + math.pi + yaw_diff[dist_mask] = 0 # loss = 0 if no lane around ego + yaw_diff[static_mask] = 0 # loss = 0 if ego is static + + 
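+    # At this point yaw_diff has been folded (modulo pi) into roughly
+    # [-pi/2, pi/2]: a lane divider polyline has no canonical direction, so
+    # heading offsets of theta and theta - pi count as the same deviation.
+    # Timesteps with no divider within dis_thresh of the ego, and fully static
+    # trajectories, have already been zeroed above.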
loss = torch.abs(yaw_diff) + + return loss # [B, fut_ts] diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/traj_lr_warmup.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/traj_lr_warmup.py new file mode 100644 index 0000000000000000000000000000000000000000..0b5ba1bcb7298e4dc9102af620e406731a97b9b1 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/traj_lr_warmup.py @@ -0,0 +1,13 @@ +import torch + +def get_traj_warmup_loss_weight( + cur_epoch, + tot_epoch, + start_pos=0.3, + end_pos=0.35, + scale_weight=1.1 +): + epoch_percentage = cur_epoch / tot_epoch + sigmoid_input = 5 / (end_pos-start_pos) * epoch_percentage - 2.5 * (end_pos+start_pos) / (end_pos - start_pos) + + return scale_weight * torch.sigmoid(torch.tensor(sigmoid_input)) diff --git a/GenAD-main/projects/mmdet3d_plugin/__init__.py b/GenAD-main/projects/mmdet3d_plugin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e414aaf938d7422c8c72a360130f04c76537c4 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/__init__.py @@ -0,0 +1,11 @@ +from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D +from .core.bbox.coders.nms_free_coder import NMSFreeCoder +from .core.bbox.match_costs import BBox3DL1Cost +from .core.evaluation.eval_hooks import CustomDistEvalHook +from .datasets.pipelines import ( + PhotoMetricDistortionMultiViewImage, PadMultiViewImage, + NormalizeMultiviewImage, CustomCollect3D) +from .models.backbones.vovnet import VoVNet +from .models.utils import * +from .models.opt.adamw import AdamW2 +from .VAD import * diff --git a/GenAD-main/projects/mmdet3d_plugin/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6b701b5d66914e44a423f2bb2fb8d419f99dd7c Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..98d6e7e00553f1435d6f0d09ca69c8a5c4f1b4d0 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/__init__.py @@ -0,0 +1,6 @@ + +from .dense_heads import * +from .detectors import * +from .modules import * +from .runner import * +from .hooks import * diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..15dff22b7478a0f30151d376d41f3dc46e88ba7d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/__init__.py @@ -0,0 +1,3 @@ +from .train import custom_train_model +from .mmdet_train import custom_train_detector +# from .test import custom_multi_gpu_test \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py new file mode 100644 index 0000000000000000000000000000000000000000..e57bd225dc33d631849a3aef8db2bae217520658 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py @@ -0,0 +1,200 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
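As a quick aside before the training entry points below: the sigmoid warm-up in
traj_lr_warmup.py above ramps the trajectory loss weight from roughly 0 to roughly
scale_weight (1.1 by default) between start_pos and end_pos, i.e. around 30-35% of
training. A purely illustrative sketch, assuming the repository root is on
PYTHONPATH and using a placeholder epoch count:

from projects.mmdet3d_plugin.VAD.utils.traj_lr_warmup import get_traj_warmup_loss_weight

tot_epoch = 12  # placeholder; use the schedule from your own config
for cur_epoch in range(tot_epoch + 1):
    w = get_traj_warmup_loss_weight(cur_epoch, tot_epoch)
    print(f"epoch {cur_epoch:2d}: trajectory loss weight = {w.item():.3f}")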
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import random +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, build_optimizer, + build_runner, get_dist_info) +from mmcv.utils import build_from_cfg + +from mmdet.core import EvalHook + +from mmdet.datasets import (build_dataset, + replace_ImageToTensor) +from mmdet.utils import get_root_logger +import time +import os.path as osp +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook +from projects.mmdet3d_plugin.datasets import custom_build_dataset +def custom_train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + #assert len(dataset)==1s + if 'imgs_per_gpu' in cfg.data: + logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + if eval_model is not None: + eval_model = MMDistributedDataParallel( + eval_model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + if eval_model is not None: + eval_model = MMDataParallel( + eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + if eval_model is not None: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + eval_model=eval_model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, 
+ meta=meta)) + else: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # register profiler hook + #trace_config = dict(type='tb_trace', dir_name='work_dir') + #profiler_config = dict(on_trace_ready=trace_config) + #runner.register_profiler_hook(profiler_config) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + assert False + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/test.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4e576136224e28e1b5e9a5bac0735ddc55c196bf --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/test.py @@ -0,0 +1,164 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
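For orientation, a minimal sketch of how custom_train_detector above is typically
driven from a training script. The config path is hypothetical, and the config must
provide the fields accessed above (data.samples_per_gpu, data.workers_per_gpu,
gpu_ids, seed, data.shuffler_sampler, data.nonshuffler_sampler, optimizer,
runner/total_epochs, work_dir, ...):

from mmcv import Config
from mmdet.models import build_detector
from projects.mmdet3d_plugin.bevformer.apis.mmdet_train import custom_train_detector
from projects.mmdet3d_plugin.datasets import custom_build_dataset

cfg = Config.fromfile('projects/configs/your_experiment.py')  # hypothetical path
model = build_detector(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg'))
model.init_weights()
datasets = [custom_build_dataset(cfg.data.train)]
custom_train_detector(model, datasets, cfg, distributed=False, validate=False, meta=dict())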
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results + + +import mmcv +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. + """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' 
+ # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if mask_results is None: + return {'bbox_results': bbox_results} + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + collect_results_cpu(result_part, size) \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/train.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f9391e606f29961875b48eebe36d3b9d415b6290 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/train.py @@ -0,0 +1,67 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from .mmdet_train import custom_train_detector +from mmseg.apis import train_segmentor +from mmdet.apis import train_detector + +def custom_train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. 
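+
+    Only detector-style models are supported here: configs whose ``model.type`` is
+    ``EncoderDecoder3D`` trip the assertion below, and everything else is forwarded
+    unchanged to :func:`custom_train_detector`, including the optional ``eval_model``.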
+ """ + if cfg.model.type in ['EncoderDecoder3D']: + assert False + else: + custom_train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + eval_model=eval_model, + meta=meta) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6823adfb593d67f27af4af2207a515af4cbab6f5 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py @@ -0,0 +1 @@ +from .bevformer_head import BEVFormerHead \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..91d38d1411e5093cf4ae801ea08de88ef47b6a8e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py @@ -0,0 +1,523 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Linear, bias_init_with_prob +from mmcv.utils import TORCH_VERSION, digit_version + +from mmdet.core import (multi_apply, multi_apply, reduce_mean) +from mmdet.models.utils.transformer import inverse_sigmoid +from mmdet.models import HEADS +from mmdet.models.dense_heads import DETRHead +from mmdet3d.core.bbox.coders import build_bbox_coder +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from mmcv.cnn.bricks.transformer import build_positional_encoding +from mmcv.runner import force_fp32, auto_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +import numpy as np +import mmcv +import cv2 as cv +from projects.mmdet3d_plugin.models.utils.visual import save_tensor + + +@HEADS.register_module() +class BEVFormerHead(DETRHead): + """Head of Detr3D. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. 
+ """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + super(BEVFormerHead, self).__init__( + *args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. + num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + else: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + + @auto_fp16(apply_to=('mlvl_feats')) + def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
+ """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + bev_embed, hs, init_reference, inter_references = outputs + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] += reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + tmp[..., 4:5] += reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + outs = { + 'bev_embed': bev_embed, + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + } + + return outs + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. 
+ - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + return (labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def loss_single(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. 
+ bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. + """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, + :10], bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + return loss_cls, loss_bbox + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + preds_dicts, + gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. 
+ enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], + losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. 
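+
+        Example (illustrative; ``outs`` is the dict returned by ``self.forward``):
+            >>> results = head.get_bboxes(outs, img_metas)
+            >>> bboxes, scores, labels = results[0]  # predictions for the first sample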
+ """ + + preds_dicts = self.bbox_coder.decode(preds_dicts) + + num_samples = len(preds_dicts) + ret_list = [] + for i in range(num_samples): + preds = preds_dicts[i] + bboxes = preds['bboxes'] + + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + + ret_list.append([bboxes, scores, labels]) + + return ret_list diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4c39fd341d5d65f809bb94bee71c6e9a523639e6 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/__init__.py @@ -0,0 +1,2 @@ +from .bevformer import BEVFormer +from .bevformer_fp16 import BEVFormer_fp16 \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py new file mode 100644 index 0000000000000000000000000000000000000000..8d3b676115bb46a39ef21ba7b061e98a72ae11c2 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py @@ -0,0 +1,289 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from tkinter.messagebox import NO +import torch +from mmcv.runner import force_fp32, auto_fp16 +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +import time +import copy +import numpy as np +import mmdet3d +from projects.mmdet3d_plugin.models.utils.bricks import run_time + + +@DETECTORS.register_module() +class BEVFormer(MVXTwoStageDetector): + """BEVFormer. + Args: + video_test_mode (bool): Decide whether to use temporal information during inference. 
+ """ + + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False + ): + + super(BEVFormer, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + + # temporal + self.video_test_mode = video_test_mode + self.prev_frame_info = { + 'prev_bev': None, + 'scene_token': None, + 'prev_pos': 0, + 'prev_angle': 0, + } + + + def extract_img_feat(self, img, img_metas, len_queue=None): + """Extract features of images.""" + B = img.size(0) + if img is not None: + + # input_shape = img.shape[-2:] + # # update real input shape of each single img + # for img_meta in img_metas: + # img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + if len_queue is not None: + img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) + else: + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=('img')) + def extract_feat(self, img, img_metas=None, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) + + return img_feats + + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + prev_bev=None): + """Forward function' + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + prev_bev (torch.Tensor, optional): BEV features of previous frame. + Returns: + dict: Losses of each branch. + """ + + outs = self.pts_bbox_head( + pts_feats, img_metas, prev_bev) + loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. 
+ torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def obtain_history_bev(self, imgs_queue, img_metas_list): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. + """ + self.eval() + + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) + img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + # img_feats = self.extract_feat(img=img, img_metas=img_metas) + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev = self.pts_bbox_head( + img_feats, img_metas, prev_bev, only_bev=True) + self.train() + return prev_bev + + @auto_fp16(apply_to=('img', 'points')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + len_queue = img.size(1) + prev_img = img[:, :-1, ...] + img = img[:, -1, ...] + + prev_img_metas = copy.deepcopy(img_metas) + prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) + + img_metas = [each[len_queue-1] for each in img_metas] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev) + + losses.update(losses_pts) + return losses + + def forward_test(self, img_metas, img=None, **kwargs): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. 
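+        # can_bus stores the absolute ego translation in its first three entries and
+        # the ego heading angle in its last entry. The absolute pose of the current
+        # frame is cached below, while the values passed on to the head are rewritten
+        # as deltas w.r.t. the previous frame so that prev_bev can be aligned to the
+        # current frame; the first frame of a scene gets a zero delta since there is
+        # no previous BEV to align against.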
+ tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + if self.prev_frame_info['prev_bev'] is not None: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + else: + img_metas[0][0]['can_bus'][-1] = 0 + img_metas[0][0]['can_bus'][:3] = 0 + + new_prev_bev, bbox_results = self.simple_test( + img_metas[0], img[0], prev_bev=self.prev_frame_info['prev_bev'], **kwargs) + # During inference, we save the BEV features and ego motion of each timestamp. + self.prev_frame_info['prev_pos'] = tmp_pos + self.prev_frame_info['prev_angle'] = tmp_angle + self.prev_frame_info['prev_bev'] = new_prev_bev + return bbox_results + + def simple_test_pts(self, x, img_metas, prev_bev=None, rescale=False): + """Test function""" + outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev) + + bbox_list = self.pts_bbox_head.get_bboxes( + outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return outs['bev_embed'], bbox_results + + def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + bbox_list = [dict() for i in range(len(img_metas))] + new_prev_bev, bbox_pts = self.simple_test_pts( + img_feats, img_metas, prev_bev, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + return new_prev_bev, bbox_list diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..5325e3ccb8ac576a6764df3f0094ac5ea1bbc7cb --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py @@ -0,0 +1,89 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from tkinter.messagebox import NO +import torch +from mmcv.runner import force_fp32, auto_fp16 +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +from projects.mmdet3d_plugin.bevformer.detectors.bevformer import BEVFormer +import time +import copy +import numpy as np +import mmdet3d +from projects.mmdet3d_plugin.models.utils.bricks import run_time + + +@DETECTORS.register_module() +class BEVFormer_fp16(BEVFormer): + """ + The default version BEVFormer currently can not support FP16. + We provide this version to resolve this issue. + """ + + @auto_fp16(apply_to=('img', 'prev_bev', 'points')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + prev_bev=None, + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. 
+ gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev=prev_bev) + losses.update(losses_pts) + return losses + + + def val_step(self, data, optimizer): + """ + In BEVFormer_fp16, we use this `val_step` function to inference the `prev_pev`. + This is not the standard function of `val_step`. + """ + + img = data['img'] + img_metas = data['img_metas'] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + prev_bev = data.get('prev_bev', None) + prev_bev = self.pts_bbox_head(img_feats, img_metas, prev_bev=prev_bev, only_bev=True) + return prev_bev \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa04ec16df5b0bb9f21cadf22f9172c3cc9a58c1 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/__init__.py @@ -0,0 +1 @@ +from .custom_hooks import TransferWeight \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..091738a0950869767647383ad001e5e7e5a5bcaa --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py @@ -0,0 +1,14 @@ +from mmcv.runner.hooks.hook import HOOKS, Hook +from projects.mmdet3d_plugin.models.utils import run_time + + +@HOOKS.register_module() +class TransferWeight(Hook): + + def __init__(self, every_n_inters=1): + self.every_n_inters=every_n_inters + + def after_train_iter(self, runner): + if self.every_n_inner_iters(runner, self.every_n_inters): + runner.eval_model.load_state_dict(runner.model.state_dict()) + diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1bb5e04c7f69b70088321e62760be14f3329962b --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/__init__.py @@ -0,0 +1,6 @@ +from .transformer import PerceptionTransformer +from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D +from .temporal_self_attention import TemporalSelfAttention +from .encoder import BEVFormerEncoder, BEVFormerLayer +from .decoder import DetectionTransformerDecoder + diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py new file mode 100644 index 
0000000000000000000000000000000000000000..a5d994cda08a8b7fc3ba3ecbadbc5f295ce3c6cc --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py @@ -0,0 +1,260 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import warnings + +import torch +import torch.nn as nn + +from mmcv import ConfigDict, deprecated_api_warning +from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv-full`` if you need this module. ') +from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention + + +@TRANSFORMER_LAYER.register_module() +class MyCustomBaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. 
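+
+        Note:
+            Despite the wording of the assertion message in ``__init__``, the check
+            only requires ``operation_order`` to be a subset of
+            {'self_attn', 'norm', 'ffn', 'cross_attn'}; an encoder-style order such
+            as ('self_attn', 'norm', 'ffn', 'norm') without cross attention is valid.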
+ """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=True, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index])) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . 
+ value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/decoder.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..33024f86a868c4316c15cfadeb5fb0ca58ef8895 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/decoder.py @@ -0,0 +1,345 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import mmcv +import cv2 as cv +import copy +import warnings +from matplotlib import pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(DetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 3 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + new_reference_points[..., 2:3] = tmp[ + ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@ATTENTION.register_module() +class CustomMSDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. 
+ key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/encoder.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b1ee30065be23c97371f45d48780acb1f04bfc9f --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/encoder.py @@ -0,0 +1,403 @@ + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from .custom_base_transformer_layer import MyCustomBaseTransformerLayer +import copy +import warnings +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmcv.runner import force_fp32, auto_fp16 +import numpy as np +import torch +import cv2 as cv +import mmcv +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class BEVFormerEncoder(TransformerLayerSequence): + + """ + Attention with both self and cross + Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', + **kwargs): + + super(BEVFormerEncoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + self.num_points_in_pillar = num_points_in_pillar + self.pc_range = pc_range + self.fp16_enabled = False + + @staticmethod + def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): + """Get the reference points used in SCA and TSA. + Args: + H, W: spatial shape of bev. + Z: hight of pillar. + D: sample D points uniformly from each pillar. + device (obj:`device`): The device where + reference_points should be. + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). + """ + + # reference points in 3D space, used in spatial cross-attention (SCA) + if dim == '3d': + zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, + device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z + xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, + device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W + ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, + device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H + ref_3d = torch.stack((xs, ys, zs), -1) + ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) + ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) + return ref_3d + + # reference points on 2D bev plane, used in temporal self-attention (TSA). + elif dim == '2d': + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=dtype, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=dtype, device=device) + ) + ref_y = ref_y.reshape(-1)[None] / H + ref_x = ref_x.reshape(-1)[None] / W + ref_2d = torch.stack((ref_x, ref_y), -1) + ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) + return ref_2d + + # This function must use fp32!!! 
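# ---------------------------------------------------------------------------
# Annotation (not part of the patched file): a hypothetical shape check for the
# staticmethod get_reference_points defined above, on CPU with a toy 50x50 BEV
# grid. ref_3d comes out as (bs, num_points_in_pillar, H*W, 3) and ref_2d as
# (bs, H*W, 1, 2), both normalized to [0, 1].
#
#   ref_3d = BEVFormerEncoder.get_reference_points(
#       H=50, W=50, Z=8, num_points_in_pillar=4, dim='3d',
#       bs=1, device='cpu', dtype=torch.float32)
#   ref_2d = BEVFormerEncoder.get_reference_points(
#       H=50, W=50, dim='2d', bs=1, device='cpu', dtype=torch.float32)
#   assert ref_3d.shape == (1, 4, 2500, 3)
#   assert ref_2d.shape == (1, 2500, 1, 2)
#
# The fp32 note directly above refers to point_sampling, which follows next.
# ---------------------------------------------------------------------------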
+ @force_fp32(apply_to=('reference_points', 'img_metas')) + def point_sampling(self, reference_points, pc_range, img_metas): + + lidar2img = [] + for img_meta in img_metas: + lidar2img.append(img_meta['lidar2img']) + lidar2img = np.asarray(lidar2img) + lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4) + reference_points = reference_points.clone() + + reference_points[..., 0:1] = reference_points[..., 0:1] * \ + (pc_range[3] - pc_range[0]) + pc_range[0] + reference_points[..., 1:2] = reference_points[..., 1:2] * \ + (pc_range[4] - pc_range[1]) + pc_range[1] + reference_points[..., 2:3] = reference_points[..., 2:3] * \ + (pc_range[5] - pc_range[2]) + pc_range[2] + + reference_points = torch.cat( + (reference_points, torch.ones_like(reference_points[..., :1])), -1) + + reference_points = reference_points.permute(1, 0, 2, 3) + D, B, num_query = reference_points.size()[:3] + num_cam = lidar2img.size(1) + + reference_points = reference_points.view( + D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) + + lidar2img = lidar2img.view( + 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) + + reference_points_cam = torch.matmul(lidar2img.to(torch.float32), + reference_points.to(torch.float32)).squeeze(-1) + eps = 1e-5 + + bev_mask = (reference_points_cam[..., 2:3] > eps) + reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( + reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) + + reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] + reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] + + bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) + & (reference_points_cam[..., 1:2] < 1.0) + & (reference_points_cam[..., 0:1] < 1.0) + & (reference_points_cam[..., 0:1] > 0.0)) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + bev_mask = torch.nan_to_num(bev_mask) + else: + bev_mask = bev_mask.new_tensor( + np.nan_to_num(bev_mask.cpu().numpy())) + + reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) + bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) + + return reference_points_cam, bev_mask + + @auto_fp16() + def forward(self, + bev_query, + key, + value, + *args, + bev_h=None, + bev_w=None, + bev_pos=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + prev_bev=None, + shift=0., + **kwargs): + """Forward function for `TransformerDecoder`. + Args: + bev_query (Tensor): Input BEV query with shape + `(num_query, bs, embed_dims)`. + key & value (Tensor): Input multi-cameta features with shape + (num_cam, num_value, bs, embed_dims) + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + + output = bev_query + intermediate = [] + + ref_3d = self.get_reference_points( + bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + ref_2d = self.get_reference_points( + bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + + reference_points_cam, bev_mask = self.point_sampling( + ref_3d, self.pc_range, kwargs['img_metas']) + + # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. + shift_ref_2d = ref_2d # .clone() + shift_ref_2d += shift[:, None, None, :] + + # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) + bev_query = bev_query.permute(1, 0, 2) + bev_pos = bev_pos.permute(1, 0, 2) + bs, len_bev, num_bev_level, _ = ref_2d.shape + if prev_bev is not None: + prev_bev = prev_bev.permute(1, 0, 2) + prev_bev = torch.stack( + [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) + hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + else: + hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + + for lid, layer in enumerate(self.layers): + output = layer( + bev_query, + key, + value, + *args, + bev_pos=bev_pos, + ref_2d=hybird_ref_2d, + ref_3d=ref_3d, + bev_h=bev_h, + bev_w=bev_w, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + reference_points_cam=reference_points_cam, + bev_mask=bev_mask, + prev_bev=prev_bev, + **kwargs) + + bev_query = output + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output + + +@TRANSFORMER_LAYER.register_module() +class BEVFormerLayer(MyCustomBaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. 
+ """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(BEVFormerLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + self.fp16_enabled = False + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) + + def forward(self, + query, + key=None, + value=None, + bev_pos=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ref_2d=None, + ref_3d=None, + bev_h=None, + bev_w=None, + reference_points_cam=None, + mask=None, + spatial_shapes=None, + level_start_index=None, + prev_bev=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + # temporal self attention + if layer == 'self_attn': + + query = self.attentions[attn_index]( + query, + prev_bev, + prev_bev, + identity if self.pre_norm else None, + query_pos=bev_pos, + key_pos=bev_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + reference_points=ref_2d, + spatial_shapes=torch.tensor( + [[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + # spaital cross attention + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + reference_points=ref_3d, + reference_points_cam=reference_points_cam, + mask=mask, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py new file mode 100644 index 0000000000000000000000000000000000000000..77b0f319ccff7e023e1c2d94b63f8c2d7b9c727d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py @@ -0,0 +1,163 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import torch +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.autograd.function import Function, once_differentiable +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction_fp16(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). 
+ attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +class MultiScaleDeformableAttnFunction_fp32(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. 
+ """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..100d94fef34456a0454eb7a328ca8688df1c30c1 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py @@ -0,0 +1,399 @@ + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import build_attention +import math +from mmcv.runner import force_fp32, auto_fp16 + +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class SpatialCrossAttention(BaseModule): + """An attention module used in BEVFormer. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_cams (int): The number of cameras + dropout (float): A Dropout layer on `inp_residual`. + Default: 0.. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + deformable_attention: (dict): The config for the deformable attention used in SCA. + """ + + def __init__(self, + embed_dims=256, + num_cams=6, + pc_range=None, + dropout=0.1, + init_cfg=None, + batch_first=False, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=256, + num_levels=4), + **kwargs + ): + super(SpatialCrossAttention, self).__init__(init_cfg) + + self.init_cfg = init_cfg + self.dropout = nn.Dropout(dropout) + self.pc_range = pc_range + self.fp16_enabled = False + self.deformable_attention = build_attention(deformable_attention) + self.embed_dims = embed_dims + self.num_cams = num_cams + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.batch_first = batch_first + self.init_weight() + + def init_weight(self): + """Default initialization for Parameters of Module.""" + xavier_init(self.output_proj, distribution='uniform', bias=0.) 
+ + @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) + def forward(self, + query, + key, + value, + residual=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + reference_points_cam=None, + bev_mask=None, + level_start_index=None, + flag='encoder', + **kwargs): + """Forward Function of Detr3DCrossAtten. + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. (B, N, C, H, W) + residual (Tensor): The tensor used for addition, with the + same shape as `x`. Default None. If None, `x` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, 4), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different level. With shape (num_levels, 2), + last dimension represent (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + + if residual is None: + inp_residual = query + slots = torch.zeros_like(query) + if query_pos is not None: + query = query + query_pos + + bs, num_query, _ = query.size() + + D = reference_points_cam.size(3) + indexes = [] + for i, mask_per_img in enumerate(bev_mask): + index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) + indexes.append(index_query_per_img) + max_len = max([len(each) for each in indexes]) + + # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. 
+ queries_rebatch = query.new_zeros( + [bs, self.num_cams, max_len, self.embed_dims]) + reference_points_rebatch = reference_points_cam.new_zeros( + [bs, self.num_cams, max_len, D, 2]) + + for j in range(bs): + for i, reference_points_per_img in enumerate(reference_points_cam): + index_query_per_img = indexes[i] + queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] + reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] + + num_cams, l, bs, embed_dims = key.shape + + key = key.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + value = value.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + + queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, + reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, + level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) + for j in range(bs): + for i, index_query_per_img in enumerate(indexes): + slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] + + count = bev_mask.sum(-1) > 0 + count = count.permute(1, 2, 0).sum(-1) + count = torch.clamp(count, min=1.0) + slots = slots / count[..., None] + slots = self.output_proj(slots) + + return self.dropout(slots) + inp_residual + + +@ATTENTION.register_module() +class MSDeformableAttention3D(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=8, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.batch_first = batch_first + self.output_proj = None + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + ( bs, num_query, embed_dims). + key (Tensor): The key tensor with shape + `(bs, num_key, embed_dims)`. + value (Tensor): The value tensor with shape + `(bs, num_key, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. 
With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + + if reference_points.shape[-1] == 2: + """ + For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. + After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. + For each referent point, we sample `num_points` sampling points. + For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. + """ + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + + bs, num_query, num_Z_anchors, xy = reference_points.shape + reference_points = reference_points[:, :, None, None, None, :, :] + sampling_offsets = sampling_offsets / \ + offset_normalizer[None, None, None, :, None, :] + bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape + sampling_offsets = sampling_offsets.view( + bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) + sampling_locations = reference_points + sampling_offsets + bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape + assert num_all_points == num_points * num_Z_anchors + + sampling_locations = sampling_locations.view( + bs, num_query, num_heads, num_levels, num_all_points, xy) + + elif reference_points.shape[-1] == 4: + assert False + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + + # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 + # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points + # + + if torch.cuda.is_available() and value.is_cuda: + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + if not self.batch_first: + output = output.permute(1, 0, 2) + + return output 
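The snippet below is illustrative commentary, not part of the patch: a minimal, self-contained sketch (with made-up sizes) of the view/broadcast bookkeeping that MSDeformableAttention3D.forward performs for 2D reference points, where each BEV query carries num_Z_anchors projected points and the per-head offsets are split across those anchors before being flattened back for the deformable-attention kernel.

import torch

# Hypothetical sizes, chosen only to exercise the reshapes described above.
bs, num_query, num_heads, num_levels = 2, 6, 8, 4
num_Z_anchors = 4        # projected points per pillar (num_points_in_pillar)
num_all_points = 8       # the module's num_points, i.e. 2 per Z anchor here

reference_points = torch.rand(bs, num_query, num_Z_anchors, 2)
sampling_offsets = torch.rand(bs, num_query, num_heads, num_levels, num_all_points, 2)
spatial_shapes = torch.tensor([[50, 50], [25, 25], [13, 13], [7, 7]])

# Normalize offsets by (w, h) of each feature level, as in the module.
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_offsets = sampling_offsets / offset_normalizer[None, None, None, :, None, :]

# Split the point axis across Z anchors, add the broadcast reference points,
# then flatten back to (bs, num_query, num_heads, num_levels, num_all_points, 2).
sampling_offsets = sampling_offsets.view(
    bs, num_query, num_heads, num_levels,
    num_all_points // num_Z_anchors, num_Z_anchors, 2)
sampling_locations = reference_points[:, :, None, None, None, :, :] + sampling_offsets
sampling_locations = sampling_locations.view(
    bs, num_query, num_heads, num_levels, num_all_points, 2)
assert sampling_locations.shape == (2, 6, 8, 4, 8, 2)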
diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..78fb9f529c925d1a4f74f1cc1f83de6b1cb20f67 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py @@ -0,0 +1,272 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import ATTENTION +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class TemporalSelfAttention(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to True. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. + the length of BEV queue is 2. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + num_bev_queue=2, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.num_bev_queue = num_bev_queue + self.sampling_offsets = nn.Linear( + embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, + num_bev_queue*num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) + + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. 
+ or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + assert self.batch_first + bs, len_bev, c = query.shape + value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) + + # value = torch.cat([query, query], 0) + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + bs, num_query, embed_dims = query.shape + _, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + assert self.num_bev_queue == 2 + + query = torch.cat([value[:bs], query], -1) + value = self.value_proj(value) + + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + + value = value.reshape(bs*self.num_bev_queue, + num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query) + sampling_offsets = sampling_offsets.view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_bev_queue, + self.num_levels, + self.num_points) + + attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() + sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + # output shape 
(bs*num_bev_queue, num_query, embed_dims) + # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) + output = output.permute(1, 2, 0) + + # fuse history value and current value + # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) + output = output.view(num_query, embed_dims, bs, self.num_bev_queue) + output = output.mean(-1) + + # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) + output = output.permute(2, 0, 1) + + output = self.output_proj(output) + + if not self.batch_first: + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/transformer.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b740fccf5f5ab16ee4cb101fdb8874f2e6c147d2 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/transformer.py @@ -0,0 +1,289 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from mmcv.runner.base_module import BaseModule + +from mmdet.models.utils.builder import TRANSFORMER +from torch.nn.init import normal_ +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.runner.base_module import BaseModule +from torchvision.transforms.functional import rotate +from .temporal_self_attention import TemporalSelfAttention +from .spatial_cross_attention import MSDeformableAttention3D +from .decoder import CustomMSDeformableAttention +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from mmcv.runner import force_fp32, auto_fp16 + + +@TRANSFORMER.register_module() +class PerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + **kwargs): + super(PerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.init_layers() + self.rotate_center = rotate_center + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. + bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. 
+ reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + + inter_references_out = inter_references + + return bev_embed, inter_states, init_reference_out, inter_references_out diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03f906ce601e2dfac207af680774086067808830 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/__init__.py @@ -0,0 +1 @@ +from .epoch_based_runner import EpochBasedRunner_video \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..e73e5e7873f831b3c6e0f19715d950701b65fa25 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
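# A small runnable sketch of how PerceptionTransformer.forward above seeds the decoder:
# each learned object query embedding of size 2*C is split into a positional part and a
# content part, and a linear layer plus sigmoid turns the positional part into an initial
# normalized 3-D reference point. Dimensions here are illustrative.
import torch
import torch.nn as nn

embed_dims, num_query, bs = 256, 900, 2
object_query_embed = torch.randn(num_query, embed_dims * 2)
reference_points_fc = nn.Linear(embed_dims, 3)

query_pos, query = torch.split(object_query_embed, embed_dims, dim=1)
query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
query = query.unsqueeze(0).expand(bs, -1, -1)
init_reference = reference_points_fc(query_pos).sigmoid()  # (bs, num_query, 3), values in [0, 1]
print(query.shape, init_reference.shape)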
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import os.path as osp +import torch +import mmcv +from mmcv.runner.base_runner import BaseRunner +from mmcv.runner.epoch_based_runner import EpochBasedRunner +from mmcv.runner.builder import RUNNERS +from mmcv.runner.checkpoint import save_checkpoint +from mmcv.runner.utils import get_host_info +from pprint import pprint +from mmcv.parallel.data_container import DataContainer + + +@RUNNERS.register_module() +class EpochBasedRunner_video(EpochBasedRunner): + + ''' + # basic logic + + input_sequence = [a, b, c] # given a sequence of samples + + prev_bev = None + for each in input_sequcene[:-1] + prev_bev = eval_model(each, prev_bev)) # inference only. + + model(input_sequcene[-1], prev_bev) # train the last sample. + ''' + + def __init__(self, + model, + eval_model=None, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], + max_iters=None, + max_epochs=None): + super().__init__(model, + batch_processor, + optimizer, + work_dir, + logger, + meta, + max_iters, + max_epochs) + keys.append('img_metas') + self.keys = keys + self.eval_model = eval_model + self.eval_model.eval() + + def run_iter(self, data_batch, train_mode, **kwargs): + if self.batch_processor is not None: + assert False + # outputs = self.batch_processor( + # self.model, data_batch, train_mode=train_mode, **kwargs) + elif train_mode: + + num_samples = data_batch['img'].data[0].size(1) + data_list = [] + prev_bev = None + for i in range(num_samples): + data = {} + for key in self.keys: + if key not in ['img_metas', 'img', 'points']: + data[key] = data_batch[key] + else: + if key == 'img': + data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True) + elif key == 'img_metas': + data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only) + else: + assert False + data_list.append(data) + with torch.no_grad(): + for i in range(num_samples-1): + if i>0: data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs) + + data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs) + else: + assert False + # outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + + if not isinstance(outputs, dict): + raise TypeError('"batch_processor()" or "model.train_step()"' + 'and "model.val_step()" must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/__pycache__/util.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/__pycache__/util.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d5aec925098e5e62d94070d0d92fab349404df0 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/__pycache__/util.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3250ef2bad5b2c52a43bae642b6761b8fa4908a7 --- /dev/null +++ 
b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py @@ -0,0 +1,4 @@ +from .hungarian_assigner_3d import HungarianAssigner3D +from .map_hungarian_assigner_3d import MapHungarianAssigner3D + +__all__ = ['HungarianAssigner3D', 'MapHungarianAssigner3D'] diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..579ae0b53e2d313a8a6de5f4581507f0827d6d78 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/hungarian_assigner_3d.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/hungarian_assigner_3d.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..313f212912cac148f79cd15dd0f1bb55aa1ec9bc Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/hungarian_assigner_3d.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/map_hungarian_assigner_3d.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/map_hungarian_assigner_3d.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8be7a223985767e0993f9672ff2d41273cedde3a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/map_hungarian_assigner_3d.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..583fcab72f6b2bbf20bda90b8f877cc1f81072d9 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py @@ -0,0 +1,136 @@ +import torch + +from mmdet.core.bbox.builder import BBOX_ASSIGNERS +from mmdet.core.bbox.assigners import AssignResult +from mmdet.core.bbox.assigners import BaseAssigner +from mmdet.core.bbox.match_costs import build_match_cost +from mmdet.models.utils.transformer import inverse_sigmoid +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + + +@BBOX_ASSIGNERS.register_module() +class HungarianAssigner3D(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. 
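# A hedged sketch of the sequential training step implemented by EpochBasedRunner_video
# further above: a frozen eval_model rolls the BEV state across the history frames with
# no gradients, and only the last frame of the clip is trained on. The toy interface
# model(frame, prev_bev) -> (loss, bev) is an assumption for illustration; the real runner
# goes through MMCV's train_step/val_step and DataContainer plumbing instead.
import torch

def toy_run_iter(model, eval_model, optimizer, clip_of_frames):
    prev_bev = None
    with torch.no_grad():
        for frame in clip_of_frames[:-1]:
            _, prev_bev = eval_model(frame, prev_bev)   # inference only
    loss, _ = model(clip_of_frames[-1], prev_bev)       # train on the last sample
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()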
+ iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 1.0. + iou_calculator (dict | optional): The config for the iou calculation. + Default type `BboxOverlaps2D`. + iou_mode (str | optional): "iou" (intersection over union), "iof" + (intersection over foreground), or "giou" (generalized + intersection over union). Default "giou". + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=None): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + self.pc_range = pc_range + + def assign(self, + bbox_pred, + cls_pred, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + + # 2. compute the weighted costs + # classification and bboxcost. + cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + + normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) + + reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) + + # weighted sum of above two costs + cost = cls_cost + reg_cost + + # 3. 
do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/map_hungarian_assigner_3d.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/map_hungarian_assigner_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..e6afa75e7b5daefa3fe1592175c628e6ad62c29a --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/map_hungarian_assigner_3d.py @@ -0,0 +1,162 @@ +import torch +import torch.nn.functional as F + +from mmdet.core.bbox.builder import BBOX_ASSIGNERS +from mmdet.core.bbox.assigners import AssignResult +from mmdet.core.bbox.assigners import BaseAssigner +from mmdet.core.bbox.match_costs import build_match_cost +from mmdet.models.utils.transformer import inverse_sigmoid +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from projects.mmdet3d_plugin.VAD.utils.map_utils import ( + normalize_2d_bbox, normalize_2d_pts, denormalize_2d_bbox +) + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + +@BBOX_ASSIGNERS.register_module() +class MapHungarianAssigner3D(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. + iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 1.0. + iou_calculator (dict | optional): The config for the iou calculation. + Default type `BboxOverlaps2D`. + iou_mode (str | optional): "iou" (intersection over union), "iof" + (intersection over foreground), or "giou" (generalized + intersection over union). Default "giou". 
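# A compact, runnable sketch of steps 3-4 of HungarianAssigner3D.assign above: Hungarian
# matching on a CPU cost matrix, then 1-based ground-truth indices for matched queries and
# 0 (background) for everything else. The toy cost matrix is made up.
import torch
from scipy.optimize import linear_sum_assignment

def toy_hungarian_assign(cost):
    # cost: (num_query, num_gt)
    num_query, _ = cost.shape
    row, col = linear_sum_assignment(cost.detach().cpu())
    assigned_gt_inds = cost.new_zeros((num_query,), dtype=torch.long)
    assigned_gt_inds[torch.from_numpy(row)] = torch.from_numpy(col) + 1
    return assigned_gt_inds

if __name__ == '__main__':
    cost = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.5, 0.5]])
    print(toy_hungarian_assign(cost))  # tensor([1, 2, 0])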
+ """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', weight=0.0), + pts_cost=dict(type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0), + pc_range=None): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + self.pts_cost = build_match_cost(pts_cost) + self.pc_range = pc_range + + def assign(self, + bbox_pred, + cls_pred, + pts_pred, + gt_bboxes, + gt_labels, + gt_pts, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + assert bbox_pred.shape[-1] == 4, \ + 'Only support bbox pred shape is 4 dims' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels), None + + # 2. compute the weighted costs + # classification and bboxcost. 
+ cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + + normalized_gt_bboxes = normalize_2d_bbox(gt_bboxes, self.pc_range) + # normalized_gt_bboxes = gt_bboxes + # import pdb;pdb.set_trace() + reg_cost = self.reg_cost(bbox_pred[:, :4], normalized_gt_bboxes[:, :4]) + + _, num_orders, num_pts_per_gtline, num_coords = gt_pts.shape + normalized_gt_pts = normalize_2d_pts(gt_pts, self.pc_range) + num_pts_per_predline = pts_pred.size(1) + if num_pts_per_predline != num_pts_per_gtline: + pts_pred_interpolated = F.interpolate(pts_pred.permute(0,2,1),size=(num_pts_per_gtline), + mode='linear', align_corners=True) + pts_pred_interpolated = pts_pred_interpolated.permute(0,2,1).contiguous() + else: + pts_pred_interpolated = pts_pred + # num_q, num_pts, 2 <-> num_gt, num_pts, 2 + pts_cost_ordered = self.pts_cost(pts_pred_interpolated, normalized_gt_pts) + pts_cost_ordered = pts_cost_ordered.view(num_bboxes, num_gts, num_orders) + pts_cost, order_index = torch.min(pts_cost_ordered, 2) + + bboxes = denormalize_2d_bbox(bbox_pred, self.pc_range) + iou_cost = self.iou_cost(bboxes, gt_bboxes) + # weighted sum of above three costs + cost = cls_cost + reg_cost + iou_cost + pts_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels), order_index \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cb9c159fd905a4670c06167abc101d178a24da2c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__init__.py @@ -0,0 +1,7 @@ +from .nms_free_coder import NMSFreeCoder +from .fut_nms_free_coder import CustomNMSFreeCoder +from .map_nms_free_coder import MapNMSFreeCoder + +__all__ = ['NMSFreeCoder', + 'CustomNMSFreeCoder', + 'MapNMSFreeCoder'] diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f80b6fb06b7c41d528470d9d72a7176987e76dd Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/fut_nms_free_coder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/fut_nms_free_coder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50c3475a8f9dae2ce6faaa7891f4e0457578dd38 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/fut_nms_free_coder.cpython-38.pyc differ diff --git 
a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/map_nms_free_coder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/map_nms_free_coder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e48cf1bfc297b48796b643e8c36ab4153c88b020 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/map_nms_free_coder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/nms_free_coder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/nms_free_coder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17f5aee76da10720a3d82378c3aa9b3fcfa895b1 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/nms_free_coder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/fut_nms_free_coder.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/fut_nms_free_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..fafb3d6aecfc410d7b69715fdf227a8c72feb4d4 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/fut_nms_free_coder.py @@ -0,0 +1,127 @@ +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS +from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox +import numpy as np + + +@BBOX_CODERS.register_module() +class CustomNMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds, traj_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
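# A short sketch of the joint (box, class) top-k selection that decode_single performs
# below: the sigmoid score map of shape (num_query, num_classes) is flattened, the top-k
# entries are kept, and the query index and class label are recovered from the flat index.
# The tensor sizes are illustrative.
import torch

def toy_topk_decode(cls_logits, max_num=5, num_classes=10):
    scores, index = cls_logits.sigmoid().view(-1).topk(max_num)
    labels = index % num_classes        # which class each kept entry belongs to
    bbox_index = index // num_classes   # which query produced it
    return scores, labels, bbox_index

if __name__ == '__main__':
    logits = torch.randn(300, 10)  # 300 queries, 10 classes
    scores, labels, bbox_index = toy_topk_decode(logits)
    print(scores.shape, labels.tolist(), bbox_index.tolist())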
+ """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + traj_preds = traj_preds[bbox_index] + + final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) + final_scores = scores + final_preds = labels + final_traj_preds = traj_preds + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(1) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + labels = final_preds[mask] + trajs = final_traj_preds[mask] + + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels, + 'trajs': trajs + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. + """ + all_cls_scores = preds_dicts['all_cls_scores'][-1] + all_bbox_preds = preds_dicts['all_bbox_preds'][-1] + all_traj_preds = preds_dicts['all_traj_preds'][-1] + + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i], all_traj_preds[i])) + return predictions_list + diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/map_nms_free_coder.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/map_nms_free_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..a7186e8ca56c9f33e3116270ff946b4f6f2fcfac --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/map_nms_free_coder.py @@ -0,0 +1,126 @@ +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS +from projects.mmdet3d_plugin.VAD.utils.map_utils import ( + denormalize_2d_pts, denormalize_2d_bbox +) + + +@BBOX_CODERS.register_module() +class MapNMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. 
Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds, pts_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + pts_preds (Tensor): + Shape [num_query, fixed_num_pts, 2] + Returns: + list[dict]: Decoded boxes. + """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + pts_preds = pts_preds[bbox_index] + + final_box_preds = denormalize_2d_bbox(bbox_preds, self.pc_range) + final_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) #num_q,num_p,2 + # final_box_preds = bbox_preds + final_scores = scores + final_preds = labels + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :4] >= self.post_center_range[:4]).all(1) + mask &= (final_box_preds[..., :4] <= self.post_center_range[4:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + pts = final_pts_preds[mask] + labels = final_preds[mask] + predictions_dict = { + 'map_bboxes': boxes3d, + 'map_scores': scores, + 'map_labels': labels, + 'map_pts': pts, + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
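# A sketch of the score-threshold relaxation loop shared by the coders in this diff: if no
# detection clears the configured threshold, the threshold is shrunk by 10% until at least
# one survives, falling back to keeping everything once it drops below 0.01.
import torch

def toy_relax_threshold(final_scores, score_threshold):
    thresh_mask = final_scores > score_threshold
    tmp_score = score_threshold
    while thresh_mask.sum() == 0:
        tmp_score *= 0.9
        if tmp_score < 0.01:
            thresh_mask = final_scores > -1
            break
        thresh_mask = final_scores >= tmp_score
    return thresh_mask

if __name__ == '__main__':
    print(toy_relax_threshold(torch.tensor([0.02, 0.05]), score_threshold=0.3))  # tensor([False,  True])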
+ """ + all_cls_scores = preds_dicts['map_all_cls_scores'][-1] + all_bbox_preds = preds_dicts['map_all_bbox_preds'][-1] + all_pts_preds = preds_dicts['map_all_pts_preds'][-1] + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i],all_pts_preds[i])) + return predictions_list \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..15321f5b2f376fa938588c4480cd12b77e0e864e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py @@ -0,0 +1,122 @@ +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS +from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox +import numpy as np + + +@BBOX_CODERS.register_module() +class NMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
+ """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + + final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) + final_scores = scores + final_preds = labels + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(1) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + + labels = final_preds[mask] + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
+ """ + all_cls_scores = preds_dicts['all_cls_scores'][-1] + all_bbox_preds = preds_dicts['all_bbox_preds'][-1] + + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i])) + return predictions_list + diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aac1a82a64f467a47e39d7e862357459e84abb84 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py @@ -0,0 +1,4 @@ +from mmdet.core.bbox.match_costs import build_match_cost +from .match_cost import BBox3DL1Cost + +__all__ = ['build_match_cost', 'BBox3DL1Cost'] \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6961e65c2e165d3ce71560cfb27c53e7b538b992 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/match_cost.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/match_cost.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7b9fb73276907000ee1a91a6d67c64ec7fbb9c9 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/match_cost.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..d9678f3c7f666255540762d4064f0f7d82b920ed --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py @@ -0,0 +1,27 @@ +import torch +from mmdet.core.bbox.match_costs.builder import MATCH_COST + + +@MATCH_COST.register_module() +class BBox3DL1Cost(object): + """BBox3DL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 
+ Returns: + torch.Tensor: bbox_cost value with weight + """ + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..698af69c847b559eaf13f9c3e8609223824d255c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__init__.py @@ -0,0 +1,3 @@ +from .lidar_box3d import CustomLiDARInstance3DBoxes + +__all__ = ['CustomLiDARInstance3DBoxes'] \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66d3462fa19de5d4227ad0e80cf2f50271c0062d Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/lidar_box3d.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/lidar_box3d.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccd3a2a43fe1369e066e7c41f14e22fe163464fe Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/lidar_box3d.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/nuscenes_box.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/nuscenes_box.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c143274b33e5e82e993d026bf6bae98e308974b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/nuscenes_box.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/lidar_box3d.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/lidar_box3d.py new file mode 100644 index 0000000000000000000000000000000000000000..22a595de1569ab842214d072b12eed05cc672518 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/lidar_box3d.py @@ -0,0 +1,279 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet3d.core.points import BasePoints +from mmdet3d.ops.roiaware_pool3d import points_in_boxes_gpu +from mmdet3d.core.bbox.structures.base_box3d import BaseInstance3DBoxes +from mmdet3d.core.bbox.structures.utils import limit_period, rotation_3d_in_axis + + +class CustomLiDARInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in LIDAR coordinates. + with future trajs. + + Coordinates in LiDAR: + + .. code-block:: none + + up z x front (yaw=-0.5*pi) + ^ ^ + | / + | / + (yaw=-pi) left y <------ 0 -------- (yaw=0) + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the negative direction of y axis, and decreases from + the negative direction of y to the positive direction of x. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. 
+ Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + def __init__(self, tensor, fut_trajs=None, fut_valid_mask=None, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): + super(CustomLiDARInstance3DBoxes, self).__init__( + tensor, box_dim=box_dim, with_yaw=with_yaw, origin=origin + ) + if fut_trajs is not None: + self.fut_trajs = fut_trajs + if fut_valid_mask is not None: + self.fut_valid_mask = fut_valid_mask + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front x ^ + / | + / | + (x1, y0, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + left y<-------- + ----------- + (x0, y1, z0) + (x0, y0, z0) + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. + assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 0.5, 0] + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format.""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + @property + def nearest_bev(self): + """torch.Tensor: A tensor of 2D BEV box of each box + without rotation.""" + # Obtain BEV boxes with rotation in XYWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angles (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. 
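# A numpy sketch of what the `corners` property above computes for a single box: the eight
# unit-cube corners, re-centred so the relative origin is the bottom centre (0.5, 0.5, 0),
# scaled by the box size, rotated about the z axis and translated to the box position.
# The explicit rotation matrix below assumes a standard counter-clockwise yaw; the exact
# sign convention of mmdet3d's rotation_3d_in_axis may differ.
import numpy as np

def toy_lidar_box_corners(x, y, z, dx, dy, dz, yaw):
    corners_norm = np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1).astype(float)
    corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]            # clockwise corner order
    corners = (corners_norm - np.array([0.5, 0.5, 0.0])) * np.array([dx, dy, dz])
    rot = np.array([[np.cos(yaw), -np.sin(yaw), 0.0],
                    [np.sin(yaw),  np.cos(yaw), 0.0],
                    [0.0, 0.0, 1.0]])
    return corners @ rot.T + np.array([x, y, z])

if __name__ == '__main__':
    print(toy_lidar_box_corners(1.0, 2.0, 0.0, 4.0, 2.0, 1.5, yaw=np.pi / 4).shape)  # (8, 3)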
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], + [rot_sin, rot_cos, 0], + [0, 0, 1]]) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[1, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + self.tensor[:, 6] += angle + + if self.tensor.shape[1] == 9: + # rotate velo vector + self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # clockwise + points.rotate(-angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction='horizontal', points=None): + """Flip the boxes in BEV along given BEV direction. + + In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 1] = -points[:, 1] + elif bev_direction == 'vertical': + points[:, 0] = -points[:, 0] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def in_range_bev(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): the range of box + (x_min, y_min, x_max, y_max) + + Note: + The original implementation of SECOND checks whether boxes in + a range by checking whether the points are in a convex + polygon, we reduce the burden for simpler cases. + + Returns: + torch.Tensor: Whether each box is inside the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > box_range[0]) + & (self.tensor[:, 1] > box_range[1]) + & (self.tensor[:, 0] < box_range[2]) + & (self.tensor[:, 1] < box_range[3])) + return in_range_flags + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): the target Box mode + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BaseInstance3DBoxes`: \ + The converted box of the same type in the ``dst`` mode. 
+ """ + from mmdet3d.core.bbox.structures.box_3d_mode import Box3DMode + return Box3DMode.convert( + box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat) + + def enlarged_box(self, extra_width): + """Enlarge the length, width and height boxes. + + Args: + extra_width (float | torch.Tensor): Extra width to enlarge the box. + + Returns: + :obj:`LiDARInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def points_in_boxes(self, points): + """Find the box which the points are in. + + Args: + points (torch.Tensor): Points in shape (N, 3). + + Returns: + torch.Tensor: The index of box where each point are in. + """ + box_idx = points_in_boxes_gpu( + points.unsqueeze(0), + self.tensor.unsqueeze(0).to(points.device)).squeeze(0) + return box_idx diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/nuscenes_box.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/nuscenes_box.py new file mode 100644 index 0000000000000000000000000000000000000000..05200a0fc3958831637177d9592ce5c2a47a08df --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/nuscenes_box.py @@ -0,0 +1,458 @@ +# nuScenes dev-kit. +# Code written by Oscar Beijbom, 2018. + +import copy +from typing import Tuple, List + +import cv2 +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.axes import Axes +from matplotlib.collections import LineCollection +from pyquaternion import Quaternion +from nuscenes.utils.geometry_utils import view_points +from nuscenes.eval.common.data_classes import EvalBox +from nuscenes.eval.detection.constants import DETECTION_NAMES, ATTRIBUTE_NAMES + + +def color_map(data, cmap): + """数值映射为颜色""" + + dmin, dmax = np.nanmin(data), np.nanmax(data) + cmo = plt.cm.get_cmap(cmap) + cs, k = list(), 256/cmo.N + + for i in range(cmo.N): + c = cmo(i) + for j in range(int(i*k), int((i+1)*k)): + cs.append(c) + cs = np.array(cs) + data = np.uint8(255*(data-dmin)/(dmax-dmin)) + + return cs[data] + +class CustomNuscenesBox: + """ Simple data class representing a 3d box including, label, score and velocity. """ + + def __init__(self, + center: List[float], + size: List[float], + orientation: Quaternion, + fut_trajs: List[float], + label: int = np.nan, + score: float = np.nan, + velocity: Tuple = (np.nan, np.nan, np.nan), + name: str = None, + token: str = None): + """ + :param center: Center of box given as x, y, z. + :param size: Size of box in width, length, height. + :param orientation: Box orientation. + :param label: Integer label, optional. + :param score: Classification score, optional. + :param velocity: Box velocity in x, y, z direction. + :param name: Box name, optional. Can be used e.g. for denote category name. + :param token: Unique string identifier from DB. 
+ """ + assert not np.any(np.isnan(center)) + assert not np.any(np.isnan(size)) + assert len(center) == 3 + assert len(size) == 3 + assert type(orientation) == Quaternion + + self.center = np.array(center) + self.wlh = np.array(size) + self.orientation = orientation + self.label = int(label) if not np.isnan(label) else label + self.score = float(score) if not np.isnan(score) else score + self.velocity = np.array(velocity) + self.name = name + self.token = token + self.fut_trajs = np.array(fut_trajs) + + def __eq__(self, other): + center = np.allclose(self.center, other.center) + wlh = np.allclose(self.wlh, other.wlh) + orientation = np.allclose(self.orientation.elements, other.orientation.elements) + label = (self.label == other.label) or (np.isnan(self.label) and np.isnan(other.label)) + score = (self.score == other.score) or (np.isnan(self.score) and np.isnan(other.score)) + vel = (np.allclose(self.velocity, other.velocity) or + (np.all(np.isnan(self.velocity)) and np.all(np.isnan(other.velocity)))) + + return center and wlh and orientation and label and score and vel + + def __repr__(self): + repr_str = 'label: {}, score: {:.2f}, xyz: [{:.2f}, {:.2f}, {:.2f}], wlh: [{:.2f}, {:.2f}, {:.2f}], ' \ + 'rot axis: [{:.2f}, {:.2f}, {:.2f}], ang(degrees): {:.2f}, ang(rad): {:.2f}, ' \ + 'vel: {:.2f}, {:.2f}, {:.2f}, name: {}, token: {}' + + return repr_str.format(self.label, self.score, self.center[0], self.center[1], self.center[2], self.wlh[0], + self.wlh[1], self.wlh[2], self.orientation.axis[0], self.orientation.axis[1], + self.orientation.axis[2], self.orientation.degrees, self.orientation.radians, + self.velocity[0], self.velocity[1], self.velocity[2], self.name, self.token) + + @property + def rotation_matrix(self) -> np.ndarray: + """ + Return a rotation matrix. + :return: . The box's rotation matrix. + """ + return self.orientation.rotation_matrix + + def translate(self, x: np.ndarray) -> None: + """ + Applies a translation. + :param x: . Translation in x, y, z direction. + """ + self.center += x + + def rotate(self, quaternion: Quaternion) -> None: + """ + Rotates box. + :param quaternion: Rotation to apply. + """ + self.center = np.dot(quaternion.rotation_matrix, self.center) + self.orientation = quaternion * self.orientation + self.velocity = np.dot(quaternion.rotation_matrix, self.velocity) + + def corners(self, wlh_factor: float = 1.0) -> np.ndarray: + """ + Returns the bounding box corners. + :param wlh_factor: Multiply w, l, h by a factor to scale the box. + :return: . First four corners are the ones facing forward. + The last four are the ones facing backwards. + """ + w, l, h = self.wlh * wlh_factor + + # 3D bounding box corners. (Convention: x points forward, y to the left, z up.) + x_corners = l / 2 * np.array([1, 1, 1, 1, -1, -1, -1, -1]) + y_corners = w / 2 * np.array([1, -1, -1, 1, 1, -1, -1, 1]) + z_corners = h / 2 * np.array([1, 1, -1, -1, 1, 1, -1, -1]) + corners = np.vstack((x_corners, y_corners, z_corners)) + + # Rotate + corners = np.dot(self.orientation.rotation_matrix, corners) + + # Translate + x, y, z = self.center + corners[0, :] = corners[0, :] + x + corners[1, :] = corners[1, :] + y + corners[2, :] = corners[2, :] + z + + return corners + + def bottom_corners(self) -> np.ndarray: + """ + Returns the four bottom corners. + :return: . Bottom corners. First two face forward, last two face backwards. 
+ """ + return self.corners()[:, [2, 3, 7, 6]] + + def render(self, + axis: Axes, + view: np.ndarray = np.eye(3), + normalize: bool = False, + colors: Tuple = ('b', 'r', 'k'), + linewidth: float = 2, + box_idx=None, + alpha=0.5) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. + """ + corners = view_points(self.corners(), view, normalize=normalize)[:2, :] + + def draw_rect(selected_corners, color, alpha): + prev = selected_corners[-1] + for corner in selected_corners: + axis.plot([prev[0], corner[0]], [prev[1], corner[1]], color=color, linewidth=linewidth, alpha=alpha) + prev = corner + + # Draw the sides + for i in range(4): + axis.plot([corners.T[i][0], corners.T[i + 4][0]], + [corners.T[i][1], corners.T[i + 4][1]], + color=colors[2], linewidth=linewidth, alpha=alpha) + + # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d) + draw_rect(corners.T[:4], colors[0], alpha) + draw_rect(corners.T[4:], colors[1], alpha) + + # Draw line indicating the front + center_bottom_forward = np.mean(corners.T[2:4], axis=0) + center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0) + axis.plot([center_bottom[0], center_bottom_forward[0]], + [center_bottom[1], center_bottom_forward[1]], + color=colors[0], linewidth=linewidth, alpha=alpha) + if box_idx is not None and center_bottom[0] > -35 and center_bottom[1] > -35 \ + and center_bottom[0] < 35 and center_bottom[1] < 35: + text = f'{box_idx}' + axis.text(center_bottom[0], center_bottom[1], text, ha='left', fontsize=5) + + def render_fut_trajs(self, + axis: Axes, + color: str = 'b', + linewidth: float = 1, + fut_ts: int = 6, + mode_idx=None) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. 
+ """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + if mode_idx is not None: + fut_coords = fut_coords[[mode_idx]] + alpha = 0.8 + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord.cumsum(axis=-2) + fut_coord = fut_coord + self.center[:2] + if np.abs(fut_coord[-1] - self.center[:2]).max() >= 10: + if color == 'g': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='*', s=70, alpha=alpha) + elif color == 'b': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='o', s=20, alpha=alpha) + if mode_idx is None and fut_coord[-1, 0] > -35 and fut_coord[-1, 1] > -35 \ + and fut_coord[-1, 0] < 35 and fut_coord[-1, 1] < 35: + text = f'{i}' + axis.text(fut_coord[-1, 0], fut_coord[-1, 1], text, ha='left', fontsize=5) + axis.plot( + [self.center[0], fut_coord[0, 0]], + [self.center[1], fut_coord[0, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + for i in range(fut_coord.shape[0]-1): + axis.plot( + [fut_coord[i, 0], fut_coord[i+1, 0]], + [fut_coord[i, 1], fut_coord[i+1, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + + def render_fut_trajs_grad_color(self, + axis: Axes, + linewidth: float = 1, + linestyles='solid', + cmap='viridis', + fut_ts: int = 6, + alpha: int = 0.8, + mode_idx=None) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. + """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + if mode_idx is not None: + fut_coords = fut_coords[[mode_idx]] + + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord.cumsum(axis=-2) + fut_coord = fut_coord + self.center[:2] + fut_coord = np.concatenate((self.center[np.newaxis, :2], fut_coord), axis=0) + fut_coord_segments = np.stack((fut_coord[:-1], fut_coord[1:]), axis=1) + + fut_vecs = None + for j in range(fut_coord_segments.shape[0]): + fut_vec_j = fut_coord_segments[j] + x_linspace = np.linspace(fut_vec_j[0, 0], fut_vec_j[1, 0], 51) + y_linspace = np.linspace(fut_vec_j[0, 1], fut_vec_j[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if fut_vecs is None: + fut_vecs = xy + else: + fut_vecs = np.concatenate((fut_vecs, xy), axis=0) + + y = np.sin(np.linspace(3/2*np.pi, 5/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(fut_vecs, colors=colors, linewidths=linewidth, linestyles=linestyles, cmap=cmap) + + # if mode_idx is None and abs(fut_coord[-1, 0]) < 35 and abs(fut_coord[-1, 1]) < 35: + # text = f'{i}' + # axis.text(fut_coord[-1, 0], fut_coord[-1, 1], text, ha='left', fontsize=5) + + axis.add_collection(line_segments) + + def render_fut_trajs_coords(self, + axis: Axes, + color: str = 'b', + linewidth: float = 1, + fut_ts: int = 12) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. 
+ :param linewidth: Width in pixel of the box sides. + """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + alpha = 0.2 if color == 'b' else 1 + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord + self.center[:2] + if np.abs(fut_coord[-1] - self.center[:2]).max() >= 10: + if color == 'g': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='*', s=70, alpha=alpha) + elif color == 'b': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='o', s=20, alpha=alpha) + axis.plot( + [self.center[0], fut_coord[0, 0]], + [self.center[1], fut_coord[0, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + for i in range(fut_coord.shape[0]-1): + axis.plot( + [fut_coord[i, 0], fut_coord[i+1, 0]], + [fut_coord[i, 1], fut_coord[i+1, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + + def render_cv2(self, + im: np.ndarray, + view: np.ndarray = np.eye(3), + normalize: bool = False, + colors: Tuple = ((0, 0, 255), (255, 0, 0), (155, 155, 155)), + linewidth: int = 2) -> None: + """ + Renders box using OpenCV2. + :param im: . Image array. Channels are in BGR order. + :param view: . Define a projection if needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: ((R, G, B), (R, G, B), (R, G, B)). Colors for front, side & rear. + :param linewidth: Linewidth for plot. + """ + corners = view_points(self.corners(), view, normalize=normalize)[:2, :] + + def draw_rect(selected_corners, color): + prev = selected_corners[-1] + for corner in selected_corners: + cv2.line(im, + (int(prev[0]), int(prev[1])), + (int(corner[0]), int(corner[1])), + color, linewidth) + prev = corner + + # Draw the sides + for i in range(4): + cv2.line(im, + (int(corners.T[i][0]), int(corners.T[i][1])), + (int(corners.T[i + 4][0]), int(corners.T[i + 4][1])), + colors[2][::-1], linewidth) + + # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d) + draw_rect(corners.T[:4], colors[0][::-1]) + draw_rect(corners.T[4:], colors[1][::-1]) + + # Draw line indicating the front + center_bottom_forward = np.mean(corners.T[2:4], axis=0) + center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0) + cv2.line(im, + (int(center_bottom[0]), int(center_bottom[1])), + (int(center_bottom_forward[0]), int(center_bottom_forward[1])), + colors[0][::-1], linewidth) + + def copy(self) -> 'CustomNuscenesBox': + """ + Create a copy of self. + :return: A copy. + """ + return copy.deepcopy(self) + + +class CustomDetectionBox(EvalBox): + """ Data class used during detection evaluation. Can be a prediction or ground truth.""" + + def __init__(self, + sample_token: str = "", + translation: Tuple[float, float, float] = (0, 0, 0), + size: Tuple[float, float, float] = (0, 0, 0), + rotation: Tuple[float, float, float, float] = (0, 0, 0, 0), + velocity: Tuple[float, float] = (0, 0), + ego_translation: Tuple[float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters. + num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes. + detection_name: str = 'car', # The class name used in the detection challenge. + detection_score: float = -1.0, # GT samples do not have a score. + attribute_name: str = '', # Box attribute. Each box can have at most 1 attribute. + fut_trajs=None): # future trajectories of a pred box, shape=[fut_ts*2]. 
+ + super().__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts) + + assert detection_name is not None, 'Error: detection_name cannot be empty!' + assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name + + assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \ + 'Error: Unknown attribute_name %s' % attribute_name + + assert type(detection_score) == float, 'Error: detection_score must be a float!' + assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!' + + # Assign. + self.detection_name = detection_name + self.detection_score = detection_score + self.attribute_name = attribute_name + self.fut_trajs = fut_trajs + + def __eq__(self, other): + return (self.sample_token == other.sample_token and + self.translation == other.translation and + self.size == other.size and + self.rotation == other.rotation and + self.velocity == other.velocity and + self.ego_translation == other.ego_translation and + self.num_pts == other.num_pts and + self.detection_name == other.detection_name and + self.detection_score == other.detection_score and + self.attribute_name == other.attribute_name and + self.fut_trajs == other.fut_trajs) + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'fut_trajs': self.fut_trajs + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. 
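A hypothetical round trip through `serialize()` and `deserialize()`; the sample token and all numbers below are invented, and the sketch assumes the nuScenes devkit constants (`DETECTION_NAMES`, `ATTRIBUTE_NAMES`) are available:

```python
pred = CustomDetectionBox(
    sample_token='0123456789abcdef0123456789abcdef',
    translation=(600.0, 1640.0, 1.0),
    size=(1.9, 4.5, 1.6),
    rotation=(0.97, 0.0, 0.0, 0.24),
    velocity=(2.0, 0.1),
    detection_name='car',
    detection_score=0.83,
    attribute_name='vehicle.moving',
    fut_trajs=tuple([0.5, 0.0] * 6))

record = pred.serialize()                        # plain, json-friendly dict
restored = CustomDetectionBox.deserialize(record)
print(restored.detection_name, restored.detection_score, len(restored.fut_trajs))
```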
""" + return cls(sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + fut_trajs=tuple(content['fut_trajs']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name']) diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/util.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/util.py new file mode 100644 index 0000000000000000000000000000000000000000..c54bd750246f3d6e2249b7d39888fffa6227beda --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/util.py @@ -0,0 +1,53 @@ +import torch + + +def normalize_bbox(bboxes, pc_range): + + cx = bboxes[..., 0:1] + cy = bboxes[..., 1:2] + cz = bboxes[..., 2:3] + w = bboxes[..., 3:4].log() + l = bboxes[..., 4:5].log() + h = bboxes[..., 5:6].log() + + rot = bboxes[..., 6:7] + if bboxes.size(-1) > 7: + vx = bboxes[..., 7:8] + vy = bboxes[..., 8:9] + normalized_bboxes = torch.cat( + (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 + ) + else: + normalized_bboxes = torch.cat( + (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 + ) + return normalized_bboxes + +def denormalize_bbox(normalized_bboxes, pc_range): + # rotation + rot_sine = normalized_bboxes[..., 6:7] + + rot_cosine = normalized_bboxes[..., 7:8] + rot = torch.atan2(rot_sine, rot_cosine) + + # center in the bev + cx = normalized_bboxes[..., 0:1] + cy = normalized_bboxes[..., 1:2] + cz = normalized_bboxes[..., 4:5] + + # size + w = normalized_bboxes[..., 2:3] + l = normalized_bboxes[..., 3:4] + h = normalized_bboxes[..., 5:6] + + w = w.exp() + l = l.exp() + h = h.exp() + if normalized_bboxes.size(-1) > 8: + # velocity + vx = normalized_bboxes[:, 8:9] + vy = normalized_bboxes[:, 9:10] + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) + else: + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) + return denormalized_bboxes \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d92421c7e84fdc7a33e94aa10fddfccb332d6399 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__init__.py @@ -0,0 +1 @@ +from .eval_hooks import CustomDistEvalHook \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1152e552d359e236bfdc2500cc56c57910cb82af Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/eval_hooks.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/eval_hooks.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d4e14ebf833fa6af9f96f34d3d8a51f39cde0da Binary files /dev/null and 
b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/eval_hooks.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/metric_motion.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/metric_motion.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6111380f8ccfa5ce5e875474c18eaf6fb519ea03 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/metric_motion.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..96b70706885750f8912741363287e973c12a384c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py @@ -0,0 +1,92 @@ + +# Note: Considering that MMCV's EvalHook updated its interface in V1.3.16, +# in order to avoid strong version dependency, we did not directly +# inherit EvalHook but BaseDistEvalHook. + +import bisect +import os.path as osp + +import mmcv +import torch.distributed as dist +from mmcv.runner import DistEvalHook as BaseDistEvalHook +from mmcv.runner import EvalHook as BaseEvalHook +from torch.nn.modules.batchnorm import _BatchNorm +from mmdet.core.evaluation.eval_hooks import DistEvalHook + + +def _calc_dynamic_intervals(start_interval, dynamic_interval_list): + assert mmcv.is_list_of(dynamic_interval_list, tuple) + + dynamic_milestones = [0] + dynamic_milestones.extend( + [dynamic_interval[0] for dynamic_interval in dynamic_interval_list]) + dynamic_intervals = [start_interval] + dynamic_intervals.extend( + [dynamic_interval[1] for dynamic_interval in dynamic_interval_list]) + return dynamic_milestones, dynamic_intervals + + +class CustomDistEvalHook(BaseDistEvalHook): + + def __init__(self, *args, dynamic_intervals=None, **kwargs): + super(CustomDistEvalHook, self).__init__(*args, **kwargs) + self.use_dynamic_intervals = dynamic_intervals is not None + if self.use_dynamic_intervals: + self.dynamic_milestones, self.dynamic_intervals = \ + _calc_dynamic_intervals(self.interval, dynamic_intervals) + + def _decide_interval(self, runner): + if self.use_dynamic_intervals: + progress = runner.epoch if self.by_epoch else runner.iter + step = bisect.bisect(self.dynamic_milestones, (progress + 1)) + # Dynamically modify the evaluation interval + self.interval = self.dynamic_intervals[step - 1] + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + self._decide_interval(runner) + super().before_train_epoch(runner) + + def before_train_iter(self, runner): + self._decide_interval(runner) + super().before_train_iter(runner) + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. 
+ if self.broadcast_bn_buffer: + model = runner.model + for name, module in model.named_modules(): + if isinstance(module, + _BatchNorm) and module.track_running_stats: + dist.broadcast(module.running_var, 0) + dist.broadcast(module.running_mean, 0) + + if not self._should_evaluate(runner): + return + + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + + # from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test # to solve circlur import + from projects.mmdet3d_plugin.VAD.apis.test import custom_multi_gpu_test # to solve circlur import + + results = custom_multi_gpu_test( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + + key_score = self.evaluate(runner, results) + + if self.save_best: + self._save_ckpt(runner, key_score) + diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..f816974544b57c1561a1fc09b9cf9e48dde03e38 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py @@ -0,0 +1,251 @@ +# Copyright (c) OpenMMLab. All rights reserved. +r"""Adapted from `Waymo to KITTI converter + `_. +""" + +try: + from waymo_open_dataset import dataset_pb2 as open_dataset + import mmcv + import numpy as np + import tensorflow as tf + from glob import glob + from os.path import join + from waymo_open_dataset import label_pb2 + from waymo_open_dataset.protos import metrics_pb2 +except ImportError: + #pass + raise ImportError( + 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' + 'to install the official devkit first.') + + + + +class KITTI2Waymo(object): + """KITTI predictions to Waymo converter. + This class serves as the converter to change predictions from KITTI to + Waymo format. + Args: + kitti_result_files (list[dict]): Predictions in KITTI format. + waymo_tfrecords_dir (str): Directory to load waymo raw data. + waymo_results_save_dir (str): Directory to save converted predictions + in waymo format (.bin files). + waymo_results_final_path (str): Path to save combined + predictions in waymo format (.bin file), like 'a/b/c.bin'. + prefix (str): Prefix of filename. In general, 0 for training, 1 for + validation and 2 for testing. + workers (str): Number of parallel processes. 
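Looking back at the evaluation hook, here is a standalone sketch of the milestone lookup implemented by `_calc_dynamic_intervals` and `_decide_interval`; the milestone/interval pairs are illustrative:

```python
import bisect

start_interval = 4                        # evaluate every 4 epochs initially
dynamic_interval_list = [(8, 2), (12, 1)]

milestones = [0] + [m for m, _ in dynamic_interval_list]
intervals = [start_interval] + [i for _, i in dynamic_interval_list]

for epoch in (3, 8, 11, 15):
    step = bisect.bisect(milestones, epoch + 1)
    print(f'epoch {epoch}: eval interval {intervals[step - 1]}')
```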
+ """ + + def __init__(self, + kitti_result_files, + waymo_tfrecords_dir, + waymo_results_save_dir, + waymo_results_final_path, + prefix, + workers=64): + + self.kitti_result_files = kitti_result_files + self.waymo_tfrecords_dir = waymo_tfrecords_dir + self.waymo_results_save_dir = waymo_results_save_dir + self.waymo_results_final_path = waymo_results_final_path + self.prefix = prefix + self.workers = int(workers) + self.name2idx = {} + for idx, result in enumerate(kitti_result_files): + if len(result['sample_idx']) > 0: + self.name2idx[str(result['sample_idx'][0])] = idx + + # turn on eager execution for older tensorflow versions + if int(tf.__version__.split('.')[0]) < 2: + tf.enable_eager_execution() + + self.k2w_cls_map = { + 'Car': label_pb2.Label.TYPE_VEHICLE, + 'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN, + 'Sign': label_pb2.Label.TYPE_SIGN, + 'Cyclist': label_pb2.Label.TYPE_CYCLIST, + } + + self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], + [-1.0, 0.0, 0.0, 0.0], + [0.0, -1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.0]]) + + self.get_file_names() + self.create_folder() + + def get_file_names(self): + """Get file names of waymo raw data.""" + self.waymo_tfrecord_pathnames = sorted( + glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) + print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') + + def create_folder(self): + """Create folder for data conversion.""" + mmcv.mkdir_or_exist(self.waymo_results_save_dir) + + def parse_objects(self, kitti_result, T_k2w, context_name, + frame_timestamp_micros): + """Parse one prediction with several instances in kitti format and + convert them to `Object` proto. + Args: + kitti_result (dict): Predictions in kitti format. + - name (np.ndarray): Class labels of predictions. + - dimensions (np.ndarray): Height, width, length of boxes. + - location (np.ndarray): Bottom center of boxes (x, y, z). + - rotation_y (np.ndarray): Orientation of boxes. + - score (np.ndarray): Scores of predictions. + T_k2w (np.ndarray): Transformation matrix from kitti to waymo. + context_name (str): Context name of the frame. + frame_timestamp_micros (int): Frame timestamp. + Returns: + :obj:`Object`: Predictions in waymo dataset Object proto. + """ + + def parse_one_object(instance_idx): + """Parse one instance in kitti format and convert them to `Object` + proto. + Args: + instance_idx (int): Index of the instance to be converted. + Returns: + :obj:`Object`: Predicted instance in waymo dataset \ + Object proto. 
+ """ + cls = kitti_result['name'][instance_idx] + length = round(kitti_result['dimensions'][instance_idx, 0], 4) + height = round(kitti_result['dimensions'][instance_idx, 1], 4) + width = round(kitti_result['dimensions'][instance_idx, 2], 4) + x = round(kitti_result['location'][instance_idx, 0], 4) + y = round(kitti_result['location'][instance_idx, 1], 4) + z = round(kitti_result['location'][instance_idx, 2], 4) + rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) + score = round(kitti_result['score'][instance_idx], 4) + + # y: downwards; move box origin from bottom center (kitti) to + # true center (waymo) + y -= height / 2 + # frame transformation: kitti -> waymo + x, y, z = self.transform(T_k2w, x, y, z) + + # different conventions + heading = -(rotation_y + np.pi / 2) + while heading < -np.pi: + heading += 2 * np.pi + while heading > np.pi: + heading -= 2 * np.pi + + box = label_pb2.Label.Box() + box.center_x = x + box.center_y = y + box.center_z = z + box.length = length + box.width = width + box.height = height + box.heading = heading + + o = metrics_pb2.Object() + o.object.box.CopyFrom(box) + o.object.type = self.k2w_cls_map[cls] + o.score = score + + o.context_name = context_name + o.frame_timestamp_micros = frame_timestamp_micros + + return o + + objects = metrics_pb2.Objects() + + for instance_idx in range(len(kitti_result['name'])): + o = parse_one_object(instance_idx) + objects.objects.append(o) + + return objects + + def convert_one(self, file_idx): + """Convert action for single file. + Args: + file_idx (int): Index of the file to be converted. + """ + file_pathname = self.waymo_tfrecord_pathnames[file_idx] + file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') + + for frame_num, frame_data in enumerate(file_data): + frame = open_dataset.Frame() + frame.ParseFromString(bytearray(frame_data.numpy())) + filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' + + for camera in frame.context.camera_calibrations: + # FRONT = 1, see dataset.proto for details + if camera.name == 1: + T_front_cam_to_vehicle = np.array( + camera.extrinsic.transform).reshape(4, 4) + + T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam + + context_name = frame.context.name + frame_timestamp_micros = frame.timestamp_micros + + if filename in self.name2idx: + kitti_result = \ + self.kitti_result_files[self.name2idx[filename]] + objects = self.parse_objects(kitti_result, T_k2w, context_name, + frame_timestamp_micros) + else: + print(filename, 'not found.(bevformer)') + objects = metrics_pb2.Objects() + + with open( + join(self.waymo_results_save_dir, f'{filename}.bin'), + 'wb') as f: + f.write(objects.SerializeToString()) + + def convert(self): + """Convert action.""" + print('Start converting ...') + mmcv.track_parallel_progress(self.convert_one, range(len(self)), + self.workers) + print('\nFinished ...') + + # combine all files into one .bin + pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) + combined = self.combine(pathnames) + + with open(self.waymo_results_final_path, 'wb') as f: + f.write(combined.SerializeToString()) + + def __len__(self): + """Length of the filename list.""" + return len(self.waymo_tfrecord_pathnames) + + def transform(self, T, x, y, z): + """Transform the coordinates with matrix T. + Args: + T (np.ndarray): Transformation matrix. + x(float): Coordinate in x axis. + y(float): Coordinate in y axis. + z(float): Coordinate in z axis. + Returns: + list: Coordinates after transformation. 
+ """ + pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) + pt_aft = np.matmul(T, pt_bef) + return pt_aft[:3].flatten().tolist() + + def combine(self, pathnames): + """Combine predictions in waymo format for each sample together. + Args: + pathnames (str): Paths to save predictions. + Returns: + :obj:`Objects`: Combined predictions in Objects proto. + """ + combined = metrics_pb2.Objects() + + for pathname in pathnames: + objects = metrics_pb2.Objects() + with open(pathname, 'rb') as f: + objects.ParseFromString(f.read()) + for o in objects.objects: + combined.objects.append(o) + + return combined \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/metric_motion.py b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/metric_motion.py new file mode 100644 index 0000000000000000000000000000000000000000..8219438cda0ad6733871a8b23515d3ff470439ce --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/metric_motion.py @@ -0,0 +1,70 @@ +# + +"""This module evaluates the forecasted trajectories against the ground truth.""" + +import math +from typing import Dict, List, Optional + +import numpy as np +import torch + +LOW_PROB_THRESHOLD_FOR_METRICS = 0.05 + + +def get_ade(forecasted_trajectory: torch.Tensor, gt_trajectory: torch.Tensor) -> float: + """Compute Average Displacement Error. + Args: + forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + Returns: + ade: Average Displacement Error + """ + pred_len = forecasted_trajectory.shape[0] + ade = float( + sum( + torch.sqrt( + (forecasted_trajectory[i, 0] - gt_trajectory[i, 0]) ** 2 + + (forecasted_trajectory[i, 1] - gt_trajectory[i, 1]) ** 2 + ) + for i in range(pred_len) + ) + / pred_len + ) + return ade + +def get_best_preds( + forecasted_trajectory: torch.Tensor, + gt_trajectory: torch.Tensor +) -> float: + """Compute min Average Displacement Error. + Args: + forecasted_trajectory: Predicted trajectory with shape [k, fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + gt_fut_masks: Ground truth traj mask with shape (fut_ts) + Returns: + best_forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + """ + + # [k, fut_ts] + dist = torch.linalg.norm(gt_trajectory[None] - forecasted_trajectory, dim=-1) + dist = dist[..., -1] + dist[torch.isnan(dist)] = 0 + min_mode_idx = torch.argmin(dist, dim=-1) + + return forecasted_trajectory[min_mode_idx] + +def get_fde(forecasted_trajectory: torch.Tensor, gt_trajectory: torch.Tensor) -> float: + """Compute Final Displacement Error. 
+ Args: + forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + Returns: + fde: Final Displacement Error + """ + fde = float( + torch.sqrt( + (forecasted_trajectory[-1, 0] - gt_trajectory[-1, 0]) ** 2 + + (forecasted_trajectory[-1, 1] - gt_trajectory[-1, 1]) ** 2 + ) + ) + return fde diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__init__.py b/GenAD-main/projects/mmdet3d_plugin/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..480874fd8759bd989a82de2803f8b872e9238124 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/__init__.py @@ -0,0 +1,6 @@ +from .nuscenes_vad_dataset import VADCustomNuScenesDataset + + +__all__ = [ + 'VADCustomNuScenesDataset' +] diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1757bb3b960f999a9778456293e1f9dbdc3affd2 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/builder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/builder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a0a975e4f163e41f5c17f500fa8636d507606ae Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/builder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/nuscenes_vad_dataset.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/nuscenes_vad_dataset.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..424216653d97efa21f41a3ea6794ee2df9d8a29a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/nuscenes_vad_dataset.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/vad_custom_nuscenes_eval.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/vad_custom_nuscenes_eval.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..823113b695772c5080799170813c1c389fdfca0b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/vad_custom_nuscenes_eval.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/builder.py b/GenAD-main/projects/mmdet3d_plugin/datasets/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..007c988561194d54581f6e40255f3f20e6087aa7 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/builder.py @@ -0,0 +1,151 @@ + +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import Registry, build_from_cfg +from torch.utils.data import DataLoader + +from mmdet.datasets.samplers import GroupSampler +from projects.mmdet3d_plugin.datasets.samplers.group_sampler import DistributedGroupSampler +from projects.mmdet3d_plugin.datasets.samplers.distributed_sampler import DistributedSampler +from projects.mmdet3d_plugin.datasets.samplers.sampler import build_sampler + +FUSERS = Registry("fusers") + +def build_fuser(cfg): + return FUSERS.build(cfg) + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + shuffler_sampler=None, + nonshuffler_sampler=None, + **kwargs): + """Build PyTorch DataLoader. + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + kwargs: any keyword argument to be used to initialize DataLoader + Returns: + DataLoader: A PyTorch dataloader. + """ + rank, world_size = get_dist_info() + if dist: + # DistributedGroupSampler will definitely shuffle the data to satisfy + # that images on each GPU are in the same group + if shuffle: + sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'), + dict( + dataset=dataset, + samples_per_gpu=samples_per_gpu, + num_replicas=world_size, + rank=rank, + seed=seed) + ) + + else: + sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'), + dict( + dataset=dataset, + num_replicas=world_size, + rank=rank, + shuffle=shuffle, + seed=seed) + ) + + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + # assert False, 'not support in bevformer' + print('WARNING!!!!, Only can be used for obtain inference speed!!!!') + sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=False, + worker_init_fn=init_fn, + **kwargs) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + + +# Copyright (c) OpenMMLab. All rights reserved. 
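A standalone illustration of the per-worker seeding scheme in `worker_init_fn` above: every dataloader worker on every rank gets a distinct, reproducible seed (the seed value is arbitrary):

```python
num_workers, seed = 4, 2023
for rank in range(2):
    for worker_id in range(num_workers):
        worker_seed = num_workers * rank + worker_id + seed
        print(f'rank {rank}, worker {worker_id} -> seed {worker_seed}')
```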
+import platform +from mmcv.utils import Registry, build_from_cfg + +from mmdet.datasets import DATASETS +from mmdet.datasets.builder import _concat_dataset + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +OBJECTSAMPLERS = Registry('Object sampler') + + +def custom_build_dataset(cfg, default_args=None): + from mmdet3d.datasets.dataset_wrappers import CBGSDataset + from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, + ConcatDataset, RepeatDataset) + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'ConcatDataset': + dataset = ConcatDataset( + [custom_build_dataset(c, default_args) for c in cfg['datasets']], + cfg.get('separate_eval', True)) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + custom_build_dataset(cfg['dataset'], default_args), cfg['times']) + elif cfg['type'] == 'ClassBalancedDataset': + dataset = ClassBalancedDataset( + custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) + elif cfg['type'] == 'CBGSDataset': + dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args)) + elif isinstance(cfg.get('ann_file'), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/detection_cvpr_2019.json b/GenAD-main/projects/mmdet3d_plugin/datasets/detection_cvpr_2019.json new file mode 100644 index 0000000000000000000000000000000000000000..809ba46f76ebf09cc572c209bfddd94b7ee68084 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/detection_cvpr_2019.json @@ -0,0 +1,21 @@ +{ + "class_range": { + "car": 50, + "truck": 50, + "bus": 50, + "trailer": 50, + "construction_vehicle": 50, + "pedestrian": 40, + "motorcycle": 40, + "bicycle": 40, + "traffic_cone": 30, + "barrier": 30 + }, + "dist_fcn": "center_distance", + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "max_boxes_per_sample": 500, + "mean_ap_weight": 5 +} diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/mean_ap.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/mean_ap.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f17c5e4128e22d60233ae43699557f0460269286 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/mean_ap.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15c9a6947be454468ee21799974f7e840da1f00b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp_chamfer.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp_chamfer.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..064896ee5a0a3b2d78094397fe6c964489526f0a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp_chamfer.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py new file mode 100644 index 0000000000000000000000000000000000000000..023260659c4376af4dd4863880648e1c287c88bb --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py @@ -0,0 +1,389 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from multiprocessing import Pool +from shapely.geometry import LineString, Polygon +import mmcv +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable +import json +from os import path as osp +import os +from functools import partial +from .tpfp import tpfp_gen, custom_tpfp_gen + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + +def get_cls_results(gen_results, + annotations, + num_sample=100, + num_pred_pts_per_instance=30, + eval_use_same_gt_sample_num_flag=False, + class_id=0, + fix_interval=False): + """Get det results and gt information of a certain class. + + Args: + gen_results (list[list]): Same as `eval_map()`. + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. 
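A toy precision/recall curve run through `average_precision` above, comparing the two supported modes (values are made up):

```python
import numpy as np

recalls = np.array([0.1, 0.4, 0.8, 1.0], dtype=np.float32)
precisions = np.array([1.0, 0.9, 0.7, 0.5], dtype=np.float32)

print(average_precision(recalls, precisions, mode='area'))      # area under the PR envelope
print(average_precision(recalls, precisions, mode='11points'))  # mean precision at recall 0, 0.1, ..., 1
```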
+ + Returns: + tuple[list[np.ndarray]]: detected bboxes, gt bboxes + """ + # if len(gen_results) == 0 or + + cls_gens, cls_scores = [], [] + for res in gen_results['vectors']: + if res['type'] == class_id: + if len(res['pts']) < 2: + continue + if not eval_use_same_gt_sample_num_flag: + sampled_points = np.array(res['pts']) + else: + line = res['pts'] + line = LineString(line) + + if fix_interval: + distances = list(np.arange(1., line.length, 1.)) + distances = [0,] + distances + [line.length,] + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + else: + distances = np.linspace(0, line.length, num_sample) + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + + cls_gens.append(sampled_points) + cls_scores.append(res['confidence_level']) + num_res = len(cls_gens) + if num_res > 0: + cls_gens = np.stack(cls_gens).reshape(num_res,-1) + cls_scores = np.array(cls_scores)[:,np.newaxis] + cls_gens = np.concatenate([cls_gens,cls_scores],axis=-1) + # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') + else: + if not eval_use_same_gt_sample_num_flag: + cls_gens = np.zeros((0,num_pred_pts_per_instance*2+1)) + else: + cls_gens = np.zeros((0,num_sample*2+1)) + # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') + + cls_gts = [] + for ann in annotations['vectors']: + if ann['type'] == class_id: + # line = ann['pts'] + np.array((1,1)) # for hdmapnet + line = ann['pts'] + # line = ann['pts'].cumsum(0) + line = LineString(line) + distances = np.linspace(0, line.length, num_sample) + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + + cls_gts.append(sampled_points) + num_gts = len(cls_gts) + if num_gts > 0: + cls_gts = np.stack(cls_gts).reshape(num_gts,-1) + else: + cls_gts = np.zeros((0,num_sample*2)) + return cls_gens, cls_gts + # ones = np.ones((num_gts,1)) + # tmp_cls_gens = np.concatenate([cls_gts,ones],axis=-1) + # return tmp_cls_gens, cls_gts + +def format_res_gt_by_classes(result_path, + gen_results, + annotations, + cls_names=None, + num_pred_pts_per_instance=30, + eval_use_same_gt_sample_num_flag=False, + pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0], + nproc=24): + assert cls_names is not None + timer = mmcv.Timer() + num_fixed_sample_pts = 100 + fix_interval = False + print('results path: {}'.format(result_path)) + + output_dir = osp.join(*osp.split(result_path)[:-1]) + assert len(gen_results) == len(annotations) + + pool = Pool(nproc) + cls_gens, cls_gts = {}, {} + print('Formatting ...') + formatting_file = 'cls_formatted.pkl' + formatting_file = osp.join(output_dir,formatting_file) + + # for vis + if False: + from PIL import Image + import matplotlib.pyplot as plt + from matplotlib import transforms + from matplotlib.patches import Rectangle + + show_dir = osp.join(output_dir,'vis_json') + mmcv.mkdir_or_exist(osp.abspath(show_dir)) + # import pdb;pdb.set_trace() + car_img = Image.open('./figs/lidar_car.png') + colors_plt = ['r', 'b', 'g'] + for i in range(20): + + plt.figure(figsize=(2, 4)) + plt.xlim(pc_range[0], pc_range[3]) + plt.ylim(pc_range[1], pc_range[4]) + plt.axis('off') + + for line in gen_results[i]['vectors']: + l = np.array(line['pts']) + plt.plot(l[:,0],l[:,1],'-', + # color=colors[line['type']] + color = 'red', + ) + + for line in annotations[i]['vectors']: + # l = np.array(line['pts']) + np.array((1,1)) + l = np.array(line['pts']) + # l = line['pts'] + 
plt.plot(l[:,0],l[:,1],'-', + # color=colors[line['type']], + color = 'blue', + ) + plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5]) + map_path = osp.join(show_dir, 'COMPARE_MAP_{}.jpg'.format(i)) + plt.savefig(map_path, bbox_inches='tight', dpi=400) + plt.close() + + for i, clsname in enumerate(cls_names): + + gengts = pool.starmap( + partial(get_cls_results, num_sample=num_fixed_sample_pts, + num_pred_pts_per_instance=num_pred_pts_per_instance, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,class_id=i,fix_interval=fix_interval), + zip(list(gen_results.values()), annotations)) + # gengts = map(partial(get_cls_results, num_sample=num_fixed_sample_pts, class_id=i,fix_interval=fix_interval), + # zip(gen_results, annotations)) + # import pdb;pdb.set_trace() + gens, gts = tuple(zip(*gengts)) + cls_gens[clsname] = gens + cls_gts[clsname] = gts + + mmcv.dump([cls_gens, cls_gts],formatting_file) + print('Cls data formatting done in {:2f}s!! with {}'.format(float(timer.since_start()),formatting_file)) + pool.close() + return cls_gens, cls_gts + +def eval_map(gen_results, + annotations, + cls_gens, + cls_gts, + threshold=0.5, + cls_names=None, + logger=None, + tpfp_fn=None, + pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0], + metric=None, + num_pred_pts_per_instance=30, + nproc=24): + timer = mmcv.Timer() + pool = Pool(nproc) + + eval_results = [] + + for i, clsname in enumerate(cls_names): + + # get gt and det bboxes of this class + cls_gen = cls_gens[clsname] + cls_gt = cls_gts[clsname] + # choose proper function according to datasets to compute tp and fp + # XXX + # func_name = cls2func[clsname] + # tpfp_fn = tpfp_fn_dict[tpfp_fn_name] + tpfp_fn = custom_tpfp_gen + # Trick for serialized + # only top-level function can be serized + # somehow use partitial the return function is defined + # at the top level. 
+ + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold, metric=metric) + # import pdb; pdb.set_trace() + # TODO this is a hack + tpfp_fn = partial(tpfp_fn, threshold=threshold, metric=metric) + args = [] + # compute tp and fp for each image with multiple processes + tpfp = pool.starmap( + tpfp_fn, + zip(cls_gen, cls_gt, *args)) + # import pdb;pdb.set_trace() + tp, fp = tuple(zip(*tpfp)) + + + + # map_results = map( + # tpfp_fn, + # cls_gen, cls_gt) + # tp, fp = tuple(map(list, zip(*map_results))) + + + # debug and testing + # for i in range(len(cls_gen)): + # # print(i) + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) + # print(i) + # tpfp = (tpfp,) + # print(tpfp) + # i = 0 + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) + # import pdb; pdb.set_trace() + + # XXX + + num_gts = 0 + for j, bbox in enumerate(cls_gt): + num_gts += bbox.shape[0] + + # sort all det bboxes by score, also sort tp and fp + # import pdb;pdb.set_trace() + cls_gen = np.vstack(cls_gen) + num_dets = cls_gen.shape[0] + sort_inds = np.argsort(-cls_gen[:, -1]) #descending, high score front + tp = np.hstack(tp)[sort_inds] + fp = np.hstack(fp)[sort_inds] + + # calculate recall and precision with tp and fp + # num_det*num_res + tp = np.cumsum(tp, axis=0) + fp = np.cumsum(fp, axis=0) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts, eps) + precisions = tp / np.maximum((tp + fp), eps) + + # calculate AP + # if dataset != 'voc07' else '11points' + mode = 'area' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + print('cls:{} done in {:2f}s!!'.format(clsname,float(timer.since_last_check()))) + pool.close() + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if len(aps) else 0.0 + + print_map_summary( + mean_ap, eval_results, class_name=cls_names, logger=logger) + + return mean_ap, eval_results + + + +def print_map_summary(mean_ap, + results, + class_name=None, + scale_ranges=None, + logger=None): + """Print mAP and results of each class. + + A table will be printed to show the gts/dets/recall/AP of each class and + the mAP. + + Args: + mean_ap (float): Calculated from `eval_map()`. + results (list[dict]): Calculated from `eval_map()`. + dataset (list[str] | str | None): Dataset name or dataset classes. + scale_ranges (list[tuple] | None): Range of scales to be evaluated. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmcv.utils.print_log()` for details. Default: None. 
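The bookkeeping inside `eval_map` reduces to the usual cumulative tp/fp sweep; a minimal sketch under made-up match flags (detections assumed already sorted by score, `average_precision` as defined above):

```python
import numpy as np

tp = np.array([1, 0, 1, 0], dtype=np.float32)   # per-detection match flags
fp = 1.0 - tp
num_gts = 3

tp_cum, fp_cum = np.cumsum(tp), np.cumsum(fp)
eps = np.finfo(np.float32).eps
recalls = tp_cum / np.maximum(num_gts, eps)
precisions = tp_cum / np.maximum(tp_cum + fp_cum, eps)
print(recalls, precisions)
print(average_precision(recalls, precisions, 'area'))
```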
+ """ + + if logger == 'silent': + return + + if isinstance(results[0]['ap'], np.ndarray): + num_scales = len(results[0]['ap']) + else: + num_scales = 1 + + if scale_ranges is not None: + assert len(scale_ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + label_names = class_name + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + + header = ['class', 'gts', 'dets', 'recall', 'ap'] + for i in range(num_scales): + if scale_ranges is not None: + print_log(f'Scale range {scale_ranges[i]}', logger=logger) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py new file mode 100644 index 0000000000000000000000000000000000000000..14ab338023158e35a71592c1d82937317cc3f7fd --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py @@ -0,0 +1,363 @@ +import mmcv +import numpy as np + +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps +from .tpfp_chamfer import vec_iou, convex_iou, rbbox_iou, polyline_score, custom_polyline_score +from shapely.geometry import LineString, Polygon +# from vecmapnet_ops.ops.iou import convex_iou + +def tpfp_bbox(det_bboxes, + gt_bboxes, + gt_bbox_masks, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_dets = len(det_bboxes) + num_gts = len(gt_bboxes) + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] 
= 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2) + bbox_g = gt_bboxes.reshape(num_gts,-1,2) + bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2) + matrix = convex_iou(bbox_p,bbox_g,bbox_gm) + + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_rbbox(det_bboxes, + gt_bboxes, + gt_bbox_masks, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_dets = len(det_bboxes) + num_gts = len(gt_bboxes) + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2) + bbox_g = gt_bboxes.reshape(num_gts,-1,2) + bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2) + matrix = rbbox_iou(bbox_p,bbox_g,bbox_gm) + + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_det(det_bboxes, + gt_bboxes, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. 
The shape of + each array is (num_scales, m). + """ + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + matrix = vec_iou( + det_bboxes[:, :-1].reshape(num_dets,-1,2), + gt_bboxes.reshape(num_gts,-1,2)) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_gen(gen_lines, + gt_lines, + threshold=0.5, + metric='POR'): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_gens = gen_lines.shape[0] + num_gts = gt_lines.shape[0] + + # tp and fp + tp = np.zeros((num_gens), dtype=np.float32) + fp = np.zeros((num_gens), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_gens == 0: + return tp, fp + + gen_scores = gen_lines[:,-1] # n + # distance matrix: n x m + + # matrix = custom_polyline_score( + # gen_lines[:,:-1].reshape(num_gens,-1,2), + # gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + + # TODO MAY bug here + matrix = polyline_score( + gen_lines[:,:-1].reshape(num_gens,-1,2), + gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-gen_scores) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def custom_tpfp_gen(gen_lines, + gt_lines, + threshold=0.5, + metric='chamfer'): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). 
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + if metric == 'chamfer': + if threshold >0: + threshold= -threshold + # else: + # raise NotImplementedError + + # import pdb;pdb.set_trace() + num_gens = gen_lines.shape[0] + num_gts = gt_lines.shape[0] + + # tp and fp + tp = np.zeros((num_gens), dtype=np.float32) + fp = np.zeros((num_gens), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_gens == 0: + return tp, fp + + gen_scores = gen_lines[:,-1] # n + # distance matrix: n x m + + matrix = custom_polyline_score( + gen_lines[:,:-1].reshape(num_gens,-1,2), + gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-gen_scores) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py new file mode 100644 index 0000000000000000000000000000000000000000..db55fdd905de53a9033025ae0417f135858f2af8 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py @@ -0,0 +1,335 @@ +# from ..chamfer_dist import ChamferDistance +import numpy as np +from shapely.geometry import LineString, Polygon +from shapely.strtree import STRtree +from shapely.geometry import CAP_STYLE, JOIN_STYLE +from scipy.spatial import distance +import similaritymeasures + +# def chamfer_distance(pred_bbox, gt_bbox): + +# cd_dist_func = ChamferDistance.vec_cd_dist( +# pred, pred_mask, tgt, tgt_mask)() + + +def vec_iou(pred_lines, gt_lines): + ''' + each line with 1 meter width + pred_lines: num_preds, npts, 2 + gt_lines: num_gts, npts, 2 + ''' + + num_preds = pred_lines.shape[0] + num_gts = gt_lines.shape[0] + + pred_lines_shapely = \ + [LineString(i).buffer(1., + cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(1., + cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + for i in gt_lines] + + # construct tree + tree = STRtree(gt_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(gt_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(pred_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + gt_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[i, gt_id] = inter / union + + return iou_matrix + +def convex_iou(pred_lines, gt_lines, gt_mask): + ''' + each line 
with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + + num_preds = len(pred_lines) + num_gts = len(gt_lines) + + pred_lines_shapely = \ + [Polygon(i).convex_hull for i in pred_lines] + gt_lines_shapely =\ + [Polygon(i[m].reshape(-1,2)).convex_hull for i,m in zip(gt_lines,gt_mask)] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + +def rbbox_iou(pred_lines, gt_lines, gt_mask): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + + num_preds = len(pred_lines) + num_gts = len(gt_lines) + + pred_lines_shapely = \ + [Polygon(i).minimum_rotated_rectangle for i in pred_lines] + gt_lines_shapely =\ + [Polygon(i[m].reshape(-1,2)) for i,m in zip(gt_lines,gt_mask)] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + + +def polyline_score(pred_lines, gt_lines, linewidth=1., metric='POR'): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + positive_threshold = 1. + num_preds = len(pred_lines) + num_gts = len(gt_lines) + line_length = pred_lines.shape[1] + + # gt_lines = gt_lines + np.array((1.,1.)) + + pred_lines_shapely = \ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in gt_lines] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + if metric=='POR': + iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) + elif metric=='frechet': + iou_matrix = np.full((num_preds, num_gts), -100.) + elif metric=='chamfer': + iou_matrix = np.full((num_preds, num_gts), -100.) + elif metric=='chamfer_v2': + iou_matrix = np.full((num_preds, num_gts), -100.) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + if metric=='POR': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = (dist_mat.min(-1) < positive_threshold).sum() + valid_ba = (dist_mat.min(-2) < positive_threshold).sum() + + iou_matrix[pred_id, i] = min(valid_ba,valid_ab) / line_length + # iou_matrix[pred_id, i] = ((valid_ba+valid_ab)/2) / line_length + # assert iou_matrix[pred_id, i] <= 1. and iou_matrix[pred_id, i] >= 0. 
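+ # The 'frechet' and 'chamfer' branches below store *negated* distances, so a larger
+ # value still means a better match, and pred/gt pairs whose buffers never intersect
+ # keep the -100. sentinel that the score matrix was initialised with.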
+ elif metric=='frechet': + fdistance_1 = \ + -similaritymeasures.frechet_dist(pred_lines[pred_id], gt_lines[i]) + fdistance_2 = \ + -similaritymeasures.frechet_dist(pred_lines[pred_id][::-1], gt_lines[i]) + fdistance = max(fdistance_1,fdistance_2) + iou_matrix[pred_id, i] = fdistance + + elif metric=='chamfer': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = dist_mat.min(-1).sum() + valid_ba = dist_mat.min(-2).sum() + + iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/(2*line_length) + # if iou_matrix[pred_id, i] == 0: + # import ipdb; ipdb.set_trace() + elif metric=='chamfer_v2': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = dist_mat.min(-1).sum() + valid_ba = dist_mat.min(-2).sum() + + iou_matrix[pred_id, i] = -(valid_ba/pred_lines[pred_id].shape[0] + +valid_ab/gt_lines[i].shape[0])/2 + # if iou_matrix[pred_id, i] == 0: + # import ipdb; ipdb.set_trace() + + + # if True: + # import matplotlib.pyplot as plt + # print('pred num', num_preds) + # print('gt num', num_gts) + # for i in range(num_preds): + # plt.plot(pred_lines[i][:,0],pred_lines[i][:,1],'-',color='red',alpha=0.5) + # for i in range(num_gts): + # plt.plot(gt_lines[i][:,0],gt_lines[i][:,1],'-',color='blue',alpha=0.5) + # plt.savefig('test.png') + # plt.close() + return iou_matrix + + +def custom_polyline_score(pred_lines, gt_lines, linewidth=1., metric='chamfer'): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + if metric == 'iou': + linewidth = 1.0 + positive_threshold = 1. + num_preds = len(pred_lines) + num_gts = len(gt_lines) + line_length = pred_lines.shape[1] + + # gt_lines = gt_lines + np.array((1.,1.)) + + pred_lines_shapely = \ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in gt_lines] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + + if metric=='chamfer': + iou_matrix = np.full((num_preds, num_gts), -100.) 
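+ # -100. acts as a sentinel: pred/gt pairs whose buffered polylines never intersect keep
+ # this value, so they can never pass the (negated) chamfer threshold in custom_tpfp_gen.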
+ elif metric=='iou': + iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) + else: + raise NotImplementedError + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + if metric=='chamfer': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + # import pdb;pdb.set_trace() + valid_ab = dist_mat.min(-1).mean() + valid_ba = dist_mat.min(-2).mean() + + iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/2 + elif metric=='iou': + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + +if __name__ == '__main__': + import torch + + line1 = torch.tensor([ + [1, 5], [3, 5], [5, 5] + ]) + + line0 = torch.tensor([ + [3, 6], [4, 8], [5, 6] + ]) + + line2 = torch.tensor([ + [1, 4], [3, 4], [5, 4] + ]) + + line3 = torch.tensor([ + [4, 4], [3, 3], [5, 3] + ]) + + gt = torch.stack((line2, line3), dim=0).type(torch.float32) + pred = torch.stack((line0, line1), dim=0).type(torch.float32) + + # import ipdb; ipdb.set_trace() + import mmcv + # with mmcv.Timer(): + # gt = upsampler(gt, pts=10) + # pred = upsampler(pred, pts=10) + + import matplotlib.pyplot as plt + from shapely.geometry import LineString + from descartes import PolygonPatch + + iou_matrix = vec_iou(pred,gt) + print(iou_matrix) + # import pdb;pdb.set_trace() + score_matrix = custom_polyline_score(pred, gt, linewidth=1., metric='chamfer') + print(score_matrix) + fig, ax = plt.subplots() + for i in gt: + i = i.numpy() + plt.plot(i[:, 0], i[:, 1], 'o', color='red') + plt.plot(i[:, 0], i[:, 1], '-', color='red') + + dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + patch1 = PolygonPatch(dilated, fc='red', ec='red', alpha=0.5, zorder=-1) + ax.add_patch(patch1) + + for i in pred: + i = i.numpy() + plt.plot(i[:, 0], i[:, 1], 'o', color='blue') + plt.plot(i[:, 0], i[:, 1], '-', color='blue') + + dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + patch1 = PolygonPatch(dilated, fc='blue', ec='blue', alpha=0.5, zorder=-1) + ax.add_patch(patch1) + + + ax.axis('equal') + + + plt.savefig('test3.png') \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_eval.py b/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..6aa85e73cc18fd84765ee8ce4ead3cca06ed7128 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_eval.py @@ -0,0 +1,783 @@ +import argparse +import copy +import json +import os +import time +from typing import Tuple, Dict, Any +import torch +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionConfig +from nuscenes.eval.detection.evaluate import NuScenesEval +from pyquaternion import Quaternion + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.tracking.data_classes import TrackingBox +from nuscenes.utils.data_classes import Box +from nuscenes.utils.geometry_utils import points_in_box +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.eval.common.loaders import 
add_center_dist, filter_eval_boxes +import tqdm +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from torchvision.transforms.functional import rotate +import pycocotools.mask as mask_util +# from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from torchvision.transforms.functional import rotate +import cv2 +import argparse +import json +import os +import random +import time +from typing import Tuple, Dict, Any + +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.loaders import load_gt, add_center_dist, filter_eval_boxes +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.constants import TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ + DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D +from IPython import embed +import json +from typing import Any + +import numpy as np +from matplotlib import pyplot as plt + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.common.utils import boxes_to_sensor +from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList +from nuscenes.utils.data_classes import LidarPointCloud +from nuscenes.utils.geometry_utils import view_points + +import mmcv + + +Axis = Any + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. + :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are plotted. + rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. + if ax is None: + ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. 
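+ # tp is NaN for metrics that are undefined for a class (e.g. attribute and velocity
+ # errors for barriers and traffic cones), and no curve is drawn when the achieved
+ # recall never reaches min_recall.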
+ if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +class DetectionBox_modified(DetectionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index + + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. 
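+ # Only the box *center* (a single point) is projected here, so BoxVisibility.ALL and
+ # BoxVisibility.ANY are effectively equivalent; in_front additionally requires that
+ # point to be at least 0.1 m in front of the camera.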
+ + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + +def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False) \ + -> Tuple[EvalBoxes, Dict]: + """ + Loads object predictions from file. + :param result_path: Path to the .json result file provided by the user. + :param max_boxes_per_sample: Maximim number of boxes allowed per sample. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The deserialized results and meta data. + """ + + # Load from file and check that the format is correct. + # with open(result_path) as f: + # data = json.load(f) + data = mmcv.load(result_path) + assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \ + 'See https://www.nuscenes.org/object-detection for more information.' + + # Deserialize results and get meta data. + all_results = EvalBoxes.deserialize(data['results'], box_cls) + meta = data['meta'] + if verbose: + print("Loaded results from {}. Found detections for {} samples." + .format(result_path, len(all_results.sample_tokens))) + + # Check that each sample has no more than x predicted boxes. + for sample_token in all_results.sample_tokens: + assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \ + "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample + + return all_results, meta + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The GT boxes. + """ + + # Init. + if box_cls == DetectionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. 
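+ # create_splits_scenes() maps each split name (e.g. train, val, test, mini_train,
+ # mini_val) to the list of scene names it contains; samples are kept further below
+ # only if their scene belongs to the requested eval_split.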
+ splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. + tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionBox_modified: + # Get label name in detection task and filter unused labels. + detection_name = category_to_detection_name(sample_annotation['category_name']) + if detection_name is None: + continue + + # Get attribute_name. + attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. + attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token] + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' 
% box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
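+ # A box is kept only if its center projects into more than one of the six cameras,
+ # i.e. it lies in the overlap region between adjacent camera views.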
+ cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + new_box.translate(-np.array(cs_record['translation'])) + new_box.rotate(Quaternion(cs_record['rotation']).inverse) + + if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + count += 1 + # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + # count += 1 + + if count > 1: + with open('center_overlap.txt', 'a') as f: + try: + f.write(box.token + '\n') + except: + pass + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + verbose = True + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +class NuScenesEval_custom(NuScenesEval): + """ + Dummy class for backward-compatibility. Same as DetectionEval. + """ + + def __init__(self, + nusc: NuScenes, + config: DetectionConfig, + result_path: str, + eval_set: str, + output_dir: str = None, + verbose: bool = True, + overlap_test=False, + eval_mask=False, + data_infos=None + ): + """ + Initialize a DetectionEval object. + :param nusc: A NuScenes object. + :param config: A DetectionConfig object. + :param result_path: Path of the nuScenes JSON result file. + :param eval_set: The dataset split to evaluate on, e.g. train, val or test. + :param output_dir: Folder to save plots and results to. + :param verbose: Whether to print to stdout. + """ + + self.nusc = nusc + self.result_path = result_path + self.eval_set = eval_set + self.output_dir = output_dir + self.verbose = verbose + self.cfg = config + self.overlap_test = overlap_test + self.eval_mask = eval_mask + self.data_infos = data_infos + # Check result file exists. + assert os.path.exists(result_path), 'Error: The result file does not exist!' + + # Make dirs. + self.plot_dir = os.path.join(self.output_dir, 'plots') + if not os.path.isdir(self.output_dir): + os.makedirs(self.output_dir) + if not os.path.isdir(self.plot_dir): + os.makedirs(self.plot_dir) + + # Load data. 
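+ # Predictions are loaded as plain DetectionBox; ground truth uses DetectionBox_modified,
+ # which additionally carries the annotation token, visibility level and frame index.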
+ if verbose: + print('Initializing nuScenes detection evaluation') + self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, + verbose=verbose) + self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose) + + assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ + "Samples in split doesn't match samples in predictions." + + # Add center distances. + self.pred_boxes = add_center_dist(nusc, self.pred_boxes) + self.gt_boxes = add_center_dist(nusc, self.gt_boxes) + + # Filter boxes (distance, points per box, etc.). + + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [key for (key, value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + + def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. 
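+ # (an AP per class and distance threshold, plus TP metrics at the dist_th_tp threshold)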
+ # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMetrics(self.cfg) + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. + for metric_name in TP_METRICS: + metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath(detection_name + '_pr')) + + class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, + savepath=savepath(detection_name + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath('dist_pr_' + str(dist_th))) + + +if __name__ == "__main__": + + # Settings. + parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' 
+ 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + output_dir=output_dir_, verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) + #for index in range(1, 41): + # nusc_eval.update_gt(type_='ord', index=index) + # diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_vad_dataset.py b/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_vad_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9f73a76f2effd98be3590033bcf16b70124a40bf --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_vad_dataset.py @@ -0,0 +1,1934 @@ +import os +import json +import copy +import tempfile +from typing import Dict, List + +import numpy as np +from mmdet.datasets import DATASETS +from mmdet3d.datasets import NuScenesDataset +import pyquaternion +import mmcv +from os import path as osp +from mmdet.datasets import DATASETS +import torch +import numpy as np +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from .vad_custom_nuscenes_eval import NuScenesEval_custom +from nuscenes.eval.common.utils import center_distance +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.parallel import DataContainer as DC +import random +from mmdet3d.core import LiDARInstance3DBoxes +from nuscenes.utils.data_classes import Box as NuScenesBox +from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox +from shapely import affinity, ops +from shapely.geometry import LineString, box, MultiPolygon, MultiLineString +from mmdet.datasets.pipelines import to_tensor +from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer +from nuscenes.eval.detection.constants import DETECTION_NAMES + + +class LiDARInstanceLines(object): + """Line instance in LIDAR coordinates + + """ + def __init__(self, + instance_line_list, + sample_dist=1, + num_samples=250, + padding=False, + fixed_num=-1, + padding_value=-10000, + patch_size=None): + assert isinstance(instance_line_list, list) + assert patch_size is not None + if len(instance_line_list) != 0: + assert isinstance(instance_line_list[0], LineString) + self.patch_size = patch_size + self.max_x = self.patch_size[1] / 2 + self.max_y = self.patch_size[0] / 2 + self.sample_dist = sample_dist + self.num_samples = num_samples + self.padding = 
padding + self.fixed_num = fixed_num + self.padding_value = padding_value + + self.instance_list = instance_line_list + + @property + def start_end_points(self): + """ + return torch.Tensor([N,4]), in xstart, ystart, xend, yend form + """ + assert len(self.instance_list) != 0 + instance_se_points_list = [] + for instance in self.instance_list: + se_points = [] + se_points.extend(instance.coords[0]) + se_points.extend(instance.coords[-1]) + instance_se_points_list.append(se_points) + instance_se_points_array = np.array(instance_se_points_list) + instance_se_points_tensor = to_tensor(instance_se_points_array) + instance_se_points_tensor = instance_se_points_tensor.to( + dtype=torch.float32) + instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x) + instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y) + instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x) + instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y) + return instance_se_points_tensor + + @property + def bbox(self): + """ + return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form + """ + assert len(self.instance_list) != 0 + instance_bbox_list = [] + for instance in self.instance_list: + # bounds is bbox: [xmin, ymin, xmax, ymax] + instance_bbox_list.append(instance.bounds) + instance_bbox_array = np.array(instance_bbox_list) + instance_bbox_tensor = to_tensor(instance_bbox_array) + instance_bbox_tensor = instance_bbox_tensor.to( + dtype=torch.float32) + instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x) + instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y) + instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x) + instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y) + return instance_bbox_tensor + + @property + def fixed_num_sampled_points(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + instance_points_list.append(sampled_points) + instance_points_array = np.array(instance_points_list) + instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def fixed_num_sampled_points_ambiguity(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + instance_points_list.append(sampled_points) + 
instance_points_array = np.array(instance_points_list) + instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + instance_points_tensor = instance_points_tensor.unsqueeze(1) + return instance_points_tensor + + @property + def fixed_num_sampled_points_torch(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + # distances = np.linspace(0, instance.length, self.fixed_num) + # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + poly_pts = to_tensor(np.array(list(instance.coords))) + poly_pts = poly_pts.unsqueeze(0).permute(0,2,1) + sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True) + sampled_pts = sampled_pts.permute(0,2,1).squeeze(0) + instance_points_list.append(sampled_pts) + # instance_points_array = np.array(instance_points_list) + # instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = torch.stack(instance_points_list,dim=0) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def shift_fixed_num_sampled_points(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v1(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = 
fixed_num_pts[0].equal(fixed_num_pts[-1]) + pts_num = fixed_num_pts.shape[0] + shift_num = pts_num - 1 + if is_poly: + pts_to_shift = fixed_num_pts[:-1,:] + shift_pts_list = [] + if is_poly: + for shift_right_i in range(shift_num): + shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + if is_poly: + _, _, num_coords = shift_pts.shape + tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords)) + tmp_shift_pts[:,:-1,:] = shift_pts + tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] + shift_pts = tmp_shift_pts + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([shift_num-shift_pts.shape[0],pts_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v2(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + + if shifts_num > final_shift_num: + index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False) + multi_shifts_pts = multi_shifts_pts[index] + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < final_shift_num: + padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = 
torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v3(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + flip_pts_to_shift = np.flip(pts_to_shift, axis=0) + for shift_right_i in range(shift_num): + shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + # import pdb;pdb.set_trace() + if shifts_num > 2*final_shift_num: + index = np.random.choice(shift_num, final_shift_num, replace=False) + flip0_shifts_pts = multi_shifts_pts[index] + flip1_shifts_pts = multi_shifts_pts[index+shift_num] + multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0) + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num: + padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v4(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = 
self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + pts_num = fixed_num_pts.shape[0] + shift_num = pts_num - 1 + shift_pts_list = [] + if is_poly: + pts_to_shift = fixed_num_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) + flip_pts_to_shift = pts_to_shift.flip(0) + for shift_right_i in range(shift_num): + shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + if is_poly: + _, _, num_coords = shift_pts.shape + tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords)) + tmp_shift_pts[:,:-1,:] = shift_pts + tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] + shift_pts = tmp_shift_pts + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_torch(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points_torch + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + # @property + # def polyline_points(self): + # """ + # return [[x0,y0],[x1,y1],...] 
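Review note (illustrative, not part of the patch): the shift_fixed_num_sampled_points_* properties above all exploit the same fact — a closed polyline (first point equals last point) describes the same curve for every choice of start vertex, while an open polyline only has its two traversal directions. A minimal standalone sketch of that enumeration, using hypothetical names and plain NumPy:

import numpy as np

def enumerate_equivalent_orderings(pts):
    # pts: (N, 2); a closed polyline repeats its first point at the end
    if np.allclose(pts[0], pts[-1]):
        body = pts[:-1]                                              # drop duplicated endpoint
        rolled = [np.roll(body, k, axis=0) for k in range(len(body))]
        return [np.concatenate([r, r[:1]], axis=0) for r in rolled]  # re-close each version
    return [pts, pts[::-1]]                                          # open line: forward / reverse

square = np.array([[0., 0.], [1., 0.], [1., 1.], [0., 1.], [0., 0.]])
print(len(enumerate_equivalent_orderings(square)))                   # 4 equivalent start points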
+ # """ + # assert len(self.instance_list) != 0 + # for instance in self.instance_list: + + +class VectorizedLocalMap(object): + CLASS2LABEL = { + 'road_divider': 0, + 'lane_divider': 0, + 'ped_crossing': 1, + 'contours': 2, + 'others': -1 + } + def __init__(self, + dataroot, + patch_size, + map_classes=['divider','ped_crossing','boundary'], + line_classes=['road_divider', 'lane_divider'], + ped_crossing_classes=['ped_crossing'], + contour_classes=['road_segment', 'lane'], + sample_dist=1, + num_samples=250, + padding=False, + fixed_ptsnum_per_line=-1, + padding_value=-10000,): + ''' + Args: + fixed_ptsnum_per_line = -1 : no fixed num + ''' + super().__init__() + self.data_root = dataroot + self.MAPS = ['boston-seaport', 'singapore-hollandvillage', + 'singapore-onenorth', 'singapore-queenstown'] + self.vec_classes = map_classes + self.line_classes = line_classes + self.ped_crossing_classes = ped_crossing_classes + self.polygon_classes = contour_classes + self.nusc_maps = {} + self.map_explorer = {} + for loc in self.MAPS: + self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root, map_name=loc) + self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc]) + + self.patch_size = patch_size + self.sample_dist = sample_dist + self.num_samples = num_samples + self.padding = padding + self.fixed_num = fixed_ptsnum_per_line + self.padding_value = padding_value + + def gen_vectorized_samples(self, location, lidar2global_translation, lidar2global_rotation): + ''' + use lidar2global to get gt map layers + ''' + + map_pose = lidar2global_translation[:2] + rotation = Quaternion(lidar2global_rotation) + + patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1]) + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + # import pdb;pdb.set_trace() + vectors = [] + for vec_class in self.vec_classes: + if vec_class == 'divider': + line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes, location) + line_instances_dict = self.line_geoms_to_instances(line_geom) + for line_type, instances in line_instances_dict.items(): + for instance in instances: + vectors.append((instance, self.CLASS2LABEL.get(line_type, -1))) + elif vec_class == 'ped_crossing': + ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes, location) + # ped_vector_list = self.ped_geoms_to_vectors(ped_geom) + ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom) + # import pdb;pdb.set_trace() + for instance in ped_instance_list: + vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1))) + elif vec_class == 'boundary': + polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes, location) + # import pdb;pdb.set_trace() + poly_bound_list = self.poly_geoms_to_instances(polygon_geom) + # import pdb;pdb.set_trace() + for contour in poly_bound_list: + vectors.append((contour, self.CLASS2LABEL.get('contours', -1))) + else: + raise ValueError(f'WRONG vec_class: {vec_class}') + + # filter out -1 + filtered_vectors = [] + gt_pts_loc_3d = [] + gt_pts_num_3d = [] + gt_labels = [] + gt_instance = [] + for instance, type in vectors: + if type != -1: + gt_instance.append(instance) + gt_labels.append(type) + + gt_instance = LiDARInstanceLines(gt_instance,self.sample_dist, + self.num_samples, self.padding, self.fixed_num,self.padding_value, patch_size=self.patch_size) + + anns_results = dict( + gt_vecs_pts_loc=gt_instance, + gt_vecs_label=gt_labels, + + ) + # import pdb;pdb.set_trace() + return anns_results + + def get_map_geom(self, patch_box, 
patch_angle, layer_names, location): + map_geom = [] + for layer_name in layer_names: + if layer_name in self.line_classes: + # import pdb;pdb.set_trace() + geoms = self.get_divider_line(patch_box, patch_angle, layer_name, location) + # import pdb;pdb.set_trace() + # geoms = self.map_explorer[location]._get_layer_line(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + elif layer_name in self.polygon_classes: + geoms = self.get_contour_line(patch_box, patch_angle, layer_name, location) + # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + elif layer_name in self.ped_crossing_classes: + geoms = self.get_ped_crossing_line(patch_box, patch_angle, location) + # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + return map_geom + + def _one_type_line_geom_to_vectors(self, line_geom): + line_vectors = [] + + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_vectors.append(self.sample_pts_from_line(single_line)) + elif line.geom_type == 'LineString': + line_vectors.append(self.sample_pts_from_line(line)) + else: + raise NotImplementedError + return line_vectors + + def _one_type_line_geom_to_instances(self, line_geom): + line_instances = [] + + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_instances.append(single_line) + elif line.geom_type == 'LineString': + line_instances.append(line) + else: + raise NotImplementedError + return line_instances + + def poly_geoms_to_vectors(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def ped_poly_geoms_to_instances(self, ped_geom): + # import pdb;pdb.set_trace() + ped = ped_geom[0][1] + union_segments = ops.unary_union(ped) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + # local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if 
ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + + def poly_geoms_to_instances(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + def line_geoms_to_vectors(self, line_geom): + line_vectors_dict = dict() + for line_type, a_type_of_lines in line_geom: + one_type_vectors = self._one_type_line_geom_to_vectors(a_type_of_lines) + line_vectors_dict[line_type] = one_type_vectors + + return line_vectors_dict + def line_geoms_to_instances(self, line_geom): + line_instances_dict = dict() + for line_type, a_type_of_lines in line_geom: + one_type_instances = self._one_type_line_geom_to_instances(a_type_of_lines) + line_instances_dict[line_type] = one_type_instances + + return line_instances_dict + + def ped_geoms_to_vectors(self, ped_geom): + ped_geom = ped_geom[0][1] + union_ped = ops.unary_union(ped_geom) + if union_ped.geom_type != 'MultiPolygon': + union_ped = MultiPolygon([union_ped]) + + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + results = [] + for ped_poly in union_ped: + # rect = ped_poly.minimum_rotated_rectangle + ext = ped_poly.exterior + if not ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def get_contour_line(self,patch_box,patch_angle,layer_name,location): + if layer_name not in self.map_explorer[location].map_api.non_geometric_polygon_layers: + raise ValueError('{} is not a polygonal layer'.format(layer_name)) + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + + records = getattr(self.map_explorer[location].map_api, layer_name) + + polygon_list = [] + if layer_name == 'drivable_area': + for record in records: + polygons = [self.map_explorer[location].map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] + + for polygon in polygons: + new_polygon = 
polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + else: + for record in records: + polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token']) + + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + return polygon_list + + def get_divider_line(self,patch_box,patch_angle,layer_name,location): + if layer_name not in self.map_explorer[location].map_api.non_geometric_line_layers: + raise ValueError("{} is not a line layer".format(layer_name)) + + if layer_name == 'traffic_light': + return None + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + + line_list = [] + records = getattr(self.map_explorer[location].map_api, layer_name) + for record in records: + line = self.map_explorer[location].map_api.extract_line(record['line_token']) + if line.is_empty: # Skip lines without nodes. + continue + + new_line = line.intersection(patch) + if not new_line.is_empty: + new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False) + new_line = affinity.affine_transform(new_line, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + line_list.append(new_line) + + return line_list + + def get_ped_crossing_line(self, patch_box, patch_angle, location): + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + polygon_list = [] + records = getattr(self.map_explorer[location].map_api, 'ped_crossing') + # records = getattr(self.nusc_maps[location], 'ped_crossing') + for record in records: + polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token']) + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + return polygon_list + + def sample_pts_from_line(self, line): + if self.fixed_num < 0: + distances = np.arange(0, line.length, self.sample_dist) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + else: + # fixed number of points, so distance is line.length / self.fixed_num + distances = np.linspace(0, line.length, self.fixed_num) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + + # tmpdistances = np.linspace(0, line.length, 2) + # tmpsampled_points = np.array([list(line.interpolate(tmpdistance).coords) for tmpdistance in tmpdistances]).reshape(-1, 2) + # import pdb;pdb.set_trace() + # if 
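Review note (illustrative, not part of the patch): the fixed-number branch of sample_pts_from_line above reduces to interpolating a shapely LineString at evenly spaced arc-length positions. A self-contained example with arbitrary coordinates:

import numpy as np
from shapely.geometry import LineString

line = LineString([(0.0, 0.0), (4.0, 0.0), (4.0, 3.0)])    # total length 7.0
fixed_num = 5
distances = np.linspace(0, line.length, fixed_num)
sampled = np.array([list(line.interpolate(d).coords)
                    for d in distances]).reshape(-1, 2)
print(sampled.shape)                                        # (5, 2), endpoints included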
self.normalize: + # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) + + num_valid = len(sampled_points) + + if not self.padding or self.fixed_num > 0: + # fixed num sample can return now! + return sampled_points, num_valid + + # fixed distance sampling need padding! + num_valid = len(sampled_points) + + if self.fixed_num < 0: + if num_valid < self.num_samples: + padding = np.zeros((self.num_samples - len(sampled_points), 2)) + sampled_points = np.concatenate([sampled_points, padding], axis=0) + else: + sampled_points = sampled_points[:self.num_samples, :] + num_valid = self.num_samples + + # if self.normalize: + # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) + # num_valid = len(sampled_points) + + return sampled_points, num_valid + + +############################################################################################################### +############################################################################################################### +############################################################################################################### + +class v1CustomDetectionConfig: + """ Data class that specifies the detection evaluation settings. """ + + def __init__(self, + class_range_x: Dict[str, int], + class_range_y: Dict[str, int], + dist_fcn: str, + dist_ths: List[float], + dist_th_tp: float, + min_recall: float, + min_precision: float, + max_boxes_per_sample: int, + mean_ap_weight: int): + + assert set(class_range_x.keys()) == set(DETECTION_NAMES), "Class count mismatch." + assert dist_th_tp in dist_ths, "dist_th_tp must be in set of dist_ths." + + self.class_range_x = class_range_x + self.class_range_y = class_range_y + self.dist_fcn = dist_fcn + self.dist_ths = dist_ths + self.dist_th_tp = dist_th_tp + self.min_recall = min_recall + self.min_precision = min_precision + self.max_boxes_per_sample = max_boxes_per_sample + self.mean_ap_weight = mean_ap_weight + + self.class_names = self.class_range_y.keys() + + def __eq__(self, other): + eq = True + for key in self.serialize().keys(): + eq = eq and np.array_equal(getattr(self, key), getattr(other, key)) + return eq + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'class_range_x': self.class_range_x, + 'class_range_y': self.class_range_y, + 'dist_fcn': self.dist_fcn, + 'dist_ths': self.dist_ths, + 'dist_th_tp': self.dist_th_tp, + 'min_recall': self.min_recall, + 'min_precision': self.min_precision, + 'max_boxes_per_sample': self.max_boxes_per_sample, + 'mean_ap_weight': self.mean_ap_weight + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized dictionary. """ + return cls(content['class_range_x'], + content['class_range_y'], + content['dist_fcn'], + content['dist_ths'], + content['dist_th_tp'], + content['min_recall'], + content['min_precision'], + content['max_boxes_per_sample'], + content['mean_ap_weight']) + + @property + def dist_fcn_callable(self): + """ Return the distance function corresponding to the dist_fcn string. """ + if self.dist_fcn == 'center_distance': + return center_distance + else: + raise Exception('Error: Unknown distance function %s!' % self.dist_fcn) + +@DATASETS.register_module() +class VADCustomNuScenesDataset(NuScenesDataset): + r"""Custom NuScenes Dataset. 
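Side note for reviewers (illustrative, not part of the patch): v1CustomDetectionConfig.deserialize above expects a plain dict — in this dataset it comes from the '<custom_eval_version>.json' file loaded in __init__ below — whose class_range_x/class_range_y keys must cover every nuScenes detection name and whose dist_th_tp must appear in dist_ths. A hedged sketch of such a dict, with placeholder values only:

# Placeholder ranges for illustration; a real config tunes these per class.
names = ['car', 'truck', 'bus', 'trailer', 'construction_vehicle',
         'pedestrian', 'motorcycle', 'bicycle', 'traffic_cone', 'barrier']
content = {
    'class_range_x': {n: 30 for n in names},
    'class_range_y': {n: 30 for n in names},
    'dist_fcn': 'center_distance',
    'dist_ths': [0.5, 1.0, 2.0, 4.0],
    'dist_th_tp': 2.0,
    'min_recall': 0.1,
    'min_precision': 0.1,
    'max_boxes_per_sample': 500,
    'mean_ap_weight': 5,
}
# cfg = v1CustomDetectionConfig.deserialize(content)   # passes the key / threshold asserts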
+ """ + MAPCLASSES = ('divider',) + def __init__( + self, + queue_length=4, + bev_size=(200, 200), + overlap_test=False, + with_attr=True, + fut_ts=6, + pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + map_classes=None, + map_ann_file=None, + map_fixed_ptsnum_per_line=-1, + map_eval_use_same_gt_sample_num_flag=False, + padding_value=-10000, + use_pkl_result=False, + custom_eval_version='vad_nusc_detection_cvpr_2019', + *args, + **kwargs + ): + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.overlap_test = overlap_test + self.bev_size = bev_size + self.with_attr = with_attr + self.fut_ts = fut_ts + self.use_pkl_result = use_pkl_result + + self.custom_eval_version = custom_eval_version + # Check if config exists. + this_dir = os.path.dirname(os.path.abspath(__file__)) + cfg_path = os.path.join(this_dir, '%s.json' % self.custom_eval_version) + assert os.path.exists(cfg_path), \ + 'Requested unknown configuration {}'.format(self.custom_eval_version) + # Load config file and deserialize it. + with open(cfg_path, 'r') as f: + data = json.load(f) + self.custom_eval_detection_configs = v1CustomDetectionConfig.deserialize(data) + + self.map_ann_file = map_ann_file + self.MAPCLASSES = self.get_map_classes(map_classes) + self.NUM_MAPCLASSES = len(self.MAPCLASSES) + self.pc_range = pc_range + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + self.patch_size = (patch_h, patch_w) + self.padding_value = padding_value + self.fixed_num = map_fixed_ptsnum_per_line + self.eval_use_same_gt_sample_num_flag = map_eval_use_same_gt_sample_num_flag + self.vector_map = VectorizedLocalMap(kwargs['data_root'], + patch_size=self.patch_size, map_classes=self.MAPCLASSES, + fixed_ptsnum_per_line=map_fixed_ptsnum_per_line, + padding_value=self.padding_value) + self.is_vis_on_test = True + + @classmethod + def get_map_classes(cls, map_classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Return: + list[str]: A list of class names. 
+ """ + if map_classes is None: + return cls.MAPCLASSES + + if isinstance(map_classes, str): + # take it as a file path + class_names = mmcv.list_from_file(map_classes) + elif isinstance(map_classes, (tuple, list)): + class_names = map_classes + else: + raise ValueError(f'Unsupported type {type(map_classes)} of map classes.') + + return class_names + + def vectormap_pipeline(self, example, input_dict): + ''' + `example` type: + keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img'; + all keys type is 'DataContainer'; + 'img_metas' cpu_only=True, type is dict, others are false; + 'gt_labels_3d' shape torch.size([num_samples]), stack=False, + padding_value=0, cpu_only=False + 'gt_bboxes_3d': stack=False, cpu_only=True + ''' + # import pdb;pdb.set_trace() + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix + lidar2ego[:3, 3] = input_dict['lidar2ego_translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix + ego2global[:3, 3] = input_dict['ego2global_translation'] + + lidar2global = ego2global @ lidar2ego + + lidar2global_translation = list(lidar2global[:3,3]) + lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) + + location = input_dict['map_location'] + ego2global_translation = input_dict['ego2global_translation'] + ego2global_rotation = input_dict['ego2global_rotation'] + anns_results = self.vector_map.gen_vectorized_samples( + location, lidar2global_translation, lidar2global_rotation + ) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except: + # empty tensor, will be passed in train, + # but we preserve it for test + gt_vecs_pts_loc = gt_vecs_pts_loc + + example['map_gt_labels_3d'] = DC(gt_vecs_label, cpu_only=False) + example['map_gt_bboxes_3d'] = DC(gt_vecs_pts_loc, cpu_only=True) + + return example + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. 
+ """ + data_queue = [] + + # temporal aug + prev_indexs_list = list(range(index-self.queue_length, index)) + random.shuffle(prev_indexs_list) + prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True) + ## + + input_dict = self.get_data_info(index) + if input_dict is None: + return None + frame_idx = input_dict['frame_idx'] + scene_token = input_dict['scene_token'] + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + example = self.vectormap_pipeline(example,input_dict) + if self.filter_empty_gt and \ + ((example is None or ~(example['gt_labels_3d']._data != -1).any()) or \ + (example is None or ~(example['map_gt_labels_3d']._data != -1).any())): + return None + data_queue.insert(0, example) + for i in prev_indexs_list: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + if input_dict['frame_idx'] < frame_idx and input_dict['scene_token'] == scene_token: + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + example = self.vectormap_pipeline(example,input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()) and \ + (example is None or ~(example['map_gt_labels_3d']._data != -1).any()): + return None + frame_idx = input_dict['frame_idx'] + data_queue.insert(0, copy.deepcopy(example)) + return self.union2one(data_queue) + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.is_vis_on_test: + example = self.vectormap_pipeline(example, input_dict) + return example + + def union2one(self, queue): + """ + convert sample queue into one single sample. + """ + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if i == 0: + metas_map[i]['prev_bev'] = False + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + + queue[-1]['img'] = DC(torch.stack(imgs_list), + cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. 
+ """ + info = self.data_infos[index] + # filter out bbox containing no points + if self.use_valid_flag: + mask = info['valid_flag'] + else: + mask = info['num_lidar_pts'] > 0 + gt_bboxes_3d = info['gt_boxes'][mask] + gt_names_3d = info['gt_names'][mask] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if self.with_velocity: + gt_velocity = info['gt_velocity'][mask] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + if self.with_attr: + gt_fut_trajs = info['gt_agent_fut_trajs'][mask] + gt_fut_masks = info['gt_agent_fut_masks'][mask] + gt_fut_goal = info['gt_agent_fut_goal'][mask] + gt_lcf_feat = info['gt_agent_lcf_feat'][mask] + gt_fut_yaw = info['gt_agent_fut_yaw'][mask] + attr_labels = np.concatenate( + [gt_fut_trajs, gt_fut_masks, gt_fut_goal[..., None], gt_lcf_feat, gt_fut_yaw], axis=-1 + ).astype(np.float32) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d, + attr_labels=attr_labels) + + return anns_results + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + ego2global_translation=info['ego2global_translation'], + ego2global_rotation=info['ego2global_rotation'], + lidar2ego_translation=info['lidar2ego_translation'], + lidar2ego_rotation=info['lidar2ego_rotation'], + prev_idx=info['prev'], + next_idx=info['next'], + scene_token=info['scene_token'], + can_bus=info['can_bus'], + frame_idx=info['frame_idx'], + timestamp=info['timestamp'] / 1e6, + fut_valid_flag=info['fut_valid_flag'], + map_location=info['map_location'], + ego_his_trajs=info['gt_ego_his_trajs'], + ego_fut_trajs=info['gt_ego_fut_trajs'], + ego_fut_masks=info['gt_ego_fut_masks'], + ego_fut_cmd=info['gt_ego_fut_cmd'], + ego_lcf_feat=info['gt_ego_lcf_feat'] + ) + # lidar to ego transform + lidar2ego = np.eye(4).astype(np.float32) + lidar2ego[:3, :3] = Quaternion(info["lidar2ego_rotation"]).rotation_matrix + lidar2ego[:3, 3] = info["lidar2ego_translation"] + input_dict["lidar2ego"] = lidar2ego + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + input_dict["camera2ego"] = [] + input_dict["camera_intrinsics"] = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + cam_intrinsics.append(viewpad) + lidar2cam_rts.append(lidar2cam_rt.T) + + # camera to ego transform + camera2ego = np.eye(4).astype(np.float32) + camera2ego[:3, :3] = Quaternion( + cam_info["sensor2ego_rotation"] + ).rotation_matrix + camera2ego[:3, 3] = cam_info["sensor2ego_translation"] + input_dict["camera2ego"].append(camera2ego) + # camera intrinsics + camera_intrinsics = np.eye(4).astype(np.float32) + camera_intrinsics[:3, :3] = cam_info["cam_intrinsic"] + input_dict["camera_intrinsics"].append(camera_intrinsics) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + # NOTE: now we load gt in test_mode for evaluating + # if not self.test_mode: + # annos = self.get_ann_info(index) + # input_dict['ann_info'] = annos + + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + rotation = Quaternion(input_dict['ego2global_rotation']) + translation = input_dict['ego2global_translation'] + can_bus = input_dict['can_bus'] + can_bus[:3] = translation + can_bus[3:7] = rotation + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + if patch_angle < 0: + patch_angle += 360 + can_bus[-2] = patch_angle / 180 * np.pi + can_bus[-1] = patch_angle + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix + lidar2ego[:3, 3] = input_dict['lidar2ego_translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix + ego2global[:3, 3] = input_dict['ego2global_translation'] + lidar2global = ego2global @ lidar2ego + 
input_dict['lidar2global'] = lidar2global + + return input_dict + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _format_gt(self): + gt_annos = [] + print('Start to convert gt map format...') + # assert self.map_ann_file is not None + if (not os.path.exists(self.map_ann_file)) : + dataset_length = len(self) + prog_bar = mmcv.ProgressBar(dataset_length) + mapped_class_names = self.MAPCLASSES + for sample_id in range(dataset_length): + sample_token = self.data_infos[sample_id]['token'] + gt_anno = {} + gt_anno['sample_token'] = sample_token + # gt_sample_annos = [] + gt_sample_dict = {} + gt_sample_dict = self.vectormap_pipeline(gt_sample_dict, self.data_infos[sample_id]) + gt_labels = gt_sample_dict['map_gt_labels_3d'].data.numpy() + gt_vecs = gt_sample_dict['map_gt_bboxes_3d'].data.instance_list + gt_vec_list = [] + for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)): + name = mapped_class_names[gt_label] + anno = dict( + pts=np.array(list(gt_vec.coords)), + pts_num=len(list(gt_vec.coords)), + cls_name=name, + type=gt_label, + ) + gt_vec_list.append(anno) + gt_anno['vectors']=gt_vec_list + gt_annos.append(gt_anno) + + prog_bar.update() + nusc_submissions = { + 'GTs': gt_annos + } + print('\n GT anns writes to', self.map_ann_file) + mmcv.dump(nusc_submissions, self.map_ann_file) + else: + print(f'{self.map_ann_file} exist, not update') + + def _format_bbox(self, results, jsonfile_prefix=None, score_thresh=0.2): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. 
+ """ + nusc_annos = {} + det_mapped_class_names = self.CLASSES + + # assert self.map_ann_file is not None + map_pred_annos = {} + map_mapped_class_names = self.MAPCLASSES + + plan_annos = {} + + print('Start to convert detection format...') + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + annos = [] + boxes = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + + plan_annos[sample_token] = [det['ego_fut_preds'], det['ego_fut_cmd']] + + boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, + det_mapped_class_names, + self.custom_eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + if box.score < score_thresh: + continue + name = det_mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr, + fut_traj=box.fut_trajs.tolist()) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + + + map_pred_anno = {} + vecs = output_to_vecs(det) + sample_token = self.data_infos[sample_id]['token'] + map_pred_anno['sample_token'] = sample_token + pred_vec_list=[] + for i, vec in enumerate(vecs): + name = map_mapped_class_names[vec['label']] + anno = dict( + # sample_token=sample_token, + pts=vec['pts'], + pts_num=len(vec['pts']), + cls_name=name, + type=vec['label'], + confidence_level=vec['score']) + pred_vec_list.append(anno) + # annos.append(nusc_anno) + # nusc_annos[sample_token] = annos + map_pred_anno['vectors'] = pred_vec_list + map_pred_annos[sample_token] = map_pred_anno + + if not os.path.exists(self.map_ann_file): + self._format_gt() + else: + print(f'{self.map_ann_file} exist, not update') + # with open(self.map_ann_file,'r') as f: + # GT_anns = json.load(f) + # gt_annos = GT_anns['GTs'] + + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + 'map_results': map_pred_annos, + 'plan_results': plan_annos + # 'GTs': gt_annos + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + if self.use_pkl_result: + res_path = osp.join(jsonfile_prefix, 'results_nusc.pkl') + else: + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. 
+ """ + if isinstance(results, dict): + # print(f'results must be a list, but get dict, keys={results.keys()}') + # assert isinstance(results, list) + results = results['bbox_results'] + assert isinstance(results, list) + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + if name == 'metric_results': + continue + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + map_metric='chamfer', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. 
+ """ + detail = dict() + from nuscenes import NuScenes + self.nusc = NuScenes(version=self.version, dataroot=self.data_root, + verbose=False) + + output_dir = osp.join(*osp.split(result_path)[:-1]) + + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + self.nusc_eval = NuScenesEval_custom( + self.nusc, + config=self.custom_eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=False, + overlap_test=self.overlap_test, + data_infos=self.data_infos + ) + self.nusc_eval.main(plot_examples=0, render_curves=False) + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + + + from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import eval_map + from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import format_res_gt_by_classes + result_path = osp.abspath(result_path) + + print('Formating results & gts by classes') + pred_results = mmcv.load(result_path) + map_results = pred_results['map_results'] + gt_anns = mmcv.load(self.map_ann_file) + map_annotations = gt_anns['GTs'] + cls_gens, cls_gts = format_res_gt_by_classes(result_path, + map_results, + map_annotations, + cls_names=self.MAPCLASSES, + num_pred_pts_per_instance=self.fixed_num, + eval_use_same_gt_sample_num_flag=self.eval_use_same_gt_sample_num_flag, + pc_range=self.pc_range) + map_metrics = map_metric if isinstance(map_metric, list) else [map_metric] + allowed_metrics = ['chamfer', 'iou'] + for metric in map_metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + for metric in map_metrics: + print('-*'*10+f'use metric:{metric}'+'-*'*10) + if metric == 'chamfer': + thresholds = [0.5,1.0,1.5] + elif metric == 'iou': + thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES)) + for i, thr in enumerate(thresholds): + print('-*'*10+f'threshhold:{thr}'+'-*'*10) + mAP, cls_ap = eval_map( + map_results, + map_annotations, + cls_gens, + cls_gts, + threshold=thr, + cls_names=self.MAPCLASSES, + logger=logger, + num_pred_pts_per_instance=self.fixed_num, + pc_range=self.pc_range, + metric=metric) + for j in range(self.NUM_MAPCLASSES): + cls_aps[i, j] = cls_ap[j]['ap'] + for i, name in enumerate(self.MAPCLASSES): + print('{}: {}'.format(name, cls_aps.mean(0)[i])) + detail['NuscMap_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i] + print('map: {}'.format(cls_aps.mean(0).mean())) + detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean() + for i, name in enumerate(self.MAPCLASSES): + for j, thr in enumerate(thresholds): + if metric == 'chamfer': + detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + elif metric == 'iou': + if thr == 0.5 or thr == 0.75: + detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + 
+ return detail + + def evaluate(self, + results, + metric='bbox', + map_metric='chamfer', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + result_metric_names = ['EPA', 'ADE', 'FDE', 'MR'] + motion_cls_names = ['car', 'pedestrian'] + motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit', + 'fp', 'ADE', 'FDE', 'MR'] + all_metric_dict = {} + for met in motion_metric_names: + for cls in motion_cls_names: + all_metric_dict[met+'_'+cls] = 0.0 + result_dict = {} + for met in result_metric_names: + for cls in motion_cls_names: + result_dict[met+'_'+cls] = 0.0 + + alpha = 0.5 + + for i in range(len(results)): + for key in all_metric_dict.keys(): + all_metric_dict[key] += results[i]['metric_results'][key] + + for cls in motion_cls_names: + result_dict['EPA_'+cls] = (all_metric_dict['hit_'+cls] - \ + alpha * all_metric_dict['fp_'+cls]) / all_metric_dict['gt_'+cls] + result_dict['ADE_'+cls] = all_metric_dict['ADE_'+cls] / all_metric_dict['cnt_ade_'+cls] + result_dict['FDE_'+cls] = all_metric_dict['FDE_'+cls] / all_metric_dict['cnt_fde_'+cls] + result_dict['MR_'+cls] = all_metric_dict['MR_'+cls] / all_metric_dict['cnt_fde_'+cls] + + print('\n') + print('-------------- Motion Prediction --------------') + for k, v in result_dict.items(): + print(f'{k}: {v}') + + # NOTE: print planning metric + print('\n') + print('-------------- Planning --------------') + metric_dict = None + num_valid = 0 + for res in results: + if res['metric_results']['fut_valid_flag']: + num_valid += 1 + else: + continue + if metric_dict is None: + metric_dict = copy.deepcopy(res['metric_results']) + else: + for k in res['metric_results'].keys(): + metric_dict[k] += res['metric_results'][k] + + for k in metric_dict: + metric_dict[k] = metric_dict[k] / num_valid + print("{}:{}".format(k, metric_dict[k])) + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name], metric=metric, map_metric=map_metric) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files, metric=metric, map_metric=map_metric) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + + Args: + detection (dict): Detection results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. 
+ + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. + """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + trajs = detection['trajs_3d'].numpy() + + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box_yaw = -box_yaw - np.pi / 2 + + box_list = [] + for i in range(len(box3d)): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + velocity = (*box3d.tensor[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = CustomNuscenesBox( + center=box_gravity_center[i], + size=box_dims[i], + orientation=quat, + fut_trajs=trajs[i], + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from ego to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. 
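# Review note (illustrative, not part of the patch): the gate below keeps a box only
# if its ego-frame center satisfies |x| <= class_range_x[name] and |y| <= class_range_y[name].
# Reduced standalone check with placeholder numbers:
#
#     center = (38.0, 12.0)                     # ego-frame box center (x, y)
#     det_range_x, det_range_y = 30.0, 30.0     # hypothetical ranges for this class
#     keep = abs(center[0]) <= det_range_x and abs(center[1]) <= det_range_y
#     print(keep)                               # False: 38 m exceeds the 30 m x-range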
+ cls_range_x_map = eval_configs.class_range_x + cls_range_y_map = eval_configs.class_range_y + x_distance, y_distance = box.center[0], box.center[1] + det_range_x = cls_range_x_map[classes[box.label]] + det_range_y = cls_range_y_map[classes[box.label]] + if abs(x_distance) > det_range_x or abs(y_distance) > det_range_y: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list + +def output_to_vecs(detection): + box3d = detection['map_boxes_3d'].numpy() + scores = detection['map_scores_3d'].numpy() + labels = detection['map_labels_3d'].numpy() + pts = detection['map_pts_3d'].numpy() + + vec_list = [] + # import pdb;pdb.set_trace() + for i in range(box3d.shape[0]): + vec = dict( + bbox = box3d[i], # xyxy + label=labels[i], + score=scores[i], + pts=pts[i], + ) + vec_list.append(vec) + return vec_list \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__init__.py b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..44a3f9505af5288126fae2b76d7463e152308a85 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__init__.py @@ -0,0 +1,14 @@ +from .transform_3d import ( + PadMultiViewImage, NormalizeMultiviewImage, + PhotoMetricDistortionMultiViewImage, CustomCollect3D, + RandomScaleImageMultiViewImage, CustomObjectRangeFilter, CustomObjectNameFilter) +from .formating import CustomDefaultFormatBundle3D +from .loading import CustomLoadPointsFromFile, CustomLoadPointsFromMultiSweeps + +__all__ = [ + 'PadMultiViewImage', 'NormalizeMultiviewImage', + 'PhotoMetricDistortionMultiViewImage', 'CustomDefaultFormatBundle3D', + 'CustomCollect3D', 'RandomScaleImageMultiViewImage', + 'CustomObjectRangeFilter', 'CustomObjectNameFilter', + 'CustomLoadPointsFromFile', 'CustomLoadPointsFromMultiSweeps' +] \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ab2057b66504e1553500eef14ecd5468e823f13 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/formating.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/formating.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..635d2100451c9a23cd18068a9f1dfde8b862cd67 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/formating.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/loading.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/loading.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..148502c4aca9595d777fae36fd2f5cd7f21b4eb9 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/loading.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/transform_3d.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/transform_3d.cpython-38.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..8e6e9b3f6880f07c4fd4239e15c0f3e1a938dbbf Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/transform_3d.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/formating.py b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/formating.py new file mode 100644 index 0000000000000000000000000000000000000000..184cedddc739da4a5740520a20f973385436de6a --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/formating.py @@ -0,0 +1,55 @@ + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.parallel import DataContainer as DC + +from mmdet3d.core.bbox import BaseInstance3DBoxes +from mmdet3d.core.points import BasePoints +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import to_tensor +from mmdet3d.datasets.pipelines import DefaultFormatBundle3D + +@PIPELINES.register_module() +class CustomDefaultFormatBundle3D(DefaultFormatBundle3D): + """Default formatting bundle. + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + def __init__(self, class_names, with_gt=True, with_label=True, with_ego=True): + super(CustomDefaultFormatBundle3D, self).__init__(class_names, with_gt, with_label) + self.with_ego = with_ego + + + def __call__(self, results): + """Call function to transform and format common fields in results. + Args: + results (dict): Result dict contains the data to convert. + Returns: + dict: The result dict contains the data that is formatted with + default bundle. 
+ """ + # Format 3D data + results = super(CustomDefaultFormatBundle3D, self).__call__(results) + # results['gt_map_masks'] = DC(to_tensor(results['gt_map_masks']), stack=True) + if self.with_ego: + if 'ego_his_trajs' in results: + results['ego_his_trajs'] = DC(to_tensor(results['ego_his_trajs'][None, ...]), stack=True) + if 'ego_fut_trajs' in results: + results['ego_fut_trajs'] = DC(to_tensor(results['ego_fut_trajs'][None, ...]), stack=True) + if 'ego_fut_masks' in results: + results['ego_fut_masks'] = DC(to_tensor(results['ego_fut_masks'][None, None, ...]), stack=True) + if 'ego_fut_cmd' in results: + results['ego_fut_cmd'] = DC(to_tensor(results['ego_fut_cmd'][None, None, ...]), stack=True) + if 'ego_lcf_feat' in results: + results['ego_lcf_feat'] = DC(to_tensor(results['ego_lcf_feat'][None, None, ...]), stack=True) + if 'gt_attr_labels' in results: + results['gt_attr_labels'] = DC(to_tensor(results['gt_attr_labels']), cpu_only=False) + + return results \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/loading.py b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/loading.py new file mode 100644 index 0000000000000000000000000000000000000000..a8b6e68ca3247e8a9c354d22453026ba458106e3 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/loading.py @@ -0,0 +1,389 @@ +import os +from typing import Any, Dict, Tuple + +import mmcv +import torch +import numpy as np +from nuscenes.map_expansion.map_api import NuScenesMap +from nuscenes.map_expansion.map_api import locations as LOCATIONS +from PIL import Image + + +from mmdet3d.core.points import BasePoints, get_points_type +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import LoadAnnotations + +def load_augmented_point_cloud(path, virtual=False, reduce_beams=32): + # NOTE: following Tianwei's implementation, it is hard coded for nuScenes + points = np.fromfile(path, dtype=np.float32).reshape(-1, 5) + # NOTE: path definition different from Tianwei's implementation. + tokens = path.split("/") + vp_dir = "_VIRTUAL" if reduce_beams == 32 else f"_VIRTUAL_{reduce_beams}BEAMS" + seg_path = os.path.join( + *tokens[:-3], + "virtual_points", + tokens[-3], + tokens[-2] + vp_dir, + tokens[-1] + ".pkl.npy", + ) + assert os.path.exists(seg_path) + data_dict = np.load(seg_path, allow_pickle=True).item() + + virtual_points1 = data_dict["real_points"] + # NOTE: add zero reflectance to virtual points instead of removing them from real points + virtual_points2 = np.concatenate( + [ + data_dict["virtual_points"][:, :3], + np.zeros([data_dict["virtual_points"].shape[0], 1]), + data_dict["virtual_points"][:, 3:], + ], + axis=-1, + ) + + points = np.concatenate( + [ + points, + np.ones([points.shape[0], virtual_points1.shape[1] - points.shape[1] + 1]), + ], + axis=1, + ) + virtual_points1 = np.concatenate( + [virtual_points1, np.zeros([virtual_points1.shape[0], 1])], axis=1 + ) + # note: this part is different from Tianwei's implementation, we don't have duplicate foreground real points. 
+ if len(data_dict["real_points_indice"]) > 0: + points[data_dict["real_points_indice"]] = virtual_points1 + if virtual: + virtual_points2 = np.concatenate( + [virtual_points2, -1 * np.ones([virtual_points2.shape[0], 1])], axis=1 + ) + points = np.concatenate([points, virtual_points2], axis=0).astype(np.float32) + return points + + +def reduce_LiDAR_beams(pts, reduce_beams_to=32): + # print(pts.size()) + if isinstance(pts, np.ndarray): + pts = torch.from_numpy(pts) + radius = torch.sqrt(pts[:, 0].pow(2) + pts[:, 1].pow(2) + pts[:, 2].pow(2)) + sine_theta = pts[:, 2] / radius + # [-pi/2, pi/2] + theta = torch.asin(sine_theta) + phi = torch.atan2(pts[:, 1], pts[:, 0]) + + top_ang = 0.1862 + down_ang = -0.5353 + + beam_range = torch.zeros(32) + beam_range[0] = top_ang + beam_range[31] = down_ang + + for i in range(1, 31): + beam_range[i] = beam_range[i - 1] - 0.023275 + # beam_range = [1, 0.18, 0.15, 0.13, 0.11, 0.085, 0.065, 0.03, 0.01, -0.01, -0.03, -0.055, -0.08, -0.105, -0.13, -0.155, -0.18, -0.205, -0.228, -0.251, -0.275, + # -0.295, -0.32, -0.34, -0.36, -0.38, -0.40, -0.425, -0.45, -0.47, -0.49, -0.52, -0.54] + + num_pts, _ = pts.size() + mask = torch.zeros(num_pts) + if reduce_beams_to == 16: + for id in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]: + beam_mask = (theta < (beam_range[id - 1] - 0.012)) * ( + theta > (beam_range[id] - 0.012) + ) + mask = mask + beam_mask + mask = mask.bool() + elif reduce_beams_to == 4: + for id in [7, 9, 11, 13]: + beam_mask = (theta < (beam_range[id - 1] - 0.012)) * ( + theta > (beam_range[id] - 0.012) + ) + mask = mask + beam_mask + mask = mask.bool() + # [?] pick the 14th beam + elif reduce_beams_to == 1: + chosen_beam_id = 9 + mask = (theta < (beam_range[chosen_beam_id - 1] - 0.012)) * ( + theta > (beam_range[chosen_beam_id] - 0.012) + ) + else: + raise NotImplementedError + # points = copy.copy(pts) + points = pts[mask] + # print(points.size()) + return points.numpy() + +@PIPELINES.register_module() +class CustomLoadPointsFromMultiSweeps: + """Load points from multiple sweeps. + + This is usually used for nuScenes dataset to utilize previous sweeps. + + Args: + sweeps_num (int): Number of sweeps. Defaults to 10. + load_dim (int): Dimension number of the loaded points. Defaults to 5. + use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4]. + pad_empty_sweeps (bool): Whether to repeat keyframe when + sweeps is empty. Defaults to False. + remove_close (bool): Whether to remove close points. + Defaults to False. + test_mode (bool): If test_model=True used for testing, it will not + randomly sample sweeps but select the nearest N frames. + Defaults to False. + """ + + def __init__( + self, + sweeps_num=10, + load_dim=5, + use_dim=[0, 1, 2, 4], + pad_empty_sweeps=False, + remove_close=False, + test_mode=False, + load_augmented=None, + reduce_beams=None, + ): + self.load_dim = load_dim + self.sweeps_num = sweeps_num + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + self.use_dim = use_dim + self.pad_empty_sweeps = pad_empty_sweeps + self.remove_close = remove_close + self.test_mode = test_mode + self.load_augmented = load_augmented + self.reduce_beams = reduce_beams + + def _load_points(self, lidar_path): + """Private function to load point clouds data. + + Args: + lidar_path (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. 
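+
+        Example (illustrative sketch; the sweep path below is hypothetical)::
+
+            >>> loader = CustomLoadPointsFromMultiSweeps(
+            ...     sweeps_num=10, load_dim=5, use_dim=[0, 1, 2, 4])
+            >>> # pts = loader._load_points('sweeps/LIDAR_TOP/xxx.pcd.bin')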
+ """ + mmcv.check_file_exist(lidar_path) + if self.load_augmented: + assert self.load_augmented in ["pointpainting", "mvp"] + virtual = self.load_augmented == "mvp" + points = load_augmented_point_cloud( + lidar_path, virtual=virtual, reduce_beams=self.reduce_beams + ) + elif lidar_path.endswith(".npy"): + points = np.load(lidar_path) + else: + points = np.fromfile(lidar_path, dtype=np.float32) + return points + + def _remove_close(self, points, radius=1.0): + """Removes point too close within a certain radius from origin. + + Args: + points (np.ndarray | :obj:`BasePoints`): Sweep points. + radius (float): Radius below which points are removed. + Defaults to 1.0. + + Returns: + np.ndarray: Points after removing. + """ + if isinstance(points, np.ndarray): + points_numpy = points + elif isinstance(points, BasePoints): + points_numpy = points.tensor.numpy() + else: + raise NotImplementedError + x_filt = np.abs(points_numpy[:, 0]) < radius + y_filt = np.abs(points_numpy[:, 1]) < radius + not_close = np.logical_not(np.logical_and(x_filt, y_filt)) + return points[not_close] + + def __call__(self, results): + """Call function to load multi-sweep point clouds from files. + + Args: + results (dict): Result dict containing multi-sweep point cloud \ + filenames. + + Returns: + dict: The result dict containing the multi-sweep points data. \ + Added key and value are described below. + + - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \ + cloud arrays. + """ + points = results["points"] + points.tensor[:, 4] = 0 + sweep_points_list = [points] + ts = results["timestamp"] / 1e6 + if self.pad_empty_sweeps and len(results["sweeps"]) == 0: + for i in range(self.sweeps_num): + if self.remove_close: + sweep_points_list.append(self._remove_close(points)) + else: + sweep_points_list.append(points) + else: + if len(results["sweeps"]) <= self.sweeps_num: + choices = np.arange(len(results["sweeps"])) + elif self.test_mode: + choices = np.arange(self.sweeps_num) + else: + # NOTE: seems possible to load frame -11? + if not self.load_augmented: + choices = np.random.choice( + len(results["sweeps"]), self.sweeps_num, replace=False + ) + else: + # don't allow to sample the earliest frame, match with Tianwei's implementation. + choices = np.random.choice( + len(results["sweeps"]) - 1, self.sweeps_num, replace=False + ) + for idx in choices: + sweep = results["sweeps"][idx] + points_sweep = self._load_points(sweep["data_path"]) + points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim) + + # TODO: make it more general + if self.reduce_beams and self.reduce_beams < 32: + points_sweep = reduce_LiDAR_beams(points_sweep, self.reduce_beams) + + if self.remove_close: + points_sweep = self._remove_close(points_sweep) + sweep_ts = sweep["timestamp"] / 1e6 + points_sweep[:, :3] = ( + points_sweep[:, :3] @ sweep["sensor2lidar_rotation"].T + ) + points_sweep[:, :3] += sweep["sensor2lidar_translation"] + points_sweep[:, 4] = ts - sweep_ts + points_sweep = points.new_point(points_sweep) + sweep_points_list.append(points_sweep) + + points = points.cat(sweep_points_list) + points = points[:, self.use_dim] + results["points"] = points + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + return f"{self.__class__.__name__}(sweeps_num={self.sweeps_num})" + + + +@PIPELINES.register_module() +class CustomLoadPointsFromFile: + """Load Points From File. + + Load sunrgbd and scannet points from file. + + Args: + coord_type (str): The type of coordinates of points cloud. 
+ Available options includes: + - 'LIDAR': Points in LiDAR coordinates. + - 'DEPTH': Points in depth coordinates, usually for indoor dataset. + - 'CAMERA': Points in camera coordinates. + load_dim (int): The dimension of the loaded points. + Defaults to 6. + use_dim (list[int]): Which dimensions of the points to be used. + Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 + or use_dim=[0, 1, 2, 3] to use the intensity dimension. + shift_height (bool): Whether to use shifted height. Defaults to False. + use_color (bool): Whether to use color features. Defaults to False. + """ + + def __init__( + self, + coord_type, + load_dim=6, + use_dim=[0, 1, 2], + shift_height=False, + use_color=False, + load_augmented=None, + reduce_beams=None, + ): + self.shift_height = shift_height + self.use_color = use_color + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + assert ( + max(use_dim) < load_dim + ), f"Expect all used dimensions < {load_dim}, got {use_dim}" + assert coord_type in ["CAMERA", "LIDAR", "DEPTH"] + + self.coord_type = coord_type + self.load_dim = load_dim + self.use_dim = use_dim + self.load_augmented = load_augmented + self.reduce_beams = reduce_beams + + def _load_points(self, lidar_path): + """Private function to load point clouds data. + + Args: + lidar_path (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. + """ + mmcv.check_file_exist(lidar_path) + if self.load_augmented: + assert self.load_augmented in ["pointpainting", "mvp"] + virtual = self.load_augmented == "mvp" + points = load_augmented_point_cloud( + lidar_path, virtual=virtual, reduce_beams=self.reduce_beams + ) + elif lidar_path.endswith(".npy"): + points = np.load(lidar_path) + else: + points = np.fromfile(lidar_path, dtype=np.float32) + + return points + + def __call__(self, results): + """Call function to load points data from file. + + Args: + results (dict): Result dict containing point clouds data. + + Returns: + dict: The result dict containing the point clouds data. \ + Added key and value are described below. + + - points (:obj:`BasePoints`): Point clouds data. 
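+
+        Example (illustrative pipeline entry for nuScenes-style 5-dim points;
+        not taken from the original configs)::
+
+            >>> load_cfg = dict(type='CustomLoadPointsFromFile',
+            ...                 coord_type='LIDAR', load_dim=5, use_dim=5)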
+ """ + lidar_path = results["pts_filename"] + points = self._load_points(lidar_path) + points = points.reshape(-1, self.load_dim) + # TODO: make it more general + if self.reduce_beams and self.reduce_beams < 32: + points = reduce_LiDAR_beams(points, self.reduce_beams) + points = points[:, self.use_dim] + attribute_dims = None + + if self.shift_height: + floor_height = np.percentile(points[:, 2], 0.99) + height = points[:, 2] - floor_height + points = np.concatenate( + [points[:, :3], np.expand_dims(height, 1), points[:, 3:]], 1 + ) + attribute_dims = dict(height=3) + + if self.use_color: + assert len(self.use_dim) >= 6 + if attribute_dims is None: + attribute_dims = dict() + attribute_dims.update( + dict( + color=[ + points.shape[1] - 3, + points.shape[1] - 2, + points.shape[1] - 1, + ] + ) + ) + + points_class = get_points_type(self.coord_type) + points = points_class( + points, points_dim=points.shape[-1], attribute_dims=attribute_dims + ) + results["points"] = points + + return results \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..5a4ea15fe77bf0f6d7f65661c928076c3e075204 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py @@ -0,0 +1,448 @@ +import numpy as np +from numpy import random +import mmcv +from mmdet.datasets.builder import PIPELINES +from mmcv.parallel import DataContainer as DC +from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, + LiDARInstance3DBoxes, box_np_ops) + + +@PIPELINES.register_module() +class CustomObjectRangeFilter(object): + """Filter objects by the range, and also filter corresponding fut trajs + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter objects by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. 
+ """ + # Check points instance type and initialise bev_range + if isinstance(input_dict['gt_bboxes_3d'], + (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + bev_range = self.pcd_range[[0, 1, 3, 4]] + elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): + bev_range = self.pcd_range[[0, 2, 3, 5]] + + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + gt_attr_labels = input_dict['attr_labels'] + mask = gt_bboxes_3d.in_range_bev(bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] + gt_attr_labels = gt_attr_labels[mask.numpy().astype(np.bool)] + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d + input_dict['gt_attr_labels'] = gt_attr_labels + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class CustomObjectNameFilter(object): + """Filter GT objects by their names, , and also filter corresponding fut trajs + + Args: + classes (list[str]): List of class names to be kept for training. + """ + + def __init__(self, classes): + self.classes = classes + self.labels = list(range(len(self.classes))) + + def __call__(self, input_dict): + """Call function to filter objects by their names. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. + """ + gt_labels_3d = input_dict['gt_labels_3d'] + gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], + dtype=np.bool_) + input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] + input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] + input_dict['gt_attr_labels'] = input_dict['gt_attr_labels'][gt_bboxes_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(classes={self.classes})' + return repr_str + + +@PIPELINES.register_module() +class PadMultiViewImage(object): + """Pad the multi-view image. + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + Args: + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value, 0 by default. 
+ """ + + def __init__(self, size=None, size_divisor=None, pad_val=0): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + if self.size is not None: + padded_img = [mmcv.impad( + img, shape=self.size, pad_val=self.pad_val) for img in results['img']] + elif self.size_divisor is not None: + padded_img = [mmcv.impad_to_multiple( + img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] + + results['ori_shape'] = [img.shape for img in results['img']] + results['img'] = padded_img + results['img_shape'] = [img.shape for img in padded_img] + results['pad_shape'] = [img.shape for img in padded_img] + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. + """ + self._pad_img(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, ' + repr_str += f'size_divisor={self.size_divisor}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str + + +@PIPELINES.register_module() +class NormalizeMultiviewImage(object): + """Normalize the image. + Added key is "img_norm_cfg". + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + + def __call__(self, results): + """Call function to normalize images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. + """ + + results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']] + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' + return repr_str + + +@PIPELINES.register_module() +class PhotoMetricDistortionMultiViewImage: + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. 
+ """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def __call__(self, results): + """Call function to perform photometric distortion on images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Result dict with images distorted. + """ + imgs = results['img'] + new_imgs = [] + for img in imgs: + assert img.dtype == np.float32, \ + 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ + ' please set "to_float32=True" in "LoadImageFromFile" pipeline' + # random brightness + if random.randint(2): + delta = random.uniform(-self.brightness_delta, + self.brightness_delta) + img += delta + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # convert color from BGR to HSV + img = mmcv.bgr2hsv(img) + + # random saturation + if random.randint(2): + img[..., 1] *= random.uniform(self.saturation_lower, + self.saturation_upper) + + # random hue + if random.randint(2): + img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # randomly swap channels + if random.randint(2): + img = img[..., random.permutation(3)] + new_imgs.append(img) + results['img'] = new_imgs + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + + +@PIPELINES.register_module() +class CustomCollect3D(object): + """Collect data from the loader relevant to the specific task. + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + - 'img_shape': shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. 
+ - 'scale_factor': a float indicating the preprocessing scale + - 'flip': a boolean indicating if image flip transform was used + - 'filename': path to the image file + - 'ori_shape': original shape of the image as a tuple (h, w, c) + - 'pad_shape': image shape after padding + - 'lidar2img': transform from lidar to image + - 'depth2img': transform from depth to image + - 'cam2img': transform from camera to image + - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ + flipped horizontally + - 'pcd_vertical_flip': a boolean indicating if point cloud is \ + flipped vertically + - 'box_mode_3d': 3D box mode + - 'box_type_3d': 3D box type + - 'img_norm_cfg': a dict of normalization information: + - mean: per channel mean subtraction + - std: per channel std divisor + - to_rgb: bool indicating if bgr was converted to rgb + - 'pcd_trans': point cloud transformations + - 'sample_idx': sample index + - 'pcd_scale_factor': point cloud scale factor + - 'pcd_rotation': rotation applied to point cloud + - 'pts_filename': path to point cloud file. + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', + 'transformation_3d_flow', 'scene_token', + 'can_bus', + )): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:`mmcv.DataContainer`. + Args: + results (dict): Result dict contains the data to collect. + Returns: + dict: The result dict contains the following keys + - keys in ``self.keys`` + - ``img_metas`` + """ + + data = {} + img_metas = {} + + for key in self.meta_keys: + if key in results: + img_metas[key] = results[key] + + data['img_metas'] = DC(img_metas, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + """str: Return a string that describes the module.""" + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + + +@PIPELINES.register_module() +class RandomScaleImageMultiViewImage(object): + """Random scale the image + Args: + scales + """ + + def __init__(self, scales=[]): + self.scales = scales + assert len(self.scales)==1 + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. 
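+
+            Note that 'lidar2img' is rescaled by the same factor so the
+            projection matrices stay consistent with the resized images.
+
+        Example (illustrative; the current implementation expects exactly one
+        scale in the list)::
+
+            >>> scale_cfg = dict(type='RandomScaleImageMultiViewImage',
+            ...                  scales=[0.8])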
+ """ + rand_ind = np.random.permutation(range(len(self.scales)))[0] + rand_scale = self.scales[rand_ind] + + y_size = [int(img.shape[0] * rand_scale) for img in results['img']] + x_size = [int(img.shape[1] * rand_scale) for img in results['img']] + scale_factor = np.eye(4) + scale_factor[0, 0] *= rand_scale + scale_factor[1, 1] *= rand_scale + results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in + enumerate(results['img'])] + lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] + results['lidar2img'] = lidar2img + results['img_shape'] = [img.shape for img in results['img']] + results['ori_shape'] = [img.shape for img in results['img']] + + return results + + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.scales}, ' + return repr_str + + +@PIPELINES.register_module() +class CustomPointsRangeFilter: + """Filter points by the range. + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, data): + """Call function to filter points by the range. + Args: + data (dict): Result dict from loading pipeline. + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = data["points"] + points_mask = points.in_range_3d(self.pcd_range) + clean_points = points[points_mask] + data["points"] = clean_points + return data diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__init__.py b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb2a0b17769a958042583dcb4c8c4a4f51636f4c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__init__.py @@ -0,0 +1,4 @@ +from .group_sampler import DistributedGroupSampler +from .distributed_sampler import DistributedSampler +from .sampler import SAMPLER, build_sampler + diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c19564b7ea1908375546175900410df60d0c64d2 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/distributed_sampler.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/distributed_sampler.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12c171c1316e78621ccc60066adad5db8dc6b8db Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/distributed_sampler.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/group_sampler.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/group_sampler.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c867b5ed2d73668995f01b2cbe9c9c8f3517af7 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/group_sampler.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/sampler.cpython-38.pyc 
b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/sampler.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cc394674449b127afd5bd32a03fa9188987be53 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/sampler.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..2913de99253be744a308bbc24c5bcaf3cd4a857c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py @@ -0,0 +1,41 @@ +import math + +import torch +from torch.utils.data import DistributedSampler as _DistributedSampler +from .sampler import SAMPLER + + +@SAMPLER.register_module() +class DistributedSampler(_DistributedSampler): + + def __init__(self, + dataset=None, + num_replicas=None, + rank=None, + shuffle=True, + seed=0): + super().__init__( + dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + # for the compatibility from PyTorch 1.3+ + self.seed = seed if seed is not None else 0 + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + assert False + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + # in case that indices is shorter than half of total_size + indices = (indices * + math.ceil(self.total_size / len(indices)))[:self.total_size] + assert len(indices) == self.total_size + + # subsample + per_replicas = self.total_size//self.num_replicas + # indices = indices[self.rank:self.total_size:self.num_replicas] + indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas] + assert len(indices) == self.num_samples + + return iter(indices) diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..16c59e5f3dd880ba185247acfba6eae354deb771 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py @@ -0,0 +1,110 @@ + +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch.utils.data import Sampler +from .sampler import SAMPLER +import random +from IPython import embed + + +@SAMPLER.register_module() +class DistributedGroupSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + seed (int, optional): random seed used to shuffle the sampler if + ``shuffle=True``. This number should be identical across all + processes in the distributed group. Default: 0. 
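+
+    Example (illustrative sketch; ``dataset`` stands for an mmdet-style
+    dataset that exposes the ``flag`` attribute used for grouping)::
+
+        >>> sampler = DistributedGroupSampler(dataset, samples_per_gpu=1,
+        ...                                   num_replicas=8, rank=0)
+        >>> sampler.set_epoch(0)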
+ """ + + def __init__(self, + dataset, + samples_per_gpu=1, + num_replicas=None, + rank=None, + seed=0): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.seed = seed if seed is not None else 0 + + assert hasattr(self.dataset, 'flag') + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += int( + math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / + self.num_replicas)) * self.samples_per_gpu + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + # add .numpy() to avoid bug when selecting indice in parrots. + # TODO: check whether torch.randperm() can be replaced by + # numpy.random.permutation(). + indice = indice[list( + torch.randperm(int(size), generator=g).numpy())].tolist() + extra = int( + math.ceil( + size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[:extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] for i in list( + torch.randperm( + len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * + self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/sampler.py b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1906049c4416951ab315338a90dceecc1a3b1203 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/sampler.py @@ -0,0 +1,7 @@ +from mmcv.utils.registry import Registry, build_from_cfg + +SAMPLER = Registry('sampler') + + +def build_sampler(cfg, default_args): + return build_from_cfg(cfg, SAMPLER, default_args) diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/vad_custom_nuscenes_eval.py b/GenAD-main/projects/mmdet3d_plugin/datasets/vad_custom_nuscenes_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..a5aa3ce69b2ebf691028b7dfc776f12cc2eacdd5 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/vad_custom_nuscenes_eval.py @@ -0,0 +1,863 @@ +import argparse +import copy +import json +import os +import time +from typing import Tuple, Dict, Any +import torch +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionConfig +from nuscenes.eval.detection.evaluate import NuScenesEval +from pyquaternion import Quaternion + +from nuscenes import NuScenes +from 
nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.tracking.data_classes import TrackingBox +from nuscenes.utils.data_classes import Box +from nuscenes.utils.geometry_utils import points_in_box +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.eval.common.loaders import add_center_dist +import tqdm +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from torchvision.transforms.functional import rotate +import pycocotools.mask as mask_util +# from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from torchvision.transforms.functional import rotate +import cv2 +import argparse +import json +import os +import random +import time +from typing import Tuple, Dict, Any + +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.loaders import load_gt, add_center_dist +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.constants import TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ + DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D +from IPython import embed +import json +from typing import Any + +import numpy as np +from matplotlib import pyplot as plt + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.common.utils import boxes_to_sensor +from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList +from nuscenes.utils.data_classes import LidarPointCloud +from nuscenes.utils.geometry_utils import view_points + +import mmcv + + +Axis = Any + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. + :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are plotted. + rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. 
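+    # Create a standalone axis if none was supplied, then draw one
+    # recall-vs-error curve per TP metric up to this class' maximum recall.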
+ if ax is None: + ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. + if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +class DetectionBox_modified(DetectionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index + + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. 
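+
+    Note that only the projected box center is tested (not the corners), and
+    the center must additionally lie more than 1 m in front of the camera to
+    count as visible.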
+ """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + +def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False) \ + -> Tuple[EvalBoxes, Dict]: + """ + Loads object predictions from file. + :param result_path: Path to the .json result file provided by the user. + :param max_boxes_per_sample: Maximim number of boxes allowed per sample. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The deserialized results and meta data. + """ + + # Load from file and check that the format is correct. + # with open(result_path) as f: + # data = json.load(f) + data = mmcv.load(result_path) + assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \ + 'See https://www.nuscenes.org/object-detection for more information.' + + # Deserialize results and get meta data. + all_results = EvalBoxes.deserialize(data['results'], box_cls) + meta = data['meta'] + if verbose: + print("Loaded results from {}. Found detections for {} samples." + .format(result_path, len(all_results.sample_tokens))) + + # Check that each sample has no more than x predicted boxes. + for sample_token in all_results.sample_tokens: + assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \ + "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample + + return all_results, meta + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. 
+ :return: The GT boxes. + """ + + # Init. + if box_cls == DetectionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. + splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. + tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionBox_modified: + # Get label name in detection task and filter unused labels. + detection_name = category_to_detection_name(sample_annotation['category_name']) + if detection_name is None: + continue + + # Get attribute_name. + attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. 
+ attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token] + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
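+    # Project each box center into the six surround-view cameras and keep only
+    # boxes whose center is visible in more than one image, i.e. boxes lying
+    # in camera-overlap regions; the kept tokens are also appended to
+    # 'center_overlap.txt'.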
+ cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + new_box.translate(-np.array(cs_record['translation'])) + new_box.rotate(Quaternion(cs_record['rotation']).inverse) + + if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + count += 1 + # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + # count += 1 + + if count > 1: + with open('center_overlap.txt', 'a') as f: + try: + f.write(box.token + '\n') + except: + pass + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + verbose = True + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + +def _get_box_class_field(eval_boxes: EvalBoxes) -> str: + """ + Retrieve the name of the class field in the boxes. + This parses through all boxes until it finds a valid box. + If there are no valid boxes, this function throws an exception. + :param eval_boxes: The EvalBoxes used for evaluation. + :return: The name of the class field in the boxes, e.g. detection_name or tracking_name. + """ + assert len(eval_boxes.boxes) > 0 + box = None + for val in eval_boxes.boxes.values(): + if len(val) > 0: + box = val[0] + break + if isinstance(box, DetectionBox): + class_field = 'detection_name' + elif isinstance(box, TrackingBox): + class_field = 'tracking_name' + else: + raise Exception('Error: Invalid box type: %s' % box) + + return class_field + +def filter_eval_boxes(nusc: NuScenes, + eval_boxes: EvalBoxes, + max_dist_x: Dict[str, float], + max_dist_y: Dict[str, float], + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param max_dist: Maps the detection name to the eval distance threshold for that class. + :param verbose: Whether to print to stdout. + """ + # Retrieve box type for detectipn/tracking boxes. + class_field = _get_box_class_field(eval_boxes) + + # Accumulators for number of filtered boxes. + total, dist_filter, point_filter, bike_rack_filter = 0, 0, 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on distance first. 
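+        # Unlike the stock nuScenes filter, this variant applies separate
+        # per-class |x| and |y| limits (class_range_x / class_range_y) to each
+        # box's offset from the ego vehicle rather than a single radial
+        # distance threshold.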
+ total += len(eval_boxes[sample_token]) + eval_boxes.boxes[sample_token] = [box for box in eval_boxes[sample_token] if + abs(box.ego_translation[0]) < max_dist_x[box.__getattribute__(class_field)] \ + and abs(box.ego_translation[1]) < max_dist_y[box.__getattribute__(class_field)]] + dist_filter += len(eval_boxes[sample_token]) + + # Then remove boxes with zero points in them. Eval boxes have -1 points by default. + eval_boxes.boxes[sample_token] = [box for box in eval_boxes[sample_token] if not box.num_pts == 0] + point_filter += len(eval_boxes[sample_token]) + + # Perform bike-rack filtering. + sample_anns = nusc.get('sample', sample_token)['anns'] + bikerack_recs = [nusc.get('sample_annotation', ann) for ann in sample_anns if + nusc.get('sample_annotation', ann)['category_name'] == 'static_object.bicycle_rack'] + bikerack_boxes = [Box(rec['translation'], rec['size'], Quaternion(rec['rotation'])) for rec in bikerack_recs] + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.__getattribute__(class_field) in ['bicycle', 'motorcycle']: + in_a_bikerack = False + for bikerack_box in bikerack_boxes: + if np.sum(points_in_box(bikerack_box, np.expand_dims(np.array(box.translation), axis=1))) > 0: + in_a_bikerack = True + if not in_a_bikerack: + filtered_boxes.append(box) + else: + filtered_boxes.append(box) + + eval_boxes.boxes[sample_token] = filtered_boxes + bike_rack_filter += len(eval_boxes.boxes[sample_token]) + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After distance based filtering: %d" % dist_filter) + print("=> After LIDAR and RADAR points based filtering: %d" % point_filter) + print("=> After bike rack filtering: %d" % bike_rack_filter) + + return eval_boxes + +class NuScenesEval_custom(NuScenesEval): + """ + Dummy class for backward-compatibility. Same as DetectionEval. + """ + + def __init__(self, + nusc: NuScenes, + config: DetectionConfig, + result_path: str, + eval_set: str, + output_dir: str = None, + verbose: bool = True, + overlap_test=False, + eval_mask=False, + data_infos=None + ): + """ + Initialize a DetectionEval object. + :param nusc: A NuScenes object. + :param config: A DetectionConfig object. + :param result_path: Path of the nuScenes JSON result file. + :param eval_set: The dataset split to evaluate on, e.g. train, val or test. + :param output_dir: Folder to save plots and results to. + :param verbose: Whether to print to stdout. + """ + + self.nusc = nusc + self.result_path = result_path + self.eval_set = eval_set + self.output_dir = output_dir + self.verbose = verbose + self.cfg = config + self.overlap_test = overlap_test + self.eval_mask = eval_mask + self.data_infos = data_infos + # Check result file exists. + assert os.path.exists(result_path), 'Error: The result file does not exist!' + + # Make dirs. + self.plot_dir = os.path.join(self.output_dir, 'plots') + if not os.path.isdir(self.output_dir): + os.makedirs(self.output_dir) + if not os.path.isdir(self.plot_dir): + os.makedirs(self.plot_dir) + + # Load data. + if verbose: + print('Initializing nuScenes detection evaluation') + self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, + verbose=verbose) + self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose) + + assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ + "Samples in split doesn't match samples in predictions." + + # Add center distances. 
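+        # add_center_dist attaches each box's translation relative to the ego
+        # vehicle, which the per-class x/y range filtering below relies on.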
+ self.pred_boxes = add_center_dist(nusc, self.pred_boxes) + self.gt_boxes = add_center_dist(nusc, self.gt_boxes) + + # Filter boxes (distance, points per box, etc.). + + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range_x, self.cfg.class_range_y, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range_x, self.cfg.class_range_y, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [key for (key, value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + + def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. + # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMetrics(self.cfg) + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. 
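# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): update_gt(type_='ord', index=i),
# defined above, restricts evaluation to the i-th frame of every scene via the
# index_map built in __init__. That map simply walks each scene's linked list
# of samples and numbers them from 1; in isolation it amounts to the sketch
# below (helper name is the editor's own, `nusc` is a NuScenes instance):
def build_frame_index_map(nusc) -> dict:
    """Map every sample token to its 1-based position within its scene."""
    index_map = {}
    for scene in nusc.scene:
        token, index = scene['first_sample_token'], 1
        while token != '':
            index_map[token] = index
            token = nusc.get('sample', token)['next']
            index += 1
    return index_map


# Tokens for, e.g., the 5th frame of every scene would then be:
# valid_tokens = [tok for tok, idx in build_frame_index_map(nusc).items() if idx == 5]
# ---------------------------------------------------------------------------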
+ for metric_name in TP_METRICS: + metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath(detection_name + '_pr')) + + class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, + savepath=savepath(detection_name + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath('dist_pr_' + str(dist_th))) + + +if __name__ == "__main__": + + # Settings. + parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' 
+ 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + output_dir=output_dir_, verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) + #for index in range(1, 41): + # nusc_eval.update_gt(type_='ord', index=index) + # diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/vad_nusc_detection_cvpr_2019.json b/GenAD-main/projects/mmdet3d_plugin/datasets/vad_nusc_detection_cvpr_2019.json new file mode 100644 index 0000000000000000000000000000000000000000..b5c810318083771277eac0cca8bf6252a7ae793f --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/vad_nusc_detection_cvpr_2019.json @@ -0,0 +1,34 @@ +{ + "class_range_x": { + "car": 30, + "truck": 30, + "bus": 30, + "trailer": 30, + "construction_vehicle": 30, + "pedestrian": 30, + "motorcycle": 30, + "bicycle": 30, + "traffic_cone": 30, + "barrier": 30 + }, + "class_range_y": { + "car": 15, + "truck": 15, + "bus": 15, + "trailer": 15, + "construction_vehicle": 15, + "pedestrian": 15, + "motorcycle": 15, + "bicycle": 15, + "traffic_cone": 15, + "barrier": 15 + }, + "dist_fcn": "center_distance", + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "max_boxes_per_sample": 500, + "mean_ap_weight": 5 + } + \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/VAD_head.py b/GenAD-main/projects/mmdet3d_plugin/models/VAD_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f8fea586e0993b7c8aa941552fa0d9659d883d89 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/VAD_head.py @@ -0,0 +1,2165 @@ +import copy +from math import pi, cos, sin + +import torch +import numpy as np +import torch.nn as nn +import matplotlib.pyplot as plt +import torch.nn.functional as F +from mmdet.models import HEADS, build_loss +from mmdet.models.dense_heads import DETRHead +from mmcv.runner import force_fp32, auto_fp16 +from mmcv.utils import TORCH_VERSION, digit_version +from mmdet.core import build_assigner, build_sampler +from mmdet3d.core.bbox.coders import build_bbox_coder +from mmdet.models.utils.transformer import inverse_sigmoid +from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh +from mmcv.cnn import Linear, bias_init_with_prob, xavier_init +from mmdet.core import (multi_apply, multi_apply, reduce_mean) +from 
mmcv.cnn.bricks.transformer import build_transformer_layer_sequence + +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from projects.mmdet3d_plugin.VAD.utils.traj_lr_warmup import get_traj_warmup_loss_weight +from projects.mmdet3d_plugin.VAD.utils.map_utils import ( + normalize_2d_pts, normalize_2d_bbox, denormalize_2d_pts, denormalize_2d_bbox +) + +from projects.mmdet3d_plugin.VAD.generator import DistributionModule, PredictModel +from projects.mmdet3d_plugin.VAD.generator import FuturePrediction + +class MLP(nn.Module): + def __init__(self, in_channels, hidden_unit, verbose=False): + super(MLP, self).__init__() + self.mlp = nn.Sequential( + nn.Linear(in_channels, hidden_unit), + nn.LayerNorm(hidden_unit), + nn.ReLU() + ) + + def forward(self, x): + x = self.mlp(x) + return x + +class LaneNet(nn.Module): + def __init__(self, in_channels, hidden_unit, num_subgraph_layers): + super(LaneNet, self).__init__() + self.num_subgraph_layers = num_subgraph_layers + self.layer_seq = nn.Sequential() + for i in range(num_subgraph_layers): + self.layer_seq.add_module( + f'lmlp_{i}', MLP(in_channels, hidden_unit)) + in_channels = hidden_unit*2 + + def forward(self, pts_lane_feats): + ''' + Extract lane_feature from vectorized lane representation + + Args: + pts_lane_feats: [batch size, max_pnum, pts, D] + + Returns: + inst_lane_feats: [batch size, max_pnum, D] + ''' + x = pts_lane_feats + for name, layer in self.layer_seq.named_modules(): + if isinstance(layer, MLP): + # x [bs,max_lane_num,9,dim] + x = layer(x) + x_max = torch.max(x, -2)[0] + x_max = x_max.unsqueeze(2).repeat(1, 1, x.shape[2], 1) + x = torch.cat([x, x_max], dim=-1) + x_max = torch.max(x, -2)[0] + return x_max + + +@HEADS.register_module() +class VADHead(DETRHead): + """Head of VAD model. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. 
+ """ + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + fut_ts=6, + fut_mode=6, + loss_traj=dict(type='L1Loss', loss_weight=0.25), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.8), + map_bbox_coder=None, + map_num_query=900, + map_num_classes=3, + map_num_vec=20, + map_num_pts_per_vec=2, + map_num_pts_per_gt_vec=2, + map_query_embed_type='all_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v0', + map_dir_interval=1, + map_code_size=None, + map_code_weights=None, + loss_map_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_map_pts=dict( + type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0 + ), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=2.0), + loss_vae_gen=dict(type='ProbabilisticLoss', loss_weight=1.0), + tot_epoch=None, + use_traj_lr_warmup=False, + motion_decoder=None, + motion_map_decoder=None, + use_pe=False, + motion_det_score=None, + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + ego_his_encoder=None, + ego_fut_mode=3, + loss_plan_reg=dict(type='L1Loss', loss_weight=0.25), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=0.1), + loss_plan_col=dict(type='PlanAgentDisLoss', loss_weight=0.1), + loss_plan_dir=dict(type='PlanMapThetaLoss', loss_weight=0.1), + ego_agent_decoder=None, + ego_map_decoder=None, + query_thresh=None, + query_use_fix_pad=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + agent_dim = 300, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + self.fut_ts = fut_ts + self.fut_mode = fut_mode + self.tot_epoch = tot_epoch + self.use_traj_lr_warmup = use_traj_lr_warmup + self.motion_decoder = motion_decoder + self.motion_map_decoder = motion_map_decoder + self.use_pe = use_pe + self.motion_det_score = motion_det_score + self.map_thresh = map_thresh + self.dis_thresh = dis_thresh + self.pe_normalization = pe_normalization + self.ego_his_encoder = ego_his_encoder + self.ego_fut_mode = ego_fut_mode + self.ego_agent_decoder = ego_agent_decoder + self.ego_map_decoder = ego_map_decoder + self.query_thresh = query_thresh + self.query_use_fix_pad = query_use_fix_pad + self.ego_lcf_feat_idx = ego_lcf_feat_idx + self.valid_fut_ts = valid_fut_ts + self.agent_dim = agent_dim + + if loss_traj_cls['use_sigmoid'] == True: + self.traj_num_cls = 1 + else: + self.traj_num_cls = 2 + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + if map_code_size is not None: + self.map_code_size = map_code_size + else: + self.map_code_size = 10 + if map_code_weights is not None: + self.map_code_weights = map_code_weights + else: + self.map_code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - 
self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + + self.map_bbox_coder = build_bbox_coder(map_bbox_coder) + self.map_query_embed_type = map_query_embed_type + self.map_transform_method = map_transform_method + self.map_gt_shift_pts_pattern = map_gt_shift_pts_pattern + map_num_query = map_num_vec * map_num_pts_per_vec + self.map_num_query = map_num_query + self.map_num_classes = map_num_classes + self.map_num_vec = map_num_vec + self.map_num_pts_per_vec = map_num_pts_per_vec + self.map_num_pts_per_gt_vec = map_num_pts_per_gt_vec + self.map_dir_interval = map_dir_interval + + if loss_map_cls['use_sigmoid'] == True: + self.map_cls_out_channels = map_num_classes + else: + self.map_cls_out_channels = map_num_classes + 1 + + self.map_bg_cls_weight = 0 + map_class_weight = loss_map_cls.get('class_weight', None) + if map_class_weight is not None and (self.__class__ is VADHead): + assert isinstance(map_class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(map_class_weight)}.' + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + map_bg_cls_weight = loss_map_cls.get('bg_cls_weight', map_class_weight) + assert isinstance(map_bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(map_bg_cls_weight)}.' + map_class_weight = torch.ones(map_num_classes + 1) * map_class_weight + # set background class as the last indice + map_class_weight[map_num_classes] = map_bg_cls_weight + loss_map_cls.update({'class_weight': map_class_weight}) + if 'bg_cls_weight' in loss_map_cls: + loss_map_cls.pop('bg_cls_weight') + self.map_bg_cls_weight = map_bg_cls_weight + + self.traj_bg_cls_weight = 0 + + super(VADHead, self).__init__(*args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + self.map_code_weights = nn.Parameter(torch.tensor( + self.map_code_weights, requires_grad=False), requires_grad=False) + + if kwargs['train_cfg'] is not None: + assert 'map_assigner' in kwargs['train_cfg'], 'map assigner should be provided '\ + 'when train_cfg is set.' + map_assigner = kwargs['train_cfg']['map_assigner'] + assert loss_map_cls['loss_weight'] == map_assigner['cls_cost']['weight'], \ + 'The classification weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_bbox['loss_weight'] == map_assigner['reg_cost'][ + 'weight'], 'The regression L1 weight for loss and matcher ' \ + 'should be exactly the same.' + assert loss_map_iou['loss_weight'] == map_assigner['iou_cost']['weight'], \ + 'The regression iou weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_pts['loss_weight'] == map_assigner['pts_cost']['weight'], \ + 'The regression l1 weight for map pts loss and matcher should be' \ + 'exactly the same.' 
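# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): the block above converts the
# scalar class_weight / bg_cls_weight of loss_map_cls into a per-class weight
# vector whose last entry is the background ("no object") class. With this
# head's defaults (3 map classes, class_weight=1.0, bg_cls_weight=0.1) the
# tensor handed to CrossEntropyLoss is:
import torch

map_num_classes, class_weight, bg_cls_weight = 3, 1.0, 0.1
weights = torch.ones(map_num_classes + 1) * class_weight
weights[map_num_classes] = bg_cls_weight  # background class sits at the last index
print(weights)  # tensor([1.0000, 1.0000, 1.0000, 0.1000])
# ---------------------------------------------------------------------------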
+ + self.map_assigner = build_assigner(map_assigner) + # DETR sampling=False, so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.map_sampler = build_sampler(sampler_cfg, context=self) + + self.loss_traj = build_loss(loss_traj) + self.loss_traj_cls = build_loss(loss_traj_cls) + self.loss_map_bbox = build_loss(loss_map_bbox) + self.loss_map_cls = build_loss(loss_map_cls) + self.loss_map_iou = build_loss(loss_map_iou) + self.loss_map_pts = build_loss(loss_map_pts) + self.loss_map_dir = build_loss(loss_map_dir) + self.loss_plan_reg = build_loss(loss_plan_reg) + self.loss_plan_bound = build_loss(loss_plan_bound) + self.loss_plan_col = build_loss(loss_plan_col) + self.loss_plan_dir = build_loss(loss_plan_dir) + self.loss_vae_gen = build_loss(loss_vae_gen) + + + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + cls_branch = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + traj_branch = [] + traj_in_dim = self.embed_dims*4 + for _ in range(self.num_reg_fcs): + traj_branch.append(Linear(traj_in_dim, traj_in_dim)) + traj_branch.append(nn.ReLU()) + traj_branch.append(Linear(traj_in_dim, 2)) + traj_branch = nn.Sequential(*traj_branch) + + traj_cls_branch = [] + # for _ in range(self.num_reg_fcs): + traj_cls_branch.append(Linear(self.embed_dims*14, self.embed_dims*2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims*2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims*2, self.embed_dims*2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims*2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims*2, self.traj_num_cls)) + traj_cls_branch = nn.Sequential(*traj_cls_branch) + + map_cls_branch = [] + for _ in range(self.num_reg_fcs): + map_cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_cls_branch.append(nn.LayerNorm(self.embed_dims)) + map_cls_branch.append(nn.ReLU(inplace=True)) + map_cls_branch.append(Linear(self.embed_dims, self.map_cls_out_channels)) + map_cls_branch = nn.Sequential(*map_cls_branch) + + map_reg_branch = [] + for _ in range(self.num_reg_fcs): + map_reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_reg_branch.append(nn.ReLU()) + map_reg_branch.append(Linear(self.embed_dims, self.map_code_size)) + map_reg_branch = nn.Sequential(*map_reg_branch) + + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
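# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): _get_clones() above and the
# with_box_refine branch just below follow the usual deformable-DETR pattern:
# with iterative box refinement every decoder layer gets its own deep-copied
# prediction head, otherwise a single head instance is shared across layers.
# A minimal sketch of that choice (helper name is the editor's own):
import copy
import torch.nn as nn


def clone_heads(head: nn.Module, num_layers: int, independent: bool) -> nn.ModuleList:
    """Deep-copy the head per decoder layer (independent=True) or share one instance."""
    if independent:
        return nn.ModuleList(copy.deepcopy(head) for _ in range(num_layers))
    return nn.ModuleList(head for _ in range(num_layers))  # same object repeated


refined = clone_heads(nn.Linear(256, 10), num_layers=3, independent=True)
shared = clone_heads(nn.Linear(256, 10), num_layers=3, independent=False)
assert refined[0] is not refined[1] and shared[0] is shared[1]
# ---------------------------------------------------------------------------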
+ num_decoder_layers = 1 + num_map_decoder_layers = 1 + if self.transformer.decoder is not None: + num_decoder_layers = self.transformer.decoder.num_layers + if self.transformer.map_decoder is not None: + num_map_decoder_layers = self.transformer.map_decoder.num_layers + num_motion_decoder_layers = 1 + num_pred = (num_decoder_layers + 1) if \ + self.as_two_stage else num_decoder_layers + motion_num_pred = (num_motion_decoder_layers + 1) if \ + self.as_two_stage else num_motion_decoder_layers + map_num_pred = (num_map_decoder_layers + 1) if \ + self.as_two_stage else num_map_decoder_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(cls_branch, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + self.traj_branches = _get_clones(traj_branch, motion_num_pred) + self.traj_cls_branches = _get_clones(traj_cls_branch, motion_num_pred) + self.map_cls_branches = _get_clones(map_cls_branch, map_num_pred) + self.map_reg_branches = _get_clones(map_reg_branch, map_num_pred) + else: + self.cls_branches = nn.ModuleList( + [cls_branch for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + self.traj_branches = nn.ModuleList( + [traj_branch for _ in range(motion_num_pred)]) + self.traj_cls_branches = nn.ModuleList( + [traj_cls_branch for _ in range(motion_num_pred)]) + self.map_cls_branches = nn.ModuleList( + [map_cls_branch for _ in range(map_num_pred)]) + self.map_reg_branches = nn.ModuleList( + [map_reg_branch for _ in range(map_num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + if self.map_query_embed_type == 'all_pts': + self.map_query_embedding = nn.Embedding(self.map_num_query, + self.embed_dims * 2) + elif self.map_query_embed_type == 'instance_pts': + self.map_query_embedding = None + self.map_instance_embedding = nn.Embedding(self.map_num_vec, self.embed_dims * 2) + self.map_pts_embedding = nn.Embedding(self.map_num_pts_per_vec, self.embed_dims * 2) + + if self.motion_decoder is not None: + self.motion_decoder = build_transformer_layer_sequence(self.motion_decoder) + self.motion_mode_query = nn.Embedding(self.fut_mode, self.embed_dims) + self.motion_mode_query.weight.requires_grad = True + if self.use_pe: + self.pos_mlp_sa = nn.Linear(2, self.embed_dims) + else: + raise NotImplementedError('Not implement yet') + + if self.motion_map_decoder is not None: + self.lane_encoder = LaneNet(256, 128, 3) + self.motion_map_decoder = build_transformer_layer_sequence(self.motion_map_decoder) + if self.use_pe: + self.pos_mlp = nn.Linear(2, self.embed_dims) + + if self.ego_his_encoder is not None: + self.ego_his_encoder = LaneNet(2, self.embed_dims//2, 3) + else: + self.ego_query = nn.Embedding(1, self.embed_dims) + + if self.ego_agent_decoder is not None: + self.ego_agent_decoder = build_transformer_layer_sequence(self.ego_agent_decoder) + if self.use_pe: + self.ego_agent_pos_mlp = nn.Linear(2, self.embed_dims) + + + + if self.ego_map_decoder is not None: + self.ego_map_decoder = build_transformer_layer_sequence(self.ego_map_decoder) + if self.use_pe: + self.ego_map_pos_mlp = nn.Linear(2, self.embed_dims) + + ego_fut_decoder = [] + ego_fut_dec_in_dim = self.embed_dims*2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims*2 + ego_fut_dec_in_dim = int(ego_fut_dec_in_dim * 2) + for _ in range(self.num_reg_fcs): + 
ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, ego_fut_dec_in_dim)) + ego_fut_decoder.append(nn.ReLU()) + ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, self.ego_fut_mode*2)) + self.ego_fut_decoder = nn.Sequential(*ego_fut_decoder) + + self.agent_fus_mlp = nn.Sequential( + nn.Linear(self.fut_mode*2*self.embed_dims, self.embed_dims, bias=True), + nn.LayerNorm(self.embed_dims), + nn.ReLU(), + nn.Linear(self.embed_dims, self.embed_dims, bias=True)) + + ######################################################### + self.ego_coord_mlp = nn.Linear(2, 2) + + self.layer_dim = 4 + self.state_gru = nn.GRU(input_size=32, hidden_size=512, num_layers=self.layer_dim) + + self.ego_gru = nn.GRU(512, 512, 4) + self.motion_gru = nn.GRU(512, 512, 4) + + # motion head + + traj_branch_ar = [] + for _ in range(self.num_reg_fcs): + traj_branch_ar.append(Linear(self.embed_dims*2, self.embed_dims*2)) + traj_branch_ar.append(nn.ReLU()) + traj_branch_ar.append(Linear(self.embed_dims*2, 2)) + traj_branch_ar = nn.Sequential(*traj_branch_ar) + + traj_cls_branch_ar = [] + for _ in range(self.num_reg_fcs): + traj_cls_branch_ar.append(Linear(self.embed_dims*2, self.embed_dims*2)) + traj_cls_branch_ar.append(nn.LayerNorm(self.embed_dims*2)) + traj_cls_branch_ar.append(nn.ReLU(inplace=True)) + traj_cls_branch_ar.append(Linear(self.embed_dims*2, self.traj_num_cls)) + traj_cls_branch_ar = nn.Sequential(*traj_cls_branch_ar) + + if self.with_box_refine: + self.traj_branches_ar = _get_clones(traj_branch_ar, motion_num_pred) + self.traj_cls_branches_ar = _get_clones(traj_cls_branch_ar, motion_num_pred) + else: + self.traj_branches_ar = nn.ModuleList( + [traj_branch_ar for _ in range(motion_num_pred)]) + self.traj_cls_branches_ar = nn.ModuleList( + [traj_cls_branch_ar for _ in range(motion_num_pred)]) + + + + + # planning head + ego_fut_decoder_ar = [] + ego_fut_dec_in_dim_ar = self.embed_dims*2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims*2 + for _ in range(self.num_reg_fcs): + ego_fut_decoder_ar.append(Linear(ego_fut_dec_in_dim_ar, ego_fut_dec_in_dim_ar)) + ego_fut_decoder_ar.append(nn.ReLU()) + ego_fut_decoder_ar.append(Linear(ego_fut_dec_in_dim_ar, self.ego_fut_mode*2)) + self.ego_fut_decoder_ar = nn.Sequential(*ego_fut_decoder_ar) + + self.ar = True + + # generator motion & planning + self.present_distribution_in_channels = 512 + self.future_distribution_in_channels = 524 + self.now_pred_in_channels = 64 + self.PROBABILISTIC = True + self.latent_dim = 32 + self.MIN_LOG_SIGMA = -5.0 + self.MAX_LOG_SIGMA = 5.0 + self.FUTURE_DIM = 6 + self.N_GRU_BLOCKS = 3 + self.N_RES_LAYERS = 3 + + self.present_distribution = DistributionModule( + self.present_distribution_in_channels, + self.latent_dim, + min_log_sigma=self.MIN_LOG_SIGMA, + max_log_sigma=self.MAX_LOG_SIGMA, + ) + + # future_distribution_in_channels = (self.future_pred_in_channels + # + 4 * self.FUTURE_DIM + # ) + self.future_distribution = DistributionModule( + self.future_distribution_in_channels, + self.latent_dim, + min_log_sigma=self.MIN_LOG_SIGMA, + max_log_sigma=self.MAX_LOG_SIGMA, + ) + + # Future prediction + self.future_prediction = FuturePrediction( + in_channels=self.present_distribution_in_channels, + latent_dim=self.latent_dim, + n_gru_blocks=self.N_GRU_BLOCKS, + n_res_layers=self.N_RES_LAYERS, + ) + + self.predict_model = PredictModel( + in_channels=self.latent_dim, + out_channels=self.embed_dims*2, + hidden_channels=self.latent_dim*4, + num_layers=self.layer_dim + ) + + + + + + def init_weights(self): + 
"""Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_map_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.map_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_traj_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.traj_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + # for m in self.map_reg_branches: + # constant_init(m[-1], 0, bias=0) + # nn.init.constant_(self.map_reg_branches[0][-1].bias.data[2:], 0.) + if self.motion_decoder is not None: + for p in self.motion_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + nn.init.orthogonal_(self.motion_mode_query.weight) + if self.use_pe: + xavier_init(self.pos_mlp_sa, distribution='uniform', bias=0.) + if self.motion_map_decoder is not None: + for p in self.motion_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for p in self.lane_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.use_pe: + xavier_init(self.pos_mlp, distribution='uniform', bias=0.) + if self.ego_his_encoder is not None: + for p in self.ego_his_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_agent_decoder is not None: + for p in self.ego_agent_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_map_decoder is not None: + for p in self.ego_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + # @auto_fp16(apply_to=('mlvl_feats')) + + + + # @auto_fp16(apply_to=('mlvl_feats')) + @force_fp32(apply_to=('mlvl_feats', 'prev_bev')) + def forward(self, + mlvl_feats, + img_metas, + prev_bev=None, + only_bev=False, + ego_his_trajs=None, + ego_lcf_feat=None, + gt_labels_3d=None, + gt_attr_labels=None, + ego_fut_trajs=None, + ): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
+ """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + + if self.map_query_embed_type == 'all_pts': + map_query_embeds = self.map_query_embedding.weight.to(dtype) + elif self.map_query_embed_type == 'instance_pts': + map_pts_embeds = self.map_pts_embedding.weight.unsqueeze(0) + map_instance_embeds = self.map_instance_embedding.weight.unsqueeze(1) + map_query_embeds = (map_pts_embeds + map_instance_embeds).flatten(0, 1).to(dtype) + + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + map_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + map_reg_branches=self.map_reg_branches if self.with_box_refine else None, # noqa:E501 + map_cls_branches=self.map_cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + # bev_embed: bev features + # hs: agent_query + # init_reference: reference points init + # inter_references: reference points processing + # map_hs: map_query + # map_init_reference: reference points init + # map_inter_references: reference points processing + + bev_embed, hs, init_reference, inter_references, \ + map_hs, map_init_reference, map_inter_references = outputs + + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + outputs_coords_bev = [] + outputs_trajs = [] + outputs_trajs_classes = [] + + map_hs = map_hs.permute(0, 2, 1, 3) + map_outputs_classes = [] + map_outputs_coords = [] + map_outputs_pts_coords = [] + map_outputs_coords_bev = [] + + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] = tmp[..., 0:2] + reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + outputs_coords_bev.append(tmp[..., 0:2].clone().detach()) + tmp[..., 4:5] = tmp[..., 4:5] + reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + for lvl in range(map_hs.shape[0]): + if lvl == 0: + reference = map_init_reference + else: + reference = map_inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + map_outputs_class = 
self.map_cls_branches[lvl]( + map_hs[lvl].view(bs,self.map_num_vec, self.map_num_pts_per_vec,-1).mean(2) + ) + tmp = self.map_reg_branches[lvl](map_hs[lvl]) + # TODO: check the shape of reference + assert reference.shape[-1] == 2 + tmp[..., 0:2] += reference[..., 0:2] + tmp = tmp.sigmoid() # cx,cy,w,h + map_outputs_coord, map_outputs_pts_coord = self.map_transform_box(tmp) + map_outputs_coords_bev.append(map_outputs_pts_coord.clone().detach()) + map_outputs_classes.append(map_outputs_class) + map_outputs_coords.append(map_outputs_coord) + map_outputs_pts_coords.append(map_outputs_pts_coord) + + # motion prediction + + #motion query + if self.motion_decoder is not None: + batch_size, num_agent = outputs_coords_bev[-1].shape[:2] + # motion_query + motion_query = hs[-1].permute(1, 0, 2) # [A, B, D] + mode_query = self.motion_mode_query.weight # [fut_mode, D] + # [M, B, D], M=A*fut_mode + motion_query = (motion_query[:, None, :, :] + mode_query[None, :, None, :]).flatten(0, 1) + if self.use_pe: + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_pos = self.pos_mlp_sa(motion_coords) # [B, A, D] + motion_pos = motion_pos.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + motion_pos = motion_pos.permute(1, 0, 2) # [M, B, D] + else: + motion_pos = None + + if self.motion_det_score is not None: + motion_score = outputs_classes[-1] + max_motion_score = motion_score.max(dim=-1)[0] + invalid_motion_idx = max_motion_score < self.motion_det_score # [B, A] + invalid_motion_idx = invalid_motion_idx.unsqueeze(2).repeat(1, 1, self.fut_mode).flatten(1, 2) + else: + invalid_motion_idx = None + + #ego query + # batch = batch_size + if self.ego_his_encoder is not None: + ego_his_feats = self.ego_his_encoder(ego_his_trajs) # [B, 1, dim] + else: + ego_his_feats = self.ego_query.weight.unsqueeze(0).repeat(batch_size, 1, 1) + # ego <-> agent Interaction + ego_query = ego_his_feats.permute(1, 0, 2) + ego_pos = torch.zeros((batch_size, 1, 2), device=ego_query.device).permute(1, 0, 2) + ego_pos_emb = self.ego_agent_pos_mlp(ego_pos) + + motion_query = torch.cat([motion_query, ego_query], dim=0) + motion_pos = torch.cat([motion_pos, ego_pos_emb], dim=0) + + motion_hs = self.motion_decoder( + query=motion_query, + key=motion_query, + value=motion_query, + query_pos=motion_pos, + key_pos=motion_pos, + key_padding_mask=invalid_motion_idx) + + if self.motion_map_decoder is not None: + # map preprocess + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_coords = motion_coords.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + + #ego_coords = torch.Tensor(1, 1, 2).cuda(1) + ego_coords = torch.zeros([batch_size, 1, 2], device=motion_hs.device) + ego_coords_embd = self.ego_coord_mlp(ego_coords) + # ego_coords_embd = torch.zeros([batch_size, 1, 2], device=motion_hs.device) + motion_coords = torch.cat([motion_coords, ego_coords_embd], dim=1) + + + map_query = map_hs[-1].view(batch_size, self.map_num_vec, self.map_num_pts_per_vec, -1) + map_query = self.lane_encoder(map_query) # [B, P, pts, D] -> [B, P, D] + map_score = map_outputs_classes[-1] + map_pos = map_outputs_coords_bev[-1] + map_query, map_pos, key_padding_mask = self.select_and_pad_pred_map( + motion_coords, map_query, map_score, map_pos, + map_thresh=self.map_thresh, dis_thresh=self.dis_thresh, + pe_normalization=self.pe_normalization, use_fix_pad=True) + map_query = map_query.permute(1, 0, 2) # [P, B*M, D] + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + # position encoding + if self.use_pe: + 
(num_query, batch) = ca_motion_query.shape[:2] + motion_pos = torch.zeros((num_query, batch, 2), device=motion_hs.device) + motion_pos = self.pos_mlp(motion_pos) + map_pos = map_pos.permute(1, 0, 2) + map_pos = self.pos_mlp(map_pos) + else: + motion_pos, map_pos = None, None + + ca_motion_query = self.motion_map_decoder( + query=ca_motion_query, + key=map_query, + value=map_query, + query_pos=motion_pos, + key_pos=map_pos, + key_padding_mask=key_padding_mask) + else: + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + ######################################## + # generator for planning & motion + current_states = torch.cat((motion_hs.permute(1, 0, 2), + ca_motion_query.reshape(batch_size, -1, self.embed_dims)), dim=2) + distribution_comp = {} + # states = torch.randn((2, 1, 64, 200, 200), device=motion_hs.device) + # future_distribution_inputs = torch.randn((2, 5, 6, 200, 200), device=motion_hs.device) + noise = None + if self.training: + future_distribution_inputs = self.get_future_labels(gt_labels_3d, gt_attr_labels, ego_fut_trajs, motion_hs.device) + else: + future_distribution_inputs = None + + # 1. model CVA distribution for state + if self.fut_ts > 0: + #present_state = states[:, :1].contiguous() + if self.PROBABILISTIC: + # Do probabilistic computation + sample, output_distribution = self.distribution_forward( + current_states, future_distribution_inputs, noise + ) + distribution_comp = {**distribution_comp, **output_distribution} + + # 2. predict future state from distribution + hidden_states = current_states + states_hs, future_states_hs = \ + self.future_states_predict(batch_size, sample, hidden_states, current_states) + + + ego_query_hs = \ + states_hs[:, :, self.agent_dim*self.fut_mode, :].unsqueeze(1).permute(0, 2, 1, 3) + motion_query_hs = states_hs[:, :, 0:self.agent_dim*self.fut_mode, :] + motion_query_hs = \ + motion_query_hs.reshape(self.fut_ts, batch_size, -1, self.fut_ts, motion_query_hs.shape[-1]) + ego_fut_trajs_list = [] + motion_fut_trajs_list = [] + for i in range(self.fut_ts): + outputs_ego_trajs = self.ego_fut_decoder(ego_query_hs[i]).reshape(batch_size, self.ego_fut_mode, 2) + ego_fut_trajs_list.append(outputs_ego_trajs) + outputs_agent_trajs = self.traj_branches[0](motion_query_hs[i]) + motion_fut_trajs_list.append(outputs_agent_trajs) + + ego_trajs = torch.stack(ego_fut_trajs_list, dim=2) + agent_trajs = torch.stack(motion_fut_trajs_list, dim=3).reshape(batch_size, 1, self.agent_dim, self.fut_mode, -1) + + motion_cls_hs = torch.cat((future_states_hs[:, :, 0:self.agent_dim*self.fut_mode, :]. + reshape(batch_size, self.agent_dim,self.fut_mode,-1), + current_states[:,0:self.agent_dim*self.fut_mode,:]. 
+ reshape(batch_size, self.agent_dim ,self.fut_mode,-1)), dim=-1) + + # outputs_traj_class = self.traj_cls_branches[0](motion_query_hs) + + # outputs_traj = self.traj_branches[0](motion_hs) + # outputs_trajs.append(outputs_traj) + outputs_traj_class = self.traj_cls_branches[0](motion_cls_hs) + outputs_trajs_classes.append(outputs_traj_class.squeeze(-1)) + + + map_outputs_classes = torch.stack(map_outputs_classes) + map_outputs_coords = torch.stack(map_outputs_coords) + map_outputs_pts_coords = torch.stack(map_outputs_pts_coords) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + outputs_trajs = agent_trajs.permute(1, 0, 2, 3, 4) + outputs_trajs_classes = torch.stack(outputs_trajs_classes) + + + + + + # print(future_states.shape) + + # Ego prediction + #ego_feats [1, 1, 512] + # outputs_ego_trajs = self.ego_fut_decoder(ego_feats) + # outputs_ego_trajs = outputs_ego_trajs.reshape(outputs_ego_trajs.shape[0], + # self.ego_fut_mode, self.fut_ts, 2) + + outs = { + 'bev_embed': bev_embed, #torch.Size([10000, 1, 256]) + 'all_cls_scores': outputs_classes, #torch.Size([3, 1, 300, 10]) + 'all_bbox_preds': outputs_coords, #torch.Size([3, 1, 300, 10]) + 'all_traj_preds': outputs_trajs.repeat(outputs_coords.shape[0], 1, 1, 1, 1), # torch.Size([3, 1, 300, 6, 12]) + 'all_traj_cls_scores': outputs_trajs_classes.repeat(outputs_coords.shape[0], 1, 1, 1), # torch.Size([3, 1, 300, 6]) + 'map_all_cls_scores': map_outputs_classes, #torch.Size([3, 1, 100, 3]) map_outputs_classes + 'map_all_bbox_preds': map_outputs_coords, #torch.Size([3, 1, 100, 4]) map_outputs_coords + 'map_all_pts_preds': map_outputs_pts_coords, #torch.Size([3, 1, 100, 20, 2]) + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + 'map_enc_cls_scores': None, + 'map_enc_bbox_preds': None, + 'map_enc_pts_preds': None, + 'ego_fut_preds': ego_trajs, # torch.Size([1, 3, 6, 2]) + 'loss_vae_gen': distribution_comp, + } + + return outs + + def map_transform_box(self, pts, y_first=False): + """ + Converting the points set into bounding box. + + Args: + pts: the input points sets (fields), each points + set (fields) is represented as 2n scalar. + y_first: if y_fisrt=True, the point set is represented as + [y1, x1, y2, x2 ... yn, xn], otherwise the point set is + represented as [x1, y1, x2, y2 ... xn, yn]. + Returns: + The bbox [cx, cy, w, h] transformed from points. + """ + pts_reshape = pts.view(pts.shape[0], self.map_num_vec, + self.map_num_pts_per_vec,2) + pts_y = pts_reshape[:, :, :, 0] if y_first else pts_reshape[:, :, :, 1] + pts_x = pts_reshape[:, :, :, 1] if y_first else pts_reshape[:, :, :, 0] + if self.map_transform_method == 'minmax': + # import pdb;pdb.set_trace() + + xmin = pts_x.min(dim=2, keepdim=True)[0] + xmax = pts_x.max(dim=2, keepdim=True)[0] + ymin = pts_y.min(dim=2, keepdim=True)[0] + ymax = pts_y.max(dim=2, keepdim=True)[0] + bbox = torch.cat([xmin, ymin, xmax, ymax], dim=2) + bbox = bbox_xyxy_to_cxcywh(bbox) + else: + raise NotImplementedError + return bbox, pts_reshape + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_attr_labels, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. 
+ bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 10]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 9) in [x,y,z,w,l,h,yaw,vx,vy] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_fut_trajs = gt_attr_labels[:, :self.fut_ts*2] + gt_fut_masks = gt_attr_labels[:, self.fut_ts*2:self.fut_ts*3] + gt_bbox_c = gt_bboxes.shape[-1] + num_gt_bbox, gt_traj_c = gt_fut_trajs.shape + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_bbox_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # trajs targets + traj_targets = torch.zeros((num_bboxes, gt_traj_c), dtype=torch.float32, device=bbox_pred.device) + traj_weights = torch.zeros_like(traj_targets) + traj_targets[pos_inds] = gt_fut_trajs[sampling_result.pos_assigned_gt_inds] + traj_weights[pos_inds] = 1.0 + + # Filter out invalid fut trajs + traj_masks = torch.zeros_like(traj_targets) # [num_bboxes, fut_ts*2] + gt_fut_masks = gt_fut_masks.unsqueeze(-1).repeat(1, 1, 2).view(num_gt_bbox, -1) # [num_gt_bbox, fut_ts*2] + traj_masks[pos_inds] = gt_fut_masks[sampling_result.pos_assigned_gt_inds] + traj_weights = traj_weights * traj_masks + + # Extra future timestamp mask for controlling pred horizon + fut_ts_mask = torch.zeros((num_bboxes, self.fut_ts, 2), + dtype=torch.float32, device=bbox_pred.device) + fut_ts_mask[:, :self.valid_fut_ts, :] = 1.0 + fut_ts_mask = fut_ts_mask.view(num_bboxes, -1) + traj_weights = traj_weights * fut_ts_mask + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + + return ( + labels, label_weights, bbox_targets, bbox_weights, traj_targets, + traj_weights, traj_masks.view(-1, self.fut_ts, 2)[..., 0], + pos_inds, neg_inds + ) + + def _map_get_target_single(self, + cls_score, + bbox_pred, + pts_pred, + gt_labels, + gt_bboxes, + gt_shifts_pts, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. 
+ gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + assign_result, order_index = self.map_assigner.assign(bbox_pred, cls_score, pts_pred, + gt_bboxes, gt_labels, gt_shifts_pts, + gt_bboxes_ignore) + + sampling_result = self.map_sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.map_num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + # pts targets + if order_index is None: + assigned_shift = gt_labels[sampling_result.pos_assigned_gt_inds] + else: + assigned_shift = order_index[sampling_result.pos_inds, sampling_result.pos_assigned_gt_inds] + pts_targets = pts_pred.new_zeros((pts_pred.size(0), + pts_pred.size(1), pts_pred.size(2))) + pts_weights = torch.zeros_like(pts_targets) + pts_weights[pos_inds] = 1.0 + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + pts_targets[pos_inds] = gt_shifts_pts[sampling_result.pos_assigned_gt_inds,assigned_shift,:,:] + return (labels, label_weights, bbox_targets, bbox_weights, + pts_targets, pts_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. 
+ - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, traj_targets_list, traj_weights_list, + gt_fut_masks_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_attr_labels_list, gt_bboxes_ignore_list + ) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, num_total_pos, num_total_neg) + + def map_get_targets(self, + cls_scores_list, + bbox_preds_list, + pts_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + pos_inds_list, neg_inds_list) = multi_apply( + self._map_get_target_single, cls_scores_list, bbox_preds_list,pts_preds_list, + gt_labels_list, gt_bboxes_list, gt_shifts_pts_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) + + + + def loss_planning(self, + ego_fut_preds, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + lane_preds, + lane_score_preds, + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds): + """"Loss function for ego vehicle planning. 
+ Args: + ego_fut_preds (Tensor): [B, ego_fut_mode, fut_ts, 2] + ego_fut_gt (Tensor): [B, fut_ts, 2] + ego_fut_masks (Tensor): [B, fut_ts] + ego_fut_cmd (Tensor): [B, ego_fut_mode] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + agent_preds (Tensor): [B, num_agent, 2] + agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2] + agent_score_preds (Tensor): [B, num_agent, 10] + agent_fut_cls_scores (Tensor): [B, num_agent, fut_mode] + Returns: + loss_plan_reg (Tensor): planning reg loss. + loss_plan_bound (Tensor): planning map boundary constraint loss. + loss_plan_col (Tensor): planning col constraint loss. + loss_plan_dir (Tensor): planning directional constraint loss. + """ + + ego_fut_gt = ego_fut_gt.unsqueeze(1).repeat(1, self.ego_fut_mode, 1, 1) + loss_plan_l1_weight = ego_fut_cmd[..., None, None] * ego_fut_masks[:, None, :, None] + loss_plan_l1_weight = loss_plan_l1_weight.repeat(1, 1, 1, 2) + + loss_plan_l1 = self.loss_plan_reg( + ego_fut_preds, + ego_fut_gt, + loss_plan_l1_weight + ) + + loss_plan_bound = self.loss_plan_bound( + ego_fut_preds[ego_fut_cmd==1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + loss_plan_col = self.loss_plan_col( + ego_fut_preds[ego_fut_cmd==1], + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds, + weight=ego_fut_masks[:, :, None].repeat(1, 1, 2) + ) + + loss_plan_dir = self.loss_plan_dir( + ego_fut_preds[ego_fut_cmd==1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_plan_l1 = torch.nan_to_num(loss_plan_l1) + loss_plan_bound = torch.nan_to_num(loss_plan_bound) + loss_plan_col = torch.nan_to_num(loss_plan_col) + loss_plan_dir = torch.nan_to_num(loss_plan_dir) + + loss_plan_dict = dict() + loss_plan_dict['loss_plan_reg'] = loss_plan_l1 + loss_plan_dict['loss_plan_bound'] = loss_plan_bound + loss_plan_dict['loss_plan_col'] = loss_plan_col + loss_plan_dict['loss_plan_dir'] = loss_plan_dir + + return loss_plan_dict + + def loss_single(self, + cls_scores, + bbox_preds, + traj_preds, + traj_cls_preds, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
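            (In this implementation the loss components are returned as a
            tuple (loss_cls, loss_bbox, loss_traj, loss_traj_cls) rather
            than a dict.)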
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_attr_labels_list, gt_bboxes_ignore_list) + + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + traj_targets = torch.cat(traj_targets_list, 0) + traj_weights = torch.cat(traj_weights_list, 0) + gt_fut_masks = torch.cat(gt_fut_masks_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], + normalized_bbox_targets[isnotnan, :10], + bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + + # traj regression loss + best_traj_preds = self.get_best_fut_preds( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), gt_fut_masks) + + neg_inds = (bbox_weights[:, 0] == 0) + traj_labels = self.get_traj_cls_target( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), + gt_fut_masks, neg_inds) + + loss_traj = self.loss_traj( + best_traj_preds[isnotnan], + traj_targets[isnotnan], + traj_weights[isnotnan], + avg_factor=num_total_pos) + + if self.use_traj_lr_warmup: + loss_scale_factor = get_traj_warmup_loss_weight(self.epoch, self.tot_epoch) + loss_traj = loss_scale_factor * loss_traj + + # traj classification loss + traj_cls_scores = traj_cls_preds.reshape(-1, self.fut_mode) + # construct weighted avg_factor to match with the official DETR repo + traj_cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.traj_bg_cls_weight + if self.sync_cls_avg_factor: + traj_cls_avg_factor = reduce_mean( + traj_cls_scores.new_tensor([traj_cls_avg_factor])) + + traj_cls_avg_factor = max(traj_cls_avg_factor, 1) + loss_traj_cls = self.loss_traj_cls( + traj_cls_scores, traj_labels, label_weights, avg_factor=traj_cls_avg_factor + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_traj = torch.nan_to_num(loss_traj) + loss_traj_cls = torch.nan_to_num(loss_traj_cls) + + return loss_cls, loss_bbox, loss_traj, loss_traj_cls + + def get_best_fut_preds(self, + traj_preds, + 
traj_targets, + gt_fut_masks): + """"Choose best preds among all modes. + Args: + traj_preds (Tensor): MultiModal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2). + traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2). + gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts). + pred_box_centers (Tensor): Pred box centers with shape (num_box_preds, 2). + gt_box_centers (Tensor): Ground truth box centers with shape (num_box_preds, 2). + + Returns: + best_traj_preds (Tensor): best traj preds (min displacement error with gt) + with shape (num_box_preds, fut_ts*2). + """ + + cum_traj_preds = traj_preds.cumsum(dim=-2) + cum_traj_targets = traj_targets.cumsum(dim=-2) + + # Get min pred mode indices. + # (num_box_preds, fut_mode, fut_ts) + dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1) + dist = dist * gt_fut_masks[:, None, :] + dist = dist[..., -1] + dist[torch.isnan(dist)] = dist[torch.isnan(dist)] * 0 + min_mode_idxs = torch.argmin(dist, dim=-1).tolist() + box_idxs = torch.arange(traj_preds.shape[0]).tolist() + best_traj_preds = traj_preds[box_idxs, min_mode_idxs, :, :].reshape(-1, self.fut_ts*2) + + return best_traj_preds + + def get_traj_cls_target(self, + traj_preds, + traj_targets, + gt_fut_masks, + neg_inds): + """"Get Trajectory mode classification target. + Args: + traj_preds (Tensor): MultiModal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2). + traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2). + gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts). + neg_inds (Tensor): Negtive indices with shape (num_box_preds,) + + Returns: + traj_labels (Tensor): traj cls labels (num_box_preds,). + """ + + cum_traj_preds = traj_preds.cumsum(dim=-2) + cum_traj_targets = traj_targets.cumsum(dim=-2) + + # Get min pred mode indices. + # (num_box_preds, fut_mode, fut_ts) + dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1) + dist = dist * gt_fut_masks[:, None, :] + dist = dist[..., -1] + dist[torch.isnan(dist)] = dist[torch.isnan(dist)] * 0 + traj_labels = torch.argmin(dist, dim=-1) + traj_labels[neg_inds] = self.fut_mode + + return traj_labels + + def map_loss_single(self, + cls_scores, + bbox_preds, + pts_preds, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_pts_list (list[Tensor]): Ground truth pts for each image + with shape (num_gts, fixed_num, 2) in [x,y] format. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
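+            Note: in this implementation the per-layer map losses are returned
+            as a tuple (loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir); the
+            loss dict is assembled later in loss().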
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + pts_preds_list = [pts_preds[i] for i in range(num_imgs)] + + cls_reg_targets = self.map_get_targets(cls_scores_list, bbox_preds_list,pts_preds_list, + gt_bboxes_list, gt_labels_list,gt_shifts_pts_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + pts_targets = torch.cat(pts_targets_list, 0) + pts_weights = torch.cat(pts_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.map_cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.map_bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_map_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_2d_bbox(bbox_targets, self.pc_range) + # normalized_bbox_targets = bbox_targets + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.map_code_weights + + loss_bbox = self.loss_map_bbox( + bbox_preds[isnotnan, :4], + normalized_bbox_targets[isnotnan,:4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + # regression pts CD loss + # num_samples, num_order, num_pts, num_coords + normalized_pts_targets = normalize_2d_pts(pts_targets, self.pc_range) + + # num_samples, num_pts, num_coords + pts_preds = pts_preds.reshape(-1, pts_preds.size(-2), pts_preds.size(-1)) + if self.map_num_pts_per_vec != self.map_num_pts_per_gt_vec: + pts_preds = pts_preds.permute(0,2,1) + pts_preds = F.interpolate(pts_preds, size=(self.map_num_pts_per_gt_vec), mode='linear', + align_corners=True) + pts_preds = pts_preds.permute(0,2,1).contiguous() + + loss_pts = self.loss_map_pts( + pts_preds[isnotnan,:,:], + normalized_pts_targets[isnotnan,:,:], + pts_weights[isnotnan,:,:], + avg_factor=num_total_pos) + + dir_weights = pts_weights[:, :-self.map_dir_interval,0] + denormed_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) + denormed_pts_preds_dir = denormed_pts_preds[:,self.map_dir_interval:,:] - \ + denormed_pts_preds[:,:-self.map_dir_interval,:] + pts_targets_dir = pts_targets[:, self.map_dir_interval:,:] - pts_targets[:,:-self.map_dir_interval,:] + + loss_dir = self.loss_map_dir( + denormed_pts_preds_dir[isnotnan,:,:], + pts_targets_dir[isnotnan,:,:], + dir_weights[isnotnan,:], + avg_factor=num_total_pos) + + bboxes = denormalize_2d_bbox(bbox_preds, self.pc_range) + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_map_iou( + bboxes[isnotnan, :4], + bbox_targets[isnotnan, :4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + 
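+            # torch.nan_to_num is only available from PyTorch 1.8 onwards; it
+            # replaces NaN/Inf loss values (e.g. from empty targets or fp16
+            # overflow) so a single bad term does not poison the backward pass.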
loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_iou = torch.nan_to_num(loss_iou) + loss_pts = torch.nan_to_num(loss_pts) + loss_dir = torch.nan_to_num(loss_dir) + + return loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir + + + + def distribution_loss(self, output): + kl_loss = self.loss_vae_gen(output) + return kl_loss + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + map_gt_bboxes_list, + map_gt_labels_list, + preds_dicts, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + gt_attr_labels, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
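+        # Rough outline of what follows: per-decoder-layer detection + motion
+        # losses, per-layer vectorized-map losses, planning losses taken from
+        # the last decoder layer only, and the latent-distribution (KL) loss
+        # from the generative branch.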
+ + map_gt_vecs_list = copy.deepcopy(map_gt_bboxes_list) + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + all_traj_preds = preds_dicts['all_traj_preds'] + all_traj_cls_scores = preds_dicts['all_traj_cls_scores'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + map_all_cls_scores = preds_dicts['map_all_cls_scores'] + map_all_bbox_preds = preds_dicts['map_all_bbox_preds'] + map_all_pts_preds = preds_dicts['map_all_pts_preds'] + map_enc_cls_scores = preds_dicts['map_enc_cls_scores'] + map_enc_bbox_preds = preds_dicts['map_enc_bbox_preds'] + map_enc_pts_preds = preds_dicts['map_enc_pts_preds'] + ego_fut_preds = preds_dicts['ego_fut_preds'] + distribution_pred = preds_dicts['loss_vae_gen'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_attr_labels_list = [gt_attr_labels for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox, loss_traj, loss_traj_cls = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, all_traj_preds, + all_traj_cls_scores, all_gt_bboxes_list, all_gt_labels_list, + all_gt_attr_labels_list, all_gt_bboxes_ignore_list) + + + num_dec_layers = len(map_all_cls_scores) + device = map_gt_labels_list[0].device + + map_gt_bboxes_list = [ + map_gt_bboxes.bbox.to(device) for map_gt_bboxes in map_gt_vecs_list] + map_gt_pts_list = [ + map_gt_bboxes.fixed_num_sampled_points.to(device) for map_gt_bboxes in map_gt_vecs_list] + if self.map_gt_shift_pts_pattern == 'v0': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v1': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v1.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v2': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v2.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v3': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v3.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v4': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v4.to(device) for gt_bboxes in map_gt_vecs_list] + else: + raise NotImplementedError + map_all_gt_bboxes_list = [map_gt_bboxes_list for _ in range(num_dec_layers)] + map_all_gt_labels_list = [map_gt_labels_list for _ in range(num_dec_layers)] + map_all_gt_pts_list = [map_gt_pts_list for _ in range(num_dec_layers)] + map_all_gt_shifts_pts_list = [map_gt_shifts_pts_list for _ in range(num_dec_layers)] + map_all_gt_bboxes_ignore_list = [ + map_gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + map_losses_cls, map_losses_bbox, map_losses_iou, \ + map_losses_pts, map_losses_dir = multi_apply( + self.map_loss_single, map_all_cls_scores, map_all_bbox_preds, + map_all_pts_preds, map_all_gt_bboxes_list, map_all_gt_labels_list, + map_all_gt_shifts_pts_list, map_all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + 
loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_traj'] = loss_traj[-1] + loss_dict['loss_traj_cls'] = loss_traj_cls[-1] + # loss from the last decoder layer + loss_dict['loss_map_cls'] = map_losses_cls[-1] + loss_dict['loss_map_bbox'] = map_losses_bbox[-1] + loss_dict['loss_map_iou'] = map_losses_iou[-1] + loss_dict['loss_map_pts'] = map_losses_pts[-1] + loss_dict['loss_map_dir'] = map_losses_dir[-1] + + # Planning Loss + ego_fut_gt = ego_fut_gt.squeeze(1) + ego_fut_masks = ego_fut_masks.squeeze(1).squeeze(1) + ego_fut_cmd = ego_fut_cmd.squeeze(1).squeeze(1) + + batch, num_agent = all_traj_preds[-1].shape[:2] + agent_fut_preds = all_traj_preds[-1].view(batch, num_agent, self.fut_mode, self.fut_ts, 2) + agent_fut_cls_preds = all_traj_cls_scores[-1].view(batch, num_agent, self.fut_mode) + loss_plan_input = [ego_fut_preds, ego_fut_gt, ego_fut_masks, ego_fut_cmd, + map_all_pts_preds[-1], map_all_cls_scores[-1].sigmoid(), + all_bbox_preds[-1][..., 0:2], agent_fut_preds, + all_cls_scores[-1].sigmoid(), agent_fut_cls_preds.sigmoid()] + + loss_planning_dict = self.loss_planning(*loss_plan_input) + loss_dict['loss_plan_reg'] = loss_planning_dict['loss_plan_reg'] + loss_dict['loss_plan_bound'] = loss_planning_dict['loss_plan_bound'] + loss_dict['loss_plan_col'] = loss_planning_dict['loss_plan_col'] + loss_dict['loss_plan_dir'] = loss_planning_dict['loss_plan_dir'] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + # loss from other decoder layers + num_dec_layer = 0 + for map_loss_cls_i, map_loss_bbox_i, map_loss_iou_i, map_loss_pts_i, map_loss_dir_i in zip( + map_losses_cls[:-1], + map_losses_bbox[:-1], + map_losses_iou[:-1], + map_losses_pts[:-1], + map_losses_dir[:-1] + ): + loss_dict[f'd{num_dec_layer}.loss_map_cls'] = map_loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_map_bbox'] = map_loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_map_iou'] = map_loss_iou_i + loss_dict[f'd{num_dec_layer}.loss_map_pts'] = map_loss_pts_i + loss_dict[f'd{num_dec_layer}.loss_map_dir'] = map_loss_dir_i + num_dec_layer += 1 + + # loss of proposal generated from encode feature map. 
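+        # The enc_* predictions are only provided in the two-stage setting
+        # (see the docstring above); otherwise they are None and these
+        # branches are skipped.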
+ if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, + gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + if map_enc_cls_scores is not None: + map_binary_labels_list = [ + torch.zeros_like(map_gt_labels_list[i]) + for i in range(len(map_all_gt_labels_list)) + ] + # TODO bug here, but we dont care enc_loss now + map_enc_loss_cls, map_enc_loss_bbox, map_enc_loss_iou, \ + map_enc_loss_pts, map_enc_loss_dir = \ + self.map_loss_single( + map_enc_cls_scores, map_enc_bbox_preds, + map_enc_pts_preds, map_gt_bboxes_list, + map_binary_labels_list, map_gt_pts_list, + map_gt_bboxes_ignore + ) + loss_dict['enc_loss_map_cls'] = map_enc_loss_cls + loss_dict['enc_loss_map_bbox'] = map_enc_loss_bbox + loss_dict['enc_loss_map_iou'] = map_enc_loss_iou + loss_dict['enc_loss_map_pts'] = map_enc_loss_pts + loss_dict['enc_loss_map_dir'] = map_enc_loss_dir + + loss_dict['loss_vae_gen'] = self.loss_vae_gen(distribution_pred) + + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. + """ + + det_preds_dicts = self.bbox_coder.decode(preds_dicts) + # map_bboxes: xmin, ymin, xmax, ymax + map_preds_dicts = self.map_bbox_coder.decode(preds_dicts) + + num_samples = len(det_preds_dicts) + assert len(det_preds_dicts) == len(map_preds_dicts), \ + 'len(preds_dict) should be equal to len(map_preds_dicts)' + ret_list = [] + for i in range(num_samples): + preds = det_preds_dicts[i] + bboxes = preds['bboxes'] + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + trajs = preds['trajs'] + + map_preds = map_preds_dicts[i] + map_bboxes = map_preds['map_bboxes'] + map_scores = map_preds['map_scores'] + map_labels = map_preds['map_labels'] + map_pts = map_preds['map_pts'] + + ret_list.append([bboxes, scores, labels, trajs, map_bboxes, + map_scores, map_labels, map_pts]) + + return ret_list + + def select_and_pad_pred_map( + self, + motion_pos, + map_query, + map_score, + map_pos, + map_thresh=0.5, + dis_thresh=None, + pe_normalization=True, + use_fix_pad=False + ): + """select_and_pad_pred_map. + Args: + motion_pos: [B, A, 2] + map_query: [B, P, D]. + map_score: [B, P, 3]. + map_pos: [B, P, pts, 2]. + map_thresh: map confidence threshold for filtering low-confidence preds + dis_thresh: distance threshold for masking far maps for each agent in cross-attn + use_fix_pad: always pad one lane instance for each batch + Returns: + selected_map_query: [B*A, P1(+1), D], P1 is the max inst num after filter and pad. 
+ selected_map_pos: [B*A, P1(+1), 2] + selected_padding_mask: [B*A, P1(+1)] + """ + + if dis_thresh is None: + raise NotImplementedError('Not implement yet') + + # use the most close pts pos in each map inst as the inst's pos + batch, num_map = map_pos.shape[:2] + map_dis = torch.sqrt(map_pos[..., 0]**2 + map_pos[..., 1]**2) + min_map_pos_idx = map_dis.argmin(dim=-1).flatten() # [B*P] + min_map_pos = map_pos.flatten(0, 1) # [B*P, pts, 2] + min_map_pos = min_map_pos[range(min_map_pos.shape[0]), min_map_pos_idx] # [B*P, 2] + min_map_pos = min_map_pos.view(batch, num_map, 2) # [B, P, 2] + + # select & pad map vectors for different batch using map_thresh + map_score = map_score.sigmoid() + map_max_score = map_score.max(dim=-1)[0] + map_idx = map_max_score > map_thresh + batch_max_pnum = 0 + for i in range(map_score.shape[0]): + pnum = map_idx[i].sum() + if pnum > batch_max_pnum: + batch_max_pnum = pnum + + selected_map_query, selected_map_pos, selected_padding_mask = [], [], [] + for i in range(map_score.shape[0]): + dim = map_query.shape[-1] + valid_pnum = map_idx[i].sum() + valid_map_query = map_query[i, map_idx[i]] + valid_map_pos = min_map_pos[i, map_idx[i]] + pad_pnum = batch_max_pnum - valid_pnum + padding_mask = torch.tensor([False], device=map_score.device).repeat(batch_max_pnum) + if pad_pnum != 0: + valid_map_query = torch.cat([valid_map_query, torch.zeros((pad_pnum, dim), device=map_score.device)], dim=0) + valid_map_pos = torch.cat([valid_map_pos, torch.zeros((pad_pnum, 2), device=map_score.device)], dim=0) + padding_mask[valid_pnum:] = True + selected_map_query.append(valid_map_query) + selected_map_pos.append(valid_map_pos) + selected_padding_mask.append(padding_mask) + + selected_map_query = torch.stack(selected_map_query, dim=0) + selected_map_pos = torch.stack(selected_map_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + # generate different pe for map vectors for each agent + num_agent = motion_pos.shape[1] + selected_map_query = selected_map_query.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, D] + selected_map_pos = selected_map_pos.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, 2] + selected_padding_mask = selected_padding_mask.unsqueeze(1).repeat(1, num_agent, 1) # [B, A, max_P] + # move lane to per-car coords system + selected_map_dist = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + if pe_normalization: + selected_map_pos = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + + # filter far map inst for each agent + map_dis = torch.sqrt(selected_map_dist[..., 0]**2 + selected_map_dist[..., 1]**2) + valid_map_inst = (map_dis <= dis_thresh) # [B, A, max_P] + invalid_map_inst = (valid_map_inst == False) + selected_padding_mask = selected_padding_mask + invalid_map_inst + + selected_map_query = selected_map_query.flatten(0, 1) + selected_map_pos = selected_map_pos.flatten(0, 1) + selected_padding_mask = selected_padding_mask.flatten(0, 1) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_map_query.shape[-1] + if use_fix_pad: + pad_map_query = torch.zeros((num_batch, 1, feat_dim), device=selected_map_query.device) + pad_map_pos = torch.ones((num_batch, 1, 2), device=selected_map_pos.device) + pad_lane_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_map_query = torch.cat([selected_map_query, pad_map_query], dim=1) + selected_map_pos = torch.cat([selected_map_pos, pad_map_pos], dim=1) + selected_padding_mask = 
torch.cat([selected_padding_mask, pad_lane_mask], dim=1) + + return selected_map_query, selected_map_pos, selected_padding_mask + + + def select_and_pad_query( + self, + query, + query_pos, + query_score, + score_thresh=0.5, + use_fix_pad=True + ): + """select_and_pad_query. + Args: + query: [B, Q, D]. + query_pos: [B, Q, 2] + query_score: [B, Q, C]. + score_thresh: confidence threshold for filtering low-confidence query + use_fix_pad: always pad one query instance for each batch + Returns: + selected_query: [B, Q', D] + selected_query_pos: [B, Q', 2] + selected_padding_mask: [B, Q'] + """ + + # select & pad query for different batch using score_thresh + query_score = query_score.sigmoid() + query_score = query_score.max(dim=-1)[0] + query_idx = query_score > score_thresh + batch_max_qnum = 0 + for i in range(query_score.shape[0]): + qnum = query_idx[i].sum() + if qnum > batch_max_qnum: + batch_max_qnum = qnum + + selected_query, selected_query_pos, selected_padding_mask = [], [], [] + for i in range(query_score.shape[0]): + dim = query.shape[-1] + valid_qnum = query_idx[i].sum() + valid_query = query[i, query_idx[i]] + valid_query_pos = query_pos[i, query_idx[i]] + pad_qnum = batch_max_qnum - valid_qnum + padding_mask = torch.tensor([False], device=query_score.device).repeat(batch_max_qnum) + if pad_qnum != 0: + valid_query = torch.cat([valid_query, torch.zeros((pad_qnum, dim), device=query_score.device)], dim=0) + valid_query_pos = torch.cat([valid_query_pos, torch.zeros((pad_qnum, 2), device=query_score.device)], dim=0) + padding_mask[valid_qnum:] = True + selected_query.append(valid_query) + selected_query_pos.append(valid_query_pos) + selected_padding_mask.append(padding_mask) + + selected_query = torch.stack(selected_query, dim=0) + selected_query_pos = torch.stack(selected_query_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_query.shape[-1] + if use_fix_pad: + pad_query = torch.zeros((num_batch, 1, feat_dim), device=selected_query.device) + pad_query_pos = torch.ones((num_batch, 1, 2), device=selected_query_pos.device) + pad_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_query = torch.cat([selected_query, pad_query], dim=1) + selected_query_pos = torch.cat([selected_query_pos, pad_query_pos], dim=1) + selected_padding_mask = torch.cat([selected_padding_mask, pad_mask], dim=1) + + return selected_query, selected_query_pos, selected_padding_mask + + + + def distribution_forward(self, present_features, future_distribution_inputs=None, noise=None): + """ + Parameters + ---------- + present_features: 5-D output from dynamics module with shape (b, 1, c, h, w) + future_distribution_inputs: 5-D tensor containing labels shape (b, s, cfg.PROB_FUTURE_DIM, h, w) + noise: a sample from a (0, 1) gaussian with shape (b, s, latent_dim). 
If None, will sample in function + + Returns + ------- + sample: sample taken from present/future distribution, broadcast to shape (b, s, latent_dim, h, w) + present_distribution_mu: shape (b, s, latent_dim) + present_distribution_log_sigma: shape (b, s, latent_dim) + future_distribution_mu: shape (b, s, latent_dim) + future_distribution_log_sigma: shape (b, s, latent_dim) + """ + + b = present_features.shape[0] + c = present_features.shape[1] + present_mu, present_log_sigma = self.present_distribution(present_features) + + future_mu, future_log_sigma = None, None + if future_distribution_inputs is not None: + # Concatenate future labels to z_t + # future_features = future_distribution_inputs[:, 1:].contiguous().view(b, 1, -1, h, w) + future_features = torch.cat([present_features, future_distribution_inputs], dim=2) + future_mu, future_log_sigma = self.future_distribution(future_features) + + if noise is None: + if self.training: + noise = torch.randn_like(present_mu) + else: + noise = torch.zeros_like(present_mu) + if self.training: + mu = future_mu + sigma = torch.exp(future_log_sigma) + else: + mu = present_mu + sigma = torch.exp(present_log_sigma) + sample = mu + sigma * noise + + # Spatially broadcast sample to the dimensions of present_features + sample = sample.permute(0, 2, 1).expand(b, self.latent_dim, c) + + output_distribution = { + 'present_mu': present_mu, + 'present_log_sigma': present_log_sigma, + 'future_mu': future_mu, + 'future_log_sigma': future_log_sigma, + } + + return sample, output_distribution + + def get_future_labels(self, gt_labels_3d, gt_attr_labels, ego_fut_trajs, device): + + agent_dim = 300 + veh_list = [0,1,3,4] + mapped_class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', + 'trailer', 'barrier', 'motorcycle', 'bicycle', + 'pedestrian', 'traffic_cone' + ] + ignore_list = ['construction_vehicle', 'barrier', + 'traffic_cone', 'motorcycle', 'bicycle'] + + batch_size = len(gt_labels_3d) + + # gt_label = gt_labels_3d[0] + # gt_attr_label = gt_attr_labels[0] + + gt_fut_trajs_bz_list = [] + + for bz in range(batch_size): + gt_fut_trajs_list = [] + gt_label = gt_labels_3d[bz] + gt_attr_label = gt_attr_labels[bz] + for i in range(gt_label.shape[0]): + gt_label[i] = 0 if gt_label[i] in veh_list else gt_label[i] + box_name = mapped_class_names[gt_label[i]] + if box_name in ignore_list: + continue + gt_fut_masks = gt_attr_label[i][self.fut_ts * 2:self.fut_ts * 3] + num_valid_ts = sum(gt_fut_masks == 1) + gt_fut_traj = gt_attr_label[i][:self.fut_ts * 2].reshape(-1, 2) + gt_fut_traj = gt_fut_traj[:num_valid_ts] + if gt_fut_traj.shape[0] == 0: + gt_fut_traj = torch.zeros([self.fut_ts - gt_fut_traj.shape[0], 2], device=device) + if gt_fut_traj.shape[0] < self.fut_ts: + gt_fut_traj = torch.cat((gt_fut_traj, torch.zeros([self.fut_ts - gt_fut_traj.shape[0], 2], device=device)), 0) + gt_fut_trajs_list.append(gt_fut_traj) + + + if len(gt_fut_trajs_list) != 0 & len(gt_fut_trajs_list) < agent_dim: + gt_fut_trajs = torch.cat( + (torch.stack(gt_fut_trajs_list), torch.zeros([agent_dim - len(gt_fut_trajs_list), self.fut_ts, 2], device=device)), 0) + else: + gt_fut_trajs = torch.zeros([agent_dim, self.fut_ts, 2], device=device) + + gt_fut_trajs_bz_list.append(gt_fut_trajs) + + if len(gt_fut_trajs_bz_list) != 0: + gt_trajs = torch.cat((torch.stack(gt_fut_trajs_bz_list).repeat(1, 6, 1, 1), ego_fut_trajs), dim=1) + else: + gt_trajs = ego_fut_trajs + #future_states = gt_trajs.reshape(batch_size, gt_trajs.shape[1], -1) + + + + # [bz, a, t, 2] + return 
gt_trajs.reshape(batch_size, gt_trajs.shape[1], -1) + + + def future_states_predict(self, batch_size, sample, hidden_states, current_states): + + future_prediction_input = sample.unsqueeze(0).expand(self.fut_ts, -1, -1, -1) + # + # future_states = self.future_prediction(future_prediction_input, hidden_state) + future_prediction_input = future_prediction_input.reshape(self.fut_ts, -1, self.latent_dim) + + hidden_state = hidden_states.reshape(self.layer_dim, -1, int(self.embed_dims/2)) + # future_states, future_hidden = self.state_gru(future_prediction_input, hidden_state) + future_states = self.predict_model(future_prediction_input, hidden_state) + + current_states_hs = current_states.unsqueeze(0).repeat(6, 1, 1, 1) + future_states_hs = future_states.reshape(self.fut_ts, batch_size, -1, future_states.shape[2]) + states_hs = torch.cat((current_states_hs, future_states_hs), dim=-1) + + return states_hs, future_states_hs + + + + + diff --git a/GenAD-main/projects/mmdet3d_plugin/models/backbones/__init__.py b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cea72f55c89cbc3d57bc9ae58e74144b27cc0530 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__init__.py @@ -0,0 +1,3 @@ +from .vovnet import VoVNet + +__all__ = ['VoVNet'] \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91c9b0f6b6a7c9524a3e8f960a2ec1b4fadbaa56 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/vovnet.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/vovnet.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae0009ed23e90f10d68e2d26b85cc595938a2ae2 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/vovnet.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/backbones/vovnet.py b/GenAD-main/projects/mmdet3d_plugin/models/backbones/vovnet.py new file mode 100644 index 0000000000000000000000000000000000000000..879d186a37b49addaf27362cc6ae1e5465b2168e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/backbones/vovnet.py @@ -0,0 +1,375 @@ + +from collections import OrderedDict +from mmcv.runner import BaseModule +from mmdet.models.builder import BACKBONES +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.batchnorm import _BatchNorm + + +VoVNet19_slim_dw_eSE = { + 'stem': [64, 64, 64], + 'stage_conv_ch': [64, 80, 96, 112], + 'stage_out_ch': [112, 256, 384, 512], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": True +} + +VoVNet19_dw_eSE = { + 'stem': [64, 64, 64], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": True +} + +VoVNet19_slim_eSE = { + 'stem': [64, 64, 128], + 'stage_conv_ch': [64, 80, 96, 112], + 'stage_out_ch': [112, 256, 384, 512], + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + "dw": False +} + +VoVNet19_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + 
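+    # Each spec describes: stem channel widths ("stem"), per-stage OSA conv
+    # channels ("stage_conv_ch"), per-stage concatenated output channels
+    # ("stage_out_ch"), conv layers per OSA block, OSA blocks per stage, and
+    # whether eSE attention ("eSE") and depthwise convs ("dw") are used.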
"stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": False +} + +VoVNet39_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 1, 2, 2], + "eSE": True, + "dw": False +} + +VoVNet57_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 1, 4, 3], + "eSE": True, + "dw": False +} + +VoVNet99_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 3, 9, 3], + "eSE": True, + "dw": False +} + +_STAGE_SPECS = { + "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE, + "V-19-dw-eSE": VoVNet19_dw_eSE, + "V-19-slim-eSE": VoVNet19_slim_eSE, + "V-19-eSE": VoVNet19_eSE, + "V-39-eSE": VoVNet39_eSE, + "V-57-eSE": VoVNet57_eSE, + "V-99-eSE": VoVNet99_eSE, +} + + +def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): + """3x3 convolution with padding""" + return [ + ( + '{}_{}/dw_conv3x3'.format(module_name, postfix), + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=out_channels, + bias=False + ) + ), + ( + '{}_{}/pw_conv1x1'.format(module_name, postfix), + nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) + ), + ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), + ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), + ] + + +def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): + """3x3 convolution with padding""" + return [ + ( + f"{module_name}_{postfix}/conv", + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ), + (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), + (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + ] + + +def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): + """1x1 convolution with padding""" + return [ + ( + f"{module_name}_{postfix}/conv", + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ), + (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), + (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + ] + + +class Hsigmoid(nn.Module): + def __init__(self, inplace=True): + super(Hsigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + return F.relu6(x + 3.0, inplace=self.inplace) / 6.0 + + +class eSEModule(nn.Module): + def __init__(self, channel, reduction=4): + super(eSEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0) + self.hsigmoid = Hsigmoid() + + def forward(self, x): + input = x + x = self.avg_pool(x) + x = self.fc(x) + x = self.hsigmoid(x) + return input * x + + +class _OSA_module(nn.Module): + def __init__( + self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False + ): + + super(_OSA_module, self).__init__() + + self.identity = identity + self.depthwise = depthwise + self.isReduced = False + self.layers = 
nn.ModuleList() + in_channel = in_ch + if self.depthwise and in_channel != stage_ch: + self.isReduced = True + self.conv_reduction = nn.Sequential( + OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0")) + ) + for i in range(layer_per_block): + if self.depthwise: + self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i)))) + else: + self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) + in_channel = stage_ch + + # feature aggregation + in_channel = in_ch + layer_per_block * stage_ch + self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) + + self.ese = eSEModule(concat_ch) + + def forward(self, x): + + identity_feat = x + + output = [] + output.append(x) + if self.depthwise and self.isReduced: + x = self.conv_reduction(x) + for layer in self.layers: + x = layer(x) + output.append(x) + + x = torch.cat(output, dim=1) + xt = self.concat(x) + + xt = self.ese(xt) + + if self.identity: + xt = xt + identity_feat + + return xt + + +class _OSA_stage(nn.Sequential): + def __init__( + self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False + ): + + super(_OSA_stage, self).__init__() + + if not stage_num == 2: + self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) + + if block_per_stage != 1: + SE = False + module_name = f"OSA{stage_num}_1" + self.add_module( + module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) + ) + for i in range(block_per_stage - 1): + if i != block_per_stage - 2: # last block + SE = False + module_name = f"OSA{stage_num}_{i + 2}" + self.add_module( + module_name, + _OSA_module( + concat_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE, + identity=True, + depthwise=depthwise + ), + ) + + +@BACKBONES.register_module() +class VoVNet(BaseModule): + def __init__(self, spec_name, input_ch=3, out_features=None, + frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): + """ + Args: + input_ch(int) : the number of input channel + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "stage2" ... 
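+            frozen_stages (int): Stages to be frozen (stop grad and set eval
+                mode). -1 means not freezing any parameters. Default: -1.
+            norm_eval (bool): Whether to set norm layers to eval mode, i.e.
+                freeze running stats (mean and var). Default: True.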
+ """ + super(VoVNet, self).__init__(init_cfg) + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + stage_specs = _STAGE_SPECS[spec_name] + + stem_ch = stage_specs["stem"] + config_stage_ch = stage_specs["stage_conv_ch"] + config_concat_ch = stage_specs["stage_out_ch"] + block_per_stage = stage_specs["block_per_stage"] + layer_per_block = stage_specs["layer_per_block"] + SE = stage_specs["eSE"] + depthwise = stage_specs["dw"] + + self._out_features = out_features + + # Stem module + conv_type = dw_conv3x3 if depthwise else conv3x3 + stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) + stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) + stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) + self.add_module("stem", nn.Sequential((OrderedDict(stem)))) + current_stirde = 4 + self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} + self._out_feature_channels = {"stem": stem_ch[2]} + + stem_out_ch = [stem_ch[2]] + in_ch_list = stem_out_ch + config_concat_ch[:-1] + # OSA stages + self.stage_names = [] + for i in range(4): # num_stages + name = "stage%d" % (i + 2) # stage 2 ... stage 5 + self.stage_names.append(name) + self.add_module( + name, + _OSA_stage( + in_ch_list[i], + config_stage_ch[i], + config_concat_ch[i], + block_per_stage[i], + layer_per_block, + i + 2, + SE, + depthwise, + ), + ) + + self._out_feature_channels[name] = config_concat_ch[i] + if not i == 0: + self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) + + # initialize weights + # self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + + def forward(self, x): + outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name in self.stage_names: + x = getattr(self, name)(x) + if name in self._out_features: + outputs[name] = x + + return outputs + + def _freeze_stages(self): + if self.frozen_stages >= 0: + m = getattr(self, 'stem') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'stage{i+1}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(VoVNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/hooks/__init__.py b/GenAD-main/projects/mmdet3d_plugin/models/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..93b13c9c853d6f7eece8ae2dc7aa67d4e87db68b --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/hooks/__init__.py @@ -0,0 +1 @@ +from .hooks import GradChecker \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/hooks/hooks.py b/GenAD-main/projects/mmdet3d_plugin/models/hooks/hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..56ff7fd575c890e60ce49eb618df157b2cc2ca37 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/hooks/hooks.py @@ -0,0 +1,13 @@ +from 
mmcv.runner.hooks.hook import HOOKS, Hook +from projects.mmdet3d_plugin.models.utils import run_time + + +@HOOKS.register_module() +class GradChecker(Hook): + + def after_train_iter(self, runner): + for key, val in runner.model.named_parameters(): + if val.grad == None and val.requires_grad: + print('WARNNING: {key}\'s parameters are not be used!!!!'.format(key=key)) + + diff --git a/GenAD-main/projects/mmdet3d_plugin/models/opt/__init__.py b/GenAD-main/projects/mmdet3d_plugin/models/opt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7dd426868a61772bbe0926e435ce89f15009805 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/opt/__init__.py @@ -0,0 +1 @@ +from .adamw import AdamW2 \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0caad3f875c313e457a7bf915c801f2de5548680 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/adamw.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/adamw.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..114bab1eb6063f7479fc71e7c37748b6f26e792b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/adamw.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/opt/adamw.py b/GenAD-main/projects/mmdet3d_plugin/models/opt/adamw.py new file mode 100644 index 0000000000000000000000000000000000000000..c890aeaf04721580c11ca329f2be09a6a280f773 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/opt/adamw.py @@ -0,0 +1,131 @@ +try: + from torch.optim import _functional as F +except: + print('WARNING!!!, I recommend using torch>=1.8') + +import torch +from torch.optim.optimizer import Optimizer +from mmcv.runner.optimizer.builder import OPTIMIZERS + +@OPTIMIZERS.register_module() +class AdamW2(Optimizer): + r"""Implements AdamW algorithm. Solve the bug of torch 1.8 + + The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. + The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 1e-2) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. 
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW2, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW2, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + + # put this line here for solving bug + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + F.adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad, + beta1, + beta2, + group['lr'], + group['weight_decay'], + group['eps']) + + return loss \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__init__.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..96e2423a7dff3be63b65178827c2bab4dd5c398d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/__init__.py @@ -0,0 +1,6 @@ + +from .bricks import run_time +from .grid_mask import GridMask +from .position_embedding import RelPositionEmbedding +from .visual import save_tensor +from .embed import PatchEmbed \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81f413b0efef621e6f19ab2a48ac4bb93040df7a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/bricks.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/bricks.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..828028b1afbc40652c78b6546de08207f09a0b05 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/bricks.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/embed.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/embed.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55363d515578bb6d682a1258873e479f2e73f197 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/embed.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/grid_mask.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/grid_mask.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4e75ae91e3296dd493acd20964edec62477c26b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/grid_mask.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/position_embedding.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/position_embedding.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec80fc4ccae70264b92ea6ac20b2fd80984913ad Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/position_embedding.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/visual.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/visual.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6376205ba2361a2131109d43e0ce73eca0fe87c Binary files 
/dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/visual.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/bricks.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..fd458813d9ffced23b79799daa84150ba887774e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/bricks.py @@ -0,0 +1,20 @@ +import functools +import time +from collections import defaultdict +import torch +time_maps = defaultdict(lambda :0.) +count_maps = defaultdict(lambda :0.) +def run_time(name): + def middle(fn): + def wrapper(*args, **kwargs): + torch.cuda.synchronize() + start = time.time() + res = fn(*args, **kwargs) + torch.cuda.synchronize() + time_maps['%s : %s'%(name, fn.__name__) ] += time.time()-start + count_maps['%s : %s'%(name, fn.__name__) ] +=1 + print("%s : %s takes up %f "% (name, fn.__name__,time_maps['%s : %s'%(name, fn.__name__) ] /count_maps['%s : %s'%(name, fn.__name__) ] )) + return res + return wrapper + return middle + \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/embed.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/embed.py new file mode 100644 index 0000000000000000000000000000000000000000..2dbebfe41d7138baaaf112b815b44143fce7a170 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/embed.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule +from torch.nn.modules.utils import _pair as to_2tuple + + +# Modified from Pytorch-Image-Models +class PatchEmbed(BaseModule): + """Image to Patch Embedding V2. + + We use a conv layer to implement PatchEmbed. + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (dict, optional): The config dict for conv layers type + selection. Default: None. + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: None (Default to be equal with kernel_size). + padding (int): The padding length of embedding conv. Default: 0. + dilation (int): The dilation rate of embedding conv. Default: 1. + pad_to_patch_size (bool, optional): Whether to pad feature map shape + to multiple patch size. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type=None, + kernel_size=16, + stride=16, + padding=0, + dilation=1, + pad_to_patch_size=True, + norm_cfg=None, + init_cfg=None): + super(PatchEmbed, self).__init__() + + self.embed_dims = embed_dims + self.init_cfg = init_cfg + + if stride is None: + stride = kernel_size + + self.pad_to_patch_size = pad_to_patch_size + + # The default setting of patch size is equal to kernel size. 
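+        # patch_size is normalized to an (h, w) tuple below; when
+        # pad_to_patch_size is True, forward() pads the input so that H and W
+        # become multiples of the patch size before projection.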
+ patch_size = kernel_size + if isinstance(patch_size, int): + patch_size = to_2tuple(patch_size) + elif isinstance(patch_size, tuple): + if len(patch_size) == 1: + patch_size = to_2tuple(patch_size[0]) + assert len(patch_size) == 2, \ + f'The size of patch should have length 1 or 2, ' \ + f'but got {len(patch_size)}' + + self.patch_size = patch_size + + # Use conv layer to embed + conv_type = conv_type or 'Conv2d' + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + def forward(self, x): + H, W = x.shape[2], x.shape[3] + + # TODO: Process overlapping op + if self.pad_to_patch_size: + # Modify H, W to multiple of patch size. + if H % self.patch_size[0] != 0: + x = F.pad( + x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + if W % self.patch_size[1] != 0: + x = F.pad( + x, (0, self.patch_size[1] - W % self.patch_size[1], 0, 0)) + + x = self.projection(x) + self.DH, self.DW = x.shape[2], x.shape[3] + x = x.flatten(2).transpose(1, 2) + + if self.norm is not None: + x = self.norm(x) + + return x \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/grid_mask.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/grid_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..77f77b2314176bf416c447913bcfb482baab02e8 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/grid_mask.py @@ -0,0 +1,125 @@ +import torch +import torch.nn as nn +import numpy as np +from PIL import Image +from mmcv.runner import force_fp32, auto_fp16 + +class Grid(object): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode=mode + self.st_prob = prob + self.prob = prob + + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch + + def __call__(self, img, label): + if np.random.rand() > self.prob: + return img, label + h = img.size(1) + w = img.size(2) + self.d1 = 2 + self.d2 = min(h, w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(self.d1, self.d2) + if self.ratio == 1: + self.l = np.random.randint(1, d) + else: + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + mask = torch.from_numpy(mask).float() + if self.mode == 1: + mask = 1-mask + + mask = mask.expand_as(img) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() + offset = (1 - mask) * offset + img = img * mask + offset + else: + img = img * mask + + return img, label + + +class GridMask(nn.Module): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = 
offset + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + self.fp16_enable = False + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 + @auto_fp16() + def forward(self, x): + if np.random.rand() > self.prob or not self.training: + return x + n,c,h,w = x.size() + x = x.view(-1,h,w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(2, h) + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + # mask = torch.from_numpy(mask).to(x.dtype).cuda() + mask = torch.from_numpy(mask).to(x.dtype).to(x.device) + if self.mode == 1: + mask = 1-mask + mask = mask.expand_as(x) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).to(x.dtype).cuda() + x = x * mask + offset * (1 - mask) + else: + x = x * mask + + return x.view(n,c,h,w) \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/position_embedding.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/position_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..7cb9309104cccf0586010de222e4cc307c3c848b --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/position_embedding.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn +import math + +class RelPositionEmbedding(nn.Module): + def __init__(self, num_pos_feats=64, pos_norm=True): + super().__init__() + self.num_pos_feats = num_pos_feats + self.fc = nn.Linear(4, self.num_pos_feats,bias=False) + #nn.init.orthogonal_(self.fc.weight) + #self.fc.weight.requires_grad = False + self.pos_norm = pos_norm + if self.pos_norm: + self.norm = nn.LayerNorm(self.num_pos_feats) + def forward(self, tensor): + #mask = nesttensor.mask + B,C,H,W = tensor.shape + #print('tensor.shape', tensor.shape) + y_range = (torch.arange(H) / float(H - 1)).to(tensor.device) + #y_axis = torch.stack((y_range, 1-y_range),dim=1) + y_axis = torch.stack((torch.cos(y_range * math.pi), torch.sin(y_range * math.pi)), dim=1) + y_axis = y_axis.reshape(H, 1, 2).repeat(1, W, 1).reshape(H * W, 2) + + x_range = (torch.arange(W) / float(W - 1)).to(tensor.device) + #x_axis =torch.stack((x_range,1-x_range),dim=1) + x_axis = torch.stack((torch.cos(x_range * math.pi), torch.sin(x_range * math.pi)), dim=1) + x_axis = x_axis.reshape(1, W, 2).repeat(H, 1, 1).reshape(H * W, 2) + x_pos = torch.cat((y_axis, x_axis), dim=1) + x_pos = self.fc(x_pos) + + if self.pos_norm: + x_pos = self.norm(x_pos) + #print('xpos,', x_pos.max(),x_pos.min()) + return x_pos + + +class SineEmbedding(nn.Module): + def __init__(self, in_channels, N_freqs, logscale=True): + """ + Defines a function that embeds x to (x, sin(2^k x), cos(2^k x), ...) 
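+        (note: as implemented, only the sin/cos terms are returned; the raw x
+        term is not appended, so out_channels = in_channels * 2 * N_freqs)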
+ in_channels: number of input channels + """ + super(SineEmbedding, self).__init__() + self.N_freqs = N_freqs + self.in_channels = in_channels + self.funcs = [torch.sin, torch.cos] + self.out_channels = in_channels*(len(self.funcs)*N_freqs) + + if logscale: + self.freq_bands = 2**torch.linspace(0, N_freqs-1, N_freqs) + else: + self.freq_bands = torch.linspace(1, 2**(N_freqs-1), N_freqs) + + def forward(self, x): + """ + Embeds x to (sin(2^k x), cos(2^k x), ...) + Inputs: + x: (B, self.in_channels) + Outputs: + out: (B, self.out_channels) + """ + out = [] + for freq in self.freq_bands: + for func in self.funcs: + out += [func(freq*x)] + + return torch.cat(out, -1) + + +# if __name__ == '__main__': +# pe = Embedding(in_channels=2, N_freqs=64) +# x_pe = pe(torch.randn(1, 4, 2)) +# a = 0 \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/visual.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/visual.py new file mode 100644 index 0000000000000000000000000000000000000000..f9718afea9e67199c77da8ecf33249a28197082a --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/visual.py @@ -0,0 +1,24 @@ +import torch +from torchvision.utils import make_grid +import torchvision +import matplotlib.pyplot as plt +import cv2 + + +def convert_color(img_path): + plt.figure() + img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) + plt.imsave(img_path, img, cmap=plt.get_cmap('viridis')) + plt.close() + + +def save_tensor(tensor, path, pad_value=254.0,): + print('save_tensor', path) + tensor = tensor.to(torch.float).detach().cpu() + if tensor.type() == 'torch.BoolTensor': + tensor = tensor*255 + if len(tensor.shape) == 3: + tensor = tensor.unsqueeze(1) + tensor = make_grid(tensor, pad_value=pad_value, normalize=False).permute(1, 2, 0).numpy().copy() + torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path) + convert_color(path) diff --git a/GenAD-main/requirements.txt b/GenAD-main/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3b8914733d4daf2461a7f2c2a087398894bf539 --- /dev/null +++ b/GenAD-main/requirements.txt @@ -0,0 +1,221 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +_libgcc_mutex=0.1=main +_openmp_mutex=5.1=1_gnu +absl-py=1.4.0=pypi_0 +addict=2.4.0=pypi_0 +aliyun-python-sdk-core=2.14.0=pypi_0 +aliyun-python-sdk-kms=2.16.2=pypi_0 +anyio=4.0.0=pypi_0 +argon2-cffi=23.1.0=pypi_0 +argon2-cffi-bindings=21.2.0=pypi_0 +arrow=1.2.3=pypi_0 +asttokens=2.4.0=pypi_0 +async-lru=2.0.4=pypi_0 +attrs=23.1.0=pypi_0 +babel=2.12.1=pypi_0 +backcall=0.2.0=pypi_0 +beautifulsoup4=4.12.2=pypi_0 +black=23.7.0=pypi_0 +bleach=6.0.0=pypi_0 +ca-certificates=2023.08.22=h06a4308_0 +cachetools=5.3.1=pypi_0 +certifi=2023.7.22=pypi_0 +cffi=1.15.1=pypi_0 +charset-normalizer=3.2.0=pypi_0 +click=8.1.7=pypi_0 +colorama=0.4.6=pypi_0 +comm=0.1.4=pypi_0 +contourpy=1.1.0=pypi_0 +crcmod=1.7=pypi_0 +cryptography=42.0.0=pypi_0 +cycler=0.11.0=pypi_0 +cython=3.0.2=pypi_0 +debugpy=1.7.0=pypi_0 +decorator=5.1.1=pypi_0 +defusedxml=0.7.1=pypi_0 +descartes=1.1.0=pypi_0 +exceptiongroup=1.1.3=pypi_0 +executing=1.2.0=pypi_0 +fastjsonschema=2.18.0=pypi_0 +filelock=3.12.3=pypi_0 +fire=0.5.0=pypi_0 +flake8=6.1.0=pypi_0 +fonttools=4.42.1=pypi_0 +fqdn=1.5.1=pypi_0 +fsspec=2023.9.0=pypi_0 +google-auth=2.22.0=pypi_0 +google-auth-oauthlib=1.0.0=pypi_0 +grpcio=1.58.0=pypi_0 +huggingface-hub=0.16.4=pypi_0 +idna=3.4=pypi_0 +imageio=2.31.3=pypi_0 +importlib-metadata=6.8.0=pypi_0 
+importlib-resources=6.0.1=pypi_0 +iniconfig=2.0.0=pypi_0 +ipykernel=6.25.2=pypi_0 +ipython=8.12.2=pypi_0 +ipython-genutils=0.2.0=pypi_0 +ipywidgets=8.1.0=pypi_0 +isoduration=20.11.0=pypi_0 +jedi=0.19.0=pypi_0 +jinja2=3.1.2=pypi_0 +jmespath=0.10.0=pypi_0 +joblib=1.3.2=pypi_0 +json5=0.9.14=pypi_0 +jsonpointer=2.4=pypi_0 +jsonschema=4.19.0=pypi_0 +jsonschema-specifications=2023.7.1=pypi_0 +jupyter=1.0.0=pypi_0 +jupyter-client=8.3.1=pypi_0 +jupyter-console=6.6.3=pypi_0 +jupyter-core=5.3.1=pypi_0 +jupyter-events=0.7.0=pypi_0 +jupyter-lsp=2.2.0=pypi_0 +jupyter-server=2.7.3=pypi_0 +jupyter-server-terminals=0.4.4=pypi_0 +jupyterlab=4.0.5=pypi_0 +jupyterlab-pygments=0.2.2=pypi_0 +jupyterlab-server=2.24.0=pypi_0 +jupyterlab-widgets=3.0.8=pypi_0 +kiwisolver=1.4.5=pypi_0 +ld_impl_linux-64=2.38=h1181459_1 +libffi=3.4.4=h6a678d5_0 +libgcc-ng=11.2.0=h1234567_1 +libgomp=11.2.0=h1234567_1 +libstdcxx-ng=11.2.0=h1234567_1 +llvmlite=0.31.0=pypi_0 +lyft-dataset-sdk=0.0.8=pypi_0 +markdown=3.4.4=pypi_0 +markdown-it-py=3.0.0=pypi_0 +markupsafe=2.1.3=pypi_0 +matplotlib=3.5.2=pypi_0 +matplotlib-inline=0.1.6=pypi_0 +mccabe=0.7.0=pypi_0 +mdurl=0.1.2=pypi_0 +mistune=3.0.1=pypi_0 +mmcv-full=1.4.0=pypi_0 +mmdet=2.14.0=pypi_0 +mmdet3d=0.17.1=dev_0 +mmsegmentation=0.14.1=pypi_0 +model-index=0.1.11=pypi_0 +mypy-extensions=1.0.0=pypi_0 +nbclient=0.8.0=pypi_0 +nbconvert=7.8.0=pypi_0 +nbformat=5.9.2=pypi_0 +ncurses=6.4=h6a678d5_0 +nest-asyncio=1.5.7=pypi_0 +networkx=2.2=pypi_0 +notebook=7.0.3=pypi_0 +notebook-shim=0.2.3=pypi_0 +numba=0.48.0=pypi_0 +numpy=1.19.5=pypi_0 +nuscenes-devkit=1.1.9=pypi_0 +oauthlib=3.2.2=pypi_0 +opencv-python=4.8.0.76=pypi_0 +opendatalab=0.0.10=pypi_0 +openmim=0.3.9=pypi_0 +openssl=3.0.10=h7f8727e_2 +openxlab=0.0.34=pypi_0 +ordered-set=4.1.0=pypi_0 +oss2=2.17.0=pypi_0 +overrides=7.4.0=pypi_0 +packaging=23.1=pypi_0 +pandas=1.4.4=pypi_0 +pandocfilters=1.5.0=pypi_0 +parso=0.8.3=pypi_0 +pathspec=0.11.2=pypi_0 +pexpect=4.8.0=pypi_0 +pickleshare=0.7.5=pypi_0 +pillow=10.0.0=pypi_0 +pip=22.1=pypi_0 +pkgutil-resolve-name=1.3.10=pypi_0 +platformdirs=3.10.0=pypi_0 +plotly=5.16.1=pypi_0 +pluggy=1.3.0=pypi_0 +plyfile=1.0.1=pypi_0 +prettytable=3.8.0=pypi_0 +prometheus-client=0.17.1=pypi_0 +prompt-toolkit=3.0.39=pypi_0 +protobuf=4.24.3=pypi_0 +psutil=5.9.5=pypi_0 +ptyprocess=0.7.0=pypi_0 +pure-eval=0.2.2=pypi_0 +pyasn1=0.5.0=pypi_0 +pyasn1-modules=0.3.0=pypi_0 +pycocotools=2.0.7=pypi_0 +pycodestyle=2.11.0=pypi_0 +pycparser=2.21=pypi_0 +pycryptodome=3.20.0=pypi_0 +pyflakes=3.1.0=pypi_0 +pygments=2.16.1=pypi_0 +pyparsing=3.0.9=pypi_0 +pyquaternion=0.9.9=pypi_0 +pytest=7.4.2=pypi_0 +python=3.8.17=h955ad1f_0 +python-dateutil=2.8.2=pypi_0 +python-json-logger=2.0.7=pypi_0 +pytz=2023.3.post1=pypi_0 +pywavelets=1.4.1=pypi_0 +pyyaml=6.0.1=pypi_0 +pyzmq=25.1.1=pypi_0 +qtconsole=5.4.4=pypi_0 +qtpy=2.4.0=pypi_0 +readline=8.2=h5eee18b_0 +referencing=0.30.2=pypi_0 +requests=2.28.2=pypi_0 +requests-oauthlib=1.3.1=pypi_0 +rfc3339-validator=0.1.4=pypi_0 +rfc3986-validator=0.1.1=pypi_0 +rich=13.4.2=pypi_0 +rpds-py=0.10.2=pypi_0 +rsa=4.9=pypi_0 +safetensors=0.3.3=pypi_0 +scikit-image=0.19.3=pypi_0 +scikit-learn=1.3.0=pypi_0 +scipy=1.10.1=pypi_0 +send2trash=1.8.2=pypi_0 +setuptools=59.5.0=pypi_0 +shapely=1.8.5=pypi_0 +similaritymeasures=1.0.0=pypi_0 +six=1.16.0=pypi_0 +sniffio=1.3.0=pypi_0 +soupsieve=2.5=pypi_0 +sqlite=3.41.2=h5eee18b_0 +stack-data=0.6.2=pypi_0 +tabulate=0.9.0=pypi_0 +tenacity=8.2.3=pypi_0 +tensorboard=2.14.0=pypi_0 +tensorboard-data-server=0.7.1=pypi_0 +termcolor=2.3.0=pypi_0 +terminado=0.17.1=pypi_0 
+terminaltables=3.1.10=pypi_0 +threadpoolctl=3.2.0=pypi_0 +tifffile=2023.7.10=pypi_0 +timm=0.9.7=pypi_0 +tinycss2=1.2.1=pypi_0 +tk=8.6.12=h1ccaba5_0 +tomli=2.0.1=pypi_0 +torch=1.9.1+cu111=pypi_0 +torchaudio=0.9.1=pypi_0 +torchstat=0.0.7=pypi_0 +torchvision=0.10.1+cu111=pypi_0 +tornado=6.3.3=pypi_0 +tqdm=4.65.2=pypi_0 +traitlets=5.9.0=pypi_0 +trimesh=2.35.39=pypi_0 +typing-extensions=4.7.1=pypi_0 +uri-template=1.3.0=pypi_0 +urllib3=1.26.16=pypi_0 +wcwidth=0.2.6=pypi_0 +webcolors=1.13=pypi_0 +webencodings=0.5.1=pypi_0 +websocket-client=1.6.2=pypi_0 +werkzeug=2.3.7=pypi_0 +wheel=0.38.4=py38h06a4308_0 +widgetsnbextension=4.0.8=pypi_0 +xz=5.4.2=h5eee18b_0 +yapf=0.40.1=pypi_0 +zipp=3.16.2=pypi_0 +zlib=1.2.13=h5eee18b_0 diff --git a/GenAD-main/tools/analysis_tools/__init__.py b/GenAD-main/tools/analysis_tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/GenAD-main/tools/analysis_tools/analyze_logs.py b/GenAD-main/tools/analysis_tools/analyze_logs.py new file mode 100644 index 0000000000000000000000000000000000000000..806175f34c0ce6c535167cc7db8470c69a6e243d --- /dev/null +++ b/GenAD-main/tools/analysis_tools/analyze_logs.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import numpy as np +import seaborn as sns +from collections import defaultdict +from matplotlib import pyplot as plt + + +def cal_train_time(log_dicts, args): + for i, log_dict in enumerate(log_dicts): + print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}') + all_times = [] + for epoch in log_dict.keys(): + if args.include_outliers: + all_times.append(log_dict[epoch]['time']) + else: + all_times.append(log_dict[epoch]['time'][1:]) + all_times = np.array(all_times) + epoch_ave_time = all_times.mean(-1) + slowest_epoch = epoch_ave_time.argmax() + fastest_epoch = epoch_ave_time.argmin() + std_over_epoch = epoch_ave_time.std() + print(f'slowest epoch {slowest_epoch + 1}, ' + f'average time is {epoch_ave_time[slowest_epoch]:.4f}') + print(f'fastest epoch {fastest_epoch + 1}, ' + f'average time is {epoch_ave_time[fastest_epoch]:.4f}') + print(f'time std over epochs is {std_over_epoch:.4f}') + print(f'average iter time: {np.mean(all_times):.4f} s/iter') + print() + + +def plot_curve(log_dicts, args): + if args.backend is not None: + plt.switch_backend(args.backend) + sns.set_style(args.style) + # if legend is None, use {filename}_{key} as legend + legend = args.legend + if legend is None: + legend = [] + for json_log in args.json_logs: + for metric in args.keys: + legend.append(f'{json_log}_{metric}') + assert len(legend) == (len(args.json_logs) * len(args.keys)) + metrics = args.keys + + num_metrics = len(metrics) + for i, log_dict in enumerate(log_dicts): + epochs = list(log_dict.keys()) + for j, metric in enumerate(metrics): + print(f'plot curve of {args.json_logs[i]}, metric is {metric}') + if metric not in log_dict[epochs[args.interval - 1]]: + raise KeyError( + f'{args.json_logs[i]} does not contain metric {metric}') + + if args.mode == 'eval': + if min(epochs) == args.interval: + x0 = args.interval + else: + # if current training is resumed from previous checkpoint + # we lost information in early epochs + # `xs` should start according to `min(epochs)` + if min(epochs) % args.interval == 0: + x0 = min(epochs) + else: + # find the first epoch that do eval + x0 = min(epochs) + args.interval - \ + min(epochs) % args.interval + xs = np.arange(x0, max(epochs) + 1, args.interval) + 
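+                # collect the eval metric logged at each eval epoch; if training
+                # stopped before the last eval, the trailing x tick is dropped below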
ys = [] + for epoch in epochs[args.interval - 1::args.interval]: + ys += log_dict[epoch][metric] + + # if training is aborted before eval of the last epoch + # `xs` and `ys` will have different length and cause an error + # check if `ys[-1]` is empty here + if not log_dict[epoch][metric]: + xs = xs[:-1] + + ax = plt.gca() + ax.set_xticks(xs) + plt.xlabel('epoch') + plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o') + else: + xs = [] + ys = [] + num_iters_per_epoch = \ + log_dict[epochs[args.interval-1]]['iter'][-1] + for epoch in epochs[args.interval - 1::args.interval]: + iters = log_dict[epoch]['iter'] + if log_dict[epoch]['mode'][-1] == 'val': + iters = iters[:-1] + xs.append( + np.array(iters) + (epoch - 1) * num_iters_per_epoch) + ys.append(np.array(log_dict[epoch][metric][:len(iters)])) + xs = np.concatenate(xs) + ys = np.concatenate(ys) + plt.xlabel('iter') + plt.plot( + xs, ys, label=legend[i * num_metrics + j], linewidth=0.5) + plt.legend() + if args.title is not None: + plt.title(args.title) + if args.out is None: + plt.show() + else: + print(f'save curve to: {args.out}') + plt.savefig(args.out) + plt.cla() + + +def add_plot_parser(subparsers): + parser_plt = subparsers.add_parser( + 'plot_curve', help='parser for plotting curves') + parser_plt.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_plt.add_argument( + '--keys', + type=str, + nargs='+', + default=['mAP_0.25'], + help='the metric that you want to plot') + parser_plt.add_argument('--title', type=str, help='title of figure') + parser_plt.add_argument( + '--legend', + type=str, + nargs='+', + default=None, + help='legend of each plot') + parser_plt.add_argument( + '--backend', type=str, default=None, help='backend of plt') + parser_plt.add_argument( + '--style', type=str, default='dark', help='style of plt') + parser_plt.add_argument('--out', type=str, default=None) + parser_plt.add_argument('--mode', type=str, default='train') + parser_plt.add_argument('--interval', type=int, default=1) + + +def add_time_parser(subparsers): + parser_time = subparsers.add_parser( + 'cal_train_time', + help='parser for computing the average time per training iteration') + parser_time.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_time.add_argument( + '--include-outliers', + action='store_true', + help='include the first value of every epoch when computing ' + 'the average time') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Analyze Json Log') + # currently only support plot curve and calculate average train time + subparsers = parser.add_subparsers(dest='task', help='task parser') + add_plot_parser(subparsers) + add_time_parser(subparsers) + args = parser.parse_args() + return args + + +def load_json_logs(json_logs): + # load and convert json_logs to log_dict, key is epoch, value is a sub dict + # keys of sub dict is different metrics, e.g. 
memory, bbox_mAP + # value of sub dict is a list of corresponding values of all iterations + log_dicts = [dict() for _ in json_logs] + for json_log, log_dict in zip(json_logs, log_dicts): + with open(json_log, 'r') as log_file: + for line in log_file: + log = json.loads(line.strip()) + # skip lines without `epoch` field + if 'epoch' not in log: + continue + epoch = log.pop('epoch') + if epoch not in log_dict: + log_dict[epoch] = defaultdict(list) + for k, v in log.items(): + log_dict[epoch][k].append(v) + return log_dicts + + +def main(): + args = parse_args() + + json_logs = args.json_logs + for json_log in json_logs: + assert json_log.endswith('.json') + + log_dicts = load_json_logs(json_logs) + + eval(args.task)(log_dicts, args) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/analysis_tools/benchmark.py b/GenAD-main/tools/analysis_tools/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..487a348935e3c949a8cde2c90a1747db769964c9 --- /dev/null +++ b/GenAD-main/tools/analysis_tools/benchmark.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import time +import torch +from mmcv import Config +from mmcv.parallel import MMDataParallel +from mmcv.runner import load_checkpoint, wrap_fp16_model +import sys +sys.path.append('.') +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from projects.mmdet3d_plugin.datasets import custom_build_dataset +# from mmdet3d.datasets import build_dataloader, build_dataset +from mmdet3d.models import build_detector +#from tools.misc.fuse_conv_bn import fuse_module + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMDet benchmark a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', default=None, help='checkpoint file') + parser.add_argument('--samples', default=2000, help='samples to benchmark') + parser.add_argument( + '--log-interval', default=50, help='interval of logging') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.pretrained = None + cfg.data.test.test_mode = True + + # build the dataloader + # TODO: support multiple images per gpu (only minor changes are needed) + print(cfg.data.test) + dataset = custom_build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=False, + shuffle=False) + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + if args.checkpoint is not None: + load_checkpoint(model, args.checkpoint, map_location='cpu') + #if args.fuse_conv_bn: + # model = fuse_module(model) + + model = MMDataParallel(model, device_ids=[0]) + + model.eval() + + # the first several iterations may be very slow so skip them + num_warmup = 5 + pure_inf_time = 0 + + # benchmark with several samples and take the average + for i, data in enumerate(data_loader): + torch.cuda.synchronize() + start_time = time.perf_counter() + with torch.no_grad(): + model(return_loss=False, rescale=True, 
**data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= num_warmup: + pure_inf_time += elapsed + if (i + 1) % args.log_interval == 0: + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Done image [{i + 1:<3}/ {args.samples}], ' + f'fps: {fps:.1f} img / s') + + if (i + 1) == args.samples: + pure_inf_time += elapsed + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Overall fps: {fps:.1f} img / s') + break + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/analysis_tools/get_flops.py b/GenAD-main/tools/analysis_tools/get_flops.py new file mode 100644 index 0000000000000000000000000000000000000000..1b9fb0163b2f749108d41f11b332d2bda1e71879 --- /dev/null +++ b/GenAD-main/tools/analysis_tools/get_flops.py @@ -0,0 +1,747 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import argparse + +import torch +from mmcv import Config, DictAction + +from mmdet3d.models import build_model +from mmdet3d.datasets import build_dataset +from projects.mmdet3d_plugin.datasets.builder import build_dataloader + +# try: +# from mmcv.cnn import get_model_complexity_info +# except ImportError: +# raise ImportError('Please upgrade mmcv to >0.6.2') + +import sys +sys.path.append('.') + + +from functools import partial + +import numpy as np +import torch +import torch.nn as nn + +import mmcv + + +def get_model_complexity_info(model, + data, + input_shape=(1280, 720), + print_per_layer_stat=True, + as_strings=True, + input_constructor=None, + flush=False, + ost=sys.stdout): + """Get complexity information of a model. + + This method can calculate FLOPs and parameter counts of a model with + corresponding input shape. It can also print complexity information for + each layer in a model. + + Supported layers are listed as below: + - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, + ``nn.ReLU6``. + - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. + - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + - Linear: ``nn.Linear``. + - Deconvolution: ``nn.ConvTranspose2d``. + - Upsample: ``nn.Upsample``. + + Args: + model (nn.Module): The model for complexity calculation. + input_shape (tuple): Input shape used for calculation. + print_per_layer_stat (bool): Whether to print complexity information + for each layer in a model. Default: True. + as_strings (bool): Output FLOPs and params counts in a string form. + Default: True. + input_constructor (None | callable): If specified, it takes a callable + method that generates input. otherwise, it will generate a random + tensor with input shape to calculate FLOPs. Default: None. + flush (bool): same as that in :func:`print`. Default: False. + ost (stream): same as ``file`` param in :func:`print`. + Default: sys.stdout. + + Returns: + tuple[float | str]: If ``as_strings`` is set to True, it will return + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. 
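+        Note: unlike ``mmcv.cnn.get_model_complexity_info``, this local copy
+        takes a prepared ``data`` batch and forwards the detector with its
+        ``img``/``img_metas``; the random tensor built from ``input_shape`` is
+        only used to pick the dtype and device.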
+ """ + + assert isinstance(model, nn.Module) + flops_model = add_flops_counting_methods(model) + flops_model.eval() + flops_model.start_flops_count() + if input_constructor: + input = input_constructor(input_shape) + _ = flops_model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (1, 6, 3, *input_shape), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device) + except StopIteration: + # Avoid StopIteration for models which have no parameters, + # like `nn.Relu()`, `nn.AvgPool2d`, etc. + batch = torch.ones(()).new_empty((1, 6, 3, *input_shape)) + + # img_metas = [data['img_metas'][0].data[0]] + # img = data['img'][0].data[0] + # points = data['points'][0].data[0][0] + # fut_valid_flag = data['fut_valid_flag'][0].data[0] + # img = img.to(batch.device) + # points = [points.to(batch.device)] + # ego_his_trajs = data['ego_his_trajs'][0].data[0].to(batch.device) + # ego_lcf_feat = data['ego_lcf_feat'][0].data[0].to(batch.device).unsqueeze(0) + + # _ = flops_model(rescale=True, img=img, img_metas=img_metas, points=points, + # fut_valid_flag=fut_valid_flag, ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat) + + img_metas = [data['img_metas'][0].data[0]] + img = data['img'][0].data[0] + img = img.to(batch.device) + + _ = flops_model(rescale=True, img=img, img_metas=img_metas) + + flops_count, params_count = flops_model.compute_average_flops_cost() + if print_per_layer_stat: + print_model_with_flops( + flops_model, flops_count, params_count, ost=ost, flush=flush) + flops_model.stop_flops_count() + + if as_strings: + return flops_to_string(flops_count), params_to_string(params_count) + + return flops_count, params_count + + +def flops_to_string(flops, units='GFLOPs', precision=2): + """Convert FLOPs number into a string. + + Note that Here we take a multiply-add counts as one FLOP. + + Args: + flops (float): FLOPs number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', + 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically + choose the most suitable unit for FLOPs. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted FLOPs number with units. + + Examples: + >>> flops_to_string(1e9) + '1.0 GFLOPs' + >>> flops_to_string(2e5, 'MFLOPs') + '0.2 MFLOPs' + >>> flops_to_string(3e-9, None) + '3e-09 FLOPs' + """ + if units is None: + if flops // 10**9 > 0: + return str(round(flops / 10.**9, precision)) + ' GFLOPs' + elif flops // 10**6 > 0: + return str(round(flops / 10.**6, precision)) + ' MFLOPs' + elif flops // 10**3 > 0: + return str(round(flops / 10.**3, precision)) + ' KFLOPs' + else: + return str(flops) + ' FLOPs' + else: + if units == 'GFLOPs': + return str(round(flops / 10.**9, precision)) + ' ' + units + elif units == 'MFLOPs': + return str(round(flops / 10.**6, precision)) + ' ' + units + elif units == 'KFLOPs': + return str(round(flops / 10.**3, precision)) + ' ' + units + else: + return str(flops) + ' FLOPs' + + +def params_to_string(num_params, units=None, precision=2): + """Convert parameter number into a string. + + Args: + num_params (float): Parameter number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'M', + 'K' and ''. If set to None, it will automatically choose the most + suitable unit for Parameter number. Default: None. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted parameter number with units. 
+ + Examples: + >>> params_to_string(1e9) + '1000.0 M' + >>> params_to_string(2e5) + '200.0 k' + >>> params_to_string(3e-9) + '3e-09' + """ + if units is None: + if num_params // 10**6 > 0: + return str(round(num_params / 10**6, precision)) + ' M' + elif num_params // 10**3: + return str(round(num_params / 10**3, precision)) + ' k' + else: + return str(num_params) + else: + if units == 'M': + return str(round(num_params / 10.**6, precision)) + ' ' + units + elif units == 'K': + return str(round(num_params / 10.**3, precision)) + ' ' + units + else: + return str(num_params) + + +def print_model_with_flops(model, + total_flops, + total_params, + units='GFLOPs', + precision=3, + ost=sys.stdout, + flush=False): + """Print a model with FLOPs for each layer. + + Args: + model (nn.Module): The model to be printed. + total_flops (float): Total FLOPs of the model. + total_params (float): Total parameter counts of the model. + units (str | None): Converted FLOPs units. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 3. + ost (stream): same as `file` param in :func:`print`. + Default: sys.stdout. + flush (bool): same as that in :func:`print`. Default: False. + + Example: + >>> class ExampleModel(nn.Module): + + >>> def __init__(self): + >>> super().__init__() + >>> self.conv1 = nn.Conv2d(3, 8, 3) + >>> self.conv2 = nn.Conv2d(8, 256, 3) + >>> self.conv3 = nn.Conv2d(256, 8, 3) + >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + >>> self.flatten = nn.Flatten() + >>> self.fc = nn.Linear(8, 1) + + >>> def forward(self, x): + >>> x = self.conv1(x) + >>> x = self.conv2(x) + >>> x = self.conv3(x) + >>> x = self.avg_pool(x) + >>> x = self.flatten(x) + >>> x = self.fc(x) + >>> return x + + >>> model = ExampleModel() + >>> x = (3, 16, 16) + to print the complexity information state for each layer, you can use + >>> get_model_complexity_info(model, x) + or directly use + >>> print_model_with_flops(model, 4579784.0, 37361) + ExampleModel( + 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, + (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 + (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) + (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) + (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) + (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) + (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) + ) + """ + + def accumulate_params(self): + if is_supported_instance(self): + return self.__params__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_params() + return sum + + def accumulate_flops(self): + if is_supported_instance(self): + return self.__flops__ / model.__batch_counter__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_flops() + return sum + + def flops_repr(self): + accumulated_num_params = self.accumulate_params() + accumulated_flops_cost = self.accumulate_flops() + return ', '.join([ + params_to_string( + accumulated_num_params, units='M', precision=precision), + '{:.3%} Params'.format(accumulated_num_params / total_params), + flops_to_string( + accumulated_flops_cost, units=units, precision=precision), + '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), + self.original_extra_repr() + ]) + + def 
add_extra_repr(m): + m.accumulate_flops = accumulate_flops.__get__(m) + m.accumulate_params = accumulate_params.__get__(m) + flops_extra_repr = flops_repr.__get__(m) + if m.extra_repr != flops_extra_repr: + m.original_extra_repr = m.extra_repr + m.extra_repr = flops_extra_repr + assert m.extra_repr != m.original_extra_repr + + def del_extra_repr(m): + if hasattr(m, 'original_extra_repr'): + m.extra_repr = m.original_extra_repr + del m.original_extra_repr + if hasattr(m, 'accumulate_flops'): + del m.accumulate_flops + + model.apply(add_extra_repr) + print(model, file=ost, flush=flush) + model.apply(del_extra_repr) + + +def get_model_parameters_number(model): + """Calculate parameter number of a model. + + Args: + model (nn.module): The model for parameter number calculation. + + Returns: + float: Parameter number of the model. + """ + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + return num_params + + +def add_flops_counting_methods(net_main_module): + # adding additional methods to the existing module object, + # this is done this way so that each function has access to self object + net_main_module.start_flops_count = start_flops_count.__get__( + net_main_module) + net_main_module.stop_flops_count = stop_flops_count.__get__( + net_main_module) + net_main_module.reset_flops_count = reset_flops_count.__get__( + net_main_module) + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 + net_main_module) + + net_main_module.reset_flops_count() + + return net_main_module + + +def compute_average_flops_cost(self): + """Compute average FLOPs cost. + + A method to compute average FLOPs cost, which will be available after + `add_flops_counting_methods()` is called on a desired net object. + + Returns: + float: Current mean flops consumption per image. + """ + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + params_sum = get_model_parameters_number(self) + return flops_sum / batches_count, params_sum + + +def start_flops_count(self): + """Activate the computation of mean flops consumption per image. + + A method to activate the computation of mean flops consumption per image. + which will be available after ``add_flops_counting_methods()`` is called on + a desired net object. It should be called before running the network. + """ + add_batch_counter_hook_function(self) + + def add_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + return + + else: + handle = module.register_forward_hook( + get_modules_mapping()[type(module)]) + + module.__flops_handle__ = handle + + self.apply(partial(add_flops_counter_hook_function)) + + +def stop_flops_count(self): + """Stop computing the mean flops consumption per image. + + A method to stop computing the mean flops consumption per image, which will + be available after ``add_flops_counting_methods()`` is called on a desired + net object. It can be called to pause the computation whenever. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self): + """Reset statistics computed so far. + + A method to Reset computed statistics, which will be available after + `add_flops_counting_methods()` is called on a desired net object. 
+ """ + add_batch_counter_variables_or_reset(self) + self.apply(add_flops_counter_variable_or_reset) + + +# ---- Internal functions +def empty_flops_counter_hook(module, input, output): + module.__flops__ += 0 + + +def upsample_flops_counter_hook(module, input, output): + output_size = output[0] + batch_size = output_size.shape[0] + output_elements_count = batch_size + for val in output_size.shape[1:]: + output_elements_count *= val + module.__flops__ += int(output_elements_count) + + +def relu_flops_counter_hook(module, input, output): + active_elements_count = output.numel() + module.__flops__ += int(active_elements_count) + + +def linear_flops_counter_hook(module, input, output): + input = input[0] + output_last_dim = output.shape[ + -1] # pytorch checks dimensions, so here we don't care much + module.__flops__ += int(np.prod(input.shape) * output_last_dim) + + +def pool_flops_counter_hook(module, input, output): + input = input[0] + module.__flops__ += int(np.prod(input.shape)) + + +def norm_flops_counter_hook(module, input, output): + input = input[0] + + batch_flops = np.prod(input.shape) + if (getattr(module, 'affine', False) + or getattr(module, 'elementwise_affine', False)): + batch_flops *= 2 + module.__flops__ += int(batch_flops) + + +def deconv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + input_height, input_width = input.shape[2:] + + kernel_height, kernel_width = conv_module.kernel_size + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = ( + kernel_height * kernel_width * in_channels * filters_per_channel) + + active_elements_count = batch_size * input_height * input_width + overall_conv_flops = conv_per_position_flops * active_elements_count + bias_flops = 0 + if conv_module.bias is not None: + output_height, output_width = output.shape[2:] + bias_flops = out_channels * batch_size * output_height * output_height + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def conv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + output_dims = list(output.shape[2:]) + + kernel_dims = list(conv_module.kernel_size) + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = int( + np.prod(kernel_dims)) * in_channels * filters_per_channel + + active_elements_count = batch_size * int(np.prod(output_dims)) + + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + + if conv_module.bias is not None: + + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def batch_counter_hook(module, input, output): + batch_size = 1 + if len(input) > 0: + # Can have multiple inputs, getting the first one + input = input[0] + batch_size = len(input) + else: + pass + print('Warning! 
No positional inputs found for a module, ' + 'assuming batch size is 1.') + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module): + + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module): + if is_supported_instance(module): + if hasattr(module, '__flops__') or hasattr(module, '__params__'): + print('Warning: variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') + module.__flops__ = 0 + module.__params__ = get_model_parameters_number(module) + + +def is_supported_instance(module): + if type(module) in get_modules_mapping(): + return True + return False + + +def remove_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +def get_modules_mapping(): + return { + # convolutions + nn.Conv1d: conv_flops_counter_hook, + nn.Conv2d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook, + nn.Conv3d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook, + # activations + nn.ReLU: relu_flops_counter_hook, + nn.PReLU: relu_flops_counter_hook, + nn.ELU: relu_flops_counter_hook, + nn.LeakyReLU: relu_flops_counter_hook, + nn.ReLU6: relu_flops_counter_hook, + # poolings + nn.MaxPool1d: pool_flops_counter_hook, + nn.AvgPool1d: pool_flops_counter_hook, + nn.AvgPool2d: pool_flops_counter_hook, + nn.MaxPool2d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook, + nn.MaxPool3d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook, + nn.AvgPool3d: pool_flops_counter_hook, + nn.AdaptiveMaxPool1d: pool_flops_counter_hook, + nn.AdaptiveAvgPool1d: pool_flops_counter_hook, + nn.AdaptiveMaxPool2d: pool_flops_counter_hook, + nn.AdaptiveAvgPool2d: pool_flops_counter_hook, + nn.AdaptiveMaxPool3d: pool_flops_counter_hook, + nn.AdaptiveAvgPool3d: pool_flops_counter_hook, + # normalizations + nn.BatchNorm1d: norm_flops_counter_hook, + nn.BatchNorm2d: norm_flops_counter_hook, + nn.BatchNorm3d: norm_flops_counter_hook, + nn.GroupNorm: norm_flops_counter_hook, + nn.InstanceNorm1d: norm_flops_counter_hook, + nn.InstanceNorm2d: norm_flops_counter_hook, + nn.InstanceNorm3d: norm_flops_counter_hook, + nn.LayerNorm: norm_flops_counter_hook, + # FC + nn.Linear: linear_flops_counter_hook, + mmcv.cnn.bricks.Linear: linear_flops_counter_hook, + # Upscale + nn.Upsample: upsample_flops_counter_hook, + # Deconvolution + nn.ConvTranspose2d: deconv_flops_counter_hook, + mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook, + } + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[40000, 4], + help='input point cloud size') + parser.add_argument( + '--modality', + type=str, + default='point', + choices=['point', 'image', 'multi'], + help='input data modality') + parser.add_argument( + '--cfg-options', + 
nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + + args = parse_args() + + if args.modality == 'point': + assert len(args.shape) == 2, 'invalid input shape' + input_shape = tuple(args.shape) + elif args.modality == 'image': + if len(args.shape) == 1: + input_shape = (3, args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = (3, ) + tuple(args.shape) + else: + raise ValueError('invalid input shape') + elif args.modality == 'multi': + raise NotImplementedError( + 'FLOPs counter is currently not supported for models with ' + 'multi-modality input') + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + samples_per_gpu = 1 + from mmdet.datasets import replace_ImageToTensor + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor( + cfg.data.test.pipeline) + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + if samples_per_gpu > 1: + for ds_cfg in cfg.data.test: + ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) + + dataset = build_dataset(cfg.data.test) + dataset.is_vis_on_test = True #TODO, this is a hack + data_loader = build_dataloader( + dataset, + samples_per_gpu=1, + workers_per_gpu=0, + dist=False, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + for i, data in enumerate(data_loader): + # if ~(data['map_gt_labels_3d'].data[0][0] != -1).any(): + # continue + img = data['img'][0].data[0] + img_metas = data['img_metas'][0].data[0] + break + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + if torch.cuda.is_available(): + model.cuda() + model.eval() + + if hasattr(model, 'forward_dummy'): + model.forward = model.forward_dummy + else: + raise NotImplementedError( + 'FLOPs counter is currently not supported for {}'.format( + model.__class__.__name__)) + + flops, params = get_model_complexity_info(model, data) + split_line = '=' * 30 + print(f'{split_line}\nInput shape: {input_shape}\n' + f'Flops: {flops}\nParams: {params}\n{split_line}') + print('!!!Please be cautious if you use the results 
in papers. ' + 'You may need to check if all ops are supported and verify that the ' + 'flops computation is correct.') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/GenAD-main/tools/analysis_tools/get_params.py b/GenAD-main/tools/analysis_tools/get_params.py new file mode 100644 index 0000000000000000000000000000000000000000..6bf4ecf3c7599bfe310fbfd59c2efdfd8d695303 --- /dev/null +++ b/GenAD-main/tools/analysis_tools/get_params.py @@ -0,0 +1,8 @@ +import torch +YOUR_CKPT_PATH = None +file_path = YOUR_CKPT_PATH +model = torch.load(file_path, map_location='cpu') +all = 0 +for key in list(model['state_dict'].keys()): + all += model['state_dict'][key].nelement() +print(all) diff --git a/GenAD-main/tools/analysis_tools/visualization.py b/GenAD-main/tools/analysis_tools/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..cde62913c43eaa5114431614c4f37537f3bff76a --- /dev/null +++ b/GenAD-main/tools/analysis_tools/visualization.py @@ -0,0 +1,939 @@ +import sys +sys.path.append('') +import os +import argparse +import os.path as osp +from PIL import Image +from tqdm import tqdm +from typing import List, Dict + +import cv2 +import mmcv +import torch +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import rcParams +from pyquaternion import Quaternion +from nuscenes.nuscenes import NuScenes +from mmdet.datasets.pipelines import to_tensor +from matplotlib.collections import LineCollection +from nuscenes.utils.data_classes import LidarPointCloud, Box +from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility + +from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox, CustomDetectionBox, color_map +from projects.mmdet3d_plugin.datasets.nuscenes_vad_dataset import VectorizedLocalMap, LiDARInstanceLines +import matplotlib.cm as cm +from matplotlib.colors import LinearSegmentedColormap + + +cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + +def render_annotation( + anntoken: str, + margin: float = 10, + view: np.ndarray = np.eye(4), + box_vis_level: BoxVisibility = BoxVisibility.ANY, + out_path: str = 'render.png', + extra_info: bool = False) -> None: + """ + Render selected annotation. + :param anntoken: Sample_annotation token. + :param margin: How many meters in each direction to include in LIDAR view. + :param view: LIDAR view point. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param out_path: Optional path to save the rendered figure to disk. + :param extra_info: Whether to render extra information below camera view. + """ + ann_record = nusc.get('sample_annotation', anntoken) + sample_record = nusc.get('sample', ann_record['sample_token']) + assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.' + + # Figure out which camera the object is fully visible in (this may return nothing). + boxes, cam = [], [] + cams = [key for key in sample_record['data'].keys() if 'CAM' in key] + all_bboxes = [] + select_cams = [] + for cam in cams: + _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level, + selected_anntokens=[anntoken]) + if len(boxes) > 0: + all_bboxes.append(boxes) + select_cams.append(cam) + # We found an image that matches. 
Let's abort. + # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \ + # 'Try using e.g. BoxVisibility.ANY.' + # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!' + + num_cam = len(all_bboxes) + + fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9)) + select_cams = [sample_record['data'][cam] for cam in select_cams] + print('bbox in cams:', select_cams) + # Plot LIDAR view. + lidar = sample_record['data']['LIDAR_TOP'] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken]) + LidarPointCloud.from_file(data_path).render_height(axes[0], view=view) + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[0], view=view, colors=(c, c, c)) + corners = view_points(boxes[0].corners(), view, False)[:2, :] + axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin]) + axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin]) + axes[0].axis('off') + axes[0].set_aspect('equal') + + # Plot CAMERA view. + for i in range(1, num_cam + 1): + cam = select_cams[i - 1] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken]) + im = Image.open(data_path) + axes[i].imshow(im) + axes[i].set_title(nusc.get('sample_data', cam)['channel']) + axes[i].axis('off') + axes[i].set_aspect('equal') + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Print extra information about the annotation below the camera view. + axes[i].set_xlim(0, im.size[0]) + axes[i].set_ylim(im.size[1], 0) + + if extra_info: + rcParams['font.family'] = 'monospace' + + w, l, h = ann_record['size'] + category = ann_record['category_name'] + lidar_points = ann_record['num_lidar_pts'] + radar_points = ann_record['num_radar_pts'] + + sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) + dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation'])) + + information = ' \n'.join(['category: {}'.format(category), + '', + '# lidar points: {0:>4}'.format(lidar_points), + '# radar points: {0:>4}'.format(radar_points), + '', + 'distance: {:>7.3f}m'.format(dist), + '', + 'width: {:>7.3f}m'.format(w), + 'length: {:>7.3f}m'.format(l), + 'height: {:>7.3f}m'.format(h)]) + + plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top') + + if out_path is not None: + plt.savefig(out_path) + + +def get_sample_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. 
+ :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + if selected_anntokens is not None: + boxes = list(map(nusc.get_box, selected_anntokens)) + else: + boxes = nusc.get_boxes(sample_data_token) + + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def get_predicted_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False, + pred_anns=None + ): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + # if selected_anntokens is not None: + # boxes = list(map(nusc.get_box, selected_anntokens)) + # else: + # boxes = nusc.get_boxes(sample_data_token) + boxes = pred_anns + # Make list of Box objects including coord system transforms. 
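+    # Each predicted box is chained from global coordinates into the ego frame
+    # (via the ego pose record) and then into the sensor frame (via the
+    # calibrated-sensor record); camera boxes outside the image are then skipped.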
+ box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def lidiar_render(sample_token, data, out_path=None, out_name=None, traj_use_perstep_offset=True): + bbox_gt_list = [] + bbox_pred_list = [] + sample_rec = nusc.get('sample', sample_token) + anns = sample_rec['anns'] + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + for ann in anns: + content = nusc.get('sample_annotation', ann) + gt_fut_trajs, gt_fut_masks = get_gt_fut_trajs( + nusc=nusc, anno=content, cs_record=cs_record, + pose_record=pose_record, fut_ts=6 + ) + try: + bbox_gt_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=nusc.box_velocity(content['token'])[:2], + fut_trajs=tuple(gt_fut_trajs), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=category_to_detection_name(content['category_name']), + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name='')) + except: + pass + + bbox_anns = data['results'][sample_token] + for content in bbox_anns: + bbox_pred_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + fut_trajs=tuple(content['fut_traj']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'])) + gt_annotations = EvalBoxes() + pred_annotations = EvalBoxes() + gt_annotations.add_boxes(sample_token, bbox_gt_list) + pred_annotations.add_boxes(sample_token, bbox_pred_list) + # print('green is ground truth') + # print('blue is the predited result') + visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, + savepath=out_path, traj_use_perstep_offset=traj_use_perstep_offset, pred_data=data) + + +def get_color(category_name: str): + """ + Provides the default colors based on the category names. + This method works for the general nuScenes categories, as well as the nuScenes detection categories. 
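+    Unknown category names fall back to black ([0, 0, 0]).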
+    """
+    a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+         'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+         'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+         'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+         'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+         'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+         'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+         'vehicle.ego']
+    class_names = [
+        'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+        'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+    ]
+    #print(category_name)
+    if category_name == 'bicycle':
+        return nusc.colormap['vehicle.bicycle']
+    elif category_name == 'construction_vehicle':
+        return nusc.colormap['vehicle.construction']
+    elif category_name == 'traffic_cone':
+        return nusc.colormap['movable_object.trafficcone']
+
+    for key in nusc.colormap.keys():
+        if category_name in key:
+            return nusc.colormap[key]
+    return [0, 0, 0]
+
+# TODO: whether to rotate traj
+def boxes_to_sensor(boxes: List[EvalBox], pose_record: Dict, cs_record: Dict):
+    """
+    Map boxes from global coordinates to the vehicle's sensor coordinate system.
+    :param boxes: The boxes in global coordinates.
+    :param pose_record: The pose record of the vehicle at the current timestamp.
+    :param cs_record: The calibrated sensor record of the sensor.
+    :return: The transformed boxes.
+    """
+    boxes_out = []
+    for box in boxes:
+        # Create Box instance.
+        box = CustomNuscenesBox(
+            box.translation, box.size, Quaternion(box.rotation), box.fut_trajs, name=box.detection_name
+        )
+        # Move box to ego vehicle coord system.
+        box.translate(-np.array(pose_record['translation']))
+        box.rotate(Quaternion(pose_record['rotation']).inverse)
+        # Move box to sensor coord system.
+        box.translate(-np.array(cs_record['translation']))
+        box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        boxes_out.append(box)
+
+    return boxes_out
+
+
+def get_gt_fut_trajs(nusc: NuScenes,
+                     anno,
+                     cs_record,
+                     pose_record,
+                     fut_ts):
+    """
+    Compute per-step future trajectory offsets (in the lidar frame) and their
+    validity masks for a single annotation by following its 'next' links.
+    :param nusc: NuScenes object.
+    :param anno: Sample_annotation record at the current timestamp.
+    :param cs_record: Calibrated sensor record of the reference LiDAR.
+    :param pose_record: Ego pose record at the current timestamp.
+    :param fut_ts: Number of future timesteps.
+    :return: (gt_fut_trajs, gt_fut_masks) as flat lists.
+    """
+    box = Box(anno['translation'], anno['size'], Quaternion(anno['rotation']))
+    # Move box to ego vehicle coord system.
+    box.translate(-np.array(pose_record['translation']))
+    box.rotate(Quaternion(pose_record['rotation']).inverse)
+    # Move box to sensor coord system.
+    box.translate(-np.array(cs_record['translation']))
+    box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+    # get future trajectory coords for each box
+    gt_fut_trajs = np.zeros((fut_ts, 2)) # [fut_ts*2]
+    gt_fut_masks = np.zeros((fut_ts)) # [fut_ts]
+    gt_fut_trajs[:] = box.center[:2]
+    cur_box = box
+    cur_anno = anno
+    for i in range(fut_ts):
+        if cur_anno['next'] != '':
+            anno_next = nusc.get('sample_annotation', cur_anno['next'])
+            box_next = Box(
+                anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation'])
+            )
+            # Move box to ego vehicle coord system.
+            box_next.translate(-np.array(pose_record['translation']))
+            box_next.rotate(Quaternion(pose_record['rotation']).inverse)
+            # Move box to sensor coord system.
+ box_next.translate(-np.array(cs_record['translation'])) + box_next.rotate(Quaternion(cs_record['rotation']).inverse) + # gt_fut_trajs[i] = box_next.center[:2] + gt_fut_trajs[i] = box_next.center[:2] - cur_box.center[:2] + gt_fut_masks[i] = 1 + cur_anno = anno_next + cur_box = box_next + else: + # gt_fut_trajs[i:] = gt_fut_trajs[i-1] + gt_fut_trajs[i:] = 0 + break + + return gt_fut_trajs.reshape(-1).tolist(), gt_fut_masks.reshape(-1).tolist() + +def get_gt_vec_maps( + sample_token, + data_root='data/nuscenes/', + pc_range=[-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20 +) -> None: + """ + Get gt vec map for a given sample. + """ + sample_rec = nusc.get('sample', sample_token) + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + lidar2ego_translation = cs_record['translation'], + lidar2ego_rotation = cs_record['rotation'], + ego2global_translation = pose_record['translation'], + ego2global_rotation = pose_record['rotation'], + map_location = nusc.get('log', nusc.get('scene', sample_rec['scene_token'])['log_token'])['location'] + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(cs_record['rotation']).rotation_matrix + lidar2ego[:3, 3] = cs_record['translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(pose_record['rotation']).rotation_matrix + ego2global[:3, 3] = pose_record['translation'] + lidar2global = ego2global @ lidar2ego + lidar2global_translation = list(lidar2global[:3,3]) + lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + patch_size = (patch_h, patch_w) + + vector_map = VectorizedLocalMap(data_root, patch_size=patch_size, + map_classes=map_classes, + fixed_ptsnum_per_line=map_fixed_ptsnum_per_line, + padding_value=padding_value) + + + anns_results = vector_map.gen_vectorized_samples( + map_location, lidar2global_translation, lidar2global_rotation + ) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except: + gt_vecs_pts_loc = gt_vecs_pts_loc + + return gt_vecs_pts_loc, gt_vecs_label + + +def visualize_sample(nusc: NuScenes, + sample_token: str, + gt_boxes: EvalBoxes, + pred_boxes: EvalBoxes, + nsweeps: int = 1, + conf_th: float = 0.4, + pc_range: list = [-30.0, -30.0, -4.0, 30.0, 30.0, 4.0], + verbose: bool = True, + savepath: str = None, + traj_use_perstep_offset: bool = True, + data_root='data/nuscenes/', + map_pc_range: list = [-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20, + gt_format=['fixed_num_pts'], + colors_plt = ['red', 'green', 'blue'], #['cornflowerblue', 'royalblue', 'slategrey'], + pred_data = None) -> None: + """ + Visualizes a sample from BEV with annotations and detection results. 
+    :param nusc: NuScenes object.
+    :param sample_token: The nuScenes sample token.
+    :param gt_boxes: Ground truth boxes grouped by sample.
+    :param pred_boxes: Prediction grouped by sample.
+    :param nsweeps: Number of sweeps used for lidar visualization.
+    :param conf_th: The confidence threshold used to filter negatives.
+    :param pc_range: Point-cloud range in meters beyond which boxes are ignored.
+    :param verbose: Whether to print to stdout.
+    :param savepath: If given, saves the rendering here instead of displaying.
+    """
+    # Retrieve sensor & pose records.
+    sample_rec = nusc.get('sample', sample_token)
+    sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+    cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+    # Get boxes.
+    boxes_gt_global = gt_boxes[sample_token]
+    boxes_est_global = pred_boxes[sample_token]
+    # Map GT boxes to lidar.
+    boxes_gt = boxes_to_sensor(boxes_gt_global, pose_record, cs_record)
+    # Map EST boxes to lidar.
+    boxes_est = boxes_to_sensor(boxes_est_global, pose_record, cs_record)
+    # Add scores to EST boxes.
+    for box_est, box_est_global in zip(boxes_est, boxes_est_global):
+        box_est.score = box_est_global.detection_score
+
+    # Init axes.
+    fig, axes = plt.subplots(1, 1, figsize=(4, 4))
+    plt.xlim(xmin=-30, xmax=30)
+    plt.ylim(ymin=-30, ymax=30)
+
+    # Show Pred Map
+
+    result_dic = pred_data['map_results'][sample_token]['vectors']
+
+    for vector in result_dic:
+        if vector['confidence_level'] < 0.6:
+            continue
+        pred_pts_3d = vector['pts']
+        pred_label_3d = vector['type']
+        pts_x = np.array([pt[0] for pt in pred_pts_3d])
+        pts_y = np.array([pt[1] for pt in pred_pts_3d])
+
+        axes.plot(pts_x, pts_y, color=colors_plt[pred_label_3d],linewidth=2,alpha=0.8,zorder=-1)
+        axes.scatter(pts_x, pts_y, color=colors_plt[pred_label_3d],s=1,alpha=0.8,zorder=-1)
+
+
+    # ignore_list = ['barrier', 'motorcycle', 'bicycle', 'traffic_cone']
+    ignore_list = ['barrier', 'bicycle', 'traffic_cone']
+
+    # Show Pred boxes.
+    color_list = ['salmon', 'darkcyan', 'orange', 'red', 'lightcoral', 'deepskyblue', 'gold', 'seagreen', 'deeppink',
+                  'dodgerblue', 'royalblue', 'yellow', 'violet', 'peru', 'palegreen', 'slateblue']
+    # color_list = ['Blues', 'PiYG']
+
+    for i, box in enumerate(boxes_est):
+        if box.name in ignore_list:
+            continue
+        # Show only predictions with a high score.
+        assert not np.isnan(box.score), 'Error: Box score cannot be NaN!'
+        if box.name in ['pedestrian']:
+            continue
+        if box.score < conf_th or abs(box.center[0]) > 15 or abs(box.center[1]) > 30:
+            continue
+
+        # colors = color_map(, cmap)
+        if i < 16:
+            color_box = color_list[i]
+        else:
+            color_box = color_list[-1]
+        # box.render(axes, view=np.eye(4), colors=('darkcyan', 'darkcyan', 'darkcyan'), linewidth=3, box_idx=None)
+
+        box.render(axes, view=np.eye(4), colors=(color_box, color_box, color_box), linewidth=3, box_idx=None)
+
+        if traj_use_perstep_offset:
+            # mode_idx = [0, 1, 2, 3, 4, 5]
+            mode_idx = [0]
+            # box.render_fut_trajs_grad_color(axes, linewidth=4, mode_idx=mode_idx, fut_ts=6, cmap='autumn')
+            box.render_fut_trajs_grad_color(axes, linewidth=6, mode_idx=mode_idx, fut_ts=3, cmap="autumn")
+            #cmap = LinearSegmentedColormap.from_list("mycmap", color_box)
+
+            if box.name in ['pedestrian']:
+                continue
+
+        else:
+            box.render_fut_trajs_coords(axes, color='tomato', linewidth=1)
+
+    # Show Planning.
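+    # The mediumseagreen segments below sketch the ego-vehicle footprint (about
+    # 1.8 m x 4 m in BEV) plus a short heading tick pointing forward. plan_cmd picks
+    # the driving command with the highest predicted score; its per-step offsets are
+    # accumulated (cumsum) into absolute waypoints, the origin is prepended, consecutive
+    # waypoints are densified into short segments, and the result is drawn as a
+    # color-graded polyline using the 'summer' colormap.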
+    axes.plot([-0.9, -0.9], [-2, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([-0.9, 0.9], [2, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.9, 0.9], [2, -2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.9, -0.9], [-2, -2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.0, 0.0], [0.0, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    plan_cmd = np.argmax(pred_data['plan_results'][sample_token][1][0,0,0])
+    plan_traj = pred_data['plan_results'][sample_token][0][plan_cmd]
+    plan_traj[abs(plan_traj) < 0.01] = 0.0
+    plan_traj = plan_traj.cumsum(axis=0)
+    plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0)
+    plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1)
+
+    plan_vecs = None
+    for i in range(plan_traj.shape[0]):
+        plan_vec_i = plan_traj[i]
+        x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51)
+        y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51)
+        xy = np.stack((x_linspace, y_linspace), axis=1)
+        xy = np.stack((xy[:-1], xy[1:]), axis=1)
+        if plan_vecs is None:
+            plan_vecs = xy
+        else:
+            plan_vecs = np.concatenate((plan_vecs, xy), axis=0)
+
+    cmap = 'summer'
+    y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301))
+    colors = color_map(y[:-1], cmap)
+    line_segments = LineCollection(plan_vecs, colors=colors, linewidths=6, linestyles='solid', cmap=cmap)
+    axes.add_collection(line_segments)
+
+
+    axes.axes.xaxis.set_ticks([])
+    axes.axes.yaxis.set_ticks([])
+    axes.axis('off')
+    fig.set_tight_layout(True)
+    fig.canvas.draw()
+    plt.savefig(savepath+'/bev_pred.png', bbox_inches='tight', dpi=200)
+    plt.close()
+
+
+def obtain_sensor2top(nusc,
+                      sensor_token,
+                      l2e_t,
+                      l2e_r_mat,
+                      e2g_t,
+                      e2g_r_mat,
+                      sensor_type='lidar'):
+    """Obtain the rotation and translation (RT) from a general sensor to the Top LiDAR.
+
+    Args:
+        nusc (class): Dataset class in the nuScenes dataset.
+        sensor_token (str): Sample data token corresponding to the
+            specific sensor type.
+        l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+        l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+            in shape (3, 3).
+        e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+        e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+            in shape (3, 3).
+        sensor_type (str): Sensor to calibrate. Default: 'lidar'.
+
+    Returns:
+        tuple: (sensor2lidar_rotation, sensor2lidar_translation) mapping points
+            from the given sensor frame to the Top LiDAR frame.
+ """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sensor2lidar_rotation = R.T # points @ R.T + T + sensor2lidar_translation = T + + return sensor2lidar_rotation, sensor2lidar_translation + +def render_sample_data( + sample_toekn: str, + with_anns: bool = True, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + axes_limit: float = 40, + ax=None, + nsweeps: int = 1, + out_path: str = None, + out_name: str = None, + underlay_map: bool = True, + use_flat_vehicle_coordinates: bool = True, + show_lidarseg: bool = False, + show_lidarseg_legend: bool = False, + filter_lidarseg_labels=None, + lidarseg_preds_bin_path: str = None, + verbose: bool = True, + show_panoptic: bool = False, + pred_data=None, + traj_use_perstep_offset: bool = True + ) -> None: + """ + Render sample data onto axis. + :param sample_data_token: Sample_data token. + :param with_anns: Whether to draw box annotations. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param axes_limit: Axes limit for lidar and radar (measured in meters). + :param ax: Axes onto which to render. + :param nsweeps: Number of sweeps for lidar and radar. + :param out_path: Optional path to save the rendered figure to disk. + :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which + can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new + setting is more correct and rotates the plot by ~90 degrees. + :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame. + :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. 
If None + or the list is empty, all classes will be displayed. + :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation + predictions for the sample. + :param verbose: Whether to display the image after it is rendered. + :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + If show_lidarseg is True, show_panoptic will be set to False. + """ + lidiar_render(sample_toekn, pred_data, out_path=out_path, + out_name=out_name, traj_use_perstep_offset=traj_use_perstep_offset) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize VAD predictions') + parser.add_argument('--result-path', help='inference result file path') + parser.add_argument('--save-path', help='the dir to save visualization results') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + inference_result_path = args.result_path + out_path = args.save_path + bevformer_results = mmcv.load(inference_result_path) + sample_token_list = list(bevformer_results['results'].keys()) + + nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) + + imgs = [] + fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') + video_path = osp.join(out_path, 'tiny.mp4') + video = cv2.VideoWriter(video_path, fourcc, 10, (2933, 800), True) + for id in tqdm(range(len(sample_token_list))): + # for id in tqdm(range(25)): + #3025 1140 + # id = id + 3025 + mmcv.mkdir_or_exist(out_path) + render_sample_data(sample_token_list[id], + pred_data=bevformer_results, + out_path=out_path) + pred_path = osp.join(out_path, 'bev_pred.png') + pred_img = cv2.imread(pred_path) + os.remove(pred_path) + + sample_token = sample_token_list[id] + sample = nusc.get('sample', sample_token) + # sample = data['results'][sample_token_list[0]][0] + cams = [ + 'CAM_FRONT_LEFT', + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_LEFT', + 'CAM_BACK', + 'CAM_BACK_RIGHT', + ] + + cam_imgs = [] + for cam in cams: + sample_data_token = sample['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + sensor_modality = sd_record['sensor_modality'] + if sensor_modality in ['lidar', 'radar']: + assert False + elif sensor_modality == 'camera': + boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']), + name=record['detection_name'], token='predicted') for record in + bevformer_results['results'][sample_token]] + data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token, + box_vis_level=BoxVisibility.ANY, + pred_anns=boxes) + _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=BoxVisibility.ANY) + + data = Image.open(data_path) + + # Show image. 
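+                # For CAM_FRONT the planned trajectory (given as per-step offsets in the
+                # lidar frame) is also drawn: the offsets are accumulated, lifted to
+                # homogeneous points at z = -1 (near the ground), mapped through the
+                # lidar-to-image matrix (padded intrinsics @ lidar-to-camera extrinsics),
+                # and divided by the depth (clamped to 1e-5) to obtain pixel coordinates.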
+ _, ax = plt.subplots(1, 1, figsize=(6, 12)) + ax.imshow(data) + + if cam == 'CAM_FRONT': + lidar_sd_record = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + lidar_cs_record = nusc.get('calibrated_sensor', lidar_sd_record['calibrated_sensor_token']) + lidar_pose_record = nusc.get('ego_pose', lidar_sd_record['ego_pose_token']) + + # get plan traj [x,y,z,w] quaternion, w=1 + # we set z=-1 to get points near the ground in lidar coord system + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + plan_traj = bevformer_results['plan_results'][sample_token][0][plan_cmd] + plan_traj[abs(plan_traj) < 0.01] = 0.0 + plan_traj = plan_traj.cumsum(axis=0) + + plan_traj = np.concatenate(( + plan_traj[:, [0]], + plan_traj[:, [1]], + -1.0*np.ones((plan_traj.shape[0], 1)), + np.ones((plan_traj.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj[0, 0] = 0.3 + plan_traj[0, 2] = -1.0 + plan_traj[0, 3] = 1.0 + + l2e_r = lidar_cs_record['rotation'] + l2e_t = lidar_cs_record['translation'] + e2g_r = lidar_pose_record['rotation'] + e2g_t = lidar_pose_record['translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + s2l_r, s2l_t = obtain_sensor2top(nusc, sample_data_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(s2l_r) + lidar2cam_t = s2l_t @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + viewpad = np.eye(4) + viewpad[:camera_intrinsic.shape[0], :camera_intrinsic.shape[1]] = camera_intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + plan_traj = lidar2img_rt @ plan_traj.T + plan_traj = plan_traj[0:2, ...] / np.maximum( + plan_traj[2:3, ...], np.ones_like(plan_traj[2:3, ...]) * 1e-5) + plan_traj = plan_traj.T + plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1) + + plan_vecs = None + for i in range(plan_traj.shape[0]): + plan_vec_i = plan_traj[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs is None: + plan_vecs = xy + else: + plan_vecs = np.concatenate((plan_vecs, xy), axis=0) + + cmap = 'summer' + y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(plan_vecs, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments) + + ax.set_xlim(0, data.size[0]) + ax.set_ylim(data.size[1], 0) + ax.axis('off') + if out_path is not None: + savepath = osp.join(out_path, f'{cam}_PRED') + plt.savefig(savepath, bbox_inches='tight', dpi=200, pad_inches=0.0) + plt.close() + + # Load boxes and image. 
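+                # The per-camera rendering saved above is re-read with OpenCV and the
+                # camera name is stamped on it with cv2.putText; the six annotated views
+                # are later tiled 3x2, resized, and concatenated with the BEV panel
+                # before being written to the video.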
+ data_path = osp.join(out_path, f'{cam}_PRED.png') + cam_img = cv2.imread(data_path) + lw = 6 + tf = max(lw - 3, 1) + w, h = cv2.getTextSize(cam, 0, fontScale=lw / 6, thickness=tf)[0] # text width, height + # color=(0, 0, 0) + txt_color=(255, 255, 255) + cv2.putText(cam_img, + cam, (10, h + 10), + 0, + lw / 6, + txt_color, + thickness=tf, + lineType=cv2.LINE_AA) + cam_imgs.append(cam_img) + else: + raise ValueError("Error: Unknown sensor modality!") + + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + cmd_list = ['Turn Right', 'Turn Left', 'Go Straight'] + plan_cmd_str = cmd_list[plan_cmd] + pred_img = cv2.copyMakeBorder(pred_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0) + # font + font = cv2.FONT_HERSHEY_SIMPLEX + # fontScale + fontScale = 1 + # Line thickness of 2 px + thickness = 3 + # org + org = (20, 40) + # Blue color in BGR + color = (0, 0, 0) + # Using cv2.putText() method + # pred_img = cv2.putText(pred_img, 'BEV', org, font, + # fontScale, color, thickness, cv2.LINE_AA) + # pred_img = cv2.putText(pred_img, plan_cmd_str, (20, 770), font, + # fontScale, color, thickness, cv2.LINE_AA) + + sample_img = pred_img + cam_img_top = cv2.hconcat([cam_imgs[0], cam_imgs[1], cam_imgs[2]]) + cam_img_down = cv2.hconcat([cam_imgs[3], cam_imgs[4], cam_imgs[5]]) + cam_img = cv2.vconcat([cam_img_top, cam_img_down]) + size = (2133, 800) + cam_img = cv2.resize(cam_img, size) + vis_img = cv2.hconcat([cam_img, sample_img]) + + video.write(vis_img) + + video.release() + cv2.destroyAllWindows() diff --git a/GenAD-main/tools/analysis_tools/visualization_div.py b/GenAD-main/tools/analysis_tools/visualization_div.py new file mode 100644 index 0000000000000000000000000000000000000000..b7e2fd43f448a6742d4acb6abac64c804918d637 --- /dev/null +++ b/GenAD-main/tools/analysis_tools/visualization_div.py @@ -0,0 +1,1124 @@ +import sys +sys.path.append('') +import os +import argparse +import os.path as osp +from PIL import Image +from tqdm import tqdm +from typing import List, Dict +import random + +import cv2 +import mmcv +import torch +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import rcParams +from pyquaternion import Quaternion +from nuscenes.nuscenes import NuScenes +from mmdet.datasets.pipelines import to_tensor +from matplotlib.collections import LineCollection +from nuscenes.utils.data_classes import LidarPointCloud, Box +from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility + +from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox, CustomDetectionBox, color_map +from projects.mmdet3d_plugin.datasets.nuscenes_vad_dataset import VectorizedLocalMap, LiDARInstanceLines +import matplotlib.cm as cm +from matplotlib.colors import LinearSegmentedColormap + + +cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + +def render_annotation( + anntoken: str, + margin: float = 10, + view: np.ndarray = np.eye(4), + box_vis_level: BoxVisibility = BoxVisibility.ANY, + out_path: str = 'render.png', + extra_info: bool = False) -> None: + """ + Render selected annotation. + :param anntoken: Sample_annotation token. + :param margin: How many meters in each direction to include in LIDAR view. + :param view: LIDAR view point. 
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param out_path: Optional path to save the rendered figure to disk. + :param extra_info: Whether to render extra information below camera view. + """ + ann_record = nusc.get('sample_annotation', anntoken) + sample_record = nusc.get('sample', ann_record['sample_token']) + assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.' + + # Figure out which camera the object is fully visible in (this may return nothing). + boxes, cam = [], [] + cams = [key for key in sample_record['data'].keys() if 'CAM' in key] + all_bboxes = [] + select_cams = [] + for cam in cams: + _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level, + selected_anntokens=[anntoken]) + if len(boxes) > 0: + all_bboxes.append(boxes) + select_cams.append(cam) + # We found an image that matches. Let's abort. + # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \ + # 'Try using e.g. BoxVisibility.ANY.' + # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!' + + num_cam = len(all_bboxes) + + fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9)) + select_cams = [sample_record['data'][cam] for cam in select_cams] + print('bbox in cams:', select_cams) + # Plot LIDAR view. + lidar = sample_record['data']['LIDAR_TOP'] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken]) + LidarPointCloud.from_file(data_path).render_height(axes[0], view=view) + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[0], view=view, colors=(c, c, c)) + corners = view_points(boxes[0].corners(), view, False)[:2, :] + axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin]) + axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin]) + axes[0].axis('off') + axes[0].set_aspect('equal') + + # Plot CAMERA view. + for i in range(1, num_cam + 1): + cam = select_cams[i - 1] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken]) + im = Image.open(data_path) + axes[i].imshow(im) + axes[i].set_title(nusc.get('sample_data', cam)['channel']) + axes[i].axis('off') + axes[i].set_aspect('equal') + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Print extra information about the annotation below the camera view. 
+ axes[i].set_xlim(0, im.size[0]) + axes[i].set_ylim(im.size[1], 0) + + if extra_info: + rcParams['font.family'] = 'monospace' + + w, l, h = ann_record['size'] + category = ann_record['category_name'] + lidar_points = ann_record['num_lidar_pts'] + radar_points = ann_record['num_radar_pts'] + + sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) + dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation'])) + + information = ' \n'.join(['category: {}'.format(category), + '', + '# lidar points: {0:>4}'.format(lidar_points), + '# radar points: {0:>4}'.format(radar_points), + '', + 'distance: {:>7.3f}m'.format(dist), + '', + 'width: {:>7.3f}m'.format(w), + 'length: {:>7.3f}m'.format(l), + 'height: {:>7.3f}m'.format(h)]) + + plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top') + + if out_path is not None: + plt.savefig(out_path) + + +def get_sample_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + if selected_anntokens is not None: + boxes = list(map(nusc.get_box, selected_anntokens)) + else: + boxes = nusc.get_boxes(sample_data_token) + + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
+ box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def get_predicted_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False, + pred_anns=None + ): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + # if selected_anntokens is not None: + # boxes = list(map(nusc.get_box, selected_anntokens)) + # else: + # boxes = nusc.get_boxes(sample_data_token) + boxes = pred_anns + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
+ box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def lidiar_render(sample_token, data, out_path=None, out_name=None, traj_use_perstep_offset=True): + bbox_gt_list = [] + bbox_pred_list = [] + sample_rec = nusc.get('sample', sample_token) + anns = sample_rec['anns'] + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + for ann in anns: + content = nusc.get('sample_annotation', ann) + gt_fut_trajs, gt_fut_masks = get_gt_fut_trajs( + nusc=nusc, anno=content, cs_record=cs_record, + pose_record=pose_record, fut_ts=6 + ) + try: + bbox_gt_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=nusc.box_velocity(content['token'])[:2], + fut_trajs=tuple(gt_fut_trajs), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=category_to_detection_name(content['category_name']), + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name='')) + except: + pass + + bbox_anns = data['results'][sample_token] + for content in bbox_anns: + bbox_pred_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + fut_trajs=tuple(content['fut_traj']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'])) + gt_annotations = EvalBoxes() + pred_annotations = EvalBoxes() + gt_annotations.add_boxes(sample_token, bbox_gt_list) + pred_annotations.add_boxes(sample_token, bbox_pred_list) + # print('green is ground truth') + # print('blue is the predited result') + visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, + savepath=out_path, traj_use_perstep_offset=traj_use_perstep_offset, pred_data=data) + + +def get_color(category_name: str): + """ + Provides the default colors based on the category names. + This method works for the general nuScenes categories, as well as the nuScenes detection categories. 
+    """
+    a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+         'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+         'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+         'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+         'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+         'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+         'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+         'vehicle.ego']
+    class_names = [
+        'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+        'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+    ]
+    #print(category_name)
+    if category_name == 'bicycle':
+        return nusc.colormap['vehicle.bicycle']
+    elif category_name == 'construction_vehicle':
+        return nusc.colormap['vehicle.construction']
+    elif category_name == 'traffic_cone':
+        return nusc.colormap['movable_object.trafficcone']
+
+    for key in nusc.colormap.keys():
+        if category_name in key:
+            return nusc.colormap[key]
+    return [0, 0, 0]
+
+# TODO: whether to rotate traj
+def boxes_to_sensor(boxes: List[EvalBox], pose_record: Dict, cs_record: Dict):
+    """
+    Map boxes from global coordinates to the vehicle's sensor coordinate system.
+    :param boxes: The boxes in global coordinates.
+    :param pose_record: The pose record of the vehicle at the current timestamp.
+    :param cs_record: The calibrated sensor record of the sensor.
+    :return: The transformed boxes.
+    """
+    boxes_out = []
+    for box in boxes:
+        # Create Box instance.
+        box = CustomNuscenesBox(
+            box.translation, box.size, Quaternion(box.rotation), box.fut_trajs, name=box.detection_name
+        )
+        # Move box to ego vehicle coord system.
+        box.translate(-np.array(pose_record['translation']))
+        box.rotate(Quaternion(pose_record['rotation']).inverse)
+        # Move box to sensor coord system.
+        box.translate(-np.array(cs_record['translation']))
+        box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        boxes_out.append(box)
+
+    return boxes_out
+
+
+def get_gt_fut_trajs(nusc: NuScenes,
+                     anno,
+                     cs_record,
+                     pose_record,
+                     fut_ts):
+    """
+    Compute per-step future trajectory offsets (in the lidar frame) and their
+    validity masks for a single annotation by following its 'next' links.
+    :param nusc: NuScenes object.
+    :param anno: Sample_annotation record at the current timestamp.
+    :param cs_record: Calibrated sensor record of the reference LiDAR.
+    :param pose_record: Ego pose record at the current timestamp.
+    :param fut_ts: Number of future timesteps.
+    :return: (gt_fut_trajs, gt_fut_masks) as flat lists.
+    """
+    box = Box(anno['translation'], anno['size'], Quaternion(anno['rotation']))
+    # Move box to ego vehicle coord system.
+    box.translate(-np.array(pose_record['translation']))
+    box.rotate(Quaternion(pose_record['rotation']).inverse)
+    # Move box to sensor coord system.
+    box.translate(-np.array(cs_record['translation']))
+    box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+    # get future trajectory coords for each box
+    gt_fut_trajs = np.zeros((fut_ts, 2)) # [fut_ts*2]
+    gt_fut_masks = np.zeros((fut_ts)) # [fut_ts]
+    gt_fut_trajs[:] = box.center[:2]
+    cur_box = box
+    cur_anno = anno
+    for i in range(fut_ts):
+        if cur_anno['next'] != '':
+            anno_next = nusc.get('sample_annotation', cur_anno['next'])
+            box_next = Box(
+                anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation'])
+            )
+            # Move box to ego vehicle coord system.
+            box_next.translate(-np.array(pose_record['translation']))
+            box_next.rotate(Quaternion(pose_record['rotation']).inverse)
+            # Move box to sensor coord system.
+ box_next.translate(-np.array(cs_record['translation'])) + box_next.rotate(Quaternion(cs_record['rotation']).inverse) + # gt_fut_trajs[i] = box_next.center[:2] + gt_fut_trajs[i] = box_next.center[:2] - cur_box.center[:2] + gt_fut_masks[i] = 1 + cur_anno = anno_next + cur_box = box_next + else: + # gt_fut_trajs[i:] = gt_fut_trajs[i-1] + gt_fut_trajs[i:] = 0 + break + + return gt_fut_trajs.reshape(-1).tolist(), gt_fut_masks.reshape(-1).tolist() + +def get_gt_vec_maps( + sample_token, + data_root='data/nuscenes/', + pc_range=[-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20 +) -> None: + """ + Get gt vec map for a given sample. + """ + sample_rec = nusc.get('sample', sample_token) + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + lidar2ego_translation = cs_record['translation'], + lidar2ego_rotation = cs_record['rotation'], + ego2global_translation = pose_record['translation'], + ego2global_rotation = pose_record['rotation'], + map_location = nusc.get('log', nusc.get('scene', sample_rec['scene_token'])['log_token'])['location'] + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(cs_record['rotation']).rotation_matrix + lidar2ego[:3, 3] = cs_record['translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(pose_record['rotation']).rotation_matrix + ego2global[:3, 3] = pose_record['translation'] + lidar2global = ego2global @ lidar2ego + lidar2global_translation = list(lidar2global[:3,3]) + lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + patch_size = (patch_h, patch_w) + + vector_map = VectorizedLocalMap(data_root, patch_size=patch_size, + map_classes=map_classes, + fixed_ptsnum_per_line=map_fixed_ptsnum_per_line, + padding_value=padding_value) + + + anns_results = vector_map.gen_vectorized_samples( + map_location, lidar2global_translation, lidar2global_rotation + ) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except: + gt_vecs_pts_loc = gt_vecs_pts_loc + + return gt_vecs_pts_loc, gt_vecs_label + + +def visualize_sample(nusc: NuScenes, + sample_token: str, + gt_boxes: EvalBoxes, + pred_boxes: EvalBoxes, + nsweeps: int = 1, + conf_th: float = 0.4, + pc_range: list = [-30.0, -30.0, -4.0, 30.0, 30.0, 4.0], + verbose: bool = True, + savepath: str = None, + traj_use_perstep_offset: bool = True, + data_root='data/nuscenes/', + map_pc_range: list = [-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20, + gt_format=['fixed_num_pts'], + colors_plt = ['red', 'green', 'blue'], #['cornflowerblue', 'royalblue', 'slategrey'], + pred_data = None) -> None: + """ + Visualizes a sample from BEV with annotations and detection results. 
+    :param nusc: NuScenes object.
+    :param sample_token: The nuScenes sample token.
+    :param gt_boxes: Ground truth boxes grouped by sample.
+    :param pred_boxes: Prediction grouped by sample.
+    :param nsweeps: Number of sweeps used for lidar visualization.
+    :param conf_th: The confidence threshold used to filter negatives.
+    :param pc_range: Point-cloud range in meters beyond which boxes are ignored.
+    :param verbose: Whether to print to stdout.
+    :param savepath: If given, saves the rendering here instead of displaying.
+    """
+    # Retrieve sensor & pose records.
+    sample_rec = nusc.get('sample', sample_token)
+    sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+    cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+    # Get boxes.
+    boxes_gt_global = gt_boxes[sample_token]
+    boxes_est_global = pred_boxes[sample_token]
+    # Map GT boxes to lidar.
+    boxes_gt = boxes_to_sensor(boxes_gt_global, pose_record, cs_record)
+    # Map EST boxes to lidar.
+    boxes_est = boxes_to_sensor(boxes_est_global, pose_record, cs_record)
+    # Add scores to EST boxes.
+    for box_est, box_est_global in zip(boxes_est, boxes_est_global):
+        box_est.score = box_est_global.detection_score
+
+    # Init axes.
+    fig, axes = plt.subplots(1, 1, figsize=(4, 4))
+    plt.xlim(xmin=-30, xmax=30)
+    plt.ylim(ymin=-30, ymax=30)
+
+    # Show Pred Map
+
+    result_dic = pred_data['map_results'][sample_token]['vectors']
+
+    for vector in result_dic:
+        if vector['confidence_level'] < 0.6:
+            continue
+        pred_pts_3d = vector['pts']
+        pred_label_3d = vector['type']
+        pts_x = np.array([pt[0] for pt in pred_pts_3d])
+        pts_y = np.array([pt[1] for pt in pred_pts_3d])
+
+        axes.plot(pts_x, pts_y, color=colors_plt[pred_label_3d],linewidth=2,alpha=0.8,zorder=-1)
+        axes.scatter(pts_x, pts_y, color=colors_plt[pred_label_3d],s=1,alpha=0.8,zorder=-1)
+
+
+    # ignore_list = ['barrier', 'motorcycle', 'bicycle', 'traffic_cone']
+    ignore_list = ['barrier', 'bicycle', 'traffic_cone']
+
+    # Show Pred boxes.
+    color_list = ['salmon', 'darkcyan', 'orange', 'red', 'lightcoral', 'deepskyblue', 'gold', 'seagreen', 'deeppink',
+                  'dodgerblue', 'royalblue', 'yellow', 'violet', 'peru', 'palegreen', 'slateblue']
+    # color_list = ['Blues', 'PiYG']
+
+    for i, box in enumerate(boxes_est):
+        if box.name in ignore_list:
+            continue
+        # Show only predictions with a high score.
+        assert not np.isnan(box.score), 'Error: Box score cannot be NaN!'
+        if box.name in ['pedestrian']:
+            continue
+        if box.score < conf_th or abs(box.center[0]) > 15 or abs(box.center[1]) > 30:
+            continue
+
+        # colors = color_map(, cmap)
+        if i < 16:
+            color_box = color_list[i]
+        else:
+            color_box = color_list[-1]
+        # box.render(axes, view=np.eye(4), colors=('darkcyan', 'darkcyan', 'darkcyan'), linewidth=3, box_idx=None)
+
+        box.render(axes, view=np.eye(4), colors=(color_box, color_box, color_box), linewidth=3, box_idx=None)
+
+        if traj_use_perstep_offset:
+            # mode_idx = [0, 1, 2, 3, 4, 5]
+            mode_idx = [0]
+            # box.render_fut_trajs_grad_color(axes, linewidth=4, mode_idx=mode_idx, fut_ts=6, cmap='autumn')
+            box.render_fut_trajs_grad_color(axes, linewidth=6, mode_idx=mode_idx, fut_ts=3, cmap="autumn")
+            #cmap = LinearSegmentedColormap.from_list("mycmap", color_box)
+
+            if box.name in ['pedestrian']:
+                continue
+
+        else:
+            box.render_fut_trajs_coords(axes, color='tomato', linewidth=1)
+
+    # Show Planning.
+    axes.plot([-0.9, -0.9], [-2, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([-0.9, 0.9], [2, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.9, 0.9], [2, -2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.9, -0.9], [-2, -2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.0, 0.0], [0.0, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    plan_cmd = np.argmax(pred_data['plan_results'][sample_token][1][0,0,0])
+    plan_traj = pred_data['plan_results'][sample_token][0][plan_cmd]
+    plan_traj[abs(plan_traj) < 0.01] = 0.0
+    plan_traj = plan_traj.cumsum(axis=0)
+    plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0)
+    plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1)
+
+    plan_vecs = None
+    for i in range(plan_traj.shape[0]):
+        plan_vec_i = plan_traj[i]
+        x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51)
+        y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51)
+        xy = np.stack((x_linspace, y_linspace), axis=1)
+        xy = np.stack((xy[:-1], xy[1:]), axis=1)
+        if plan_vecs is None:
+            plan_vecs = xy
+        else:
+            plan_vecs = np.concatenate((plan_vecs, xy), axis=0)
+
+    cmap = 'summer'
+    y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301))
+    colors = color_map(y[:-1], cmap)
+    line_segments = LineCollection(plan_vecs, colors=colors, linewidths=6, linestyles='solid', cmap=cmap)
+    axes.add_collection(line_segments)
+
+
+    axes.axes.xaxis.set_ticks([])
+    axes.axes.yaxis.set_ticks([])
+    axes.axis('off')
+    fig.set_tight_layout(True)
+    fig.canvas.draw()
+    plt.savefig(savepath+'/bev_pred.png', bbox_inches='tight', dpi=200)
+    plt.close()
+
+
+def obtain_sensor2top(nusc,
+                      sensor_token,
+                      l2e_t,
+                      l2e_r_mat,
+                      e2g_t,
+                      e2g_r_mat,
+                      sensor_type='lidar'):
+    """Obtain the rotation and translation (RT) from a general sensor to the Top LiDAR.
+
+    Args:
+        nusc (class): Dataset class in the nuScenes dataset.
+        sensor_token (str): Sample data token corresponding to the
+            specific sensor type.
+        l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+        l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+            in shape (3, 3).
+        e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+        e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+            in shape (3, 3).
+        sensor_type (str): Sensor to calibrate. Default: 'lidar'.
+
+    Returns:
+        tuple: (sensor2lidar_rotation, sensor2lidar_translation) mapping points
+            from the given sensor frame to the Top LiDAR frame.
+ """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sensor2lidar_rotation = R.T # points @ R.T + T + sensor2lidar_translation = T + + return sensor2lidar_rotation, sensor2lidar_translation + +def render_sample_data( + sample_toekn: str, + with_anns: bool = True, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + axes_limit: float = 40, + ax=None, + nsweeps: int = 1, + out_path: str = None, + out_name: str = None, + underlay_map: bool = True, + use_flat_vehicle_coordinates: bool = True, + show_lidarseg: bool = False, + show_lidarseg_legend: bool = False, + filter_lidarseg_labels=None, + lidarseg_preds_bin_path: str = None, + verbose: bool = True, + show_panoptic: bool = False, + pred_data=None, + traj_use_perstep_offset: bool = True + ) -> None: + """ + Render sample data onto axis. + :param sample_data_token: Sample_data token. + :param with_anns: Whether to draw box annotations. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param axes_limit: Axes limit for lidar and radar (measured in meters). + :param ax: Axes onto which to render. + :param nsweeps: Number of sweeps for lidar and radar. + :param out_path: Optional path to save the rendered figure to disk. + :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which + can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new + setting is more correct and rotates the plot by ~90 degrees. + :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame. + :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. 
If None + or the list is empty, all classes will be displayed. + :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation + predictions for the sample. + :param verbose: Whether to display the image after it is rendered. + :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + If show_lidarseg is True, show_panoptic will be set to False. + """ + lidiar_render(sample_toekn, pred_data, out_path=out_path, + out_name=out_name, traj_use_perstep_offset=traj_use_perstep_offset) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize VAD predictions') + parser.add_argument('--result-path', help='inference result file path') + parser.add_argument('--save-path', help='the dir to save visualization results') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + inference_result_path_0 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/00/pts_bbox/results_nusc.pkl' + inference_result_path_1 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e/Wed_Nov_15_10_59_19_2023/pts_bbox/results_nusc.pkl' + inference_result_path_2 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_base_e2e_vae/Wed_Nov_15_14_18_16_2023/pts_bbox/results_nusc.pkl' + inference_result_path_3 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/00/pts_bbox/results_nusc.pkl' + inference_result_path_4 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/00/pts_bbox/results_nusc.pkl' + # inference_result_path_0 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/00/pts_bbox/results_nusc.pkl' + # inference_result_path_1 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/01/pts_bbox/results_nusc.pkl' + # inference_result_path_2 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/02/pts_bbox/results_nusc.pkl' + # inference_result_path_3 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/03/pts_bbox/results_nusc.pkl' + # inference_result_path_4 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/04/pts_bbox/results_nusc.pkl' + + out_path = args.save_path + bevformer_results = mmcv.load(inference_result_path_0) + bevformer_results_1 = mmcv.load(inference_result_path_1) + bevformer_results_2 = mmcv.load(inference_result_path_2) + bevformer_results_3 = mmcv.load(inference_result_path_3) + bevformer_results_4 = mmcv.load(inference_result_path_4) + sample_token_list = list(bevformer_results['results'].keys()) + + nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) + + imgs = [] + fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') + video_path = osp.join(out_path, 'tiny.mp4') + video = cv2.VideoWriter(video_path, fourcc, 10, (2933, 800), True) + # for id in tqdm(range(len(sample_token_list))): + for id in tqdm(range(200)): + # 3025 1140 + id = id + 3000 + mmcv.mkdir_or_exist(out_path) + render_sample_data(sample_token_list[id], + pred_data=bevformer_results, + out_path=out_path) + pred_path = osp.join(out_path, 'bev_pred.png') + pred_img = cv2.imread(pred_path) + os.remove(pred_path) + + sample_token = sample_token_list[id] + sample = nusc.get('sample', sample_token) + # sample = data['results'][sample_token_list[0]][0] + cams = [ + 'CAM_FRONT_LEFT', + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_LEFT', + 'CAM_BACK', + 'CAM_BACK_RIGHT', + ] + + cam_imgs = [] + for cam in cams: + 
sample_data_token = sample['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + sensor_modality = sd_record['sensor_modality'] + if sensor_modality in ['lidar', 'radar']: + assert False + elif sensor_modality == 'camera': + boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']), + name=record['detection_name'], token='predicted') for record in + bevformer_results['results'][sample_token]] + data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token, + box_vis_level=BoxVisibility.ANY, + pred_anns=boxes) + _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=BoxVisibility.ANY) + + data = Image.open(data_path) + + # Show image. + _, ax = plt.subplots(1, 1, figsize=(6, 12)) + ax.imshow(data) + + if cam == 'CAM_FRONT': + lidar_sd_record = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + lidar_cs_record = nusc.get('calibrated_sensor', lidar_sd_record['calibrated_sensor_token']) + lidar_pose_record = nusc.get('ego_pose', lidar_sd_record['ego_pose_token']) + + # get plan traj [x,y,z,w] quaternion, w=1 + # we set z=-1 to get points near the ground in lidar coord system + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + plan_traj = bevformer_results['plan_results'][sample_token][0][plan_cmd] + + ###### + plan_cmd_1 = np.argmax(bevformer_results_1['plan_results'][sample_token][1][0, 0, 0]) + plan_traj_1 = bevformer_results_1['plan_results'][sample_token][0][plan_cmd_1] + plan_cmd_2 = np.argmax(bevformer_results_2['plan_results'][sample_token][1][0, 0, 0]) + plan_traj_2 = bevformer_results_2['plan_results'][sample_token][0][plan_cmd_2] + plan_cmd_3 = np.argmax(bevformer_results_3['plan_results'][sample_token][1][0, 0, 0]) + plan_traj_3 = bevformer_results_3['plan_results'][sample_token][0][plan_cmd_3] + plan_cmd_4 = np.argmax(bevformer_results_4['plan_results'][sample_token][1][0, 0, 0]) + plan_traj_4 = bevformer_results_4['plan_results'][sample_token][0][plan_cmd_4] + + plan_traj[abs(plan_traj) < 0.01] = 0.0 + plan_traj = plan_traj.cumsum(axis=0) + + plan_traj = np.concatenate(( + plan_traj[:, [0]], + plan_traj[:, [1]], + -1.0*np.ones((plan_traj.shape[0], 1)), + np.ones((plan_traj.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj[0, 0] = 0.3 + plan_traj[0, 2] = -1.0 + plan_traj[0, 3] = 1.0 + + + ############### 1 + plan_traj_1[abs(plan_traj_1) < 0.01] = 0.0 + plan_traj_1 = plan_traj_1.cumsum(axis=0) + + plan_traj_1 = np.concatenate(( + plan_traj_1[:, [0]], + plan_traj_1[:, [1]], + -1.0*np.ones((plan_traj_1.shape[0], 1)), + np.ones((plan_traj_1.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj_1 = np.concatenate((np.zeros((1, plan_traj_1.shape[1])), plan_traj_1), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj_1[0, 0] = 0.3 + plan_traj_1[0, 2] = -1.0 + plan_traj_1[0, 3] = 1.0 + + ############### 2 + plan_traj_2[abs(plan_traj_2) < 0.01] = 0.0 + plan_traj_2 = plan_traj_2.cumsum(axis=0) + + plan_traj_2 = np.concatenate(( + plan_traj_2[:, [0]], + plan_traj_2[:, [1]], + -1.0 * np.ones((plan_traj_2.shape[0], 1)), + np.ones((plan_traj_2.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj_2 = np.concatenate((np.zeros((1, plan_traj_2.shape[1])), plan_traj_2), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj_2[0, 0] = 0.3 
+ plan_traj_2[0, 2] = -1.0 + plan_traj_2[0, 3] = 1.0 + + ############### 3 + plan_traj_3[abs(plan_traj_3) < 0.01] = 0.0 + plan_traj_3 = plan_traj_3.cumsum(axis=0) + + plan_traj_3 = np.concatenate(( + plan_traj_3[:, [0]], + plan_traj_3[:, [1]], + -1.0*np.ones((plan_traj_3.shape[0], 1)), + np.ones((plan_traj_3.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj_3 = np.concatenate((np.zeros((1, plan_traj_3.shape[1])), plan_traj_3), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj_3[0, 0] = 0.3 + plan_traj_3[0, 2] = -1.0 + plan_traj_3[0, 3] = 1.0 + + ############### 4 + plan_traj_4[abs(plan_traj_4) < 0.01] = 0.0 + plan_traj_4 = plan_traj_4.cumsum(axis=0) + + plan_traj_4 = np.concatenate(( + plan_traj_4[:, [0]], + plan_traj_4[:, [1]], + -1.0*np.ones((plan_traj_4.shape[0], 1)), + np.ones((plan_traj_4.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj_4 = np.concatenate((np.zeros((1, plan_traj_4.shape[1])), plan_traj_4), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj_4[0, 0] = 0.3 + plan_traj_4[0, 2] = -1.0 + plan_traj_4[0, 3] = 1.0 + + l2e_r = lidar_cs_record['rotation'] + l2e_t = lidar_cs_record['translation'] + e2g_r = lidar_pose_record['rotation'] + e2g_t = lidar_pose_record['translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + s2l_r, s2l_t = obtain_sensor2top(nusc, sample_data_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(s2l_r) + lidar2cam_t = s2l_t @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + viewpad = np.eye(4) + viewpad[:camera_intrinsic.shape[0], :camera_intrinsic.shape[1]] = camera_intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + plan_traj = lidar2img_rt @ plan_traj.T + plan_traj = plan_traj[0:2, ...] / np.maximum( + plan_traj[2:3, ...], np.ones_like(plan_traj[2:3, ...]) * 1e-5) + plan_traj = plan_traj.T + plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1) + + plan_vecs = None + for i in range(plan_traj.shape[0]): + plan_vec_i = plan_traj[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs is None: + plan_vecs = xy + else: + plan_vecs = np.concatenate((plan_vecs, xy), axis=0) + + ##############1 + plan_traj_1 = lidar2img_rt @ plan_traj_1.T + plan_traj_1 = plan_traj_1[0:2, ...] / np.maximum( + plan_traj_1[2:3, ...], np.ones_like(plan_traj_1[2:3, ...]) * 1e-5) + plan_traj_1 = plan_traj_1.T + plan_traj_1 = np.stack((plan_traj_1[:-1], plan_traj_1[1:]), axis=1) + + plan_vecs_1 = None + for i in range(plan_traj_1.shape[0]): + plan_vec_i = plan_traj_1[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs_1 is None: + plan_vecs_1 = xy + else: + plan_vecs_1 = np.concatenate((plan_vecs_1, xy), axis=0) + + ##############2 + plan_traj_2 = lidar2img_rt @ plan_traj_2.T + plan_traj_2 = plan_traj_2[0:2, ...] 
/ np.maximum( + plan_traj_2[2:3, ...], np.ones_like(plan_traj_2[2:3, ...]) * 1e-5) + plan_traj_2 = plan_traj_2.T + plan_traj_2 = np.stack((plan_traj_2[:-1], plan_traj_2[1:]), axis=1) + + plan_vecs_2 = None + for i in range(plan_traj_2.shape[0]): + plan_vec_i = plan_traj_2[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs_2 is None: + plan_vecs_2 = xy + else: + plan_vecs_2 = np.concatenate((plan_vecs_2, xy), axis=0) + + ##############3 + plan_traj_3 = lidar2img_rt @ plan_traj_3.T + plan_traj_3 = plan_traj_3[0:2, ...] / np.maximum( + plan_traj_3[2:3, ...], np.ones_like(plan_traj_3[2:3, ...]) * 1e-5) + plan_traj_3 = plan_traj_3.T + plan_traj_3 = np.stack((plan_traj_3[:-1], plan_traj_3[1:]), axis=1) + + plan_vecs_3 = None + for i in range(plan_traj_3.shape[0]): + plan_vec_i = plan_traj_3[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs_3 is None: + plan_vecs_3 = xy + else: + plan_vecs_3 = np.concatenate((plan_vecs_3, xy), axis=0) + + ##############4 + plan_traj_4 = lidar2img_rt @ plan_traj_4.T + plan_traj_4 = plan_traj_4[0:2, ...] / np.maximum( + plan_traj_4[2:3, ...], np.ones_like(plan_traj_4[2:3, ...]) * 1e-5) + plan_traj_4 = plan_traj_4.T + plan_traj_4 = np.stack((plan_traj_4[:-1], plan_traj_4[1:]), axis=1) + + plan_vecs_4 = None + for i in range(plan_traj_4.shape[0]): + plan_vec_i = plan_traj_4[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs_4 is None: + plan_vecs_4 = xy + else: + plan_vecs_4 = np.concatenate((plan_vecs_4, xy), axis=0) + + cmap = 'summer' + cmap_1 = 'autumn' + cmap_2 = 'winter' + cmap_3 = 'spring' + + + y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(plan_vecs, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments) + + line_segments_1 = LineCollection(plan_vecs_1, colors=colors, linewidths=2, linestyles='solid', cmap=cmap_1) + ax.add_collection(line_segments_1) + line_segments_2 = LineCollection(plan_vecs_2, colors=colors, linewidths=2, linestyles='solid', cmap=cmap_2) + ax.add_collection(line_segments_2) + line_segments_3 = LineCollection(plan_vecs_3, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments_3) + line_segments_4 = LineCollection(plan_vecs_4, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments_4) + + ax.set_xlim(0, data.size[0]) + ax.set_ylim(data.size[1], 0) + ax.axis('off') + if out_path is not None: + savepath = osp.join(out_path, f'{cam}_PRED') + plt.savefig(savepath, bbox_inches='tight', dpi=200, pad_inches=0.0) + plt.close() + + # Load boxes and image. 
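
The five near-identical blocks above and below all follow one recipe: per-step planning offsets are accumulated into absolute waypoints, lifted to homogeneous coordinates near the ground plane (z = -1 in the lidar frame), pushed through the 4x4 lidar-to-image matrix, and the resulting pixel coordinates are densified into short segments for LineCollection. A minimal sketch of the offset-to-pixel part of that recipe, shown for illustration only (the function and argument names are not part of this diff):

import numpy as np

def project_plan_to_image(step_offsets, lidar2img, z_ground=-1.0, eps=1e-5):
    """Turn per-step (dx, dy) planning offsets into pixel-space waypoints."""
    waypoints = np.cumsum(np.asarray(step_offsets, dtype=float), axis=0)  # absolute (T, 2)
    ones = np.ones((waypoints.shape[0], 1))
    pts = np.concatenate([waypoints, z_ground * ones, ones], axis=1)      # homogeneous (T, 4)
    # prepend a start point just ahead of the ego origin, as the code above does
    pts = np.concatenate([np.array([[0.3, 0.0, z_ground, 1.0]]), pts], axis=0)
    cam = lidar2img @ pts.T                                               # (4, T + 1)
    uv = cam[:2] / np.maximum(cam[2:3], eps)                              # clamp depth at eps
    return uv.T                                                           # (T + 1, 2) pixel coords

# e.g. uv = project_plan_to_image(plan_offsets, lidar2img_rt)
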
+ data_path = osp.join(out_path, f'{cam}_PRED.png') + cam_img = cv2.imread(data_path) + lw = 6 + tf = max(lw - 3, 1) + w, h = cv2.getTextSize(cam, 0, fontScale=lw / 6, thickness=tf)[0] # text width, height + # color=(0, 0, 0) + txt_color=(255, 255, 255) + cv2.putText(cam_img, + cam, (10, h + 10), + 0, + lw / 6, + txt_color, + thickness=tf, + lineType=cv2.LINE_AA) + cam_imgs.append(cam_img) + else: + raise ValueError("Error: Unknown sensor modality!") + + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + cmd_list = ['Turn Right', 'Turn Left', 'Go Straight'] + plan_cmd_str = cmd_list[plan_cmd] + pred_img = cv2.copyMakeBorder(pred_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0) + # font + font = cv2.FONT_HERSHEY_SIMPLEX + # fontScale + fontScale = 1 + # Line thickness of 2 px + thickness = 3 + # org + org = (20, 40) + # Blue color in BGR + color = (0, 0, 0) + # Using cv2.putText() method + # pred_img = cv2.putText(pred_img, 'BEV', org, font, + # fontScale, color, thickness, cv2.LINE_AA) + # pred_img = cv2.putText(pred_img, plan_cmd_str, (20, 770), font, + # fontScale, color, thickness, cv2.LINE_AA) + + sample_img = pred_img + cam_img_top = cv2.hconcat([cam_imgs[0], cam_imgs[1], cam_imgs[2]]) + cam_img_down = cv2.hconcat([cam_imgs[3], cam_imgs[4], cam_imgs[5]]) + cam_img = cv2.vconcat([cam_img_top, cam_img_down]) + size = (2133, 800) + cam_img = cv2.resize(cam_img, size) + vis_img = cv2.hconcat([cam_img, sample_img]) + + video.write(vis_img) + + video.release() + cv2.destroyAllWindows() diff --git a/GenAD-main/tools/create_data.py b/GenAD-main/tools/create_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f2b0cc10f1fafa77a39cd8fbd9c1ac9386d2af72 --- /dev/null +++ b/GenAD-main/tools/create_data.py @@ -0,0 +1,305 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +from data_converter.create_gt_database import create_groundtruth_database +from data_converter import nuscenes_converter as nuscenes_converter +from data_converter import lyft_converter as lyft_converter +from data_converter import kitti_converter as kitti +from data_converter import indoor_converter as indoor +import argparse +from os import path as osp +import sys +sys.path.append('.') + + +def kitti_data_prep(root_path, info_prefix, version, out_dir): + """Prepare data related to Kitti dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + out_dir (str): Output directory of the groundtruth database info. 
+ """ + kitti.create_kitti_info_file(root_path, info_prefix) + kitti.create_reduced_point_cloud(root_path, info_prefix) + + info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') + info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') + info_trainval_path = osp.join(root_path, + f'{info_prefix}_infos_trainval.pkl') + info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') + kitti.export_2d_annotation(root_path, info_train_path) + kitti.export_2d_annotation(root_path, info_val_path) + kitti.export_2d_annotation(root_path, info_trainval_path) + kitti.export_2d_annotation(root_path, info_test_path) + + create_groundtruth_database( + 'KittiDataset', + root_path, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + mask_anno_path='instances_train.json', + with_mask=(version == 'mask')) + + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. Default: 10 + """ + nuscenes_converter.create_nuscenes_infos( + root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + if version == 'v1.0-test': + info_test_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_test.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_test_path, version=version) + else: + info_train_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_train.pkl') + info_val_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_val.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_train_path, version=version) + nuscenes_converter.export_2d_annotation( + root_path, info_val_path, version=version) + # create_groundtruth_database(dataset_name, root_path, info_prefix, + # f'{out_dir}/{info_prefix}_infos_train.pkl') + + +def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10): + """Prepare data related to Lyft dataset. + + Related data consists of '.pkl' files recording basic infos. + Although the ground truth database and 2D annotations are not used in + Lyft, it can also be generated like nuScenes. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + max_sweeps (int, optional): Number of input consecutive frames. + Defaults to 10. + """ + lyft_converter.create_lyft_infos( + root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + +def scannet_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for scannet dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def s3dis_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for s3dis dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. 
+ out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for sunrgbd dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def waymo_data_prep(root_path, + info_prefix, + version, + out_dir, + workers, + max_sweeps=5): + """Prepare the info file for waymo dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + max_sweeps (int): Number of input consecutive frames. Default: 5 \ + Here we store pose information of these frames for later use. + """ + from tools.data_converter import waymo_converter as waymo + + splits = ['training', 'validation', 'testing'] + + for i, split in enumerate(splits): + load_dir = osp.join(root_path, 'waymo_format', split) + if split == 'validation': + save_dir = osp.join(out_dir, 'kitti_format', 'training') + else: + save_dir = osp.join(out_dir, 'kitti_format', split) + converter = waymo.Waymo2KITTI( + load_dir, + save_dir, + prefix=str(i), + workers=workers, + test_mode=(split == 'test')) + converter.convert() + # Generate waymo infos + out_dir = osp.join(out_dir, 'kitti_format') + kitti.create_waymo_info_file(out_dir, info_prefix, max_sweeps=max_sweeps) + + create_groundtruth_database( + 'WaymoDataset', + out_dir, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + with_mask=False) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-path', + type=str, + default='./data/kitti', + help='specify the root path of dataset') +parser.add_argument( + '--canbus', + type=str, + default='./data', + help='specify the root path of nuScenes canbus') +parser.add_argument( + '--version', + type=str, + default='v1.0', + required=False, + help='specify the dataset version, no need for kitti') +parser.add_argument( + '--max-sweeps', + type=int, + default=10, + required=False, + help='specify sweeps of lidar per example') +parser.add_argument( + '--out-dir', + type=str, + default='./data/kitti', + required='False', + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='kitti') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + if args.dataset == 'kitti': + kitti_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir) + elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini': + train_version = f'{args.version}-trainval' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + 
info_prefix=args.extra_tag, + version=test_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': + train_version = f'{args.version}' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'lyft': + train_version = f'{args.version}-train' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=train_version, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=test_version, + max_sweeps=args.max_sweeps) + elif args.dataset == 'waymo': + waymo_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir, + workers=args.workers, + max_sweeps=args.max_sweeps) + elif args.dataset == 'scannet': + scannet_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 's3dis': + s3dis_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 'sunrgbd': + sunrgbd_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) diff --git a/GenAD-main/tools/data_converter/__init__.py b/GenAD-main/tools/data_converter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/GenAD-main/tools/data_converter/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/GenAD-main/tools/data_converter/create_gt_database.py b/GenAD-main/tools/data_converter/create_gt_database.py new file mode 100644 index 0000000000000000000000000000000000000000..7317cedd08377643018b7d4a72f7b5c96397b59c --- /dev/null +++ b/GenAD-main/tools/data_converter/create_gt_database.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
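
Before the ground-truth-database helper that follows: the create_data.py entry point above calls parser.parse_args() at module level, so it is meant to be launched from the command line rather than imported. The nuScenes conversion it wraps can also be driven directly from Python; a minimal sketch, assuming the working directory is GenAD-main with tools/ added to sys.path and the default ./data/nuscenes layout. The 'vad_nuscenes' prefix is only a placeholder:

import sys
sys.path.append('tools')  # make the data_converter package importable (assumed checkout layout)
from data_converter import vad_nuscenes_converter as converter

converter.create_nuscenes_infos(
    root_path='./data/nuscenes',    # nuScenes dataset root (assumed layout)
    out_path='./data/nuscenes',     # where *_infos_temporal_{train,val}.pkl are written
    can_bus_root_path='./data',     # directory containing the CAN bus expansion
    info_prefix='vad_nuscenes',     # placeholder prefix for the generated info files
    version='v1.0-trainval',
    max_sweeps=10,
)
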
+import mmcv +import numpy as np +import pickle +from mmcv import track_iter_progress +from mmcv.ops import roi_align +from os import path as osp +from pycocotools import mask as maskUtils +from pycocotools.coco import COCO + +from mmdet3d.core.bbox import box_np_ops as box_np_ops +from mmdet3d.datasets import build_dataset +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps + + +def _poly2mask(mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + +def _parse_coco_ann_info(ann_info): + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + if ann['area'] <= 0: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_masks_ann.append(ann['segmentation']) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict( + bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann) + + return ann + + +def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks): + import torch + from torch.nn.modules.utils import _pair + device = pos_proposals.device + num_pos = pos_proposals.size(0) + fake_inds = ( + torch.arange(num_pos, + device=device).to(dtype=pos_proposals.dtype)[:, None]) + rois = torch.cat([fake_inds, pos_proposals], dim=1) # Nx5 + mask_size = _pair(28) + rois = rois.to(device=device) + gt_masks_th = ( + torch.from_numpy(gt_masks).to(device).index_select( + 0, pos_assigned_gt_inds).to(dtype=rois.dtype)) + # Use RoIAlign could apparently accelerate the training (~0.1s/iter) + targets = ( + roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1)) + return targets + + +def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img): + num_pos = pos_proposals.shape[0] + masks = [] + img_patches = [] + for i in range(num_pos): + gt_mask = gt_masks[pos_assigned_gt_inds[i]] + bbox = pos_proposals[i, :].astype(np.int32) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1 + 1, 1) + h = np.maximum(y2 - y1 + 1, 1) + + mask_patch = gt_mask[y1:y1 + h, x1:x1 + w] + masked_img = gt_mask[..., None] * org_img + img_patch = masked_img[y1:y1 + h, x1:x1 + w] + + img_patches.append(img_patch) + masks.append(mask_patch) + return img_patches, masks + + +def create_groundtruth_database(dataset_class_name, + data_path, + info_prefix, + info_path=None, + mask_anno_path=None, + used_classes=None, + database_save_path=None, + db_info_save_path=None, + relative_path=True, + add_rgb=False, + lidar_only=False, + bev_only=False, + coors_range=None, + with_mask=False): + """Given the raw data, generate the ground truth database. + + Args: + dataset_class_name (str): Name of the input dataset. + data_path (str): Path of the data. 
+ info_prefix (str): Prefix of the info file. + info_path (str): Path of the info file. + Default: None. + mask_anno_path (str): Path of the mask_anno. + Default: None. + used_classes (list[str]): Classes have been used. + Default: None. + database_save_path (str): Path to save database. + Default: None. + db_info_save_path (str): Path to save db_info. + Default: None. + relative_path (bool): Whether to use relative path. + Default: True. + with_mask (bool): Whether to use mask. + Default: False. + """ + print(f'Create GT Database of {dataset_class_name}') + dataset_cfg = dict( + type=dataset_class_name, data_root=data_path, ann_file=info_path) + if dataset_class_name == 'KittiDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=with_mask, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + elif dataset_class_name == 'NuScenesDataset': + dataset_cfg.update( + use_valid_flag=True, + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True) + ]) + + elif dataset_class_name == 'WaymoDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + dataset = build_dataset(dataset_cfg) + + if database_save_path is None: + database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') + if db_info_save_path is None: + db_info_save_path = osp.join(data_path, + f'{info_prefix}_dbinfos_train.pkl') + mmcv.mkdir_or_exist(database_save_path) + all_db_infos = dict() + if with_mask: + coco = COCO(osp.join(data_path, mask_anno_path)) + imgIds = coco.getImgIds() + file2id = dict() + for i in imgIds: + info = coco.loadImgs([i])[0] + file2id.update({info['file_name']: i}) + + group_counter = 0 + for j in track_iter_progress(list(range(len(dataset)))): + input_dict = dataset.get_data_info(j) + dataset.pre_pipeline(input_dict) + example = dataset.pipeline(input_dict) + annos = example['ann_info'] + image_idx = example['sample_idx'] + points = example['points'].tensor.numpy() + gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() + names = annos['gt_names'] + group_dict = dict() + if 'group_ids' in annos: + group_ids = annos['group_ids'] + else: + group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) + difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) + if 'difficulty' in annos: + difficulty = annos['difficulty'] + + num_obj = gt_boxes_3d.shape[0] + point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) + + if with_mask: + # prepare masks + gt_boxes = annos['gt_bboxes'] + img_path = osp.split(example['img_info']['filename'])[-1] + if img_path not in 
file2id.keys(): + print(f'skip image {img_path} for empty mask') + continue + img_id = file2id[img_path] + kins_annIds = coco.getAnnIds(imgIds=img_id) + kins_raw_info = coco.loadAnns(kins_annIds) + kins_ann_info = _parse_coco_ann_info(kins_raw_info) + h, w = annos['img_shape'][:2] + gt_masks = [ + _poly2mask(mask, h, w) for mask in kins_ann_info['masks'] + ] + # get mask inds based on iou mapping + bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes) + mask_inds = bbox_iou.argmax(axis=0) + valid_inds = (bbox_iou.max(axis=0) > 0.5) + + # mask the image + # use more precise crop when it is ready + # object_img_patches = np.ascontiguousarray( + # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2)) + # crop image patches using roi_align + # object_img_patches = crop_image_patch_v2( + # torch.Tensor(gt_boxes), + # torch.Tensor(mask_inds).long(), object_img_patches) + object_img_patches, object_masks = crop_image_patch( + gt_boxes, gt_masks, mask_inds, annos['img']) + + for i in range(num_obj): + filename = f'{image_idx}_{names[i]}_{i}.bin' + abs_filepath = osp.join(database_save_path, filename) + rel_filepath = osp.join(f'{info_prefix}_gt_database', filename) + + # save point clouds and image patches for each object + gt_points = points[point_indices[:, i]] + gt_points[:, :3] -= gt_boxes_3d[i, :3] + + if with_mask: + if object_masks[i].sum() == 0 or not valid_inds[i]: + # Skip object for empty or invalid mask + continue + img_patch_path = abs_filepath + '.png' + mask_patch_path = abs_filepath + '.mask.png' + mmcv.imwrite(object_img_patches[i], img_patch_path) + mmcv.imwrite(object_masks[i], mask_patch_path) + + with open(abs_filepath, 'w') as f: + gt_points.tofile(f) + + if (used_classes is None) or names[i] in used_classes: + db_info = { + 'name': names[i], + 'path': rel_filepath, + 'image_idx': image_idx, + 'gt_idx': i, + 'box3d_lidar': gt_boxes_3d[i], + 'num_points_in_gt': gt_points.shape[0], + 'difficulty': difficulty[i], + } + local_group_id = group_ids[i] + # if local_group_id >= 0: + if local_group_id not in group_dict: + group_dict[local_group_id] = group_counter + group_counter += 1 + db_info['group_id'] = group_dict[local_group_id] + if 'score' in annos: + db_info['score'] = annos['score'][i] + if with_mask: + db_info.update({'box2d_camera': gt_boxes[i]}) + if names[i] in all_db_infos: + all_db_infos[names[i]].append(db_info) + else: + all_db_infos[names[i]] = [db_info] + + for k, v in all_db_infos.items(): + print(f'load {len(v)} {k} database infos') + + with open(db_info_save_path, 'wb') as f: + pickle.dump(all_db_infos, f) diff --git a/GenAD-main/tools/data_converter/vad_nuscenes_converter.py b/GenAD-main/tools/data_converter/vad_nuscenes_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..338051cbc544f6860fc3ad6296b1271b037d1bd5 --- /dev/null +++ b/GenAD-main/tools/data_converter/vad_nuscenes_converter.py @@ -0,0 +1,1005 @@ +import os +import math +import copy +import argparse +from os import path as osp +from collections import OrderedDict +from typing import List, Tuple, Union + +import mmcv +import numpy as np +from pyquaternion import Quaternion +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.data_classes import Box +from shapely.geometry import MultiPoint, box +from mmdet3d.datasets import NuScenesDataset +from nuscenes.utils.geometry_utils import view_points +from mmdet3d.core.bbox.box_np_ops import points_cam2img +from nuscenes.utils.geometry_utils import transform_matrix + + +nus_categories = ('car', 'truck', 
'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + +nus_attributes = ('cycle.with_rider', 'cycle.without_rider', + 'pedestrian.moving', 'pedestrian.standing', + 'pedestrian.sitting_lying_down', 'vehicle.moving', + 'vehicle.parked', 'vehicle.stopped', 'None') + +ego_width, ego_length = 1.85, 4.084 + +def quart_to_rpy(qua): + x, y, z, w = qua + roll = math.atan2(2 * (w * x + y * z), 1 - 2 * (x * x + y * y)) + pitch = math.asin(2 * (w * y - x * z)) + yaw = math.atan2(2 * (w * z + x * y), 1 - 2 * (z * z + y * y)) + return roll, pitch, yaw + +def locate_message(utimes, utime): + i = np.searchsorted(utimes, utime) + if i == len(utimes) or (i > 0 and utime - utimes[i-1] < utimes[i] - utime): + i -= 1 + return i + + +def create_nuscenes_infos(root_path, + out_path, + can_bus_root_path, + info_prefix, + version='v1.0-trainval', + max_sweeps=10): + """Create info file of nuscene dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + root_path (str): Path of the data root. + info_prefix (str): Prefix of the info file to be generated. + version (str): Version of the data. + Default: 'v1.0-trainval' + max_sweeps (int): Max number of sweeps. + Default: 10 + """ + from nuscenes.nuscenes import NuScenes + from nuscenes.can_bus.can_bus_api import NuScenesCanBus + print(version, root_path) + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path) + from nuscenes.utils import splits + available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] + assert version in available_vers + if version == 'v1.0-trainval': + train_scenes = splits.train + val_scenes = splits.val + elif version == 'v1.0-test': + train_scenes = splits.test + val_scenes = [] + elif version == 'v1.0-mini': + train_scenes = splits.mini_train + val_scenes = splits.mini_val + else: + raise ValueError('unknown') + + # filter existing scenes. 
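
For orientation before the scene filtering below: the files this function finally writes (via mmcv.dump further down) are plain pickled dicts with an 'infos' list, one entry per key frame, and a small 'metadata' dict. A hedged sketch of loading one back; the path and prefix are placeholders:

import mmcv

data = mmcv.load('./data/nuscenes/vad_nuscenes_infos_temporal_train.pkl')  # placeholder path
print(data['metadata']['version'])      # e.g. 'v1.0-trainval'
print(len(data['infos']))               # number of key-frame samples
info = data['infos'][0]
print(info['token'], info['lidar_path'])
print(info['gt_ego_fut_cmd'])           # one-hot drive command: [right, left, straight]
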
+ available_scenes = get_available_scenes(nusc) + available_scene_names = [s['name'] for s in available_scenes] + train_scenes = list( + filter(lambda x: x in available_scene_names, train_scenes)) + val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) + train_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in train_scenes + ]) + val_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in val_scenes + ]) + + test = 'test' in version + if test: + print('test scene: {}'.format(len(train_scenes))) + else: + print('train scene: {}, val scene: {}'.format( + len(train_scenes), len(val_scenes))) + + train_nusc_infos, val_nusc_infos = _fill_trainval_infos( + nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps) + + metadata = dict(version=version) + if test: + print('test sample: {}'.format(len(train_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_test.pkl'.format(info_prefix)) + mmcv.dump(data, info_path) + else: + print('train sample: {}, val sample: {}'.format( + len(train_nusc_infos), len(val_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_train.pkl'.format(info_prefix)) + mmcv.dump(data, info_path) + data['infos'] = val_nusc_infos + info_val_path = osp.join(out_path, + '{}_infos_temporal_val.pkl'.format(info_prefix)) + mmcv.dump(data, info_val_path) + + +def get_available_scenes(nusc): + """Get available scenes from the input nuscenes class. + + Given the raw data, get the information of available scenes for + further info generation. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + + Returns: + available_scenes (list[dict]): List of basic information for the + available scenes. + """ + available_scenes = [] + print('total scene num: {}'.format(len(nusc.scene))) + for scene in nusc.scene: + scene_token = scene['token'] + scene_rec = nusc.get('scene', scene_token) + sample_rec = nusc.get('sample', scene_rec['first_sample_token']) + sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + has_more_frames = True + scene_not_exist = False + while has_more_frames: + lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) + lidar_path = str(lidar_path) + if os.getcwd() in lidar_path: + # path from lyftdataset is absolute path + lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] + # relative path + if not mmcv.is_filepath(lidar_path): + scene_not_exist = True + break + else: + break + if scene_not_exist: + continue + available_scenes.append(scene) + print('exist scene num: {}'.format(len(available_scenes))) + return available_scenes + + +def _get_can_bus_info(nusc, nusc_can_bus, sample): + scene_name = nusc.get('scene', sample['scene_token'])['name'] + sample_timestamp = sample['timestamp'] + try: + pose_list = nusc_can_bus.get_messages(scene_name, 'pose') + except: + return np.zeros(18) # server scenes do not have can bus information. 
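
The zero-vector fallback above has the same length as the feature assembled next: a CAN-bus 'pose' record contributes position (3) and an orientation quaternion (4), the remaining pose fields (accel, rotation_rate, vel; 3 values each) bring the count to 16, and two trailing placeholder slots are appended, giving 18. A compact restatement for a single pose message; the explicit field ordering is an assumption about the devkit's pose schema:

import numpy as np

def pack_can_bus(pose_msg):
    """Flatten one CAN-bus pose message into the 18-value layout used below."""
    vec = []
    vec.extend(pose_msg['pos'])            # 3: global x, y, z
    vec.extend(pose_msg['orientation'])    # 4: quaternion
    vec.extend(pose_msg['accel'])          # 3
    vec.extend(pose_msg['rotation_rate'])  # 3
    vec.extend(pose_msg['vel'])            # 3
    vec.extend([0.0, 0.0])                 # 2 trailing placeholder slots
    return np.array(vec)                   # 18 values, matching np.zeros(18) above
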
+ can_bus = [] + # during each scene, the first timestamp of can_bus may be large than the first sample's timestamp + last_pose = pose_list[0] + for i, pose in enumerate(pose_list): + if pose['utime'] > sample_timestamp: + break + last_pose = pose + _ = last_pose.pop('utime') # useless + pos = last_pose.pop('pos') + rotation = last_pose.pop('orientation') + can_bus.extend(pos) + can_bus.extend(rotation) + for key in last_pose.keys(): + can_bus.extend(pose[key]) # 16 elements + can_bus.extend([0., 0.]) + return np.array(can_bus) + + +def _fill_trainval_infos(nusc, + nusc_can_bus, + train_scenes, + val_scenes, + test=False, + max_sweeps=10, + fut_ts=6, + his_ts=2): + """Generate the train/val infos from the raw data. + + Args: + nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. + train_scenes (list[str]): Basic information of training scenes. + val_scenes (list[str]): Basic information of validation scenes. + test (bool): Whether use the test mode. In the test mode, no + annotations can be accessed. Default: False. + max_sweeps (int): Max number of sweeps. Default: 10. + + Returns: + tuple[list[dict]]: Information of training set and validation set + that will be saved to the info file. + """ + train_nusc_infos = [] + val_nusc_infos = [] + frame_idx = 0 + cat2idx = {} + for idx, dic in enumerate(nusc.category): + cat2idx[dic['name']] = idx + + for sample in mmcv.track_iter_progress(nusc.sample): + map_location = nusc.get('log', nusc.get('scene', sample['scene_token'])['log_token'])['location'] + lidar_token = sample['data']['LIDAR_TOP'] + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + if sample['prev'] != '': + sample_prev = nusc.get('sample', sample['prev']) + sd_rec_prev = nusc.get('sample_data', sample_prev['data']['LIDAR_TOP']) + pose_record_prev = nusc.get('ego_pose', sd_rec_prev['ego_pose_token']) + else: + pose_record_prev = None + if sample['next'] != '': + sample_next = nusc.get('sample', sample['next']) + sd_rec_next = nusc.get('sample_data', sample_next['data']['LIDAR_TOP']) + pose_record_next = nusc.get('ego_pose', sd_rec_next['ego_pose_token']) + else: + pose_record_next = None + + lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) + + mmcv.check_file_exist(lidar_path) + can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample) + fut_valid_flag = True + test_sample = copy.deepcopy(sample) + for i in range(fut_ts): + if test_sample['next'] != '': + test_sample = nusc.get('sample', test_sample['next']) + else: + fut_valid_flag = False + ## + info = { + 'lidar_path': lidar_path, + 'token': sample['token'], + 'prev': sample['prev'], + 'next': sample['next'], + 'can_bus': can_bus, + 'frame_idx': frame_idx, # temporal related info + 'sweeps': [], + 'cams': dict(), + 'scene_token': sample['scene_token'], # temporal related info + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sample['timestamp'], + 'fut_valid_flag': fut_valid_flag, + 'map_location': map_location + } + + if sample['next'] == '': + frame_idx = 0 + else: + frame_idx += 1 + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + 
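
These rotation matrices feed obtain_sensor2top below, whose R/T expressions are the row-vector ("points @ R.T + T") form of chaining sensor -> its own ego frame -> global -> the key frame's ego -> lidar. The same result can be cross-checked with 4x4 homogeneous matrices; a sketch for verification only, with illustrative helper names (nuScenes stores quaternions as [w, x, y, z], which is what pyquaternion expects):

import numpy as np
from pyquaternion import Quaternion

def rt(rotation, translation):
    """4x4 rigid transform from a [w, x, y, z] quaternion and a translation."""
    m = np.eye(4)
    m[:3, :3] = Quaternion(rotation).rotation_matrix
    m[:3, 3] = translation
    return m

def sensor2lidar(cs_sensor, pose_sensor, cs_lidar, pose_lidar):
    """T_lidar<-sensor = inv(T_ego_key<-lidar) @ inv(T_global<-ego_key) @ T_global<-ego_sweep @ T_ego_sweep<-sensor."""
    t = (np.linalg.inv(rt(cs_lidar['rotation'], cs_lidar['translation'])) @
         np.linalg.inv(rt(pose_lidar['rotation'], pose_lidar['translation'])) @
         rt(pose_sensor['rotation'], pose_sensor['translation']) @
         rt(cs_sensor['rotation'], cs_sensor['translation']))
    # t[:3, :3] and t[:3, 3] should agree, up to numerical noise, with the
    # sensor2lidar_rotation / sensor2lidar_translation produced by obtain_sensor2top.
    return t[:3, :3], t[:3, 3]
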
e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + # obtain 6 image's information per frame + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + for cam in camera_types: + cam_token = sample['data'][cam] + cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token) + cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat, + e2g_t, e2g_r_mat, cam) + cam_info.update(cam_intrinsic=cam_intrinsic) + info['cams'].update({cam: cam_info}) + + # obtain sweeps for a single key-frame + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + sweeps = [] + while len(sweeps) < max_sweeps: + if not sd_rec['prev'] == '': + sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t, + l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') + sweeps.append(sweep) + sd_rec = nusc.get('sample_data', sd_rec['prev']) + else: + break + info['sweeps'] = sweeps + # obtain annotation + if not test: + annotations = [ + nusc.get('sample_annotation', token) + for token in sample['anns'] + ] + locs = np.array([b.center for b in boxes]).reshape(-1, 3) + dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) + rots = np.array([b.orientation.yaw_pitch_roll[0] + for b in boxes]).reshape(-1, 1) + velocity = np.array( + [nusc.box_velocity(token)[:2] for token in sample['anns']]) + valid_flag = np.array( + [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0 + for anno in annotations], + dtype=bool).reshape(-1) + # convert velo from global to lidar + for i in range(len(boxes)): + velo = np.array([*velocity[i], 0.0]) + velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv( + l2e_r_mat).T + velocity[i] = velo[:2] + + names = [b.name for b in boxes] + for i in range(len(names)): + if names[i] in NuScenesDataset.NameMapping: + names[i] = NuScenesDataset.NameMapping[names[i]] + names = np.array(names) + # we need to convert rot to SECOND format. + gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1) + assert len(gt_boxes) == len( + annotations), f'{len(gt_boxes)}, {len(annotations)}' + + # get future coords for each box + # [num_box, fut_ts*2] + num_box = len(boxes) + gt_fut_trajs = np.zeros((num_box, fut_ts, 2)) + gt_fut_yaw = np.zeros((num_box, fut_ts)) + gt_fut_masks = np.zeros((num_box, fut_ts)) + gt_boxes_yaw = -(gt_boxes[:,6] + np.pi / 2) + # agent lcf feat (x, y, yaw, vx, vy, width, length, height, type) + agent_lcf_feat = np.zeros((num_box, 9)) + gt_fut_goal = np.zeros((num_box)) + for i, anno in enumerate(annotations): + cur_box = boxes[i] + cur_anno = anno + agent_lcf_feat[i, 0:2] = cur_box.center[:2] + agent_lcf_feat[i, 2] = gt_boxes_yaw[i] + agent_lcf_feat[i, 3:5] = velocity[i] + agent_lcf_feat[i, 5:8] = anno['size'] # width,length,height + agent_lcf_feat[i, 8] = cat2idx[anno['category_name']] if anno['category_name'] in cat2idx.keys() else -1 + for j in range(fut_ts): + if cur_anno['next'] != '': + anno_next = nusc.get('sample_annotation', cur_anno['next']) + box_next = Box( + anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation']) + ) + # Move box to ego vehicle coord system. + box_next.translate(-np.array(pose_record['translation'])) + box_next.rotate(Quaternion(pose_record['rotation']).inverse) + # Move box to sensor coord system. 
+ box_next.translate(-np.array(cs_record['translation'])) + box_next.rotate(Quaternion(cs_record['rotation']).inverse) + gt_fut_trajs[i, j] = box_next.center[:2] - cur_box.center[:2] + gt_fut_masks[i, j] = 1 + # add yaw diff + _, _, box_yaw = quart_to_rpy([cur_box.orientation.x, cur_box.orientation.y, + cur_box.orientation.z, cur_box.orientation.w]) + _, _, box_yaw_next = quart_to_rpy([box_next.orientation.x, box_next.orientation.y, + box_next.orientation.z, box_next.orientation.w]) + gt_fut_yaw[i, j] = box_yaw_next - box_yaw + cur_anno = anno_next + cur_box = box_next + else: + gt_fut_trajs[i, j:] = 0 + break + # get agent goal + gt_fut_coords = np.cumsum(gt_fut_trajs[i], axis=-2) + coord_diff = gt_fut_coords[-1] - gt_fut_coords[0] + if coord_diff.max() < 1.0: # static + gt_fut_goal[i] = 9 + else: + box_mot_yaw = np.arctan2(coord_diff[1], coord_diff[0]) + np.pi + gt_fut_goal[i] = box_mot_yaw // (np.pi / 4) # 0-8: goal direction class + + # get ego history traj (offset format) + ego_his_trajs = np.zeros((his_ts+1, 3)) + ego_his_trajs_diff = np.zeros((his_ts+1, 3)) + sample_cur = sample + for i in range(his_ts, -1, -1): + if sample_cur is not None: + pose_mat = get_global_sensor_pose(sample_cur, nusc, inverse=False) + ego_his_trajs[i] = pose_mat[:3, 3] + has_prev = sample_cur['prev'] != '' + has_next = sample_cur['next'] != '' + if has_next: + sample_next = nusc.get('sample', sample_cur['next']) + pose_mat_next = get_global_sensor_pose(sample_next, nusc, inverse=False) + ego_his_trajs_diff[i] = pose_mat_next[:3, 3] - ego_his_trajs[i] + sample_cur = nusc.get('sample', sample_cur['prev']) if has_prev else None + else: + ego_his_trajs[i] = ego_his_trajs[i+1] - ego_his_trajs_diff[i+1] + ego_his_trajs_diff[i] = ego_his_trajs_diff[i+1] + + # global to ego at lcf + ego_his_trajs = ego_his_trajs - np.array(pose_record['translation']) + rot_mat = Quaternion(pose_record['rotation']).inverse.rotation_matrix + ego_his_trajs = np.dot(rot_mat, ego_his_trajs.T).T + # ego to lidar at lcf + ego_his_trajs = ego_his_trajs - np.array(cs_record['translation']) + rot_mat = Quaternion(cs_record['rotation']).inverse.rotation_matrix + ego_his_trajs = np.dot(rot_mat, ego_his_trajs.T).T + ego_his_trajs = ego_his_trajs[1:] - ego_his_trajs[:-1] + + # get ego futute traj (offset format) + ego_fut_trajs = np.zeros((fut_ts+1, 3)) + ego_fut_masks = np.zeros((fut_ts+1)) + sample_cur = sample + for i in range(fut_ts+1): + pose_mat = get_global_sensor_pose(sample_cur, nusc, inverse=False) + ego_fut_trajs[i] = pose_mat[:3, 3] + ego_fut_masks[i] = 1 + if sample_cur['next'] == '': + ego_fut_trajs[i+1:] = ego_fut_trajs[i] + break + else: + sample_cur = nusc.get('sample', sample_cur['next']) + # global to ego at lcf + ego_fut_trajs = ego_fut_trajs - np.array(pose_record['translation']) + rot_mat = Quaternion(pose_record['rotation']).inverse.rotation_matrix + ego_fut_trajs = np.dot(rot_mat, ego_fut_trajs.T).T + # ego to lidar at lcf + ego_fut_trajs = ego_fut_trajs - np.array(cs_record['translation']) + rot_mat = Quaternion(cs_record['rotation']).inverse.rotation_matrix + ego_fut_trajs = np.dot(rot_mat, ego_fut_trajs.T).T + # drive command according to final fut step offset from lcf + if ego_fut_trajs[-1][0] >= 2: + command = np.array([1, 0, 0]) # Turn Right + elif ego_fut_trajs[-1][0] <= -2: + command = np.array([0, 1, 0]) # Turn Left + else: + command = np.array([0, 0, 1]) # Go Straight + # offset from lcf -> per-step offset + ego_fut_trajs = ego_fut_trajs[1:] - ego_fut_trajs[:-1] + + ### ego lcf feat (vx, vy, ax, ay, w, length, 
width, vel, steer), w: yaw角速度 + ego_lcf_feat = np.zeros(9) + # 根据odom推算自车速度及加速度 + _, _, ego_yaw = quart_to_rpy(pose_record['rotation']) + ego_pos = np.array(pose_record['translation']) + if pose_record_prev is not None: + _, _, ego_yaw_prev = quart_to_rpy(pose_record_prev['rotation']) + ego_pos_prev = np.array(pose_record_prev['translation']) + if pose_record_next is not None: + _, _, ego_yaw_next = quart_to_rpy(pose_record_next['rotation']) + ego_pos_next = np.array(pose_record_next['translation']) + assert (pose_record_prev is not None) or (pose_record_next is not None), 'prev token and next token all empty' + if pose_record_prev is not None: + ego_w = (ego_yaw - ego_yaw_prev) / 0.5 + ego_v = np.linalg.norm(ego_pos[:2] - ego_pos_prev[:2]) / 0.5 + ego_vx, ego_vy = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2) + else: + ego_w = (ego_yaw_next - ego_yaw) / 0.5 + ego_v = np.linalg.norm(ego_pos_next[:2] - ego_pos[:2]) / 0.5 + ego_vx, ego_vy = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2) + + ref_scene = nusc.get("scene", sample['scene_token']) + try: + pose_msgs = nusc_can_bus.get_messages(ref_scene['name'],'pose') + steer_msgs = nusc_can_bus.get_messages(ref_scene['name'], 'steeranglefeedback') + pose_uts = [msg['utime'] for msg in pose_msgs] + steer_uts = [msg['utime'] for msg in steer_msgs] + ref_utime = sample['timestamp'] + pose_index = locate_message(pose_uts, ref_utime) + pose_data = pose_msgs[pose_index] + steer_index = locate_message(steer_uts, ref_utime) + steer_data = steer_msgs[steer_index] + # initial speed + v0 = pose_data["vel"][0] # [0] means longitudinal velocity m/s + # curvature (positive: turn left) + steering = steer_data["value"] + # flip x axis if in left-hand traffic (singapore) + flip_flag = True if map_location.startswith('singapore') else False + if flip_flag: + steering *= -1 + Kappa = 2 * steering / 2.588 + except: + delta_x = ego_his_trajs[-1, 0] + ego_fut_trajs[0, 0] + delta_y = ego_his_trajs[-1, 1] + ego_fut_trajs[0, 1] + v0 = np.sqrt(delta_x**2 + delta_y**2) + Kappa = 0 + + ego_lcf_feat[:2] = np.array([ego_vx, ego_vy]) #can_bus[13:15] + ego_lcf_feat[2:4] = can_bus[7:9] + ego_lcf_feat[4] = ego_w #can_bus[12] + ego_lcf_feat[5:7] = np.array([ego_length, ego_width]) + ego_lcf_feat[7] = v0 + ego_lcf_feat[8] = Kappa + + info['gt_boxes'] = gt_boxes + info['gt_names'] = names + info['gt_velocity'] = velocity.reshape(-1, 2) + info['num_lidar_pts'] = np.array( + [a['num_lidar_pts'] for a in annotations]) + info['num_radar_pts'] = np.array( + [a['num_radar_pts'] for a in annotations]) + info['valid_flag'] = valid_flag + info['gt_agent_fut_trajs'] = gt_fut_trajs.reshape(-1, fut_ts*2).astype(np.float32) + info['gt_agent_fut_masks'] = gt_fut_masks.reshape(-1, fut_ts).astype(np.float32) + info['gt_agent_lcf_feat'] = agent_lcf_feat.astype(np.float32) + info['gt_agent_fut_yaw'] = gt_fut_yaw.astype(np.float32) + info['gt_agent_fut_goal'] = gt_fut_goal.astype(np.float32) + info['gt_ego_his_trajs'] = ego_his_trajs[:, :2].astype(np.float32) + info['gt_ego_fut_trajs'] = ego_fut_trajs[:, :2].astype(np.float32) + info['gt_ego_fut_masks'] = ego_fut_masks[1:].astype(np.float32) + info['gt_ego_fut_cmd'] = command.astype(np.float32) + info['gt_ego_lcf_feat'] = ego_lcf_feat.astype(np.float32) + + if sample['scene_token'] in train_scenes: + train_nusc_infos.append(info) + else: + val_nusc_infos.append(info) + + return train_nusc_infos, val_nusc_infos + +def get_global_sensor_pose(rec, nusc, inverse=False): + lidar_sample_data = 
nusc.get('sample_data', rec['data']['LIDAR_TOP']) + + sd_ep = nusc.get("ego_pose", lidar_sample_data["ego_pose_token"]) + sd_cs = nusc.get("calibrated_sensor", lidar_sample_data["calibrated_sensor_token"]) + if inverse is False: + global_from_ego = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=False) + ego_from_sensor = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=False) + pose = global_from_ego.dot(ego_from_sensor) + # translation equivalent writing + # pose_translation = np.array(sd_cs["translation"]) + # rot_mat = Quaternion(sd_ep['rotation']).rotation_matrix + # pose_translation = np.dot(rot_mat, pose_translation) + # # pose_translation = pose[:3, 3] + # pose_translation = pose_translation + np.array(sd_ep["translation"]) + else: + sensor_from_ego = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=True) + ego_from_global = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=True) + pose = sensor_from_ego.dot(ego_from_global) + return pose + +def obtain_sensor2top(nusc, + sensor_token, + l2e_t, + l2e_r_mat, + e2g_t, + e2g_r_mat, + sensor_type='lidar'): + """Obtain the info with RT matric from general sensor to Top LiDAR. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + sensor_token (str): Sample data token corresponding to the + specific sensor type. + l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3). + l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego + in shape (3, 3). + e2g_t (np.ndarray): Translation from ego to global in shape (1, 3). + e2g_r_mat (np.ndarray): Rotation matrix from ego to global + in shape (3, 3). + sensor_type (str): Sensor to calibrate. Default: 'lidar'. + + Returns: + sweep (dict): Sweep information after transformation. + """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T + sweep['sensor2lidar_translation'] = T + return sweep + + +def export_2d_annotation(root_path, info_path, version, mono3d=False): + """Export 2d annotation from the info file and raw data. 
+ + Args: + root_path (str): Root path of the raw data. + info_path (str): Path of the info file. + version (str): Dataset version. + mono3d (bool): Whether to export mono3d annotation. Default: False. + """ + # get bbox annotations for camera + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + nusc_infos = mmcv.load(info_path)['infos'] + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + # info_2d_list = [] + cat2Ids = [ + dict(id=nus_categories.index(cat_name), name=cat_name) + for cat_name in nus_categories + ] + coco_ann_id = 0 + coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) + for info in mmcv.track_iter_progress(nusc_infos): + for cam in camera_types: + cam_info = info['cams'][cam] + coco_infos = get_2d_boxes( + nusc, + cam_info['sample_data_token'], + visibilities=['', '1', '2', '3', '4'], + mono3d=mono3d) + (height, width, _) = mmcv.imread(cam_info['data_path']).shape + coco_2d_dict['images'].append( + dict( + file_name=cam_info['data_path'].split('data/nuscenes/') + [-1], + id=cam_info['sample_data_token'], + token=info['token'], + cam2ego_rotation=cam_info['sensor2ego_rotation'], + cam2ego_translation=cam_info['sensor2ego_translation'], + ego2global_rotation=info['ego2global_rotation'], + ego2global_translation=info['ego2global_translation'], + cam_intrinsic=cam_info['cam_intrinsic'], + width=width, + height=height)) + for coco_info in coco_infos: + if coco_info is None: + continue + # add an empty key for coco format + coco_info['segmentation'] = [] + coco_info['id'] = coco_ann_id + coco_2d_dict['annotations'].append(coco_info) + coco_ann_id += 1 + if mono3d: + json_prefix = f'{info_path[:-4]}_mono3d' + else: + json_prefix = f'{info_path[:-4]}' + mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json') + + +def get_2d_boxes(nusc, + sample_data_token: str, + visibilities: List[str], + mono3d=True): + """Get the 2D annotation records for a given `sample_data_token`. + + Args: + sample_data_token (str): Sample data token belonging to a camera \ + keyframe. + visibilities (list[str]): Visibility filter. + mono3d (bool): Whether to get boxes with mono3d annotation. + + Return: + list[dict]: List of 2D annotation record that belongs to the input + `sample_data_token`. + """ + + # Get the sample data and the sample corresponding to that sample data. + sd_rec = nusc.get('sample_data', sample_data_token) + + assert sd_rec[ + 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \ + ' for camera sample_data!' + if not sd_rec['is_key_frame']: + raise ValueError( + 'The 2D re-projections are available only for keyframes.') + + s_rec = nusc.get('sample', sd_rec['sample_token']) + + # Get the calibrated sensor and ego pose + # record to get the transformation matrices. + cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) + pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token']) + camera_intrinsic = np.array(cs_rec['camera_intrinsic']) + + # Get all the annotation with the specified visibilties. + ann_recs = [ + nusc.get('sample_annotation', token) for token in s_rec['anns'] + ] + ann_recs = [ + ann_rec for ann_rec in ann_recs + if (ann_rec['visibility_token'] in visibilities) + ] + + repro_recs = [] + + for ann_rec in ann_recs: + # Augment sample_annotation with token information. + ann_rec['sample_annotation_token'] = ann_rec['token'] + ann_rec['sample_data_token'] = sample_data_token + + # Get the box in global coordinates. 
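
Each annotation below is walked from global coordinates into the ego frame, then the camera frame, projected with view_points, and finally clipped to the image canvas by post_process_coords (defined after this function). A toy illustration of that last clipping step; the corner values are made up:

corners = [(-50.0, 100.0), (200.0, 80.0), (180.0, 400.0), (-20.0, 420.0)]
print(post_process_coords(corners))  # hull clipped to the 1600x900 canvas -> roughly (0, 80, 200, 418)
print(post_process_coords([(1700.0, 950.0), (1800.0, 990.0), (1750.0, 1000.0)]))  # fully outside -> None
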
+ box = nusc.get_box(ann_rec['token']) + + # Move them to the ego-pose frame. + box.translate(-np.array(pose_rec['translation'])) + box.rotate(Quaternion(pose_rec['rotation']).inverse) + + # Move them to the calibrated sensor frame. + box.translate(-np.array(cs_rec['translation'])) + box.rotate(Quaternion(cs_rec['rotation']).inverse) + + # Filter out the corners that are not in front of the calibrated + # sensor. + corners_3d = box.corners() + in_front = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_front] + + # Project 3d box to 2d. + corner_coords = view_points(corners_3d, camera_intrinsic, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. + final_coords = post_process_coords(corner_coords) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. + if final_coords is None: + continue + else: + min_x, min_y, max_x, max_y = final_coords + + # Generate dictionary record to be included in the .json file. + repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, + sample_data_token, sd_rec['filename']) + + # If mono3d=True, add 3D annotations in camera coordinates + if mono3d and (repro_rec is not None): + loc = box.center.tolist() + + dim = box.wlh + dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw + dim = dim.tolist() + + rot = box.orientation.yaw_pitch_roll[0] + rot = [-rot] # convert the rot to our cam coordinate + + global_velo2d = nusc.box_velocity(box.token)[:2] + global_velo3d = np.array([*global_velo2d, 0.0]) + e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix + c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix + cam_velo3d = global_velo3d @ np.linalg.inv( + e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T + velo = cam_velo3d[0::2].tolist() + + repro_rec['bbox_cam3d'] = loc + dim + rot + repro_rec['velo_cam3d'] = velo + + center3d = np.array(loc).reshape([1, 3]) + center2d = points_cam2img( + center3d, camera_intrinsic, with_depth=True) + repro_rec['center2d'] = center2d.squeeze().tolist() + # normalized center2D + depth + # if samples with depth < 0 will be removed + if repro_rec['center2d'][2] <= 0: + continue + + ann_token = nusc.get('sample_annotation', + box.token)['attribute_tokens'] + if len(ann_token) == 0: + attr_name = 'None' + else: + attr_name = nusc.get('attribute', ann_token[0])['name'] + attr_id = nus_attributes.index(attr_name) + repro_rec['attribute_name'] = attr_name + repro_rec['attribute_id'] = attr_id + + repro_recs.append(repro_rec) + + return repro_recs + + +def post_process_coords( + corner_coords: List, imsize: Tuple[int, int] = (1600, 900) +) -> Union[Tuple[float, float, float, float], None]: + """Get the intersection of the convex hull of the reprojected bbox corners + and the image canvas, return None if no intersection. + + Args: + corner_coords (list[int]): Corner coordinates of reprojected + bounding box. + imsize (tuple[int]): Size of the image canvas. + + Return: + tuple [float]: Intersection of the convex hull of the 2D box + corners and the image canvas. 
+ """ + polygon_from_2d_box = MultiPoint(corner_coords).convex_hull + img_canvas = box(0, 0, imsize[0], imsize[1]) + + if polygon_from_2d_box.intersects(img_canvas): + img_intersection = polygon_from_2d_box.intersection(img_canvas) + intersection_coords = np.array( + [coord for coord in img_intersection.exterior.coords]) + + min_x = min(intersection_coords[:, 0]) + min_y = min(intersection_coords[:, 1]) + max_x = max(intersection_coords[:, 0]) + max_y = max(intersection_coords[:, 1]) + + return min_x, min_y, max_x, max_y + else: + return None + + +def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, + sample_data_token: str, filename: str) -> OrderedDict: + """Generate one 2D annotation record given various informations on top of + the 2D bounding box coordinates. + + Args: + ann_rec (dict): Original 3d annotation record. + x1 (float): Minimum value of the x coordinate. + y1 (float): Minimum value of the y coordinate. + x2 (float): Maximum value of the x coordinate. + y2 (float): Maximum value of the y coordinate. + sample_data_token (str): Sample data token. + filename (str):The corresponding image file where the annotation + is present. + + Returns: + dict: A sample 2D annotation record. + - file_name (str): flie name + - image_id (str): sample data token + - area (float): 2d box area + - category_name (str): category name + - category_id (int): category id + - bbox (list[float]): left x, top y, dx, dy of 2d box + - iscrowd (int): whether the area is crowd + """ + repro_rec = OrderedDict() + repro_rec['sample_data_token'] = sample_data_token + coco_rec = dict() + + relevant_keys = [ + 'attribute_tokens', + 'category_name', + 'instance_token', + 'next', + 'num_lidar_pts', + 'num_radar_pts', + 'prev', + 'sample_annotation_token', + 'sample_data_token', + 'visibility_token', + ] + + for key, value in ann_rec.items(): + if key in relevant_keys: + repro_rec[key] = value + + repro_rec['bbox_corners'] = [x1, y1, x2, y2] + repro_rec['filename'] = filename + + coco_rec['file_name'] = filename + coco_rec['image_id'] = sample_data_token + coco_rec['area'] = (y2 - y1) * (x2 - x1) + + if repro_rec['category_name'] not in NuScenesDataset.NameMapping: + return None + cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']] + coco_rec['category_name'] = cat_name + coco_rec['category_id'] = nus_categories.index(cat_name) + coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] + coco_rec['iscrowd'] = 0 + + return coco_rec + + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. 
Default: 10
+    """
+    create_nuscenes_infos(
+        root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+    '--root-path',
+    type=str,
+    default='./data/kitti',
+    help='specify the root path of dataset')
+parser.add_argument(
+    '--canbus',
+    type=str,
+    default='./data',
+    help='specify the root path of nuScenes canbus')
+parser.add_argument(
+    '--version',
+    type=str,
+    default='v1.0',
+    required=False,
+    help='specify the dataset version, no need for kitti')
+parser.add_argument(
+    '--max-sweeps',
+    type=int,
+    default=10,
+    required=False,
+    help='specify sweeps of lidar per example')
+parser.add_argument(
+    '--out-dir',
+    type=str,
+    default='./data/kitti',
+    required=False,
+    help='output directory of the generated info files')
+parser.add_argument('--extra-tag', type=str, default='kitti')
+parser.add_argument(
+    '--workers', type=int, default=4, help='number of threads to be used')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    if args.dataset == 'nuscenes' and args.version != 'v1.0-mini':
+        train_version = f'{args.version}-trainval'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=train_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
+        test_version = f'{args.version}-test'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=test_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
+    elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini':
+        train_version = f'{args.version}'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=train_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
diff --git a/GenAD-main/tools/dist_test.sh b/GenAD-main/tools/dist_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3e2ec3007b1d5927a5bc5a63140ee7e11f500142
--- /dev/null
+++ b/GenAD-main/tools/dist_test.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+PORT=${PORT:-29503}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval bbox
diff --git a/GenAD-main/tools/dist_train.sh b/GenAD-main/tools/dist_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..141b284d5e80e42dc66424c4c4900394413bc7fb
--- /dev/null
+++ b/GenAD-main/tools/dist_train.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-28509}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
diff --git a/GenAD-main/tools/dist_train_multi_nodes.sh b/GenAD-main/tools/dist_train_multi_nodes.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c3bcc56f6a1dc5994311a6959ceb24c97094028d
--- /dev/null
+++ b/GenAD-main/tools/dist_train_multi_nodes.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -x
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-28509}
+
+if ! command -v nslookup &> /dev/null; then
+    apt update
+    apt install dnsutils -y
+fi
+
+output=$(nslookup $MY_APP_NAME)
+addresses=$(echo "$output" | awk '/^Name:/ { name=$2; next } name && /^Address:/ { print $2 }')
+sorted_address_list=($(printf '%s\n' "${addresses[@]}" | sort))
+
+i=0
+IFS=' ' read -ra addresses <<< "${sorted_address_list[@]}"
+
+for address in "${addresses[@]}"; do
+    POD_IPs[$i]=$address
+    i=$((i+1))
+done
+
+length=${#POD_IPs[@]}
+
+local_ip=$(hostname -I | grep -oP '\d+\.\d+\.\d+\.\d+')
+echo "local ip is $local_ip"
+echo "master ip is ${POD_IPs[0]}"
+
+if [ "$local_ip" == ${POD_IPs[0]} ]; then
+    #python -m torch.distributed.run --nproc_per_node=8 --master_port=2333 tools/train.py projects/configs/VAD/VAD_tiny_e2e.py --launcher pytorch --deterministic --work-dir ./outputs/VAD_tiny_e2e_v1_ar_test
+    source /remote-home/share/miniconda3/bin/activate && conda activate vad && python -m torch.distributed.launch --nproc_per_node=$GPUS --nnodes=2 --node_rank=0 --master_addr=${POD_IPs[0]} --master_port=$PORT $(dirname "$0")/train.py projects/configs/VAD/VAD_tiny_e2e.py --launcher pytorch ${@:3} --deterministic --work-dir ./outputs/VAD_tiny_e2e_v1_ar_test
+
+fi
+
+if [ "$local_ip" == ${POD_IPs[1]} ]; then
+    source /remote-home/share/miniconda3/bin/activate && conda activate vad && python -m torch.distributed.launch --nproc_per_node=$GPUS --nnodes=2 --node_rank=1 --master_addr=${POD_IPs[0]} --master_port=$PORT $(dirname "$0")/train.py projects/configs/VAD/VAD_tiny_e2e.py --launcher pytorch ${@:3} --deterministic --work-dir ./outputs/VAD_tiny_e2e_v1_ar_test  # second node must use node_rank=1 (was 0, which collides with the master node)
+# command="python -m torch.distributed.launch --nproc_per_node=$GPUS --nnodes 2 --node_rank 1 --master_addr=${POD_IPs[0]} --master_port=$PORT $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic"
+fi
diff --git a/GenAD-main/tools/exp/data_analysis.py b/GenAD-main/tools/exp/data_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..8480cff0dd17fe3cbf1430b0a4f045c284e5e999
--- /dev/null
+++ b/GenAD-main/tools/exp/data_analysis.py
@@ -0,0 +1,22 @@
+import pickle
+
+train = open(r'/home/ubuntu/data/nuscenes/vad_nuscenes_infos_temporal_train.pkl', 'rb')
+val = open(r'/home/ubuntu/data/nuscenes/vad_nuscenes_infos_temporal_val.pkl', 'rb')
+
+content_train = pickle.load(train)
+content_val = pickle.load(val)
+
+train_len = len(content_train['infos'])
+val_len = len(content_val['infos'])
+
+# brute-force check that no validation sample (keyed by lidar_path) leaks into the training split
+for i in range(val_len):
+    val_id = content_val['infos'][i]['lidar_path']
+    for j in range(train_len):
+        train_id = content_train['infos'][j]['lidar_path']
+
+        if val_id == train_id:
+            print("*************** there is val sample in training set ****************: ", j)
+
+
+print('train/val overlap check finished')
\ No newline at end of file
diff --git a/GenAD-main/tools/misc/browse_dataset.py b/GenAD-main/tools/misc/browse_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3419f66df56679088469a842cd62e31906df8a1
--- /dev/null
+++ b/GenAD-main/tools/misc/browse_dataset.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
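+#
+# [Editor's note] Added usage sketch, not part of the original tool. The config
+# path below is illustrative (it is the one used elsewhere in this repo) and the
+# output directory is a placeholder:
+#
+#     python tools/misc/browse_dataset.py projects/configs/VAD/VAD_tiny_e2e.py \
+#         --task multi_modality-det --output-dir ./vis_dataset --online
+#
+# `--task` must be one of {det, seg, multi_modality-det, mono-det}; `--online`
+# additionally displays the visualizations and therefore needs a monitor.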
+import argparse +import numpy as np +import warnings +from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress +from os import path as osp + +from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, + DepthInstance3DBoxes, LiDARInstance3DBoxes) +from mmdet3d.core.visualizer import (show_multi_modality_result, show_result, + show_seg_result) +from mmdet3d.datasets import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--skip-type', + type=str, + nargs='+', + default=['Normalize'], + help='skip some useless pipeline') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument( + '--task', + type=str, + choices=['det', 'seg', 'multi_modality-det', 'mono-det'], + help='Determine the visualization method depending on the task.') + parser.add_argument( + '--online', + action='store_true', + help='Whether to perform online visualization. Note that you often ' + 'need a monitor to do so.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def build_data_cfg(config_path, skip_type, cfg_options): + """Build data config for loading visualization data.""" + cfg = Config.fromfile(config_path) + if cfg_options is not None: + cfg.merge_from_dict(cfg_options) + # import modules from string list. 
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + # extract inner dataset of `RepeatDataset` as `cfg.data.train` + # so we don't need to worry about it later + if cfg.data.train['type'] == 'RepeatDataset': + cfg.data.train = cfg.data.train.dataset + # use only first dataset for `ConcatDataset` + if cfg.data.train['type'] == 'ConcatDataset': + cfg.data.train = cfg.data.train.datasets[0] + train_data_cfg = cfg.data.train + # eval_pipeline purely consists of loading functions + # use eval_pipeline for data loading + train_data_cfg['pipeline'] = [ + x for x in cfg.eval_pipeline if x['type'] not in skip_type + ] + + return cfg + + +def to_depth_mode(points, bboxes): + """Convert points and bboxes to Depth Coord and Depth Box mode.""" + if points is not None: + points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + if bboxes is not None: + bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR, + Box3DMode.DEPTH) + return points, bboxes + + +def show_det_data(idx, dataset, out_dir, filename, show=False): + """Visualize 3D point cloud and 3D bboxes.""" + example = dataset.prepare_train_data(idx) + points = example['points']._data.numpy() + gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor + if dataset.box_mode_3d != Box3DMode.DEPTH: + points, gt_bboxes = to_depth_mode(points, gt_bboxes) + show_result( + points, + gt_bboxes.clone(), + None, + out_dir, + filename, + show=show, + snapshot=True) + + +def show_seg_data(idx, dataset, out_dir, filename, show=False): + """Visualize 3D point cloud and segmentation mask.""" + example = dataset.prepare_train_data(idx) + points = example['points']._data.numpy() + gt_seg = example['pts_semantic_mask']._data.numpy() + show_seg_result( + points, + gt_seg.copy(), + None, + out_dir, + filename, + np.array(dataset.PALETTE), + dataset.ignore_index, + show=show, + snapshot=True) + + +def show_proj_bbox_img(idx, + dataset, + out_dir, + filename, + show=False, + is_nus_mono=False): + """Visualize 3D bboxes on 2D image by projection.""" + try: + example = dataset.prepare_train_data(idx) + except AttributeError: # for Mono-3D datasets + example = dataset.prepare_train_img(idx) + gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'] + img_metas = example['img_metas']._data + img = example['img']._data.numpy() + # need to transpose channel to first dim + img = img.transpose(1, 2, 0) + # no 3D gt bboxes, just show img + if gt_bboxes.tensor.shape[0] == 0: + gt_bboxes = None + if isinstance(gt_bboxes, DepthInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + None, + out_dir, + filename, + box_mode='depth', + img_metas=img_metas, + show=show) + elif isinstance(gt_bboxes, LiDARInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + img_metas['lidar2img'], + out_dir, + filename, + box_mode='lidar', + img_metas=img_metas, + show=show) + elif isinstance(gt_bboxes, CameraInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + img_metas['cam2img'], + out_dir, + filename, + box_mode='camera', + img_metas=img_metas, + show=show) + else: + # can't project, just show img + warnings.warn( + f'unrecognized gt box type {type(gt_bboxes)}, only show image') + show_multi_modality_result( + img, None, None, None, out_dir, filename, show=show) + + +def main(): + args = parse_args() + + if args.output_dir is not None: + mkdir_or_exist(args.output_dir) + + cfg = 
build_data_cfg(args.config, args.skip_type, args.cfg_options) + try: + dataset = build_dataset( + cfg.data.train, default_args=dict(filter_empty_gt=False)) + except TypeError: # seg dataset doesn't have `filter_empty_gt` key + dataset = build_dataset(cfg.data.train) + data_infos = dataset.data_infos + dataset_type = cfg.dataset_type + + # configure visualization mode + vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' + + for idx, data_info in enumerate(track_iter_progress(data_infos)): + if dataset_type in ['KittiDataset', 'WaymoDataset']: + data_path = data_info['point_cloud']['velodyne_path'] + elif dataset_type in [ + 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset', + 'S3DISSegDataset', 'S3DISDataset' + ]: + data_path = data_info['pts_path'] + elif dataset_type in ['NuScenesDataset', 'LyftDataset']: + data_path = data_info['lidar_path'] + elif dataset_type in ['NuScenesMonoDataset']: + data_path = data_info['file_name'] + else: + raise NotImplementedError( + f'unsupported dataset type {dataset_type}') + + file_name = osp.splitext(osp.basename(data_path))[0] + + if vis_task in ['det', 'multi_modality-det']: + # show 3D bboxes on 3D point clouds + show_det_data( + idx, dataset, args.output_dir, file_name, show=args.online) + if vis_task in ['multi_modality-det', 'mono-det']: + # project 3D bboxes to 2D image + show_proj_bbox_img( + idx, + dataset, + args.output_dir, + file_name, + show=args.online, + is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) + elif vis_task in ['seg']: + # show 3D segmentation mask on 3D point clouds + show_seg_data( + idx, dataset, args.output_dir, file_name, show=args.online) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/misc/fuse_conv_bn.py b/GenAD-main/tools/misc/fuse_conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..d4e22018d66d3bd47119522e9da2ea6676ba5760 --- /dev/null +++ b/GenAD-main/tools/misc/fuse_conv_bn.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import torch +from mmcv.runner import save_checkpoint +from torch import nn as nn + +from mmdet.apis import init_model + + +def fuse_conv_bn(conv, bn): + """During inference, the functionary of batch norm layers is turned off but + only the mean and var alone channels are used, which exposes the chance to + fuse it with the preceding conv layers to save computations and simplify + network structures.""" + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_module(m): + last_conv = None + last_conv_name = None + + for name, child in m.named_children(): + if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = fuse_conv_bn(last_conv, child) + m._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. 
+ m._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_module(child) + return m + + +def parse_args(): + parser = argparse.ArgumentParser( + description='fuse Conv and BN layers in a model') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file path') + parser.add_argument('out', help='output path of the converted model') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + # build the model from a config file and a checkpoint file + model = init_model(args.config, args.checkpoint) + # fuse conv and bn layers of the model + fused_model = fuse_module(model) + save_checkpoint(fused_model, args.out) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/misc/print_config.py b/GenAD-main/tools/misc/print_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3100fc324b375330ba10316d71405c535d91fb7b --- /dev/null +++ b/GenAD-main/tools/misc/print_config.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from mmcv import Config, DictAction + + +def parse_args(): + parser = argparse.ArgumentParser(description='Print the whole config') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--options', nargs='+', action=DictAction, help='arguments in dict') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.options is not None: + cfg.merge_from_dict(args.options) + print(f'Config:\n{cfg.pretty_text}') + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/misc/visualize_results.py b/GenAD-main/tools/misc/visualize_results.py new file mode 100644 index 0000000000000000000000000000000000000000..302adc50eca960a6660104b33521d438cf54faa0 --- /dev/null +++ b/GenAD-main/tools/misc/visualize_results.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
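+#
+# [Editor's note] Added usage sketch, not part of the original tool. The results
+# file is the pickle produced by `tools/test.py ... --out results.pkl`; the
+# config and paths below are placeholders:
+#
+#     python tools/misc/visualize_results.py path/to/config.py \
+#         --result results.pkl --show-dir ./vis_results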
+import argparse +import mmcv +from mmcv import Config + +from mmdet3d.datasets import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet3D visualize the results') + parser.add_argument('config', help='test config file path') + parser.add_argument('--result', help='results file in pickle format') + parser.add_argument( + '--show-dir', help='directory where visualize results will be saved') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + if args.result is not None and \ + not args.result.endswith(('.pkl', '.pickle')): + raise ValueError('The results file must be a pkl file.') + + cfg = Config.fromfile(args.config) + cfg.data.test.test_mode = True + + # build the dataset + dataset = build_dataset(cfg.data.test) + results = mmcv.load(args.result) + + if getattr(dataset, 'show', None) is not None: + # data loading pipeline for showing + eval_pipeline = cfg.get('eval_pipeline', {}) + if eval_pipeline: + dataset.show(results, args.show_dir, pipeline=eval_pipeline) + else: + dataset.show(results, args.show_dir) # use default pipeline + else: + raise NotImplementedError( + 'Show is not implemented for dataset {}!'.format( + type(dataset).__name__)) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/model_converters/convert_votenet_checkpoints.py b/GenAD-main/tools/model_converters/convert_votenet_checkpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..33792b00ddd96790acdcdf6ba9d8caf9da39b637 --- /dev/null +++ b/GenAD-main/tools/model_converters/convert_votenet_checkpoints.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import tempfile +import torch +from mmcv import Config +from mmcv.runner import load_state_dict + +from mmdet3d.models import build_detector + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet3D upgrade model version(before v0.6.0) of VoteNet') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--out', help='path of the output checkpoint file') + args = parser.parse_args() + return args + + +def parse_config(config_strings): + """Parse config from strings. + + Args: + config_strings (string): strings of model config. 
+ + Returns: + Config: model config + """ + temp_file = tempfile.NamedTemporaryFile() + config_path = f'{temp_file.name}.py' + with open(config_path, 'w') as f: + f.write(config_strings) + + config = Config.fromfile(config_path) + + # Update backbone config + if 'pool_mod' in config.model.backbone: + config.model.backbone.pop('pool_mod') + + if 'sa_cfg' not in config.model.backbone: + config.model.backbone['sa_cfg'] = dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True) + + if 'type' not in config.model.bbox_head.vote_aggregation_cfg: + config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule' + + # Update bbox_head config + if 'pred_layer_cfg' not in config.model.bbox_head: + config.model.bbox_head['pred_layer_cfg'] = dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True) + + if 'feat_channels' in config.model.bbox_head: + config.model.bbox_head.pop('feat_channels') + + if 'vote_moudule_cfg' in config.model.bbox_head: + config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop( + 'vote_moudule_cfg') + + if config.model.bbox_head.vote_aggregation_cfg.use_xyz: + config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3 + + temp_file.close() + + return config + + +def main(): + """Convert keys in checkpoints for VoteNet. + + There can be some breaking changes during the development of mmdetection3d, + and this tool is used for upgrading checkpoints trained with old versions + (before v0.6.0) to the latest one. + """ + args = parse_args() + checkpoint = torch.load(args.checkpoint) + cfg = parse_config(checkpoint['meta']['config']) + # Build the model and load checkpoint + model = build_detector( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + orig_ckpt = checkpoint['state_dict'] + converted_ckpt = orig_ckpt.copy() + + if cfg['dataset_type'] == 'ScanNetDataset': + NUM_CLASSES = 18 + elif cfg['dataset_type'] == 'SUNRGBDDataset': + NUM_CLASSES = 10 + else: + raise NotImplementedError + + RENAME_PREFIX = { + 'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0', + 'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1' + } + + DEL_KEYS = [ + 'bbox_head.conv_pred.0.bn.num_batches_tracked', + 'bbox_head.conv_pred.1.bn.num_batches_tracked' + ] + + EXTRACT_KEYS = { + 'bbox_head.conv_pred.conv_cls.weight': + ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_cls.bias': + ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_reg.weight': + ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]), + 'bbox_head.conv_pred.conv_reg.bias': + ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)]) + } + + # Delete some useless keys + for key in DEL_KEYS: + converted_ckpt.pop(key) + + # Rename keys with specific prefix + RENAME_KEYS = dict() + for old_key in converted_ckpt.keys(): + for rename_prefix in RENAME_PREFIX.keys(): + if rename_prefix in old_key: + new_key = old_key.replace(rename_prefix, + RENAME_PREFIX[rename_prefix]) + RENAME_KEYS[new_key] = old_key + for new_key, old_key in RENAME_KEYS.items(): + converted_ckpt[new_key] = converted_ckpt.pop(old_key) + + # Extract weights and rename the keys + for new_key, (old_key, indices) in EXTRACT_KEYS.items(): + cur_layers = orig_ckpt[old_key] + converted_layers = [] + for (start, end) in indices: + if end != -1: + converted_layers.append(cur_layers[start:end]) + else: + converted_layers.append(cur_layers[start:]) + 
converted_layers = torch.cat(converted_layers, 0)
+        converted_ckpt[new_key] = converted_layers
+        if old_key in converted_ckpt.keys():
+            converted_ckpt.pop(old_key)
+
+    # Check the converted checkpoint by loading to the model
+    load_state_dict(model, converted_ckpt, strict=True)
+    checkpoint['state_dict'] = converted_ckpt
+    torch.save(checkpoint, args.out)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/GenAD-main/tools/model_converters/publish_model.py b/GenAD-main/tools/model_converters/publish_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..318fd46a65894575f5f3e915672b18d24ba133d8
--- /dev/null
+++ b/GenAD-main/tools/model_converters/publish_model.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+import torch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Process a checkpoint to be published')
+    parser.add_argument('in_file', help='input checkpoint filename')
+    parser.add_argument('out_file', help='output checkpoint filename')
+    args = parser.parse_args()
+    return args
+
+
+def process_checkpoint(in_file, out_file):
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # remove optimizer for smaller file size
+    if 'optimizer' in checkpoint:
+        del checkpoint['optimizer']
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    torch.save(checkpoint, out_file)
+    sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    # str.rstrip('.pth') strips characters, not the suffix, so slice it off instead
+    assert out_file.endswith('.pth'), 'out_file must end with .pth'
+    final_file = out_file[:-4] + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', out_file, final_file])
+
+
+def main():
+    args = parse_args()
+    process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/GenAD-main/tools/model_converters/regnet2mmdet.py b/GenAD-main/tools/model_converters/regnet2mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dee3c878abc94c1298dcea6856e432a77339665
--- /dev/null
+++ b/GenAD-main/tools/model_converters/regnet2mmdet.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
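+#
+# [Editor's note] Added usage sketch, not part of the original script. Both
+# checkpoint paths are placeholders:
+#
+#     python tools/model_converters/regnet2mmdet.py regnet_pycls.pyth regnet_mmdet.pth
+#
+# The conversion only renames keys (stem -> conv1/bn1, head.fc -> fc, and the
+# s<N> stage blocks -> layer<N> modules); tensor values are copied unchanged.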
+import argparse +import torch +from collections import OrderedDict + + +def convert_stem(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('stem.conv', 'conv1') + new_key = new_key.replace('stem.bn', 'bn1') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_head(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('head.fc', 'fc') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_reslayer(model_key, model_weight, state_dict, converted_names): + split_keys = model_key.split('.') + layer, block, module = split_keys[:3] + block_id = int(block[1:]) + layer_name = f'layer{int(layer[1:])}' + block_name = f'{block_id - 1}' + + if block_id == 1 and module == 'bn': + new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}' + elif block_id == 1 and module == 'proj': + new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}' + elif module == 'f': + if split_keys[3] == 'a_bn': + module_name = 'bn1' + elif split_keys[3] == 'b_bn': + module_name = 'bn2' + elif split_keys[3] == 'c_bn': + module_name = 'bn3' + elif split_keys[3] == 'a': + module_name = 'conv1' + elif split_keys[3] == 'b': + module_name = 'conv2' + elif split_keys[3] == 'c': + module_name = 'conv3' + new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}' + else: + raise ValueError(f'Unsupported conversion of key {model_key}') + print(f'Convert {model_key} to {new_key}') + state_dict[new_key] = model_weight + converted_names.add(model_key) + + +def convert(src, dst): + """Convert keys in pycls pretrained RegNet models to mmdet style.""" + # load caffe model + regnet_model = torch.load(src) + blobs = regnet_model['model_state'] + # convert to pytorch style + state_dict = OrderedDict() + converted_names = set() + for key, weight in blobs.items(): + if 'stem' in key: + convert_stem(key, weight, state_dict, converted_names) + elif 'head' in key: + convert_head(key, weight, state_dict, converted_names) + elif key.startswith('s'): + convert_reslayer(key, weight, state_dict, converted_names) + + # check if all layers are converted + for key in blobs: + if key not in converted_names: + print(f'not converted: {key}') + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument('src', help='src detectron model path') + parser.add_argument('dst', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/test.py b/GenAD-main/tools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..e7a92af0068459608b66e3d6e39bde6d5c9caf15 --- /dev/null +++ b/GenAD-main/tools/test.py @@ -0,0 +1,298 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import sys +sys.path.append('') +import numpy as np +import argparse +import mmcv +import os +import copy +import torch +torch.multiprocessing.set_sharing_strategy('file_system') +import warnings +from mmcv import Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, + wrap_fp16_model) + +from mmdet3d.apis import single_gpu_test +from mmdet3d.datasets import build_dataset +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from mmdet3d.models import build_model +from mmdet.apis import set_random_seed +# from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test +from projects.mmdet3d_plugin.VAD.apis.test import custom_multi_gpu_test +from mmdet.datasets import replace_ImageToTensor +import time +import os.path as osp +import json + +import warnings +warnings.filterwarnings("ignore") + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--json_dir', help='json parent dir name file') # NOTE: json file parent folder name + parser.add_argument('--out', help='output result file in pickle format') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--format-only', + action='store_true', + help='Format the output results without perform evaluation. It is' + 'useful when you want to format the result to a specific format and ' + 'submit it to the test server') + parser.add_argument( + '--eval', + type=str, + nargs='+', + help='evaluation metrics, which depends on the dataset, e.g., "bbox",' + ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument( + '--show-dir', help='directory where results will be saved') + parser.add_argument( + '--gpu-collect', + action='store_true', + help='whether to use gpu to collect results.') + parser.add_argument( + '--tmpdir', + help='tmp directory used for collecting results from multiple ' + 'workers, available when gpu-collect is not specified') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function (deprecate), ' + 'change to --eval-options instead.') + parser.add_argument( + '--eval-options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.eval_options: + raise ValueError( + '--options and --eval-options cannot be both specified, ' + '--options is deprecated in favor of --eval-options') + if args.options: + warnings.warn('--options is deprecated in favor of --eval-options') + args.eval_options = args.options + return args + + +def main(): + args = parse_args() + + assert args.out or args.eval or args.format_only or args.show \ + or args.show_dir, \ + ('Please specify at least one operation (save/eval/format/show the ' + 'results / save the results) with the argument "--out", "--eval"' + ', "--format-only", "--show" or "--show-dir"') + + if args.eval and args.format_only: + raise ValueError('--eval and --format_only cannot be both specified') + + if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): + raise ValueError('The output file must be a pkl file.') + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # import modules from plguin/xx, registry will be updated + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' 
+ m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + cfg.model.pretrained = None + # in case the test dataset is concatenated + samples_per_gpu = 1 + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor( + cfg.data.test.pipeline) + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + if samples_per_gpu > 1: + for ds_cfg in cfg.data.test: + ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # set random seeds + # args.seed = None + if args.seed is not None: + set_random_seed(args.seed, deterministic=args.deterministic) + + # build the dataloader + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + + + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + # old versions did not save class info in checkpoints, this walkaround is + # for backward compatibility + if 'CLASSES' in checkpoint.get('meta', {}): + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + # palette for visualization in segmentation tasks + if 'PALETTE' in checkpoint.get('meta', {}): + model.PALETTE = checkpoint['meta']['PALETTE'] + elif hasattr(dataset, 'PALETTE'): + # segmentation dataset has `PALETTE` attribute + model.PALETTE = dataset.PALETTE + + if not distributed: + # assert False + model = MMDataParallel(model, device_ids=[0]) + outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect) + + tmp = {} + tmp['bbox_results'] = outputs + outputs = tmp + rank, _ = get_dist_info() + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + # assert False + if isinstance(outputs, list): + mmcv.dump(outputs, args.out) + else: + mmcv.dump(outputs['bbox_results'], args.out) + kwargs = {} if args.eval_options is None else args.eval_options + kwargs['jsonfile_prefix'] = osp.join('test', args.config.split( + '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_')) + if args.format_only: + dataset.format_results(outputs['bbox_results'], **kwargs) + + if args.eval: + eval_kwargs = cfg.get('evaluation', {}).copy() + # hard-code way to remove EvalHook args + for key in [ + 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', + 'rule' + ]: + eval_kwargs.pop(key, 
None) + eval_kwargs.update(dict(metric=args.eval, **kwargs)) + + print(dataset.evaluate(outputs['bbox_results'], **eval_kwargs)) + + # # # NOTE: record to json + # json_path = args.json_dir + # if not os.path.exists(json_path): + # os.makedirs(json_path) + + # metric_all = [] + # for res in outputs['bbox_results']: + # for k in res['metric_results'].keys(): + # if type(res['metric_results'][k]) is np.ndarray: + # res['metric_results'][k] = res['metric_results'][k].tolist() + # metric_all.append(res['metric_results']) + + # print('start saving to json done') + # with open(json_path+'/metric_record.json', "w", encoding="utf-8") as f2: + # json.dump(metric_all, f2, indent=4) + # print('save to json done') + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/GenAD-main/tools/train.py b/GenAD-main/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..cc4568bcd8559df6653c33e2954875a759bdd629 --- /dev/null +++ b/GenAD-main/tools/train.py @@ -0,0 +1,328 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from __future__ import division + +import argparse +import copy +import mmcv +import os +import time +import torch +import warnings +from mmcv import Config, DictAction +from mmcv.runner import get_dist_info, init_dist +from os import path as osp + +from mmdet import __version__ as mmdet_version +from mmdet3d import __version__ as mmdet3d_version +#from mmdet3d.apis import train_model + +from mmdet3d.datasets import build_dataset +from mmdet3d.models import build_model +from mmdet3d.utils import collect_env, get_root_logger +from mmdet.apis import set_random_seed +from mmseg import __version__ as mmseg_version + +from mmcv.utils import TORCH_VERSION, digit_version + +from torchstat import stat + +import cv2 +cv2.setNumThreads(1) + +import sys +sys.path.append('') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file (deprecate), ' + 'change to --cfg-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument( + '--autoscale-lr', + action='store_true', + help='automatically scale lr with the number of gpus') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.cfg_options: + raise ValueError( + '--options and --cfg-options cannot be both specified, ' + '--options is deprecated in favor of --cfg-options') + if args.options: + warnings.warn('--options is deprecated in favor of --cfg-options') + args.cfg_options = args.options + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # import modules from plguin/xx, registry will be updated + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + # from projects.mmdet3d_plugin.bevformer.apis import custom_train_model + from projects.mmdet3d_plugin.VAD.apis.train import custom_train_model + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + # if args.resume_from is not None: + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW': + cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. 
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+        # re-set gpu_ids with distributed training mode
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)
+
+    # create work_dir
+    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+    # dump config
+    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+    # init the logger before other steps
+    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+    # specify logger name, if we still use 'mmdet', the output info will be
+    # filtered and won't be saved in the log_file
+    # TODO: ugly workaround to judge whether we are training det or seg model
+    if cfg.model.type in ['EncoderDecoder3D']:
+        logger_name = 'mmseg'
+    else:
+        logger_name = 'mmdet'
+    logger = get_root_logger(
+        log_file=log_file, log_level=cfg.log_level, name=logger_name)
+
+    # init the meta dict to record some important information such as
+    # environment info and seed, which will be logged
+    meta = dict()
+    # log env info
+    env_info_dict = collect_env()
+    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+    dash_line = '-' * 60 + '\n'
+    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                dash_line)
+    meta['env_info'] = env_info
+    meta['config'] = cfg.pretty_text
+
+    # log some basic info
+    logger.info(f'Distributed training: {distributed}')
+    logger.info(f'Config:\n{cfg.pretty_text}')
+
+    # set random seeds
+    if args.seed is not None:
+        logger.info(f'Set random seed to {args.seed}, '
+                    f'deterministic: {args.deterministic}')
+        set_random_seed(args.seed, deterministic=args.deterministic)
+    cfg.seed = args.seed
+    meta['seed'] = args.seed
+    meta['exp_name'] = osp.basename(args.config)
+
+    model = build_model(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    model.init_weights()
+
+    # model.load_state_dict(torch.load('/home/ubuntu/phd/unity/vad/models/cvpr/vad_tiny_pl_vae_gru/epoch_60.pth'))
+
+    # count the total number of model parameters
+    total_params = sum(p.numel() for p in model.parameters())
+    print(f"Total number of parameters: {total_params}")
+
+    # stat(model,(3,384,640))
+    # print("params")
+
+
+
+    # saved_state_dict = torch.load('/home/ubuntu/phd/unity/vad/VAD/ckpts/epoch_9.pth')
+    #
+    # # training_layer = ['pts_bbox_head.predict_model',
+    # #                   'pts_bbox_head.future_prediction',
+    # #                   'pts_bbox_head.present_distribution',
+    # #                   'pts_bbox_head.future_distribution']
+    # training_layer = ['pts_bbox_head.ego_fut_decoder',
+    #                   'pts_bbox_head.traj_branches',
+    #                   'pts_bbox_head.predict_model',
+    #                   'pts_bbox_head.future_prediction',
+    #                   'pts_bbox_head.present_distribution',
+    #                   'pts_bbox_head.future_distribution']
+    # specific_layers = {k: v for k, v in saved_state_dict['state_dict'].items() if training_layer[0] in k or
+    #                    training_layer[1] in k or
+    #                    training_layer[2] in k or
+    #                    training_layer[3] in k or
+    #                    training_layer[4] in k or
+    #                    training_layer[5] in k}
+    #
+    # model.load_state_dict(specific_layers, strict=False)
+    #
+    # for name, param in model.named_parameters():
+    #     for i in range(len(training_layer)):
+    #         if training_layer[i] not in name:
+    #             param.requires_grad = True
+    #         else:
+    #             print(name)
+    #             param.requires_grad = False
+
+    # training_layer = ['pts_bbox_head.ego_fut_decoder',
+    #                   'pts_bbox_head.traj_branches',
+    #                   'pts_bbox_head.predict_model',
+    #                   'pts_bbox_head.future_prediction',
+    #                   'pts_bbox_head.present_distribution',
+    #                   'pts_bbox_head.state_gru',
+    #
'pts_bbox_head.future_distribution'] + # + # for name, param in model.named_parameters(): + # for i in range(len(training_layer)): + # if training_layer[i] not in name: + # param.requires_grad = False + # else: + # print(name) + # param.requires_grad = True + + + + + + logger.info(f'Model:\n{model}') + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + # in case we use a dataset wrapper + if 'dataset' in cfg.data.train: + val_dataset.pipeline = cfg.data.train.dataset.pipeline + else: + val_dataset.pipeline = cfg.data.train.pipeline + # set test_mode=False here in deep copied config + # which do not affect AP/AR calculation later + # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa + val_dataset.test_mode = False + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=mmdet_version, + mmseg_version=mmseg_version, + mmdet3d_version=mmdet3d_version, + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE # for segmentors + if hasattr(datasets[0], 'PALETTE') else None) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + custom_train_model( + model, + datasets, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main()
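+
+# [Editor's note] Added launch sketch, not part of the original script. The
+# config path, GPU count and work dir are illustrative; the flags match the
+# parser above and tools/dist_train.sh in this repo:
+#
+#     python tools/train.py projects/configs/VAD/VAD_tiny_e2e.py \
+#         --work-dir ./outputs/genad_example --deterministic
+#
+#     ./tools/dist_train.sh projects/configs/VAD/VAD_tiny_e2e.py 8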