diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..15375bc76cd26e5b75ac16d9664b74f9bb52491d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +GenAD-main/assets/demo.gif filter=lfs diff=lfs merge=lfs -text diff --git a/GenAD-main/LICENSE b/GenAD-main/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/GenAD-main/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/GenAD-main/README.md b/GenAD-main/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5debb2e25e51bb8d501f4b6830a0e7264dca5d5f
--- /dev/null
+++ b/GenAD-main/README.md
@@ -0,0 +1,127 @@
+# GenAD: Generative End-to-End Autonomous Driving
+
+### [Paper](https://arxiv.org/pdf/2402.11502)
+
+> GenAD: Generative End-to-End Autonomous Driving
+
+> [Wenzhao Zheng](https://wzzheng.net/)\*, Ruiqi Song\*, [Xianda Guo](https://scholar.google.com/citations?user=jPvOqgYAAAAJ)\* $\dagger$, Chenming Zhang, [Long Chen](https://scholar.google.com/citations?user=jzvXnkcAAAAJ)$\dagger$
+
+\* Equal contributions $\dagger$ Corresponding authors
+
+**GenAD casts autonomous driving as a generative modeling problem.**
+
+## News
+
+- **[2024/5/2]** Training and evaluation code release.
+- **[2024/2/18]** Paper released on [arXiv](https://arxiv.org/pdf/2402.11502).
+
+## Demo
+
+![demo](./assets/demo.gif)
+
+## Overview
+
+![comparison](./assets/comparison.png)
+
+**Comparisons of the proposed generative end-to-end autonomous driving framework with the conventional pipeline.** Most existing methods follow a serial design of perception, prediction, and planning. They usually ignore the high-level interactions between the ego car and other agents and the structural prior of realistic trajectories. We model autonomous driving as a future generation problem and conduct motion prediction and ego planning simultaneously in a structural latent trajectory space.
+
+## Results
+
+![results](./assets/results.png)
+
+## Code
+### Dataset
+
+Download the nuScenes V1.0 full dataset and the CAN bus expansion data [HERE](https://www.nuscenes.org/download), then prepare the nuScenes data as follows.
+
+**Download CAN bus expansion**
+
+```
+# download 'can_bus.zip'
+unzip can_bus.zip
+# move can_bus to data dir
+```
+
+**Prepare nuScenes data**
+
+*We generate custom annotation files, which are different from mmdet3d's.*
+
+Generate the train and val annotation files:
+
+```
+python tools/data_converter/genad_nuscenes_converter.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag genad_nuscenes --version v1.0 --canbus ./data
+```
+
+Running the above command generates `genad_nuscenes_infos_temporal_{train,val}.pkl`.
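+
+As a quick sanity check after conversion, you can inspect the generated info files. The snippet below is a minimal sketch; it assumes the default `./data/nuscenes` output directory and an mmdet3d-style pickle layout (a dict with `infos` and `metadata` keys), which may differ for your version of the converter.
+
+```python
+# Minimal sanity check for the generated annotation files (assumed layout).
+import pickle
+
+for split in ("train", "val"):
+    path = f"./data/nuscenes/genad_nuscenes_infos_temporal_{split}.pkl"
+    with open(path, "rb") as f:
+        data = pickle.load(f)
+    # mmdet3d-style converters usually store a dict with 'infos' and 'metadata';
+    # fall back to treating the loaded object itself as the sample list otherwise.
+    infos = data.get("infos", data) if isinstance(data, dict) else data
+    print(f"{split}: {len(infos)} samples")
+```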
+
+**Folder structure**
+
+```
+GenAD
+├── projects/
+├── tools/
+├── configs/
+├── ckpts/
+│   ├── resnet50-19c8e357.pth
+├── data/
+│   ├── can_bus/
+│   ├── nuscenes/
+│   │   ├── maps/
+│   │   ├── samples/
+│   │   ├── sweeps/
+│   │   ├── v1.0-test/
+│   │   ├── v1.0-trainval/
+│   │   ├── genad_nuscenes_infos_train.pkl
+│   │   ├── genad_nuscenes_infos_val.pkl
+```
+
+### Installation
+
+Detailed package versions can be found in [requirements.txt](requirements.txt).
+
+- [Installation](docs/install.md)
+
+### Getting Started
+
+**Datasets**
+
+https://drive.google.com/drive/folders/1gy7Ux-bk0sge77CsGgeEzPF9ImVn-WgJ?usp=drive_link
+
+**Checkpoints**
+
+https://drive.google.com/drive/folders/1nlAWJlvSHwqnTjEwlfiE99YJVRFKmqF9?usp=drive_link
+
+Train GenAD with 8 GPUs:
+
+```shell
+cd /path/to/GenAD
+conda activate genad
+python -m torch.distributed.run --nproc_per_node=8 --master_port=2333 tools/train.py projects/configs/GenAD/GenAD_config.py --launcher pytorch --deterministic --work-dir /path/to/save/outputs
+```
+
+Evaluate GenAD with 1 GPU:
+
+```shell
+cd /path/to/GenAD
+conda activate genad
+CUDA_VISIBLE_DEVICES=0 python tools/test.py projects/configs/VAD/GenAD_config.py /path/to/ckpt.pth --launcher none --eval bbox --tmpdir outputs
+```
+
+## Related Projects
+
+Our code is based on [VAD](https://github.com/hustvl/VAD) and [UniAD](https://github.com/OpenDriveLab/UniAD).
+
+## Citation
+
+If you find this project helpful, please consider citing the following paper:
+```
+@article{zheng2024genad,
+  title={GenAD: Generative End-to-End Autonomous Driving},
+  author={Zheng, Wenzhao and Song, Ruiqi and Guo, Xianda and Zhang, Chenming and Chen, Long},
+  journal={arXiv preprint arXiv:2402.11502},
+  year={2024}
+}
+```
diff --git a/GenAD-main/assets/comparison.png b/GenAD-main/assets/comparison.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ca4e903f20f851936a60f2f6698c917707ef08f
Binary files /dev/null and b/GenAD-main/assets/comparison.png differ
diff --git a/GenAD-main/assets/demo.gif b/GenAD-main/assets/demo.gif
new file mode 100644
index 0000000000000000000000000000000000000000..77b3b69525fb24bdd4bfe9709cef6029bec257e1
--- /dev/null
+++ b/GenAD-main/assets/demo.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bac1568c537632d144945191bb77987e00fd822faf2348c415e732b13a041ce
+size 67527613
diff --git a/GenAD-main/assets/framework.png b/GenAD-main/assets/framework.png
new file mode 100644
index 0000000000000000000000000000000000000000..cb14aca6353ba7051ef36d4fb2e28611b695d338
Binary files /dev/null and b/GenAD-main/assets/framework.png differ
diff --git a/GenAD-main/assets/results.png b/GenAD-main/assets/results.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9a94fb50a6c4d52d778c5401de041e7e59abd6e
Binary files /dev/null and b/GenAD-main/assets/results.png differ
diff --git a/GenAD-main/docs/install.md b/GenAD-main/docs/install.md
new file mode 100644
index 0000000000000000000000000000000000000000..b8270b5ec261c8c9cda9dab178d5dd4a2a35ab93
--- /dev/null
+++ b/GenAD-main/docs/install.md
@@ -0,0 +1,66 @@
+# Installation
+
+Detailed package versions can be found in [requirements.txt](../requirements.txt).
+
+**a. Create a conda virtual environment and activate it.**
+```shell
+conda create -n genad python=3.8 -y
+conda activate genad
+```
+
+**b. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/).**
+```shell
+pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
+# Recommended torch>=1.9
+```
+
+**c. Install gcc>=5 in the conda env (optional).**
+```shell
+conda install -c omgarcia gcc-5  # gcc-6.2
+```
+
+**d. Install mmcv-full.**
+```shell
+pip install mmcv-full==1.4.0
+# pip install mmcv-full==1.4.0 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
+```
+
+**e. Install mmdet and mmseg.**
+```shell
+pip install mmdet==2.14.0
+pip install mmsegmentation==0.14.1
+```
+
+**f. Install timm.**
+```shell
+pip install timm
+```
+
+**g. Install mmdet3d.**
+```shell
+conda activate genad
+git clone https://github.com/open-mmlab/mmdetection3d.git
+cd /path/to/mmdetection3d
+git checkout -f v0.17.1
+python setup.py develop
+```
+
+**h. Install nuscenes-devkit.**
+```shell
+pip install nuscenes-devkit==1.1.9
+```
+
+**i. Clone GenAD.**
+```shell
+git clone https://github.com/wzzheng/GenAD.git
+```
+
+**j. Prepare pretrained models.**
+```shell
+cd /path/to/GenAD
+mkdir ckpts
+cd ckpts
+wget https://download.pytorch.org/models/resnet50-19c8e357.pth
+```
diff --git a/GenAD-main/docs/visualization.md b/GenAD-main/docs/visualization.md
new file mode 100644
index 0000000000000000000000000000000000000000..4fb56f6f320d7cac73faa8d961d22f94701d5a3c
--- /dev/null
+++ b/GenAD-main/docs/visualization.md
@@ -0,0 +1,10 @@
+# Visualization
+
+We provide a script [here](../tools/analysis_tools/visualization.py) to visualize the prediction results as a video.
+
+```shell
+cd /path/to/GenAD/
+conda activate genad
+python tools/analysis_tools/visualization.py --result-path /path/to/inference/results --save-path /path/to/save/visualization/results
+```
diff --git a/GenAD-main/projects/__init__.py b/GenAD-main/projects/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/GenAD-main/projects/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/__pycache__/__init__.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c9dde3c4d79f578fac0a631a6d128f256867e63
Binary files /dev/null and b/GenAD-main/projects/__pycache__/__init__.cpython-38.pyc differ
diff --git a/GenAD-main/projects/configs/VAD/GenAD_config.py b/GenAD-main/projects/configs/VAD/GenAD_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea4858cb90d165bd0e45759937226671450acccb
--- /dev/null
+++ b/GenAD-main/projects/configs/VAD/GenAD_config.py
@@ -0,0 +1,443 @@
+_base_ = [
+    '../datasets/custom_nus-3d.py',
+    '../_base_/default_runtime.py'
+]
+#
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
+voxel_size = [0.15, 0.15, 4]
+
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+num_classes = len(class_names)
+
+# map has classes: divider, ped_crossing, boundary
+map_classes = ['divider', 'ped_crossing', 'boundary']
+map_num_vec = 100
+map_fixed_ptsnum_per_gt_line = 20  # now only support fixed_pts > 0
+map_fixed_ptsnum_per_pred_line = 20 +map_eval_use_same_gt_sample_num_flag = True +map_num_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 100 +bev_w_ = 100 +queue_length = 3 # each sequence contains `queue_length` frames. +total_epochs = 60 + +model = dict( + type='VAD', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='VADHead', + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + tot_epoch=total_epochs, + use_traj_lr_warmup=False, + query_thresh=0.0, + query_use_fix_pad=False, + ego_his_encoder=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + agent_dim = 300, + ego_agent_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + ego_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + use_pe=True, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=300, + num_classes=num_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + map_num_vec=map_num_vec, + map_num_classes=map_num_classes, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + map_num_pts_per_gt_vec=map_fixed_ptsnum_per_gt_line, + map_query_embed_type='instance_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v2', + map_dir_interval=1, + map_code_size=2, + map_code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='VADPerceptionTransformer', + map_num_vec=map_num_vec, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + 
num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=3, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + map_decoder=dict( + type='MapDetectionTransformerDecoder', + num_layers=3, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='CustomNMSFreeCoder', + post_center_range=[-20, -35, -10.0, 20, 35, 10.0], + pc_range=point_cloud_range, + max_num=100, + voxel_size=voxel_size, + num_classes=num_classes), + map_bbox_coder=dict( + type='MapNMSFreeCoder', + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=map_num_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_traj=dict(type='L1Loss', loss_weight=0.2), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.2), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_pts=dict(type='PtsL1Loss', loss_weight=1.0), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=0.005), + loss_plan_reg=dict(type='L1Loss', loss_weight=1.0), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=1.0), + loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=1.0), + loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=0.5), + loss_vae_gen=dict(type='ProbabilisticLoss', loss_weight=1.0), + loss_diff_gen=dict(type='DiffusionLoss', loss_weight=0.5)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), 
+ iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range), + map_assigner=dict( + type='MapHungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', weight=1.0), + pc_range=point_cloud_range)))) + +dataset_type = 'VADCustomNuScenesDataset' +data_root = 'xxx/nuscenes/' +file_client_args = dict(backend='disk') + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='CustomObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='CustomObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.4]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='CustomDefaultFormatBundle3D', class_names=class_names, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego_his_trajs', + 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', 'ego_lcf_feat', 'gt_attr_labels']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='CustomObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='CustomObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + # dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.4]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='CustomDefaultFormatBundle3D', class_names=class_names, with_label=False, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['points', 'gt_bboxes_3d', 'gt_labels_3d', 'img', 'fut_valid_flag', + 'ego_his_trajs', 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', + 'ego_lcf_feat', 'gt_attr_labels'])]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'genad_nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + queue_length=queue_length, + map_classes=map_classes, + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + custom_eval_version='vad_nusc_detection_cvpr_2019'), + val=dict(type=dataset_type, + data_root=data_root, + pc_range=point_cloud_range, + ann_file=data_root + 'genad_nuscenes_infos_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1, + map_classes=map_classes, + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + use_pkl_result=True, + custom_eval_version='vad_nusc_detection_cvpr_2019'), + test=dict(type=dataset_type, + data_root=data_root, + pc_range=point_cloud_range, + ann_file=data_root + 'genad_nuscenes_infos_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1, + map_classes=map_classes, + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + use_pkl_result=True, + custom_eval_version='vad_nusc_detection_cvpr_2019'), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) + +evaluation = dict(interval=total_epochs, pipeline=test_pipeline, metric='bbox', map_metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# fp16 = dict(loss_scale=512.) 
+find_unused_parameters = True +checkpoint_config = dict(interval=1, max_keep_ckpts=total_epochs) + + +custom_hooks = [dict(type='CustomSetEpochInfoHook')] \ No newline at end of file diff --git a/GenAD-main/projects/configs/_base_/datasets/coco_instance.py b/GenAD-main/projects/configs/_base_/datasets/coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..f6ea4f4562a8118275a444879a884717b55caa15 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,48 @@ +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/GenAD-main/projects/configs/_base_/datasets/kitti-3d-3class.py b/GenAD-main/projects/configs/_base_/datasets/kitti-3d-3class.py new file mode 100644 index 0000000000000000000000000000000000000000..1822af4209432eb45e105112a165668fac87b6c5 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/kitti-3d-3class.py @@ -0,0 +1,140 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/kitti-3d-car.py b/GenAD-main/projects/configs/_base_/datasets/kitti-3d-car.py new file mode 100644 index 0000000000000000000000000000000000000000..1e81226e2dfdb0e4e802daa8bf0c9f9d19adb125 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/kitti-3d-car.py @@ -0,0 +1,138 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/lyft-3d.py b/GenAD-main/projects/configs/_base_/datasets/lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..71baff04c5b5345ab3d7340607c3496a8befc5fa --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/nuim_instance.py b/GenAD-main/projects/configs/_base_/datasets/nuim_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..82fce56bf6f2ad2578a0426e71fc13c2feb8bf97 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/nuim_instance.py @@ -0,0 +1,59 @@ +dataset_type = 'CocoDataset' +data_root = 'data/nuimages/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1280, 720), (1920, 1080)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-train.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/GenAD-main/projects/configs/_base_/datasets/nus-3d.py b/GenAD-main/projects/configs/_base_/datasets/nus-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..154817175df8de5768c1d56bc35efaa0da99415c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/nus-3d.py @@ -0,0 +1,142 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/nus-mono3d.py b/GenAD-main/projects/configs/_base_/datasets/nus-mono3d.py new file mode 100644 index 0000000000000000000000000000000000000000..1363a94ce4fbb3b1014e61dd52bc36408f119ce1 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/nus-mono3d.py @@ -0,0 +1,100 @@ +dataset_type = 'CustomNuScenesMonoDataset' +data_root = 'data/nuscenes/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=True, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + box_type_3d='Camera'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera')) +evaluation = dict(interval=2) diff --git a/GenAD-main/projects/configs/_base_/datasets/range100_lyft-3d.py b/GenAD-main/projects/configs/_base_/datasets/range100_lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..efa63ea3f0d351198d609785d971c19d96532844 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/range100_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-100, -100, -5, 100, 100, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/s3dis-3d-5class.py b/GenAD-main/projects/configs/_base_/datasets/s3dis-3d-5class.py new file mode 100644 index 0000000000000000000000000000000000000000..2422766fa351ee5cf7f0cd5ee5ab61b88e1d0300 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/s3dis-3d-5class.py @@ -0,0 +1,114 @@ +# dataset settings +dataset_type = 'S3DISDataset' +data_root = './data/s3dis/' +class_names = ('table', 'chair', 'sofa', 'bookcase', 'board') +train_area = [1, 2, 3, 4, 6] +test_area = 5 + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + # following ScanNet dataset the rotation range is 5 degrees + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{i}.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + box_type_3d='Depth') for i in train_area + ], + separate_eval=False)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py b/GenAD-main/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py new file mode 100644 index 0000000000000000000000000000000000000000..39bf5568e01d1a781c1b712e7c20b823e7c90141 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/s3dis_seg-3d-13class.py @@ -0,0 +1,139 @@ +# dataset settings +dataset_type = 'S3DISSegDataset' +data_root = './data/s3dis/' +class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', + 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') +num_points = 4096 +train_area = [1, 2, 3, 4, 6] +test_area = 5 +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.0, + ignore_index=len(class_names), + use_normalized_coord=True, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +# we need to load gt seg_mask! +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + # train on area 1, 2, 3, 4, 6 + # test on area 5 + train=dict( + type=dataset_type, + data_root=data_root, + ann_files=[ + data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area + ], + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=[ + data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy' + for i in train_area + ]), + val=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names), + scene_idxs=data_root + + f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/scannet-3d-18class.py b/GenAD-main/projects/configs/_base_/datasets/scannet-3d-18class.py new file mode 100644 index 0000000000000000000000000000000000000000..93da1e5870561363fb3686e8288ccf561ca72cd2 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/scannet-3d-18class.py @@ -0,0 +1,128 @@ +# dataset settings +dataset_type = 'ScanNetDataset' +data_root = './data/scannet/' +class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_mask_3d=True, + with_seg_3d=True), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='PointSegClassMapping', + valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, + 36, 39), + max_cat_id=40), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', + 'pts_instance_mask' + ]) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + 
scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/scannet_seg-3d-20class.py b/GenAD-main/projects/configs/_base_/datasets/scannet_seg-3d-20class.py new file mode 100644 index 0000000000000000000000000000000000000000..cf73b09c8afa9317fa7077f5f67b1fae3306c1b7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/scannet_seg-3d-20class.py @@ -0,0 +1,132 @@ +# dataset settings +dataset_type = 'ScanNetSegDataset' +data_root = './data/scannet/' +class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', + 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', + 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', + 'bathtub', 'otherfurniture') +num_points = 8192 +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.5, + ignore_index=len(class_names), + use_normalized_coord=False, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + 
type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +# we need to load gt seg_mask! +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names)), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/sunrgbd-3d-10class.py b/GenAD-main/projects/configs/_base_/datasets/sunrgbd-3d-10class.py new file mode 100644 index 0000000000000000000000000000000000000000..7121b75bbf0679c55f706ed07294eb2fa3495cc0 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/sunrgbd-3d-10class.py @@ -0,0 +1,107 @@ +dataset_type = 'SUNRGBDDataset' +data_root = 'data/sunrgbd/' +class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='LoadAnnotations3D'), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.523599, 0.523599], + scale_ratio_range=[0.85, 1.15], + shift_height=True), + dict(type='PointSample', num_points=20000), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + 
sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict(type='PointSample', num_points=20000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=16, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-3class.py b/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-3class.py new file mode 100644 index 0000000000000000000000000000000000000000..920ac154d68cb07669642300fafd52d179be5392 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-3class.py @@ -0,0 +1,145 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'LidarWaymoDataset' +data_root = 'data/waymo-full/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-car.py b/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-car.py new file mode 100644 index 0000000000000000000000000000000000000000..02e262721b29ede7e29d0d0046eba243f2c82249 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/datasets/waymoD5-3d-car.py @@ -0,0 +1,143 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep 
its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/_base_/default_runtime.py b/GenAD-main/projects/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..4e85b69abed5f51238da4f183163066073664350 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/default_runtime.py @@ -0,0 +1,18 @@ +checkpoint_config = dict(interval=1) +# yapf:disable push +# By default we use textlogger hook and tensorboard +# For more loggers see +# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = None +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/GenAD-main/projects/configs/_base_/models/3dssd.py b/GenAD-main/projects/configs/_base_/models/3dssd.py new file mode 100644 index 0000000000000000000000000000000000000000..55344c7ddff660dc0306542d94260efad39f8df7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/3dssd.py @@ -0,0 +1,77 @@ +model = dict( + type='SSD3DNet', + backbone=dict( + type='PointNet2SAMSG', + in_channels=4, + num_points=(4096, 512, (256, 256)), + radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), + num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), + sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), + ((64, 64, 128), (64, 64, 128), (64, 96, 128)), + ((128, 128, 256), (128, 192, 256), (128, 256, 256))), + aggregation_channels=(64, 128, 256), + fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), + fps_sample_range_lists=((-1), (-1), (512, -1)), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + bbox_head=dict( + type='SSD3DHead', + in_channels=256, + vote_module_cfg=dict( + in_channels=256, + num_points=256, + gt_per_seed=1, + conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + with_res_feat=False, + vote_xyz_range=(3.0, 3.0, 2.0)), + 
vote_aggregation_cfg=dict( + type='PointSAModuleMSG', + num_point=256, + radii=(4.8, 6.4), + sample_nums=(16, 32), + mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + use_xyz=True, + normalize_xyz=False, + bias=True), + pred_layer_cfg=dict( + in_channels=1536, + shared_conv_channels=(512, 128), + cls_conv_channels=(128, ), + reg_conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + objectness_loss=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + corner_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), + test_cfg=dict( + nms_cfg=dict(type='nms', iou_thr=0.1), + sample_mod='spec', + score_thr=0.0, + per_class_proposal=True, + max_output_num=100)) diff --git a/GenAD-main/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/GenAD-main/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..fb9e0a8f06d3f597e90156efc9f30264c678fe85 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,200 @@ +# model settings +model = dict( + type='CascadeRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + 
type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/GenAD-main/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py b/GenAD-main/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..efdce59c6d59c6564c6558a7a800852fe14314d7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.1, 0.1, 0.2] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), + pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + 
sparse_shape=[41, 1024, 1024], + output_channels=128, + order=('conv', 'norm', 'act'), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, + 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + layer_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([256, 256]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[1024, 1024, 40], + voxel_size=voxel_size, + out_size_factor=8, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/GenAD-main/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py b/GenAD-main/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..311d76373bd261ed8827409be68db0e577b38327 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.2, 0.2, 8] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=(0.2, 0.2, 8), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + legacy=False), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), + pts_backbone=dict( + type='SECOND', + in_channels=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + out_channels=[128, 128, 
128], + upsample_strides=[0.5, 1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([128, 128, 128]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=4, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + out_size_factor=4, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=4, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/GenAD-main/projects/configs/_base_/models/fcos3d.py b/GenAD-main/projects/configs/_base_/models/fcos3d.py new file mode 100644 index 0000000000000000000000000000000000000000..92ea90760519d6205d75af6a39f927503de89aad --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/fcos3d.py @@ -0,0 +1,74 @@ +model = dict( + type='FCOSMono3D', + pretrained='open-mmlab://detectron2/resnet101_caffe', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSMono3DHead', + num_classes=10, + in_channels=256, + stacked_convs=2, + feat_channels=256, + use_direction_classifier=True, + diff_rad_by_sin=True, + pred_attrs=True, + pred_velo=True, + dir_offset=0.7854, # pi/4 + strides=[8, 16, 32, 64, 128], + group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo + cls_branch=(256, ), + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + () # velo + ), + dir_branch=(256, ), + attr_branch=(256, ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_on_bbox=True, + 
centerness_on_reg=True, + center_sampling=True, + conv_bias=True, + dcn_on_last_conv=True), + train_cfg=dict( + allowed_border=0, + code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.8, + score_thr=0.05, + min_bbox_size=0, + max_per_img=200)) diff --git a/GenAD-main/projects/configs/_base_/models/groupfree3d.py b/GenAD-main/projects/configs/_base_/models/groupfree3d.py new file mode 100644 index 0000000000000000000000000000000000000000..077d049662fe16b91639af4a5923a4e8e540148d --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/groupfree3d.py @@ -0,0 +1,71 @@ +model = dict( + type='GroupFree3DNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=3, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 288)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='GroupFree3DHead', + in_channels=288, + num_decoder_layers=6, + num_proposal=256, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='GroupFree3DMHA', + embed_dims=288, + num_heads=8, + attn_drop=0.1, + dropout_layer=dict(type='Dropout', drop_prob=0.1)), + ffn_cfgs=dict( + embed_dims=288, + feedforward_channels=2048, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', + 'norm')), + pred_layer_cfg=dict( + in_channels=288, shared_conv_channels=(288, 288), bias=True), + sampling_objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=8.0), + objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(sample_mod='kps'), + test_cfg=dict( + sample_mod='kps', + nms_thr=0.25, + score_thr=0.0, + per_class_proposal=True, + prediction_stages='last')) diff --git a/GenAD-main/projects/configs/_base_/models/h3dnet.py b/GenAD-main/projects/configs/_base_/models/h3dnet.py new file mode 100644 index 0000000000000000000000000000000000000000..760566744f6484cde261f87f0d95a1182786779c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/h3dnet.py @@ -0,0 +1,341 @@ +primitive_z_cfg = dict( + type='PrimitiveHead', + num_dims=2, + num_classes=18, + primitive_mode='z', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + 
num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_xy_cfg = dict( + type='PrimitiveHead', + num_dims=1, + num_classes=18, + primitive_mode='xy', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_line_cfg = dict( + type='PrimitiveHead', + num_dims=0, + num_classes=18, + primitive_mode='line', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +model = dict( + type='H3DNet', + backbone=dict( + type='MultiBackbone', + num_streams=4, + suffixes=['net0', 'net1', 'net2', 'net3'], + 
conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), + act_cfg=dict(type='ReLU'), + backbones=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True))), + rpn_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + roi_head=dict( + type='H3DRoIHead', + primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], + bbox_head=dict( + type='H3DBboxHead', + gt_per_seed=3, + num_proposal=256, + suface_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 6, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 6, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + line_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 12, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 12, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + primitive_refine_channels=[128, 128, 128], + upper_thresh=100.0, + surface_thresh=0.5, + line_thresh=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + cues_objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + cues_semantic_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + proposal_objectness_loss=dict( + type='CrossEntropyLoss', 
+ class_weight=[0.2, 0.8], + reduction='none', + loss_weight=5.0), + primitive_center_loss=dict( + type='MSELoss', reduction='none', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + rpn_proposal=dict(use_nms=False), + rcnn=dict( + pos_distance_thr=0.3, + neg_distance_thr=0.6, + sample_mod='vote', + far_threshold=0.6, + near_threshold=0.3, + mask_surface_threshold=0.3, + label_surface_threshold=0.3, + mask_line_threshold=0.3, + label_line_threshold=0.3)), + test_cfg=dict( + rpn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True, + use_nms=False), + rcnn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py new file mode 100644 index 0000000000000000000000000000000000000000..87c7fe0c6145f0cceadafd7f51c98f209538796d --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-80, -80, -5, 80, 80, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), + pts_middle_encoder=dict(output_shape=[640, 640]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py new file mode 100644 index 0000000000000000000000000000000000000000..e153f6c6e69171d29f79b627dd6d152a842d0db2 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_nus.py @@ -0,0 +1,96 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
+voxel_size = [0.25, 0.25, 8] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=64, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + voxel_size=voxel_size, + max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=4, + feat_channels=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='FPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + act_cfg=dict(type='ReLU'), + in_channels=[64, 128, 256], + out_channels=256, + start_level=0, + num_outs=3), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=10, + in_channels=256, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-50, -50, -1.8, 50, 50, -1.8]], + scales=[1, 2, 4], + sizes=[ + [0.8660, 2.5981, 1.], # 1.5/sqrt(3) + [0.5774, 1.7321, 1.], # 1/sqrt(3) + [1., 1., 1.], + [0.4, 0.4, 1], + ], + custom_values=[0, 0], + rotations=[0, 1.57], + reshape_out=True), + assigner_per_size=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=dict( + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_num=500))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py new file mode 100644 index 0000000000000000000000000000000000000000..9cd200f3e4c0dfb7da1823263b22bbcd63d77d63 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
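The header comment above stresses that the point cloud range, voxel size and grid-shaped keys must stay consistent. A small illustrative helper (not part of any config file) showing the arithmetic that ties them together for the values used in this diff:

def bev_grid_size(point_cloud_range, voxel_size):
    # Number of BEV cells implied by a range/voxel pair, returned as [ny, nx],
    # which is the order PointPillarsScatter's output_shape uses.
    x_min, y_min, _, x_max, y_max, _ = point_cloud_range
    nx = round((x_max - x_min) / voxel_size[0])
    ny = round((y_max - y_min) / voxel_size[1])
    return [ny, nx]

# nuScenes base above: (50 - (-50)) / 0.25 = 400 cells per axis -> output_shape=[400, 400]
assert bev_grid_size([-50, -50, -5, 50, 50, 3], [0.25, 0.25, 8]) == [400, 400]
# Lyft override earlier in the diff: (80 - (-80)) / 0.25 = 640 -> output_shape=[640, 640]
assert bev_grid_size([-80, -80, -5, 80, 80, 3], [0.25, 0.25, 8]) == [640, 640]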
+model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-100, -100, -5, 100, 100, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), + pts_middle_encoder=dict(output_shape=[800, 800]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py new file mode 100644 index 0000000000000000000000000000000000000000..85076d0798bc49e1564d6eabe177d1ae92be0aef --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_kitti.py @@ -0,0 +1,93 @@ +voxel_size = [0.16, 0.16, 4] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=32, # max_points_per_voxel + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=4, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), + middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -1.78, 70.4, 39.68, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py 
b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..14873ead474761d96b8487d48765bf2486277bed --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_pointpillars_secfpn_waymo.py @@ -0,0 +1,108 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +voxel_size = [0.32, 0.32, 6] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + voxel_size=voxel_size, + max_voxels=(32000, 32000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[1, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], + [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], + [-74.88, -74.88, 0, 74.88, 74.88, 0]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 1.81, 1.77], # cyclist + [0.84, 0.91, 1.74] # pedestrian + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500))) diff --git a/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_kitti.py b/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_kitti.py new file mode 100644 index 
0000000000000000000000000000000000000000..6bf18abe1df08680cc2bb86dfb7b445af4d63ec8 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_kitti.py @@ -0,0 +1,89 @@ +voxel_size = [0.05, 0.05, 0.1] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=5, + point_cloud_range=[0, -40, -3, 70.4, 40, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000)), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_waymo.py b/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..eb9bd3ae5cd6c94e56aa9d88765746853ca58f3e --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/hv_second_secfpn_waymo.py @@ -0,0 +1,100 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
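One relationship worth noting across the SECOND/PointPillars configs above: SECONDFPN concatenates its upsampled outputs, so the detection head's in_channels equals the sum of the neck's out_channels. A quick illustrative check against the values in this diff:

# hv_pointpillars_secfpn_kitti / _waymo: neck out_channels=[128, 128, 128]
assert sum([128, 128, 128]) == 384   # matches bbox_head in_channels=384
# hv_second_secfpn_kitti (and the Waymo variant below): neck out_channels=[256, 256]
assert sum([256, 256]) == 512        # matches bbox_head in_channels=512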
+voxel_size = [0.08, 0.08, 0.1] +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=10, + point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], + voxel_size=voxel_size, + max_voxels=(80000, 90000)), + voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[61, 1280, 1920], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=384, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], + [-76.8, -51.2, 0, 76.8, 51.2, 0], + [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 0.91, 1.74], # pedestrian + [0.84, 1.81, 1.77] # cyclist + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1) + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500)) diff --git a/GenAD-main/projects/configs/_base_/models/imvotenet_image.py b/GenAD-main/projects/configs/_base_/models/imvotenet_image.py new file mode 100644 index 0000000000000000000000000000000000000000..981f8bc9be90a3c2d0ff1edfef3cb3ce91d20d41 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/imvotenet_image.py @@ -0,0 +1,108 @@ +model = dict( + type='ImVoteNet', + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + img_rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + 
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + img_roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=10, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + + # model training and testing settings + train_cfg=dict( + img_rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + img_rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + img_rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/GenAD-main/projects/configs/_base_/models/mask_rcnn_r50_fpn.py b/GenAD-main/projects/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d5e32b0427cf29b7240b26c7f506c283ae6c04 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,124 @@ +# model settings +model = dict( + type='MaskRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + 
reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/GenAD-main/projects/configs/_base_/models/paconv_cuda_ssg.py b/GenAD-main/projects/configs/_base_/models/paconv_cuda_ssg.py new file mode 100644 index 0000000000000000000000000000000000000000..f513bd4a2f94964f70dba926ef03b427a795e417 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/paconv_cuda_ssg.py @@ -0,0 +1,7 @@ +_base_ = './paconv_ssg.py' + +model = dict( + backbone=dict( + sa_cfg=dict( + type='PAConvCUDASAModule', + scorenet_cfg=dict(mlp_channels=[8, 16, 16])))) diff --git a/GenAD-main/projects/configs/_base_/models/paconv_ssg.py b/GenAD-main/projects/configs/_base_/models/paconv_ssg.py new file mode 100644 index 0000000000000000000000000000000000000000..1d4f1ed39373b40e0871bc97dafaf664ff68594d --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/paconv_ssg.py @@ -0,0 +1,49 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=9, # [xyz, rgb, normalized_xyz] + num_points=(1024, 256, 64, 16), + radius=(None, None, None, None), # use kNN instead of ball query + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + norm_cfg=dict(type='BN2d', momentum=0.1), + sa_cfg=dict( + type='PAConvSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False, + paconv_num_kernels=[16, 16, 16], + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[16, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False))), + decode_head=dict( + type='PAConvHead', + # PAConv model's decoder takes skip connections from beckbone + # different from PointNet++, it also concats input features in the last + # level of decoder, leading to `128 + 6` as the channel number + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128 + 6, 128, 128, 128)), + 
channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # correlation loss to regularize PAConv's kernel weights + loss_regularization=dict( + type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/GenAD-main/projects/configs/_base_/models/parta2.py b/GenAD-main/projects/configs/_base_/models/parta2.py new file mode 100644 index 0000000000000000000000000000000000000000..6c5ae9a66372c404923b21f5ee37dfcacd7347ec --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/parta2.py @@ -0,0 +1,201 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] + +model = dict( + type='PartA2', + voxel_layer=dict( + max_num_points=5, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseUNet', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + rpn_head=dict( + type='PartA2RPNHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78]], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + assigner_per_size=True, + assign_per_class=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + roi_head=dict( + type='PartAggregationROIHead', + num_classes=3, + semantic_head=dict( + type='PointwiseSemanticHead', + in_channels=16, + extra_width=0.2, + seg_score_thr=0.3, + num_classes=3, + loss_seg=dict( + type='FocalLoss', + use_sigmoid=True, + reduction='sum', + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_part=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + seg_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='max')), + part_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='avg')), + bbox_head=dict( + type='PartA2BboxHead', + num_classes=3, + seg_in_channels=16, + part_in_channels=4, + seg_conv_channels=[64, 64], + part_conv_channels=[64, 64], + merge_conv_channels=[128, 128], + down_conv_channels=[128, 256], + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + shared_fc_channels=[256, 512, 512, 512], + cls_channels=[256, 256], + reg_channels=[256, 256], + dropout_ratio=0.1, + roi_feat_size=14, + with_corner_loss=True, + 
loss_bbox=dict( + type='SmoothL1Loss', + beta=1.0 / 9.0, + reduction='sum', + loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1) + ], + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=9000, + nms_post=512, + max_num=512, + nms_thr=0.8, + score_thr=0, + use_rotate_nms=False), + rcnn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1) + ], + sampler=dict( + type='IoUNegPiecewiseSampler', + num=128, + pos_fraction=0.55, + neg_piece_fractions=[0.8, 0.2], + neg_iou_piece_thrs=[0.55, 0.1], + neg_pos_ub=-1, + add_gt_as_proposals=False, + return_iou=True), + cls_pos_thr=0.75, + cls_neg_thr=0.25)), + test_cfg=dict( + rpn=dict( + nms_pre=1024, + nms_post=100, + max_num=100, + nms_thr=0.7, + score_thr=0, + use_rotate_nms=True), + rcnn=dict( + use_rotate_nms=True, + use_raw_score=True, + nms_thr=0.01, + score_thr=0.1))) diff --git a/GenAD-main/projects/configs/_base_/models/pointnet2_msg.py b/GenAD-main/projects/configs/_base_/models/pointnet2_msg.py new file mode 100644 index 0000000000000000000000000000000000000000..222ab885557984125eb52a934f443870e6c6918d --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/pointnet2_msg.py @@ -0,0 +1,28 @@ +_base_ = './pointnet2_ssg.py' + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='PointNet2SAMSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)), + num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), + sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, + 128)), + ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), + (256, 384, 512))), + aggregation_channels=(None, None, None, None), + fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), + fps_sample_range_lists=((-1), (-1), (-1), (-1)), + dilated_group=(False, False, False, False), + out_indices=(0, 1, 2, 3), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128), + (128, 128, 128, 128)))) diff --git a/GenAD-main/projects/configs/_base_/models/pointnet2_ssg.py b/GenAD-main/projects/configs/_base_/models/pointnet2_ssg.py new file mode 100644 index 
0000000000000000000000000000000000000000..58b4c243ded042612abb1c15c9c175f5e932af38 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/pointnet2_ssg.py @@ -0,0 +1,35 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radius=(0.1, 0.2, 0.4, 0.8), + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + type='PointNet2Head', + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128, 128, 128, 128)), + channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/GenAD-main/projects/configs/_base_/models/votenet.py b/GenAD-main/projects/configs/_base_/models/votenet.py new file mode 100644 index 0000000000000000000000000000000000000000..129339dc9eaa3f74c0547a39fa527c14be03743c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/models/votenet.py @@ -0,0 +1,73 @@ +model = dict( + type='VoteNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + test_cfg=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True)) diff --git a/GenAD-main/projects/configs/_base_/schedules/cosine.py 
b/GenAD-main/projects/configs/_base_/schedules/cosine.py new file mode 100644 index 0000000000000000000000000000000000000000..69cb7df87d23846ea7b64fb6d882679e315e55cf --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/cosine.py @@ -0,0 +1,20 @@ +# This schedule is mainly used by models with dynamic voxelization +# optimizer +lr = 0.003 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is changed during training + weight_decay=0.001) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) + +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + min_lr_ratio=1e-5) + +momentum_config = None + +runner = dict(type='EpochBasedRunner', max_epochs=40) diff --git a/GenAD-main/projects/configs/_base_/schedules/cyclic_20e.py b/GenAD-main/projects/configs/_base_/schedules/cyclic_20e.py new file mode 100644 index 0000000000000000000000000000000000000000..704740ee5676515213fd30839f5e116c0b4ebfc7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/cyclic_20e.py @@ -0,0 +1,24 @@ +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 20 epochs by default, we set evaluation +# interval to be 20. Please change the interval accordingly if you do not +# use a default schedule. +# optimizer +# This schedule is mainly used by models on nuScenes dataset +optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='cyclic', + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=(0.85 / 0.95, 1), + cyclic_times=1, + step_ratio_up=0.4, +) + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=20) diff --git a/GenAD-main/projects/configs/_base_/schedules/cyclic_40e.py b/GenAD-main/projects/configs/_base_/schedules/cyclic_40e.py new file mode 100644 index 0000000000000000000000000000000000000000..4a711acf4f31cca94ea7a10d035282a45f648c9c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/cyclic_40e.py @@ -0,0 +1,31 @@ +# The schedule is usually used by models trained on KITTI dataset + +# The learning rate set in the cyclic schedule is the initial learning rate +# rather than the max learning rate. Since the target_ratio is (10, 1e-4), +# the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4 +lr = 0.0018 +# The optimizer follows the setting in SECOND.Pytorch, but here we use +# the official AdamW optimizer implemented by PyTorch.
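To make the comment above concrete, a short worked example (illustrative only) of what target_ratio=(10, 1e-4) does to the initial learning rate over one cycle:

lr_init = 0.0018
peak_lr = lr_init * 10      # 0.018, reached after the ramp-up phase (step_ratio_up=0.4)
final_lr = lr_init * 1e-4   # 1.8e-07 by the end of training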
+optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +# We use cyclic learning rate and momentum schedule following SECOND.Pytorch +# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa +# We implement them in mmcv, for more details, please refer to +# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa +# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa +lr_config = dict( + policy='cyclic', + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, +) +momentum_config = dict( + policy='cyclic', + target_ratio=(0.85 / 0.95, 1), + cyclic_times=1, + step_ratio_up=0.4, +) +# Although the max_epochs is 40, this schedule is usually used we +# RepeatDataset with repeat ratio N, thus the actual max epoch +# number could be Nx40 +runner = dict(type='EpochBasedRunner', max_epochs=40) diff --git a/GenAD-main/projects/configs/_base_/schedules/mmdet_schedule_1x.py b/GenAD-main/projects/configs/_base_/schedules/mmdet_schedule_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..13b3783cbbe93b6c32bc415dc50f633dffa4aec7 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/mmdet_schedule_1x.py @@ -0,0 +1,11 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/GenAD-main/projects/configs/_base_/schedules/schedule_2x.py b/GenAD-main/projects/configs/_base_/schedules/schedule_2x.py new file mode 100644 index 0000000000000000000000000000000000000000..afde799d9de1e9c03587b54458938b63b1f7de41 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/schedule_2x.py @@ -0,0 +1,14 @@ +# optimizer +# This schedule is mainly used by models on nuScenes dataset +optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) +# max_norm=10 is better for SECOND +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 1000, + step=[20, 23]) +momentum_config = None +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=24) diff --git a/GenAD-main/projects/configs/_base_/schedules/schedule_3x.py b/GenAD-main/projects/configs/_base_/schedules/schedule_3x.py new file mode 100644 index 0000000000000000000000000000000000000000..115cd26b760e749b3ccdd50a6f4d201ea38f824e --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/schedule_3x.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used by models on indoor dataset, +# e.g., VoteNet on SUNRGBD and ScanNet +lr = 0.008 # max learning rate +optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) +lr_config = dict(policy='step', warmup=None, step=[24, 32]) +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=36) diff --git a/GenAD-main/projects/configs/_base_/schedules/seg_cosine_150e.py b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_150e.py new file mode 100644 index 
0000000000000000000000000000000000000000..04b44e51de071dc9158e31fe7c51420326f0493c --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_150e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=150) diff --git a/GenAD-main/projects/configs/_base_/schedules/seg_cosine_200e.py b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_200e.py new file mode 100644 index 0000000000000000000000000000000000000000..6a49484c8b37d3c44b7a2979a3173af6a407b967 --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_200e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on ScanNet dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/GenAD-main/projects/configs/_base_/schedules/seg_cosine_50e.py b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_50e.py new file mode 100644 index 0000000000000000000000000000000000000000..975a8f9ff8e5140b0f1707490c282998666c71ef --- /dev/null +++ b/GenAD-main/projects/configs/_base_/schedules/seg_cosine_50e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=50) diff --git a/GenAD-main/projects/configs/datasets/custom_lyft-3d.py b/GenAD-main/projects/configs/datasets/custom_lyft-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..5a95d898c91e463b731a08f7c52b8186e99da83a --- /dev/null +++ b/GenAD-main/projects/configs/datasets/custom_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'CustomLyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=True) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
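As the comment above explains, interval=24 matches the default 24-epoch schedule (see _base_/schedules/schedule_2x.py earlier in this diff, max_epochs=24), so validation runs exactly once, after the final epoch. An illustrative alternative for non-default schedules, kept commented out so it does not conflict with the setting below:

# evaluation = dict(interval=4, pipeline=eval_pipeline)  # e.g. validate every 4 epochs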
+evaluation = dict(interval=24, pipeline=eval_pipeline) \ No newline at end of file diff --git a/GenAD-main/projects/configs/datasets/custom_nus-3d.py b/GenAD-main/projects/configs/datasets/custom_nus-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..af81f9b20d182222d0b69fc26fe32c1e66905a16 --- /dev/null +++ b/GenAD-main/projects/configs/datasets/custom_nus-3d.py @@ -0,0 +1,141 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset_eval_modified' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/GenAD-main/projects/configs/datasets/custom_waymo-3d.py b/GenAD-main/projects/configs/datasets/custom_waymo-3d.py new file mode 100644 index 0000000000000000000000000000000000000000..4100e13546badb06e69fd0b1ed20158de8acf893 --- /dev/null +++ b/GenAD-main/projects/configs/datasets/custom_waymo-3d.py @@ -0,0 +1,112 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'CustomWaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=False, use_camera=True) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1920, 1280), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + + +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=test_pipeline) \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/VAD.py b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD.py new file mode 100644 index 0000000000000000000000000000000000000000..d4876b135aedb12f4d508acfc171a24031510c31 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD.py @@ -0,0 +1,668 @@ +import time +import copy + +import torch +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmcv.runner import force_fp32, auto_fp16 +from scipy.optimize import linear_sum_assignment +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector + +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +from projects.mmdet3d_plugin.VAD.planner.metric_stp3 import PlanningMetric + + +@DETECTORS.register_module() +class VAD(MVXTwoStageDetector): + """VAD model. + """ + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False, + fut_ts=6, + fut_mode=6 + ): + + super(VAD, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + self.fut_ts = fut_ts + self.fut_mode = fut_mode + self.valid_fut_ts = pts_bbox_head['valid_fut_ts'] + + # temporal + self.video_test_mode = video_test_mode + self.prev_frame_info = { + 'prev_bev': None, + 'scene_token': None, + 'prev_pos': 0, + 'prev_angle': 0, + } + + self.planning_metric = None + + def extract_img_feat(self, img, img_metas, len_queue=None): + """Extract features of images.""" + B = img.size(0) + if img is not None: + + # input_shape = img.shape[-2:] + # # update real input shape of each single img + # for img_meta in img_metas: + # img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + if len_queue is not None: + img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) + else: + img_feats_reshaped.append(img_feat.view(B, 
int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=('img'), out_fp32=True) + def extract_feat(self, img, img_metas=None, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) + + return img_feats + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + map_gt_bboxes_3d, + map_gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + prev_bev=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_masks=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None): + """Forward function' + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + prev_bev (torch.Tensor, optional): BEV features of previous frame. + Returns: + dict: Losses of each branch. + """ + + outs = self.pts_bbox_head(pts_feats, img_metas, prev_bev, + ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat, + gt_labels_3d=gt_labels_3d, gt_attr_labels=gt_attr_labels, + ego_fut_trajs=ego_fut_trajs) + loss_inputs = [ + gt_bboxes_3d, gt_labels_3d, map_gt_bboxes_3d, map_gt_labels_3d, + outs, ego_fut_trajs, ego_fut_masks, ego_fut_cmd, gt_attr_labels, + ] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def obtain_history_bev(self, imgs_queue, img_metas_list): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. 
+ """ + self.eval() + + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) + img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + # img_feats = self.extract_feat(img=img, img_metas=img_metas) + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev = self.pts_bbox_head( + img_feats, img_metas, prev_bev, only_bev=True) + self.train() + return prev_bev + + # @auto_fp16(apply_to=('img', 'points')) + @force_fp32(apply_to=('img','points','prev_bev')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + map_gt_bboxes_3d=None, + map_gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_masks=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + len_queue = img.size(1) + prev_img = img[:, :-1, ...] + img = img[:, -1, ...] 
+ + prev_img_metas = copy.deepcopy(img_metas) + # prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) + # import pdb;pdb.set_trace() + prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) if len_queue > 1 else None + + img_metas = [each[len_queue-1] for each in img_metas] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, + map_gt_bboxes_3d, map_gt_labels_3d, img_metas, + gt_bboxes_ignore, map_gt_bboxes_ignore, prev_bev, + ego_his_trajs=ego_his_trajs, ego_fut_trajs=ego_fut_trajs, + ego_fut_masks=ego_fut_masks, ego_fut_cmd=ego_fut_cmd, + ego_lcf_feat=ego_lcf_feat, gt_attr_labels=gt_attr_labels) + + losses.update(losses_pts) + return losses + + def forward_test( + self, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + img=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + **kwargs + ): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. + tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + if self.prev_frame_info['prev_bev'] is not None: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + else: + img_metas[0][0]['can_bus'][-1] = 0 + img_metas[0][0]['can_bus'][:3] = 0 + + new_prev_bev, bbox_results = self.simple_test( + img_metas=img_metas[0], + img=img[0], + prev_bev=self.prev_frame_info['prev_bev'], + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + ego_his_trajs=ego_his_trajs[0], + ego_fut_trajs=ego_fut_trajs[0], + ego_fut_cmd=ego_fut_cmd[0], + ego_lcf_feat=ego_lcf_feat[0], + gt_attr_labels=gt_attr_labels, + **kwargs + ) + # During inference, we save the BEV features and ego motion of each timestamp. 
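The cached pose from the previous frame turns the absolute CAN bus readings into frame-to-frame ego motion, which is what the temporal BEV alignment consumes. A toy illustration of that subtraction (the 18-element `can_bus` layout and all numbers are assumptions made for the example, not values from the code):

# Illustrative only: can_bus[:3] holds an absolute translation and can_bus[-1]
# an absolute heading; subtracting the cached previous pose leaves the delta.
import numpy as np

can_bus = np.zeros(18)
can_bus[:3] = [102.0, 54.5, 0.0]   # current absolute position (x, y, z)
can_bus[-1] = 91.0                 # current absolute heading

prev_pos = np.array([100.0, 54.0, 0.0])
prev_angle = 90.0

can_bus[:3] -= prev_pos            # relative translation since the last frame
can_bus[-1] -= prev_angle          # relative rotation since the last frame
print(can_bus[:3], can_bus[-1])    # [2.  0.5 0. ] 1.0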
+ self.prev_frame_info['prev_pos'] = tmp_pos + self.prev_frame_info['prev_angle'] = tmp_angle + self.prev_frame_info['prev_bev'] = new_prev_bev + + return bbox_results + + def simple_test( + self, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + img=None, + prev_bev=None, + points=None, + fut_valid_flag=None, + rescale=False, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + **kwargs + ): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + bbox_list = [dict() for i in range(len(img_metas))] + new_prev_bev, bbox_pts, metric_dict = self.simple_test_pts( + img_feats, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + prev_bev, + fut_valid_flag=fut_valid_flag, + rescale=rescale, + start=None, + ego_his_trajs=ego_his_trajs, + ego_fut_trajs=ego_fut_trajs, + ego_fut_cmd=ego_fut_cmd, + ego_lcf_feat=ego_lcf_feat, + gt_attr_labels=gt_attr_labels, + ) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + result_dict['metric_results'] = metric_dict + + return new_prev_bev, bbox_list + + def simple_test_pts( + self, + x, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + prev_bev=None, + fut_valid_flag=None, + rescale=False, + start=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + ): + """Test function""" + mapped_class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', + 'trailer', 'barrier', 'motorcycle', 'bicycle', + 'pedestrian', 'traffic_cone' + ] + + + outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev, + ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat) + bbox_list = self.pts_bbox_head.get_bboxes(outs, img_metas, rescale=rescale) + + bbox_results = [] + for i, (bboxes, scores, labels, trajs, map_bboxes, \ + map_scores, map_labels, map_pts) in enumerate(bbox_list): + bbox_result = bbox3d2result(bboxes, scores, labels) + bbox_result['trajs_3d'] = trajs.cpu() + map_bbox_result = self.map_pred2result(map_bboxes, map_scores, map_labels, map_pts) + bbox_result.update(map_bbox_result) + bbox_result['ego_fut_preds'] = outs['ego_fut_preds'][i].cpu() + bbox_result['ego_fut_cmd'] = ego_fut_cmd.cpu() + bbox_results.append(bbox_result) + + assert len(bbox_results) == 1, 'only support batch_size=1 now' + score_threshold = 0.6 + with torch.no_grad(): + c_bbox_results = copy.deepcopy(bbox_results) + + bbox_result = c_bbox_results[0] + gt_bbox = gt_bboxes_3d[0][0] + gt_label = gt_labels_3d[0][0].to('cpu') + gt_attr_label = gt_attr_labels[0][0].to('cpu') + fut_valid_flag = bool(fut_valid_flag[0][0]) + # filter pred bbox by score_threshold + mask = bbox_result['scores_3d'] > score_threshold + bbox_result['boxes_3d'] = bbox_result['boxes_3d'][mask] + bbox_result['scores_3d'] = bbox_result['scores_3d'][mask] + bbox_result['labels_3d'] = bbox_result['labels_3d'][mask] + bbox_result['trajs_3d'] = bbox_result['trajs_3d'][mask] + + matched_bbox_result = self.assign_pred_to_gt_vip3d( + bbox_result, gt_bbox, gt_label) + + metric_dict = self.compute_motion_metric_vip3d( + gt_bbox, gt_label, gt_attr_label, bbox_result, + matched_bbox_result, mapped_class_names) + + # ego planning metric + assert ego_fut_trajs.shape[0] == 1, 'only support batch_size=1 for testing' + ego_fut_preds = bbox_result['ego_fut_preds'] + ego_fut_trajs = ego_fut_trajs[0, 0] + ego_fut_cmd = ego_fut_cmd[0, 0, 0] + ego_fut_cmd_idx = torch.nonzero(ego_fut_cmd)[0, 0] + ego_fut_pred = ego_fut_preds[ego_fut_cmd_idx] + 
ego_fut_pred = ego_fut_pred.cumsum(dim=-2) + ego_fut_trajs = ego_fut_trajs.cumsum(dim=-2) + + metric_dict_planner_stp3 = self.compute_planner_metric_stp3( + pred_ego_fut_trajs = ego_fut_pred[None], + gt_ego_fut_trajs = ego_fut_trajs[None], + gt_agent_boxes = gt_bbox, + gt_agent_feats = gt_attr_label.unsqueeze(0), + fut_valid_flag = fut_valid_flag + ) + metric_dict.update(metric_dict_planner_stp3) + + return outs['bev_embed'], bbox_results, metric_dict + + def map_pred2result(self, bboxes, scores, labels, pts, attrs=None): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor): Bounding boxes with shape of (n, 5). + labels (torch.Tensor): Labels with shape of (n, ). + scores (torch.Tensor): Scores with shape of (n, ). + attrs (torch.Tensor, optional): Attributes with shape of (n, ). \ + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + """ + result_dict = dict( + map_boxes_3d=bboxes.to('cpu'), + map_scores_3d=scores.cpu(), + map_labels_3d=labels.cpu(), + map_pts_3d=pts.to('cpu')) + + if attrs is not None: + result_dict['map_attrs_3d'] = attrs.cpu() + + return result_dict + + def assign_pred_to_gt_vip3d( + self, + bbox_result, + gt_bbox, + gt_label, + match_dis_thresh=2.0 + ): + """Assign pred boxs to gt boxs according to object center preds in lcf. + Args: + bbox_result (dict): Predictions. + 'boxes_3d': (LiDARInstance3DBoxes) + 'scores_3d': (Tensor), [num_pred_bbox] + 'labels_3d': (Tensor), [num_pred_bbox] + 'trajs_3d': (Tensor), [fut_ts*2] + gt_bboxs (LiDARInstance3DBoxes): GT Bboxs. + gt_label (Tensor): GT labels for gt_bbox, [num_gt_bbox]. + match_dis_thresh (float): dis thresh for determine a positive sample for a gt bbox. + + Returns: + matched_bbox_result (np.array): assigned pred index for each gt box [num_gt_bbox]. + """ + dynamic_list = [0,1,3,4,6,7,8] + matched_bbox_result = torch.ones( + (len(gt_bbox)), dtype=torch.long) * -1 # -1: not assigned + gt_centers = gt_bbox.center[:, :2] + pred_centers = bbox_result['boxes_3d'].center[:, :2] + dist = torch.linalg.norm(pred_centers[:, None, :] - gt_centers[None, :, :], dim=-1) + pred_not_dyn = [label not in dynamic_list for label in bbox_result['labels_3d']] + gt_not_dyn = [label not in dynamic_list for label in gt_label] + dist[pred_not_dyn] = 1e6 + dist[:, gt_not_dyn] = 1e6 + dist[dist > match_dis_thresh] = 1e6 + + r_list, c_list = linear_sum_assignment(dist) + + for i in range(len(r_list)): + if dist[r_list[i], c_list[i]] <= match_dis_thresh: + matched_bbox_result[c_list[i]] = r_list[i] + + return matched_bbox_result + + def compute_motion_metric_vip3d( + self, + gt_bbox: object, + gt_label: object, + gt_attr_label: object, + pred_bbox: object, + matched_bbox_result: object, + mapped_class_names: object, + match_dis_thresh: object = 2.0, + ) -> object: + """Compute EPA metric for one sample. + Args: + gt_bboxs (LiDARInstance3DBoxes): GT Bboxs. + gt_label (Tensor): GT labels for gt_bbox, [num_gt_bbox]. + pred_bbox (dict): Predictions. + 'boxes_3d': (LiDARInstance3DBoxes) + 'scores_3d': (Tensor), [num_pred_bbox] + 'labels_3d': (Tensor), [num_pred_bbox] + 'trajs_3d': (Tensor), [fut_ts*2] + matched_bbox_result (np.array): assigned pred index for each gt box [num_gt_bbox]. + match_dis_thresh (float): dis thresh for determine a positive sample for a gt bbox. 
+ + Returns: + EPA_dict (dict): EPA metric dict of each cared class. + """ + motion_cls_names = ['car', 'pedestrian'] + motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit', + 'fp', 'ADE', 'FDE', 'MR'] + + metric_dict = {} + for met in motion_metric_names: + for cls in motion_cls_names: + metric_dict[met+'_'+cls] = 0.0 + + + + + # ignore_list = ['construction_vehicle', 'barrier', + # 'traffic_cone', 'motorcycle', 'bicycle'] + veh_list = [0, 1, 2, 3, 4, 6, 7] + ignore_list = ['barrier', 'traffic_cone'] + + for i in range(pred_bbox['labels_3d'].shape[0]): + pred_bbox['labels_3d'][i] = 0 if pred_bbox['labels_3d'][i] in veh_list else pred_bbox['labels_3d'][i] + box_name = mapped_class_names[pred_bbox['labels_3d'][i]] + if box_name in ignore_list: + continue + if i not in matched_bbox_result: + metric_dict['fp_'+box_name] += 1 + + for i in range(gt_label.shape[0]): + gt_label[i] = 0 if gt_label[i] in veh_list else gt_label[i] + box_name = mapped_class_names[gt_label[i]] + if box_name in ignore_list: + continue + gt_fut_masks = gt_attr_label[i][self.fut_ts*2:self.fut_ts*3] + num_valid_ts = sum(gt_fut_masks==1) + if num_valid_ts == self.fut_ts: + metric_dict['gt_'+box_name] += 1 + if matched_bbox_result[i] >= 0 and num_valid_ts > 0: + metric_dict['cnt_ade_'+box_name] += 1 + m_pred_idx = matched_bbox_result[i] + gt_fut_trajs = gt_attr_label[i][:self.fut_ts*2].reshape(-1, 2) + gt_fut_trajs = gt_fut_trajs[:num_valid_ts] + pred_fut_trajs = pred_bbox['trajs_3d'][m_pred_idx].reshape(self.fut_mode, self.fut_ts, 2) + pred_fut_trajs = pred_fut_trajs[:, :num_valid_ts, :] + gt_fut_trajs = gt_fut_trajs.cumsum(dim=-2) + pred_fut_trajs = pred_fut_trajs.cumsum(dim=-2) + gt_fut_trajs = gt_fut_trajs + gt_bbox[i].center[0, :2] + pred_fut_trajs = pred_fut_trajs + pred_bbox['boxes_3d'][int(m_pred_idx)].center[0, :2] + + dist = torch.linalg.norm(gt_fut_trajs[None, :, :] - pred_fut_trajs, dim=-1) + ade = dist.sum(-1) / num_valid_ts + ade = ade.min() + + metric_dict['ADE_'+box_name] += ade + if num_valid_ts == self.fut_ts: + fde = dist[:, -1].min() + metric_dict['cnt_fde_'+box_name] += 1 + metric_dict['FDE_'+box_name] += fde + if fde <= match_dis_thresh: + metric_dict['hit_'+box_name] += 1 + else: + metric_dict['MR_'+box_name] += 1 + + return metric_dict + + ### same planning metric as stp3 + def compute_planner_metric_stp3( + self, + pred_ego_fut_trajs, + gt_ego_fut_trajs, + gt_agent_boxes, + gt_agent_feats, + fut_valid_flag + ): + """Compute planner metric for one sample same as stp3.""" + metric_dict = { + 'plan_L2_1s':0, + 'plan_L2_2s':0, + 'plan_L2_3s':0, + 'plan_obj_col_1s':0, + 'plan_obj_col_2s':0, + 'plan_obj_col_3s':0, + 'plan_obj_box_col_1s':0, + 'plan_obj_box_col_2s':0, + 'plan_obj_box_col_3s':0, + } + metric_dict['fut_valid_flag'] = fut_valid_flag + future_second = 3 + assert pred_ego_fut_trajs.shape[0] == 1, 'only support bs=1' + if self.planning_metric is None: + self.planning_metric = PlanningMetric() + segmentation, pedestrian = self.planning_metric.get_label( + gt_agent_boxes, gt_agent_feats) + occupancy = torch.logical_or(segmentation, pedestrian) + + for i in range(future_second): + if fut_valid_flag: + cur_time = (i+1)*2 + traj_L2 = self.planning_metric.compute_L2( + pred_ego_fut_trajs[0, :cur_time].detach().to(gt_ego_fut_trajs.device), + gt_ego_fut_trajs[0, :cur_time] + ) + traj_L2_stp3 = self.planning_metric.compute_L2_stp3( + pred_ego_fut_trajs[0, :cur_time].detach().to(gt_ego_fut_trajs.device), + gt_ego_fut_trajs[0, :cur_time] + ) + obj_coll, obj_box_coll = 
self.planning_metric.evaluate_coll( + pred_ego_fut_trajs[:, :cur_time].detach(), + gt_ego_fut_trajs[:, :cur_time], + occupancy) + metric_dict['plan_L2_{}s'.format(i+1)] = traj_L2 + metric_dict['plan_L2_stp3_{}s'.format(i+1)] = traj_L2_stp3 + metric_dict['plan_obj_col_{}s'.format(i+1)] = obj_coll.mean().item() + metric_dict['plan_obj_col_stp3_{}s'.format(i + 1)] = obj_coll[-1].item() + metric_dict['plan_obj_box_col_{}s'.format(i+1)] = obj_box_coll.mean().item() + metric_dict['plan_obj_box_col_stp3_{}s'.format(i + 1)] = obj_box_coll[-1].item() + # if (i == 0): + # metric_dict['plan_1'] = obj_box_coll[0].item() + # metric_dict['plan_2'] = obj_box_coll[1].item() + # if (i == 1): + # metric_dict['plan_3'] = obj_box_coll[2].item() + # metric_dict['plan_4'] = obj_box_coll[3].item() + # if (i == 2): + # metric_dict['plan_5'] = obj_box_coll[4].item() + # metric_dict['plan_6'] = obj_box_coll[5].item() + else: + metric_dict['plan_L2_{}s'.format(i+1)] = 0.0 + metric_dict['plan_L2_stp3_{}s'.format(i + 1)] = 0.0 + metric_dict['plan_obj_col_{}s'.format(i+1)] = 0.0 + metric_dict['plan_obj_box_col_{}s'.format(i+1)] = 0.0 + + return metric_dict + + def set_epoch(self, epoch): + self.pts_bbox_head.epoch = epoch \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_head.py b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9d30933e35171927e860806db204407bc70838 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_head.py @@ -0,0 +1,2156 @@ +import copy +from math import pi, cos, sin + +import torch +import numpy as np +import torch.nn as nn +import matplotlib.pyplot as plt +import torch.nn.functional as F +from mmdet.models import HEADS, build_loss +from mmdet.models.dense_heads import DETRHead +from mmcv.runner import force_fp32, auto_fp16 +from mmcv.utils import TORCH_VERSION, digit_version +from mmdet.core import build_assigner, build_sampler +from mmdet3d.core.bbox.coders import build_bbox_coder +from mmdet.models.utils.transformer import inverse_sigmoid +from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh +from mmcv.cnn import Linear, bias_init_with_prob, xavier_init +from mmdet.core import (multi_apply, multi_apply, reduce_mean) +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence + +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from projects.mmdet3d_plugin.VAD.utils.traj_lr_warmup import get_traj_warmup_loss_weight +from projects.mmdet3d_plugin.VAD.utils.map_utils import ( + normalize_2d_pts, normalize_2d_bbox, denormalize_2d_pts, denormalize_2d_bbox +) + +from projects.mmdet3d_plugin.VAD.generator import DistributionModule, PredictModel +from projects.mmdet3d_plugin.VAD.generator import FuturePrediction + + +class MLP(nn.Module): + def __init__(self, in_channels, hidden_unit, verbose=False): + super(MLP, self).__init__() + self.mlp = nn.Sequential( + nn.Linear(in_channels, hidden_unit), + nn.LayerNorm(hidden_unit), + nn.ReLU() + ) + + def forward(self, x): + x = self.mlp(x) + return x + + +class LaneNet(nn.Module): + def __init__(self, in_channels, hidden_unit, num_subgraph_layers): + super(LaneNet, self).__init__() + self.num_subgraph_layers = num_subgraph_layers + self.layer_seq = nn.Sequential() + for i in range(num_subgraph_layers): + self.layer_seq.add_module( + f'lmlp_{i}', MLP(in_channels, hidden_unit)) + in_channels = hidden_unit * 2 + + def forward(self, pts_lane_feats): + ''' + Extract lane_feature from vectorized lane 
representation + + Args: + pts_lane_feats: [batch size, max_pnum, pts, D] + + Returns: + inst_lane_feats: [batch size, max_pnum, D] + ''' + x = pts_lane_feats + for name, layer in self.layer_seq.named_modules(): + if isinstance(layer, MLP): + # x [bs,max_lane_num,9,dim] + x = layer(x) + x_max = torch.max(x, -2)[0] + x_max = x_max.unsqueeze(2).repeat(1, 1, x.shape[2], 1) + x = torch.cat([x, x_max], dim=-1) + x_max = torch.max(x, -2)[0] + return x_max + + +@HEADS.register_module() +class VADHead(DETRHead): + """Head of VAD model. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. + """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + fut_ts=6, + fut_mode=6, + loss_traj=dict(type='L1Loss', loss_weight=0.25), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.8), + map_bbox_coder=None, + map_num_query=900, + map_num_classes=3, + map_num_vec=20, + map_num_pts_per_vec=2, + map_num_pts_per_gt_vec=2, + map_query_embed_type='all_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v0', + map_dir_interval=1, + map_code_size=None, + map_code_weights=None, + loss_map_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_map_pts=dict( + type='ChamferDistance', loss_src_weight=1.0, loss_dst_weight=1.0 + ), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=2.0), + loss_vae_gen=dict(type='ProbabilisticLoss', loss_weight=1.0), + tot_epoch=None, + use_traj_lr_warmup=False, + motion_decoder=None, + motion_map_decoder=None, + use_pe=False, + motion_det_score=None, + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + ego_his_encoder=None, + ego_fut_mode=3, + loss_plan_reg=dict(type='L1Loss', loss_weight=0.25), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=0.1), + loss_plan_col=dict(type='PlanAgentDisLoss', loss_weight=0.1), + loss_plan_dir=dict(type='PlanMapThetaLoss', loss_weight=0.1), + ego_agent_decoder=None, + ego_map_decoder=None, + query_thresh=None, + query_use_fix_pad=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + agent_dim=300, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + self.fut_ts = fut_ts + self.fut_mode = fut_mode + self.tot_epoch = tot_epoch + self.use_traj_lr_warmup = use_traj_lr_warmup + self.motion_decoder = motion_decoder + self.motion_map_decoder = motion_map_decoder + self.use_pe = use_pe + self.motion_det_score = motion_det_score + self.map_thresh = map_thresh + self.dis_thresh = dis_thresh + self.pe_normalization = pe_normalization + self.ego_his_encoder = ego_his_encoder + self.ego_fut_mode = ego_fut_mode + self.ego_agent_decoder = ego_agent_decoder + self.ego_map_decoder = ego_map_decoder + self.query_thresh = query_thresh + self.query_use_fix_pad = query_use_fix_pad + self.ego_lcf_feat_idx = ego_lcf_feat_idx + self.valid_fut_ts = valid_fut_ts + self.agent_dim = agent_dim + self.with_cur = True + + if loss_traj_cls['use_sigmoid'] == True: + 
self.traj_num_cls = 1 + else: + self.traj_num_cls = 2 + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + if map_code_size is not None: + self.map_code_size = map_code_size + else: + self.map_code_size = 10 + if map_code_weights is not None: + self.map_code_weights = map_code_weights + else: + self.map_code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + + self.map_bbox_coder = build_bbox_coder(map_bbox_coder) + self.map_query_embed_type = map_query_embed_type + self.map_transform_method = map_transform_method + self.map_gt_shift_pts_pattern = map_gt_shift_pts_pattern + map_num_query = map_num_vec * map_num_pts_per_vec + self.map_num_query = map_num_query + self.map_num_classes = map_num_classes + self.map_num_vec = map_num_vec + self.map_num_pts_per_vec = map_num_pts_per_vec + self.map_num_pts_per_gt_vec = map_num_pts_per_gt_vec + self.map_dir_interval = map_dir_interval + + if loss_map_cls['use_sigmoid'] == True: + self.map_cls_out_channels = map_num_classes + else: + self.map_cls_out_channels = map_num_classes + 1 + + self.map_bg_cls_weight = 0 + map_class_weight = loss_map_cls.get('class_weight', None) + if map_class_weight is not None and (self.__class__ is VADHead): + assert isinstance(map_class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(map_class_weight)}.' + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + map_bg_cls_weight = loss_map_cls.get('bg_cls_weight', map_class_weight) + assert isinstance(map_bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(map_bg_cls_weight)}.' + map_class_weight = torch.ones(map_num_classes + 1) * map_class_weight + # set background class as the last indice + map_class_weight[map_num_classes] = map_bg_cls_weight + loss_map_cls.update({'class_weight': map_class_weight}) + if 'bg_cls_weight' in loss_map_cls: + loss_map_cls.pop('bg_cls_weight') + self.map_bg_cls_weight = map_bg_cls_weight + + self.traj_bg_cls_weight = 0 + + super(VADHead, self).__init__(*args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + self.map_code_weights = nn.Parameter(torch.tensor( + self.map_code_weights, requires_grad=False), requires_grad=False) + + if kwargs['train_cfg'] is not None: + assert 'map_assigner' in kwargs['train_cfg'], 'map assigner should be provided ' \ + 'when train_cfg is set.' + map_assigner = kwargs['train_cfg']['map_assigner'] + assert loss_map_cls['loss_weight'] == map_assigner['cls_cost']['weight'], \ + 'The classification weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_bbox['loss_weight'] == map_assigner['reg_cost'][ + 'weight'], 'The regression L1 weight for loss and matcher ' \ + 'should be exactly the same.' 
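A worked example of the map classification class-weight handling built earlier in this constructor (values mirror the defaults in the head's signature: `map_num_classes=3`, `class_weight=1.0`, `bg_cls_weight=0.1`; the snippet is illustrative only): the weight vector gets one extra entry for the implicit background class, placed at the last index.

# Illustrative only: reproduce the class_weight vector assembled for loss_map_cls.
import torch

map_num_classes = 3
class_weight = 1.0
map_bg_cls_weight = 0.1

weights = torch.ones(map_num_classes + 1) * class_weight
weights[map_num_classes] = map_bg_cls_weight   # background is the last index
print(weights)  # tensor([1.0000, 1.0000, 1.0000, 0.1000])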
+ assert loss_map_iou['loss_weight'] == map_assigner['iou_cost']['weight'], \ + 'The regression iou weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_pts['loss_weight'] == map_assigner['pts_cost']['weight'], \ + 'The regression l1 weight for map pts loss and matcher should be' \ + 'exactly the same.' + + self.map_assigner = build_assigner(map_assigner) + # DETR sampling=False, so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.map_sampler = build_sampler(sampler_cfg, context=self) + + self.loss_traj = build_loss(loss_traj) + self.loss_traj_cls = build_loss(loss_traj_cls) + self.loss_map_bbox = build_loss(loss_map_bbox) + self.loss_map_cls = build_loss(loss_map_cls) + self.loss_map_iou = build_loss(loss_map_iou) + self.loss_map_pts = build_loss(loss_map_pts) + self.loss_map_dir = build_loss(loss_map_dir) + self.loss_plan_reg = build_loss(loss_plan_reg) + self.loss_plan_bound = build_loss(loss_plan_bound) + self.loss_plan_col = build_loss(loss_plan_col) + self.loss_plan_dir = build_loss(loss_plan_dir) + self.loss_vae_gen = build_loss(loss_vae_gen) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + cls_branch = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + traj_branch = [] + if self.with_cur: + traj_in_dim = self.embed_dims * 4 + else: + traj_in_dim = self.embed_dims * 2 + for _ in range(self.num_reg_fcs): + traj_branch.append(Linear(traj_in_dim, traj_in_dim)) + traj_branch.append(nn.ReLU()) + traj_branch.append(Linear(traj_in_dim, 2)) + traj_branch = nn.Sequential(*traj_branch) + + traj_cls_branch = [] + # for _ in range(self.num_reg_fcs): + traj_cls_branch.append(Linear(self.embed_dims * 14, self.embed_dims * 2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims * 2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims * 2, self.embed_dims * 2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims * 2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims * 2, self.traj_num_cls)) + traj_cls_branch = nn.Sequential(*traj_cls_branch) + + map_cls_branch = [] + for _ in range(self.num_reg_fcs): + map_cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_cls_branch.append(nn.LayerNorm(self.embed_dims)) + map_cls_branch.append(nn.ReLU(inplace=True)) + map_cls_branch.append(Linear(self.embed_dims, self.map_cls_out_channels)) + map_cls_branch = nn.Sequential(*map_cls_branch) + + map_reg_branch = [] + for _ in range(self.num_reg_fcs): + map_reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_reg_branch.append(nn.ReLU()) + map_reg_branch.append(Linear(self.embed_dims, self.map_code_size)) + map_reg_branch = nn.Sequential(*map_reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
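The prediction branches defined above are duplicated with the local `_get_clones` helper so that, when box refinement is enabled, every decoder layer owns its own parameters instead of sharing one head. A minimal sketch of that pattern (sizes are toy assumptions; `embed_dims` normally comes from the base DETR head config):

# Illustrative only: deep-copy a branch once per decoder layer so the clones
# do not share parameters (this is what with_box_refine relies on).
import copy
import torch.nn as nn

def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

embed_dims, num_classes, num_layers = 256, 10, 3  # toy sizes
cls_branch = nn.Sequential(
    nn.Linear(embed_dims, embed_dims),
    nn.LayerNorm(embed_dims),
    nn.ReLU(inplace=True),
    nn.Linear(embed_dims, num_classes))

cls_branches = _get_clones(cls_branch, num_layers)
assert all(b is not cls_branch for b in cls_branches)               # independent modules
assert cls_branches[0][0].weight is not cls_branches[1][0].weight   # no weight sharing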
+ num_decoder_layers = 1 + num_map_decoder_layers = 1 + if self.transformer.decoder is not None: + num_decoder_layers = self.transformer.decoder.num_layers + if self.transformer.map_decoder is not None: + num_map_decoder_layers = self.transformer.map_decoder.num_layers + num_motion_decoder_layers = 1 + num_pred = (num_decoder_layers + 1) if \ + self.as_two_stage else num_decoder_layers + motion_num_pred = (num_motion_decoder_layers + 1) if \ + self.as_two_stage else num_motion_decoder_layers + map_num_pred = (num_map_decoder_layers + 1) if \ + self.as_two_stage else num_map_decoder_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(cls_branch, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + self.traj_branches = _get_clones(traj_branch, motion_num_pred) + self.traj_cls_branches = _get_clones(traj_cls_branch, motion_num_pred) + self.map_cls_branches = _get_clones(map_cls_branch, map_num_pred) + self.map_reg_branches = _get_clones(map_reg_branch, map_num_pred) + else: + self.cls_branches = nn.ModuleList( + [cls_branch for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + self.traj_branches = nn.ModuleList( + [traj_branch for _ in range(motion_num_pred)]) + self.traj_cls_branches = nn.ModuleList( + [traj_cls_branch for _ in range(motion_num_pred)]) + self.map_cls_branches = nn.ModuleList( + [map_cls_branch for _ in range(map_num_pred)]) + self.map_reg_branches = nn.ModuleList( + [map_reg_branch for _ in range(map_num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + if self.map_query_embed_type == 'all_pts': + self.map_query_embedding = nn.Embedding(self.map_num_query, + self.embed_dims * 2) + elif self.map_query_embed_type == 'instance_pts': + self.map_query_embedding = None + self.map_instance_embedding = nn.Embedding(self.map_num_vec, self.embed_dims * 2) + self.map_pts_embedding = nn.Embedding(self.map_num_pts_per_vec, self.embed_dims * 2) + + if self.motion_decoder is not None: + self.motion_decoder = build_transformer_layer_sequence(self.motion_decoder) + self.motion_mode_query = nn.Embedding(self.fut_mode, self.embed_dims) + self.motion_mode_query.weight.requires_grad = True + if self.use_pe: + self.pos_mlp_sa = nn.Linear(2, self.embed_dims) + else: + raise NotImplementedError('Not implement yet') + + if self.motion_map_decoder is not None: + self.lane_encoder = LaneNet(256, 128, 3) + self.motion_map_decoder = build_transformer_layer_sequence(self.motion_map_decoder) + if self.use_pe: + self.pos_mlp = nn.Linear(2, self.embed_dims) + + if self.ego_his_encoder is not None: + self.ego_his_encoder = LaneNet(2, self.embed_dims // 2, 3) + else: + self.ego_query = nn.Embedding(1, self.embed_dims) + + if self.ego_agent_decoder is not None: + self.ego_agent_decoder = build_transformer_layer_sequence(self.ego_agent_decoder) + if self.use_pe: + self.ego_agent_pos_mlp = nn.Linear(2, self.embed_dims) + + if self.ego_map_decoder is not None: + self.ego_map_decoder = build_transformer_layer_sequence(self.ego_map_decoder) + if self.use_pe: + self.ego_map_pos_mlp = nn.Linear(2, self.embed_dims) + + ego_fut_decoder = [] + ego_fut_dec_in_dim = self.embed_dims * 2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims * 2 + if self.with_cur: + ego_fut_dec_in_dim = int(ego_fut_dec_in_dim * 2) + for _ in range(self.num_reg_fcs): + 
ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, ego_fut_dec_in_dim)) + ego_fut_decoder.append(nn.ReLU()) + ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, self.ego_fut_mode * 2)) + self.ego_fut_decoder = nn.Sequential(*ego_fut_decoder) + + self.agent_fus_mlp = nn.Sequential( + nn.Linear(self.fut_mode * 2 * self.embed_dims, self.embed_dims, bias=True), + nn.LayerNorm(self.embed_dims), + nn.ReLU(), + nn.Linear(self.embed_dims, self.embed_dims, bias=True)) + + ######################################################### + self.ego_coord_mlp = nn.Linear(2, 2) + + self.layer_dim = 4 + self.state_gru = nn.GRU(input_size=32, hidden_size=512, num_layers=self.layer_dim) + + self.ego_gru = nn.GRU(512, 512, 4) + self.motion_gru = nn.GRU(512, 512, 4) + + # motion head + + traj_branch_ar = [] + for _ in range(self.num_reg_fcs): + traj_branch_ar.append(Linear(self.embed_dims * 2, self.embed_dims * 2)) + traj_branch_ar.append(nn.ReLU()) + traj_branch_ar.append(Linear(self.embed_dims * 2, 2)) + traj_branch_ar = nn.Sequential(*traj_branch_ar) + + traj_cls_branch_ar = [] + for _ in range(self.num_reg_fcs): + traj_cls_branch_ar.append(Linear(self.embed_dims * 2, self.embed_dims * 2)) + traj_cls_branch_ar.append(nn.LayerNorm(self.embed_dims * 2)) + traj_cls_branch_ar.append(nn.ReLU(inplace=True)) + traj_cls_branch_ar.append(Linear(self.embed_dims * 2, self.traj_num_cls)) + traj_cls_branch_ar = nn.Sequential(*traj_cls_branch_ar) + + if self.with_box_refine: + self.traj_branches_ar = _get_clones(traj_branch_ar, motion_num_pred) + self.traj_cls_branches_ar = _get_clones(traj_cls_branch_ar, motion_num_pred) + else: + self.traj_branches_ar = nn.ModuleList( + [traj_branch_ar for _ in range(motion_num_pred)]) + self.traj_cls_branches_ar = nn.ModuleList( + [traj_cls_branch_ar for _ in range(motion_num_pred)]) + + # planning head + ego_fut_decoder_ar = [] + ego_fut_dec_in_dim_ar = self.embed_dims * 2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims * 2 + for _ in range(self.num_reg_fcs): + ego_fut_decoder_ar.append(Linear(ego_fut_dec_in_dim_ar, ego_fut_dec_in_dim_ar)) + ego_fut_decoder_ar.append(nn.ReLU()) + ego_fut_decoder_ar.append(Linear(ego_fut_dec_in_dim_ar, self.ego_fut_mode * 2)) + self.ego_fut_decoder_ar = nn.Sequential(*ego_fut_decoder_ar) + + self.ar = True + + # generator motion & planning + self.present_distribution_in_channels = 512 + self.future_distribution_in_channels = 524 + self.now_pred_in_channels = 64 + self.PROBABILISTIC = True + self.latent_dim = 32 + self.MIN_LOG_SIGMA = -5.0 + self.MAX_LOG_SIGMA = 5.0 + self.FUTURE_DIM = 6 + self.N_GRU_BLOCKS = 3 + self.N_RES_LAYERS = 3 + + self.present_distribution = DistributionModule( + self.present_distribution_in_channels, + self.latent_dim, + min_log_sigma=self.MIN_LOG_SIGMA, + max_log_sigma=self.MAX_LOG_SIGMA, + ) + + # future_distribution_in_channels = (self.future_pred_in_channels + # + 4 * self.FUTURE_DIM + # ) + self.future_distribution = DistributionModule( + self.future_distribution_in_channels, + self.latent_dim, + min_log_sigma=self.MIN_LOG_SIGMA, + max_log_sigma=self.MAX_LOG_SIGMA, + ) + + # Future prediction + self.future_prediction = FuturePrediction( + in_channels=self.present_distribution_in_channels, + latent_dim=self.latent_dim, + n_gru_blocks=self.N_GRU_BLOCKS, + n_res_layers=self.N_RES_LAYERS, + ) + + self.predict_model = PredictModel( + in_channels=self.latent_dim, + out_channels=self.embed_dims * 2, + hidden_channels=self.latent_dim * 4, + num_layers=self.layer_dim + ) + + def 
init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_map_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.map_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_traj_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.traj_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + # for m in self.map_reg_branches: + # constant_init(m[-1], 0, bias=0) + # nn.init.constant_(self.map_reg_branches[0][-1].bias.data[2:], 0.) + if self.motion_decoder is not None: + for p in self.motion_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + nn.init.orthogonal_(self.motion_mode_query.weight) + if self.use_pe: + xavier_init(self.pos_mlp_sa, distribution='uniform', bias=0.) + if self.motion_map_decoder is not None: + for p in self.motion_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for p in self.lane_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.use_pe: + xavier_init(self.pos_mlp, distribution='uniform', bias=0.) + if self.ego_his_encoder is not None: + for p in self.ego_his_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_agent_decoder is not None: + for p in self.ego_agent_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_map_decoder is not None: + for p in self.ego_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + # @auto_fp16(apply_to=('mlvl_feats')) + + # @auto_fp16(apply_to=('mlvl_feats')) + @force_fp32(apply_to=('mlvl_feats', 'prev_bev')) + def forward(self, + mlvl_feats, + img_metas, + prev_bev=None, + only_bev=False, + ego_his_trajs=None, + ego_lcf_feat=None, + gt_labels_3d=None, + gt_attr_labels=None, + ego_fut_trajs=None, + ): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
+ """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + + if self.map_query_embed_type == 'all_pts': + map_query_embeds = self.map_query_embedding.weight.to(dtype) + elif self.map_query_embed_type == 'instance_pts': + map_pts_embeds = self.map_pts_embedding.weight.unsqueeze(0) + map_instance_embeds = self.map_instance_embedding.weight.unsqueeze(1) + map_query_embeds = (map_pts_embeds + map_instance_embeds).flatten(0, 1).to(dtype) + + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + map_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + map_reg_branches=self.map_reg_branches if self.with_box_refine else None, # noqa:E501 + map_cls_branches=self.map_cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + # bev_embed: bev features + # hs: agent_query + # init_reference: reference points init + # inter_references: reference points processing + # map_hs: map_query + # map_init_reference: reference points init + # map_inter_references: reference points processing + + bev_embed, hs, init_reference, inter_references, \ + map_hs, map_init_reference, map_inter_references = outputs + + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + outputs_coords_bev = [] + outputs_trajs = [] + outputs_trajs_classes = [] + + map_hs = map_hs.permute(0, 2, 1, 3) + map_outputs_classes = [] + map_outputs_coords = [] + map_outputs_pts_coords = [] + map_outputs_coords_bev = [] + + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] = tmp[..., 0:2] + reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + outputs_coords_bev.append(tmp[..., 0:2].clone().detach()) + tmp[..., 4:5] = tmp[..., 4:5] + reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + for lvl in range(map_hs.shape[0]): + if lvl == 0: + reference = map_init_reference + else: + reference = map_inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + map_outputs_class = 
self.map_cls_branches[lvl]( + map_hs[lvl].view(bs, self.map_num_vec, self.map_num_pts_per_vec, -1).mean(2) + ) + tmp = self.map_reg_branches[lvl](map_hs[lvl]) + # TODO: check the shape of reference + assert reference.shape[-1] == 2 + tmp[..., 0:2] += reference[..., 0:2] + tmp = tmp.sigmoid() # cx,cy,w,h + map_outputs_coord, map_outputs_pts_coord = self.map_transform_box(tmp) + map_outputs_coords_bev.append(map_outputs_pts_coord.clone().detach()) + map_outputs_classes.append(map_outputs_class) + map_outputs_coords.append(map_outputs_coord) + map_outputs_pts_coords.append(map_outputs_pts_coord) + + # motion prediction + + # motion query + if self.motion_decoder is not None: + batch_size, num_agent = outputs_coords_bev[-1].shape[:2] + # motion_query + motion_query = hs[-1].permute(1, 0, 2) # [A, B, D] + mode_query = self.motion_mode_query.weight # [fut_mode, D] + # [M, B, D], M=A*fut_mode + motion_query = (motion_query[:, None, :, :] + mode_query[None, :, None, :]).flatten(0, 1) + if self.use_pe: + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_pos = self.pos_mlp_sa(motion_coords) # [B, A, D] + motion_pos = motion_pos.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + motion_pos = motion_pos.permute(1, 0, 2) # [M, B, D] + else: + motion_pos = None + + if self.motion_det_score is not None: + motion_score = outputs_classes[-1] + max_motion_score = motion_score.max(dim=-1)[0] + invalid_motion_idx = max_motion_score < self.motion_det_score # [B, A] + invalid_motion_idx = invalid_motion_idx.unsqueeze(2).repeat(1, 1, self.fut_mode).flatten(1, 2) + else: + invalid_motion_idx = None + + # ego query + # batch = batch_size + if self.ego_his_encoder is not None: + ego_his_feats = self.ego_his_encoder(ego_his_trajs) # [B, 1, dim] + else: + ego_his_feats = self.ego_query.weight.unsqueeze(0).repeat(batch_size, 1, 1) + # ego <-> agent Interaction + ego_query = ego_his_feats.permute(1, 0, 2) + ego_pos = torch.zeros((batch_size, 1, 2), device=ego_query.device).permute(1, 0, 2) + ego_pos_emb = self.ego_agent_pos_mlp(ego_pos) + + motion_query = torch.cat([motion_query, ego_query], dim=0) + motion_pos = torch.cat([motion_pos, ego_pos_emb], dim=0) + + motion_hs = self.motion_decoder( + query=motion_query, + key=motion_query, + value=motion_query, + query_pos=motion_pos, + key_pos=motion_pos, + key_padding_mask=invalid_motion_idx) + + if self.motion_map_decoder is not None: + # map preprocess + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_coords = motion_coords.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + + # ego_coords = torch.Tensor(1, 1, 2).cuda(1) + ego_coords = torch.zeros([batch_size, 1, 2], device=motion_hs.device) + ego_coords_embd = self.ego_coord_mlp(ego_coords) + # ego_coords_embd = torch.zeros([batch_size, 1, 2], device=motion_hs.device) + motion_coords = torch.cat([motion_coords, ego_coords_embd], dim=1) + + map_query = map_hs[-1].view(batch_size, self.map_num_vec, self.map_num_pts_per_vec, -1) + map_query = self.lane_encoder(map_query) # [B, P, pts, D] -> [B, P, D] + map_score = map_outputs_classes[-1] + map_pos = map_outputs_coords_bev[-1] + map_query, map_pos, key_padding_mask = self.select_and_pad_pred_map( + motion_coords, map_query, map_score, map_pos, + map_thresh=self.map_thresh, dis_thresh=self.dis_thresh, + pe_normalization=self.pe_normalization, use_fix_pad=True) + map_query = map_query.permute(1, 0, 2) # [P, B*M, D] + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + # position encoding + if self.use_pe: + 
(num_query, batch) = ca_motion_query.shape[:2] + motion_pos = torch.zeros((num_query, batch, 2), device=motion_hs.device) + motion_pos = self.pos_mlp(motion_pos) + map_pos = map_pos.permute(1, 0, 2) + map_pos = self.pos_mlp(map_pos) + else: + motion_pos, map_pos = None, None + + ca_motion_query = self.motion_map_decoder( + query=ca_motion_query, + key=map_query, + value=map_query, + query_pos=motion_pos, + key_pos=map_pos, + key_padding_mask=key_padding_mask) + else: + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + ######################################## + # generator for planning & motion + current_states = torch.cat((motion_hs.permute(1, 0, 2), + ca_motion_query.reshape(batch_size, -1, self.embed_dims)), dim=2) + distribution_comp = {} + # states = torch.randn((2, 1, 64, 200, 200), device=motion_hs.device) + # future_distribution_inputs = torch.randn((2, 5, 6, 200, 200), device=motion_hs.device) + noise = None + if self.training: + future_distribution_inputs = self.get_future_labels(gt_labels_3d, gt_attr_labels, + ego_fut_trajs, motion_hs.device) + else: + future_distribution_inputs = None + + # 1. model CVA distribution for state + if self.fut_ts > 0: + # present_state = states[:, :1].contiguous() + if self.PROBABILISTIC: + # Do probabilistic computation + sample, output_distribution = self.distribution_forward( + current_states, future_distribution_inputs, noise + ) + distribution_comp = {**distribution_comp, **output_distribution} + + # 2. predict future state from distribution + hidden_states = current_states + states_hs, future_states_hs = \ + self.future_states_predict(batch_size, sample, hidden_states, current_states) + + ego_query_hs = \ + states_hs[:, :, self.agent_dim * self.fut_mode, :].unsqueeze(1).permute(0, 2, 1, 3) + motion_query_hs = states_hs[:, :, 0:self.agent_dim * self.fut_mode, :] + motion_query_hs = \ + motion_query_hs.reshape(self.fut_ts, batch_size, -1, self.fut_ts, motion_query_hs.shape[-1]) + ego_fut_trajs_list = [] + motion_fut_trajs_list = [] + for i in range(self.fut_ts): + outputs_ego_trajs = self.ego_fut_decoder(ego_query_hs[i]).reshape(batch_size, self.ego_fut_mode, 2) + ego_fut_trajs_list.append(outputs_ego_trajs) + outputs_agent_trajs = self.traj_branches[0](motion_query_hs[i]) + motion_fut_trajs_list.append(outputs_agent_trajs) + + ego_trajs = torch.stack(ego_fut_trajs_list, dim=2) + agent_trajs = torch.stack(motion_fut_trajs_list, dim=3).reshape(batch_size, 1, self.agent_dim, + self.fut_mode, -1) + + motion_cls_hs = torch.cat((future_states_hs[:, :, 0:self.agent_dim * self.fut_mode, :]. + reshape(batch_size, self.agent_dim, self.fut_mode, -1), + current_states[:, 0:self.agent_dim * self.fut_mode, :]. 
+ reshape(batch_size, self.agent_dim, self.fut_mode, -1)), dim=-1) + + # outputs_traj_class = self.traj_cls_branches[0](motion_query_hs) + + # outputs_traj = self.traj_branches[0](motion_hs) + # outputs_trajs.append(outputs_traj) + outputs_traj_class = self.traj_cls_branches[0](motion_cls_hs) + outputs_trajs_classes.append(outputs_traj_class.squeeze(-1)) + + map_outputs_classes = torch.stack(map_outputs_classes) + map_outputs_coords = torch.stack(map_outputs_coords) + map_outputs_pts_coords = torch.stack(map_outputs_pts_coords) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + outputs_trajs = agent_trajs.permute(1, 0, 2, 3, 4) + outputs_trajs_classes = torch.stack(outputs_trajs_classes) + + # print(future_states.shape) + + # Ego prediction + # ego_feats [1, 1, 512] + # outputs_ego_trajs = self.ego_fut_decoder(ego_feats) + # outputs_ego_trajs = outputs_ego_trajs.reshape(outputs_ego_trajs.shape[0], + # self.ego_fut_mode, self.fut_ts, 2) + + outs = { + 'bev_embed': bev_embed, # torch.Size([10000, 1, 256]) + 'all_cls_scores': outputs_classes, # torch.Size([3, 1, 300, 10]) + 'all_bbox_preds': outputs_coords, # torch.Size([3, 1, 300, 10]) + 'all_traj_preds': outputs_trajs.repeat(outputs_coords.shape[0], 1, 1, 1, 1), + # torch.Size([3, 1, 300, 6, 12]) + 'all_traj_cls_scores': outputs_trajs_classes.repeat(outputs_coords.shape[0], 1, 1, 1), + # torch.Size([3, 1, 300, 6]) + 'map_all_cls_scores': map_outputs_classes, # torch.Size([3, 1, 100, 3]) map_outputs_classes + 'map_all_bbox_preds': map_outputs_coords, # torch.Size([3, 1, 100, 4]) map_outputs_coords + 'map_all_pts_preds': map_outputs_pts_coords, # torch.Size([3, 1, 100, 20, 2]) + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + 'map_enc_cls_scores': None, + 'map_enc_bbox_preds': None, + 'map_enc_pts_preds': None, + 'ego_fut_preds': ego_trajs, # torch.Size([1, 3, 6, 2]) + 'loss_vae_gen': distribution_comp, + } + + return outs + + def map_transform_box(self, pts, y_first=False): + """ + Converting the points set into bounding box. + + Args: + pts: the input points sets (fields), each points + set (fields) is represented as 2n scalar. + y_first: if y_fisrt=True, the point set is represented as + [y1, x1, y2, x2 ... yn, xn], otherwise the point set is + represented as [x1, y1, x2, y2 ... xn, yn]. + Returns: + The bbox [cx, cy, w, h] transformed from points. + """ + pts_reshape = pts.view(pts.shape[0], self.map_num_vec, + self.map_num_pts_per_vec, 2) + pts_y = pts_reshape[:, :, :, 0] if y_first else pts_reshape[:, :, :, 1] + pts_x = pts_reshape[:, :, :, 1] if y_first else pts_reshape[:, :, :, 0] + if self.map_transform_method == 'minmax': + # import pdb;pdb.set_trace() + + xmin = pts_x.min(dim=2, keepdim=True)[0] + xmax = pts_x.max(dim=2, keepdim=True)[0] + ymin = pts_y.min(dim=2, keepdim=True)[0] + ymax = pts_y.max(dim=2, keepdim=True)[0] + bbox = torch.cat([xmin, ymin, xmax, ymax], dim=2) + bbox = bbox_xyxy_to_cxcywh(bbox) + else: + raise NotImplementedError + return bbox, pts_reshape + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_attr_labels, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. 
+ bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 10]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 9) in [x,y,z,w,l,h,yaw,vx,vy] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_fut_trajs = gt_attr_labels[:, :self.fut_ts * 2] + gt_fut_masks = gt_attr_labels[:, self.fut_ts * 2:self.fut_ts * 3] + gt_bbox_c = gt_bboxes.shape[-1] + num_gt_bbox, gt_traj_c = gt_fut_trajs.shape + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_bbox_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # trajs targets + traj_targets = torch.zeros((num_bboxes, gt_traj_c), dtype=torch.float32, device=bbox_pred.device) + traj_weights = torch.zeros_like(traj_targets) + traj_targets[pos_inds] = gt_fut_trajs[sampling_result.pos_assigned_gt_inds] + traj_weights[pos_inds] = 1.0 + + # Filter out invalid fut trajs + traj_masks = torch.zeros_like(traj_targets) # [num_bboxes, fut_ts*2] + gt_fut_masks = gt_fut_masks.unsqueeze(-1).repeat(1, 1, 2).view(num_gt_bbox, -1) # [num_gt_bbox, fut_ts*2] + traj_masks[pos_inds] = gt_fut_masks[sampling_result.pos_assigned_gt_inds] + traj_weights = traj_weights * traj_masks + + # Extra future timestamp mask for controlling pred horizon + fut_ts_mask = torch.zeros((num_bboxes, self.fut_ts, 2), + dtype=torch.float32, device=bbox_pred.device) + fut_ts_mask[:, :self.valid_fut_ts, :] = 1.0 + fut_ts_mask = fut_ts_mask.view(num_bboxes, -1) + traj_weights = traj_weights * fut_ts_mask + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + + return ( + labels, label_weights, bbox_targets, bbox_weights, traj_targets, + traj_weights, traj_masks.view(-1, self.fut_ts, 2)[..., 0], + pos_inds, neg_inds + ) + + def _map_get_target_single(self, + cls_score, + bbox_pred, + pts_pred, + gt_labels, + gt_bboxes, + gt_shifts_pts, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. 
+ gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + assign_result, order_index = self.map_assigner.assign(bbox_pred, cls_score, pts_pred, + gt_bboxes, gt_labels, gt_shifts_pts, + gt_bboxes_ignore) + + sampling_result = self.map_sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.map_num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + # pts targets + if order_index is None: + assigned_shift = gt_labels[sampling_result.pos_assigned_gt_inds] + else: + assigned_shift = order_index[sampling_result.pos_inds, sampling_result.pos_assigned_gt_inds] + pts_targets = pts_pred.new_zeros((pts_pred.size(0), + pts_pred.size(1), pts_pred.size(2))) + pts_weights = torch.zeros_like(pts_targets) + pts_weights[pos_inds] = 1.0 + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + pts_targets[pos_inds] = gt_shifts_pts[sampling_result.pos_assigned_gt_inds, assigned_shift, :, :] + return (labels, label_weights, bbox_targets, bbox_weights, + pts_targets, pts_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. 
+ - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, traj_targets_list, traj_weights_list, + gt_fut_masks_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_attr_labels_list, gt_bboxes_ignore_list + ) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, num_total_pos, num_total_neg) + + def map_get_targets(self, + cls_scores_list, + bbox_preds_list, + pts_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + pos_inds_list, neg_inds_list) = multi_apply( + self._map_get_target_single, cls_scores_list, bbox_preds_list, pts_preds_list, + gt_labels_list, gt_bboxes_list, gt_shifts_pts_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) + + def loss_planning(self, + ego_fut_preds, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + lane_preds, + lane_score_preds, + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds): + """"Loss function for ego vehicle planning. 
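+        The planning objective combines an L1 imitation term with map-boundary,
+        agent-collision and lane-direction constraints. ``ego_fut_cmd`` and
+        ``ego_fut_masks`` weight the L1 term, and only the command-selected
+        trajectory mode is passed to the constraint losses.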
+ Args: + ego_fut_preds (Tensor): [B, ego_fut_mode, fut_ts, 2] + ego_fut_gt (Tensor): [B, fut_ts, 2] + ego_fut_masks (Tensor): [B, fut_ts] + ego_fut_cmd (Tensor): [B, ego_fut_mode] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + agent_preds (Tensor): [B, num_agent, 2] + agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2] + agent_score_preds (Tensor): [B, num_agent, 10] + agent_fut_cls_scores (Tensor): [B, num_agent, fut_mode] + Returns: + loss_plan_reg (Tensor): planning reg loss. + loss_plan_bound (Tensor): planning map boundary constraint loss. + loss_plan_col (Tensor): planning col constraint loss. + loss_plan_dir (Tensor): planning directional constraint loss. + """ + + ego_fut_gt = ego_fut_gt.unsqueeze(1).repeat(1, self.ego_fut_mode, 1, 1) + loss_plan_l1_weight = ego_fut_cmd[..., None, None] * ego_fut_masks[:, None, :, None] + loss_plan_l1_weight = loss_plan_l1_weight.repeat(1, 1, 1, 2) + + loss_plan_l1 = self.loss_plan_reg( + ego_fut_preds, + ego_fut_gt, + loss_plan_l1_weight + ) + + loss_plan_bound = self.loss_plan_bound( + ego_fut_preds[ego_fut_cmd == 1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + loss_plan_col = self.loss_plan_col( + ego_fut_preds[ego_fut_cmd == 1], + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds, + weight=ego_fut_masks[:, :, None].repeat(1, 1, 2) + ) + + loss_plan_dir = self.loss_plan_dir( + ego_fut_preds[ego_fut_cmd == 1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_plan_l1 = torch.nan_to_num(loss_plan_l1) + loss_plan_bound = torch.nan_to_num(loss_plan_bound) + loss_plan_col = torch.nan_to_num(loss_plan_col) + loss_plan_dir = torch.nan_to_num(loss_plan_dir) + + loss_plan_dict = dict() + loss_plan_dict['loss_plan_reg'] = loss_plan_l1 + loss_plan_dict['loss_plan_bound'] = loss_plan_bound + loss_plan_dict['loss_plan_col'] = loss_plan_col + loss_plan_dict['loss_plan_dir'] = loss_plan_dir + + return loss_plan_dict + + def loss_single(self, + cls_scores, + bbox_preds, + traj_preds, + traj_cls_preds, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
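+        Note:
+            This head returns the losses as the tuple
+            ``(loss_cls, loss_bbox, loss_traj, loss_traj_cls)``; they are
+            gathered into a loss dict by :func:`loss`.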
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_attr_labels_list, gt_bboxes_ignore_list) + + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + traj_targets = torch.cat(traj_targets_list, 0) + traj_weights = torch.cat(traj_weights_list, 0) + gt_fut_masks = torch.cat(gt_fut_masks_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], + normalized_bbox_targets[isnotnan, :10], + bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + + # traj regression loss + best_traj_preds = self.get_best_fut_preds( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), gt_fut_masks) + + neg_inds = (bbox_weights[:, 0] == 0) + traj_labels = self.get_traj_cls_target( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), + gt_fut_masks, neg_inds) + + loss_traj = self.loss_traj( + best_traj_preds[isnotnan], + traj_targets[isnotnan], + traj_weights[isnotnan], + avg_factor=num_total_pos) + + if self.use_traj_lr_warmup: + loss_scale_factor = get_traj_warmup_loss_weight(self.epoch, self.tot_epoch) + loss_traj = loss_scale_factor * loss_traj + + # traj classification loss + traj_cls_scores = traj_cls_preds.reshape(-1, self.fut_mode) + # construct weighted avg_factor to match with the official DETR repo + traj_cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.traj_bg_cls_weight + if self.sync_cls_avg_factor: + traj_cls_avg_factor = reduce_mean( + traj_cls_scores.new_tensor([traj_cls_avg_factor])) + + traj_cls_avg_factor = max(traj_cls_avg_factor, 1) + loss_traj_cls = self.loss_traj_cls( + traj_cls_scores, traj_labels, label_weights, avg_factor=traj_cls_avg_factor + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_traj = torch.nan_to_num(loss_traj) + loss_traj_cls = torch.nan_to_num(loss_traj_cls) + + return loss_cls, loss_bbox, loss_traj, loss_traj_cls + + def get_best_fut_preds(self, + traj_preds, + 
traj_targets, + gt_fut_masks): + """"Choose best preds among all modes. + Args: + traj_preds (Tensor): MultiModal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2). + traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2). + gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts). + pred_box_centers (Tensor): Pred box centers with shape (num_box_preds, 2). + gt_box_centers (Tensor): Ground truth box centers with shape (num_box_preds, 2). + + Returns: + best_traj_preds (Tensor): best traj preds (min displacement error with gt) + with shape (num_box_preds, fut_ts*2). + """ + + cum_traj_preds = traj_preds.cumsum(dim=-2) + cum_traj_targets = traj_targets.cumsum(dim=-2) + + # Get min pred mode indices. + # (num_box_preds, fut_mode, fut_ts) + dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1) + dist = dist * gt_fut_masks[:, None, :] + dist = dist[..., -1] + dist[torch.isnan(dist)] = dist[torch.isnan(dist)] * 0 + min_mode_idxs = torch.argmin(dist, dim=-1).tolist() + box_idxs = torch.arange(traj_preds.shape[0]).tolist() + best_traj_preds = traj_preds[box_idxs, min_mode_idxs, :, :].reshape(-1, self.fut_ts * 2) + + return best_traj_preds + + def get_traj_cls_target(self, + traj_preds, + traj_targets, + gt_fut_masks, + neg_inds): + """"Get Trajectory mode classification target. + Args: + traj_preds (Tensor): MultiModal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2). + traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2). + gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts). + neg_inds (Tensor): Negtive indices with shape (num_box_preds,) + + Returns: + traj_labels (Tensor): traj cls labels (num_box_preds,). + """ + + cum_traj_preds = traj_preds.cumsum(dim=-2) + cum_traj_targets = traj_targets.cumsum(dim=-2) + + # Get min pred mode indices. + # (num_box_preds, fut_mode, fut_ts) + dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1) + dist = dist * gt_fut_masks[:, None, :] + dist = dist[..., -1] + dist[torch.isnan(dist)] = dist[torch.isnan(dist)] * 0 + traj_labels = torch.argmin(dist, dim=-1) + traj_labels[neg_inds] = self.fut_mode + + return traj_labels + + def map_loss_single(self, + cls_scores, + bbox_preds, + pts_preds, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_pts_list (list[Tensor]): Ground truth pts for each image + with shape (num_gts, fixed_num, 2) in [x,y] format. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
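+        Note:
+            The losses are returned as the tuple
+            ``(loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir)`` and are
+            gathered into a loss dict by :func:`loss`.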
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + pts_preds_list = [pts_preds[i] for i in range(num_imgs)] + + cls_reg_targets = self.map_get_targets(cls_scores_list, bbox_preds_list, pts_preds_list, + gt_bboxes_list, gt_labels_list, gt_shifts_pts_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + pts_targets = torch.cat(pts_targets_list, 0) + pts_weights = torch.cat(pts_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.map_cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.map_bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_map_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_2d_bbox(bbox_targets, self.pc_range) + # normalized_bbox_targets = bbox_targets + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.map_code_weights + + loss_bbox = self.loss_map_bbox( + bbox_preds[isnotnan, :4], + normalized_bbox_targets[isnotnan, :4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + # regression pts CD loss + # num_samples, num_order, num_pts, num_coords + normalized_pts_targets = normalize_2d_pts(pts_targets, self.pc_range) + + # num_samples, num_pts, num_coords + pts_preds = pts_preds.reshape(-1, pts_preds.size(-2), pts_preds.size(-1)) + if self.map_num_pts_per_vec != self.map_num_pts_per_gt_vec: + pts_preds = pts_preds.permute(0, 2, 1) + pts_preds = F.interpolate(pts_preds, size=(self.map_num_pts_per_gt_vec), mode='linear', + align_corners=True) + pts_preds = pts_preds.permute(0, 2, 1).contiguous() + + loss_pts = self.loss_map_pts( + pts_preds[isnotnan, :, :], + normalized_pts_targets[isnotnan, :, :], + pts_weights[isnotnan, :, :], + avg_factor=num_total_pos) + + dir_weights = pts_weights[:, :-self.map_dir_interval, 0] + denormed_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) + denormed_pts_preds_dir = denormed_pts_preds[:, self.map_dir_interval:, :] - \ + denormed_pts_preds[:, :-self.map_dir_interval, :] + pts_targets_dir = pts_targets[:, self.map_dir_interval:, :] - pts_targets[:, :-self.map_dir_interval, :] + + loss_dir = self.loss_map_dir( + denormed_pts_preds_dir[isnotnan, :, :], + pts_targets_dir[isnotnan, :, :], + dir_weights[isnotnan, :], + avg_factor=num_total_pos) + + bboxes = denormalize_2d_bbox(bbox_preds, self.pc_range) + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_map_iou( + bboxes[isnotnan, :4], + bbox_targets[isnotnan, :4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + if digit_version(TORCH_VERSION) >= 
digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_iou = torch.nan_to_num(loss_iou) + loss_pts = torch.nan_to_num(loss_pts) + loss_dir = torch.nan_to_num(loss_dir) + + return loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir + + def distribution_loss(self, output): + kl_loss = self.loss_vae_gen(output) + return kl_loss + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + map_gt_bboxes_list, + map_gt_labels_list, + preds_dicts, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + gt_attr_labels, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
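+
+        # Loss layout: per-decoder-layer detection/motion losses (loss_cls,
+        # loss_bbox, loss_traj, loss_traj_cls), per-decoder-layer map losses
+        # (loss_map_cls/bbox/iou/pts/dir), a planning loss computed on the
+        # last layer's outputs (loss_plan_reg/bound/col/dir), and the
+        # distribution (VAE) term loss_vae_gen.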
+ + map_gt_vecs_list = copy.deepcopy(map_gt_bboxes_list) + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + all_traj_preds = preds_dicts['all_traj_preds'] + all_traj_cls_scores = preds_dicts['all_traj_cls_scores'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + map_all_cls_scores = preds_dicts['map_all_cls_scores'] + map_all_bbox_preds = preds_dicts['map_all_bbox_preds'] + map_all_pts_preds = preds_dicts['map_all_pts_preds'] + map_enc_cls_scores = preds_dicts['map_enc_cls_scores'] + map_enc_bbox_preds = preds_dicts['map_enc_bbox_preds'] + map_enc_pts_preds = preds_dicts['map_enc_pts_preds'] + ego_fut_preds = preds_dicts['ego_fut_preds'] + distribution_pred = preds_dicts['loss_vae_gen'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_attr_labels_list = [gt_attr_labels for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox, loss_traj, loss_traj_cls = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, all_traj_preds, + all_traj_cls_scores, all_gt_bboxes_list, all_gt_labels_list, + all_gt_attr_labels_list, all_gt_bboxes_ignore_list) + + num_dec_layers = len(map_all_cls_scores) + device = map_gt_labels_list[0].device + + map_gt_bboxes_list = [ + map_gt_bboxes.bbox.to(device) for map_gt_bboxes in map_gt_vecs_list] + map_gt_pts_list = [ + map_gt_bboxes.fixed_num_sampled_points.to(device) for map_gt_bboxes in map_gt_vecs_list] + if self.map_gt_shift_pts_pattern == 'v0': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v1': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v1.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v2': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v2.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v3': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v3.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v4': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v4.to(device) for gt_bboxes in map_gt_vecs_list] + else: + raise NotImplementedError + map_all_gt_bboxes_list = [map_gt_bboxes_list for _ in range(num_dec_layers)] + map_all_gt_labels_list = [map_gt_labels_list for _ in range(num_dec_layers)] + map_all_gt_pts_list = [map_gt_pts_list for _ in range(num_dec_layers)] + map_all_gt_shifts_pts_list = [map_gt_shifts_pts_list for _ in range(num_dec_layers)] + map_all_gt_bboxes_ignore_list = [ + map_gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + map_losses_cls, map_losses_bbox, map_losses_iou, \ + map_losses_pts, map_losses_dir = multi_apply( + self.map_loss_single, map_all_cls_scores, map_all_bbox_preds, + map_all_pts_preds, map_all_gt_bboxes_list, map_all_gt_labels_list, + map_all_gt_shifts_pts_list, map_all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + 
loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_traj'] = loss_traj[-1] + loss_dict['loss_traj_cls'] = loss_traj_cls[-1] + # loss from the last decoder layer + loss_dict['loss_map_cls'] = map_losses_cls[-1] + loss_dict['loss_map_bbox'] = map_losses_bbox[-1] + loss_dict['loss_map_iou'] = map_losses_iou[-1] + loss_dict['loss_map_pts'] = map_losses_pts[-1] + loss_dict['loss_map_dir'] = map_losses_dir[-1] + + # Planning Loss + ego_fut_gt = ego_fut_gt.squeeze(1) + ego_fut_masks = ego_fut_masks.squeeze(1).squeeze(1) + ego_fut_cmd = ego_fut_cmd.squeeze(1).squeeze(1) + + batch, num_agent = all_traj_preds[-1].shape[:2] + agent_fut_preds = all_traj_preds[-1].view(batch, num_agent, self.fut_mode, self.fut_ts, 2) + agent_fut_cls_preds = all_traj_cls_scores[-1].view(batch, num_agent, self.fut_mode) + loss_plan_input = [ego_fut_preds, ego_fut_gt, ego_fut_masks, ego_fut_cmd, + map_all_pts_preds[-1], map_all_cls_scores[-1].sigmoid(), + all_bbox_preds[-1][..., 0:2], agent_fut_preds, + all_cls_scores[-1].sigmoid(), agent_fut_cls_preds.sigmoid()] + + loss_planning_dict = self.loss_planning(*loss_plan_input) + loss_dict['loss_plan_reg'] = loss_planning_dict['loss_plan_reg'] + loss_dict['loss_plan_bound'] = loss_planning_dict['loss_plan_bound'] + loss_dict['loss_plan_col'] = loss_planning_dict['loss_plan_col'] + loss_dict['loss_plan_dir'] = loss_planning_dict['loss_plan_dir'] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + # loss from other decoder layers + num_dec_layer = 0 + for map_loss_cls_i, map_loss_bbox_i, map_loss_iou_i, map_loss_pts_i, map_loss_dir_i in zip( + map_losses_cls[:-1], + map_losses_bbox[:-1], + map_losses_iou[:-1], + map_losses_pts[:-1], + map_losses_dir[:-1] + ): + loss_dict[f'd{num_dec_layer}.loss_map_cls'] = map_loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_map_bbox'] = map_loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_map_iou'] = map_loss_iou_i + loss_dict[f'd{num_dec_layer}.loss_map_pts'] = map_loss_pts_i + loss_dict[f'd{num_dec_layer}.loss_map_dir'] = map_loss_dir_i + num_dec_layer += 1 + + # loss of proposal generated from encode feature map. 
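+        # Note: this head sets 'enc_cls_scores' / 'map_enc_cls_scores' to None
+        # in its output dict, so the two encoder-proposal branches below are
+        # effectively no-ops for this configuration.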
+ if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, + gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + if map_enc_cls_scores is not None: + map_binary_labels_list = [ + torch.zeros_like(map_gt_labels_list[i]) + for i in range(len(map_all_gt_labels_list)) + ] + # TODO bug here, but we dont care enc_loss now + map_enc_loss_cls, map_enc_loss_bbox, map_enc_loss_iou, \ + map_enc_loss_pts, map_enc_loss_dir = \ + self.map_loss_single( + map_enc_cls_scores, map_enc_bbox_preds, + map_enc_pts_preds, map_gt_bboxes_list, + map_binary_labels_list, map_gt_pts_list, + map_gt_bboxes_ignore + ) + loss_dict['enc_loss_map_cls'] = map_enc_loss_cls + loss_dict['enc_loss_map_bbox'] = map_enc_loss_bbox + loss_dict['enc_loss_map_iou'] = map_enc_loss_iou + loss_dict['enc_loss_map_pts'] = map_enc_loss_pts + loss_dict['enc_loss_map_dir'] = map_enc_loss_dir + + loss_dict['loss_vae_gen'] = self.loss_vae_gen(distribution_pred) + + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. + """ + + det_preds_dicts = self.bbox_coder.decode(preds_dicts) + # map_bboxes: xmin, ymin, xmax, ymax + map_preds_dicts = self.map_bbox_coder.decode(preds_dicts) + + num_samples = len(det_preds_dicts) + assert len(det_preds_dicts) == len(map_preds_dicts), \ + 'len(preds_dict) should be equal to len(map_preds_dicts)' + ret_list = [] + for i in range(num_samples): + preds = det_preds_dicts[i] + bboxes = preds['bboxes'] + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + trajs = preds['trajs'] + + map_preds = map_preds_dicts[i] + map_bboxes = map_preds['map_bboxes'] + map_scores = map_preds['map_scores'] + map_labels = map_preds['map_labels'] + map_pts = map_preds['map_pts'] + + ret_list.append([bboxes, scores, labels, trajs, map_bboxes, + map_scores, map_labels, map_pts]) + + return ret_list + + def select_and_pad_pred_map( + self, + motion_pos, + map_query, + map_score, + map_pos, + map_thresh=0.5, + dis_thresh=None, + pe_normalization=True, + use_fix_pad=False + ): + """select_and_pad_pred_map. + Args: + motion_pos: [B, A, 2] + map_query: [B, P, D]. + map_score: [B, P, 3]. + map_pos: [B, P, pts, 2]. + map_thresh: map confidence threshold for filtering low-confidence preds + dis_thresh: distance threshold for masking far maps for each agent in cross-attn + use_fix_pad: always pad one lane instance for each batch + Returns: + selected_map_query: [B*A, P1(+1), D], P1 is the max inst num after filter and pad. 
+ selected_map_pos: [B*A, P1(+1), 2] + selected_padding_mask: [B*A, P1(+1)] + """ + + if dis_thresh is None: + raise NotImplementedError('Not implement yet') + + # use the most close pts pos in each map inst as the inst's pos + batch, num_map = map_pos.shape[:2] + map_dis = torch.sqrt(map_pos[..., 0] ** 2 + map_pos[..., 1] ** 2) + min_map_pos_idx = map_dis.argmin(dim=-1).flatten() # [B*P] + min_map_pos = map_pos.flatten(0, 1) # [B*P, pts, 2] + min_map_pos = min_map_pos[range(min_map_pos.shape[0]), min_map_pos_idx] # [B*P, 2] + min_map_pos = min_map_pos.view(batch, num_map, 2) # [B, P, 2] + + # select & pad map vectors for different batch using map_thresh + map_score = map_score.sigmoid() + map_max_score = map_score.max(dim=-1)[0] + map_idx = map_max_score > map_thresh + batch_max_pnum = 0 + for i in range(map_score.shape[0]): + pnum = map_idx[i].sum() + if pnum > batch_max_pnum: + batch_max_pnum = pnum + + selected_map_query, selected_map_pos, selected_padding_mask = [], [], [] + for i in range(map_score.shape[0]): + dim = map_query.shape[-1] + valid_pnum = map_idx[i].sum() + valid_map_query = map_query[i, map_idx[i]] + valid_map_pos = min_map_pos[i, map_idx[i]] + pad_pnum = batch_max_pnum - valid_pnum + padding_mask = torch.tensor([False], device=map_score.device).repeat(batch_max_pnum) + if pad_pnum != 0: + valid_map_query = torch.cat([valid_map_query, torch.zeros((pad_pnum, dim), device=map_score.device)], + dim=0) + valid_map_pos = torch.cat([valid_map_pos, torch.zeros((pad_pnum, 2), device=map_score.device)], dim=0) + padding_mask[valid_pnum:] = True + selected_map_query.append(valid_map_query) + selected_map_pos.append(valid_map_pos) + selected_padding_mask.append(padding_mask) + + selected_map_query = torch.stack(selected_map_query, dim=0) + selected_map_pos = torch.stack(selected_map_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + # generate different pe for map vectors for each agent + num_agent = motion_pos.shape[1] + selected_map_query = selected_map_query.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, D] + selected_map_pos = selected_map_pos.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, 2] + selected_padding_mask = selected_padding_mask.unsqueeze(1).repeat(1, num_agent, 1) # [B, A, max_P] + # move lane to per-car coords system + selected_map_dist = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + if pe_normalization: + selected_map_pos = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + + # filter far map inst for each agent + map_dis = torch.sqrt(selected_map_dist[..., 0] ** 2 + selected_map_dist[..., 1] ** 2) + valid_map_inst = (map_dis <= dis_thresh) # [B, A, max_P] + invalid_map_inst = (valid_map_inst == False) + selected_padding_mask = selected_padding_mask + invalid_map_inst + + selected_map_query = selected_map_query.flatten(0, 1) + selected_map_pos = selected_map_pos.flatten(0, 1) + selected_padding_mask = selected_padding_mask.flatten(0, 1) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_map_query.shape[-1] + if use_fix_pad: + pad_map_query = torch.zeros((num_batch, 1, feat_dim), device=selected_map_query.device) + pad_map_pos = torch.ones((num_batch, 1, 2), device=selected_map_pos.device) + pad_lane_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_map_query = torch.cat([selected_map_query, pad_map_query], dim=1) + selected_map_pos = torch.cat([selected_map_pos, pad_map_pos], dim=1) + 
selected_padding_mask = torch.cat([selected_padding_mask, pad_lane_mask], dim=1) + + return selected_map_query, selected_map_pos, selected_padding_mask + + def select_and_pad_query( + self, + query, + query_pos, + query_score, + score_thresh=0.5, + use_fix_pad=True + ): + """select_and_pad_query. + Args: + query: [B, Q, D]. + query_pos: [B, Q, 2] + query_score: [B, Q, C]. + score_thresh: confidence threshold for filtering low-confidence query + use_fix_pad: always pad one query instance for each batch + Returns: + selected_query: [B, Q', D] + selected_query_pos: [B, Q', 2] + selected_padding_mask: [B, Q'] + """ + + # select & pad query for different batch using score_thresh + query_score = query_score.sigmoid() + query_score = query_score.max(dim=-1)[0] + query_idx = query_score > score_thresh + batch_max_qnum = 0 + for i in range(query_score.shape[0]): + qnum = query_idx[i].sum() + if qnum > batch_max_qnum: + batch_max_qnum = qnum + + selected_query, selected_query_pos, selected_padding_mask = [], [], [] + for i in range(query_score.shape[0]): + dim = query.shape[-1] + valid_qnum = query_idx[i].sum() + valid_query = query[i, query_idx[i]] + valid_query_pos = query_pos[i, query_idx[i]] + pad_qnum = batch_max_qnum - valid_qnum + padding_mask = torch.tensor([False], device=query_score.device).repeat(batch_max_qnum) + if pad_qnum != 0: + valid_query = torch.cat([valid_query, torch.zeros((pad_qnum, dim), device=query_score.device)], dim=0) + valid_query_pos = torch.cat([valid_query_pos, torch.zeros((pad_qnum, 2), device=query_score.device)], + dim=0) + padding_mask[valid_qnum:] = True + selected_query.append(valid_query) + selected_query_pos.append(valid_query_pos) + selected_padding_mask.append(padding_mask) + + selected_query = torch.stack(selected_query, dim=0) + selected_query_pos = torch.stack(selected_query_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_query.shape[-1] + if use_fix_pad: + pad_query = torch.zeros((num_batch, 1, feat_dim), device=selected_query.device) + pad_query_pos = torch.ones((num_batch, 1, 2), device=selected_query_pos.device) + pad_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_query = torch.cat([selected_query, pad_query], dim=1) + selected_query_pos = torch.cat([selected_query_pos, pad_query_pos], dim=1) + selected_padding_mask = torch.cat([selected_padding_mask, pad_mask], dim=1) + + return selected_query, selected_query_pos, selected_padding_mask + + def distribution_forward(self, present_features, future_distribution_inputs=None, noise=None): + """ + Parameters + ---------- + present_features: 5-D output from dynamics module with shape (b, 1, c, h, w) + future_distribution_inputs: 5-D tensor containing labels shape (b, s, cfg.PROB_FUTURE_DIM, h, w) + noise: a sample from a (0, 1) gaussian with shape (b, s, latent_dim). 
If None, will sample in function + + Returns + ------- + sample: sample taken from present/future distribution, broadcast to shape (b, s, latent_dim, h, w) + present_distribution_mu: shape (b, s, latent_dim) + present_distribution_log_sigma: shape (b, s, latent_dim) + future_distribution_mu: shape (b, s, latent_dim) + future_distribution_log_sigma: shape (b, s, latent_dim) + """ + + b = present_features.shape[0] + c = present_features.shape[1] + present_mu, present_log_sigma = self.present_distribution(present_features) + + future_mu, future_log_sigma = None, None + if future_distribution_inputs is not None: + # Concatenate future labels to z_t + # future_features = future_distribution_inputs[:, 1:].contiguous().view(b, 1, -1, h, w) + future_features = torch.cat([present_features, future_distribution_inputs], dim=2) + future_mu, future_log_sigma = self.future_distribution(future_features) + + if noise is None: + if self.training: + noise = torch.randn_like(present_mu) + else: + noise = torch.randn_like(present_mu) + # print('################################') + # print('noise: ', noise) + # print('################################') + if self.training: + mu = future_mu + sigma = torch.exp(future_log_sigma) + else: + mu = present_mu + sigma = torch.exp(present_log_sigma) + sample = mu + sigma * noise + + # Spatially broadcast sample to the dimensions of present_features + sample = sample.permute(0, 2, 1).expand(b, self.latent_dim, c) + + output_distribution = { + 'present_mu': present_mu, + 'present_log_sigma': present_log_sigma, + 'future_mu': future_mu, + 'future_log_sigma': future_log_sigma, + } + + return sample, output_distribution + + def get_future_labels(self, gt_labels_3d, gt_attr_labels, ego_fut_trajs, device): + + agent_dim = 300 + veh_list = [0, 1, 3, 4] + mapped_class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', + 'trailer', 'barrier', 'motorcycle', 'bicycle', + 'pedestrian', 'traffic_cone' + ] + ignore_list = ['construction_vehicle', 'barrier', + 'traffic_cone', 'motorcycle', 'bicycle'] + + batch_size = len(gt_labels_3d) + + # gt_label = gt_labels_3d[0] + # gt_attr_label = gt_attr_labels[0] + + gt_fut_trajs_bz_list = [] + + for bz in range(batch_size): + gt_fut_trajs_list = [] + gt_label = gt_labels_3d[bz] + gt_attr_label = gt_attr_labels[bz] + for i in range(gt_label.shape[0]): + gt_label[i] = 0 if gt_label[i] in veh_list else gt_label[i] + box_name = mapped_class_names[gt_label[i]] + if box_name in ignore_list: + continue + gt_fut_masks = gt_attr_label[i][self.fut_ts * 2:self.fut_ts * 3] + num_valid_ts = sum(gt_fut_masks == 1) + gt_fut_traj = gt_attr_label[i][:self.fut_ts * 2].reshape(-1, 2) + gt_fut_traj = gt_fut_traj[:num_valid_ts] + if gt_fut_traj.shape[0] == 0: + gt_fut_traj = torch.zeros([self.fut_ts - gt_fut_traj.shape[0], 2], device=device) + if gt_fut_traj.shape[0] < self.fut_ts: + gt_fut_traj = torch.cat( + (gt_fut_traj, torch.zeros([self.fut_ts - gt_fut_traj.shape[0], 2], device=device)), 0) + gt_fut_trajs_list.append(gt_fut_traj) + + if len(gt_fut_trajs_list) != 0 & len(gt_fut_trajs_list) < agent_dim: + gt_fut_trajs = torch.cat( + (torch.stack(gt_fut_trajs_list), + torch.zeros([agent_dim - len(gt_fut_trajs_list), self.fut_ts, 2], device=device)), 0) + else: + gt_fut_trajs = torch.zeros([agent_dim, self.fut_ts, 2], device=device) + + gt_fut_trajs_bz_list.append(gt_fut_trajs) + + if len(gt_fut_trajs_bz_list) != 0: + gt_trajs = torch.cat((torch.stack(gt_fut_trajs_bz_list).repeat(1, 6, 1, 1), ego_fut_trajs), dim=1) + else: + gt_trajs = ego_fut_trajs + # 
future_states = gt_trajs.reshape(batch_size, gt_trajs.shape[1], -1) + + # [bz, a, t, 2] + return gt_trajs.reshape(batch_size, gt_trajs.shape[1], -1) + + def future_states_predict(self, batch_size, sample, hidden_states, current_states): + + future_prediction_input = sample.unsqueeze(0).expand(self.fut_ts, -1, -1, -1) + # + # future_states = self.future_prediction(future_prediction_input, hidden_state) + future_prediction_input = future_prediction_input.reshape(self.fut_ts, -1, self.latent_dim) + + hidden_state = hidden_states.reshape(self.layer_dim, -1, int(self.embed_dims / 2)) + # future_states, future_hidden = self.state_gru(future_prediction_input, hidden_state) + future_states = self.predict_model(future_prediction_input, hidden_state) + + current_states_hs = current_states.unsqueeze(0).repeat(6, 1, 1, 1) + future_states_hs = future_states.reshape(self.fut_ts, batch_size, -1, future_states.shape[2]) + + if self.with_cur: + states_hs = torch.cat((current_states_hs, future_states_hs), dim=-1) + else: + states_hs = future_states_hs + + return states_hs, future_states_hs + + + + + + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_transformer.py b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c8ffae5f11233f04ac7bbaecae775517c502e72e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/VAD_transformer.py @@ -0,0 +1,489 @@ +import torch +import numpy as np +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmcv.utils import ext_loader +from torch.nn.init import normal_ +from mmcv.runner.base_module import BaseModule +from mmdet.models.utils.builder import TRANSFORMER +from torchvision.transforms.functional import rotate +from mmcv.cnn.bricks.registry import TRANSFORMER_LAYER_SEQUENCE +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence + +from projects.mmdet3d_plugin.VAD.modules.decoder import CustomMSDeformableAttention +from projects.mmdet3d_plugin.VAD.modules.temporal_self_attention import TemporalSelfAttention +from projects.mmdet3d_plugin.VAD.modules.spatial_cross_attention import MSDeformableAttention3D + + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class MapDetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(MapDetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. 
+ reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 2 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + # new_reference_points[..., 2:3] = tmp[ + # ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@TRANSFORMER.register_module() +class VADPerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
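+        embed_dims (int): Embedding dimension of the queries and BEV
+            features. Default: 256.
+        rotate_prev_bev (bool): Whether to rotate the previous frame's BEV
+            features by the CAN-bus rotation angle before temporal fusion.
+            Default: True.
+        use_can_bus (bool): Whether to add the embedded CAN-bus signals to
+            the BEV queries. Default: True.
+        map_num_vec (int): Number of map vectors. Default: 50.
+        map_num_pts_per_vec (int): Number of points per map vector.
+            Default: 10.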
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + map_decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + map_num_vec=50, + map_num_pts_per_vec=10, + **kwargs): + super(VADPerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + if decoder is not None: + self.decoder = build_transformer_layer_sequence(decoder) + else: + self.decoder = None + if map_decoder is not None: + self.map_decoder = build_transformer_layer_sequence(map_decoder) + else: + self.map_decoder = None + + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + self.two_stage_num_proposals = two_stage_num_proposals + self.rotate_center = rotate_center + self.map_num_vec = map_num_vec + self.map_num_pts_per_vec = map_num_pts_per_vec + self.init_layers() + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.map_reference_points = nn.Linear(self.embed_dims, 2) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.map_reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + # TODO apply fp16 to this module cause grad_norm NAN + # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
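+
+        Args:
+            mlvl_feats (list(Tensor)): Multi-level image features, each with
+                shape [bs, num_cams, embed_dims, h, w].
+            bev_queries (Tensor): BEV queries with shape
+                (bev_h*bev_w, embed_dims).
+            bev_pos (Tensor): BEV positional encoding with shape
+                (bs, embed_dims, bev_h, bev_w).
+            prev_bev (Tensor, optional): BEV features of the previous frame
+                used for temporal fusion. Default: None.
+        Returns:
+            Tensor: BEV features with shape (bs, bev_h*bev_w, embed_dims).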
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + # TODO apply fp16 to this module cause grad_norm NAN + # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + map_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + map_reg_branches=None, + map_cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. 
+ bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + map_query_pos, map_query = torch.split( + map_query_embed, self.embed_dims, dim=1) + map_query_pos = map_query_pos.unsqueeze(0).expand(bs, -1, -1) + map_query = map_query.unsqueeze(0).expand(bs, -1, -1) + map_reference_points = self.map_reference_points(map_query_pos) + map_reference_points = map_reference_points.sigmoid() + map_init_reference_out = map_reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + map_query = map_query.permute(1, 0, 2) + map_query_pos = map_query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + if self.decoder is not None: + # [L, Q, B, D], [L, B, Q, D] + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + inter_references_out = inter_references + else: + inter_states = query.unsqueeze(0) + inter_references_out = reference_points.unsqueeze(0) + + if self.map_decoder is not None: + # [L, Q, B, D], [L, B, Q, D] + map_inter_states, map_inter_references = self.map_decoder( + query=map_query, + key=None, + value=bev_embed, + query_pos=map_query_pos, + reference_points=map_reference_points, + reg_branches=map_reg_branches, + cls_branches=map_cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=map_query.device), + level_start_index=torch.tensor([0], device=map_query.device), + **kwargs) + map_inter_references_out = map_inter_references 
+ else: + map_inter_states = map_query.unsqueeze(0) + map_inter_references_out = map_reference_points.unsqueeze(0) + + return ( + bev_embed, inter_states, init_reference_out, inter_references_out, + map_inter_states, map_init_reference_out, map_inter_references_out) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class CustomTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(CustomTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + key_padding_mask=None, + *args, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + intermediate = [] + for lid, layer in enumerate(self.layers): + query = layer( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + key_padding_mask=key_padding_mask, + *args, + **kwargs) + + if self.return_intermediate: + intermediate.append(query) + + if self.return_intermediate: + return torch.stack(intermediate) + + return query \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d8488aa61f6ec0230f4f17772698cf1d3c062d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/__init__.py @@ -0,0 +1,11 @@ +from .modules import * +from .runner import * +from .hooks import * + +from .VAD import VAD +# from .VAD_head_v2 import VADHead +from .VAD_head import VADHead +from .VAD_transformer import VADPerceptionTransformer, \ + CustomTransformerDecoder, MapDetectionTransformerDecoder + +from .generator import * \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d7efcdfb0646a7a40210462c84cd1618dc2cc82 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD_head.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD_head.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36415fa1a21a02127430dd5d165e7cd9ae6d869c Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/VAD_head.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..04d7958b4d87926ac3ef9fbc4f6e89d1a467c409 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/__pycache__/__init__.cpython-38.pyc differ diff --git 
a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..15dff22b7478a0f30151d376d41f3dc46e88ba7d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__init__.py @@ -0,0 +1,3 @@ +from .train import custom_train_model +from .mmdet_train import custom_train_detector +# from .test import custom_multi_gpu_test \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c610e7f1e0c12eb1bce9408ec700f64a02641e9b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/mmdet_train.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/mmdet_train.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..945c209515ec3d19c99302ecaebf8ddfd8de7c15 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/mmdet_train.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/test.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/test.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42c7bb316260bb47844d913be0ff89407a70832c Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/test.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/train.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/train.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..14f0e9a4285dbeab77adb9ba7962313c09c9e2b2 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/__pycache__/train.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/mmdet_train.py b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/mmdet_train.py new file mode 100644 index 0000000000000000000000000000000000000000..449d49dc4795b3f5f93b275ba58c74357b85bb5d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/mmdet_train.py @@ -0,0 +1,195 @@ +import random +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, build_optimizer, + build_runner, get_dist_info) +from mmcv.utils import build_from_cfg + +from mmdet.core import EvalHook + +from mmdet.datasets import (build_dataset, + replace_ImageToTensor) +from mmdet.utils import get_root_logger +import time +import os.path as osp +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook +from projects.mmdet3d_plugin.datasets.builder import custom_build_dataset +def custom_train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + #assert len(dataset)==1s + if 'imgs_per_gpu' in cfg.data: + 
logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + if eval_model is not None: + eval_model = MMDistributedDataParallel( + eval_model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + if eval_model is not None: + eval_model = MMDataParallel( + eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + if eval_model is not None: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + eval_model=eval_model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + else: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # register profiler hook + #trace_config = dict(type='tb_trace', dir_name='work_dir') + #profiler_config = dict(on_trace_ready=trace_config) + #runner.register_profiler_hook(profiler_config) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 
1) + if val_samples_per_gpu > 1: + assert False + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/test.py b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/test.py new file mode 100644 index 0000000000000000000000000000000000000000..fc09efb0687246cb539b4d9f3c56a53e90b6a453 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/test.py @@ -0,0 +1,159 @@ +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results + + +import mmcv +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. 
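For reference, custom_encode_mask_results above compresses each per-class semantic mask with COCO run-length encoding before results are gathered across GPUs. A small self-contained check of that call on a toy mask (the mask contents and sizes are made up):

import numpy as np
import pycocotools.mask as mask_util

# Toy 4x4 binary mask; custom_encode_mask_results applies the same encode call per class.
mask = np.zeros((4, 4), dtype=np.uint8)
mask[1:3, 1:3] = 1

# The COCO RLE encoder expects a Fortran-ordered H x W x N uint8 array.
rle = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F', dtype='uint8'))[0]
print(rle['size'])            # [4, 4]
print(mask_util.area([rle]))  # [4] -- four foreground pixels

# Round trip back to a dense mask to confirm the encoding is lossless.
restored = mask_util.decode([rle])[:, :, 0]
assert np.array_equal(restored, mask)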
+ tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. + """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' + # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if mask_results is None: + return {'bbox_results': bbox_results} + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + collect_results_cpu(result_part, size) \ No 
newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/apis/train.py b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f44c9d8372e9fc429c62c3fa304497f5c051e6af --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/apis/train.py @@ -0,0 +1,61 @@ +from .mmdet_train import custom_train_detector +from mmseg.apis import train_segmentor +from mmdet.apis import train_detector + +def custom_train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + assert False + else: + custom_train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + eval_model=eval_model, + meta=meta) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e1082f134a1f594370163d1eee2dd28b65488d1a --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__init__.py @@ -0,0 +1,6 @@ +from .distributions import DistributionModule, PredictModel, DistributionDecoder1DV2, PredictModelHidden +from .layers import Bottleneck, SpatialGRU +from .state_prediction import FuturePrediction +# from .diffusion_model import DDIMScheduler +# from .diffusion_states_estimate import DDIMDepthEstimateRes, EmbeddingDimForward, EmbeddingDimReverse, \ +# DiffusionHeadMotion, DiffusionHeadPlan, AutoRegMotionPredict, AutoRegEgoPredict, AutoRegMotionPredictAll, AutoRegEgoPredictAll diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e435fdea401be719fae45ad3cf7fa41a36f0c2c7 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/distributions.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/distributions.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..01c94943dbbbeb3d283818df00764b2cd0910c9e Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/distributions.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/layers.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/layers.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..ea2c8df9de5b9fc768893b97fc23c77f22b704d7 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/layers.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/state_prediction.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/state_prediction.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5960f5b61b81837598ca541032292732e4193044 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/__pycache__/state_prediction.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/distributions.py b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/distributions.py new file mode 100644 index 0000000000000000000000000000000000000000..e0eda3c6359d2236cb439f1b702a4de1ac7ce783 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/distributions.py @@ -0,0 +1,182 @@ + +import torch +import torch.nn as nn + +from mmdet.models import LOSSES + +from .layers import Bottleneck + + +class DistributionModule(nn.Module): + """ + A convolutional net that parametrises a diagonal Gaussian distribution. + """ + + def __init__( + self, in_channels, latent_dim, min_log_sigma, max_log_sigma): + super().__init__() + self.compress_dim = in_channels // 2 + self.latent_dim = latent_dim + self.min_log_sigma = min_log_sigma + self.max_log_sigma = max_log_sigma + + # self.encoder = DistributionEncoder2D( + # in_channels, + # self.compress_dim, + # ) + + self.encoder = DistributionEncoder1DV2( + in_channels, + self.compress_dim, + ) + + self.last_conv = nn.Sequential( + nn.AdaptiveAvgPool1d(1), nn.Conv1d(self.compress_dim, out_channels=2 * self.latent_dim, kernel_size=1) + ) + + def forward(self, s_t): + encoding = self.encoder(s_t.permute(0, 2, 1)) + mu_log_sigma = self.last_conv(encoding).permute(0, 2, 1) + mu = mu_log_sigma[:, :, :self.latent_dim] + log_sigma = mu_log_sigma[:, :, self.latent_dim:] + + # clip the log_sigma value for numerical stability + log_sigma = torch.clamp(log_sigma, self.min_log_sigma, self.max_log_sigma) + return mu, log_sigma + +class DistributionEncoder2D(nn.Module): + """Encodes s_t or (s_t, y_{t+1}, ..., y_{t+H}). + """ + def __init__(self, in_channels, out_channels): + super().__init__() + + self.model = nn.Sequential( + Bottleneck(in_channels, out_channels=out_channels, downsample=True), + Bottleneck(out_channels, out_channels=out_channels, downsample=True), + Bottleneck(out_channels, out_channels=out_channels, downsample=True), + Bottleneck(out_channels, out_channels=out_channels, downsample=True), + ) + + def forward(self, s_t): + return self.model(s_t) + +class DistributionEncoder1D(nn.Module): + """Encodes s_t or (s_t, y_{t+1}, ..., y_{t+H}). + """ + def __init__(self, in_channels, out_channels): + super().__init__() + + self.model = nn.Sequential( + nn.Conv1d(in_channels, out_channels=in_channels*2, kernel_size=1, stride=1), + nn.Conv1d(in_channels*2, out_channels=in_channels*2, kernel_size=1, stride=1), + nn.Conv1d(in_channels*2, out_channels=in_channels, kernel_size=1, stride=1), + nn.Conv1d(in_channels, out_channels=out_channels, kernel_size=1, stride=1), + ) + + def forward(self, s_t): + return self.model(s_t) + +class DistributionEncoder1DV2(nn.Module): + """Encodes s_t or (s_t, y_{t+1}, ..., y_{t+H}). 
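DistributionModule above only produces mu and log_sigma (clamped for numerical stability); drawing a latent sample happens elsewhere in the model. A reparameterised sample from such a diagonal Gaussian would typically look like the sketch below; the shapes and clamp range are illustrative stand-ins, not values read from the repo.

import torch

torch.manual_seed(0)
bs, latent_dim = 2, 32

# Stand-ins for the DistributionModule outputs, shape (batch, 1, latent_dim).
mu = torch.randn(bs, 1, latent_dim)
log_sigma = torch.randn(bs, 1, latent_dim).clamp(-5.0, 5.0)  # mirrors the clamp above

# Reparameterisation trick: sample = mu + sigma * eps keeps gradients flowing to mu/log_sigma.
eps = torch.randn_like(mu)
sample = mu + torch.exp(log_sigma) * eps
print(sample.shape)  # torch.Size([2, 1, 32])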
+ """ + def __init__(self, in_channels, out_channels): + super().__init__() + + self.conv1 = nn.Conv1d(in_channels, out_channels=in_channels * 2, kernel_size=1, stride=1) + self.conv2 = nn.Conv1d(in_channels * 2, out_channels=in_channels * 2, kernel_size=1, stride=1) + self.conv3 = nn.Conv1d(in_channels * 2, out_channels=out_channels, kernel_size=1, stride=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, s_t): + s_t = self.relu(self.conv1(s_t)) + s_t = self.relu(self.conv2(s_t)) + s_t = self.conv3(s_t) + + return s_t + +class DistributionDecoder1DV2(nn.Module): + """Decodes sample to future states. + """ + def __init__(self, in_channels, out_channels): + super().__init__() + + self.conv1 = nn.Conv1d(in_channels, out_channels=in_channels * 8, kernel_size=1, stride=1) + self.conv2 = nn.Conv1d(in_channels * 8, out_channels=in_channels * 8, kernel_size=1, stride=1) + self.conv3 = nn.Conv1d(in_channels * 8, out_channels=out_channels, kernel_size=1, stride=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, f_t): + f_t = self.relu(self.conv1(f_t)) + f_t = self.relu(self.conv2(f_t)) + f_t = self.conv3(f_t) + + return f_t + +class PredictModel(nn.Module): + """predict future states with rnn. + """ + def __init__(self, in_channels, out_channels, hidden_channels, num_layers): + super().__init__() + self.gru = nn.GRU(input_size=in_channels, hidden_size=hidden_channels, num_layers=num_layers) + self.linear1 = nn.Linear(hidden_channels, hidden_channels*2) + self.linear2 = nn.Linear(hidden_channels*2, hidden_channels*4) + self.linear3 = nn.Linear(hidden_channels*4, out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x , h): + x, h = self.gru(x, h) + x = self.relu(self.linear1(x)) + x = self.relu(self.linear2(x)) + x = self.linear3(x) + return x + + +class PredictModelHidden(nn.Module): + """predict future states with rnn. 
+ """ + def __init__(self, in_channels, out_channels, hidden_channels, num_layers): + super().__init__() + self.gru = nn.GRU(input_size=in_channels, hidden_size=hidden_channels, num_layers=num_layers) + self.linear1 = nn.Linear(hidden_channels, hidden_channels*2) + self.linear2 = nn.Linear(hidden_channels*2, hidden_channels*4) + self.linear3 = nn.Linear(hidden_channels*4, out_channels) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x, h = self.gru(x) + x = self.relu(self.linear1(x)) + x = self.relu(self.linear2(x)) + x = self.linear3(x) + return x + + + + +@LOSSES.register_module() +class ProbabilisticLoss(nn.Module): + def __init__(self, loss_weight=1.0): + super().__init__() + self.loss_weight = loss_weight + + def forward(self, output): + present_mu = output['present_mu'] + present_log_sigma = output['present_log_sigma'] + future_mu = output['future_mu'] + future_log_sigma = output['future_log_sigma'] + + var_future = torch.exp(2 * future_log_sigma) + var_present = torch.exp(2 * present_log_sigma) + kl_div = ( + present_log_sigma - future_log_sigma - 0.5 + (var_future + (future_mu - present_mu) ** 2) / ( + 2 * var_present) + ) + + kl_loss = torch.mean(torch.sum(kl_div, dim=-1)) * self.loss_weight + + return kl_loss + + + + + + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/layers.py b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..068369ca950891321a0113d4f8680ad90a8bc23c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/layers.py @@ -0,0 +1,235 @@ + +from collections import OrderedDict + +import torch +import torch.nn as nn + +from functools import partial + +class Bottleneck(nn.Module): + """ + Defines a bottleneck module with a residual connection + """ + + def __init__( + self, + in_channels, + out_channels=None, + kernel_size=3, + dilation=1, + groups=1, + upsample=False, + downsample=False, + dropout=0.0, + ): + super().__init__() + self._downsample = downsample + bottleneck_channels = int(in_channels / 2) + out_channels = out_channels or in_channels + padding_size = ((kernel_size - 1) * dilation + 1) // 2 + + # Define the main conv operation + assert dilation == 1 + if upsample: + assert not downsample, 'downsample and upsample not possible simultaneously.' 
+ bottleneck_conv = nn.ConvTranspose2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=1, + stride=2, + output_padding=padding_size, + padding=padding_size, + groups=groups, + ) + elif downsample: + bottleneck_conv = nn.Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=dilation, + stride=2, + padding=padding_size, + groups=groups, + ) + else: + bottleneck_conv = nn.Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=dilation, + padding=padding_size, + groups=groups, + ) + + self.layers = nn.Sequential( + OrderedDict( + [ + # First projection with 1x1 kernel + ('conv_down_project', nn.Conv2d(in_channels, bottleneck_channels, kernel_size=1, bias=False)), + ('abn_down_project', nn.Sequential(nn.BatchNorm2d(bottleneck_channels), + nn.ReLU(inplace=True))), + # Second conv block + ('conv', bottleneck_conv), + ('abn', nn.Sequential(nn.BatchNorm2d(bottleneck_channels), nn.ReLU(inplace=True))), + # Final projection with 1x1 kernel + ('conv_up_project', nn.Conv2d(bottleneck_channels, out_channels, kernel_size=1, bias=False)), + ('abn_up_project', nn.Sequential(nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True))), + # Regulariser + ('dropout', nn.Dropout2d(p=dropout)), + ] + ) + ) + + if out_channels == in_channels and not downsample and not upsample: + self.projection = None + else: + projection = OrderedDict() + if upsample: + projection.update({'upsample_skip_proj': Interpolate(scale_factor=2)}) + elif downsample: + projection.update({'upsample_skip_proj': nn.MaxPool2d(kernel_size=2, stride=2)}) + projection.update( + { + 'conv_skip_proj': nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False), + 'bn_skip_proj': nn.BatchNorm2d(out_channels), + } + ) + self.projection = nn.Sequential(projection) + + # pylint: disable=arguments-differ + def forward(self, *args): + (x,) = args + x_residual = self.layers(x) + if self.projection is not None: + if self._downsample: + # pad h/w dimensions if they are odd to prevent shape mismatch with residual layer + x = nn.functional.pad(x, (0, x.shape[-1] % 2, 0, x.shape[-2] % 2), value=0) + return x_residual + self.projection(x) + return x_residual + x + +class ConvBlock(nn.Module): + """2D convolution followed by + - an optional normalisation (batch norm or instance norm) + - an optional activation (ReLU, LeakyReLU, or tanh) + """ + + def __init__( + self, + in_channels, + out_channels=None, + kernel_size=3, + stride=1, + norm='bn', + activation='relu', + bias=False, + transpose=False, + ): + super().__init__() + out_channels = out_channels or in_channels + padding = int((kernel_size - 1) / 2) + self.conv = nn.Conv2d if not transpose else partial(nn.ConvTranspose2d, output_padding=1) + self.conv = self.conv(in_channels, out_channels, kernel_size, stride, padding=padding, bias=bias) + + if norm == 'bn': + self.norm = nn.BatchNorm2d(out_channels) + elif norm == 'in': + self.norm = nn.InstanceNorm2d(out_channels) + elif norm == 'none': + self.norm = None + else: + raise ValueError('Invalid norm {}'.format(norm)) + + if activation == 'relu': + self.activation = nn.ReLU(inplace=True) + elif activation == 'lrelu': + self.activation = nn.LeakyReLU(0.1, inplace=True) + elif activation == 'elu': + self.activation = nn.ELU(inplace=True) + elif activation == 'tanh': + self.activation = nn.Tanh(inplace=True) + elif activation == 'none': + self.activation = None + else: + raise ValueError('Invalid activation 
{}'.format(activation)) + + def forward(self, x): + x = self.conv(x) + + if self.norm: + x = self.norm(x) + if self.activation: + x = self.activation(x) + return x + + +class SpatialGRU(nn.Module): + """A GRU cell that takes an input tensor [BxTxCxHxW] and an optional previous state and passes a + convolutional gated recurrent unit over the data""" + + def __init__(self, input_size, hidden_size, gru_bias_init=0.0, norm='bn', activation='relu'): + super().__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.gru_bias_init = gru_bias_init + + self.conv_update = nn.Conv2d(input_size + hidden_size, hidden_size, kernel_size=3, bias=True, padding=1) + self.conv_reset = nn.Conv2d(input_size + hidden_size, hidden_size, kernel_size=3, bias=True, padding=1) + + self.conv_state_tilde = ConvBlock( + input_size + hidden_size, hidden_size, kernel_size=3, bias=False, norm=norm, activation=activation + ) + + def forward(self, x, state=None, flow=None, mode='bilinear'): + # pylint: disable=unused-argument, arguments-differ + # Check size + assert len(x.size()) == 5, 'Input tensor must be BxTxCxHxW.' + b, timesteps, c, h, w = x.size() + assert c == self.input_size, f'feature sizes must match, got input {c} for layer with size {self.input_size}' + + # recurrent layers + rnn_output = [] + rnn_state = torch.zeros(b, self.hidden_size, h, w, device=x.device) if state is None else state + for t in range(timesteps): + x_t = x[:, t] + # if flow is not None: + # rnn_state = warp_features(rnn_state, flow[:, t], mode=mode) + + # propagate rnn state + rnn_state = self.gru_cell(x_t, rnn_state) + rnn_output.append(rnn_state) + + # reshape rnn output to batch tensor + return torch.stack(rnn_output, dim=1) + + def gru_cell(self, x, state): + # Compute gates + x_and_state = torch.cat([x, state], dim=1) + update_gate = self.conv_update(x_and_state) + reset_gate = self.conv_reset(x_and_state) + # Add bias to initialise gate as close to identity function + update_gate = torch.sigmoid(update_gate + self.gru_bias_init) + reset_gate = torch.sigmoid(reset_gate + self.gru_bias_init) + + # Compute proposal state, activation is defined in norm_act_config (can be tanh, ReLU etc) + state_tilde = self.conv_state_tilde(torch.cat([x, (1.0 - reset_gate) * state], dim=1)) + + output = (1.0 - update_gate) * state + update_gate * state_tilde + return output + + +class Interpolate(nn.Module): + def __init__(self, scale_factor: int = 2): + super().__init__() + self._interpolate = nn.functional.interpolate + self._scale_factor = scale_factor + + # pylint: disable=arguments-differ + def forward(self, x): + return self._interpolate(x, scale_factor=self._scale_factor, mode='bilinear', align_corners=False) + + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/generator/state_prediction.py b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/state_prediction.py new file mode 100644 index 0000000000000000000000000000000000000000..bf9e94c65658ddcc5c37335972a9e586b7f7b09c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/generator/state_prediction.py @@ -0,0 +1,37 @@ + +import torch + +from .layers import Bottleneck +from .layers import SpatialGRU + + +class FuturePrediction(torch.nn.Module): + def __init__(self, in_channels, latent_dim, n_gru_blocks=3, n_res_layers=3): + super().__init__() + self.n_gru_blocks = n_gru_blocks + + # Convolutional recurrent model with z_t as an initial hidden state and inputs the sample + # from the probabilistic model. 
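SpatialGRU above replaces the usual fully connected GRU gates with 3x3 convolutions over BEV feature maps. The stripped-down cell below keeps the same gating arithmetic as gru_cell; channel sizes are illustrative and the candidate-state ConvBlock is reduced to a plain conv + tanh, so this is a sketch rather than the module itself.

import torch
import torch.nn as nn

class TinyConvGRUCell(nn.Module):
    # Minimal sketch of SpatialGRU.gru_cell: conv gates + convex combination of states.
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.conv_update = nn.Conv2d(input_size + hidden_size, hidden_size, 3, padding=1)
        self.conv_reset = nn.Conv2d(input_size + hidden_size, hidden_size, 3, padding=1)
        self.conv_state_tilde = nn.Conv2d(input_size + hidden_size, hidden_size, 3, padding=1)

    def forward(self, x, state):
        x_and_state = torch.cat([x, state], dim=1)
        update_gate = torch.sigmoid(self.conv_update(x_and_state))
        reset_gate = torch.sigmoid(self.conv_reset(x_and_state))
        state_tilde = torch.tanh(
            self.conv_state_tilde(torch.cat([x, (1.0 - reset_gate) * state], dim=1)))
        return (1.0 - update_gate) * state + update_gate * state_tilde

cell = TinyConvGRUCell(input_size=8, hidden_size=16)
x = torch.randn(2, 8, 10, 10)
state = torch.zeros(2, 16, 10, 10)
print(cell(x, state).shape)  # torch.Size([2, 16, 10, 10])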
The architecture of the model is: + # [Spatial GRU - [Bottleneck] x n_res_layers] x n_gru_blocks + self.spatial_grus = [] + self.res_blocks = [] + + for i in range(self.n_gru_blocks): + gru_in_channels = latent_dim if i == 0 else in_channels + self.spatial_grus.append(SpatialGRU(gru_in_channels, in_channels)) + self.res_blocks.append(torch.nn.Sequential(*[Bottleneck(in_channels) + for _ in range(n_res_layers)])) + + self.spatial_grus = torch.nn.ModuleList(self.spatial_grus) + self.res_blocks = torch.nn.ModuleList(self.res_blocks) + + def forward(self, x, hidden_state): + # x has shape (b, n_future, c, h, w), hidden_state (b, c, h, w) + for i in range(self.n_gru_blocks): + x = self.spatial_grus[i](x, hidden_state, flow=None) + b, n_future, c, h, w = x.shape + + x = self.res_blocks[i](x.view(b * n_future, c, h, w)) + x = x.view(b, n_future, c, h, w) + + return x diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..081ed0325719dcdefec0f2003d38d35e21362cb7 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__init__.py @@ -0,0 +1 @@ +from .custom_hooks import TransferWeight, CustomSetEpochInfoHook \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65652b63ff5f4240d9ee7537cdff876a005c6cc8 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/custom_hooks.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/custom_hooks.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ef022782f399c448b4d218531d5fcd54e30fe80 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/__pycache__/custom_hooks.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/custom_hooks.py b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/custom_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..93ce7a27a3ae798aa37b83f1c4a11f08fa0edb93 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/hooks/custom_hooks.py @@ -0,0 +1,26 @@ +from mmcv.runner.hooks.hook import HOOKS, Hook +from projects.mmdet3d_plugin.models.utils import run_time +from mmcv.parallel import is_module_wrapper + + +@HOOKS.register_module() +class TransferWeight(Hook): + + def __init__(self, every_n_inters=1): + self.every_n_inters=every_n_inters + + def after_train_iter(self, runner): + if self.every_n_inner_iters(runner, self.every_n_inters): + runner.eval_model.load_state_dict(runner.model.state_dict()) + +@HOOKS.register_module() +class CustomSetEpochInfoHook(Hook): + """Set runner's epoch information to the model.""" + + def before_train_epoch(self, runner): + epoch = runner.epoch + model = runner.model + if is_module_wrapper(model): + model = model.module + model.set_epoch(epoch) + diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8d49802f796a43b6d323b6c7a62b0377867cc057 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__init__.py @@ -0,0 +1,5 @@ +from 
.transformer import PerceptionTransformer +from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D +from .temporal_self_attention import TemporalSelfAttention +from .encoder import BEVFormerEncoder, BEVFormerLayer +from .decoder import DetectionTransformerDecoder \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c447eb7e44e7fe9980801db9dee872db4c179040 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/custom_base_transformer_layer.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/custom_base_transformer_layer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..059ae5c20ab7afcd09d460091de7fd9985ab4803 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/custom_base_transformer_layer.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/decoder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/decoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1aec994047d2e23bc767110b139636cfd35ea57 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/decoder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/encoder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/encoder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae19d0fcc772be7be8b44e9d88162f7aced9cdf3 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/encoder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/multi_scale_deformable_attn_function.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/multi_scale_deformable_attn_function.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5579a07e4ca4d8f5a009f5714375f72354da2b3b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/multi_scale_deformable_attn_function.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/spatial_cross_attention.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/spatial_cross_attention.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..428200cbd28f549c7f2ec49dd1f44bda898d3f6e Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/spatial_cross_attention.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/temporal_self_attention.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/temporal_self_attention.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8da0fd1a7f4ac3a01b80576e2a2f4b8b07610c58 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/temporal_self_attention.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/transformer.cpython-38.pyc 
b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/transformer.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65c35b82974c48e4a57b27484bdbe5b16c322ee9 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/__pycache__/transformer.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/custom_base_transformer_layer.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/custom_base_transformer_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..0a040177285ceccf5b291c718f3cdae4587da9a2 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/custom_base_transformer_layer.py @@ -0,0 +1,254 @@ +import copy +import warnings + +import torch +import torch.nn as nn + +from mmcv import ConfigDict, deprecated_api_warning +from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv-full`` if you need this module. ') +from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention + + +@TRANSFORMER_LAYER.register_module() +class MyCustomBaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). 
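Layers such as MyCustomBaseTransformerLayer below are normally built from mmcv-style config dicts rather than positional arguments. An illustrative layer spec showing how attn_cfgs, ffn_cfgs and operation_order fit together; the attention types, channel widths and drop rates here are assumptions, not values copied from the repo's configs:

# Illustrative mmcv-style layer config; values are assumptions, not GenAD defaults.
layer_cfg = dict(
    type='MyCustomBaseTransformerLayer',
    attn_cfgs=[
        dict(type='TemporalSelfAttention', embed_dims=256),
        dict(type='SpatialCrossAttention', embed_dims=256),
    ],
    ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=512,
                  num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True)),
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'),
    norm_cfg=dict(type='LN'),
    batch_first=True,
)

# The constructor derives the number of attention modules from operation_order
# and requires it to match len(attn_cfgs).
num_attn = (layer_cfg['operation_order'].count('self_attn')
            + layer_cfg['operation_order'].count('cross_attn'))
assert num_attn == len(layer_cfg['attn_cfgs'])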
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=True, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index])) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. 
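A small standalone sketch of how the constructor below normalises its arguments: a single attn_cfgs dict is replicated once per attention entry in operation_order, and pre-norm is detected from the first element. The attention config used here is illustrative only.

import copy

operation_order = ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
attn_cfgs = dict(type='MultiheadAttention', embed_dims=256, num_heads=8)  # illustrative

num_attn = operation_order.count('self_attn') + operation_order.count('cross_attn')
if isinstance(attn_cfgs, dict):
    attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]

pre_norm = operation_order[0] == 'norm'  # True only for pre-norm layers
print(num_attn, len(attn_cfgs), pre_norm)  # 2 2 False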
+ Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/decoder.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..7d982baf1adab4d3b8ac41e1a6dc7622512f16f3 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/decoder.py @@ -0,0 +1,339 @@ +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import mmcv +import cv2 as cv +import copy +import warnings +from matplotlib import pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import 
MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(DetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 3 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + new_reference_points[..., 2:3] = tmp[ + ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@ATTENTION.register_module() +class CustomMSDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. 
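The iterative box refinement in DetectionTransformerDecoder above updates each reference point in logit space: the regression branch predicts an offset, it is added to inverse_sigmoid of the current point, and the result is squashed back to [0, 1] and detached before the next layer. A one-step numeric sketch; the reference point, the 10-channel regression layout and the offsets are made-up values for illustration:

import torch

def inverse_sigmoid(x, eps=1e-5):
    # Same helper as above.
    x = x.clamp(min=0, max=1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))

reference_points = torch.tensor([[[0.40, 0.60, 0.50]]])  # (bs, num_query, 3): x, y, z
tmp = torch.zeros(1, 1, 10)                               # regression output (x, y at :2, z at 4:5)
tmp[..., :2] = 0.25                                       # predicted xy offset in logit space
tmp[..., 4:5] = -0.10                                     # predicted z offset in logit space

new_ref = torch.zeros_like(reference_points)
new_ref[..., :2] = (tmp[..., :2] + inverse_sigmoid(reference_points[..., :2])).sigmoid()
new_ref[..., 2:3] = (tmp[..., 4:5] + inverse_sigmoid(reference_points[..., 2:3])).sigmoid()
reference_points = new_ref.detach()  # stop gradients between decoder layers
print(reference_points)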
+ num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. 
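The two linear heads built in __init__ below predict, per query, one 2-D sampling offset and one attention weight for every (head, level, point) combination. A shape-level sketch of that bookkeeping, mirroring the reshaping done in the forward pass; batch, query count and num_levels are toy values:

import torch
import torch.nn as nn

embed_dims, num_heads, num_levels, num_points = 256, 8, 1, 4
bs, num_query = 2, 300

sampling_offsets = nn.Linear(embed_dims, num_heads * num_levels * num_points * 2)
attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points)

query = torch.randn(bs, num_query, embed_dims)
offsets = sampling_offsets(query).view(bs, num_query, num_heads, num_levels, num_points, 2)
weights = attention_weights(query).view(bs, num_query, num_heads, num_levels * num_points)
weights = weights.softmax(-1).view(bs, num_query, num_heads, num_levels, num_points)
print(offsets.shape, weights.shape)
# torch.Size([2, 300, 8, 1, 4, 2]) torch.Size([2, 300, 8, 1, 4])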
If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/encoder.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/encoder.py new file mode 100644 index 
0000000000000000000000000000000000000000..27b34c3ed041d84282ade8015b98c28b379253f8 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/encoder.py @@ -0,0 +1,396 @@ +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from .custom_base_transformer_layer import MyCustomBaseTransformerLayer +import copy +import warnings +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmcv.runner import force_fp32, auto_fp16 +import numpy as np +import torch +import cv2 as cv +import mmcv +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class BEVFormerEncoder(TransformerLayerSequence): + + """ + Attention with both self and cross + Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', + **kwargs): + + super(BEVFormerEncoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + self.num_points_in_pillar = num_points_in_pillar + self.pc_range = pc_range + self.fp16_enabled = False + + @staticmethod + def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): + """Get the reference points used in SCA and TSA. + Args: + H, W: spatial shape of bev. + Z: hight of pillar. + D: sample D points uniformly from each pillar. + device (obj:`device`): The device where + reference_points should be. + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). + """ + + # reference points in 3D space, used in spatial cross-attention (SCA) + if dim == '3d': + zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, + device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z + xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, + device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W + ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, + device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H + ref_3d = torch.stack((xs, ys, zs), -1) + ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) + ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) + return ref_3d + + # reference points on 2D bev plane, used in temporal self-attention (TSA). + elif dim == '2d': + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=dtype, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=dtype, device=device) + ) + ref_y = ref_y.reshape(-1)[None] / H + ref_x = ref_x.reshape(-1)[None] / W + ref_2d = torch.stack((ref_x, ref_y), -1) + ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) + return ref_2d + + # This function must use fp32!!! 
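+    # Illustrative outline of the projection performed below: each normalized
+    # reference point (x, y, z) in [0, 1] is rescaled to metric lidar
+    # coordinates with pc_range, lifted to homogeneous form and projected into
+    # every camera with its lidar2img matrix,
+    #     p_cam = lidar2img @ [x_m, y_m, z_m, 1]^T
+    #     u, v  = p_cam[0] / p_cam[2], p_cam[1] / p_cam[2]
+    # after which (u, v) are normalized by the image width/height. Points with
+    # depth p_cam[2] <= eps or with (u, v) outside (0, 1) are masked out via
+    # bev_mask.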
+ @force_fp32(apply_to=('reference_points', 'img_metas')) + def point_sampling(self, reference_points, pc_range, img_metas): + + lidar2img = [] + for img_meta in img_metas: + lidar2img.append(img_meta['lidar2img']) + lidar2img = np.asarray(lidar2img) + lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4) + reference_points = reference_points.clone() + + reference_points[..., 0:1] = reference_points[..., 0:1] * \ + (pc_range[3] - pc_range[0]) + pc_range[0] + reference_points[..., 1:2] = reference_points[..., 1:2] * \ + (pc_range[4] - pc_range[1]) + pc_range[1] + reference_points[..., 2:3] = reference_points[..., 2:3] * \ + (pc_range[5] - pc_range[2]) + pc_range[2] + + reference_points = torch.cat( + (reference_points, torch.ones_like(reference_points[..., :1])), -1) + + reference_points = reference_points.permute(1, 0, 2, 3) + D, B, num_query = reference_points.size()[:3] + num_cam = lidar2img.size(1) + + reference_points = reference_points.view( + D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) + + lidar2img = lidar2img.view( + 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) + + reference_points_cam = torch.matmul(lidar2img.to(torch.float32), + reference_points.to(torch.float32)).squeeze(-1) + eps = 1e-5 + + bev_mask = (reference_points_cam[..., 2:3] > eps) + reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( + reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) + + reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] + reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] + + bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) + & (reference_points_cam[..., 1:2] < 1.0) + & (reference_points_cam[..., 0:1] < 1.0) + & (reference_points_cam[..., 0:1] > 0.0)) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + bev_mask = torch.nan_to_num(bev_mask) + else: + bev_mask = bev_mask.new_tensor( + np.nan_to_num(bev_mask.cpu().numpy())) + + reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) + bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) + + return reference_points_cam, bev_mask + + @auto_fp16() + def forward(self, + bev_query, + key, + value, + *args, + bev_h=None, + bev_w=None, + bev_pos=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + prev_bev=None, + shift=0., + **kwargs): + """Forward function for `TransformerDecoder`. + Args: + bev_query (Tensor): Input BEV query with shape + `(num_query, bs, embed_dims)`. + key & value (Tensor): Input multi-cameta features with shape + (num_cam, num_value, bs, embed_dims) + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
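+            Note (summary of the implementation below): when ``prev_bev`` is
+            provided, the previous BEV and the current BEV queries are stacked
+            along the batch dimension (bs * 2) together with the shifted and
+            the original 2D reference points, so that the temporal
+            self-attention in every layer can attend to both frames.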
+ """ + + output = bev_query + intermediate = [] + + ref_3d = self.get_reference_points( + bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + ref_2d = self.get_reference_points( + bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + + reference_points_cam, bev_mask = self.point_sampling( + ref_3d, self.pc_range, kwargs['img_metas']) + + # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. + shift_ref_2d = ref_2d # .clone() + shift_ref_2d += shift[:, None, None, :] + + # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) + bev_query = bev_query.permute(1, 0, 2) + bev_pos = bev_pos.permute(1, 0, 2) + bs, len_bev, num_bev_level, _ = ref_2d.shape + if prev_bev is not None: + prev_bev = prev_bev.permute(1, 0, 2) + prev_bev = torch.stack( + [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) + hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + else: + hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + + for lid, layer in enumerate(self.layers): + output = layer( + bev_query, + key, + value, + *args, + bev_pos=bev_pos, + ref_2d=hybird_ref_2d, + ref_3d=ref_3d, + bev_h=bev_h, + bev_w=bev_w, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + reference_points_cam=reference_points_cam, + bev_mask=bev_mask, + prev_bev=prev_bev, + **kwargs) + + bev_query = output + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output + + +@TRANSFORMER_LAYER.register_module() +class BEVFormerLayer(MyCustomBaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. 
+ """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(BEVFormerLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + self.fp16_enabled = False + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) + + def forward(self, + query, + key=None, + value=None, + bev_pos=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ref_2d=None, + ref_3d=None, + bev_h=None, + bev_w=None, + reference_points_cam=None, + mask=None, + spatial_shapes=None, + level_start_index=None, + prev_bev=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + # temporal self attention + if layer == 'self_attn': + + query = self.attentions[attn_index]( + query, + prev_bev, + prev_bev, + identity if self.pre_norm else None, + query_pos=bev_pos, + key_pos=bev_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + reference_points=ref_2d, + spatial_shapes=torch.tensor( + [[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + # spaital cross attention + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + reference_points=ref_3d, + reference_points_cam=reference_points_cam, + mask=mask, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/multi_scale_deformable_attn_function.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/multi_scale_deformable_attn_function.py new file mode 100644 index 0000000000000000000000000000000000000000..613dd7c41dd61bc4765ceac7bc45c8e9add07fe6 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/multi_scale_deformable_attn_function.py @@ -0,0 +1,157 @@ +import torch +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.autograd.function import Function, once_differentiable +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction_fp16(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. 
+ + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +class MultiScaleDeformableAttnFunction_fp32(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. 
+ """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/spatial_cross_attention.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/spatial_cross_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..3362ea053f28b0297d2830ca337722f8978e7b71 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/spatial_cross_attention.py @@ -0,0 +1,393 @@ + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import build_attention +import math +from mmcv.runner import force_fp32, auto_fp16 + +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class SpatialCrossAttention(BaseModule): + """An attention module used in BEVFormer. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_cams (int): The number of cameras + dropout (float): A Dropout layer on `inp_residual`. + Default: 0.. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + deformable_attention: (dict): The config for the deformable attention used in SCA. + """ + + def __init__(self, + embed_dims=256, + num_cams=6, + pc_range=None, + dropout=0.1, + init_cfg=None, + batch_first=False, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=256, + num_levels=4), + **kwargs + ): + super(SpatialCrossAttention, self).__init__(init_cfg) + + self.init_cfg = init_cfg + self.dropout = nn.Dropout(dropout) + self.pc_range = pc_range + self.fp16_enabled = False + self.deformable_attention = build_attention(deformable_attention) + self.embed_dims = embed_dims + self.num_cams = num_cams + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.batch_first = batch_first + self.init_weight() + + def init_weight(self): + """Default initialization for Parameters of Module.""" + xavier_init(self.output_proj, distribution='uniform', bias=0.) + + @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) + def forward(self, + query, + key, + value, + residual=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + reference_points_cam=None, + bev_mask=None, + level_start_index=None, + flag='encoder', + **kwargs): + """Forward Function of Detr3DCrossAtten. 
+ Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. (B, N, C, H, W) + residual (Tensor): The tensor used for addition, with the + same shape as `x`. Default None. If None, `x` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, 4), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different level. With shape (num_levels, 2), + last dimension represent (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + + if residual is None: + inp_residual = query + slots = torch.zeros_like(query) + if query_pos is not None: + query = query + query_pos + + bs, num_query, _ = query.size() + + D = reference_points_cam.size(3) + indexes = [] + for i, mask_per_img in enumerate(bev_mask): + index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) + indexes.append(index_query_per_img) + max_len = max([len(each) for each in indexes]) + + # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. + queries_rebatch = query.new_zeros( + [bs, self.num_cams, max_len, self.embed_dims]) + reference_points_rebatch = reference_points_cam.new_zeros( + [bs, self.num_cams, max_len, D, 2]) + + for j in range(bs): + for i, reference_points_per_img in enumerate(reference_points_cam): + index_query_per_img = indexes[i] + queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] + reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] + + num_cams, l, bs, embed_dims = key.shape + + key = key.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + value = value.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + + queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, + reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, + level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) + for j in range(bs): + for i, index_query_per_img in enumerate(indexes): + slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] + + count = bev_mask.sum(-1) > 0 + count = count.permute(1, 2, 0).sum(-1) + count = torch.clamp(count, min=1.0) + slots = slots / count[..., None] + slots = self.output_proj(slots) + + return self.dropout(slots) + inp_residual + + +@ATTENTION.register_module() +class MSDeformableAttention3D(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. 
+ `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=8, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.batch_first = batch_first + self.output_proj = None + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + ( bs, num_query, embed_dims). + key (Tensor): The key tensor with shape + `(bs, num_key, embed_dims)`. 
+ value (Tensor): The value tensor with shape + `(bs, num_key, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + + if reference_points.shape[-1] == 2: + """ + For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. + After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. + For each referent point, we sample `num_points` sampling points. + For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. 
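+            Illustrative numbers (assuming the defaults in this file, i.e.
+            8 predicted offsets per head and level and 4 Z anchors from
+            `num_points_in_pillar`): each projected reference point receives
+            8 // 4 = 2 sampling points, which matches the reshape performed
+            below (num_all_points // num_Z_anchors).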
+ """ + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + + bs, num_query, num_Z_anchors, xy = reference_points.shape + reference_points = reference_points[:, :, None, None, None, :, :] + sampling_offsets = sampling_offsets / \ + offset_normalizer[None, None, None, :, None, :] + bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape + sampling_offsets = sampling_offsets.view( + bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) + sampling_locations = reference_points + sampling_offsets + bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape + assert num_all_points == num_points * num_Z_anchors + + sampling_locations = sampling_locations.view( + bs, num_query, num_heads, num_levels, num_all_points, xy) + + elif reference_points.shape[-1] == 4: + assert False + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + + # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 + # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points + # + + if torch.cuda.is_available() and value.is_cuda: + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + if not self.batch_first: + output = output.permute(1, 0, 2) + + return output diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/temporal_self_attention.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/temporal_self_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..f5151ad7078fb4a93ba45d597f8a936f64bc6ba4 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/temporal_self_attention.py @@ -0,0 +1,266 @@ +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import ATTENTION +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class TemporalSelfAttention(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. 
+ dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to True. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. + the length of BEV queue is 2. + """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + num_bev_queue=2, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.num_bev_queue = num_bev_queue + self.sampling_offsets = nn.Linear( + embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, + num_bev_queue*num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) + + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. 
If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + assert self.batch_first + bs, len_bev, c = query.shape + value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) + + # value = torch.cat([query, query], 0) + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + bs, num_query, embed_dims = query.shape + _, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + assert self.num_bev_queue == 2 + + query = torch.cat([value[:bs], query], -1) + value = self.value_proj(value) + + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + + value = value.reshape(bs*self.num_bev_queue, + num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query) + sampling_offsets = sampling_offsets.view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_bev_queue, + self.num_levels, + self.num_points) + + attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() + sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = 
MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + # output shape (bs*num_bev_queue, num_query, embed_dims) + # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) + output = output.permute(1, 2, 0) + + # fuse history value and current value + # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) + output = output.view(num_query, embed_dims, bs, self.num_bev_queue) + output = output.mean(-1) + + # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) + output = output.permute(2, 0, 1) + + output = self.output_proj(output) + + if not self.batch_first: + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/modules/transformer.py b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..55f0a15009e5bf3ac806feda0d9adefd7e16dea3 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/modules/transformer.py @@ -0,0 +1,283 @@ +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from mmcv.runner.base_module import BaseModule + +from mmdet.models.utils.builder import TRANSFORMER +from torch.nn.init import normal_ +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.runner.base_module import BaseModule +from torchvision.transforms.functional import rotate +from .temporal_self_attention import TemporalSelfAttention +from .spatial_cross_attention import MSDeformableAttention3D +from .decoder import CustomMSDeformableAttention +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from mmcv.runner import force_fp32, auto_fp16 + + +@TRANSFORMER.register_module() +class PerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
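+        Example (illustrative config; the nested encoder/decoder settings are
+        placeholders):
+            transformer=dict(
+                type='PerceptionTransformer',
+                embed_dims=256,
+                num_cams=6,
+                encoder=dict(type='BEVFormerEncoder', ...),
+                decoder=dict(type='DetectionTransformerDecoder', ...))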
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + **kwargs): + super(PerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.init_layers() + self.rotate_center = rotate_center + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. + bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. 
+ reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + + inter_references_out = inter_references + + return bev_embed, inter_states, init_reference_out, inter_references_out diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/planner/__pycache__/metric_stp3.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/planner/__pycache__/metric_stp3.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5f21912ce8b3dcef93b1ba06b2be744e63605d8 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/planner/__pycache__/metric_stp3.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/planner/metric_stp3.py b/GenAD-main/projects/mmdet3d_plugin/VAD/planner/metric_stp3.py new file mode 100644 index 0000000000000000000000000000000000000000..da3ac1a9b4969130759d84091e0251b4d78370b2 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/planner/metric_stp3.py @@ -0,0 +1,351 @@ +''' +calculate planner metric same as stp3 +''' +import numpy as np +import torch +import cv2 +import copy +import matplotlib.pyplot as plt +from projects.mmdet3d_plugin.core.evaluation.metric_motion import get_ade, get_fde +from skimage.draw import polygon +from nuscenes.utils.data_classes import Box +from scipy.spatial.transform import Rotation as R + +ego_width, ego_length = 1.85, 4.084 + +class PlanningMetric(): + def __init__(self): + super().__init__() + self.X_BOUND 
= [-50.0, 50.0, 0.5] # Forward + self.Y_BOUND = [-50.0, 50.0, 0.5] # Sides + self.Z_BOUND = [-10.0, 10.0, 20.0] # Height + dx, bx, _ = self.gen_dx_bx(self.X_BOUND, self.Y_BOUND, self.Z_BOUND) + self.dx, self.bx = dx[:2], bx[:2] + + bev_resolution, bev_start_position, bev_dimension = self.calculate_birds_eye_view_parameters( + self.X_BOUND, self.Y_BOUND, self.Z_BOUND + ) + self.bev_resolution = bev_resolution.numpy() + self.bev_start_position = bev_start_position.numpy() + self.bev_dimension = bev_dimension.numpy() + + self.W = ego_width + self.H = ego_length + + self.category_index = { + 'human':[2,3,4,5,6,7,8], + 'vehicle':[14,15,16,17,18,19,20,21,22,23] + } + + # self.n_future = n_future + + # self.add_state("obj_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") + # self.add_state("obj_box_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") + # self.add_state("L2", default=torch.zeros(self.n_future),dist_reduce_fx="sum") + # self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") + + def gen_dx_bx(self, xbound, ybound, zbound): + dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) + bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) + nx = torch.LongTensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) + + return dx, bx, nx + + def calculate_birds_eye_view_parameters(self, x_bounds, y_bounds, z_bounds): + """ + Parameters + ---------- + x_bounds: Forward direction in the ego-car. + y_bounds: Sides + z_bounds: Height + + Returns + ------- + bev_resolution: Bird's-eye view bev_resolution + bev_start_position Bird's-eye view first element + bev_dimension Bird's-eye view tensor spatial dimension + """ + bev_resolution = torch.tensor([row[2] for row in [x_bounds, y_bounds, z_bounds]]) + bev_start_position = torch.tensor([row[0] + row[2] / 2.0 for row in [x_bounds, y_bounds, z_bounds]]) + bev_dimension = torch.tensor([(row[1] - row[0]) / row[2] for row in [x_bounds, y_bounds, z_bounds]], + dtype=torch.long) + + return bev_resolution, bev_start_position, bev_dimension + + def get_label( + self, + gt_agent_boxes, + gt_agent_feats + ): + segmentation_np, pedestrian_np = self.get_birds_eye_view_label(gt_agent_boxes,gt_agent_feats) + segmentation = torch.from_numpy(segmentation_np).long().unsqueeze(0) + pedestrian = torch.from_numpy(pedestrian_np).long().unsqueeze(0) + + return segmentation, pedestrian + + def get_birds_eye_view_label( + self, + gt_agent_boxes, + gt_agent_feats + ): + ''' + gt_agent_boxes (LiDARInstance3DBoxes): list of GT Bboxs. 
+ dim 9 = (x,y,z)+(w,l,h)+yaw+(vx,vy) + gt_agent_feats: (B, A, 34) + dim 34 = fut_traj(6*2) + fut_mask(6) + goal(1) + lcf_feat(9) + fut_yaw(6) + lcf_feat (x, y, yaw, vx, vy, width, length, height, type) + ego_lcf_feats: (B, 9) + dim 8 = (vx, vy, ax, ay, w, length, width, vel, steer) + ''' + T = 6 + segmentation = np.zeros((T,self.bev_dimension[0], self.bev_dimension[1])) + pedestrian = np.zeros((T,self.bev_dimension[0], self.bev_dimension[1])) + agent_num = gt_agent_feats.shape[1] + + gt_agent_boxes = gt_agent_boxes.tensor.cpu().numpy() #(N, 9) + gt_agent_feats = gt_agent_feats.cpu().numpy() + + gt_agent_fut_trajs = gt_agent_feats[..., :T*2].reshape(-1, 6, 2) + gt_agent_fut_mask = gt_agent_feats[..., T*2:T*3].reshape(-1, 6) + # gt_agent_lcf_feat = gt_agent_feats[..., T*3+1:T*3+10].reshape(-1, 9) + gt_agent_fut_yaw = gt_agent_feats[..., T*3+10:T*4+10].reshape(-1, 6, 1) + gt_agent_fut_trajs = np.cumsum(gt_agent_fut_trajs, axis=1) + gt_agent_fut_yaw = np.cumsum(gt_agent_fut_yaw, axis=1) + + gt_agent_boxes[:,6:7] = -1*(gt_agent_boxes[:,6:7] + np.pi/2) # NOTE: convert yaw to lidar frame + gt_agent_fut_trajs = gt_agent_fut_trajs + gt_agent_boxes[:, np.newaxis, 0:2] + gt_agent_fut_yaw = gt_agent_fut_yaw + gt_agent_boxes[:, np.newaxis, 6:7] + + for t in range(T): + for i in range(agent_num): + if gt_agent_fut_mask[i][t] == 1: + # Filter out all non vehicle instances + category_index = int(gt_agent_feats[0,i][27]) + agent_length, agent_width = gt_agent_boxes[i][4], gt_agent_boxes[i][3] + x_a = gt_agent_fut_trajs[i, t, 0] + y_a = gt_agent_fut_trajs[i, t, 1] + yaw_a = gt_agent_fut_yaw[i, t, 0] + param = [x_a,y_a,yaw_a,agent_length, agent_width] + if (category_index in self.category_index['vehicle']): + poly_region = self._get_poly_region_in_image(param) + cv2.fillPoly(segmentation[t], [poly_region], 1.0) + if (category_index in self.category_index['human']): + poly_region = self._get_poly_region_in_image(param) + cv2.fillPoly(pedestrian[t], [poly_region], 1.0) + + # vis for debug + # plt.figure('debug') + # for i in range(T): + # plt.subplot(2,T,i+1) + # plt.imshow(segmentation[i]) + # plt.subplot(2,T,i+1+T) + # plt.imshow(pedestrian[i]) + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/car_ped_occ.jpg') + # plt.close() + + return segmentation, pedestrian + + def _get_poly_region_in_image(self,param): + lidar2cv_rot = np.array([[1,0], [0,-1]]) + x_a,y_a,yaw_a,agent_length, agent_width = param + trans_a = np.array([[x_a,y_a]]).T + rot_mat_a = np.array([[np.cos(yaw_a), -np.sin(yaw_a)], + [np.sin(yaw_a), np.cos(yaw_a)]]) + agent_corner = np.array([ + [agent_length/2, -agent_length/2, -agent_length/2, agent_length/2], + [agent_width/2, agent_width/2, -agent_width/2, -agent_width/2]]) #(2,4) + agent_corner_lidar = np.matmul(rot_mat_a, agent_corner) + trans_a #(2,4) + # convert to cv frame + agent_corner_cv2 = (np.matmul(lidar2cv_rot, agent_corner_lidar) \ + - self.bev_start_position[:2,None] + self.bev_resolution[:2,None] / 2.0).T / self.bev_resolution[:2] #(4,2) + agent_corner_cv2 = np.round(agent_corner_cv2).astype(np.int32) + + return agent_corner_cv2 + + + def evaluate_single_coll(self, traj, segmentation, input_gt): + ''' + traj: torch.Tensor (n_future, 2) + 自车lidar系为轨迹参考系 + ^ y + | + | + 0-------> + x + segmentation: torch.Tensor (n_future, 200, 200) + ''' + pts = np.array([ + [-self.H / 2. + 0.5, self.W / 2.], + [self.H / 2. + 0.5, self.W / 2.], + [self.H / 2. + 0.5, -self.W / 2.], + [-self.H / 2. 
+ 0.5, -self.W / 2.], + ]) + pts = (pts - self.bx.cpu().numpy()) / (self.dx.cpu().numpy()) + pts[:, [0, 1]] = pts[:, [1, 0]] + rr, cc = polygon(pts[:,1], pts[:,0]) + rc = np.concatenate([rr[:,None], cc[:,None]], axis=-1) + + n_future, _ = traj.shape + trajs = traj.view(n_future, 1, 2) + # 轨迹坐标系转换为: + # ^ x + # | + # | + # 0-------> y + trajs_ = copy.deepcopy(trajs) + trajs_[:,:,[0,1]] = trajs_[:,:,[1,0]] # can also change original tensor + trajs_ = trajs_ / self.dx.to(trajs.device) + trajs_ = trajs_.cpu().numpy() + rc # (n_future, 32, 2) + + r = (self.bev_dimension[0] - trajs_[:,:,0]).astype(np.int32) + r = np.clip(r, 0, self.bev_dimension[0] - 1) + + c = trajs_[:,:,1].astype(np.int32) + c = np.clip(c, 0, self.bev_dimension[1] - 1) + + collision = np.full(n_future, False) + for t in range(n_future): + rr = r[t] + cc = c[t] + I = np.logical_and( + np.logical_and(rr >= 0, rr < self.bev_dimension[0]), + np.logical_and(cc >= 0, cc < self.bev_dimension[1]), + ) + collision[t] = np.any(segmentation[t, rr[I], cc[I]].cpu().numpy()) + + # vis for debug + # obs_occ = copy.deepcopy(segmentation) + # ego_occ = torch.zeros_like(obs_occ) + # for t in range(n_future): + # rr = r[t] + # cc = c[t] + # I = np.logical_and( + # np.logical_and(rr >= 0, rr < self.bev_dimension[0]), + # np.logical_and(cc >= 0, cc < self.bev_dimension[1]), + # ) + # ego_occ[t, rr[I], cc[I]]=1 + + # plt.figure() + # for i in range(6): + # plt.subplot(2,6,i+1) + # plt.imshow(obs_occ[i]) + # plt.subplot(2,6,i+7) + # plt.imshow(ego_occ[i]) + # if input_gt: + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/occ_metric_stp3_gt.jpg') + # else: + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/occ_metric_stp3_pred.jpg') + # plt.close() + + return torch.from_numpy(collision).to(device=traj.device) + + def evaluate_coll( + self, + trajs, + gt_trajs, + segmentation + ): + ''' + trajs: torch.Tensor (B, n_future, 2) + 自车lidar系为轨迹参考系 + ^ y + | + | + 0-------> + x + gt_trajs: torch.Tensor (B, n_future, 2) + segmentation: torch.Tensor (B, n_future, 200, 200) + + ''' + B, n_future, _ = trajs.shape + # trajs = trajs * torch.tensor([-1, 1], device=trajs.device) + # gt_trajs = gt_trajs * torch.tensor([-1, 1], device=gt_trajs.device) + + obj_coll_sum = torch.zeros(n_future, device=segmentation.device) + obj_box_coll_sum = torch.zeros(n_future, device=segmentation.device) + + for i in range(B): + gt_box_coll = self.evaluate_single_coll(gt_trajs[i], segmentation[i], input_gt=True) + + xx, yy = trajs[i,:,0], trajs[i, :, 1] + # lidar系下的轨迹转换到图片坐标系下 + xi = ((-self.bx[0]/2 - yy) / self.dx[0]).long() + yi = ((-self.bx[1]/2 + xx) / self.dx[1]).long() + + m1 = torch.logical_and( + torch.logical_and(xi >= 0, xi < self.bev_dimension[0]), + torch.logical_and(yi >= 0, yi < self.bev_dimension[1]), + ).to(gt_box_coll.device) + m1 = torch.logical_and(m1, torch.logical_not(gt_box_coll)) + + ti = torch.arange(n_future) + obj_coll_sum[ti[m1]] += segmentation[i, ti[m1], xi[m1], yi[m1]].long() + + m2 = torch.logical_not(gt_box_coll) + box_coll = self.evaluate_single_coll(trajs[i], segmentation[i], input_gt=False).to(ti.device) + obj_box_coll_sum[ti[m2]] += (box_coll[ti[m2]]).long() + + return obj_coll_sum, obj_box_coll_sum + + def compute_L2(self, trajs, gt_trajs): + ''' + trajs: torch.Tensor (n_future, 2) + gt_trajs: torch.Tensor (n_future, 2) + ''' + # return torch.sqrt(((trajs[:, :, :2] - gt_trajs[:, :, :2]) ** 2).sum(dim=-1)) + pred_len = trajs.shape[0] + ade = float( + sum( + torch.sqrt( + (trajs[i, 0] - gt_trajs[i, 0]) ** 2 + + (trajs[i, 1] - 
gt_trajs[i, 1]) ** 2 + ) + for i in range(pred_len) + ) + / pred_len + ) + + return ade + + def compute_L2_stp3(self, trajs, gt_trajs): + ''' + trajs: torch.Tensor (n_future, 2) + gt_trajs: torch.Tensor (n_future, 2) + ''' + # return torch.sqrt(((trajs[:, :, :2] - gt_trajs[:, :, :2]) ** 2).sum(dim=-1)) + pred_len = trajs.shape[0] + ade = float( + torch.sqrt( + (trajs[-1, 0] - gt_trajs[-1, 0]) ** 2 + + (trajs[-1, 1] - gt_trajs[-1, 1]) ** 2 + ) + ) + return ade + + # def update(self, trajs, gt_trajs, segmentation): + # ''' + # trajs: torch.Tensor (B, n_future, 3) + # gt_trajs: torch.Tensor (B, n_future, 3) + # segmentation: torch.Tensor (B, n_future, 200, 200) + # ''' + # assert trajs.shape == gt_trajs.shape + # L2 = self.compute_L2(trajs, gt_trajs) + # obj_coll_sum, obj_box_coll_sum = self.evaluate_coll(trajs[:,:,:2], gt_trajs[:,:,:2], segmentation) + + # if torch.isnan(L2).max().item(): + # debug = 1 + # else: + # self.obj_col += obj_coll_sum + # self.obj_box_col += obj_box_coll_sum + # self.L2 += L2.sum(dim=0) + # if torch.isnan(self.L2).max().item(): + # debug=1 + # self.total +=len(trajs) + + + # def compute(self): + # return { + # 'obj_col': self.obj_col / self.total, + # 'obj_box_col': self.obj_box_col / self.total, + # 'L2' : self.L2 / self.total + # } \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03f906ce601e2dfac207af680774086067808830 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__init__.py @@ -0,0 +1 @@ +from .epoch_based_runner import EpochBasedRunner_video \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de53c43bbb77b246753a5b2322a4360cac37eadb Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/epoch_based_runner.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/epoch_based_runner.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a40300a965582c2e82141e7bf5a063b433658ff Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/__pycache__/epoch_based_runner.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/runner/epoch_based_runner.py b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/epoch_based_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c4c62bc576fb066f5b76ee433759339dd155cd --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/runner/epoch_based_runner.py @@ -0,0 +1,91 @@ +import os.path as osp +import torch +import mmcv +from mmcv.runner.base_runner import BaseRunner +from mmcv.runner.epoch_based_runner import EpochBasedRunner +from mmcv.runner.builder import RUNNERS +from mmcv.runner.checkpoint import save_checkpoint +from mmcv.runner.utils import get_host_info +from pprint import pprint +from mmcv.parallel.data_container import DataContainer + + +@RUNNERS.register_module() +class EpochBasedRunner_video(EpochBasedRunner): + + ''' + # basic logic + + input_sequence = [a, b, c] # given a sequence of samples + + prev_bev = None + for each in input_sequcene[:-1] + prev_bev = 
eval_model(each, prev_bev)) # inference only. + + model(input_sequcene[-1], prev_bev) # train the last sample. + ''' + + def __init__(self, + model, + eval_model=None, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], + max_iters=None, + max_epochs=None): + super().__init__(model, + batch_processor, + optimizer, + work_dir, + logger, + meta, + max_iters, + max_epochs) + keys.append('img_metas') + self.keys = keys + self.eval_model = eval_model + self.eval_model.eval() + + def run_iter(self, data_batch, train_mode, **kwargs): + if self.batch_processor is not None: + assert False + # outputs = self.batch_processor( + # self.model, data_batch, train_mode=train_mode, **kwargs) + elif train_mode: + + num_samples = data_batch['img'].data[0].size(1) + data_list = [] + prev_bev = None + for i in range(num_samples): + data = {} + for key in self.keys: + if key not in ['img_metas', 'img', 'points']: + data[key] = data_batch[key] + else: + if key == 'img': + data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True) + elif key == 'img_metas': + data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only) + else: + assert False + data_list.append(data) + with torch.no_grad(): + for i in range(num_samples-1): + if i>0: data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs) + + data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs) + else: + assert False + # outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + + if not isinstance(outputs, dict): + raise TypeError('"batch_processor()" or "model.train_step()"' + 'and "model.val_step()" must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/CD_loss.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/CD_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..ed18fc58734de15957235443621d26c7d7785dcd --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/CD_loss.py @@ -0,0 +1,718 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn +from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss + +from mmdet.models.builder import LOSSES +from mmdet.models import weighted_loss +import mmcv +import torch.nn.functional as F +from mmdet.core.bbox.match_costs.builder import MATCH_COST +import functools + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + +@mmcv.jit(derivate=True, coderize=True) +def custom_weight_dir_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. 
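+    Note: unlike mmdet's stock reducer, `avg_factor` is required here; with
+    `reduction='mean'` the (optionally weighted) loss is summed over all
+    elements and divided by `avg_factor`, while `reduction='none'` returns the
+    weighted loss unchanged.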
+ + Args: + loss (Tensor): num_sample, num_dir + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + raise ValueError('avg_factor should not be none for OrderedPtsL1Loss') + # loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # import pdb;pdb.set_trace() + # loss = loss.permute(1,0,2,3).contiguous() + loss = loss.sum() + loss = loss / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +@mmcv.jit(derivate=True, coderize=True) +def custom_weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): num_sample, num_order, num_pts, num_coords + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + raise ValueError('avg_factor should not be none for OrderedPtsL1Loss') + # loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # import pdb;pdb.set_trace() + loss = loss.permute(1,0,2,3).contiguous() + loss = loss.sum((1,2,3)) + loss = loss / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +def custom_weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = custom_weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper + + +def custom_weighted_dir_loss(loss_func): + """Create a weighted version of a given loss function. 
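+    Note: this wrapper reduces with `custom_weight_reduce_loss`, which requires
+    `avg_factor`; the doctest below is inherited from mmdet's `weighted_loss`
+    and only illustrates the call signature (here, calls without `avg_factor`
+    raise a ValueError).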
+ + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = custom_weight_dir_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper + +@mmcv.jit(derivate=True, coderize=True) +@custom_weighted_loss +def ordered_pts_smooth_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_order, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + pred = pred.unsqueeze(1).repeat(1, target.size(1),1,1) + assert pred.size() == target.size() + loss =smooth_l1_loss(pred,target, reduction='none') + # import pdb;pdb.set_trace() + return loss + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def pts_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + +@mmcv.jit(derivate=True, coderize=True) +@custom_weighted_loss +def ordered_pts_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_order, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + pred = pred.unsqueeze(1).repeat(1, target.size(1),1,1) + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + +@mmcv.jit(derivate=True, coderize=True) +@custom_weighted_dir_loss +def pts_dir_cos_loss(pred, target): + """ Dir cosine similiarity loss + pred (torch.Tensor): shape [num_samples, num_dir, num_coords] + target (torch.Tensor): shape [num_samples, num_dir, num_coords] + + """ + if target.numel() == 0: + return pred.sum() * 0 + # import pdb;pdb.set_trace() + num_samples, num_dir, num_coords = pred.shape + loss_func = torch.nn.CosineEmbeddingLoss(reduction='none') + tgt_param = target.new_ones((num_samples, num_dir)) + tgt_param = tgt_param.flatten(0) + loss = loss_func(pred.flatten(0,1), target.flatten(0,1), tgt_param) + loss = loss.view(num_samples, num_dir) + return loss + +@LOSSES.register_module() +class OrderedPtsSmoothL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. 
+ Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(OrderedPtsSmoothL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * ordered_pts_smooth_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@LOSSES.register_module() +class PtsDirCosLoss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(PtsDirCosLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_dir = self.loss_weight * pts_dir_cos_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_dir + + + +@LOSSES.register_module() +class PtsL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(PtsL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
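+        Example (a minimal illustrative sketch; the shapes and random values
+        below are assumptions, not requirements of the loss):
+
+            >>> import torch
+            >>> criterion = PtsL1Loss(loss_weight=1.0)
+            >>> pred = torch.rand(5, 20, 2)      # [num_samples, num_pts, 2]
+            >>> target = torch.rand(5, 20, 2)
+            >>> weight = torch.ones_like(target)
+            >>> loss = criterion(pred, target, weight, avg_factor=5)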
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * pts_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + +@LOSSES.register_module() +class OrderedPtsL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(OrderedPtsL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * ordered_pts_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + + + +@MATCH_COST.register_module() +class OrderedPtsSmoothL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_orders, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1).unsqueeze(1).repeat(1,num_gts*num_orders,1) + gt_bboxes = gt_bboxes.flatten(2).view(num_gts*num_orders,-1).unsqueeze(0).repeat(bbox_pred.size(0),1,1) + # import pdb;pdb.set_trace() + bbox_cost = smooth_l1_loss(bbox_pred, gt_bboxes, reduction='none').sum(-1) + # bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class PtsL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. 
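+            Note: this cost unpacks `gt_bboxes` as [num_gt, num_pts, 2]; unlike
+                the ordered variants above, no order dimension is expected.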
+ Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1) + gt_bboxes = gt_bboxes.view(num_gts,-1) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class OrderedPtsL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_orders, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1) + gt_bboxes = gt_bboxes.flatten(2).view(num_gts*num_orders,-1) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class MyChamferDistanceCost: + def __init__(self, loss_src_weight=1., loss_dst_weight=1.): + # assert mode in ['smooth_l1', 'l1', 'l2'] + # self.mode = mode + self.loss_src_weight = loss_src_weight + self.loss_dst_weight = loss_dst_weight + + def __call__(self, src, dst,src_weight=1.0,dst_weight=1.0,): + """ + pred_pts (Tensor): normed coordinate(x,y), shape (num_q, num_pts_M, 2) + gt_pts (Tensor): normed coordinate(x,y), shape (num_gt, num_pts_N, 2) + """ + # criterion_mode = self.mode + # if criterion_mode == 'smooth_l1': + # criterion = smooth_l1_loss + # elif criterion_mode == 'l1': + # criterion = l1_loss + # elif criterion_mode == 'l2': + # criterion = mse_loss + # else: + # raise NotImplementedError + # import pdb;pdb.set_trace() + src_expand = src.unsqueeze(1).repeat(1,dst.shape[0],1,1) + dst_expand = dst.unsqueeze(0).repeat(src.shape[0],1,1,1) + # src_expand = src.unsqueeze(2).unsqueeze(1).repeat(1,dst.shape[0], 1, dst.shape[1], 1) + # dst_expand = dst.unsqueeze(1).unsqueeze(0).repeat(src.shape[0],1, src.shape[1], 1, 1) + distance = torch.cdist(src_expand, dst_expand) + src2dst_distance = torch.min(distance, dim=3)[0] # (num_q, num_gt, num_pts_N) + dst2src_distance = torch.min(distance, dim=2)[0] # (num_q, num_gt, num_pts_M) + loss_src = (src2dst_distance * src_weight).mean(-1) + loss_dst = (dst2src_distance * dst_weight).mean(-1) + loss = loss_src*self.loss_src_weight + loss_dst * self.loss_dst_weight + return loss + +@mmcv.jit(derivate=True, coderize=True) +def chamfer_distance(src, + dst, + src_weight=1.0, + dst_weight=1.0, + # criterion_mode='l1', + reduction='mean', + avg_factor=None): + """Calculate Chamfer Distance of two sets. + + Args: + src (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + dst (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor or float): Weight of source loss. + dst_weight (torch.Tensor or float): Weight of destination loss. + criterion_mode (str): Criterion mode to calculate distance. + The valid modes are smooth_l1, l1 or l2. + reduction (str): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + + Returns: + tuple: Source and Destination loss with the corresponding indices. 
+ + - loss_src (torch.Tensor): The min distance \ + from source to destination. + - loss_dst (torch.Tensor): The min distance \ + from destination to source. + - indices1 (torch.Tensor): Index the min distance point \ + for each point in source to destination. + - indices2 (torch.Tensor): Index the min distance point \ + for each point in destination to source. + """ + + # if criterion_mode == 'smooth_l1': + # criterion = smooth_l1_loss + # elif criterion_mode == 'l1': + # criterion = l1_loss + # elif criterion_mode == 'l2': + # criterion = mse_loss + # else: + # raise NotImplementedError + + # src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1) + # dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1) + # import pdb;pdb.set_trace() + distance = torch.cdist(src, dst) + src2dst_distance, indices1 = torch.min(distance, dim=2) # (B,N) + dst2src_distance, indices2 = torch.min(distance, dim=1) # (B,M) + # import pdb;pdb.set_trace() + #TODO this may be wrong for misaligned src_weight, now[N,fixed_num] + # should be [N], then view + loss_src = (src2dst_distance * src_weight) + loss_dst = (dst2src_distance * dst_weight) + if avg_factor is None: + reduction_enum = F._Reduction.get_enum(reduction) + if reduction_enum == 0: + raise ValueError('MyCDLoss can not be used with reduction=`none`') + elif reduction_enum == 1: + loss_src = loss_src.mean(-1).mean() + loss_dst = loss_dst.mean(-1).mean() + elif reduction_enum == 2: + loss_src = loss_src.mean(-1).sum() + loss_dst = loss_dst.mean(-1).sum() + else: + raise NotImplementedError + else: + if reduction == 'mean': + eps = torch.finfo(torch.float32).eps + loss_src = loss_src.mean(-1).sum() / (avg_factor + eps) + loss_dst = loss_dst.mean(-1).sum() / (avg_factor + eps) + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + + return loss_src, loss_dst, indices1, indices2 + + +@LOSSES.register_module() +class MyChamferDistance(nn.Module): + """Calculate Chamfer Distance of two sets. + + Args: + mode (str): Criterion mode to calculate distance. + The valid modes are smooth_l1, l1 or l2. + reduction (str): Method to reduce losses. + The valid reduction method are none, sum or mean. + loss_src_weight (float): Weight of loss_source. + loss_dst_weight (float): Weight of loss_target. + """ + + def __init__(self, + # mode='l1', + reduction='mean', + loss_src_weight=1.0, + loss_dst_weight=1.0): + super(MyChamferDistance, self).__init__() + + # assert mode in ['smooth_l1', 'l1', 'l2'] + assert reduction in ['none', 'sum', 'mean'] + # self.mode = mode + self.reduction = reduction + self.loss_src_weight = loss_src_weight + self.loss_dst_weight = loss_dst_weight + + def forward(self, + source, + target, + src_weight=1.0, + dst_weight=1.0, + avg_factor=None, + reduction_override=None, + return_indices=False, + **kwargs): + """Forward function of loss calculation. + + Args: + source (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + target (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor | float, optional): + Weight of source loss. Defaults to 1.0. + dst_weight (torch.Tensor | float, optional): + Weight of destination loss. Defaults to 1.0. + reduction_override (str, optional): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + Defaults to None. + return_indices (bool, optional): Whether to return indices. + Defaults to False. 
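+            avg_factor (int, optional): Average factor used to average the
+                loss when reduction is 'mean'. Defaults to None.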
+ + Returns: + tuple[torch.Tensor]: If ``return_indices=True``, return losses of \ + source and target with their corresponding indices in the \ + order of ``(loss_source, loss_target, indices1, indices2)``. \ + If ``return_indices=False``, return \ + ``(loss_source, loss_target)``. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + loss_source, loss_target, indices1, indices2 = chamfer_distance( + source, target, src_weight, dst_weight, reduction, + avg_factor=avg_factor) + + loss_source *= self.loss_src_weight + loss_target *= self.loss_dst_weight + + loss_pts = loss_source + loss_target + + if return_indices: + return loss_pts, indices1, indices2 + else: + return loss_pts diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__init__.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..126bf07bf3860d0dd817c6d41eafc299002ff956 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__init__.py @@ -0,0 +1,7 @@ +from .map_utils import normalize_2d_bbox, normalize_2d_pts, denormalize_2d_bbox, denormalize_2d_pts +from .CD_loss import ( + MyChamferDistance, MyChamferDistanceCost, + OrderedPtsL1Cost, PtsL1Cost, OrderedPtsSmoothL1Cost, + OrderedPtsL1Loss, PtsL1Loss, PtsDirCosLoss +) +from .plan_loss import PlanMapBoundLoss, PlanCollisionLoss, PlanMapDirectionLoss \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/CD_loss.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/CD_loss.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e5c3040741e07c8aa1e17aaf254c5d23a41196a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/CD_loss.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12f06749954c89fa2ee28f3f8f761f25913fa3bf Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/map_utils.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/map_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fbefcfd6662b2c0302c0735ef8c54678062bf221 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/map_utils.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/plan_loss.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/plan_loss.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c3483eb02a7fd5dd9d0359895086a2560de004df Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/plan_loss.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/traj_lr_warmup.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/traj_lr_warmup.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f82af54fc90b31e5f2bb513dcbe690e6c55480ec Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/__pycache__/traj_lr_warmup.cpython-38.pyc differ diff --git 
a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/map_utils.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/map_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a4884c1d235b5e22a5666a0f60be1487309225d0 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/map_utils.py @@ -0,0 +1,41 @@ +from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh, bbox_cxcywh_to_xyxy + +def normalize_2d_bbox(bboxes, pc_range): + + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + cxcywh_bboxes = bbox_xyxy_to_cxcywh(bboxes) + cxcywh_bboxes[...,0:1] = cxcywh_bboxes[..., 0:1] - pc_range[0] + cxcywh_bboxes[...,1:2] = cxcywh_bboxes[...,1:2] - pc_range[1] + factor = bboxes.new_tensor([patch_w, patch_h,patch_w,patch_h]) + + normalized_bboxes = cxcywh_bboxes / factor + return normalized_bboxes + +def normalize_2d_pts(pts, pc_range): + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + new_pts = pts.clone() + new_pts[...,0:1] = pts[..., 0:1] - pc_range[0] + new_pts[...,1:2] = pts[...,1:2] - pc_range[1] + factor = pts.new_tensor([patch_w, patch_h]) + normalized_pts = new_pts / factor + return normalized_pts + +def denormalize_2d_bbox(bboxes, pc_range): + + bboxes = bbox_cxcywh_to_xyxy(bboxes) + bboxes[..., 0::2] = (bboxes[..., 0::2]*(pc_range[3] - + pc_range[0]) + pc_range[0]) + bboxes[..., 1::2] = (bboxes[..., 1::2]*(pc_range[4] - + pc_range[1]) + pc_range[1]) + + return bboxes + +def denormalize_2d_pts(pts, pc_range): + new_pts = pts.clone() + new_pts[...,0:1] = (pts[..., 0:1]*(pc_range[3] - + pc_range[0]) + pc_range[0]) + new_pts[...,1:2] = (pts[...,1:2]*(pc_range[4] - + pc_range[1]) + pc_range[1]) + return new_pts \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/plan_loss.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/plan_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..dd32e1642bf4990d403eb78d1b196d1b1ab95039 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/plan_loss.py @@ -0,0 +1,447 @@ +import math +import mmcv +import torch +from torch import nn as nn +from mmdet.models import weighted_loss +from mmdet.models.builder import LOSSES + + +@LOSSES.register_module() +class PlanMapBoundLoss(nn.Module): + """Planning constraint to push ego vehicle away from the lane boundary. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + map_thresh (float, optional): confidence threshold to filter map predictions. + lane_bound_cls_idx (float, optional): lane_boundary class index. + dis_thresh (float, optional): distance threshold between ego vehicle and lane bound. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + map_thresh=0.5, + lane_bound_cls_idx=2, + dis_thresh=1.0, + point_cloud_range=[-15.0, -30.0, -2.0, 15.0, 30.0, 2.0], + perception_detach=False + ): + super(PlanMapBoundLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.map_thresh = map_thresh + self.lane_bound_cls_idx = lane_bound_cls_idx + self.dis_thresh = dis_thresh + self.pc_range = point_cloud_range + self.perception_detach = perception_detach + + def forward(self, + ego_fut_preds, + lane_preds, + lane_score_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. 
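+        Filters map predictions down to confident lane-boundary vectors,
+        denormalizes them to metric coordinates via `point_cloud_range`, and
+        applies a hinge penalty to planned waypoints that lie within
+        `dis_thresh` of the nearest boundary point; timesteps from the first
+        detected boundary crossing onward contribute no loss.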
+ + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if self.perception_detach: + lane_preds = lane_preds.detach() + lane_score_preds = lane_score_preds.detach() + + # filter lane element according to confidence score and class + not_lane_bound_mask = lane_score_preds[..., self.lane_bound_cls_idx] < self.map_thresh + # denormalize map pts + lane_bound_preds = lane_preds.clone() + lane_bound_preds[...,0:1] = (lane_bound_preds[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + lane_bound_preds[...,1:2] = (lane_bound_preds[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + # pad not-lane-boundary cls and low confidence preds + lane_bound_preds[not_lane_bound_mask] = 1e6 + + loss_bbox = self.loss_weight * plan_map_bound_loss(ego_fut_preds, lane_bound_preds, + weight=weight, dis_thresh=self.dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def plan_map_bound_loss(pred, target, dis_thresh=1.0): + """Planning map bound constraint (L1 distance). + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): lane_bound_preds, [B, num_vec, num_pts, 2]. 
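+        dis_thresh (float, optional): hinge distance threshold; waypoints
+            farther than this from the nearest boundary point incur zero loss,
+            closer ones are penalized by (dis_thresh - distance). Defaults to 1.0.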
+ weight (torch.Tensor): [B, fut_ts] + + Returns: + torch.Tensor: Calculated loss [B, fut_ts] + """ + pred = pred.cumsum(dim=-2) + ego_traj_starts = pred[:, :-1, :] + ego_traj_ends = pred + B, T, _ = ego_traj_ends.size() + padding_zeros = torch.zeros((B, 1, 2), dtype=pred.dtype, device=pred.device) # initial position + ego_traj_starts = torch.cat((padding_zeros, ego_traj_starts), dim=1) + _, V, P, _ = target.size() + ego_traj_expanded = ego_traj_ends.unsqueeze(2).unsqueeze(3) # [B, T, 1, 1, 2] + maps_expanded = target.unsqueeze(1) # [1, 1, M, P, 2] + dist = torch.linalg.norm(ego_traj_expanded - maps_expanded, dim=-1) # [B, T, M, P] + dist = dist.min(dim=-1, keepdim=False)[0] + min_inst_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + bd_target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) + min_bd_insts = bd_target[batch_idxs, ts_idxs, min_inst_idxs] # [B, T, P, 2] + bd_inst_starts = min_bd_insts[:, :, :-1, :].flatten(0, 2) + bd_inst_ends = min_bd_insts[:, :, 1:, :].flatten(0, 2) + ego_traj_starts = ego_traj_starts.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) + ego_traj_ends = ego_traj_ends.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) + + intersect_mask = segments_intersect(ego_traj_starts, ego_traj_ends, + bd_inst_starts, bd_inst_ends) + intersect_mask = intersect_mask.reshape(B, T, P-1) + intersect_mask = intersect_mask.any(dim=-1) + intersect_idx = (intersect_mask == True).nonzero() + + target = target.view(target.shape[0], -1, target.shape[-1]) + # [B, fut_ts, num_vec*num_pts] + dist = torch.linalg.norm(pred[:, :, None, :] - target[:, None, :, :], dim=-1) + min_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + min_dist = dist[batch_idxs, ts_idxs, min_idxs] + loss = min_dist + safe_idx = loss > dis_thresh + unsafe_idx = loss <= dis_thresh + loss[safe_idx] = 0 + loss[unsafe_idx] = dis_thresh - loss[unsafe_idx] + + for i in range(len(intersect_idx)): + loss[intersect_idx[i, 0], intersect_idx[i, 1]:] = 0 + + return loss + + +def segments_intersect(line1_start, line1_end, line2_start, line2_end): + # Calculating the differences + dx1 = line1_end[:, 0] - line1_start[:, 0] + dy1 = line1_end[:, 1] - line1_start[:, 1] + dx2 = line2_end[:, 0] - line2_start[:, 0] + dy2 = line2_end[:, 1] - line2_start[:, 1] + + # Calculating determinants + det = dx1 * dy2 - dx2 * dy1 + det_mask = det != 0 + + # Checking if lines are parallel or coincident + parallel_mask = torch.logical_not(det_mask) + + # Calculating intersection parameters + t1 = ((line2_start[:, 0] - line1_start[:, 0]) * dy2 + - (line2_start[:, 1] - line1_start[:, 1]) * dx2) / det + t2 = ((line2_start[:, 0] - line1_start[:, 0]) * dy1 + - (line2_start[:, 1] - line1_start[:, 1]) * dx1) / det + + # Checking intersection conditions + intersect_mask = torch.logical_and( + torch.logical_and(t1 >= 0, t1 <= 1), + torch.logical_and(t2 >= 0, t2 <= 1) + ) + + # Handling parallel or coincident lines + intersect_mask[parallel_mask] = False + + return intersect_mask + + +@LOSSES.register_module() +class PlanCollisionLoss(nn.Module): + """Planning constraint to push ego vehicle away from other agents. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. 
+ agent_thresh (float, optional): confidence threshold to filter agent predictions. + x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. + y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + agent_thresh=0.5, + x_dis_thresh=1.5, + y_dis_thresh=3.0, + point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] + ): + super(PlanCollisionLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.agent_thresh = agent_thresh + self.x_dis_thresh = x_dis_thresh + self.y_dis_thresh = y_dis_thresh + self.pc_range = point_cloud_range + + def forward(self, + ego_fut_preds, + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + agent_preds (Tensor): [B, num_agent, 2] + agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2] + agent_fut_cls_preds (Tensor): [B, num_agent, fut_mode] + agent_score_preds (Tensor): [B, num_agent, 10] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + # filter agent element according to confidence score + agent_max_score_preds, agent_max_score_idxs = agent_score_preds.max(dim=-1) + not_valid_agent_mask = agent_max_score_preds < self.agent_thresh + # filter low confidence preds + agent_fut_preds[not_valid_agent_mask] = 1e6 + # filter not vehicle preds + not_veh_pred_mask = agent_max_score_idxs > 4 # veh idxs are 0-4 + agent_fut_preds[not_veh_pred_mask] = 1e6 + # only use best mode pred + best_mode_idxs = torch.argmax(agent_fut_cls_preds, dim=-1).tolist() + batch_idxs = [[i] for i in range(agent_fut_cls_preds.shape[0])] + agent_num_idxs = [[i for i in range(agent_fut_cls_preds.shape[1])] for j in range(agent_fut_cls_preds.shape[0])] + agent_fut_preds = agent_fut_preds[batch_idxs, agent_num_idxs, best_mode_idxs] + + loss_bbox = self.loss_weight * plan_col_loss(ego_fut_preds, agent_preds, + agent_fut_preds=agent_fut_preds, weight=weight, + x_dis_thresh=self.x_dis_thresh, + y_dis_thresh=self.y_dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def plan_col_loss( + pred, + target, + agent_fut_preds, + x_dis_thresh=1.5, + y_dis_thresh=3.0, + dis_thresh=3.0 +): + """Planning ego-agent collsion constraint. + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): agent_preds, [B, num_agent, 2]. + agent_fut_preds (Tensor): [B, num_agent, fut_ts, 2]. + weight (torch.Tensor): [B, fut_ts, 2]. + x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. + y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. + dis_thresh (float, optional): distance threshold to filter distant agents. 
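+        Note: `agent_fut_preds` already holds the single best mode selected by
+            the caller, so the element-wise loss assembled below has shape
+            [B, fut_ts, 2] (hinge penalties along x and y).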
+ + Returns: + torch.Tensor: Calculated loss [B, fut_mode, fut_ts, 2] + """ + pred = pred.cumsum(dim=-2) + agent_fut_preds = agent_fut_preds.cumsum(dim=-2) + target = target[:, :, None, :] + agent_fut_preds + # filter distant agents from ego vehicle + dist = torch.linalg.norm(pred[:, None, :, :] - target, dim=-1) + dist_mask = dist > dis_thresh + target[dist_mask] = 1e6 + + # [B, num_agent, fut_ts] + x_dist = torch.abs(pred[:, None, :, 0] - target[..., 0]) + y_dist = torch.abs(pred[:, None, :, 1] - target[..., 1]) + x_min_idxs = torch.argmin(x_dist, dim=1).tolist() + y_min_idxs = torch.argmin(y_dist, dim=1).tolist() + batch_idxs = [[i] for i in range(y_dist.shape[0])] + ts_idxs = [[i for i in range(y_dist.shape[-1])] for j in range(y_dist.shape[0])] + + # [B, fut_ts] + x_min_dist = x_dist[batch_idxs, x_min_idxs, ts_idxs] + y_min_dist = y_dist[batch_idxs, y_min_idxs, ts_idxs] + x_loss = x_min_dist + safe_idx = x_loss > x_dis_thresh + unsafe_idx = x_loss <= x_dis_thresh + x_loss[safe_idx] = 0 + x_loss[unsafe_idx] = x_dis_thresh - x_loss[unsafe_idx] + y_loss = y_min_dist + safe_idx = y_loss > y_dis_thresh + unsafe_idx = y_loss <= y_dis_thresh + y_loss[safe_idx] = 0 + y_loss[unsafe_idx] = y_dis_thresh - y_loss[unsafe_idx] + loss = torch.cat([x_loss.unsqueeze(-1), y_loss.unsqueeze(-1)], dim=-1) + + return loss + + +@LOSSES.register_module() +class PlanMapDirectionLoss(nn.Module): + """Planning loss to force the ego heading angle consistent with lane direction. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + theta_thresh (float, optional): angle diff thresh between ego and lane. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + map_thresh=0.5, + dis_thresh=2.0, + lane_div_cls_idx=0, + point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] + ): + super(PlanMapDirectionLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.map_thresh = map_thresh + self.dis_thresh = dis_thresh + self.lane_div_cls_idx = lane_div_cls_idx + self.pc_range = point_cloud_range + + def forward(self, + ego_fut_preds, + lane_preds, + lane_score_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
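+        Example (an illustrative sketch; tensor shapes and random values are
+        assumed for demonstration and are not prescribed by this loss):
+
+            >>> import torch
+            >>> loss_fn = PlanMapDirectionLoss(loss_weight=1.0)
+            >>> ego_fut_preds = torch.rand(1, 6, 2)      # per-step ego offsets
+            >>> lane_preds = torch.rand(1, 4, 10, 2)     # normalized map points
+            >>> lane_score_preds = torch.rand(1, 4, 3)   # map class scores
+            >>> loss = loss_fn(ego_fut_preds, lane_preds, lane_score_preds)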
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + # filter lane element according to confidence score and class + not_lane_div_mask = lane_score_preds[..., self.lane_div_cls_idx] < self.map_thresh + # denormalize map pts + lane_div_preds = lane_preds.clone() + lane_div_preds[...,0:1] = (lane_div_preds[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + lane_div_preds[...,1:2] = (lane_div_preds[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + # pad not-lane-divider cls and low confidence preds + lane_div_preds[not_lane_div_mask] = 1e6 + + loss_bbox = self.loss_weight * plan_map_dir_loss(ego_fut_preds, lane_div_preds, + weight=weight, dis_thresh=self.dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def plan_map_dir_loss(pred, target, dis_thresh=2.0): + """Planning ego-map directional loss. + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): lane_div_preds, [B, num_vec, num_pts, 2]. + weight (torch.Tensor): [B, fut_ts] + + Returns: + torch.Tensor: Calculated loss [B, fut_ts] + """ + num_map_pts = target.shape[2] + pred = pred.cumsum(dim=-2) + traj_dis = torch.linalg.norm(pred[:, -1, :] - pred[:, 0, :], dim=-1) + static_mask = traj_dis < 1.0 + target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) + + # find the closest map instance for ego at each timestamp + dist = torch.linalg.norm(pred[:, :, None, None, :] - target, dim=-1) + dist = dist.min(dim=-1, keepdim=False)[0] + min_inst_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + target_map_inst = target[batch_idxs, ts_idxs, min_inst_idxs] # [B, fut_ts, num_pts, 2] + + # calculate distance + dist = torch.linalg.norm(pred[:, :, None, :] - target_map_inst, dim=-1) + min_pts_idxs = torch.argmin(dist, dim=-1) + min_pts_next_idxs = min_pts_idxs.clone() + is_end_point = (min_pts_next_idxs == num_map_pts-1) + not_end_point = (min_pts_next_idxs != num_map_pts-1) + min_pts_next_idxs[is_end_point] = num_map_pts - 2 + min_pts_next_idxs[not_end_point] = min_pts_next_idxs[not_end_point] + 1 + min_pts_idxs = min_pts_idxs.tolist() + min_pts_next_idxs = min_pts_next_idxs.tolist() + traj_yaw = torch.atan2(torch.diff(pred[..., 1]), torch.diff(pred[..., 0])) # [B, fut_ts-1] + # last ts yaw assume same as previous + traj_yaw = torch.cat([traj_yaw, traj_yaw[:, [-1]]], dim=-1) # [B, fut_ts] + min_pts = target_map_inst[batch_idxs, ts_idxs, min_pts_idxs] + dist = torch.linalg.norm(min_pts - pred, dim=-1) + dist_mask = dist > dis_thresh + min_pts = min_pts.unsqueeze(2) + min_pts_next = target_map_inst[batch_idxs, ts_idxs, min_pts_next_idxs].unsqueeze(2) + map_pts = torch.cat([min_pts, min_pts_next], dim=2) + lane_yaw = torch.atan2(torch.diff(map_pts[..., 1]).squeeze(-1), torch.diff(map_pts[..., 0]).squeeze(-1)) # [B, fut_ts] + yaw_diff = traj_yaw - lane_yaw + yaw_diff[yaw_diff > math.pi] = yaw_diff[yaw_diff > math.pi] - math.pi + yaw_diff[yaw_diff > math.pi/2] = yaw_diff[yaw_diff > math.pi/2] - math.pi + yaw_diff[yaw_diff < -math.pi] = yaw_diff[yaw_diff < -math.pi] + math.pi + yaw_diff[yaw_diff < -math.pi/2] = yaw_diff[yaw_diff < -math.pi/2] + math.pi + yaw_diff[dist_mask] = 0 # loss = 0 if no lane around ego + yaw_diff[static_mask] = 0 # loss = 0 if ego is static + + 
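+    # At this point yaw_diff has been folded (modulo pi) into roughly
+    # [-pi/2, pi/2]: a lane divider polyline has no canonical direction, so
+    # heading offsets of theta and theta - pi count as the same deviation.
+    # Timesteps with no divider within dis_thresh of the ego, and fully static
+    # trajectories, have already been zeroed above.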
loss = torch.abs(yaw_diff) + + return loss # [B, fut_ts] diff --git a/GenAD-main/projects/mmdet3d_plugin/VAD/utils/traj_lr_warmup.py b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/traj_lr_warmup.py new file mode 100644 index 0000000000000000000000000000000000000000..0b5ba1bcb7298e4dc9102af620e406731a97b9b1 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/VAD/utils/traj_lr_warmup.py @@ -0,0 +1,13 @@ +import torch + +def get_traj_warmup_loss_weight( + cur_epoch, + tot_epoch, + start_pos=0.3, + end_pos=0.35, + scale_weight=1.1 +): + epoch_percentage = cur_epoch / tot_epoch + sigmoid_input = 5 / (end_pos-start_pos) * epoch_percentage - 2.5 * (end_pos+start_pos) / (end_pos - start_pos) + + return scale_weight * torch.sigmoid(torch.tensor(sigmoid_input)) diff --git a/GenAD-main/projects/mmdet3d_plugin/__init__.py b/GenAD-main/projects/mmdet3d_plugin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..35e414aaf938d7422c8c72a360130f04c76537c4 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/__init__.py @@ -0,0 +1,11 @@ +from .core.bbox.assigners.hungarian_assigner_3d import HungarianAssigner3D +from .core.bbox.coders.nms_free_coder import NMSFreeCoder +from .core.bbox.match_costs import BBox3DL1Cost +from .core.evaluation.eval_hooks import CustomDistEvalHook +from .datasets.pipelines import ( + PhotoMetricDistortionMultiViewImage, PadMultiViewImage, + NormalizeMultiviewImage, CustomCollect3D) +from .models.backbones.vovnet import VoVNet +from .models.utils import * +from .models.opt.adamw import AdamW2 +from .VAD import * diff --git a/GenAD-main/projects/mmdet3d_plugin/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6b701b5d66914e44a423f2bb2fb8d419f99dd7c Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..98d6e7e00553f1435d6f0d09ca69c8a5c4f1b4d0 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/__init__.py @@ -0,0 +1,6 @@ + +from .dense_heads import * +from .detectors import * +from .modules import * +from .runner import * +from .hooks import * diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..15dff22b7478a0f30151d376d41f3dc46e88ba7d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/__init__.py @@ -0,0 +1,3 @@ +from .train import custom_train_model +from .mmdet_train import custom_train_detector +# from .test import custom_multi_gpu_test \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py new file mode 100644 index 0000000000000000000000000000000000000000..e57bd225dc33d631849a3aef8db2bae217520658 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/mmdet_train.py @@ -0,0 +1,200 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
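As a quick aside before the training entry points below: the sigmoid warm-up in
traj_lr_warmup.py above ramps the trajectory loss weight from roughly 0 to roughly
scale_weight (1.1 by default) between start_pos and end_pos, i.e. around 30-35% of
training. A purely illustrative sketch, assuming the repository root is on
PYTHONPATH and using a placeholder epoch count:

from projects.mmdet3d_plugin.VAD.utils.traj_lr_warmup import get_traj_warmup_loss_weight

tot_epoch = 12  # placeholder; use the schedule from your own config
for cur_epoch in range(tot_epoch + 1):
    w = get_traj_warmup_loss_weight(cur_epoch, tot_epoch)
    print(f"epoch {cur_epoch:2d}: trajectory loss weight = {w.item():.3f}")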
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import random +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, build_optimizer, + build_runner, get_dist_info) +from mmcv.utils import build_from_cfg + +from mmdet.core import EvalHook + +from mmdet.datasets import (build_dataset, + replace_ImageToTensor) +from mmdet.utils import get_root_logger +import time +import os.path as osp +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from projects.mmdet3d_plugin.core.evaluation.eval_hooks import CustomDistEvalHook +from projects.mmdet3d_plugin.datasets import custom_build_dataset +def custom_train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + #assert len(dataset)==1s + if 'imgs_per_gpu' in cfg.data: + logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + if eval_model is not None: + eval_model = MMDistributedDataParallel( + eval_model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + if eval_model is not None: + eval_model = MMDataParallel( + eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + if eval_model is not None: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + eval_model=eval_model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, 
+ meta=meta)) + else: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # register profiler hook + #trace_config = dict(type='tb_trace', dir_name='work_dir') + #profiler_config = dict(on_trace_ready=trace_config) + #runner.register_profiler_hook(profiler_config) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + assert False + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/test.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/test.py new file mode 100644 index 0000000000000000000000000000000000000000..4e576136224e28e1b5e9a5bac0735ddc55c196bf --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/test.py @@ -0,0 +1,164 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
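For orientation, a minimal sketch of how custom_train_detector above is typically
driven from a training script. The config path is hypothetical, and the config must
provide the fields accessed above (data.samples_per_gpu, data.workers_per_gpu,
gpu_ids, seed, data.shuffler_sampler, data.nonshuffler_sampler, optimizer,
runner/total_epochs, work_dir, ...):

from mmcv import Config
from mmdet.models import build_detector
from projects.mmdet3d_plugin.bevformer.apis.mmdet_train import custom_train_detector
from projects.mmdet3d_plugin.datasets import custom_build_dataset

cfg = Config.fromfile('projects/configs/your_experiment.py')  # hypothetical path
model = build_detector(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg'))
model.init_weights()
datasets = [custom_build_dataset(cfg.data.train)]
custom_train_detector(model, datasets, cfg, distributed=False, validate=False, meta=dict())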
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results + + +import mmcv +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. + """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' 
+ # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if mask_results is None: + return {'bbox_results': bbox_results} + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + collect_results_cpu(result_part, size) \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/train.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f9391e606f29961875b48eebe36d3b9d415b6290 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/apis/train.py @@ -0,0 +1,67 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from .mmdet_train import custom_train_detector +from mmseg.apis import train_segmentor +from mmdet.apis import train_detector + +def custom_train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. 
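+
+    Only detector-style models are supported here: configs whose ``model.type`` is
+    ``EncoderDecoder3D`` trip the assertion below, and everything else is forwarded
+    unchanged to :func:`custom_train_detector`, including the optional ``eval_model``.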
+ """ + if cfg.model.type in ['EncoderDecoder3D']: + assert False + else: + custom_train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + eval_model=eval_model, + meta=meta) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6823adfb593d67f27af4af2207a515af4cbab6f5 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/__init__.py @@ -0,0 +1 @@ +from .bevformer_head import BEVFormerHead \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..91d38d1411e5093cf4ae801ea08de88ef47b6a8e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/dense_heads/bevformer_head.py @@ -0,0 +1,523 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Linear, bias_init_with_prob +from mmcv.utils import TORCH_VERSION, digit_version + +from mmdet.core import (multi_apply, multi_apply, reduce_mean) +from mmdet.models.utils.transformer import inverse_sigmoid +from mmdet.models import HEADS +from mmdet.models.dense_heads import DETRHead +from mmdet3d.core.bbox.coders import build_bbox_coder +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from mmcv.cnn.bricks.transformer import build_positional_encoding +from mmcv.runner import force_fp32, auto_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +import numpy as np +import mmcv +import cv2 as cv +from projects.mmdet3d_plugin.models.utils.visual import save_tensor + + +@HEADS.register_module() +class BEVFormerHead(DETRHead): + """Head of Detr3D. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. 
+ """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + super(BEVFormerHead, self).__init__( + *args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. + num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + else: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + + @auto_fp16(apply_to=('mlvl_feats')) + def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
+ """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + bev_embed, hs, init_reference, inter_references = outputs + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] += reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + tmp[..., 4:5] += reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + outs = { + 'bev_embed': bev_embed, + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + } + + return outs + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. 
+ - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + return (labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def loss_single(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. 
+ bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. + """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, + :10], bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + return loss_cls, loss_bbox + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + preds_dicts, + gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. 
+ enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], + losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. 
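+
+        Example (illustrative; ``outs`` is the dict returned by ``self.forward``):
+            >>> results = head.get_bboxes(outs, img_metas)
+            >>> bboxes, scores, labels = results[0]  # predictions for the first sample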
+ """ + + preds_dicts = self.bbox_coder.decode(preds_dicts) + + num_samples = len(preds_dicts) + ret_list = [] + for i in range(num_samples): + preds = preds_dicts[i] + bboxes = preds['bboxes'] + + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + + ret_list.append([bboxes, scores, labels]) + + return ret_list diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4c39fd341d5d65f809bb94bee71c6e9a523639e6 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/__init__.py @@ -0,0 +1,2 @@ +from .bevformer import BEVFormer +from .bevformer_fp16 import BEVFormer_fp16 \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py new file mode 100644 index 0000000000000000000000000000000000000000..8d3b676115bb46a39ef21ba7b061e98a72ae11c2 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer.py @@ -0,0 +1,289 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from tkinter.messagebox import NO +import torch +from mmcv.runner import force_fp32, auto_fp16 +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +import time +import copy +import numpy as np +import mmdet3d +from projects.mmdet3d_plugin.models.utils.bricks import run_time + + +@DETECTORS.register_module() +class BEVFormer(MVXTwoStageDetector): + """BEVFormer. + Args: + video_test_mode (bool): Decide whether to use temporal information during inference. 
+ """ + + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False + ): + + super(BEVFormer, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + + # temporal + self.video_test_mode = video_test_mode + self.prev_frame_info = { + 'prev_bev': None, + 'scene_token': None, + 'prev_pos': 0, + 'prev_angle': 0, + } + + + def extract_img_feat(self, img, img_metas, len_queue=None): + """Extract features of images.""" + B = img.size(0) + if img is not None: + + # input_shape = img.shape[-2:] + # # update real input shape of each single img + # for img_meta in img_metas: + # img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + if len_queue is not None: + img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) + else: + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=('img')) + def extract_feat(self, img, img_metas=None, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) + + return img_feats + + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + prev_bev=None): + """Forward function' + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + prev_bev (torch.Tensor, optional): BEV features of previous frame. + Returns: + dict: Losses of each branch. + """ + + outs = self.pts_bbox_head( + pts_feats, img_metas, prev_bev) + loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. 
+ torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def obtain_history_bev(self, imgs_queue, img_metas_list): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. + """ + self.eval() + + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) + img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + # img_feats = self.extract_feat(img=img, img_metas=img_metas) + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev = self.pts_bbox_head( + img_feats, img_metas, prev_bev, only_bev=True) + self.train() + return prev_bev + + @auto_fp16(apply_to=('img', 'points')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + len_queue = img.size(1) + prev_img = img[:, :-1, ...] + img = img[:, -1, ...] + + prev_img_metas = copy.deepcopy(img_metas) + prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) + + img_metas = [each[len_queue-1] for each in img_metas] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev) + + losses.update(losses_pts) + return losses + + def forward_test(self, img_metas, img=None, **kwargs): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. 
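+        # can_bus stores the absolute ego translation in its first three entries and
+        # the ego heading angle in its last entry. The absolute pose of the current
+        # frame is cached below, while the values passed on to the head are rewritten
+        # as deltas w.r.t. the previous frame so that prev_bev can be aligned to the
+        # current frame; the first frame of a scene gets a zero delta since there is
+        # no previous BEV to align against.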
+ tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + if self.prev_frame_info['prev_bev'] is not None: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + else: + img_metas[0][0]['can_bus'][-1] = 0 + img_metas[0][0]['can_bus'][:3] = 0 + + new_prev_bev, bbox_results = self.simple_test( + img_metas[0], img[0], prev_bev=self.prev_frame_info['prev_bev'], **kwargs) + # During inference, we save the BEV features and ego motion of each timestamp. + self.prev_frame_info['prev_pos'] = tmp_pos + self.prev_frame_info['prev_angle'] = tmp_angle + self.prev_frame_info['prev_bev'] = new_prev_bev + return bbox_results + + def simple_test_pts(self, x, img_metas, prev_bev=None, rescale=False): + """Test function""" + outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev) + + bbox_list = self.pts_bbox_head.get_bboxes( + outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return outs['bev_embed'], bbox_results + + def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + bbox_list = [dict() for i in range(len(img_metas))] + new_prev_bev, bbox_pts = self.simple_test_pts( + img_feats, img_metas, prev_bev, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + return new_prev_bev, bbox_list diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py new file mode 100644 index 0000000000000000000000000000000000000000..5325e3ccb8ac576a6764df3f0094ac5ea1bbc7cb --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/detectors/bevformer_fp16.py @@ -0,0 +1,89 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from tkinter.messagebox import NO +import torch +from mmcv.runner import force_fp32, auto_fp16 +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +from projects.mmdet3d_plugin.bevformer.detectors.bevformer import BEVFormer +import time +import copy +import numpy as np +import mmdet3d +from projects.mmdet3d_plugin.models.utils.bricks import run_time + + +@DETECTORS.register_module() +class BEVFormer_fp16(BEVFormer): + """ + The default version BEVFormer currently can not support FP16. + We provide this version to resolve this issue. + """ + + @auto_fp16(apply_to=('img', 'prev_bev', 'points')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + prev_bev=None, + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. 
+ gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev=prev_bev) + losses.update(losses_pts) + return losses + + + def val_step(self, data, optimizer): + """ + In BEVFormer_fp16, we use this `val_step` function to inference the `prev_pev`. + This is not the standard function of `val_step`. + """ + + img = data['img'] + img_metas = data['img_metas'] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + prev_bev = data.get('prev_bev', None) + prev_bev = self.pts_bbox_head(img_feats, img_metas, prev_bev=prev_bev, only_bev=True) + return prev_bev \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa04ec16df5b0bb9f21cadf22f9172c3cc9a58c1 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/__init__.py @@ -0,0 +1 @@ +from .custom_hooks import TransferWeight \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..091738a0950869767647383ad001e5e7e5a5bcaa --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/hooks/custom_hooks.py @@ -0,0 +1,14 @@ +from mmcv.runner.hooks.hook import HOOKS, Hook +from projects.mmdet3d_plugin.models.utils import run_time + + +@HOOKS.register_module() +class TransferWeight(Hook): + + def __init__(self, every_n_inters=1): + self.every_n_inters=every_n_inters + + def after_train_iter(self, runner): + if self.every_n_inner_iters(runner, self.every_n_inters): + runner.eval_model.load_state_dict(runner.model.state_dict()) + diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1bb5e04c7f69b70088321e62760be14f3329962b --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/__init__.py @@ -0,0 +1,6 @@ +from .transformer import PerceptionTransformer +from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D +from .temporal_self_attention import TemporalSelfAttention +from .encoder import BEVFormerEncoder, BEVFormerLayer +from .decoder import DetectionTransformerDecoder + diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py new file mode 100644 index 
0000000000000000000000000000000000000000..a5d994cda08a8b7fc3ba3ecbadbc5f295ce3c6cc --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/custom_base_transformer_layer.py @@ -0,0 +1,260 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import warnings + +import torch +import torch.nn as nn + +from mmcv import ConfigDict, deprecated_api_warning +from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv-full`` if you need this module. ') +from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention + + +@TRANSFORMER_LAYER.register_module() +class MyCustomBaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. 
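+
+        Note:
+            Despite the wording of the assertion message in ``__init__``, the check
+            only requires ``operation_order`` to be a subset of
+            {'self_attn', 'norm', 'ffn', 'cross_attn'}; an encoder-style order such
+            as ('self_attn', 'norm', 'ffn', 'norm') without cross attention is valid.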
+ """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=True, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index])) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . 
+ value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/decoder.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..33024f86a868c4316c15cfadeb5fb0ca58ef8895 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/decoder.py @@ -0,0 +1,345 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import mmcv +import cv2 as cv +import copy +import warnings +from matplotlib import pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(DetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 3 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + new_reference_points[..., 2:3] = tmp[ + ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@ATTENTION.register_module() +class CustomMSDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. 
+ key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/encoder.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b1ee30065be23c97371f45d48780acb1f04bfc9f --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/encoder.py @@ -0,0 +1,403 @@ + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from .custom_base_transformer_layer import MyCustomBaseTransformerLayer +import copy +import warnings +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import TransformerLayerSequence +from mmcv.runner import force_fp32, auto_fp16 +import numpy as np +import torch +import cv2 as cv +import mmcv +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class BEVFormerEncoder(TransformerLayerSequence): + + """ + Attention with both self and cross + Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', + **kwargs): + + super(BEVFormerEncoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + self.num_points_in_pillar = num_points_in_pillar + self.pc_range = pc_range + self.fp16_enabled = False + + @staticmethod + def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): + """Get the reference points used in SCA and TSA. + Args: + H, W: spatial shape of bev. + Z: hight of pillar. + D: sample D points uniformly from each pillar. + device (obj:`device`): The device where + reference_points should be. + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). + """ + + # reference points in 3D space, used in spatial cross-attention (SCA) + if dim == '3d': + zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, + device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z + xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, + device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W + ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, + device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H + ref_3d = torch.stack((xs, ys, zs), -1) + ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) + ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) + return ref_3d + + # reference points on 2D bev plane, used in temporal self-attention (TSA). + elif dim == '2d': + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=dtype, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=dtype, device=device) + ) + ref_y = ref_y.reshape(-1)[None] / H + ref_x = ref_x.reshape(-1)[None] / W + ref_2d = torch.stack((ref_x, ref_y), -1) + ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) + return ref_2d + + # This function must use fp32!!! 
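# ---------------------------------------------------------------------------
# Annotation (not part of the patched file): a hypothetical shape check for the
# staticmethod get_reference_points defined above, on CPU with a toy 50x50 BEV
# grid. ref_3d comes out as (bs, num_points_in_pillar, H*W, 3) and ref_2d as
# (bs, H*W, 1, 2), both normalized to [0, 1].
#
#   ref_3d = BEVFormerEncoder.get_reference_points(
#       H=50, W=50, Z=8, num_points_in_pillar=4, dim='3d',
#       bs=1, device='cpu', dtype=torch.float32)
#   ref_2d = BEVFormerEncoder.get_reference_points(
#       H=50, W=50, dim='2d', bs=1, device='cpu', dtype=torch.float32)
#   assert ref_3d.shape == (1, 4, 2500, 3)
#   assert ref_2d.shape == (1, 2500, 1, 2)
#
# The fp32 note directly above refers to point_sampling, which follows next.
# ---------------------------------------------------------------------------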
+ @force_fp32(apply_to=('reference_points', 'img_metas')) + def point_sampling(self, reference_points, pc_range, img_metas): + + lidar2img = [] + for img_meta in img_metas: + lidar2img.append(img_meta['lidar2img']) + lidar2img = np.asarray(lidar2img) + lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4) + reference_points = reference_points.clone() + + reference_points[..., 0:1] = reference_points[..., 0:1] * \ + (pc_range[3] - pc_range[0]) + pc_range[0] + reference_points[..., 1:2] = reference_points[..., 1:2] * \ + (pc_range[4] - pc_range[1]) + pc_range[1] + reference_points[..., 2:3] = reference_points[..., 2:3] * \ + (pc_range[5] - pc_range[2]) + pc_range[2] + + reference_points = torch.cat( + (reference_points, torch.ones_like(reference_points[..., :1])), -1) + + reference_points = reference_points.permute(1, 0, 2, 3) + D, B, num_query = reference_points.size()[:3] + num_cam = lidar2img.size(1) + + reference_points = reference_points.view( + D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) + + lidar2img = lidar2img.view( + 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) + + reference_points_cam = torch.matmul(lidar2img.to(torch.float32), + reference_points.to(torch.float32)).squeeze(-1) + eps = 1e-5 + + bev_mask = (reference_points_cam[..., 2:3] > eps) + reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( + reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) + + reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] + reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] + + bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) + & (reference_points_cam[..., 1:2] < 1.0) + & (reference_points_cam[..., 0:1] < 1.0) + & (reference_points_cam[..., 0:1] > 0.0)) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + bev_mask = torch.nan_to_num(bev_mask) + else: + bev_mask = bev_mask.new_tensor( + np.nan_to_num(bev_mask.cpu().numpy())) + + reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) + bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) + + return reference_points_cam, bev_mask + + @auto_fp16() + def forward(self, + bev_query, + key, + value, + *args, + bev_h=None, + bev_w=None, + bev_pos=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + prev_bev=None, + shift=0., + **kwargs): + """Forward function for `TransformerDecoder`. + Args: + bev_query (Tensor): Input BEV query with shape + `(num_query, bs, embed_dims)`. + key & value (Tensor): Input multi-cameta features with shape + (num_cam, num_value, bs, embed_dims) + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + + output = bev_query + intermediate = [] + + ref_3d = self.get_reference_points( + bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + ref_2d = self.get_reference_points( + bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + + reference_points_cam, bev_mask = self.point_sampling( + ref_3d, self.pc_range, kwargs['img_metas']) + + # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. + shift_ref_2d = ref_2d # .clone() + shift_ref_2d += shift[:, None, None, :] + + # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) + bev_query = bev_query.permute(1, 0, 2) + bev_pos = bev_pos.permute(1, 0, 2) + bs, len_bev, num_bev_level, _ = ref_2d.shape + if prev_bev is not None: + prev_bev = prev_bev.permute(1, 0, 2) + prev_bev = torch.stack( + [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) + hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + else: + hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + + for lid, layer in enumerate(self.layers): + output = layer( + bev_query, + key, + value, + *args, + bev_pos=bev_pos, + ref_2d=hybird_ref_2d, + ref_3d=ref_3d, + bev_h=bev_h, + bev_w=bev_w, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + reference_points_cam=reference_points_cam, + bev_mask=bev_mask, + prev_bev=prev_bev, + **kwargs) + + bev_query = output + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output + + +@TRANSFORMER_LAYER.register_module() +class BEVFormerLayer(MyCustomBaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. 
+ """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(BEVFormerLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + self.fp16_enabled = False + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) + + def forward(self, + query, + key=None, + value=None, + bev_pos=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ref_2d=None, + ref_3d=None, + bev_h=None, + bev_w=None, + reference_points_cam=None, + mask=None, + spatial_shapes=None, + level_start_index=None, + prev_bev=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + # temporal self attention + if layer == 'self_attn': + + query = self.attentions[attn_index]( + query, + prev_bev, + prev_bev, + identity if self.pre_norm else None, + query_pos=bev_pos, + key_pos=bev_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + reference_points=ref_2d, + spatial_shapes=torch.tensor( + [[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + # spaital cross attention + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + reference_points=ref_3d, + reference_points_cam=reference_points_cam, + mask=mask, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py new file mode 100644 index 0000000000000000000000000000000000000000..77b0f319ccff7e023e1c2d94b63f8c2d7b9c727d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/multi_scale_deformable_attn_function.py @@ -0,0 +1,163 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import torch +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.autograd.function import Function, once_differentiable +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction_fp16(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). 
+ attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +class MultiScaleDeformableAttnFunction_fp32(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. 
+ """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..100d94fef34456a0454eb7a328ca8688df1c30c1 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/spatial_cross_attention.py @@ -0,0 +1,399 @@ + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.cnn.bricks.transformer import build_attention +import math +from mmcv.runner import force_fp32, auto_fp16 + +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 +from projects.mmdet3d_plugin.models.utils.bricks import run_time +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class SpatialCrossAttention(BaseModule): + """An attention module used in BEVFormer. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_cams (int): The number of cameras + dropout (float): A Dropout layer on `inp_residual`. + Default: 0.. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + deformable_attention: (dict): The config for the deformable attention used in SCA. + """ + + def __init__(self, + embed_dims=256, + num_cams=6, + pc_range=None, + dropout=0.1, + init_cfg=None, + batch_first=False, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=256, + num_levels=4), + **kwargs + ): + super(SpatialCrossAttention, self).__init__(init_cfg) + + self.init_cfg = init_cfg + self.dropout = nn.Dropout(dropout) + self.pc_range = pc_range + self.fp16_enabled = False + self.deformable_attention = build_attention(deformable_attention) + self.embed_dims = embed_dims + self.num_cams = num_cams + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.batch_first = batch_first + self.init_weight() + + def init_weight(self): + """Default initialization for Parameters of Module.""" + xavier_init(self.output_proj, distribution='uniform', bias=0.) 
+ + @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) + def forward(self, + query, + key, + value, + residual=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + reference_points_cam=None, + bev_mask=None, + level_start_index=None, + flag='encoder', + **kwargs): + """Forward Function of Detr3DCrossAtten. + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. (B, N, C, H, W) + residual (Tensor): The tensor used for addition, with the + same shape as `x`. Default None. If None, `x` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, 4), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different level. With shape (num_levels, 2), + last dimension represent (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + + if residual is None: + inp_residual = query + slots = torch.zeros_like(query) + if query_pos is not None: + query = query + query_pos + + bs, num_query, _ = query.size() + + D = reference_points_cam.size(3) + indexes = [] + for i, mask_per_img in enumerate(bev_mask): + index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) + indexes.append(index_query_per_img) + max_len = max([len(each) for each in indexes]) + + # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. 
+ queries_rebatch = query.new_zeros( + [bs, self.num_cams, max_len, self.embed_dims]) + reference_points_rebatch = reference_points_cam.new_zeros( + [bs, self.num_cams, max_len, D, 2]) + + for j in range(bs): + for i, reference_points_per_img in enumerate(reference_points_cam): + index_query_per_img = indexes[i] + queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] + reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] + + num_cams, l, bs, embed_dims = key.shape + + key = key.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + value = value.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + + queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, + reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, + level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) + for j in range(bs): + for i, index_query_per_img in enumerate(indexes): + slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] + + count = bev_mask.sum(-1) > 0 + count = count.permute(1, 2, 0).sum(-1) + count = torch.clamp(count, min=1.0) + slots = slots / count[..., None] + slots = self.output_proj(slots) + + return self.dropout(slots) + inp_residual + + +@ATTENTION.register_module() +class MSDeformableAttention3D(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=8, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.batch_first = batch_first + self.output_proj = None + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + ( bs, num_query, embed_dims). + key (Tensor): The key tensor with shape + `(bs, num_key, embed_dims)`. + value (Tensor): The value tensor with shape + `(bs, num_key, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. 
With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + + if reference_points.shape[-1] == 2: + """ + For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. + After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. + For each referent point, we sample `num_points` sampling points. + For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. + """ + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + + bs, num_query, num_Z_anchors, xy = reference_points.shape + reference_points = reference_points[:, :, None, None, None, :, :] + sampling_offsets = sampling_offsets / \ + offset_normalizer[None, None, None, :, None, :] + bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape + sampling_offsets = sampling_offsets.view( + bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) + sampling_locations = reference_points + sampling_offsets + bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape + assert num_all_points == num_points * num_Z_anchors + + sampling_locations = sampling_locations.view( + bs, num_query, num_heads, num_levels, num_all_points, xy) + + elif reference_points.shape[-1] == 4: + assert False + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + + # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 + # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points + # + + if torch.cuda.is_available() and value.is_cuda: + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + if not self.batch_first: + output = output.permute(1, 0, 2) + + return output 
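The snippet below is illustrative commentary, not part of the patch: a minimal, self-contained sketch (with made-up sizes) of the view/broadcast bookkeeping that MSDeformableAttention3D.forward performs for 2D reference points, where each BEV query carries num_Z_anchors projected points and the per-head offsets are split across those anchors before being flattened back for the deformable-attention kernel.

import torch

# Hypothetical sizes, chosen only to exercise the reshapes described above.
bs, num_query, num_heads, num_levels = 2, 6, 8, 4
num_Z_anchors = 4        # projected points per pillar (num_points_in_pillar)
num_all_points = 8       # the module's num_points, i.e. 2 per Z anchor here

reference_points = torch.rand(bs, num_query, num_Z_anchors, 2)
sampling_offsets = torch.rand(bs, num_query, num_heads, num_levels, num_all_points, 2)
spatial_shapes = torch.tensor([[50, 50], [25, 25], [13, 13], [7, 7]])

# Normalize offsets by (w, h) of each feature level, as in the module.
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_offsets = sampling_offsets / offset_normalizer[None, None, None, :, None, :]

# Split the point axis across Z anchors, add the broadcast reference points,
# then flatten back to (bs, num_query, num_heads, num_levels, num_all_points, 2).
sampling_offsets = sampling_offsets.view(
    bs, num_query, num_heads, num_levels,
    num_all_points // num_Z_anchors, num_Z_anchors, 2)
sampling_locations = reference_points[:, :, None, None, None, :, :] + sampling_offsets
sampling_locations = sampling_locations.view(
    bs, num_query, num_heads, num_levels, num_all_points, 2)
assert sampling_locations.shape == (2, 6, 8, 4, 8, 2)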
diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..78fb9f529c925d1a4f74f1cc1f83de6b1cb20f67 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/temporal_self_attention.py @@ -0,0 +1,272 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init, constant_init +from mmcv.cnn.bricks.registry import ATTENTION +import math +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class TemporalSelfAttention(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to True. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. + the length of BEV queue is 2. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + num_bev_queue=2, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.num_bev_queue = num_bev_queue + self.sampling_offsets = nn.Linear( + embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, + num_bev_queue*num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) + + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. 
+ or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + assert self.batch_first + bs, len_bev, c = query.shape + value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) + + # value = torch.cat([query, query], 0) + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + bs, num_query, embed_dims = query.shape + _, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + assert self.num_bev_queue == 2 + + query = torch.cat([value[:bs], query], -1) + value = self.value_proj(value) + + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + + value = value.reshape(bs*self.num_bev_queue, + num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query) + sampling_offsets = sampling_offsets.view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_bev_queue, + self.num_levels, + self.num_points) + + attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() + sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + # output shape 
(bs*num_bev_queue, num_query, embed_dims) + # (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) + output = output.permute(1, 2, 0) + + # fuse history value and current value + # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) + output = output.view(num_query, embed_dims, bs, self.num_bev_queue) + output = output.mean(-1) + + # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) + output = output.permute(2, 0, 1) + + output = self.output_proj(output) + + if not self.batch_first: + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/transformer.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b740fccf5f5ab16ee4cb101fdb8874f2e6c147d2 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/modules/transformer.py @@ -0,0 +1,289 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from mmcv.runner.base_module import BaseModule + +from mmdet.models.utils.builder import TRANSFORMER +from torch.nn.init import normal_ +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.runner.base_module import BaseModule +from torchvision.transforms.functional import rotate +from .temporal_self_attention import TemporalSelfAttention +from .spatial_cross_attention import MSDeformableAttention3D +from .decoder import CustomMSDeformableAttention +from projects.mmdet3d_plugin.models.utils.bricks import run_time +from mmcv.runner import force_fp32, auto_fp16 + + +@TRANSFORMER.register_module() +class PerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + **kwargs): + super(PerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.init_layers() + self.rotate_center = rotate_center + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. + bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. 
+ reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + + inter_references_out = inter_references + + return bev_embed, inter_states, init_reference_out, inter_references_out diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/__init__.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03f906ce601e2dfac207af680774086067808830 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/__init__.py @@ -0,0 +1 @@ +from .epoch_based_runner import EpochBasedRunner_video \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py b/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..e73e5e7873f831b3c6e0f19715d950701b65fa25 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/bevformer/runner/epoch_based_runner.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
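# A small runnable sketch of how PerceptionTransformer.forward above seeds the decoder:
# each learned object query embedding of size 2*C is split into a positional part and a
# content part, and a linear layer plus sigmoid turns the positional part into an initial
# normalized 3-D reference point. Dimensions here are illustrative.
import torch
import torch.nn as nn

embed_dims, num_query, bs = 256, 900, 2
object_query_embed = torch.randn(num_query, embed_dims * 2)
reference_points_fc = nn.Linear(embed_dims, 3)

query_pos, query = torch.split(object_query_embed, embed_dims, dim=1)
query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
query = query.unsqueeze(0).expand(bs, -1, -1)
init_reference = reference_points_fc(query_pos).sigmoid()  # (bs, num_query, 3), values in [0, 1]
print(query.shape, init_reference.shape)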
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import os.path as osp +import torch +import mmcv +from mmcv.runner.base_runner import BaseRunner +from mmcv.runner.epoch_based_runner import EpochBasedRunner +from mmcv.runner.builder import RUNNERS +from mmcv.runner.checkpoint import save_checkpoint +from mmcv.runner.utils import get_host_info +from pprint import pprint +from mmcv.parallel.data_container import DataContainer + + +@RUNNERS.register_module() +class EpochBasedRunner_video(EpochBasedRunner): + + ''' + # basic logic + + input_sequence = [a, b, c] # given a sequence of samples + + prev_bev = None + for each in input_sequcene[:-1] + prev_bev = eval_model(each, prev_bev)) # inference only. + + model(input_sequcene[-1], prev_bev) # train the last sample. + ''' + + def __init__(self, + model, + eval_model=None, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], + max_iters=None, + max_epochs=None): + super().__init__(model, + batch_processor, + optimizer, + work_dir, + logger, + meta, + max_iters, + max_epochs) + keys.append('img_metas') + self.keys = keys + self.eval_model = eval_model + self.eval_model.eval() + + def run_iter(self, data_batch, train_mode, **kwargs): + if self.batch_processor is not None: + assert False + # outputs = self.batch_processor( + # self.model, data_batch, train_mode=train_mode, **kwargs) + elif train_mode: + + num_samples = data_batch['img'].data[0].size(1) + data_list = [] + prev_bev = None + for i in range(num_samples): + data = {} + for key in self.keys: + if key not in ['img_metas', 'img', 'points']: + data[key] = data_batch[key] + else: + if key == 'img': + data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True) + elif key == 'img_metas': + data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only) + else: + assert False + data_list.append(data) + with torch.no_grad(): + for i in range(num_samples-1): + if i>0: data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs) + + data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs) + else: + assert False + # outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + + if not isinstance(outputs, dict): + raise TypeError('"batch_processor()" or "model.train_step()"' + 'and "model.val_step()" must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/__pycache__/util.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/__pycache__/util.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d5aec925098e5e62d94070d0d92fab349404df0 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/__pycache__/util.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3250ef2bad5b2c52a43bae642b6761b8fa4908a7 --- /dev/null +++ 
b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__init__.py @@ -0,0 +1,4 @@ +from .hungarian_assigner_3d import HungarianAssigner3D +from .map_hungarian_assigner_3d import MapHungarianAssigner3D + +__all__ = ['HungarianAssigner3D', 'MapHungarianAssigner3D'] diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..579ae0b53e2d313a8a6de5f4581507f0827d6d78 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/hungarian_assigner_3d.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/hungarian_assigner_3d.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..313f212912cac148f79cd15dd0f1bb55aa1ec9bc Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/hungarian_assigner_3d.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/map_hungarian_assigner_3d.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/map_hungarian_assigner_3d.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8be7a223985767e0993f9672ff2d41273cedde3a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/__pycache__/map_hungarian_assigner_3d.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..583fcab72f6b2bbf20bda90b8f877cc1f81072d9 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/hungarian_assigner_3d.py @@ -0,0 +1,136 @@ +import torch + +from mmdet.core.bbox.builder import BBOX_ASSIGNERS +from mmdet.core.bbox.assigners import AssignResult +from mmdet.core.bbox.assigners import BaseAssigner +from mmdet.core.bbox.match_costs import build_match_cost +from mmdet.models.utils.transformer import inverse_sigmoid +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + + +@BBOX_ASSIGNERS.register_module() +class HungarianAssigner3D(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. 
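# A hedged sketch of the sequential training step implemented by EpochBasedRunner_video
# further above: a frozen eval_model rolls the BEV state across the history frames with
# no gradients, and only the last frame of the clip is trained on. The toy interface
# model(frame, prev_bev) -> (loss, bev) is an assumption for illustration; the real runner
# goes through MMCV's train_step/val_step and DataContainer plumbing instead.
import torch

def toy_run_iter(model, eval_model, optimizer, clip_of_frames):
    prev_bev = None
    with torch.no_grad():
        for frame in clip_of_frames[:-1]:
            _, prev_bev = eval_model(frame, prev_bev)   # inference only
    loss, _ = model(clip_of_frames[-1], prev_bev)       # train on the last sample
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()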
+ iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 1.0. + iou_calculator (dict | optional): The config for the iou calculation. + Default type `BboxOverlaps2D`. + iou_mode (str | optional): "iou" (intersection over union), "iof" + (intersection over foreground), or "giou" (generalized + intersection over union). Default "giou". + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=None): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + self.pc_range = pc_range + + def assign(self, + bbox_pred, + cls_pred, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + + # 2. compute the weighted costs + # classification and bboxcost. + cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + + normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) + + reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) + + # weighted sum of above two costs + cost = cls_cost + reg_cost + + # 3. 
do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/map_hungarian_assigner_3d.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/map_hungarian_assigner_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..e6afa75e7b5daefa3fe1592175c628e6ad62c29a --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/assigners/map_hungarian_assigner_3d.py @@ -0,0 +1,162 @@ +import torch +import torch.nn.functional as F + +from mmdet.core.bbox.builder import BBOX_ASSIGNERS +from mmdet.core.bbox.assigners import AssignResult +from mmdet.core.bbox.assigners import BaseAssigner +from mmdet.core.bbox.match_costs import build_match_cost +from mmdet.models.utils.transformer import inverse_sigmoid +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from projects.mmdet3d_plugin.VAD.utils.map_utils import ( + normalize_2d_bbox, normalize_2d_pts, denormalize_2d_bbox +) + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + +@BBOX_ASSIGNERS.register_module() +class MapHungarianAssigner3D(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. + iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 1.0. + iou_calculator (dict | optional): The config for the iou calculation. + Default type `BboxOverlaps2D`. + iou_mode (str | optional): "iou" (intersection over union), "iof" + (intersection over foreground), or "giou" (generalized + intersection over union). Default "giou". 
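# A compact, runnable sketch of steps 3-4 of HungarianAssigner3D.assign above: Hungarian
# matching on a CPU cost matrix, then 1-based ground-truth indices for matched queries and
# 0 (background) for everything else. The toy cost matrix is made up.
import torch
from scipy.optimize import linear_sum_assignment

def toy_hungarian_assign(cost):
    # cost: (num_query, num_gt)
    num_query, _ = cost.shape
    row, col = linear_sum_assignment(cost.detach().cpu())
    assigned_gt_inds = cost.new_zeros((num_query,), dtype=torch.long)
    assigned_gt_inds[torch.from_numpy(row)] = torch.from_numpy(col) + 1
    return assigned_gt_inds

if __name__ == '__main__':
    cost = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.5, 0.5]])
    print(toy_hungarian_assign(cost))  # tensor([1, 2, 0])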
+ """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', weight=0.0), + pts_cost=dict(type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0), + pc_range=None): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + self.pts_cost = build_match_cost(pts_cost) + self.pc_range = pc_range + + def assign(self, + bbox_pred, + cls_pred, + pts_pred, + gt_bboxes, + gt_labels, + gt_pts, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + assert bbox_pred.shape[-1] == 4, \ + 'Only support bbox pred shape is 4 dims' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels), None + + # 2. compute the weighted costs + # classification and bboxcost. 
+ cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + + normalized_gt_bboxes = normalize_2d_bbox(gt_bboxes, self.pc_range) + # normalized_gt_bboxes = gt_bboxes + # import pdb;pdb.set_trace() + reg_cost = self.reg_cost(bbox_pred[:, :4], normalized_gt_bboxes[:, :4]) + + _, num_orders, num_pts_per_gtline, num_coords = gt_pts.shape + normalized_gt_pts = normalize_2d_pts(gt_pts, self.pc_range) + num_pts_per_predline = pts_pred.size(1) + if num_pts_per_predline != num_pts_per_gtline: + pts_pred_interpolated = F.interpolate(pts_pred.permute(0,2,1),size=(num_pts_per_gtline), + mode='linear', align_corners=True) + pts_pred_interpolated = pts_pred_interpolated.permute(0,2,1).contiguous() + else: + pts_pred_interpolated = pts_pred + # num_q, num_pts, 2 <-> num_gt, num_pts, 2 + pts_cost_ordered = self.pts_cost(pts_pred_interpolated, normalized_gt_pts) + pts_cost_ordered = pts_cost_ordered.view(num_bboxes, num_gts, num_orders) + pts_cost, order_index = torch.min(pts_cost_ordered, 2) + + bboxes = denormalize_2d_bbox(bbox_pred, self.pc_range) + iou_cost = self.iou_cost(bboxes, gt_bboxes) + # weighted sum of above three costs + cost = cls_cost + reg_cost + iou_cost + pts_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels), order_index \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cb9c159fd905a4670c06167abc101d178a24da2c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__init__.py @@ -0,0 +1,7 @@ +from .nms_free_coder import NMSFreeCoder +from .fut_nms_free_coder import CustomNMSFreeCoder +from .map_nms_free_coder import MapNMSFreeCoder + +__all__ = ['NMSFreeCoder', + 'CustomNMSFreeCoder', + 'MapNMSFreeCoder'] diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f80b6fb06b7c41d528470d9d72a7176987e76dd Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/fut_nms_free_coder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/fut_nms_free_coder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50c3475a8f9dae2ce6faaa7891f4e0457578dd38 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/fut_nms_free_coder.cpython-38.pyc differ diff --git 
a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/map_nms_free_coder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/map_nms_free_coder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e48cf1bfc297b48796b643e8c36ab4153c88b020 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/map_nms_free_coder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/nms_free_coder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/nms_free_coder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..17f5aee76da10720a3d82378c3aa9b3fcfa895b1 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/__pycache__/nms_free_coder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/fut_nms_free_coder.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/fut_nms_free_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..fafb3d6aecfc410d7b69715fdf227a8c72feb4d4 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/fut_nms_free_coder.py @@ -0,0 +1,127 @@ +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS +from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox +import numpy as np + + +@BBOX_CODERS.register_module() +class CustomNMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds, traj_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
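# A short sketch of the joint (box, class) top-k selection that decode_single performs
# below: the sigmoid score map of shape (num_query, num_classes) is flattened, the top-k
# entries are kept, and the query index and class label are recovered from the flat index.
# The tensor sizes are illustrative.
import torch

def toy_topk_decode(cls_logits, max_num=5, num_classes=10):
    scores, index = cls_logits.sigmoid().view(-1).topk(max_num)
    labels = index % num_classes        # which class each kept entry belongs to
    bbox_index = index // num_classes   # which query produced it
    return scores, labels, bbox_index

if __name__ == '__main__':
    logits = torch.randn(300, 10)  # 300 queries, 10 classes
    scores, labels, bbox_index = toy_topk_decode(logits)
    print(scores.shape, labels.tolist(), bbox_index.tolist())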
+ """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + traj_preds = traj_preds[bbox_index] + + final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) + final_scores = scores + final_preds = labels + final_traj_preds = traj_preds + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(1) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + labels = final_preds[mask] + trajs = final_traj_preds[mask] + + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels, + 'trajs': trajs + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. + """ + all_cls_scores = preds_dicts['all_cls_scores'][-1] + all_bbox_preds = preds_dicts['all_bbox_preds'][-1] + all_traj_preds = preds_dicts['all_traj_preds'][-1] + + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i], all_traj_preds[i])) + return predictions_list + diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/map_nms_free_coder.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/map_nms_free_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..a7186e8ca56c9f33e3116270ff946b4f6f2fcfac --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/map_nms_free_coder.py @@ -0,0 +1,126 @@ +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS +from projects.mmdet3d_plugin.VAD.utils.map_utils import ( + denormalize_2d_pts, denormalize_2d_bbox +) + + +@BBOX_CODERS.register_module() +class MapNMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. 
Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds, pts_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + pts_preds (Tensor): + Shape [num_query, fixed_num_pts, 2] + Returns: + list[dict]: Decoded boxes. + """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + pts_preds = pts_preds[bbox_index] + + final_box_preds = denormalize_2d_bbox(bbox_preds, self.pc_range) + final_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) #num_q,num_p,2 + # final_box_preds = bbox_preds + final_scores = scores + final_preds = labels + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :4] >= self.post_center_range[:4]).all(1) + mask &= (final_box_preds[..., :4] <= self.post_center_range[4:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + pts = final_pts_preds[mask] + labels = final_preds[mask] + predictions_dict = { + 'map_bboxes': boxes3d, + 'map_scores': scores, + 'map_labels': labels, + 'map_pts': pts, + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
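# A sketch of the score-threshold relaxation loop shared by the coders in this diff: if no
# detection clears the configured threshold, the threshold is shrunk by 10% until at least
# one survives, falling back to keeping everything once it drops below 0.01.
import torch

def toy_relax_threshold(final_scores, score_threshold):
    thresh_mask = final_scores > score_threshold
    tmp_score = score_threshold
    while thresh_mask.sum() == 0:
        tmp_score *= 0.9
        if tmp_score < 0.01:
            thresh_mask = final_scores > -1
            break
        thresh_mask = final_scores >= tmp_score
    return thresh_mask

if __name__ == '__main__':
    print(toy_relax_threshold(torch.tensor([0.02, 0.05]), score_threshold=0.3))  # tensor([False,  True])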
+ """ + all_cls_scores = preds_dicts['map_all_cls_scores'][-1] + all_bbox_preds = preds_dicts['map_all_bbox_preds'][-1] + all_pts_preds = preds_dicts['map_all_pts_preds'][-1] + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i],all_pts_preds[i])) + return predictions_list \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..15321f5b2f376fa938588c4480cd12b77e0e864e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/coders/nms_free_coder.py @@ -0,0 +1,122 @@ +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS +from projects.mmdet3d_plugin.core.bbox.util import denormalize_bbox +import numpy as np + + +@BBOX_CODERS.register_module() +class NMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
+ """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + + final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) + final_scores = scores + final_preds = labels + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(1) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + + labels = final_preds[mask] + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
+ """ + all_cls_scores = preds_dicts['all_cls_scores'][-1] + all_bbox_preds = preds_dicts['all_bbox_preds'][-1] + + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i])) + return predictions_list + diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aac1a82a64f467a47e39d7e862357459e84abb84 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__init__.py @@ -0,0 +1,4 @@ +from mmdet.core.bbox.match_costs import build_match_cost +from .match_cost import BBox3DL1Cost + +__all__ = ['build_match_cost', 'BBox3DL1Cost'] \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6961e65c2e165d3ce71560cfb27c53e7b538b992 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/match_cost.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/match_cost.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7b9fb73276907000ee1a91a6d67c64ec7fbb9c9 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/__pycache__/match_cost.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..d9678f3c7f666255540762d4064f0f7d82b920ed --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/match_costs/match_cost.py @@ -0,0 +1,27 @@ +import torch +from mmdet.core.bbox.match_costs.builder import MATCH_COST + + +@MATCH_COST.register_module() +class BBox3DL1Cost(object): + """BBox3DL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 
+ Returns: + torch.Tensor: bbox_cost value with weight + """ + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..698af69c847b559eaf13f9c3e8609223824d255c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__init__.py @@ -0,0 +1,3 @@ +from .lidar_box3d import CustomLiDARInstance3DBoxes + +__all__ = ['CustomLiDARInstance3DBoxes'] \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66d3462fa19de5d4227ad0e80cf2f50271c0062d Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/lidar_box3d.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/lidar_box3d.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccd3a2a43fe1369e066e7c41f14e22fe163464fe Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/lidar_box3d.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/nuscenes_box.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/nuscenes_box.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c143274b33e5e82e993d026bf6bae98e308974b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/__pycache__/nuscenes_box.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/lidar_box3d.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/lidar_box3d.py new file mode 100644 index 0000000000000000000000000000000000000000..22a595de1569ab842214d072b12eed05cc672518 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/lidar_box3d.py @@ -0,0 +1,279 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet3d.core.points import BasePoints +from mmdet3d.ops.roiaware_pool3d import points_in_boxes_gpu +from mmdet3d.core.bbox.structures.base_box3d import BaseInstance3DBoxes +from mmdet3d.core.bbox.structures.utils import limit_period, rotation_3d_in_axis + + +class CustomLiDARInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in LIDAR coordinates. + with future trajs. + + Coordinates in LiDAR: + + .. code-block:: none + + up z x front (yaw=-0.5*pi) + ^ ^ + | / + | / + (yaw=-pi) left y <------ 0 -------- (yaw=0) + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the negative direction of y axis, and decreases from + the negative direction of y to the positive direction of x. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. 
+ Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + def __init__(self, tensor, fut_trajs=None, fut_valid_mask=None, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): + super(CustomLiDARInstance3DBoxes, self).__init__( + tensor, box_dim=box_dim, with_yaw=with_yaw, origin=origin + ) + if fut_trajs is not None: + self.fut_trajs = fut_trajs + if fut_valid_mask is not None: + self.fut_valid_mask = fut_valid_mask + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front x ^ + / | + / | + (x1, y0, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + left y<-------- + ----------- + (x0, y1, z0) + (x0, y0, z0) + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. + assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 0.5, 0] + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format.""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + @property + def nearest_bev(self): + """torch.Tensor: A tensor of 2D BEV box of each box + without rotation.""" + # Obtain BEV boxes with rotation in XYWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angles (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. 
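# A numpy sketch of what the `corners` property above computes for a single box: the eight
# unit-cube corners, re-centred so the relative origin is the bottom centre (0.5, 0.5, 0),
# scaled by the box size, rotated about the z axis and translated to the box position.
# The explicit rotation matrix below assumes a standard counter-clockwise yaw; the exact
# sign convention of mmdet3d's rotation_3d_in_axis may differ.
import numpy as np

def toy_lidar_box_corners(x, y, z, dx, dy, dz, yaw):
    corners_norm = np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1).astype(float)
    corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]            # clockwise corner order
    corners = (corners_norm - np.array([0.5, 0.5, 0.0])) * np.array([dx, dy, dz])
    rot = np.array([[np.cos(yaw), -np.sin(yaw), 0.0],
                    [np.sin(yaw),  np.cos(yaw), 0.0],
                    [0.0, 0.0, 1.0]])
    return corners @ rot.T + np.array([x, y, z])

if __name__ == '__main__':
    print(toy_lidar_box_corners(1.0, 2.0, 0.0, 4.0, 2.0, 1.5, yaw=np.pi / 4).shape)  # (8, 3)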
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], + [rot_sin, rot_cos, 0], + [0, 0, 1]]) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[1, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + self.tensor[:, 6] += angle + + if self.tensor.shape[1] == 9: + # rotate velo vector + self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # clockwise + points.rotate(-angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction='horizontal', points=None): + """Flip the boxes in BEV along given BEV direction. + + In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 1] = -points[:, 1] + elif bev_direction == 'vertical': + points[:, 0] = -points[:, 0] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def in_range_bev(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): the range of box + (x_min, y_min, x_max, y_max) + + Note: + The original implementation of SECOND checks whether boxes in + a range by checking whether the points are in a convex + polygon, we reduce the burden for simpler cases. + + Returns: + torch.Tensor: Whether each box is inside the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > box_range[0]) + & (self.tensor[:, 1] > box_range[1]) + & (self.tensor[:, 0] < box_range[2]) + & (self.tensor[:, 1] < box_range[3])) + return in_range_flags + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): the target Box mode + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BaseInstance3DBoxes`: \ + The converted box of the same type in the ``dst`` mode. 
+ """ + from mmdet3d.core.bbox.structures.box_3d_mode import Box3DMode + return Box3DMode.convert( + box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat) + + def enlarged_box(self, extra_width): + """Enlarge the length, width and height boxes. + + Args: + extra_width (float | torch.Tensor): Extra width to enlarge the box. + + Returns: + :obj:`LiDARInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def points_in_boxes(self, points): + """Find the box which the points are in. + + Args: + points (torch.Tensor): Points in shape (N, 3). + + Returns: + torch.Tensor: The index of box where each point are in. + """ + box_idx = points_in_boxes_gpu( + points.unsqueeze(0), + self.tensor.unsqueeze(0).to(points.device)).squeeze(0) + return box_idx diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/nuscenes_box.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/nuscenes_box.py new file mode 100644 index 0000000000000000000000000000000000000000..05200a0fc3958831637177d9592ce5c2a47a08df --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/structures/nuscenes_box.py @@ -0,0 +1,458 @@ +# nuScenes dev-kit. +# Code written by Oscar Beijbom, 2018. + +import copy +from typing import Tuple, List + +import cv2 +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.axes import Axes +from matplotlib.collections import LineCollection +from pyquaternion import Quaternion +from nuscenes.utils.geometry_utils import view_points +from nuscenes.eval.common.data_classes import EvalBox +from nuscenes.eval.detection.constants import DETECTION_NAMES, ATTRIBUTE_NAMES + + +def color_map(data, cmap): + """数值映射为颜色""" + + dmin, dmax = np.nanmin(data), np.nanmax(data) + cmo = plt.cm.get_cmap(cmap) + cs, k = list(), 256/cmo.N + + for i in range(cmo.N): + c = cmo(i) + for j in range(int(i*k), int((i+1)*k)): + cs.append(c) + cs = np.array(cs) + data = np.uint8(255*(data-dmin)/(dmax-dmin)) + + return cs[data] + +class CustomNuscenesBox: + """ Simple data class representing a 3d box including, label, score and velocity. """ + + def __init__(self, + center: List[float], + size: List[float], + orientation: Quaternion, + fut_trajs: List[float], + label: int = np.nan, + score: float = np.nan, + velocity: Tuple = (np.nan, np.nan, np.nan), + name: str = None, + token: str = None): + """ + :param center: Center of box given as x, y, z. + :param size: Size of box in width, length, height. + :param orientation: Box orientation. + :param label: Integer label, optional. + :param score: Classification score, optional. + :param velocity: Box velocity in x, y, z direction. + :param name: Box name, optional. Can be used e.g. for denote category name. + :param token: Unique string identifier from DB. 
+ """ + assert not np.any(np.isnan(center)) + assert not np.any(np.isnan(size)) + assert len(center) == 3 + assert len(size) == 3 + assert type(orientation) == Quaternion + + self.center = np.array(center) + self.wlh = np.array(size) + self.orientation = orientation + self.label = int(label) if not np.isnan(label) else label + self.score = float(score) if not np.isnan(score) else score + self.velocity = np.array(velocity) + self.name = name + self.token = token + self.fut_trajs = np.array(fut_trajs) + + def __eq__(self, other): + center = np.allclose(self.center, other.center) + wlh = np.allclose(self.wlh, other.wlh) + orientation = np.allclose(self.orientation.elements, other.orientation.elements) + label = (self.label == other.label) or (np.isnan(self.label) and np.isnan(other.label)) + score = (self.score == other.score) or (np.isnan(self.score) and np.isnan(other.score)) + vel = (np.allclose(self.velocity, other.velocity) or + (np.all(np.isnan(self.velocity)) and np.all(np.isnan(other.velocity)))) + + return center and wlh and orientation and label and score and vel + + def __repr__(self): + repr_str = 'label: {}, score: {:.2f}, xyz: [{:.2f}, {:.2f}, {:.2f}], wlh: [{:.2f}, {:.2f}, {:.2f}], ' \ + 'rot axis: [{:.2f}, {:.2f}, {:.2f}], ang(degrees): {:.2f}, ang(rad): {:.2f}, ' \ + 'vel: {:.2f}, {:.2f}, {:.2f}, name: {}, token: {}' + + return repr_str.format(self.label, self.score, self.center[0], self.center[1], self.center[2], self.wlh[0], + self.wlh[1], self.wlh[2], self.orientation.axis[0], self.orientation.axis[1], + self.orientation.axis[2], self.orientation.degrees, self.orientation.radians, + self.velocity[0], self.velocity[1], self.velocity[2], self.name, self.token) + + @property + def rotation_matrix(self) -> np.ndarray: + """ + Return a rotation matrix. + :return: . The box's rotation matrix. + """ + return self.orientation.rotation_matrix + + def translate(self, x: np.ndarray) -> None: + """ + Applies a translation. + :param x: . Translation in x, y, z direction. + """ + self.center += x + + def rotate(self, quaternion: Quaternion) -> None: + """ + Rotates box. + :param quaternion: Rotation to apply. + """ + self.center = np.dot(quaternion.rotation_matrix, self.center) + self.orientation = quaternion * self.orientation + self.velocity = np.dot(quaternion.rotation_matrix, self.velocity) + + def corners(self, wlh_factor: float = 1.0) -> np.ndarray: + """ + Returns the bounding box corners. + :param wlh_factor: Multiply w, l, h by a factor to scale the box. + :return: . First four corners are the ones facing forward. + The last four are the ones facing backwards. + """ + w, l, h = self.wlh * wlh_factor + + # 3D bounding box corners. (Convention: x points forward, y to the left, z up.) + x_corners = l / 2 * np.array([1, 1, 1, 1, -1, -1, -1, -1]) + y_corners = w / 2 * np.array([1, -1, -1, 1, 1, -1, -1, 1]) + z_corners = h / 2 * np.array([1, 1, -1, -1, 1, 1, -1, -1]) + corners = np.vstack((x_corners, y_corners, z_corners)) + + # Rotate + corners = np.dot(self.orientation.rotation_matrix, corners) + + # Translate + x, y, z = self.center + corners[0, :] = corners[0, :] + x + corners[1, :] = corners[1, :] + y + corners[2, :] = corners[2, :] + z + + return corners + + def bottom_corners(self) -> np.ndarray: + """ + Returns the four bottom corners. + :return: . Bottom corners. First two face forward, last two face backwards. 
+ """ + return self.corners()[:, [2, 3, 7, 6]] + + def render(self, + axis: Axes, + view: np.ndarray = np.eye(3), + normalize: bool = False, + colors: Tuple = ('b', 'r', 'k'), + linewidth: float = 2, + box_idx=None, + alpha=0.5) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. + """ + corners = view_points(self.corners(), view, normalize=normalize)[:2, :] + + def draw_rect(selected_corners, color, alpha): + prev = selected_corners[-1] + for corner in selected_corners: + axis.plot([prev[0], corner[0]], [prev[1], corner[1]], color=color, linewidth=linewidth, alpha=alpha) + prev = corner + + # Draw the sides + for i in range(4): + axis.plot([corners.T[i][0], corners.T[i + 4][0]], + [corners.T[i][1], corners.T[i + 4][1]], + color=colors[2], linewidth=linewidth, alpha=alpha) + + # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d) + draw_rect(corners.T[:4], colors[0], alpha) + draw_rect(corners.T[4:], colors[1], alpha) + + # Draw line indicating the front + center_bottom_forward = np.mean(corners.T[2:4], axis=0) + center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0) + axis.plot([center_bottom[0], center_bottom_forward[0]], + [center_bottom[1], center_bottom_forward[1]], + color=colors[0], linewidth=linewidth, alpha=alpha) + if box_idx is not None and center_bottom[0] > -35 and center_bottom[1] > -35 \ + and center_bottom[0] < 35 and center_bottom[1] < 35: + text = f'{box_idx}' + axis.text(center_bottom[0], center_bottom[1], text, ha='left', fontsize=5) + + def render_fut_trajs(self, + axis: Axes, + color: str = 'b', + linewidth: float = 1, + fut_ts: int = 6, + mode_idx=None) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. 
+ """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + if mode_idx is not None: + fut_coords = fut_coords[[mode_idx]] + alpha = 0.8 + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord.cumsum(axis=-2) + fut_coord = fut_coord + self.center[:2] + if np.abs(fut_coord[-1] - self.center[:2]).max() >= 10: + if color == 'g': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='*', s=70, alpha=alpha) + elif color == 'b': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='o', s=20, alpha=alpha) + if mode_idx is None and fut_coord[-1, 0] > -35 and fut_coord[-1, 1] > -35 \ + and fut_coord[-1, 0] < 35 and fut_coord[-1, 1] < 35: + text = f'{i}' + axis.text(fut_coord[-1, 0], fut_coord[-1, 1], text, ha='left', fontsize=5) + axis.plot( + [self.center[0], fut_coord[0, 0]], + [self.center[1], fut_coord[0, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + for i in range(fut_coord.shape[0]-1): + axis.plot( + [fut_coord[i, 0], fut_coord[i+1, 0]], + [fut_coord[i, 1], fut_coord[i+1, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + + def render_fut_trajs_grad_color(self, + axis: Axes, + linewidth: float = 1, + linestyles='solid', + cmap='viridis', + fut_ts: int = 6, + alpha: int = 0.8, + mode_idx=None) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. + """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + if mode_idx is not None: + fut_coords = fut_coords[[mode_idx]] + + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord.cumsum(axis=-2) + fut_coord = fut_coord + self.center[:2] + fut_coord = np.concatenate((self.center[np.newaxis, :2], fut_coord), axis=0) + fut_coord_segments = np.stack((fut_coord[:-1], fut_coord[1:]), axis=1) + + fut_vecs = None + for j in range(fut_coord_segments.shape[0]): + fut_vec_j = fut_coord_segments[j] + x_linspace = np.linspace(fut_vec_j[0, 0], fut_vec_j[1, 0], 51) + y_linspace = np.linspace(fut_vec_j[0, 1], fut_vec_j[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if fut_vecs is None: + fut_vecs = xy + else: + fut_vecs = np.concatenate((fut_vecs, xy), axis=0) + + y = np.sin(np.linspace(3/2*np.pi, 5/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(fut_vecs, colors=colors, linewidths=linewidth, linestyles=linestyles, cmap=cmap) + + # if mode_idx is None and abs(fut_coord[-1, 0]) < 35 and abs(fut_coord[-1, 1]) < 35: + # text = f'{i}' + # axis.text(fut_coord[-1, 0], fut_coord[-1, 1], text, ha='left', fontsize=5) + + axis.add_collection(line_segments) + + def render_fut_trajs_coords(self, + axis: Axes, + color: str = 'b', + linewidth: float = 1, + fut_ts: int = 12) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. 
+ :param linewidth: Width in pixel of the box sides. + """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + alpha = 0.2 if color == 'b' else 1 + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord + self.center[:2] + if np.abs(fut_coord[-1] - self.center[:2]).max() >= 10: + if color == 'g': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='*', s=70, alpha=alpha) + elif color == 'b': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='o', s=20, alpha=alpha) + axis.plot( + [self.center[0], fut_coord[0, 0]], + [self.center[1], fut_coord[0, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + for i in range(fut_coord.shape[0]-1): + axis.plot( + [fut_coord[i, 0], fut_coord[i+1, 0]], + [fut_coord[i, 1], fut_coord[i+1, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + + def render_cv2(self, + im: np.ndarray, + view: np.ndarray = np.eye(3), + normalize: bool = False, + colors: Tuple = ((0, 0, 255), (255, 0, 0), (155, 155, 155)), + linewidth: int = 2) -> None: + """ + Renders box using OpenCV2. + :param im: . Image array. Channels are in BGR order. + :param view: . Define a projection if needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: ((R, G, B), (R, G, B), (R, G, B)). Colors for front, side & rear. + :param linewidth: Linewidth for plot. + """ + corners = view_points(self.corners(), view, normalize=normalize)[:2, :] + + def draw_rect(selected_corners, color): + prev = selected_corners[-1] + for corner in selected_corners: + cv2.line(im, + (int(prev[0]), int(prev[1])), + (int(corner[0]), int(corner[1])), + color, linewidth) + prev = corner + + # Draw the sides + for i in range(4): + cv2.line(im, + (int(corners.T[i][0]), int(corners.T[i][1])), + (int(corners.T[i + 4][0]), int(corners.T[i + 4][1])), + colors[2][::-1], linewidth) + + # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d) + draw_rect(corners.T[:4], colors[0][::-1]) + draw_rect(corners.T[4:], colors[1][::-1]) + + # Draw line indicating the front + center_bottom_forward = np.mean(corners.T[2:4], axis=0) + center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0) + cv2.line(im, + (int(center_bottom[0]), int(center_bottom[1])), + (int(center_bottom_forward[0]), int(center_bottom_forward[1])), + colors[0][::-1], linewidth) + + def copy(self) -> 'CustomNuscenesBox': + """ + Create a copy of self. + :return: A copy. + """ + return copy.deepcopy(self) + + +class CustomDetectionBox(EvalBox): + """ Data class used during detection evaluation. Can be a prediction or ground truth.""" + + def __init__(self, + sample_token: str = "", + translation: Tuple[float, float, float] = (0, 0, 0), + size: Tuple[float, float, float] = (0, 0, 0), + rotation: Tuple[float, float, float, float] = (0, 0, 0, 0), + velocity: Tuple[float, float] = (0, 0), + ego_translation: Tuple[float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters. + num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes. + detection_name: str = 'car', # The class name used in the detection challenge. + detection_score: float = -1.0, # GT samples do not have a score. + attribute_name: str = '', # Box attribute. Each box can have at most 1 attribute. + fut_trajs=None): # future trajectories of a pred box, shape=[fut_ts*2]. 
+ + super().__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts) + + assert detection_name is not None, 'Error: detection_name cannot be empty!' + assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name + + assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \ + 'Error: Unknown attribute_name %s' % attribute_name + + assert type(detection_score) == float, 'Error: detection_score must be a float!' + assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!' + + # Assign. + self.detection_name = detection_name + self.detection_score = detection_score + self.attribute_name = attribute_name + self.fut_trajs = fut_trajs + + def __eq__(self, other): + return (self.sample_token == other.sample_token and + self.translation == other.translation and + self.size == other.size and + self.rotation == other.rotation and + self.velocity == other.velocity and + self.ego_translation == other.ego_translation and + self.num_pts == other.num_pts and + self.detection_name == other.detection_name and + self.detection_score == other.detection_score and + self.attribute_name == other.attribute_name and + self.fut_trajs == other.fut_trajs) + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'fut_trajs': self.fut_trajs + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. 
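A hypothetical round trip through `serialize()` and `deserialize()`; the sample token and all numbers below are invented, and the sketch assumes the nuScenes devkit constants (`DETECTION_NAMES`, `ATTRIBUTE_NAMES`) are available:

```python
pred = CustomDetectionBox(
    sample_token='0123456789abcdef0123456789abcdef',
    translation=(600.0, 1640.0, 1.0),
    size=(1.9, 4.5, 1.6),
    rotation=(0.97, 0.0, 0.0, 0.24),
    velocity=(2.0, 0.1),
    detection_name='car',
    detection_score=0.83,
    attribute_name='vehicle.moving',
    fut_trajs=tuple([0.5, 0.0] * 6))

record = pred.serialize()                        # plain, json-friendly dict
restored = CustomDetectionBox.deserialize(record)
print(restored.detection_name, restored.detection_score, len(restored.fut_trajs))
```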
""" + return cls(sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + fut_trajs=tuple(content['fut_trajs']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name']) diff --git a/GenAD-main/projects/mmdet3d_plugin/core/bbox/util.py b/GenAD-main/projects/mmdet3d_plugin/core/bbox/util.py new file mode 100644 index 0000000000000000000000000000000000000000..c54bd750246f3d6e2249b7d39888fffa6227beda --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/bbox/util.py @@ -0,0 +1,53 @@ +import torch + + +def normalize_bbox(bboxes, pc_range): + + cx = bboxes[..., 0:1] + cy = bboxes[..., 1:2] + cz = bboxes[..., 2:3] + w = bboxes[..., 3:4].log() + l = bboxes[..., 4:5].log() + h = bboxes[..., 5:6].log() + + rot = bboxes[..., 6:7] + if bboxes.size(-1) > 7: + vx = bboxes[..., 7:8] + vy = bboxes[..., 8:9] + normalized_bboxes = torch.cat( + (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 + ) + else: + normalized_bboxes = torch.cat( + (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 + ) + return normalized_bboxes + +def denormalize_bbox(normalized_bboxes, pc_range): + # rotation + rot_sine = normalized_bboxes[..., 6:7] + + rot_cosine = normalized_bboxes[..., 7:8] + rot = torch.atan2(rot_sine, rot_cosine) + + # center in the bev + cx = normalized_bboxes[..., 0:1] + cy = normalized_bboxes[..., 1:2] + cz = normalized_bboxes[..., 4:5] + + # size + w = normalized_bboxes[..., 2:3] + l = normalized_bboxes[..., 3:4] + h = normalized_bboxes[..., 5:6] + + w = w.exp() + l = l.exp() + h = h.exp() + if normalized_bboxes.size(-1) > 8: + # velocity + vx = normalized_bboxes[:, 8:9] + vy = normalized_bboxes[:, 9:10] + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) + else: + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) + return denormalized_bboxes \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__init__.py b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d92421c7e84fdc7a33e94aa10fddfccb332d6399 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__init__.py @@ -0,0 +1 @@ +from .eval_hooks import CustomDistEvalHook \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1152e552d359e236bfdc2500cc56c57910cb82af Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/eval_hooks.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/eval_hooks.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d4e14ebf833fa6af9f96f34d3d8a51f39cde0da Binary files /dev/null and 
b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/eval_hooks.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/metric_motion.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/metric_motion.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6111380f8ccfa5ce5e875474c18eaf6fb519ea03 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/__pycache__/metric_motion.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..96b70706885750f8912741363287e973c12a384c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/eval_hooks.py @@ -0,0 +1,92 @@ + +# Note: Considering that MMCV's EvalHook updated its interface in V1.3.16, +# in order to avoid strong version dependency, we did not directly +# inherit EvalHook but BaseDistEvalHook. + +import bisect +import os.path as osp + +import mmcv +import torch.distributed as dist +from mmcv.runner import DistEvalHook as BaseDistEvalHook +from mmcv.runner import EvalHook as BaseEvalHook +from torch.nn.modules.batchnorm import _BatchNorm +from mmdet.core.evaluation.eval_hooks import DistEvalHook + + +def _calc_dynamic_intervals(start_interval, dynamic_interval_list): + assert mmcv.is_list_of(dynamic_interval_list, tuple) + + dynamic_milestones = [0] + dynamic_milestones.extend( + [dynamic_interval[0] for dynamic_interval in dynamic_interval_list]) + dynamic_intervals = [start_interval] + dynamic_intervals.extend( + [dynamic_interval[1] for dynamic_interval in dynamic_interval_list]) + return dynamic_milestones, dynamic_intervals + + +class CustomDistEvalHook(BaseDistEvalHook): + + def __init__(self, *args, dynamic_intervals=None, **kwargs): + super(CustomDistEvalHook, self).__init__(*args, **kwargs) + self.use_dynamic_intervals = dynamic_intervals is not None + if self.use_dynamic_intervals: + self.dynamic_milestones, self.dynamic_intervals = \ + _calc_dynamic_intervals(self.interval, dynamic_intervals) + + def _decide_interval(self, runner): + if self.use_dynamic_intervals: + progress = runner.epoch if self.by_epoch else runner.iter + step = bisect.bisect(self.dynamic_milestones, (progress + 1)) + # Dynamically modify the evaluation interval + self.interval = self.dynamic_intervals[step - 1] + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + self._decide_interval(runner) + super().before_train_epoch(runner) + + def before_train_iter(self, runner): + self._decide_interval(runner) + super().before_train_iter(runner) + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. 
+ if self.broadcast_bn_buffer: + model = runner.model + for name, module in model.named_modules(): + if isinstance(module, + _BatchNorm) and module.track_running_stats: + dist.broadcast(module.running_var, 0) + dist.broadcast(module.running_mean, 0) + + if not self._should_evaluate(runner): + return + + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + + # from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test # to solve circlur import + from projects.mmdet3d_plugin.VAD.apis.test import custom_multi_gpu_test # to solve circlur import + + results = custom_multi_gpu_test( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + + key_score = self.evaluate(runner, results) + + if self.save_best: + self._save_ckpt(runner, key_score) + diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..f816974544b57c1561a1fc09b9cf9e48dde03e38 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/kitti2waymo.py @@ -0,0 +1,251 @@ +# Copyright (c) OpenMMLab. All rights reserved. +r"""Adapted from `Waymo to KITTI converter + `_. +""" + +try: + from waymo_open_dataset import dataset_pb2 as open_dataset + import mmcv + import numpy as np + import tensorflow as tf + from glob import glob + from os.path import join + from waymo_open_dataset import label_pb2 + from waymo_open_dataset.protos import metrics_pb2 +except ImportError: + #pass + raise ImportError( + 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' + 'to install the official devkit first.') + + + + +class KITTI2Waymo(object): + """KITTI predictions to Waymo converter. + This class serves as the converter to change predictions from KITTI to + Waymo format. + Args: + kitti_result_files (list[dict]): Predictions in KITTI format. + waymo_tfrecords_dir (str): Directory to load waymo raw data. + waymo_results_save_dir (str): Directory to save converted predictions + in waymo format (.bin files). + waymo_results_final_path (str): Path to save combined + predictions in waymo format (.bin file), like 'a/b/c.bin'. + prefix (str): Prefix of filename. In general, 0 for training, 1 for + validation and 2 for testing. + workers (str): Number of parallel processes. 
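Looking back at the evaluation hook, here is a standalone sketch of the milestone lookup implemented by `_calc_dynamic_intervals` and `_decide_interval`; the milestone/interval pairs are illustrative:

```python
import bisect

start_interval = 4                        # evaluate every 4 epochs initially
dynamic_interval_list = [(8, 2), (12, 1)]

milestones = [0] + [m for m, _ in dynamic_interval_list]
intervals = [start_interval] + [i for _, i in dynamic_interval_list]

for epoch in (3, 8, 11, 15):
    step = bisect.bisect(milestones, epoch + 1)
    print(f'epoch {epoch}: eval interval {intervals[step - 1]}')
```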
+ """ + + def __init__(self, + kitti_result_files, + waymo_tfrecords_dir, + waymo_results_save_dir, + waymo_results_final_path, + prefix, + workers=64): + + self.kitti_result_files = kitti_result_files + self.waymo_tfrecords_dir = waymo_tfrecords_dir + self.waymo_results_save_dir = waymo_results_save_dir + self.waymo_results_final_path = waymo_results_final_path + self.prefix = prefix + self.workers = int(workers) + self.name2idx = {} + for idx, result in enumerate(kitti_result_files): + if len(result['sample_idx']) > 0: + self.name2idx[str(result['sample_idx'][0])] = idx + + # turn on eager execution for older tensorflow versions + if int(tf.__version__.split('.')[0]) < 2: + tf.enable_eager_execution() + + self.k2w_cls_map = { + 'Car': label_pb2.Label.TYPE_VEHICLE, + 'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN, + 'Sign': label_pb2.Label.TYPE_SIGN, + 'Cyclist': label_pb2.Label.TYPE_CYCLIST, + } + + self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], + [-1.0, 0.0, 0.0, 0.0], + [0.0, -1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.0]]) + + self.get_file_names() + self.create_folder() + + def get_file_names(self): + """Get file names of waymo raw data.""" + self.waymo_tfrecord_pathnames = sorted( + glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) + print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') + + def create_folder(self): + """Create folder for data conversion.""" + mmcv.mkdir_or_exist(self.waymo_results_save_dir) + + def parse_objects(self, kitti_result, T_k2w, context_name, + frame_timestamp_micros): + """Parse one prediction with several instances in kitti format and + convert them to `Object` proto. + Args: + kitti_result (dict): Predictions in kitti format. + - name (np.ndarray): Class labels of predictions. + - dimensions (np.ndarray): Height, width, length of boxes. + - location (np.ndarray): Bottom center of boxes (x, y, z). + - rotation_y (np.ndarray): Orientation of boxes. + - score (np.ndarray): Scores of predictions. + T_k2w (np.ndarray): Transformation matrix from kitti to waymo. + context_name (str): Context name of the frame. + frame_timestamp_micros (int): Frame timestamp. + Returns: + :obj:`Object`: Predictions in waymo dataset Object proto. + """ + + def parse_one_object(instance_idx): + """Parse one instance in kitti format and convert them to `Object` + proto. + Args: + instance_idx (int): Index of the instance to be converted. + Returns: + :obj:`Object`: Predicted instance in waymo dataset \ + Object proto. 
+ """ + cls = kitti_result['name'][instance_idx] + length = round(kitti_result['dimensions'][instance_idx, 0], 4) + height = round(kitti_result['dimensions'][instance_idx, 1], 4) + width = round(kitti_result['dimensions'][instance_idx, 2], 4) + x = round(kitti_result['location'][instance_idx, 0], 4) + y = round(kitti_result['location'][instance_idx, 1], 4) + z = round(kitti_result['location'][instance_idx, 2], 4) + rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) + score = round(kitti_result['score'][instance_idx], 4) + + # y: downwards; move box origin from bottom center (kitti) to + # true center (waymo) + y -= height / 2 + # frame transformation: kitti -> waymo + x, y, z = self.transform(T_k2w, x, y, z) + + # different conventions + heading = -(rotation_y + np.pi / 2) + while heading < -np.pi: + heading += 2 * np.pi + while heading > np.pi: + heading -= 2 * np.pi + + box = label_pb2.Label.Box() + box.center_x = x + box.center_y = y + box.center_z = z + box.length = length + box.width = width + box.height = height + box.heading = heading + + o = metrics_pb2.Object() + o.object.box.CopyFrom(box) + o.object.type = self.k2w_cls_map[cls] + o.score = score + + o.context_name = context_name + o.frame_timestamp_micros = frame_timestamp_micros + + return o + + objects = metrics_pb2.Objects() + + for instance_idx in range(len(kitti_result['name'])): + o = parse_one_object(instance_idx) + objects.objects.append(o) + + return objects + + def convert_one(self, file_idx): + """Convert action for single file. + Args: + file_idx (int): Index of the file to be converted. + """ + file_pathname = self.waymo_tfrecord_pathnames[file_idx] + file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') + + for frame_num, frame_data in enumerate(file_data): + frame = open_dataset.Frame() + frame.ParseFromString(bytearray(frame_data.numpy())) + filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' + + for camera in frame.context.camera_calibrations: + # FRONT = 1, see dataset.proto for details + if camera.name == 1: + T_front_cam_to_vehicle = np.array( + camera.extrinsic.transform).reshape(4, 4) + + T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam + + context_name = frame.context.name + frame_timestamp_micros = frame.timestamp_micros + + if filename in self.name2idx: + kitti_result = \ + self.kitti_result_files[self.name2idx[filename]] + objects = self.parse_objects(kitti_result, T_k2w, context_name, + frame_timestamp_micros) + else: + print(filename, 'not found.(bevformer)') + objects = metrics_pb2.Objects() + + with open( + join(self.waymo_results_save_dir, f'{filename}.bin'), + 'wb') as f: + f.write(objects.SerializeToString()) + + def convert(self): + """Convert action.""" + print('Start converting ...') + mmcv.track_parallel_progress(self.convert_one, range(len(self)), + self.workers) + print('\nFinished ...') + + # combine all files into one .bin + pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) + combined = self.combine(pathnames) + + with open(self.waymo_results_final_path, 'wb') as f: + f.write(combined.SerializeToString()) + + def __len__(self): + """Length of the filename list.""" + return len(self.waymo_tfrecord_pathnames) + + def transform(self, T, x, y, z): + """Transform the coordinates with matrix T. + Args: + T (np.ndarray): Transformation matrix. + x(float): Coordinate in x axis. + y(float): Coordinate in y axis. + z(float): Coordinate in z axis. + Returns: + list: Coordinates after transformation. 
+ """ + pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) + pt_aft = np.matmul(T, pt_bef) + return pt_aft[:3].flatten().tolist() + + def combine(self, pathnames): + """Combine predictions in waymo format for each sample together. + Args: + pathnames (str): Paths to save predictions. + Returns: + :obj:`Objects`: Combined predictions in Objects proto. + """ + combined = metrics_pb2.Objects() + + for pathname in pathnames: + objects = metrics_pb2.Objects() + with open(pathname, 'rb') as f: + objects.ParseFromString(f.read()) + for o in objects.objects: + combined.objects.append(o) + + return combined \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/core/evaluation/metric_motion.py b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/metric_motion.py new file mode 100644 index 0000000000000000000000000000000000000000..8219438cda0ad6733871a8b23515d3ff470439ce --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/core/evaluation/metric_motion.py @@ -0,0 +1,70 @@ +# + +"""This module evaluates the forecasted trajectories against the ground truth.""" + +import math +from typing import Dict, List, Optional + +import numpy as np +import torch + +LOW_PROB_THRESHOLD_FOR_METRICS = 0.05 + + +def get_ade(forecasted_trajectory: torch.Tensor, gt_trajectory: torch.Tensor) -> float: + """Compute Average Displacement Error. + Args: + forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + Returns: + ade: Average Displacement Error + """ + pred_len = forecasted_trajectory.shape[0] + ade = float( + sum( + torch.sqrt( + (forecasted_trajectory[i, 0] - gt_trajectory[i, 0]) ** 2 + + (forecasted_trajectory[i, 1] - gt_trajectory[i, 1]) ** 2 + ) + for i in range(pred_len) + ) + / pred_len + ) + return ade + +def get_best_preds( + forecasted_trajectory: torch.Tensor, + gt_trajectory: torch.Tensor +) -> float: + """Compute min Average Displacement Error. + Args: + forecasted_trajectory: Predicted trajectory with shape [k, fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + gt_fut_masks: Ground truth traj mask with shape (fut_ts) + Returns: + best_forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + """ + + # [k, fut_ts] + dist = torch.linalg.norm(gt_trajectory[None] - forecasted_trajectory, dim=-1) + dist = dist[..., -1] + dist[torch.isnan(dist)] = 0 + min_mode_idx = torch.argmin(dist, dim=-1) + + return forecasted_trajectory[min_mode_idx] + +def get_fde(forecasted_trajectory: torch.Tensor, gt_trajectory: torch.Tensor) -> float: + """Compute Final Displacement Error. 
+ Args: + forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + Returns: + fde: Final Displacement Error + """ + fde = float( + torch.sqrt( + (forecasted_trajectory[-1, 0] - gt_trajectory[-1, 0]) ** 2 + + (forecasted_trajectory[-1, 1] - gt_trajectory[-1, 1]) ** 2 + ) + ) + return fde diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__init__.py b/GenAD-main/projects/mmdet3d_plugin/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..480874fd8759bd989a82de2803f8b872e9238124 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/__init__.py @@ -0,0 +1,6 @@ +from .nuscenes_vad_dataset import VADCustomNuScenesDataset + + +__all__ = [ + 'VADCustomNuScenesDataset' +] diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1757bb3b960f999a9778456293e1f9dbdc3affd2 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/builder.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/builder.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a0a975e4f163e41f5c17f500fa8636d507606ae Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/builder.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/nuscenes_vad_dataset.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/nuscenes_vad_dataset.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..424216653d97efa21f41a3ea6794ee2df9d8a29a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/nuscenes_vad_dataset.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/vad_custom_nuscenes_eval.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/vad_custom_nuscenes_eval.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..823113b695772c5080799170813c1c389fdfca0b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/__pycache__/vad_custom_nuscenes_eval.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/builder.py b/GenAD-main/projects/mmdet3d_plugin/datasets/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..007c988561194d54581f6e40255f3f20e6087aa7 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/builder.py @@ -0,0 +1,151 @@ + +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import Registry, build_from_cfg +from torch.utils.data import DataLoader + +from mmdet.datasets.samplers import GroupSampler +from projects.mmdet3d_plugin.datasets.samplers.group_sampler import DistributedGroupSampler +from projects.mmdet3d_plugin.datasets.samplers.distributed_sampler import DistributedSampler +from projects.mmdet3d_plugin.datasets.samplers.sampler import build_sampler + +FUSERS = Registry("fusers") + +def build_fuser(cfg): + return FUSERS.build(cfg) + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + shuffler_sampler=None, + nonshuffler_sampler=None, + **kwargs): + """Build PyTorch DataLoader. + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + kwargs: any keyword argument to be used to initialize DataLoader + Returns: + DataLoader: A PyTorch dataloader. + """ + rank, world_size = get_dist_info() + if dist: + # DistributedGroupSampler will definitely shuffle the data to satisfy + # that images on each GPU are in the same group + if shuffle: + sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'), + dict( + dataset=dataset, + samples_per_gpu=samples_per_gpu, + num_replicas=world_size, + rank=rank, + seed=seed) + ) + + else: + sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'), + dict( + dataset=dataset, + num_replicas=world_size, + rank=rank, + shuffle=shuffle, + seed=seed) + ) + + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + # assert False, 'not support in bevformer' + print('WARNING!!!!, Only can be used for obtain inference speed!!!!') + sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=False, + worker_init_fn=init_fn, + **kwargs) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + + +# Copyright (c) OpenMMLab. All rights reserved. 
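A standalone illustration of the per-worker seeding scheme in `worker_init_fn` above: every dataloader worker on every rank gets a distinct, reproducible seed (the seed value is arbitrary):

```python
num_workers, seed = 4, 2023
for rank in range(2):
    for worker_id in range(num_workers):
        worker_seed = num_workers * rank + worker_id + seed
        print(f'rank {rank}, worker {worker_id} -> seed {worker_seed}')
```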
+import platform +from mmcv.utils import Registry, build_from_cfg + +from mmdet.datasets import DATASETS +from mmdet.datasets.builder import _concat_dataset + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +OBJECTSAMPLERS = Registry('Object sampler') + + +def custom_build_dataset(cfg, default_args=None): + from mmdet3d.datasets.dataset_wrappers import CBGSDataset + from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, + ConcatDataset, RepeatDataset) + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'ConcatDataset': + dataset = ConcatDataset( + [custom_build_dataset(c, default_args) for c in cfg['datasets']], + cfg.get('separate_eval', True)) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + custom_build_dataset(cfg['dataset'], default_args), cfg['times']) + elif cfg['type'] == 'ClassBalancedDataset': + dataset = ClassBalancedDataset( + custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) + elif cfg['type'] == 'CBGSDataset': + dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args)) + elif isinstance(cfg.get('ann_file'), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/detection_cvpr_2019.json b/GenAD-main/projects/mmdet3d_plugin/datasets/detection_cvpr_2019.json new file mode 100644 index 0000000000000000000000000000000000000000..809ba46f76ebf09cc572c209bfddd94b7ee68084 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/detection_cvpr_2019.json @@ -0,0 +1,21 @@ +{ + "class_range": { + "car": 50, + "truck": 50, + "bus": 50, + "trailer": 50, + "construction_vehicle": 50, + "pedestrian": 40, + "motorcycle": 40, + "bicycle": 40, + "traffic_cone": 30, + "barrier": 30 + }, + "dist_fcn": "center_distance", + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "max_boxes_per_sample": 500, + "mean_ap_weight": 5 +} diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/mean_ap.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/mean_ap.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f17c5e4128e22d60233ae43699557f0460269286 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/mean_ap.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15c9a6947be454468ee21799974f7e840da1f00b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp_chamfer.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp_chamfer.cpython-38.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..064896ee5a0a3b2d78094397fe6c964489526f0a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/__pycache__/tpfp_chamfer.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py new file mode 100644 index 0000000000000000000000000000000000000000..023260659c4376af4dd4863880648e1c287c88bb --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/mean_ap.py @@ -0,0 +1,389 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from multiprocessing import Pool +from shapely.geometry import LineString, Polygon +import mmcv +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable +import json +from os import path as osp +import os +from functools import partial +from .tpfp import tpfp_gen, custom_tpfp_gen + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + +def get_cls_results(gen_results, + annotations, + num_sample=100, + num_pred_pts_per_instance=30, + eval_use_same_gt_sample_num_flag=False, + class_id=0, + fix_interval=False): + """Get det results and gt information of a certain class. + + Args: + gen_results (list[list]): Same as `eval_map()`. + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. 
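A toy precision/recall curve run through `average_precision` above, comparing the two supported modes (values are made up):

```python
import numpy as np

recalls = np.array([0.1, 0.4, 0.8, 1.0], dtype=np.float32)
precisions = np.array([1.0, 0.9, 0.7, 0.5], dtype=np.float32)

print(average_precision(recalls, precisions, mode='area'))      # area under the PR envelope
print(average_precision(recalls, precisions, mode='11points'))  # mean precision at recall 0, 0.1, ..., 1
```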
+ + Returns: + tuple[list[np.ndarray]]: detected bboxes, gt bboxes + """ + # if len(gen_results) == 0 or + + cls_gens, cls_scores = [], [] + for res in gen_results['vectors']: + if res['type'] == class_id: + if len(res['pts']) < 2: + continue + if not eval_use_same_gt_sample_num_flag: + sampled_points = np.array(res['pts']) + else: + line = res['pts'] + line = LineString(line) + + if fix_interval: + distances = list(np.arange(1., line.length, 1.)) + distances = [0,] + distances + [line.length,] + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + else: + distances = np.linspace(0, line.length, num_sample) + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + + cls_gens.append(sampled_points) + cls_scores.append(res['confidence_level']) + num_res = len(cls_gens) + if num_res > 0: + cls_gens = np.stack(cls_gens).reshape(num_res,-1) + cls_scores = np.array(cls_scores)[:,np.newaxis] + cls_gens = np.concatenate([cls_gens,cls_scores],axis=-1) + # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') + else: + if not eval_use_same_gt_sample_num_flag: + cls_gens = np.zeros((0,num_pred_pts_per_instance*2+1)) + else: + cls_gens = np.zeros((0,num_sample*2+1)) + # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') + + cls_gts = [] + for ann in annotations['vectors']: + if ann['type'] == class_id: + # line = ann['pts'] + np.array((1,1)) # for hdmapnet + line = ann['pts'] + # line = ann['pts'].cumsum(0) + line = LineString(line) + distances = np.linspace(0, line.length, num_sample) + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + + cls_gts.append(sampled_points) + num_gts = len(cls_gts) + if num_gts > 0: + cls_gts = np.stack(cls_gts).reshape(num_gts,-1) + else: + cls_gts = np.zeros((0,num_sample*2)) + return cls_gens, cls_gts + # ones = np.ones((num_gts,1)) + # tmp_cls_gens = np.concatenate([cls_gts,ones],axis=-1) + # return tmp_cls_gens, cls_gts + +def format_res_gt_by_classes(result_path, + gen_results, + annotations, + cls_names=None, + num_pred_pts_per_instance=30, + eval_use_same_gt_sample_num_flag=False, + pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0], + nproc=24): + assert cls_names is not None + timer = mmcv.Timer() + num_fixed_sample_pts = 100 + fix_interval = False + print('results path: {}'.format(result_path)) + + output_dir = osp.join(*osp.split(result_path)[:-1]) + assert len(gen_results) == len(annotations) + + pool = Pool(nproc) + cls_gens, cls_gts = {}, {} + print('Formatting ...') + formatting_file = 'cls_formatted.pkl' + formatting_file = osp.join(output_dir,formatting_file) + + # for vis + if False: + from PIL import Image + import matplotlib.pyplot as plt + from matplotlib import transforms + from matplotlib.patches import Rectangle + + show_dir = osp.join(output_dir,'vis_json') + mmcv.mkdir_or_exist(osp.abspath(show_dir)) + # import pdb;pdb.set_trace() + car_img = Image.open('./figs/lidar_car.png') + colors_plt = ['r', 'b', 'g'] + for i in range(20): + + plt.figure(figsize=(2, 4)) + plt.xlim(pc_range[0], pc_range[3]) + plt.ylim(pc_range[1], pc_range[4]) + plt.axis('off') + + for line in gen_results[i]['vectors']: + l = np.array(line['pts']) + plt.plot(l[:,0],l[:,1],'-', + # color=colors[line['type']] + color = 'red', + ) + + for line in annotations[i]['vectors']: + # l = np.array(line['pts']) + np.array((1,1)) + l = np.array(line['pts']) + # l = line['pts'] + 
plt.plot(l[:,0],l[:,1],'-', + # color=colors[line['type']], + color = 'blue', + ) + plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5]) + map_path = osp.join(show_dir, 'COMPARE_MAP_{}.jpg'.format(i)) + plt.savefig(map_path, bbox_inches='tight', dpi=400) + plt.close() + + for i, clsname in enumerate(cls_names): + + gengts = pool.starmap( + partial(get_cls_results, num_sample=num_fixed_sample_pts, + num_pred_pts_per_instance=num_pred_pts_per_instance, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,class_id=i,fix_interval=fix_interval), + zip(list(gen_results.values()), annotations)) + # gengts = map(partial(get_cls_results, num_sample=num_fixed_sample_pts, class_id=i,fix_interval=fix_interval), + # zip(gen_results, annotations)) + # import pdb;pdb.set_trace() + gens, gts = tuple(zip(*gengts)) + cls_gens[clsname] = gens + cls_gts[clsname] = gts + + mmcv.dump([cls_gens, cls_gts],formatting_file) + print('Cls data formatting done in {:2f}s!! with {}'.format(float(timer.since_start()),formatting_file)) + pool.close() + return cls_gens, cls_gts + +def eval_map(gen_results, + annotations, + cls_gens, + cls_gts, + threshold=0.5, + cls_names=None, + logger=None, + tpfp_fn=None, + pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0], + metric=None, + num_pred_pts_per_instance=30, + nproc=24): + timer = mmcv.Timer() + pool = Pool(nproc) + + eval_results = [] + + for i, clsname in enumerate(cls_names): + + # get gt and det bboxes of this class + cls_gen = cls_gens[clsname] + cls_gt = cls_gts[clsname] + # choose proper function according to datasets to compute tp and fp + # XXX + # func_name = cls2func[clsname] + # tpfp_fn = tpfp_fn_dict[tpfp_fn_name] + tpfp_fn = custom_tpfp_gen + # Trick for serialized + # only top-level function can be serized + # somehow use partitial the return function is defined + # at the top level. 
+ + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold, metric=metric) + # import pdb; pdb.set_trace() + # TODO this is a hack + tpfp_fn = partial(tpfp_fn, threshold=threshold, metric=metric) + args = [] + # compute tp and fp for each image with multiple processes + tpfp = pool.starmap( + tpfp_fn, + zip(cls_gen, cls_gt, *args)) + # import pdb;pdb.set_trace() + tp, fp = tuple(zip(*tpfp)) + + + + # map_results = map( + # tpfp_fn, + # cls_gen, cls_gt) + # tp, fp = tuple(map(list, zip(*map_results))) + + + # debug and testing + # for i in range(len(cls_gen)): + # # print(i) + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) + # print(i) + # tpfp = (tpfp,) + # print(tpfp) + # i = 0 + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) + # import pdb; pdb.set_trace() + + # XXX + + num_gts = 0 + for j, bbox in enumerate(cls_gt): + num_gts += bbox.shape[0] + + # sort all det bboxes by score, also sort tp and fp + # import pdb;pdb.set_trace() + cls_gen = np.vstack(cls_gen) + num_dets = cls_gen.shape[0] + sort_inds = np.argsort(-cls_gen[:, -1]) #descending, high score front + tp = np.hstack(tp)[sort_inds] + fp = np.hstack(fp)[sort_inds] + + # calculate recall and precision with tp and fp + # num_det*num_res + tp = np.cumsum(tp, axis=0) + fp = np.cumsum(fp, axis=0) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts, eps) + precisions = tp / np.maximum((tp + fp), eps) + + # calculate AP + # if dataset != 'voc07' else '11points' + mode = 'area' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + print('cls:{} done in {:2f}s!!'.format(clsname,float(timer.since_last_check()))) + pool.close() + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if len(aps) else 0.0 + + print_map_summary( + mean_ap, eval_results, class_name=cls_names, logger=logger) + + return mean_ap, eval_results + + + +def print_map_summary(mean_ap, + results, + class_name=None, + scale_ranges=None, + logger=None): + """Print mAP and results of each class. + + A table will be printed to show the gts/dets/recall/AP of each class and + the mAP. + + Args: + mean_ap (float): Calculated from `eval_map()`. + results (list[dict]): Calculated from `eval_map()`. + dataset (list[str] | str | None): Dataset name or dataset classes. + scale_ranges (list[tuple] | None): Range of scales to be evaluated. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmcv.utils.print_log()` for details. Default: None. 
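The bookkeeping inside `eval_map` reduces to the usual cumulative tp/fp sweep; a minimal sketch under made-up match flags (detections assumed already sorted by score, `average_precision` as defined above):

```python
import numpy as np

tp = np.array([1, 0, 1, 0], dtype=np.float32)   # per-detection match flags
fp = 1.0 - tp
num_gts = 3

tp_cum, fp_cum = np.cumsum(tp), np.cumsum(fp)
eps = np.finfo(np.float32).eps
recalls = tp_cum / np.maximum(num_gts, eps)
precisions = tp_cum / np.maximum(tp_cum + fp_cum, eps)
print(recalls, precisions)
print(average_precision(recalls, precisions, 'area'))
```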
+ """ + + if logger == 'silent': + return + + if isinstance(results[0]['ap'], np.ndarray): + num_scales = len(results[0]['ap']) + else: + num_scales = 1 + + if scale_ranges is not None: + assert len(scale_ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + label_names = class_name + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + + header = ['class', 'gts', 'dets', 'recall', 'ap'] + for i in range(num_scales): + if scale_ranges is not None: + print_log(f'Scale range {scale_ranges[i]}', logger=logger) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py new file mode 100644 index 0000000000000000000000000000000000000000..14ab338023158e35a71592c1d82937317cc3f7fd --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp.py @@ -0,0 +1,363 @@ +import mmcv +import numpy as np + +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps +from .tpfp_chamfer import vec_iou, convex_iou, rbbox_iou, polyline_score, custom_polyline_score +from shapely.geometry import LineString, Polygon +# from vecmapnet_ops.ops.iou import convex_iou + +def tpfp_bbox(det_bboxes, + gt_bboxes, + gt_bbox_masks, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_dets = len(det_bboxes) + num_gts = len(gt_bboxes) + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] 
= 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2) + bbox_g = gt_bboxes.reshape(num_gts,-1,2) + bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2) + matrix = convex_iou(bbox_p,bbox_g,bbox_gm) + + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_rbbox(det_bboxes, + gt_bboxes, + gt_bbox_masks, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_dets = len(det_bboxes) + num_gts = len(gt_bboxes) + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2) + bbox_g = gt_bboxes.reshape(num_gts,-1,2) + bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2) + matrix = rbbox_iou(bbox_p,bbox_g,bbox_gm) + + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_det(det_bboxes, + gt_bboxes, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. 
The shape of + each array is (num_scales, m). + """ + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + matrix = vec_iou( + det_bboxes[:, :-1].reshape(num_dets,-1,2), + gt_bboxes.reshape(num_gts,-1,2)) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_gen(gen_lines, + gt_lines, + threshold=0.5, + metric='POR'): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_gens = gen_lines.shape[0] + num_gts = gt_lines.shape[0] + + # tp and fp + tp = np.zeros((num_gens), dtype=np.float32) + fp = np.zeros((num_gens), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_gens == 0: + return tp, fp + + gen_scores = gen_lines[:,-1] # n + # distance matrix: n x m + + # matrix = custom_polyline_score( + # gen_lines[:,:-1].reshape(num_gens,-1,2), + # gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + + # TODO MAY bug here + matrix = polyline_score( + gen_lines[:,:-1].reshape(num_gens,-1,2), + gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-gen_scores) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def custom_tpfp_gen(gen_lines, + gt_lines, + threshold=0.5, + metric='chamfer'): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). 
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + if metric == 'chamfer': + if threshold >0: + threshold= -threshold + # else: + # raise NotImplementedError + + # import pdb;pdb.set_trace() + num_gens = gen_lines.shape[0] + num_gts = gt_lines.shape[0] + + # tp and fp + tp = np.zeros((num_gens), dtype=np.float32) + fp = np.zeros((num_gens), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_gens == 0: + return tp, fp + + gen_scores = gen_lines[:,-1] # n + # distance matrix: n x m + + matrix = custom_polyline_score( + gen_lines[:,:-1].reshape(num_gens,-1,2), + gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-gen_scores) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py new file mode 100644 index 0000000000000000000000000000000000000000..db55fdd905de53a9033025ae0417f135858f2af8 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/map_utils/tpfp_chamfer.py @@ -0,0 +1,335 @@ +# from ..chamfer_dist import ChamferDistance +import numpy as np +from shapely.geometry import LineString, Polygon +from shapely.strtree import STRtree +from shapely.geometry import CAP_STYLE, JOIN_STYLE +from scipy.spatial import distance +import similaritymeasures + +# def chamfer_distance(pred_bbox, gt_bbox): + +# cd_dist_func = ChamferDistance.vec_cd_dist( +# pred, pred_mask, tgt, tgt_mask)() + + +def vec_iou(pred_lines, gt_lines): + ''' + each line with 1 meter width + pred_lines: num_preds, npts, 2 + gt_lines: num_gts, npts, 2 + ''' + + num_preds = pred_lines.shape[0] + num_gts = gt_lines.shape[0] + + pred_lines_shapely = \ + [LineString(i).buffer(1., + cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(1., + cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + for i in gt_lines] + + # construct tree + tree = STRtree(gt_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(gt_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(pred_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + gt_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[i, gt_id] = inter / union + + return iou_matrix + +def convex_iou(pred_lines, gt_lines, gt_mask): + ''' + each line 
with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + + num_preds = len(pred_lines) + num_gts = len(gt_lines) + + pred_lines_shapely = \ + [Polygon(i).convex_hull for i in pred_lines] + gt_lines_shapely =\ + [Polygon(i[m].reshape(-1,2)).convex_hull for i,m in zip(gt_lines,gt_mask)] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + +def rbbox_iou(pred_lines, gt_lines, gt_mask): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + + num_preds = len(pred_lines) + num_gts = len(gt_lines) + + pred_lines_shapely = \ + [Polygon(i).minimum_rotated_rectangle for i in pred_lines] + gt_lines_shapely =\ + [Polygon(i[m].reshape(-1,2)) for i,m in zip(gt_lines,gt_mask)] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + + +def polyline_score(pred_lines, gt_lines, linewidth=1., metric='POR'): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + positive_threshold = 1. + num_preds = len(pred_lines) + num_gts = len(gt_lines) + line_length = pred_lines.shape[1] + + # gt_lines = gt_lines + np.array((1.,1.)) + + pred_lines_shapely = \ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in gt_lines] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + if metric=='POR': + iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) + elif metric=='frechet': + iou_matrix = np.full((num_preds, num_gts), -100.) + elif metric=='chamfer': + iou_matrix = np.full((num_preds, num_gts), -100.) + elif metric=='chamfer_v2': + iou_matrix = np.full((num_preds, num_gts), -100.) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + if metric=='POR': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = (dist_mat.min(-1) < positive_threshold).sum() + valid_ba = (dist_mat.min(-2) < positive_threshold).sum() + + iou_matrix[pred_id, i] = min(valid_ba,valid_ab) / line_length + # iou_matrix[pred_id, i] = ((valid_ba+valid_ab)/2) / line_length + # assert iou_matrix[pred_id, i] <= 1. and iou_matrix[pred_id, i] >= 0. 
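+ # The 'frechet' and 'chamfer' branches below store *negated* distances, so a larger
+ # value still means a better match, and pred/gt pairs whose buffers never intersect
+ # keep the -100. sentinel that the score matrix was initialised with.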
+ elif metric=='frechet': + fdistance_1 = \ + -similaritymeasures.frechet_dist(pred_lines[pred_id], gt_lines[i]) + fdistance_2 = \ + -similaritymeasures.frechet_dist(pred_lines[pred_id][::-1], gt_lines[i]) + fdistance = max(fdistance_1,fdistance_2) + iou_matrix[pred_id, i] = fdistance + + elif metric=='chamfer': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = dist_mat.min(-1).sum() + valid_ba = dist_mat.min(-2).sum() + + iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/(2*line_length) + # if iou_matrix[pred_id, i] == 0: + # import ipdb; ipdb.set_trace() + elif metric=='chamfer_v2': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = dist_mat.min(-1).sum() + valid_ba = dist_mat.min(-2).sum() + + iou_matrix[pred_id, i] = -(valid_ba/pred_lines[pred_id].shape[0] + +valid_ab/gt_lines[i].shape[0])/2 + # if iou_matrix[pred_id, i] == 0: + # import ipdb; ipdb.set_trace() + + + # if True: + # import matplotlib.pyplot as plt + # print('pred num', num_preds) + # print('gt num', num_gts) + # for i in range(num_preds): + # plt.plot(pred_lines[i][:,0],pred_lines[i][:,1],'-',color='red',alpha=0.5) + # for i in range(num_gts): + # plt.plot(gt_lines[i][:,0],gt_lines[i][:,1],'-',color='blue',alpha=0.5) + # plt.savefig('test.png') + # plt.close() + return iou_matrix + + +def custom_polyline_score(pred_lines, gt_lines, linewidth=1., metric='chamfer'): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + if metric == 'iou': + linewidth = 1.0 + positive_threshold = 1. + num_preds = len(pred_lines) + num_gts = len(gt_lines) + line_length = pred_lines.shape[1] + + # gt_lines = gt_lines + np.array((1.,1.)) + + pred_lines_shapely = \ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in gt_lines] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + + if metric=='chamfer': + iou_matrix = np.full((num_preds, num_gts), -100.) 
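+ # -100. acts as a sentinel: pred/gt pairs whose buffered polylines never intersect keep
+ # this value, so they can never pass the (negated) chamfer threshold in custom_tpfp_gen.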
+ elif metric=='iou': + iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) + else: + raise NotImplementedError + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + if metric=='chamfer': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + # import pdb;pdb.set_trace() + valid_ab = dist_mat.min(-1).mean() + valid_ba = dist_mat.min(-2).mean() + + iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/2 + elif metric=='iou': + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + +if __name__ == '__main__': + import torch + + line1 = torch.tensor([ + [1, 5], [3, 5], [5, 5] + ]) + + line0 = torch.tensor([ + [3, 6], [4, 8], [5, 6] + ]) + + line2 = torch.tensor([ + [1, 4], [3, 4], [5, 4] + ]) + + line3 = torch.tensor([ + [4, 4], [3, 3], [5, 3] + ]) + + gt = torch.stack((line2, line3), dim=0).type(torch.float32) + pred = torch.stack((line0, line1), dim=0).type(torch.float32) + + # import ipdb; ipdb.set_trace() + import mmcv + # with mmcv.Timer(): + # gt = upsampler(gt, pts=10) + # pred = upsampler(pred, pts=10) + + import matplotlib.pyplot as plt + from shapely.geometry import LineString + from descartes import PolygonPatch + + iou_matrix = vec_iou(pred,gt) + print(iou_matrix) + # import pdb;pdb.set_trace() + score_matrix = custom_polyline_score(pred, gt, linewidth=1., metric='chamfer') + print(score_matrix) + fig, ax = plt.subplots() + for i in gt: + i = i.numpy() + plt.plot(i[:, 0], i[:, 1], 'o', color='red') + plt.plot(i[:, 0], i[:, 1], '-', color='red') + + dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + patch1 = PolygonPatch(dilated, fc='red', ec='red', alpha=0.5, zorder=-1) + ax.add_patch(patch1) + + for i in pred: + i = i.numpy() + plt.plot(i[:, 0], i[:, 1], 'o', color='blue') + plt.plot(i[:, 0], i[:, 1], '-', color='blue') + + dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + patch1 = PolygonPatch(dilated, fc='blue', ec='blue', alpha=0.5, zorder=-1) + ax.add_patch(patch1) + + + ax.axis('equal') + + + plt.savefig('test3.png') \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_eval.py b/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..6aa85e73cc18fd84765ee8ce4ead3cca06ed7128 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_eval.py @@ -0,0 +1,783 @@ +import argparse +import copy +import json +import os +import time +from typing import Tuple, Dict, Any +import torch +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionConfig +from nuscenes.eval.detection.evaluate import NuScenesEval +from pyquaternion import Quaternion + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.tracking.data_classes import TrackingBox +from nuscenes.utils.data_classes import Box +from nuscenes.utils.geometry_utils import points_in_box +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.eval.common.loaders import 
add_center_dist, filter_eval_boxes +import tqdm +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from torchvision.transforms.functional import rotate +import pycocotools.mask as mask_util +# from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from torchvision.transforms.functional import rotate +import cv2 +import argparse +import json +import os +import random +import time +from typing import Tuple, Dict, Any + +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.loaders import load_gt, add_center_dist, filter_eval_boxes +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.constants import TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ + DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D +from IPython import embed +import json +from typing import Any + +import numpy as np +from matplotlib import pyplot as plt + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.common.utils import boxes_to_sensor +from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList +from nuscenes.utils.data_classes import LidarPointCloud +from nuscenes.utils.geometry_utils import view_points + +import mmcv + + +Axis = Any + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. + :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are plotted. + rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. + if ax is None: + ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. 
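+ # tp is NaN for metrics that are undefined for a class (e.g. attribute and velocity
+ # errors for barriers and traffic cones), and no curve is drawn when the achieved
+ # recall never reaches min_recall.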
+ if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +class DetectionBox_modified(DetectionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index + + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. 
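+ # Only the box *center* (a single point) is projected here, so BoxVisibility.ALL and
+ # BoxVisibility.ANY are effectively equivalent; in_front additionally requires that
+ # point to be at least 0.1 m in front of the camera.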
+ + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + +def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False) \ + -> Tuple[EvalBoxes, Dict]: + """ + Loads object predictions from file. + :param result_path: Path to the .json result file provided by the user. + :param max_boxes_per_sample: Maximim number of boxes allowed per sample. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The deserialized results and meta data. + """ + + # Load from file and check that the format is correct. + # with open(result_path) as f: + # data = json.load(f) + data = mmcv.load(result_path) + assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \ + 'See https://www.nuscenes.org/object-detection for more information.' + + # Deserialize results and get meta data. + all_results = EvalBoxes.deserialize(data['results'], box_cls) + meta = data['meta'] + if verbose: + print("Loaded results from {}. Found detections for {} samples." + .format(result_path, len(all_results.sample_tokens))) + + # Check that each sample has no more than x predicted boxes. + for sample_token in all_results.sample_tokens: + assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \ + "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample + + return all_results, meta + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The GT boxes. + """ + + # Init. + if box_cls == DetectionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. 
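+ # create_splits_scenes() maps each split name (e.g. train, val, test, mini_train,
+ # mini_val) to the list of scene names it contains; samples are kept further below
+ # only if their scene belongs to the requested eval_split.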
+ splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. + tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionBox_modified: + # Get label name in detection task and filter unused labels. + detection_name = category_to_detection_name(sample_annotation['category_name']) + if detection_name is None: + continue + + # Get attribute_name. + attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. + attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token] + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' 
% box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
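+ # A box is kept only if its center projects into more than one of the six cameras,
+ # i.e. it lies in the overlap region between adjacent camera views.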
+ cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + new_box.translate(-np.array(cs_record['translation'])) + new_box.rotate(Quaternion(cs_record['rotation']).inverse) + + if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + count += 1 + # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + # count += 1 + + if count > 1: + with open('center_overlap.txt', 'a') as f: + try: + f.write(box.token + '\n') + except: + pass + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + verbose = True + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +class NuScenesEval_custom(NuScenesEval): + """ + Dummy class for backward-compatibility. Same as DetectionEval. + """ + + def __init__(self, + nusc: NuScenes, + config: DetectionConfig, + result_path: str, + eval_set: str, + output_dir: str = None, + verbose: bool = True, + overlap_test=False, + eval_mask=False, + data_infos=None + ): + """ + Initialize a DetectionEval object. + :param nusc: A NuScenes object. + :param config: A DetectionConfig object. + :param result_path: Path of the nuScenes JSON result file. + :param eval_set: The dataset split to evaluate on, e.g. train, val or test. + :param output_dir: Folder to save plots and results to. + :param verbose: Whether to print to stdout. + """ + + self.nusc = nusc + self.result_path = result_path + self.eval_set = eval_set + self.output_dir = output_dir + self.verbose = verbose + self.cfg = config + self.overlap_test = overlap_test + self.eval_mask = eval_mask + self.data_infos = data_infos + # Check result file exists. + assert os.path.exists(result_path), 'Error: The result file does not exist!' + + # Make dirs. + self.plot_dir = os.path.join(self.output_dir, 'plots') + if not os.path.isdir(self.output_dir): + os.makedirs(self.output_dir) + if not os.path.isdir(self.plot_dir): + os.makedirs(self.plot_dir) + + # Load data. 
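+ # Predictions are loaded as plain DetectionBox; ground truth uses DetectionBox_modified,
+ # which additionally carries the annotation token, visibility level and frame index.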
+ if verbose: + print('Initializing nuScenes detection evaluation') + self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, + verbose=verbose) + self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose) + + assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ + "Samples in split doesn't match samples in predictions." + + # Add center distances. + self.pred_boxes = add_center_dist(nusc, self.pred_boxes) + self.gt_boxes = add_center_dist(nusc, self.gt_boxes) + + # Filter boxes (distance, points per box, etc.). + + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [key for (key, value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + + def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. 
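+ # (an AP per class and distance threshold, plus TP metrics at the dist_th_tp threshold)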
+ # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMetrics(self.cfg) + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. + for metric_name in TP_METRICS: + metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath(detection_name + '_pr')) + + class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, + savepath=savepath(detection_name + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath('dist_pr_' + str(dist_th))) + + +if __name__ == "__main__": + + # Settings. + parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' 
+ 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + output_dir=output_dir_, verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) + #for index in range(1, 41): + # nusc_eval.update_gt(type_='ord', index=index) + # diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_vad_dataset.py b/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_vad_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9f73a76f2effd98be3590033bcf16b70124a40bf --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/nuscenes_vad_dataset.py @@ -0,0 +1,1934 @@ +import os +import json +import copy +import tempfile +from typing import Dict, List + +import numpy as np +from mmdet.datasets import DATASETS +from mmdet3d.datasets import NuScenesDataset +import pyquaternion +import mmcv +from os import path as osp +from mmdet.datasets import DATASETS +import torch +import numpy as np +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from .vad_custom_nuscenes_eval import NuScenesEval_custom +from nuscenes.eval.common.utils import center_distance +from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from mmcv.parallel import DataContainer as DC +import random +from mmdet3d.core import LiDARInstance3DBoxes +from nuscenes.utils.data_classes import Box as NuScenesBox +from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox +from shapely import affinity, ops +from shapely.geometry import LineString, box, MultiPolygon, MultiLineString +from mmdet.datasets.pipelines import to_tensor +from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer +from nuscenes.eval.detection.constants import DETECTION_NAMES + + +class LiDARInstanceLines(object): + """Line instance in LIDAR coordinates + + """ + def __init__(self, + instance_line_list, + sample_dist=1, + num_samples=250, + padding=False, + fixed_num=-1, + padding_value=-10000, + patch_size=None): + assert isinstance(instance_line_list, list) + assert patch_size is not None + if len(instance_line_list) != 0: + assert isinstance(instance_line_list[0], LineString) + self.patch_size = patch_size + self.max_x = self.patch_size[1] / 2 + self.max_y = self.patch_size[0] / 2 + self.sample_dist = sample_dist + self.num_samples = num_samples + self.padding = 
padding + self.fixed_num = fixed_num + self.padding_value = padding_value + + self.instance_list = instance_line_list + + @property + def start_end_points(self): + """ + return torch.Tensor([N,4]), in xstart, ystart, xend, yend form + """ + assert len(self.instance_list) != 0 + instance_se_points_list = [] + for instance in self.instance_list: + se_points = [] + se_points.extend(instance.coords[0]) + se_points.extend(instance.coords[-1]) + instance_se_points_list.append(se_points) + instance_se_points_array = np.array(instance_se_points_list) + instance_se_points_tensor = to_tensor(instance_se_points_array) + instance_se_points_tensor = instance_se_points_tensor.to( + dtype=torch.float32) + instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x) + instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y) + instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x) + instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y) + return instance_se_points_tensor + + @property + def bbox(self): + """ + return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form + """ + assert len(self.instance_list) != 0 + instance_bbox_list = [] + for instance in self.instance_list: + # bounds is bbox: [xmin, ymin, xmax, ymax] + instance_bbox_list.append(instance.bounds) + instance_bbox_array = np.array(instance_bbox_list) + instance_bbox_tensor = to_tensor(instance_bbox_array) + instance_bbox_tensor = instance_bbox_tensor.to( + dtype=torch.float32) + instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x) + instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y) + instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x) + instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y) + return instance_bbox_tensor + + @property + def fixed_num_sampled_points(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + instance_points_list.append(sampled_points) + instance_points_array = np.array(instance_points_list) + instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def fixed_num_sampled_points_ambiguity(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + instance_points_list.append(sampled_points) + 
instance_points_array = np.array(instance_points_list) + instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + instance_points_tensor = instance_points_tensor.unsqueeze(1) + return instance_points_tensor + + @property + def fixed_num_sampled_points_torch(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + # distances = np.linspace(0, instance.length, self.fixed_num) + # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + poly_pts = to_tensor(np.array(list(instance.coords))) + poly_pts = poly_pts.unsqueeze(0).permute(0,2,1) + sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True) + sampled_pts = sampled_pts.permute(0,2,1).squeeze(0) + instance_points_list.append(sampled_pts) + # instance_points_array = np.array(instance_points_list) + # instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = torch.stack(instance_points_list,dim=0) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def shift_fixed_num_sampled_points(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v1(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = 
fixed_num_pts[0].equal(fixed_num_pts[-1]) + pts_num = fixed_num_pts.shape[0] + shift_num = pts_num - 1 + if is_poly: + pts_to_shift = fixed_num_pts[:-1,:] + shift_pts_list = [] + if is_poly: + for shift_right_i in range(shift_num): + shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + if is_poly: + _, _, num_coords = shift_pts.shape + tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords)) + tmp_shift_pts[:,:-1,:] = shift_pts + tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] + shift_pts = tmp_shift_pts + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([shift_num-shift_pts.shape[0],pts_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v2(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + + if shifts_num > final_shift_num: + index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False) + multi_shifts_pts = multi_shifts_pts[index] + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < final_shift_num: + padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = 
torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v3(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + flip_pts_to_shift = np.flip(pts_to_shift, axis=0) + for shift_right_i in range(shift_num): + shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + # import pdb;pdb.set_trace() + if shifts_num > 2*final_shift_num: + index = np.random.choice(shift_num, final_shift_num, replace=False) + flip0_shifts_pts = multi_shifts_pts[index] + flip1_shifts_pts = multi_shifts_pts[index+shift_num] + multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0) + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num: + padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v4(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = 
self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + pts_num = fixed_num_pts.shape[0] + shift_num = pts_num - 1 + shift_pts_list = [] + if is_poly: + pts_to_shift = fixed_num_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) + flip_pts_to_shift = pts_to_shift.flip(0) + for shift_right_i in range(shift_num): + shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + if is_poly: + _, _, num_coords = shift_pts.shape + tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords)) + tmp_shift_pts[:,:-1,:] = shift_pts + tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] + shift_pts = tmp_shift_pts + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_torch(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points_torch + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + # @property + # def polyline_points(self): + # """ + # return [[x0,y0],[x1,y1],...] 
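Review note (illustrative, not part of the patch): the shift_fixed_num_sampled_points_* properties above all exploit the same fact — a closed polyline (first point equals last point) describes the same curve for every choice of start vertex, while an open polyline only has its two traversal directions. A minimal standalone sketch of that enumeration, using hypothetical names and plain NumPy:

import numpy as np

def enumerate_equivalent_orderings(pts):
    # pts: (N, 2); a closed polyline repeats its first point at the end
    if np.allclose(pts[0], pts[-1]):
        body = pts[:-1]                                              # drop duplicated endpoint
        rolled = [np.roll(body, k, axis=0) for k in range(len(body))]
        return [np.concatenate([r, r[:1]], axis=0) for r in rolled]  # re-close each version
    return [pts, pts[::-1]]                                          # open line: forward / reverse

square = np.array([[0., 0.], [1., 0.], [1., 1.], [0., 1.], [0., 0.]])
print(len(enumerate_equivalent_orderings(square)))                   # 4 equivalent start points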
+ # """ + # assert len(self.instance_list) != 0 + # for instance in self.instance_list: + + +class VectorizedLocalMap(object): + CLASS2LABEL = { + 'road_divider': 0, + 'lane_divider': 0, + 'ped_crossing': 1, + 'contours': 2, + 'others': -1 + } + def __init__(self, + dataroot, + patch_size, + map_classes=['divider','ped_crossing','boundary'], + line_classes=['road_divider', 'lane_divider'], + ped_crossing_classes=['ped_crossing'], + contour_classes=['road_segment', 'lane'], + sample_dist=1, + num_samples=250, + padding=False, + fixed_ptsnum_per_line=-1, + padding_value=-10000,): + ''' + Args: + fixed_ptsnum_per_line = -1 : no fixed num + ''' + super().__init__() + self.data_root = dataroot + self.MAPS = ['boston-seaport', 'singapore-hollandvillage', + 'singapore-onenorth', 'singapore-queenstown'] + self.vec_classes = map_classes + self.line_classes = line_classes + self.ped_crossing_classes = ped_crossing_classes + self.polygon_classes = contour_classes + self.nusc_maps = {} + self.map_explorer = {} + for loc in self.MAPS: + self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root, map_name=loc) + self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc]) + + self.patch_size = patch_size + self.sample_dist = sample_dist + self.num_samples = num_samples + self.padding = padding + self.fixed_num = fixed_ptsnum_per_line + self.padding_value = padding_value + + def gen_vectorized_samples(self, location, lidar2global_translation, lidar2global_rotation): + ''' + use lidar2global to get gt map layers + ''' + + map_pose = lidar2global_translation[:2] + rotation = Quaternion(lidar2global_rotation) + + patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1]) + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + # import pdb;pdb.set_trace() + vectors = [] + for vec_class in self.vec_classes: + if vec_class == 'divider': + line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes, location) + line_instances_dict = self.line_geoms_to_instances(line_geom) + for line_type, instances in line_instances_dict.items(): + for instance in instances: + vectors.append((instance, self.CLASS2LABEL.get(line_type, -1))) + elif vec_class == 'ped_crossing': + ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes, location) + # ped_vector_list = self.ped_geoms_to_vectors(ped_geom) + ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom) + # import pdb;pdb.set_trace() + for instance in ped_instance_list: + vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1))) + elif vec_class == 'boundary': + polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes, location) + # import pdb;pdb.set_trace() + poly_bound_list = self.poly_geoms_to_instances(polygon_geom) + # import pdb;pdb.set_trace() + for contour in poly_bound_list: + vectors.append((contour, self.CLASS2LABEL.get('contours', -1))) + else: + raise ValueError(f'WRONG vec_class: {vec_class}') + + # filter out -1 + filtered_vectors = [] + gt_pts_loc_3d = [] + gt_pts_num_3d = [] + gt_labels = [] + gt_instance = [] + for instance, type in vectors: + if type != -1: + gt_instance.append(instance) + gt_labels.append(type) + + gt_instance = LiDARInstanceLines(gt_instance,self.sample_dist, + self.num_samples, self.padding, self.fixed_num,self.padding_value, patch_size=self.patch_size) + + anns_results = dict( + gt_vecs_pts_loc=gt_instance, + gt_vecs_label=gt_labels, + + ) + # import pdb;pdb.set_trace() + return anns_results + + def get_map_geom(self, patch_box, 
patch_angle, layer_names, location): + map_geom = [] + for layer_name in layer_names: + if layer_name in self.line_classes: + # import pdb;pdb.set_trace() + geoms = self.get_divider_line(patch_box, patch_angle, layer_name, location) + # import pdb;pdb.set_trace() + # geoms = self.map_explorer[location]._get_layer_line(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + elif layer_name in self.polygon_classes: + geoms = self.get_contour_line(patch_box, patch_angle, layer_name, location) + # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + elif layer_name in self.ped_crossing_classes: + geoms = self.get_ped_crossing_line(patch_box, patch_angle, location) + # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + return map_geom + + def _one_type_line_geom_to_vectors(self, line_geom): + line_vectors = [] + + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_vectors.append(self.sample_pts_from_line(single_line)) + elif line.geom_type == 'LineString': + line_vectors.append(self.sample_pts_from_line(line)) + else: + raise NotImplementedError + return line_vectors + + def _one_type_line_geom_to_instances(self, line_geom): + line_instances = [] + + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_instances.append(single_line) + elif line.geom_type == 'LineString': + line_instances.append(line) + else: + raise NotImplementedError + return line_instances + + def poly_geoms_to_vectors(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def ped_poly_geoms_to_instances(self, ped_geom): + # import pdb;pdb.set_trace() + ped = ped_geom[0][1] + union_segments = ops.unary_union(ped) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + # local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if 
ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + + def poly_geoms_to_instances(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + def line_geoms_to_vectors(self, line_geom): + line_vectors_dict = dict() + for line_type, a_type_of_lines in line_geom: + one_type_vectors = self._one_type_line_geom_to_vectors(a_type_of_lines) + line_vectors_dict[line_type] = one_type_vectors + + return line_vectors_dict + def line_geoms_to_instances(self, line_geom): + line_instances_dict = dict() + for line_type, a_type_of_lines in line_geom: + one_type_instances = self._one_type_line_geom_to_instances(a_type_of_lines) + line_instances_dict[line_type] = one_type_instances + + return line_instances_dict + + def ped_geoms_to_vectors(self, ped_geom): + ped_geom = ped_geom[0][1] + union_ped = ops.unary_union(ped_geom) + if union_ped.geom_type != 'MultiPolygon': + union_ped = MultiPolygon([union_ped]) + + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + results = [] + for ped_poly in union_ped: + # rect = ped_poly.minimum_rotated_rectangle + ext = ped_poly.exterior + if not ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def get_contour_line(self,patch_box,patch_angle,layer_name,location): + if layer_name not in self.map_explorer[location].map_api.non_geometric_polygon_layers: + raise ValueError('{} is not a polygonal layer'.format(layer_name)) + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + + records = getattr(self.map_explorer[location].map_api, layer_name) + + polygon_list = [] + if layer_name == 'drivable_area': + for record in records: + polygons = [self.map_explorer[location].map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] + + for polygon in polygons: + new_polygon = 
polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + else: + for record in records: + polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token']) + + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + return polygon_list + + def get_divider_line(self,patch_box,patch_angle,layer_name,location): + if layer_name not in self.map_explorer[location].map_api.non_geometric_line_layers: + raise ValueError("{} is not a line layer".format(layer_name)) + + if layer_name == 'traffic_light': + return None + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + + line_list = [] + records = getattr(self.map_explorer[location].map_api, layer_name) + for record in records: + line = self.map_explorer[location].map_api.extract_line(record['line_token']) + if line.is_empty: # Skip lines without nodes. + continue + + new_line = line.intersection(patch) + if not new_line.is_empty: + new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False) + new_line = affinity.affine_transform(new_line, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + line_list.append(new_line) + + return line_list + + def get_ped_crossing_line(self, patch_box, patch_angle, location): + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + polygon_list = [] + records = getattr(self.map_explorer[location].map_api, 'ped_crossing') + # records = getattr(self.nusc_maps[location], 'ped_crossing') + for record in records: + polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token']) + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + return polygon_list + + def sample_pts_from_line(self, line): + if self.fixed_num < 0: + distances = np.arange(0, line.length, self.sample_dist) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + else: + # fixed number of points, so distance is line.length / self.fixed_num + distances = np.linspace(0, line.length, self.fixed_num) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + + # tmpdistances = np.linspace(0, line.length, 2) + # tmpsampled_points = np.array([list(line.interpolate(tmpdistance).coords) for tmpdistance in tmpdistances]).reshape(-1, 2) + # import pdb;pdb.set_trace() + # if 
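Review note (illustrative, not part of the patch): the fixed-number branch of sample_pts_from_line above reduces to interpolating a shapely LineString at evenly spaced arc-length positions. A self-contained example with arbitrary coordinates:

import numpy as np
from shapely.geometry import LineString

line = LineString([(0.0, 0.0), (4.0, 0.0), (4.0, 3.0)])    # total length 7.0
fixed_num = 5
distances = np.linspace(0, line.length, fixed_num)
sampled = np.array([list(line.interpolate(d).coords)
                    for d in distances]).reshape(-1, 2)
print(sampled.shape)                                        # (5, 2), endpoints included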
self.normalize: + # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) + + num_valid = len(sampled_points) + + if not self.padding or self.fixed_num > 0: + # fixed num sample can return now! + return sampled_points, num_valid + + # fixed distance sampling need padding! + num_valid = len(sampled_points) + + if self.fixed_num < 0: + if num_valid < self.num_samples: + padding = np.zeros((self.num_samples - len(sampled_points), 2)) + sampled_points = np.concatenate([sampled_points, padding], axis=0) + else: + sampled_points = sampled_points[:self.num_samples, :] + num_valid = self.num_samples + + # if self.normalize: + # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) + # num_valid = len(sampled_points) + + return sampled_points, num_valid + + +############################################################################################################### +############################################################################################################### +############################################################################################################### + +class v1CustomDetectionConfig: + """ Data class that specifies the detection evaluation settings. """ + + def __init__(self, + class_range_x: Dict[str, int], + class_range_y: Dict[str, int], + dist_fcn: str, + dist_ths: List[float], + dist_th_tp: float, + min_recall: float, + min_precision: float, + max_boxes_per_sample: int, + mean_ap_weight: int): + + assert set(class_range_x.keys()) == set(DETECTION_NAMES), "Class count mismatch." + assert dist_th_tp in dist_ths, "dist_th_tp must be in set of dist_ths." + + self.class_range_x = class_range_x + self.class_range_y = class_range_y + self.dist_fcn = dist_fcn + self.dist_ths = dist_ths + self.dist_th_tp = dist_th_tp + self.min_recall = min_recall + self.min_precision = min_precision + self.max_boxes_per_sample = max_boxes_per_sample + self.mean_ap_weight = mean_ap_weight + + self.class_names = self.class_range_y.keys() + + def __eq__(self, other): + eq = True + for key in self.serialize().keys(): + eq = eq and np.array_equal(getattr(self, key), getattr(other, key)) + return eq + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'class_range_x': self.class_range_x, + 'class_range_y': self.class_range_y, + 'dist_fcn': self.dist_fcn, + 'dist_ths': self.dist_ths, + 'dist_th_tp': self.dist_th_tp, + 'min_recall': self.min_recall, + 'min_precision': self.min_precision, + 'max_boxes_per_sample': self.max_boxes_per_sample, + 'mean_ap_weight': self.mean_ap_weight + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized dictionary. """ + return cls(content['class_range_x'], + content['class_range_y'], + content['dist_fcn'], + content['dist_ths'], + content['dist_th_tp'], + content['min_recall'], + content['min_precision'], + content['max_boxes_per_sample'], + content['mean_ap_weight']) + + @property + def dist_fcn_callable(self): + """ Return the distance function corresponding to the dist_fcn string. """ + if self.dist_fcn == 'center_distance': + return center_distance + else: + raise Exception('Error: Unknown distance function %s!' % self.dist_fcn) + +@DATASETS.register_module() +class VADCustomNuScenesDataset(NuScenesDataset): + r"""Custom NuScenes Dataset. 
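Side note for reviewers (illustrative, not part of the patch): v1CustomDetectionConfig.deserialize above expects a plain dict — in this dataset it comes from the '<custom_eval_version>.json' file loaded in __init__ below — whose class_range_x/class_range_y keys must cover every nuScenes detection name and whose dist_th_tp must appear in dist_ths. A hedged sketch of such a dict, with placeholder values only:

# Placeholder ranges for illustration; a real config tunes these per class.
names = ['car', 'truck', 'bus', 'trailer', 'construction_vehicle',
         'pedestrian', 'motorcycle', 'bicycle', 'traffic_cone', 'barrier']
content = {
    'class_range_x': {n: 30 for n in names},
    'class_range_y': {n: 30 for n in names},
    'dist_fcn': 'center_distance',
    'dist_ths': [0.5, 1.0, 2.0, 4.0],
    'dist_th_tp': 2.0,
    'min_recall': 0.1,
    'min_precision': 0.1,
    'max_boxes_per_sample': 500,
    'mean_ap_weight': 5,
}
# cfg = v1CustomDetectionConfig.deserialize(content)   # passes the key / threshold asserts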
+ """ + MAPCLASSES = ('divider',) + def __init__( + self, + queue_length=4, + bev_size=(200, 200), + overlap_test=False, + with_attr=True, + fut_ts=6, + pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + map_classes=None, + map_ann_file=None, + map_fixed_ptsnum_per_line=-1, + map_eval_use_same_gt_sample_num_flag=False, + padding_value=-10000, + use_pkl_result=False, + custom_eval_version='vad_nusc_detection_cvpr_2019', + *args, + **kwargs + ): + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.overlap_test = overlap_test + self.bev_size = bev_size + self.with_attr = with_attr + self.fut_ts = fut_ts + self.use_pkl_result = use_pkl_result + + self.custom_eval_version = custom_eval_version + # Check if config exists. + this_dir = os.path.dirname(os.path.abspath(__file__)) + cfg_path = os.path.join(this_dir, '%s.json' % self.custom_eval_version) + assert os.path.exists(cfg_path), \ + 'Requested unknown configuration {}'.format(self.custom_eval_version) + # Load config file and deserialize it. + with open(cfg_path, 'r') as f: + data = json.load(f) + self.custom_eval_detection_configs = v1CustomDetectionConfig.deserialize(data) + + self.map_ann_file = map_ann_file + self.MAPCLASSES = self.get_map_classes(map_classes) + self.NUM_MAPCLASSES = len(self.MAPCLASSES) + self.pc_range = pc_range + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + self.patch_size = (patch_h, patch_w) + self.padding_value = padding_value + self.fixed_num = map_fixed_ptsnum_per_line + self.eval_use_same_gt_sample_num_flag = map_eval_use_same_gt_sample_num_flag + self.vector_map = VectorizedLocalMap(kwargs['data_root'], + patch_size=self.patch_size, map_classes=self.MAPCLASSES, + fixed_ptsnum_per_line=map_fixed_ptsnum_per_line, + padding_value=self.padding_value) + self.is_vis_on_test = True + + @classmethod + def get_map_classes(cls, map_classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Return: + list[str]: A list of class names. 
+ """ + if map_classes is None: + return cls.MAPCLASSES + + if isinstance(map_classes, str): + # take it as a file path + class_names = mmcv.list_from_file(map_classes) + elif isinstance(map_classes, (tuple, list)): + class_names = map_classes + else: + raise ValueError(f'Unsupported type {type(map_classes)} of map classes.') + + return class_names + + def vectormap_pipeline(self, example, input_dict): + ''' + `example` type: + keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img'; + all keys type is 'DataContainer'; + 'img_metas' cpu_only=True, type is dict, others are false; + 'gt_labels_3d' shape torch.size([num_samples]), stack=False, + padding_value=0, cpu_only=False + 'gt_bboxes_3d': stack=False, cpu_only=True + ''' + # import pdb;pdb.set_trace() + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix + lidar2ego[:3, 3] = input_dict['lidar2ego_translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix + ego2global[:3, 3] = input_dict['ego2global_translation'] + + lidar2global = ego2global @ lidar2ego + + lidar2global_translation = list(lidar2global[:3,3]) + lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) + + location = input_dict['map_location'] + ego2global_translation = input_dict['ego2global_translation'] + ego2global_rotation = input_dict['ego2global_rotation'] + anns_results = self.vector_map.gen_vectorized_samples( + location, lidar2global_translation, lidar2global_rotation + ) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except: + # empty tensor, will be passed in train, + # but we preserve it for test + gt_vecs_pts_loc = gt_vecs_pts_loc + + example['map_gt_labels_3d'] = DC(gt_vecs_label, cpu_only=False) + example['map_gt_bboxes_3d'] = DC(gt_vecs_pts_loc, cpu_only=True) + + return example + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. 
+ """ + data_queue = [] + + # temporal aug + prev_indexs_list = list(range(index-self.queue_length, index)) + random.shuffle(prev_indexs_list) + prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True) + ## + + input_dict = self.get_data_info(index) + if input_dict is None: + return None + frame_idx = input_dict['frame_idx'] + scene_token = input_dict['scene_token'] + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + example = self.vectormap_pipeline(example,input_dict) + if self.filter_empty_gt and \ + ((example is None or ~(example['gt_labels_3d']._data != -1).any()) or \ + (example is None or ~(example['map_gt_labels_3d']._data != -1).any())): + return None + data_queue.insert(0, example) + for i in prev_indexs_list: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + if input_dict['frame_idx'] < frame_idx and input_dict['scene_token'] == scene_token: + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + example = self.vectormap_pipeline(example,input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()) and \ + (example is None or ~(example['map_gt_labels_3d']._data != -1).any()): + return None + frame_idx = input_dict['frame_idx'] + data_queue.insert(0, copy.deepcopy(example)) + return self.union2one(data_queue) + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.is_vis_on_test: + example = self.vectormap_pipeline(example, input_dict) + return example + + def union2one(self, queue): + """ + convert sample queue into one single sample. + """ + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if i == 0: + metas_map[i]['prev_bev'] = False + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + + queue[-1]['img'] = DC(torch.stack(imgs_list), + cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. 
+ """ + info = self.data_infos[index] + # filter out bbox containing no points + if self.use_valid_flag: + mask = info['valid_flag'] + else: + mask = info['num_lidar_pts'] > 0 + gt_bboxes_3d = info['gt_boxes'][mask] + gt_names_3d = info['gt_names'][mask] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if self.with_velocity: + gt_velocity = info['gt_velocity'][mask] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + if self.with_attr: + gt_fut_trajs = info['gt_agent_fut_trajs'][mask] + gt_fut_masks = info['gt_agent_fut_masks'][mask] + gt_fut_goal = info['gt_agent_fut_goal'][mask] + gt_lcf_feat = info['gt_agent_lcf_feat'][mask] + gt_fut_yaw = info['gt_agent_fut_yaw'][mask] + attr_labels = np.concatenate( + [gt_fut_trajs, gt_fut_masks, gt_fut_goal[..., None], gt_lcf_feat, gt_fut_yaw], axis=-1 + ).astype(np.float32) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d, + attr_labels=attr_labels) + + return anns_results + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + ego2global_translation=info['ego2global_translation'], + ego2global_rotation=info['ego2global_rotation'], + lidar2ego_translation=info['lidar2ego_translation'], + lidar2ego_rotation=info['lidar2ego_rotation'], + prev_idx=info['prev'], + next_idx=info['next'], + scene_token=info['scene_token'], + can_bus=info['can_bus'], + frame_idx=info['frame_idx'], + timestamp=info['timestamp'] / 1e6, + fut_valid_flag=info['fut_valid_flag'], + map_location=info['map_location'], + ego_his_trajs=info['gt_ego_his_trajs'], + ego_fut_trajs=info['gt_ego_fut_trajs'], + ego_fut_masks=info['gt_ego_fut_masks'], + ego_fut_cmd=info['gt_ego_fut_cmd'], + ego_lcf_feat=info['gt_ego_lcf_feat'] + ) + # lidar to ego transform + lidar2ego = np.eye(4).astype(np.float32) + lidar2ego[:3, :3] = Quaternion(info["lidar2ego_rotation"]).rotation_matrix + lidar2ego[:3, 3] = info["lidar2ego_translation"] + input_dict["lidar2ego"] = lidar2ego + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + input_dict["camera2ego"] = [] + input_dict["camera_intrinsics"] = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + cam_intrinsics.append(viewpad) + lidar2cam_rts.append(lidar2cam_rt.T) + + # camera to ego transform + camera2ego = np.eye(4).astype(np.float32) + camera2ego[:3, :3] = Quaternion( + cam_info["sensor2ego_rotation"] + ).rotation_matrix + camera2ego[:3, 3] = cam_info["sensor2ego_translation"] + input_dict["camera2ego"].append(camera2ego) + # camera intrinsics + camera_intrinsics = np.eye(4).astype(np.float32) + camera_intrinsics[:3, :3] = cam_info["cam_intrinsic"] + input_dict["camera_intrinsics"].append(camera_intrinsics) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + # NOTE: now we load gt in test_mode for evaluating + # if not self.test_mode: + # annos = self.get_ann_info(index) + # input_dict['ann_info'] = annos + + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + rotation = Quaternion(input_dict['ego2global_rotation']) + translation = input_dict['ego2global_translation'] + can_bus = input_dict['can_bus'] + can_bus[:3] = translation + can_bus[3:7] = rotation + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + if patch_angle < 0: + patch_angle += 360 + can_bus[-2] = patch_angle / 180 * np.pi + can_bus[-1] = patch_angle + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix + lidar2ego[:3, 3] = input_dict['lidar2ego_translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix + ego2global[:3, 3] = input_dict['ego2global_translation'] + lidar2global = ego2global @ lidar2ego + 
input_dict['lidar2global'] = lidar2global + + return input_dict + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _format_gt(self): + gt_annos = [] + print('Start to convert gt map format...') + # assert self.map_ann_file is not None + if (not os.path.exists(self.map_ann_file)) : + dataset_length = len(self) + prog_bar = mmcv.ProgressBar(dataset_length) + mapped_class_names = self.MAPCLASSES + for sample_id in range(dataset_length): + sample_token = self.data_infos[sample_id]['token'] + gt_anno = {} + gt_anno['sample_token'] = sample_token + # gt_sample_annos = [] + gt_sample_dict = {} + gt_sample_dict = self.vectormap_pipeline(gt_sample_dict, self.data_infos[sample_id]) + gt_labels = gt_sample_dict['map_gt_labels_3d'].data.numpy() + gt_vecs = gt_sample_dict['map_gt_bboxes_3d'].data.instance_list + gt_vec_list = [] + for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)): + name = mapped_class_names[gt_label] + anno = dict( + pts=np.array(list(gt_vec.coords)), + pts_num=len(list(gt_vec.coords)), + cls_name=name, + type=gt_label, + ) + gt_vec_list.append(anno) + gt_anno['vectors']=gt_vec_list + gt_annos.append(gt_anno) + + prog_bar.update() + nusc_submissions = { + 'GTs': gt_annos + } + print('\n GT anns writes to', self.map_ann_file) + mmcv.dump(nusc_submissions, self.map_ann_file) + else: + print(f'{self.map_ann_file} exist, not update') + + def _format_bbox(self, results, jsonfile_prefix=None, score_thresh=0.2): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. 
+ """ + nusc_annos = {} + det_mapped_class_names = self.CLASSES + + # assert self.map_ann_file is not None + map_pred_annos = {} + map_mapped_class_names = self.MAPCLASSES + + plan_annos = {} + + print('Start to convert detection format...') + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + annos = [] + boxes = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + + plan_annos[sample_token] = [det['ego_fut_preds'], det['ego_fut_cmd']] + + boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, + det_mapped_class_names, + self.custom_eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + if box.score < score_thresh: + continue + name = det_mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr, + fut_traj=box.fut_trajs.tolist()) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + + + map_pred_anno = {} + vecs = output_to_vecs(det) + sample_token = self.data_infos[sample_id]['token'] + map_pred_anno['sample_token'] = sample_token + pred_vec_list=[] + for i, vec in enumerate(vecs): + name = map_mapped_class_names[vec['label']] + anno = dict( + # sample_token=sample_token, + pts=vec['pts'], + pts_num=len(vec['pts']), + cls_name=name, + type=vec['label'], + confidence_level=vec['score']) + pred_vec_list.append(anno) + # annos.append(nusc_anno) + # nusc_annos[sample_token] = annos + map_pred_anno['vectors'] = pred_vec_list + map_pred_annos[sample_token] = map_pred_anno + + if not os.path.exists(self.map_ann_file): + self._format_gt() + else: + print(f'{self.map_ann_file} exist, not update') + # with open(self.map_ann_file,'r') as f: + # GT_anns = json.load(f) + # gt_annos = GT_anns['GTs'] + + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + 'map_results': map_pred_annos, + 'plan_results': plan_annos + # 'GTs': gt_annos + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + if self.use_pkl_result: + res_path = osp.join(jsonfile_prefix, 'results_nusc.pkl') + else: + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. 
+ """ + if isinstance(results, dict): + # print(f'results must be a list, but get dict, keys={results.keys()}') + # assert isinstance(results, list) + results = results['bbox_results'] + assert isinstance(results, list) + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + if name == 'metric_results': + continue + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + map_metric='chamfer', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. 
+ """ + detail = dict() + from nuscenes import NuScenes + self.nusc = NuScenes(version=self.version, dataroot=self.data_root, + verbose=False) + + output_dir = osp.join(*osp.split(result_path)[:-1]) + + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + self.nusc_eval = NuScenesEval_custom( + self.nusc, + config=self.custom_eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=False, + overlap_test=self.overlap_test, + data_infos=self.data_infos + ) + self.nusc_eval.main(plot_examples=0, render_curves=False) + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + + + from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import eval_map + from projects.mmdet3d_plugin.datasets.map_utils.mean_ap import format_res_gt_by_classes + result_path = osp.abspath(result_path) + + print('Formating results & gts by classes') + pred_results = mmcv.load(result_path) + map_results = pred_results['map_results'] + gt_anns = mmcv.load(self.map_ann_file) + map_annotations = gt_anns['GTs'] + cls_gens, cls_gts = format_res_gt_by_classes(result_path, + map_results, + map_annotations, + cls_names=self.MAPCLASSES, + num_pred_pts_per_instance=self.fixed_num, + eval_use_same_gt_sample_num_flag=self.eval_use_same_gt_sample_num_flag, + pc_range=self.pc_range) + map_metrics = map_metric if isinstance(map_metric, list) else [map_metric] + allowed_metrics = ['chamfer', 'iou'] + for metric in map_metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + for metric in map_metrics: + print('-*'*10+f'use metric:{metric}'+'-*'*10) + if metric == 'chamfer': + thresholds = [0.5,1.0,1.5] + elif metric == 'iou': + thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES)) + for i, thr in enumerate(thresholds): + print('-*'*10+f'threshhold:{thr}'+'-*'*10) + mAP, cls_ap = eval_map( + map_results, + map_annotations, + cls_gens, + cls_gts, + threshold=thr, + cls_names=self.MAPCLASSES, + logger=logger, + num_pred_pts_per_instance=self.fixed_num, + pc_range=self.pc_range, + metric=metric) + for j in range(self.NUM_MAPCLASSES): + cls_aps[i, j] = cls_ap[j]['ap'] + for i, name in enumerate(self.MAPCLASSES): + print('{}: {}'.format(name, cls_aps.mean(0)[i])) + detail['NuscMap_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i] + print('map: {}'.format(cls_aps.mean(0).mean())) + detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean() + for i, name in enumerate(self.MAPCLASSES): + for j, thr in enumerate(thresholds): + if metric == 'chamfer': + detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + elif metric == 'iou': + if thr == 0.5 or thr == 0.75: + detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + 
+ return detail + + def evaluate(self, + results, + metric='bbox', + map_metric='chamfer', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + result_metric_names = ['EPA', 'ADE', 'FDE', 'MR'] + motion_cls_names = ['car', 'pedestrian'] + motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit', + 'fp', 'ADE', 'FDE', 'MR'] + all_metric_dict = {} + for met in motion_metric_names: + for cls in motion_cls_names: + all_metric_dict[met+'_'+cls] = 0.0 + result_dict = {} + for met in result_metric_names: + for cls in motion_cls_names: + result_dict[met+'_'+cls] = 0.0 + + alpha = 0.5 + + for i in range(len(results)): + for key in all_metric_dict.keys(): + all_metric_dict[key] += results[i]['metric_results'][key] + + for cls in motion_cls_names: + result_dict['EPA_'+cls] = (all_metric_dict['hit_'+cls] - \ + alpha * all_metric_dict['fp_'+cls]) / all_metric_dict['gt_'+cls] + result_dict['ADE_'+cls] = all_metric_dict['ADE_'+cls] / all_metric_dict['cnt_ade_'+cls] + result_dict['FDE_'+cls] = all_metric_dict['FDE_'+cls] / all_metric_dict['cnt_fde_'+cls] + result_dict['MR_'+cls] = all_metric_dict['MR_'+cls] / all_metric_dict['cnt_fde_'+cls] + + print('\n') + print('-------------- Motion Prediction --------------') + for k, v in result_dict.items(): + print(f'{k}: {v}') + + # NOTE: print planning metric + print('\n') + print('-------------- Planning --------------') + metric_dict = None + num_valid = 0 + for res in results: + if res['metric_results']['fut_valid_flag']: + num_valid += 1 + else: + continue + if metric_dict is None: + metric_dict = copy.deepcopy(res['metric_results']) + else: + for k in res['metric_results'].keys(): + metric_dict[k] += res['metric_results'][k] + + for k in metric_dict: + metric_dict[k] = metric_dict[k] / num_valid + print("{}:{}".format(k, metric_dict[k])) + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name], metric=metric, map_metric=map_metric) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files, metric=metric, map_metric=map_metric) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + + Args: + detection (dict): Detection results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. 
+ + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. + """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + trajs = detection['trajs_3d'].numpy() + + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box_yaw = -box_yaw - np.pi / 2 + + box_list = [] + for i in range(len(box3d)): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + velocity = (*box3d.tensor[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = CustomNuscenesBox( + center=box_gravity_center[i], + size=box_dims[i], + orientation=quat, + fut_trajs=trajs[i], + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from ego to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. 
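# Review note (illustrative, not part of the patch): the gate below keeps a box only
# if its ego-frame center satisfies |x| <= class_range_x[name] and |y| <= class_range_y[name].
# Reduced standalone check with placeholder numbers:
#
#     center = (38.0, 12.0)                     # ego-frame box center (x, y)
#     det_range_x, det_range_y = 30.0, 30.0     # hypothetical ranges for this class
#     keep = abs(center[0]) <= det_range_x and abs(center[1]) <= det_range_y
#     print(keep)                               # False: 38 m exceeds the 30 m x-range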
+ cls_range_x_map = eval_configs.class_range_x + cls_range_y_map = eval_configs.class_range_y + x_distance, y_distance = box.center[0], box.center[1] + det_range_x = cls_range_x_map[classes[box.label]] + det_range_y = cls_range_y_map[classes[box.label]] + if abs(x_distance) > det_range_x or abs(y_distance) > det_range_y: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list + +def output_to_vecs(detection): + box3d = detection['map_boxes_3d'].numpy() + scores = detection['map_scores_3d'].numpy() + labels = detection['map_labels_3d'].numpy() + pts = detection['map_pts_3d'].numpy() + + vec_list = [] + # import pdb;pdb.set_trace() + for i in range(box3d.shape[0]): + vec = dict( + bbox = box3d[i], # xyxy + label=labels[i], + score=scores[i], + pts=pts[i], + ) + vec_list.append(vec) + return vec_list \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__init__.py b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..44a3f9505af5288126fae2b76d7463e152308a85 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__init__.py @@ -0,0 +1,14 @@ +from .transform_3d import ( + PadMultiViewImage, NormalizeMultiviewImage, + PhotoMetricDistortionMultiViewImage, CustomCollect3D, + RandomScaleImageMultiViewImage, CustomObjectRangeFilter, CustomObjectNameFilter) +from .formating import CustomDefaultFormatBundle3D +from .loading import CustomLoadPointsFromFile, CustomLoadPointsFromMultiSweeps + +__all__ = [ + 'PadMultiViewImage', 'NormalizeMultiviewImage', + 'PhotoMetricDistortionMultiViewImage', 'CustomDefaultFormatBundle3D', + 'CustomCollect3D', 'RandomScaleImageMultiViewImage', + 'CustomObjectRangeFilter', 'CustomObjectNameFilter', + 'CustomLoadPointsFromFile', 'CustomLoadPointsFromMultiSweeps' +] \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ab2057b66504e1553500eef14ecd5468e823f13 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/formating.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/formating.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..635d2100451c9a23cd18068a9f1dfde8b862cd67 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/formating.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/loading.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/loading.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..148502c4aca9595d777fae36fd2f5cd7f21b4eb9 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/loading.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/transform_3d.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/transform_3d.cpython-38.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..8e6e9b3f6880f07c4fd4239e15c0f3e1a938dbbf Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/__pycache__/transform_3d.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/formating.py b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/formating.py new file mode 100644 index 0000000000000000000000000000000000000000..184cedddc739da4a5740520a20f973385436de6a --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/formating.py @@ -0,0 +1,55 @@ + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.parallel import DataContainer as DC + +from mmdet3d.core.bbox import BaseInstance3DBoxes +from mmdet3d.core.points import BasePoints +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import to_tensor +from mmdet3d.datasets.pipelines import DefaultFormatBundle3D + +@PIPELINES.register_module() +class CustomDefaultFormatBundle3D(DefaultFormatBundle3D): + """Default formatting bundle. + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + def __init__(self, class_names, with_gt=True, with_label=True, with_ego=True): + super(CustomDefaultFormatBundle3D, self).__init__(class_names, with_gt, with_label) + self.with_ego = with_ego + + + def __call__(self, results): + """Call function to transform and format common fields in results. + Args: + results (dict): Result dict contains the data to convert. + Returns: + dict: The result dict contains the data that is formatted with + default bundle. 
+ """ + # Format 3D data + results = super(CustomDefaultFormatBundle3D, self).__call__(results) + # results['gt_map_masks'] = DC(to_tensor(results['gt_map_masks']), stack=True) + if self.with_ego: + if 'ego_his_trajs' in results: + results['ego_his_trajs'] = DC(to_tensor(results['ego_his_trajs'][None, ...]), stack=True) + if 'ego_fut_trajs' in results: + results['ego_fut_trajs'] = DC(to_tensor(results['ego_fut_trajs'][None, ...]), stack=True) + if 'ego_fut_masks' in results: + results['ego_fut_masks'] = DC(to_tensor(results['ego_fut_masks'][None, None, ...]), stack=True) + if 'ego_fut_cmd' in results: + results['ego_fut_cmd'] = DC(to_tensor(results['ego_fut_cmd'][None, None, ...]), stack=True) + if 'ego_lcf_feat' in results: + results['ego_lcf_feat'] = DC(to_tensor(results['ego_lcf_feat'][None, None, ...]), stack=True) + if 'gt_attr_labels' in results: + results['gt_attr_labels'] = DC(to_tensor(results['gt_attr_labels']), cpu_only=False) + + return results \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/loading.py b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/loading.py new file mode 100644 index 0000000000000000000000000000000000000000..a8b6e68ca3247e8a9c354d22453026ba458106e3 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/loading.py @@ -0,0 +1,389 @@ +import os +from typing import Any, Dict, Tuple + +import mmcv +import torch +import numpy as np +from nuscenes.map_expansion.map_api import NuScenesMap +from nuscenes.map_expansion.map_api import locations as LOCATIONS +from PIL import Image + + +from mmdet3d.core.points import BasePoints, get_points_type +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import LoadAnnotations + +def load_augmented_point_cloud(path, virtual=False, reduce_beams=32): + # NOTE: following Tianwei's implementation, it is hard coded for nuScenes + points = np.fromfile(path, dtype=np.float32).reshape(-1, 5) + # NOTE: path definition different from Tianwei's implementation. + tokens = path.split("/") + vp_dir = "_VIRTUAL" if reduce_beams == 32 else f"_VIRTUAL_{reduce_beams}BEAMS" + seg_path = os.path.join( + *tokens[:-3], + "virtual_points", + tokens[-3], + tokens[-2] + vp_dir, + tokens[-1] + ".pkl.npy", + ) + assert os.path.exists(seg_path) + data_dict = np.load(seg_path, allow_pickle=True).item() + + virtual_points1 = data_dict["real_points"] + # NOTE: add zero reflectance to virtual points instead of removing them from real points + virtual_points2 = np.concatenate( + [ + data_dict["virtual_points"][:, :3], + np.zeros([data_dict["virtual_points"].shape[0], 1]), + data_dict["virtual_points"][:, 3:], + ], + axis=-1, + ) + + points = np.concatenate( + [ + points, + np.ones([points.shape[0], virtual_points1.shape[1] - points.shape[1] + 1]), + ], + axis=1, + ) + virtual_points1 = np.concatenate( + [virtual_points1, np.zeros([virtual_points1.shape[0], 1])], axis=1 + ) + # note: this part is different from Tianwei's implementation, we don't have duplicate foreground real points. 
+ if len(data_dict["real_points_indice"]) > 0: + points[data_dict["real_points_indice"]] = virtual_points1 + if virtual: + virtual_points2 = np.concatenate( + [virtual_points2, -1 * np.ones([virtual_points2.shape[0], 1])], axis=1 + ) + points = np.concatenate([points, virtual_points2], axis=0).astype(np.float32) + return points + + +def reduce_LiDAR_beams(pts, reduce_beams_to=32): + # print(pts.size()) + if isinstance(pts, np.ndarray): + pts = torch.from_numpy(pts) + radius = torch.sqrt(pts[:, 0].pow(2) + pts[:, 1].pow(2) + pts[:, 2].pow(2)) + sine_theta = pts[:, 2] / radius + # [-pi/2, pi/2] + theta = torch.asin(sine_theta) + phi = torch.atan2(pts[:, 1], pts[:, 0]) + + top_ang = 0.1862 + down_ang = -0.5353 + + beam_range = torch.zeros(32) + beam_range[0] = top_ang + beam_range[31] = down_ang + + for i in range(1, 31): + beam_range[i] = beam_range[i - 1] - 0.023275 + # beam_range = [1, 0.18, 0.15, 0.13, 0.11, 0.085, 0.065, 0.03, 0.01, -0.01, -0.03, -0.055, -0.08, -0.105, -0.13, -0.155, -0.18, -0.205, -0.228, -0.251, -0.275, + # -0.295, -0.32, -0.34, -0.36, -0.38, -0.40, -0.425, -0.45, -0.47, -0.49, -0.52, -0.54] + + num_pts, _ = pts.size() + mask = torch.zeros(num_pts) + if reduce_beams_to == 16: + for id in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]: + beam_mask = (theta < (beam_range[id - 1] - 0.012)) * ( + theta > (beam_range[id] - 0.012) + ) + mask = mask + beam_mask + mask = mask.bool() + elif reduce_beams_to == 4: + for id in [7, 9, 11, 13]: + beam_mask = (theta < (beam_range[id - 1] - 0.012)) * ( + theta > (beam_range[id] - 0.012) + ) + mask = mask + beam_mask + mask = mask.bool() + # [?] pick the 14th beam + elif reduce_beams_to == 1: + chosen_beam_id = 9 + mask = (theta < (beam_range[chosen_beam_id - 1] - 0.012)) * ( + theta > (beam_range[chosen_beam_id] - 0.012) + ) + else: + raise NotImplementedError + # points = copy.copy(pts) + points = pts[mask] + # print(points.size()) + return points.numpy() + +@PIPELINES.register_module() +class CustomLoadPointsFromMultiSweeps: + """Load points from multiple sweeps. + + This is usually used for nuScenes dataset to utilize previous sweeps. + + Args: + sweeps_num (int): Number of sweeps. Defaults to 10. + load_dim (int): Dimension number of the loaded points. Defaults to 5. + use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4]. + pad_empty_sweeps (bool): Whether to repeat keyframe when + sweeps is empty. Defaults to False. + remove_close (bool): Whether to remove close points. + Defaults to False. + test_mode (bool): If test_model=True used for testing, it will not + randomly sample sweeps but select the nearest N frames. + Defaults to False. + """ + + def __init__( + self, + sweeps_num=10, + load_dim=5, + use_dim=[0, 1, 2, 4], + pad_empty_sweeps=False, + remove_close=False, + test_mode=False, + load_augmented=None, + reduce_beams=None, + ): + self.load_dim = load_dim + self.sweeps_num = sweeps_num + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + self.use_dim = use_dim + self.pad_empty_sweeps = pad_empty_sweeps + self.remove_close = remove_close + self.test_mode = test_mode + self.load_augmented = load_augmented + self.reduce_beams = reduce_beams + + def _load_points(self, lidar_path): + """Private function to load point clouds data. + + Args: + lidar_path (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. 
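+
+        Example (illustrative sketch; the sweep path below is hypothetical)::
+
+            >>> loader = CustomLoadPointsFromMultiSweeps(
+            ...     sweeps_num=10, load_dim=5, use_dim=[0, 1, 2, 4])
+            >>> # pts = loader._load_points('sweeps/LIDAR_TOP/xxx.pcd.bin')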
+ """ + mmcv.check_file_exist(lidar_path) + if self.load_augmented: + assert self.load_augmented in ["pointpainting", "mvp"] + virtual = self.load_augmented == "mvp" + points = load_augmented_point_cloud( + lidar_path, virtual=virtual, reduce_beams=self.reduce_beams + ) + elif lidar_path.endswith(".npy"): + points = np.load(lidar_path) + else: + points = np.fromfile(lidar_path, dtype=np.float32) + return points + + def _remove_close(self, points, radius=1.0): + """Removes point too close within a certain radius from origin. + + Args: + points (np.ndarray | :obj:`BasePoints`): Sweep points. + radius (float): Radius below which points are removed. + Defaults to 1.0. + + Returns: + np.ndarray: Points after removing. + """ + if isinstance(points, np.ndarray): + points_numpy = points + elif isinstance(points, BasePoints): + points_numpy = points.tensor.numpy() + else: + raise NotImplementedError + x_filt = np.abs(points_numpy[:, 0]) < radius + y_filt = np.abs(points_numpy[:, 1]) < radius + not_close = np.logical_not(np.logical_and(x_filt, y_filt)) + return points[not_close] + + def __call__(self, results): + """Call function to load multi-sweep point clouds from files. + + Args: + results (dict): Result dict containing multi-sweep point cloud \ + filenames. + + Returns: + dict: The result dict containing the multi-sweep points data. \ + Added key and value are described below. + + - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \ + cloud arrays. + """ + points = results["points"] + points.tensor[:, 4] = 0 + sweep_points_list = [points] + ts = results["timestamp"] / 1e6 + if self.pad_empty_sweeps and len(results["sweeps"]) == 0: + for i in range(self.sweeps_num): + if self.remove_close: + sweep_points_list.append(self._remove_close(points)) + else: + sweep_points_list.append(points) + else: + if len(results["sweeps"]) <= self.sweeps_num: + choices = np.arange(len(results["sweeps"])) + elif self.test_mode: + choices = np.arange(self.sweeps_num) + else: + # NOTE: seems possible to load frame -11? + if not self.load_augmented: + choices = np.random.choice( + len(results["sweeps"]), self.sweeps_num, replace=False + ) + else: + # don't allow to sample the earliest frame, match with Tianwei's implementation. + choices = np.random.choice( + len(results["sweeps"]) - 1, self.sweeps_num, replace=False + ) + for idx in choices: + sweep = results["sweeps"][idx] + points_sweep = self._load_points(sweep["data_path"]) + points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim) + + # TODO: make it more general + if self.reduce_beams and self.reduce_beams < 32: + points_sweep = reduce_LiDAR_beams(points_sweep, self.reduce_beams) + + if self.remove_close: + points_sweep = self._remove_close(points_sweep) + sweep_ts = sweep["timestamp"] / 1e6 + points_sweep[:, :3] = ( + points_sweep[:, :3] @ sweep["sensor2lidar_rotation"].T + ) + points_sweep[:, :3] += sweep["sensor2lidar_translation"] + points_sweep[:, 4] = ts - sweep_ts + points_sweep = points.new_point(points_sweep) + sweep_points_list.append(points_sweep) + + points = points.cat(sweep_points_list) + points = points[:, self.use_dim] + results["points"] = points + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + return f"{self.__class__.__name__}(sweeps_num={self.sweeps_num})" + + + +@PIPELINES.register_module() +class CustomLoadPointsFromFile: + """Load Points From File. + + Load sunrgbd and scannet points from file. + + Args: + coord_type (str): The type of coordinates of points cloud. 
+ Available options includes: + - 'LIDAR': Points in LiDAR coordinates. + - 'DEPTH': Points in depth coordinates, usually for indoor dataset. + - 'CAMERA': Points in camera coordinates. + load_dim (int): The dimension of the loaded points. + Defaults to 6. + use_dim (list[int]): Which dimensions of the points to be used. + Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 + or use_dim=[0, 1, 2, 3] to use the intensity dimension. + shift_height (bool): Whether to use shifted height. Defaults to False. + use_color (bool): Whether to use color features. Defaults to False. + """ + + def __init__( + self, + coord_type, + load_dim=6, + use_dim=[0, 1, 2], + shift_height=False, + use_color=False, + load_augmented=None, + reduce_beams=None, + ): + self.shift_height = shift_height + self.use_color = use_color + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + assert ( + max(use_dim) < load_dim + ), f"Expect all used dimensions < {load_dim}, got {use_dim}" + assert coord_type in ["CAMERA", "LIDAR", "DEPTH"] + + self.coord_type = coord_type + self.load_dim = load_dim + self.use_dim = use_dim + self.load_augmented = load_augmented + self.reduce_beams = reduce_beams + + def _load_points(self, lidar_path): + """Private function to load point clouds data. + + Args: + lidar_path (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. + """ + mmcv.check_file_exist(lidar_path) + if self.load_augmented: + assert self.load_augmented in ["pointpainting", "mvp"] + virtual = self.load_augmented == "mvp" + points = load_augmented_point_cloud( + lidar_path, virtual=virtual, reduce_beams=self.reduce_beams + ) + elif lidar_path.endswith(".npy"): + points = np.load(lidar_path) + else: + points = np.fromfile(lidar_path, dtype=np.float32) + + return points + + def __call__(self, results): + """Call function to load points data from file. + + Args: + results (dict): Result dict containing point clouds data. + + Returns: + dict: The result dict containing the point clouds data. \ + Added key and value are described below. + + - points (:obj:`BasePoints`): Point clouds data. 
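+
+        Example (illustrative pipeline entry for nuScenes-style 5-dim points;
+        not taken from the original configs)::
+
+            >>> load_cfg = dict(type='CustomLoadPointsFromFile',
+            ...                 coord_type='LIDAR', load_dim=5, use_dim=5)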
+ """ + lidar_path = results["pts_filename"] + points = self._load_points(lidar_path) + points = points.reshape(-1, self.load_dim) + # TODO: make it more general + if self.reduce_beams and self.reduce_beams < 32: + points = reduce_LiDAR_beams(points, self.reduce_beams) + points = points[:, self.use_dim] + attribute_dims = None + + if self.shift_height: + floor_height = np.percentile(points[:, 2], 0.99) + height = points[:, 2] - floor_height + points = np.concatenate( + [points[:, :3], np.expand_dims(height, 1), points[:, 3:]], 1 + ) + attribute_dims = dict(height=3) + + if self.use_color: + assert len(self.use_dim) >= 6 + if attribute_dims is None: + attribute_dims = dict() + attribute_dims.update( + dict( + color=[ + points.shape[1] - 3, + points.shape[1] - 2, + points.shape[1] - 1, + ] + ) + ) + + points_class = get_points_type(self.coord_type) + points = points_class( + points, points_dim=points.shape[-1], attribute_dims=attribute_dims + ) + results["points"] = points + + return results \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py new file mode 100644 index 0000000000000000000000000000000000000000..5a4ea15fe77bf0f6d7f65661c928076c3e075204 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/pipelines/transform_3d.py @@ -0,0 +1,448 @@ +import numpy as np +from numpy import random +import mmcv +from mmdet.datasets.builder import PIPELINES +from mmcv.parallel import DataContainer as DC +from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, + LiDARInstance3DBoxes, box_np_ops) + + +@PIPELINES.register_module() +class CustomObjectRangeFilter(object): + """Filter objects by the range, and also filter corresponding fut trajs + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter objects by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. 
+ """ + # Check points instance type and initialise bev_range + if isinstance(input_dict['gt_bboxes_3d'], + (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + bev_range = self.pcd_range[[0, 1, 3, 4]] + elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): + bev_range = self.pcd_range[[0, 2, 3, 5]] + + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + gt_attr_labels = input_dict['attr_labels'] + mask = gt_bboxes_3d.in_range_bev(bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] + gt_attr_labels = gt_attr_labels[mask.numpy().astype(np.bool)] + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d + input_dict['gt_attr_labels'] = gt_attr_labels + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class CustomObjectNameFilter(object): + """Filter GT objects by their names, , and also filter corresponding fut trajs + + Args: + classes (list[str]): List of class names to be kept for training. + """ + + def __init__(self, classes): + self.classes = classes + self.labels = list(range(len(self.classes))) + + def __call__(self, input_dict): + """Call function to filter objects by their names. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. + """ + gt_labels_3d = input_dict['gt_labels_3d'] + gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], + dtype=np.bool_) + input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] + input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] + input_dict['gt_attr_labels'] = input_dict['gt_attr_labels'][gt_bboxes_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(classes={self.classes})' + return repr_str + + +@PIPELINES.register_module() +class PadMultiViewImage(object): + """Pad the multi-view image. + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + Args: + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value, 0 by default. 
+ """ + + def __init__(self, size=None, size_divisor=None, pad_val=0): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + if self.size is not None: + padded_img = [mmcv.impad( + img, shape=self.size, pad_val=self.pad_val) for img in results['img']] + elif self.size_divisor is not None: + padded_img = [mmcv.impad_to_multiple( + img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] + + results['ori_shape'] = [img.shape for img in results['img']] + results['img'] = padded_img + results['img_shape'] = [img.shape for img in padded_img] + results['pad_shape'] = [img.shape for img in padded_img] + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. + """ + self._pad_img(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, ' + repr_str += f'size_divisor={self.size_divisor}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str + + +@PIPELINES.register_module() +class NormalizeMultiviewImage(object): + """Normalize the image. + Added key is "img_norm_cfg". + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + + def __call__(self, results): + """Call function to normalize images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. + """ + + results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']] + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' + return repr_str + + +@PIPELINES.register_module() +class PhotoMetricDistortionMultiViewImage: + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. 
+ """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def __call__(self, results): + """Call function to perform photometric distortion on images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Result dict with images distorted. + """ + imgs = results['img'] + new_imgs = [] + for img in imgs: + assert img.dtype == np.float32, \ + 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ + ' please set "to_float32=True" in "LoadImageFromFile" pipeline' + # random brightness + if random.randint(2): + delta = random.uniform(-self.brightness_delta, + self.brightness_delta) + img += delta + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # convert color from BGR to HSV + img = mmcv.bgr2hsv(img) + + # random saturation + if random.randint(2): + img[..., 1] *= random.uniform(self.saturation_lower, + self.saturation_upper) + + # random hue + if random.randint(2): + img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # randomly swap channels + if random.randint(2): + img = img[..., random.permutation(3)] + new_imgs.append(img) + results['img'] = new_imgs + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + + +@PIPELINES.register_module() +class CustomCollect3D(object): + """Collect data from the loader relevant to the specific task. + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + - 'img_shape': shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. 
+ - 'scale_factor': a float indicating the preprocessing scale + - 'flip': a boolean indicating if image flip transform was used + - 'filename': path to the image file + - 'ori_shape': original shape of the image as a tuple (h, w, c) + - 'pad_shape': image shape after padding + - 'lidar2img': transform from lidar to image + - 'depth2img': transform from depth to image + - 'cam2img': transform from camera to image + - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ + flipped horizontally + - 'pcd_vertical_flip': a boolean indicating if point cloud is \ + flipped vertically + - 'box_mode_3d': 3D box mode + - 'box_type_3d': 3D box type + - 'img_norm_cfg': a dict of normalization information: + - mean: per channel mean subtraction + - std: per channel std divisor + - to_rgb: bool indicating if bgr was converted to rgb + - 'pcd_trans': point cloud transformations + - 'sample_idx': sample index + - 'pcd_scale_factor': point cloud scale factor + - 'pcd_rotation': rotation applied to point cloud + - 'pts_filename': path to point cloud file. + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', + 'transformation_3d_flow', 'scene_token', + 'can_bus', + )): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:`mmcv.DataContainer`. + Args: + results (dict): Result dict contains the data to collect. + Returns: + dict: The result dict contains the following keys + - keys in ``self.keys`` + - ``img_metas`` + """ + + data = {} + img_metas = {} + + for key in self.meta_keys: + if key in results: + img_metas[key] = results[key] + + data['img_metas'] = DC(img_metas, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + """str: Return a string that describes the module.""" + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + + +@PIPELINES.register_module() +class RandomScaleImageMultiViewImage(object): + """Random scale the image + Args: + scales + """ + + def __init__(self, scales=[]): + self.scales = scales + assert len(self.scales)==1 + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. 
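+
+            Note that 'lidar2img' is rescaled by the same factor so the
+            projection matrices stay consistent with the resized images.
+
+        Example (illustrative; the current implementation expects exactly one
+        scale in the list)::
+
+            >>> scale_cfg = dict(type='RandomScaleImageMultiViewImage',
+            ...                  scales=[0.8])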
+ """ + rand_ind = np.random.permutation(range(len(self.scales)))[0] + rand_scale = self.scales[rand_ind] + + y_size = [int(img.shape[0] * rand_scale) for img in results['img']] + x_size = [int(img.shape[1] * rand_scale) for img in results['img']] + scale_factor = np.eye(4) + scale_factor[0, 0] *= rand_scale + scale_factor[1, 1] *= rand_scale + results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in + enumerate(results['img'])] + lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] + results['lidar2img'] = lidar2img + results['img_shape'] = [img.shape for img in results['img']] + results['ori_shape'] = [img.shape for img in results['img']] + + return results + + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.scales}, ' + return repr_str + + +@PIPELINES.register_module() +class CustomPointsRangeFilter: + """Filter points by the range. + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, data): + """Call function to filter points by the range. + Args: + data (dict): Result dict from loading pipeline. + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = data["points"] + points_mask = points.in_range_3d(self.pcd_range) + clean_points = points[points_mask] + data["points"] = clean_points + return data diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__init__.py b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb2a0b17769a958042583dcb4c8c4a4f51636f4c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__init__.py @@ -0,0 +1,4 @@ +from .group_sampler import DistributedGroupSampler +from .distributed_sampler import DistributedSampler +from .sampler import SAMPLER, build_sampler + diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c19564b7ea1908375546175900410df60d0c64d2 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/distributed_sampler.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/distributed_sampler.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12c171c1316e78621ccc60066adad5db8dc6b8db Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/distributed_sampler.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/group_sampler.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/group_sampler.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c867b5ed2d73668995f01b2cbe9c9c8f3517af7 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/group_sampler.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/sampler.cpython-38.pyc 
b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/sampler.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cc394674449b127afd5bd32a03fa9188987be53 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/__pycache__/sampler.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..2913de99253be744a308bbc24c5bcaf3cd4a857c --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/distributed_sampler.py @@ -0,0 +1,41 @@ +import math + +import torch +from torch.utils.data import DistributedSampler as _DistributedSampler +from .sampler import SAMPLER + + +@SAMPLER.register_module() +class DistributedSampler(_DistributedSampler): + + def __init__(self, + dataset=None, + num_replicas=None, + rank=None, + shuffle=True, + seed=0): + super().__init__( + dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + # for the compatibility from PyTorch 1.3+ + self.seed = seed if seed is not None else 0 + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + assert False + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + # in case that indices is shorter than half of total_size + indices = (indices * + math.ceil(self.total_size / len(indices)))[:self.total_size] + assert len(indices) == self.total_size + + # subsample + per_replicas = self.total_size//self.num_replicas + # indices = indices[self.rank:self.total_size:self.num_replicas] + indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas] + assert len(indices) == self.num_samples + + return iter(indices) diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..16c59e5f3dd880ba185247acfba6eae354deb771 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/group_sampler.py @@ -0,0 +1,110 @@ + +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch.utils.data import Sampler +from .sampler import SAMPLER +import random +from IPython import embed + + +@SAMPLER.register_module() +class DistributedGroupSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + seed (int, optional): random seed used to shuffle the sampler if + ``shuffle=True``. This number should be identical across all + processes in the distributed group. Default: 0. 
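+
+    Example (illustrative sketch; ``dataset`` stands for an mmdet-style
+    dataset that exposes the ``flag`` attribute used for grouping)::
+
+        >>> sampler = DistributedGroupSampler(dataset, samples_per_gpu=1,
+        ...                                   num_replicas=8, rank=0)
+        >>> sampler.set_epoch(0)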
+ """ + + def __init__(self, + dataset, + samples_per_gpu=1, + num_replicas=None, + rank=None, + seed=0): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.seed = seed if seed is not None else 0 + + assert hasattr(self.dataset, 'flag') + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += int( + math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / + self.num_replicas)) * self.samples_per_gpu + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + # add .numpy() to avoid bug when selecting indice in parrots. + # TODO: check whether torch.randperm() can be replaced by + # numpy.random.permutation(). + indice = indice[list( + torch.randperm(int(size), generator=g).numpy())].tolist() + extra = int( + math.ceil( + size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[:extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] for i in list( + torch.randperm( + len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * + self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/sampler.py b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1906049c4416951ab315338a90dceecc1a3b1203 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/samplers/sampler.py @@ -0,0 +1,7 @@ +from mmcv.utils.registry import Registry, build_from_cfg + +SAMPLER = Registry('sampler') + + +def build_sampler(cfg, default_args): + return build_from_cfg(cfg, SAMPLER, default_args) diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/vad_custom_nuscenes_eval.py b/GenAD-main/projects/mmdet3d_plugin/datasets/vad_custom_nuscenes_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..a5aa3ce69b2ebf691028b7dfc776f12cc2eacdd5 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/vad_custom_nuscenes_eval.py @@ -0,0 +1,863 @@ +import argparse +import copy +import json +import os +import time +from typing import Tuple, Dict, Any +import torch +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionConfig +from nuscenes.eval.detection.evaluate import NuScenesEval +from pyquaternion import Quaternion + +from nuscenes import NuScenes +from 
nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.tracking.data_classes import TrackingBox +from nuscenes.utils.data_classes import Box +from nuscenes.utils.geometry_utils import points_in_box +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.eval.common.loaders import add_center_dist +import tqdm +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from torchvision.transforms.functional import rotate +import pycocotools.mask as mask_util +# from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from torchvision.transforms.functional import rotate +import cv2 +import argparse +import json +import os +import random +import time +from typing import Tuple, Dict, Any + +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.loaders import load_gt, add_center_dist +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.constants import TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ + DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D +from IPython import embed +import json +from typing import Any + +import numpy as np +from matplotlib import pyplot as plt + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.common.utils import boxes_to_sensor +from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList +from nuscenes.utils.data_classes import LidarPointCloud +from nuscenes.utils.geometry_utils import view_points + +import mmcv + + +Axis = Any + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. + :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are plotted. + rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. 
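+    # Create a standalone axis if none was supplied, then draw one
+    # recall-vs-error curve per TP metric up to this class' maximum recall.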
+ if ax is None: + ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. + if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +class DetectionBox_modified(DetectionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index + + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. 
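+
+    Note that only the projected box center is tested (not the corners), and
+    the center must additionally lie more than 1 m in front of the camera to
+    count as visible.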
+ """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + +def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False) \ + -> Tuple[EvalBoxes, Dict]: + """ + Loads object predictions from file. + :param result_path: Path to the .json result file provided by the user. + :param max_boxes_per_sample: Maximim number of boxes allowed per sample. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The deserialized results and meta data. + """ + + # Load from file and check that the format is correct. + # with open(result_path) as f: + # data = json.load(f) + data = mmcv.load(result_path) + assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \ + 'See https://www.nuscenes.org/object-detection for more information.' + + # Deserialize results and get meta data. + all_results = EvalBoxes.deserialize(data['results'], box_cls) + meta = data['meta'] + if verbose: + print("Loaded results from {}. Found detections for {} samples." + .format(result_path, len(all_results.sample_tokens))) + + # Check that each sample has no more than x predicted boxes. + for sample_token in all_results.sample_tokens: + assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \ + "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample + + return all_results, meta + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. 
+ :return: The GT boxes. + """ + + # Init. + if box_cls == DetectionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. + splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. + tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionBox_modified: + # Get label name in detection task and filter unused labels. + detection_name = category_to_detection_name(sample_annotation['category_name']) + if detection_name is None: + continue + + # Get attribute_name. + attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. 
+ attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token] + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
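+    # Project each box center into the six surround-view cameras and keep only
+    # boxes whose center is visible in more than one image, i.e. boxes lying
+    # in camera-overlap regions; the kept tokens are also appended to
+    # 'center_overlap.txt'.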
+ cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + new_box.translate(-np.array(cs_record['translation'])) + new_box.rotate(Quaternion(cs_record['rotation']).inverse) + + if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + count += 1 + # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + # count += 1 + + if count > 1: + with open('center_overlap.txt', 'a') as f: + try: + f.write(box.token + '\n') + except: + pass + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + verbose = True + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + +def _get_box_class_field(eval_boxes: EvalBoxes) -> str: + """ + Retrieve the name of the class field in the boxes. + This parses through all boxes until it finds a valid box. + If there are no valid boxes, this function throws an exception. + :param eval_boxes: The EvalBoxes used for evaluation. + :return: The name of the class field in the boxes, e.g. detection_name or tracking_name. + """ + assert len(eval_boxes.boxes) > 0 + box = None + for val in eval_boxes.boxes.values(): + if len(val) > 0: + box = val[0] + break + if isinstance(box, DetectionBox): + class_field = 'detection_name' + elif isinstance(box, TrackingBox): + class_field = 'tracking_name' + else: + raise Exception('Error: Invalid box type: %s' % box) + + return class_field + +def filter_eval_boxes(nusc: NuScenes, + eval_boxes: EvalBoxes, + max_dist_x: Dict[str, float], + max_dist_y: Dict[str, float], + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param max_dist: Maps the detection name to the eval distance threshold for that class. + :param verbose: Whether to print to stdout. + """ + # Retrieve box type for detectipn/tracking boxes. + class_field = _get_box_class_field(eval_boxes) + + # Accumulators for number of filtered boxes. + total, dist_filter, point_filter, bike_rack_filter = 0, 0, 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on distance first. 
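+        # Unlike the stock nuScenes filter, this variant applies separate
+        # per-class |x| and |y| limits (class_range_x / class_range_y) to each
+        # box's offset from the ego vehicle rather than a single radial
+        # distance threshold.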
+ total += len(eval_boxes[sample_token]) + eval_boxes.boxes[sample_token] = [box for box in eval_boxes[sample_token] if + abs(box.ego_translation[0]) < max_dist_x[box.__getattribute__(class_field)] \ + and abs(box.ego_translation[1]) < max_dist_y[box.__getattribute__(class_field)]] + dist_filter += len(eval_boxes[sample_token]) + + # Then remove boxes with zero points in them. Eval boxes have -1 points by default. + eval_boxes.boxes[sample_token] = [box for box in eval_boxes[sample_token] if not box.num_pts == 0] + point_filter += len(eval_boxes[sample_token]) + + # Perform bike-rack filtering. + sample_anns = nusc.get('sample', sample_token)['anns'] + bikerack_recs = [nusc.get('sample_annotation', ann) for ann in sample_anns if + nusc.get('sample_annotation', ann)['category_name'] == 'static_object.bicycle_rack'] + bikerack_boxes = [Box(rec['translation'], rec['size'], Quaternion(rec['rotation'])) for rec in bikerack_recs] + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.__getattribute__(class_field) in ['bicycle', 'motorcycle']: + in_a_bikerack = False + for bikerack_box in bikerack_boxes: + if np.sum(points_in_box(bikerack_box, np.expand_dims(np.array(box.translation), axis=1))) > 0: + in_a_bikerack = True + if not in_a_bikerack: + filtered_boxes.append(box) + else: + filtered_boxes.append(box) + + eval_boxes.boxes[sample_token] = filtered_boxes + bike_rack_filter += len(eval_boxes.boxes[sample_token]) + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After distance based filtering: %d" % dist_filter) + print("=> After LIDAR and RADAR points based filtering: %d" % point_filter) + print("=> After bike rack filtering: %d" % bike_rack_filter) + + return eval_boxes + +class NuScenesEval_custom(NuScenesEval): + """ + Dummy class for backward-compatibility. Same as DetectionEval. + """ + + def __init__(self, + nusc: NuScenes, + config: DetectionConfig, + result_path: str, + eval_set: str, + output_dir: str = None, + verbose: bool = True, + overlap_test=False, + eval_mask=False, + data_infos=None + ): + """ + Initialize a DetectionEval object. + :param nusc: A NuScenes object. + :param config: A DetectionConfig object. + :param result_path: Path of the nuScenes JSON result file. + :param eval_set: The dataset split to evaluate on, e.g. train, val or test. + :param output_dir: Folder to save plots and results to. + :param verbose: Whether to print to stdout. + """ + + self.nusc = nusc + self.result_path = result_path + self.eval_set = eval_set + self.output_dir = output_dir + self.verbose = verbose + self.cfg = config + self.overlap_test = overlap_test + self.eval_mask = eval_mask + self.data_infos = data_infos + # Check result file exists. + assert os.path.exists(result_path), 'Error: The result file does not exist!' + + # Make dirs. + self.plot_dir = os.path.join(self.output_dir, 'plots') + if not os.path.isdir(self.output_dir): + os.makedirs(self.output_dir) + if not os.path.isdir(self.plot_dir): + os.makedirs(self.plot_dir) + + # Load data. + if verbose: + print('Initializing nuScenes detection evaluation') + self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, + verbose=verbose) + self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose) + + assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ + "Samples in split doesn't match samples in predictions." + + # Add center distances. 
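+        # add_center_dist attaches each box's translation relative to the ego
+        # vehicle, which the per-class x/y range filtering below relies on.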
+ self.pred_boxes = add_center_dist(nusc, self.pred_boxes) + self.gt_boxes = add_center_dist(nusc, self.gt_boxes) + + # Filter boxes (distance, points per box, etc.). + + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range_x, self.cfg.class_range_y, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range_x, self.cfg.class_range_y, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [key for (key, value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + + def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. + # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMetrics(self.cfg) + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. 
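# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): update_gt(type_='ord', index=i),
# defined above, restricts evaluation to the i-th frame of every scene via the
# index_map built in __init__. That map simply walks each scene's linked list
# of samples and numbers them from 1; in isolation it amounts to the sketch
# below (helper name is the editor's own, `nusc` is a NuScenes instance):
def build_frame_index_map(nusc) -> dict:
    """Map every sample token to its 1-based position within its scene."""
    index_map = {}
    for scene in nusc.scene:
        token, index = scene['first_sample_token'], 1
        while token != '':
            index_map[token] = index
            token = nusc.get('sample', token)['next']
            index += 1
    return index_map


# Tokens for, e.g., the 5th frame of every scene would then be:
# valid_tokens = [tok for tok, idx in build_frame_index_map(nusc).items() if idx == 5]
# ---------------------------------------------------------------------------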
+ for metric_name in TP_METRICS: + metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath(detection_name + '_pr')) + + class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, + savepath=savepath(detection_name + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath('dist_pr_' + str(dist_th))) + + +if __name__ == "__main__": + + # Settings. + parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' 
+ 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + output_dir=output_dir_, verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) + #for index in range(1, 41): + # nusc_eval.update_gt(type_='ord', index=index) + # diff --git a/GenAD-main/projects/mmdet3d_plugin/datasets/vad_nusc_detection_cvpr_2019.json b/GenAD-main/projects/mmdet3d_plugin/datasets/vad_nusc_detection_cvpr_2019.json new file mode 100644 index 0000000000000000000000000000000000000000..b5c810318083771277eac0cca8bf6252a7ae793f --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/datasets/vad_nusc_detection_cvpr_2019.json @@ -0,0 +1,34 @@ +{ + "class_range_x": { + "car": 30, + "truck": 30, + "bus": 30, + "trailer": 30, + "construction_vehicle": 30, + "pedestrian": 30, + "motorcycle": 30, + "bicycle": 30, + "traffic_cone": 30, + "barrier": 30 + }, + "class_range_y": { + "car": 15, + "truck": 15, + "bus": 15, + "trailer": 15, + "construction_vehicle": 15, + "pedestrian": 15, + "motorcycle": 15, + "bicycle": 15, + "traffic_cone": 15, + "barrier": 15 + }, + "dist_fcn": "center_distance", + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "max_boxes_per_sample": 500, + "mean_ap_weight": 5 + } + \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/VAD_head.py b/GenAD-main/projects/mmdet3d_plugin/models/VAD_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f8fea586e0993b7c8aa941552fa0d9659d883d89 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/VAD_head.py @@ -0,0 +1,2165 @@ +import copy +from math import pi, cos, sin + +import torch +import numpy as np +import torch.nn as nn +import matplotlib.pyplot as plt +import torch.nn.functional as F +from mmdet.models import HEADS, build_loss +from mmdet.models.dense_heads import DETRHead +from mmcv.runner import force_fp32, auto_fp16 +from mmcv.utils import TORCH_VERSION, digit_version +from mmdet.core import build_assigner, build_sampler +from mmdet3d.core.bbox.coders import build_bbox_coder +from mmdet.models.utils.transformer import inverse_sigmoid +from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh +from mmcv.cnn import Linear, bias_init_with_prob, xavier_init +from mmdet.core import (multi_apply, multi_apply, reduce_mean) +from 
mmcv.cnn.bricks.transformer import build_transformer_layer_sequence + +from projects.mmdet3d_plugin.core.bbox.util import normalize_bbox +from projects.mmdet3d_plugin.VAD.utils.traj_lr_warmup import get_traj_warmup_loss_weight +from projects.mmdet3d_plugin.VAD.utils.map_utils import ( + normalize_2d_pts, normalize_2d_bbox, denormalize_2d_pts, denormalize_2d_bbox +) + +from projects.mmdet3d_plugin.VAD.generator import DistributionModule, PredictModel +from projects.mmdet3d_plugin.VAD.generator import FuturePrediction + +class MLP(nn.Module): + def __init__(self, in_channels, hidden_unit, verbose=False): + super(MLP, self).__init__() + self.mlp = nn.Sequential( + nn.Linear(in_channels, hidden_unit), + nn.LayerNorm(hidden_unit), + nn.ReLU() + ) + + def forward(self, x): + x = self.mlp(x) + return x + +class LaneNet(nn.Module): + def __init__(self, in_channels, hidden_unit, num_subgraph_layers): + super(LaneNet, self).__init__() + self.num_subgraph_layers = num_subgraph_layers + self.layer_seq = nn.Sequential() + for i in range(num_subgraph_layers): + self.layer_seq.add_module( + f'lmlp_{i}', MLP(in_channels, hidden_unit)) + in_channels = hidden_unit*2 + + def forward(self, pts_lane_feats): + ''' + Extract lane_feature from vectorized lane representation + + Args: + pts_lane_feats: [batch size, max_pnum, pts, D] + + Returns: + inst_lane_feats: [batch size, max_pnum, D] + ''' + x = pts_lane_feats + for name, layer in self.layer_seq.named_modules(): + if isinstance(layer, MLP): + # x [bs,max_lane_num,9,dim] + x = layer(x) + x_max = torch.max(x, -2)[0] + x_max = x_max.unsqueeze(2).repeat(1, 1, x.shape[2], 1) + x = torch.cat([x, x_max], dim=-1) + x_max = torch.max(x, -2)[0] + return x_max + + +@HEADS.register_module() +class VADHead(DETRHead): + """Head of VAD model. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. 
+ """ + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + fut_ts=6, + fut_mode=6, + loss_traj=dict(type='L1Loss', loss_weight=0.25), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.8), + map_bbox_coder=None, + map_num_query=900, + map_num_classes=3, + map_num_vec=20, + map_num_pts_per_vec=2, + map_num_pts_per_gt_vec=2, + map_query_embed_type='all_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v0', + map_dir_interval=1, + map_code_size=None, + map_code_weights=None, + loss_map_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_map_pts=dict( + type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0 + ), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=2.0), + loss_vae_gen=dict(type='ProbabilisticLoss', loss_weight=1.0), + tot_epoch=None, + use_traj_lr_warmup=False, + motion_decoder=None, + motion_map_decoder=None, + use_pe=False, + motion_det_score=None, + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + ego_his_encoder=None, + ego_fut_mode=3, + loss_plan_reg=dict(type='L1Loss', loss_weight=0.25), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=0.1), + loss_plan_col=dict(type='PlanAgentDisLoss', loss_weight=0.1), + loss_plan_dir=dict(type='PlanMapThetaLoss', loss_weight=0.1), + ego_agent_decoder=None, + ego_map_decoder=None, + query_thresh=None, + query_use_fix_pad=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + agent_dim = 300, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + self.fut_ts = fut_ts + self.fut_mode = fut_mode + self.tot_epoch = tot_epoch + self.use_traj_lr_warmup = use_traj_lr_warmup + self.motion_decoder = motion_decoder + self.motion_map_decoder = motion_map_decoder + self.use_pe = use_pe + self.motion_det_score = motion_det_score + self.map_thresh = map_thresh + self.dis_thresh = dis_thresh + self.pe_normalization = pe_normalization + self.ego_his_encoder = ego_his_encoder + self.ego_fut_mode = ego_fut_mode + self.ego_agent_decoder = ego_agent_decoder + self.ego_map_decoder = ego_map_decoder + self.query_thresh = query_thresh + self.query_use_fix_pad = query_use_fix_pad + self.ego_lcf_feat_idx = ego_lcf_feat_idx + self.valid_fut_ts = valid_fut_ts + self.agent_dim = agent_dim + + if loss_traj_cls['use_sigmoid'] == True: + self.traj_num_cls = 1 + else: + self.traj_num_cls = 2 + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + if map_code_size is not None: + self.map_code_size = map_code_size + else: + self.map_code_size = 10 + if map_code_weights is not None: + self.map_code_weights = map_code_weights + else: + self.map_code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - 
self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + + self.map_bbox_coder = build_bbox_coder(map_bbox_coder) + self.map_query_embed_type = map_query_embed_type + self.map_transform_method = map_transform_method + self.map_gt_shift_pts_pattern = map_gt_shift_pts_pattern + map_num_query = map_num_vec * map_num_pts_per_vec + self.map_num_query = map_num_query + self.map_num_classes = map_num_classes + self.map_num_vec = map_num_vec + self.map_num_pts_per_vec = map_num_pts_per_vec + self.map_num_pts_per_gt_vec = map_num_pts_per_gt_vec + self.map_dir_interval = map_dir_interval + + if loss_map_cls['use_sigmoid'] == True: + self.map_cls_out_channels = map_num_classes + else: + self.map_cls_out_channels = map_num_classes + 1 + + self.map_bg_cls_weight = 0 + map_class_weight = loss_map_cls.get('class_weight', None) + if map_class_weight is not None and (self.__class__ is VADHead): + assert isinstance(map_class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(map_class_weight)}.' + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + map_bg_cls_weight = loss_map_cls.get('bg_cls_weight', map_class_weight) + assert isinstance(map_bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(map_bg_cls_weight)}.' + map_class_weight = torch.ones(map_num_classes + 1) * map_class_weight + # set background class as the last indice + map_class_weight[map_num_classes] = map_bg_cls_weight + loss_map_cls.update({'class_weight': map_class_weight}) + if 'bg_cls_weight' in loss_map_cls: + loss_map_cls.pop('bg_cls_weight') + self.map_bg_cls_weight = map_bg_cls_weight + + self.traj_bg_cls_weight = 0 + + super(VADHead, self).__init__(*args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + self.map_code_weights = nn.Parameter(torch.tensor( + self.map_code_weights, requires_grad=False), requires_grad=False) + + if kwargs['train_cfg'] is not None: + assert 'map_assigner' in kwargs['train_cfg'], 'map assigner should be provided '\ + 'when train_cfg is set.' + map_assigner = kwargs['train_cfg']['map_assigner'] + assert loss_map_cls['loss_weight'] == map_assigner['cls_cost']['weight'], \ + 'The classification weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_bbox['loss_weight'] == map_assigner['reg_cost'][ + 'weight'], 'The regression L1 weight for loss and matcher ' \ + 'should be exactly the same.' + assert loss_map_iou['loss_weight'] == map_assigner['iou_cost']['weight'], \ + 'The regression iou weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_pts['loss_weight'] == map_assigner['pts_cost']['weight'], \ + 'The regression l1 weight for map pts loss and matcher should be' \ + 'exactly the same.' 
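# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): the block above converts the
# scalar class_weight / bg_cls_weight of loss_map_cls into a per-class weight
# vector whose last entry is the background ("no object") class. With this
# head's defaults (3 map classes, class_weight=1.0, bg_cls_weight=0.1) the
# tensor handed to CrossEntropyLoss is:
import torch

map_num_classes, class_weight, bg_cls_weight = 3, 1.0, 0.1
weights = torch.ones(map_num_classes + 1) * class_weight
weights[map_num_classes] = bg_cls_weight  # background class sits at the last index
print(weights)  # tensor([1.0000, 1.0000, 1.0000, 0.1000])
# ---------------------------------------------------------------------------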
+ + self.map_assigner = build_assigner(map_assigner) + # DETR sampling=False, so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.map_sampler = build_sampler(sampler_cfg, context=self) + + self.loss_traj = build_loss(loss_traj) + self.loss_traj_cls = build_loss(loss_traj_cls) + self.loss_map_bbox = build_loss(loss_map_bbox) + self.loss_map_cls = build_loss(loss_map_cls) + self.loss_map_iou = build_loss(loss_map_iou) + self.loss_map_pts = build_loss(loss_map_pts) + self.loss_map_dir = build_loss(loss_map_dir) + self.loss_plan_reg = build_loss(loss_plan_reg) + self.loss_plan_bound = build_loss(loss_plan_bound) + self.loss_plan_col = build_loss(loss_plan_col) + self.loss_plan_dir = build_loss(loss_plan_dir) + self.loss_vae_gen = build_loss(loss_vae_gen) + + + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + cls_branch = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + traj_branch = [] + traj_in_dim = self.embed_dims*4 + for _ in range(self.num_reg_fcs): + traj_branch.append(Linear(traj_in_dim, traj_in_dim)) + traj_branch.append(nn.ReLU()) + traj_branch.append(Linear(traj_in_dim, 2)) + traj_branch = nn.Sequential(*traj_branch) + + traj_cls_branch = [] + # for _ in range(self.num_reg_fcs): + traj_cls_branch.append(Linear(self.embed_dims*14, self.embed_dims*2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims*2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims*2, self.embed_dims*2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims*2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims*2, self.traj_num_cls)) + traj_cls_branch = nn.Sequential(*traj_cls_branch) + + map_cls_branch = [] + for _ in range(self.num_reg_fcs): + map_cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_cls_branch.append(nn.LayerNorm(self.embed_dims)) + map_cls_branch.append(nn.ReLU(inplace=True)) + map_cls_branch.append(Linear(self.embed_dims, self.map_cls_out_channels)) + map_cls_branch = nn.Sequential(*map_cls_branch) + + map_reg_branch = [] + for _ in range(self.num_reg_fcs): + map_reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_reg_branch.append(nn.ReLU()) + map_reg_branch.append(Linear(self.embed_dims, self.map_code_size)) + map_reg_branch = nn.Sequential(*map_reg_branch) + + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
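# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): _get_clones() above and the
# with_box_refine branch just below follow the usual deformable-DETR pattern:
# with iterative box refinement every decoder layer gets its own deep-copied
# prediction head, otherwise a single head instance is shared across layers.
# A minimal sketch of that choice (helper name is the editor's own):
import copy
import torch.nn as nn


def clone_heads(head: nn.Module, num_layers: int, independent: bool) -> nn.ModuleList:
    """Deep-copy the head per decoder layer (independent=True) or share one instance."""
    if independent:
        return nn.ModuleList(copy.deepcopy(head) for _ in range(num_layers))
    return nn.ModuleList(head for _ in range(num_layers))  # same object repeated


refined = clone_heads(nn.Linear(256, 10), num_layers=3, independent=True)
shared = clone_heads(nn.Linear(256, 10), num_layers=3, independent=False)
assert refined[0] is not refined[1] and shared[0] is shared[1]
# ---------------------------------------------------------------------------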
+ num_decoder_layers = 1 + num_map_decoder_layers = 1 + if self.transformer.decoder is not None: + num_decoder_layers = self.transformer.decoder.num_layers + if self.transformer.map_decoder is not None: + num_map_decoder_layers = self.transformer.map_decoder.num_layers + num_motion_decoder_layers = 1 + num_pred = (num_decoder_layers + 1) if \ + self.as_two_stage else num_decoder_layers + motion_num_pred = (num_motion_decoder_layers + 1) if \ + self.as_two_stage else num_motion_decoder_layers + map_num_pred = (num_map_decoder_layers + 1) if \ + self.as_two_stage else num_map_decoder_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(cls_branch, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + self.traj_branches = _get_clones(traj_branch, motion_num_pred) + self.traj_cls_branches = _get_clones(traj_cls_branch, motion_num_pred) + self.map_cls_branches = _get_clones(map_cls_branch, map_num_pred) + self.map_reg_branches = _get_clones(map_reg_branch, map_num_pred) + else: + self.cls_branches = nn.ModuleList( + [cls_branch for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + self.traj_branches = nn.ModuleList( + [traj_branch for _ in range(motion_num_pred)]) + self.traj_cls_branches = nn.ModuleList( + [traj_cls_branch for _ in range(motion_num_pred)]) + self.map_cls_branches = nn.ModuleList( + [map_cls_branch for _ in range(map_num_pred)]) + self.map_reg_branches = nn.ModuleList( + [map_reg_branch for _ in range(map_num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + if self.map_query_embed_type == 'all_pts': + self.map_query_embedding = nn.Embedding(self.map_num_query, + self.embed_dims * 2) + elif self.map_query_embed_type == 'instance_pts': + self.map_query_embedding = None + self.map_instance_embedding = nn.Embedding(self.map_num_vec, self.embed_dims * 2) + self.map_pts_embedding = nn.Embedding(self.map_num_pts_per_vec, self.embed_dims * 2) + + if self.motion_decoder is not None: + self.motion_decoder = build_transformer_layer_sequence(self.motion_decoder) + self.motion_mode_query = nn.Embedding(self.fut_mode, self.embed_dims) + self.motion_mode_query.weight.requires_grad = True + if self.use_pe: + self.pos_mlp_sa = nn.Linear(2, self.embed_dims) + else: + raise NotImplementedError('Not implement yet') + + if self.motion_map_decoder is not None: + self.lane_encoder = LaneNet(256, 128, 3) + self.motion_map_decoder = build_transformer_layer_sequence(self.motion_map_decoder) + if self.use_pe: + self.pos_mlp = nn.Linear(2, self.embed_dims) + + if self.ego_his_encoder is not None: + self.ego_his_encoder = LaneNet(2, self.embed_dims//2, 3) + else: + self.ego_query = nn.Embedding(1, self.embed_dims) + + if self.ego_agent_decoder is not None: + self.ego_agent_decoder = build_transformer_layer_sequence(self.ego_agent_decoder) + if self.use_pe: + self.ego_agent_pos_mlp = nn.Linear(2, self.embed_dims) + + + + if self.ego_map_decoder is not None: + self.ego_map_decoder = build_transformer_layer_sequence(self.ego_map_decoder) + if self.use_pe: + self.ego_map_pos_mlp = nn.Linear(2, self.embed_dims) + + ego_fut_decoder = [] + ego_fut_dec_in_dim = self.embed_dims*2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims*2 + ego_fut_dec_in_dim = int(ego_fut_dec_in_dim * 2) + for _ in range(self.num_reg_fcs): + 
ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, ego_fut_dec_in_dim)) + ego_fut_decoder.append(nn.ReLU()) + ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, self.ego_fut_mode*2)) + self.ego_fut_decoder = nn.Sequential(*ego_fut_decoder) + + self.agent_fus_mlp = nn.Sequential( + nn.Linear(self.fut_mode*2*self.embed_dims, self.embed_dims, bias=True), + nn.LayerNorm(self.embed_dims), + nn.ReLU(), + nn.Linear(self.embed_dims, self.embed_dims, bias=True)) + + ######################################################### + self.ego_coord_mlp = nn.Linear(2, 2) + + self.layer_dim = 4 + self.state_gru = nn.GRU(input_size=32, hidden_size=512, num_layers=self.layer_dim) + + self.ego_gru = nn.GRU(512, 512, 4) + self.motion_gru = nn.GRU(512, 512, 4) + + # motion head + + traj_branch_ar = [] + for _ in range(self.num_reg_fcs): + traj_branch_ar.append(Linear(self.embed_dims*2, self.embed_dims*2)) + traj_branch_ar.append(nn.ReLU()) + traj_branch_ar.append(Linear(self.embed_dims*2, 2)) + traj_branch_ar = nn.Sequential(*traj_branch_ar) + + traj_cls_branch_ar = [] + for _ in range(self.num_reg_fcs): + traj_cls_branch_ar.append(Linear(self.embed_dims*2, self.embed_dims*2)) + traj_cls_branch_ar.append(nn.LayerNorm(self.embed_dims*2)) + traj_cls_branch_ar.append(nn.ReLU(inplace=True)) + traj_cls_branch_ar.append(Linear(self.embed_dims*2, self.traj_num_cls)) + traj_cls_branch_ar = nn.Sequential(*traj_cls_branch_ar) + + if self.with_box_refine: + self.traj_branches_ar = _get_clones(traj_branch_ar, motion_num_pred) + self.traj_cls_branches_ar = _get_clones(traj_cls_branch_ar, motion_num_pred) + else: + self.traj_branches_ar = nn.ModuleList( + [traj_branch_ar for _ in range(motion_num_pred)]) + self.traj_cls_branches_ar = nn.ModuleList( + [traj_cls_branch_ar for _ in range(motion_num_pred)]) + + + + + # planning head + ego_fut_decoder_ar = [] + ego_fut_dec_in_dim_ar = self.embed_dims*2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims*2 + for _ in range(self.num_reg_fcs): + ego_fut_decoder_ar.append(Linear(ego_fut_dec_in_dim_ar, ego_fut_dec_in_dim_ar)) + ego_fut_decoder_ar.append(nn.ReLU()) + ego_fut_decoder_ar.append(Linear(ego_fut_dec_in_dim_ar, self.ego_fut_mode*2)) + self.ego_fut_decoder_ar = nn.Sequential(*ego_fut_decoder_ar) + + self.ar = True + + # generator motion & planning + self.present_distribution_in_channels = 512 + self.future_distribution_in_channels = 524 + self.now_pred_in_channels = 64 + self.PROBABILISTIC = True + self.latent_dim = 32 + self.MIN_LOG_SIGMA = -5.0 + self.MAX_LOG_SIGMA = 5.0 + self.FUTURE_DIM = 6 + self.N_GRU_BLOCKS = 3 + self.N_RES_LAYERS = 3 + + self.present_distribution = DistributionModule( + self.present_distribution_in_channels, + self.latent_dim, + min_log_sigma=self.MIN_LOG_SIGMA, + max_log_sigma=self.MAX_LOG_SIGMA, + ) + + # future_distribution_in_channels = (self.future_pred_in_channels + # + 4 * self.FUTURE_DIM + # ) + self.future_distribution = DistributionModule( + self.future_distribution_in_channels, + self.latent_dim, + min_log_sigma=self.MIN_LOG_SIGMA, + max_log_sigma=self.MAX_LOG_SIGMA, + ) + + # Future prediction + self.future_prediction = FuturePrediction( + in_channels=self.present_distribution_in_channels, + latent_dim=self.latent_dim, + n_gru_blocks=self.N_GRU_BLOCKS, + n_res_layers=self.N_RES_LAYERS, + ) + + self.predict_model = PredictModel( + in_channels=self.latent_dim, + out_channels=self.embed_dims*2, + hidden_channels=self.latent_dim*4, + num_layers=self.layer_dim + ) + + + + + + def init_weights(self): + 
"""Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_map_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.map_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_traj_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.traj_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + # for m in self.map_reg_branches: + # constant_init(m[-1], 0, bias=0) + # nn.init.constant_(self.map_reg_branches[0][-1].bias.data[2:], 0.) + if self.motion_decoder is not None: + for p in self.motion_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + nn.init.orthogonal_(self.motion_mode_query.weight) + if self.use_pe: + xavier_init(self.pos_mlp_sa, distribution='uniform', bias=0.) + if self.motion_map_decoder is not None: + for p in self.motion_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for p in self.lane_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.use_pe: + xavier_init(self.pos_mlp, distribution='uniform', bias=0.) + if self.ego_his_encoder is not None: + for p in self.ego_his_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_agent_decoder is not None: + for p in self.ego_agent_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_map_decoder is not None: + for p in self.ego_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + # @auto_fp16(apply_to=('mlvl_feats')) + + + + # @auto_fp16(apply_to=('mlvl_feats')) + @force_fp32(apply_to=('mlvl_feats', 'prev_bev')) + def forward(self, + mlvl_feats, + img_metas, + prev_bev=None, + only_bev=False, + ego_his_trajs=None, + ego_lcf_feat=None, + gt_labels_3d=None, + gt_attr_labels=None, + ego_fut_trajs=None, + ): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
+ """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + + if self.map_query_embed_type == 'all_pts': + map_query_embeds = self.map_query_embedding.weight.to(dtype) + elif self.map_query_embed_type == 'instance_pts': + map_pts_embeds = self.map_pts_embedding.weight.unsqueeze(0) + map_instance_embeds = self.map_instance_embedding.weight.unsqueeze(1) + map_query_embeds = (map_pts_embeds + map_instance_embeds).flatten(0, 1).to(dtype) + + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + map_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + map_reg_branches=self.map_reg_branches if self.with_box_refine else None, # noqa:E501 + map_cls_branches=self.map_cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + # bev_embed: bev features + # hs: agent_query + # init_reference: reference points init + # inter_references: reference points processing + # map_hs: map_query + # map_init_reference: reference points init + # map_inter_references: reference points processing + + bev_embed, hs, init_reference, inter_references, \ + map_hs, map_init_reference, map_inter_references = outputs + + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + outputs_coords_bev = [] + outputs_trajs = [] + outputs_trajs_classes = [] + + map_hs = map_hs.permute(0, 2, 1, 3) + map_outputs_classes = [] + map_outputs_coords = [] + map_outputs_pts_coords = [] + map_outputs_coords_bev = [] + + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] = tmp[..., 0:2] + reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + outputs_coords_bev.append(tmp[..., 0:2].clone().detach()) + tmp[..., 4:5] = tmp[..., 4:5] + reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + for lvl in range(map_hs.shape[0]): + if lvl == 0: + reference = map_init_reference + else: + reference = map_inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + map_outputs_class = 
self.map_cls_branches[lvl]( + map_hs[lvl].view(bs,self.map_num_vec, self.map_num_pts_per_vec,-1).mean(2) + ) + tmp = self.map_reg_branches[lvl](map_hs[lvl]) + # TODO: check the shape of reference + assert reference.shape[-1] == 2 + tmp[..., 0:2] += reference[..., 0:2] + tmp = tmp.sigmoid() # cx,cy,w,h + map_outputs_coord, map_outputs_pts_coord = self.map_transform_box(tmp) + map_outputs_coords_bev.append(map_outputs_pts_coord.clone().detach()) + map_outputs_classes.append(map_outputs_class) + map_outputs_coords.append(map_outputs_coord) + map_outputs_pts_coords.append(map_outputs_pts_coord) + + # motion prediction + + #motion query + if self.motion_decoder is not None: + batch_size, num_agent = outputs_coords_bev[-1].shape[:2] + # motion_query + motion_query = hs[-1].permute(1, 0, 2) # [A, B, D] + mode_query = self.motion_mode_query.weight # [fut_mode, D] + # [M, B, D], M=A*fut_mode + motion_query = (motion_query[:, None, :, :] + mode_query[None, :, None, :]).flatten(0, 1) + if self.use_pe: + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_pos = self.pos_mlp_sa(motion_coords) # [B, A, D] + motion_pos = motion_pos.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + motion_pos = motion_pos.permute(1, 0, 2) # [M, B, D] + else: + motion_pos = None + + if self.motion_det_score is not None: + motion_score = outputs_classes[-1] + max_motion_score = motion_score.max(dim=-1)[0] + invalid_motion_idx = max_motion_score < self.motion_det_score # [B, A] + invalid_motion_idx = invalid_motion_idx.unsqueeze(2).repeat(1, 1, self.fut_mode).flatten(1, 2) + else: + invalid_motion_idx = None + + #ego query + # batch = batch_size + if self.ego_his_encoder is not None: + ego_his_feats = self.ego_his_encoder(ego_his_trajs) # [B, 1, dim] + else: + ego_his_feats = self.ego_query.weight.unsqueeze(0).repeat(batch_size, 1, 1) + # ego <-> agent Interaction + ego_query = ego_his_feats.permute(1, 0, 2) + ego_pos = torch.zeros((batch_size, 1, 2), device=ego_query.device).permute(1, 0, 2) + ego_pos_emb = self.ego_agent_pos_mlp(ego_pos) + + motion_query = torch.cat([motion_query, ego_query], dim=0) + motion_pos = torch.cat([motion_pos, ego_pos_emb], dim=0) + + motion_hs = self.motion_decoder( + query=motion_query, + key=motion_query, + value=motion_query, + query_pos=motion_pos, + key_pos=motion_pos, + key_padding_mask=invalid_motion_idx) + + if self.motion_map_decoder is not None: + # map preprocess + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_coords = motion_coords.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + + #ego_coords = torch.Tensor(1, 1, 2).cuda(1) + ego_coords = torch.zeros([batch_size, 1, 2], device=motion_hs.device) + ego_coords_embd = self.ego_coord_mlp(ego_coords) + # ego_coords_embd = torch.zeros([batch_size, 1, 2], device=motion_hs.device) + motion_coords = torch.cat([motion_coords, ego_coords_embd], dim=1) + + + map_query = map_hs[-1].view(batch_size, self.map_num_vec, self.map_num_pts_per_vec, -1) + map_query = self.lane_encoder(map_query) # [B, P, pts, D] -> [B, P, D] + map_score = map_outputs_classes[-1] + map_pos = map_outputs_coords_bev[-1] + map_query, map_pos, key_padding_mask = self.select_and_pad_pred_map( + motion_coords, map_query, map_score, map_pos, + map_thresh=self.map_thresh, dis_thresh=self.dis_thresh, + pe_normalization=self.pe_normalization, use_fix_pad=True) + map_query = map_query.permute(1, 0, 2) # [P, B*M, D] + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + # position encoding + if self.use_pe: + 
(num_query, batch) = ca_motion_query.shape[:2] + motion_pos = torch.zeros((num_query, batch, 2), device=motion_hs.device) + motion_pos = self.pos_mlp(motion_pos) + map_pos = map_pos.permute(1, 0, 2) + map_pos = self.pos_mlp(map_pos) + else: + motion_pos, map_pos = None, None + + ca_motion_query = self.motion_map_decoder( + query=ca_motion_query, + key=map_query, + value=map_query, + query_pos=motion_pos, + key_pos=map_pos, + key_padding_mask=key_padding_mask) + else: + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + ######################################## + # generator for planning & motion + current_states = torch.cat((motion_hs.permute(1, 0, 2), + ca_motion_query.reshape(batch_size, -1, self.embed_dims)), dim=2) + distribution_comp = {} + # states = torch.randn((2, 1, 64, 200, 200), device=motion_hs.device) + # future_distribution_inputs = torch.randn((2, 5, 6, 200, 200), device=motion_hs.device) + noise = None + if self.training: + future_distribution_inputs = self.get_future_labels(gt_labels_3d, gt_attr_labels, ego_fut_trajs, motion_hs.device) + else: + future_distribution_inputs = None + + # 1. model CVA distribution for state + if self.fut_ts > 0: + #present_state = states[:, :1].contiguous() + if self.PROBABILISTIC: + # Do probabilistic computation + sample, output_distribution = self.distribution_forward( + current_states, future_distribution_inputs, noise + ) + distribution_comp = {**distribution_comp, **output_distribution} + + # 2. predict future state from distribution + hidden_states = current_states + states_hs, future_states_hs = \ + self.future_states_predict(batch_size, sample, hidden_states, current_states) + + + ego_query_hs = \ + states_hs[:, :, self.agent_dim*self.fut_mode, :].unsqueeze(1).permute(0, 2, 1, 3) + motion_query_hs = states_hs[:, :, 0:self.agent_dim*self.fut_mode, :] + motion_query_hs = \ + motion_query_hs.reshape(self.fut_ts, batch_size, -1, self.fut_ts, motion_query_hs.shape[-1]) + ego_fut_trajs_list = [] + motion_fut_trajs_list = [] + for i in range(self.fut_ts): + outputs_ego_trajs = self.ego_fut_decoder(ego_query_hs[i]).reshape(batch_size, self.ego_fut_mode, 2) + ego_fut_trajs_list.append(outputs_ego_trajs) + outputs_agent_trajs = self.traj_branches[0](motion_query_hs[i]) + motion_fut_trajs_list.append(outputs_agent_trajs) + + ego_trajs = torch.stack(ego_fut_trajs_list, dim=2) + agent_trajs = torch.stack(motion_fut_trajs_list, dim=3).reshape(batch_size, 1, self.agent_dim, self.fut_mode, -1) + + motion_cls_hs = torch.cat((future_states_hs[:, :, 0:self.agent_dim*self.fut_mode, :]. + reshape(batch_size, self.agent_dim,self.fut_mode,-1), + current_states[:,0:self.agent_dim*self.fut_mode,:]. 
+ reshape(batch_size, self.agent_dim ,self.fut_mode,-1)), dim=-1) + + # outputs_traj_class = self.traj_cls_branches[0](motion_query_hs) + + # outputs_traj = self.traj_branches[0](motion_hs) + # outputs_trajs.append(outputs_traj) + outputs_traj_class = self.traj_cls_branches[0](motion_cls_hs) + outputs_trajs_classes.append(outputs_traj_class.squeeze(-1)) + + + map_outputs_classes = torch.stack(map_outputs_classes) + map_outputs_coords = torch.stack(map_outputs_coords) + map_outputs_pts_coords = torch.stack(map_outputs_pts_coords) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + outputs_trajs = agent_trajs.permute(1, 0, 2, 3, 4) + outputs_trajs_classes = torch.stack(outputs_trajs_classes) + + + + + + # print(future_states.shape) + + # Ego prediction + #ego_feats [1, 1, 512] + # outputs_ego_trajs = self.ego_fut_decoder(ego_feats) + # outputs_ego_trajs = outputs_ego_trajs.reshape(outputs_ego_trajs.shape[0], + # self.ego_fut_mode, self.fut_ts, 2) + + outs = { + 'bev_embed': bev_embed, #torch.Size([10000, 1, 256]) + 'all_cls_scores': outputs_classes, #torch.Size([3, 1, 300, 10]) + 'all_bbox_preds': outputs_coords, #torch.Size([3, 1, 300, 10]) + 'all_traj_preds': outputs_trajs.repeat(outputs_coords.shape[0], 1, 1, 1, 1), # torch.Size([3, 1, 300, 6, 12]) + 'all_traj_cls_scores': outputs_trajs_classes.repeat(outputs_coords.shape[0], 1, 1, 1), # torch.Size([3, 1, 300, 6]) + 'map_all_cls_scores': map_outputs_classes, #torch.Size([3, 1, 100, 3]) map_outputs_classes + 'map_all_bbox_preds': map_outputs_coords, #torch.Size([3, 1, 100, 4]) map_outputs_coords + 'map_all_pts_preds': map_outputs_pts_coords, #torch.Size([3, 1, 100, 20, 2]) + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + 'map_enc_cls_scores': None, + 'map_enc_bbox_preds': None, + 'map_enc_pts_preds': None, + 'ego_fut_preds': ego_trajs, # torch.Size([1, 3, 6, 2]) + 'loss_vae_gen': distribution_comp, + } + + return outs + + def map_transform_box(self, pts, y_first=False): + """ + Converting the points set into bounding box. + + Args: + pts: the input points sets (fields), each points + set (fields) is represented as 2n scalar. + y_first: if y_fisrt=True, the point set is represented as + [y1, x1, y2, x2 ... yn, xn], otherwise the point set is + represented as [x1, y1, x2, y2 ... xn, yn]. + Returns: + The bbox [cx, cy, w, h] transformed from points. + """ + pts_reshape = pts.view(pts.shape[0], self.map_num_vec, + self.map_num_pts_per_vec,2) + pts_y = pts_reshape[:, :, :, 0] if y_first else pts_reshape[:, :, :, 1] + pts_x = pts_reshape[:, :, :, 1] if y_first else pts_reshape[:, :, :, 0] + if self.map_transform_method == 'minmax': + # import pdb;pdb.set_trace() + + xmin = pts_x.min(dim=2, keepdim=True)[0] + xmax = pts_x.max(dim=2, keepdim=True)[0] + ymin = pts_y.min(dim=2, keepdim=True)[0] + ymax = pts_y.max(dim=2, keepdim=True)[0] + bbox = torch.cat([xmin, ymin, xmax, ymax], dim=2) + bbox = bbox_xyxy_to_cxcywh(bbox) + else: + raise NotImplementedError + return bbox, pts_reshape + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_attr_labels, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. 
+ bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 10]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 9) in [x,y,z,w,l,h,yaw,vx,vy] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_fut_trajs = gt_attr_labels[:, :self.fut_ts*2] + gt_fut_masks = gt_attr_labels[:, self.fut_ts*2:self.fut_ts*3] + gt_bbox_c = gt_bboxes.shape[-1] + num_gt_bbox, gt_traj_c = gt_fut_trajs.shape + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_bbox_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # trajs targets + traj_targets = torch.zeros((num_bboxes, gt_traj_c), dtype=torch.float32, device=bbox_pred.device) + traj_weights = torch.zeros_like(traj_targets) + traj_targets[pos_inds] = gt_fut_trajs[sampling_result.pos_assigned_gt_inds] + traj_weights[pos_inds] = 1.0 + + # Filter out invalid fut trajs + traj_masks = torch.zeros_like(traj_targets) # [num_bboxes, fut_ts*2] + gt_fut_masks = gt_fut_masks.unsqueeze(-1).repeat(1, 1, 2).view(num_gt_bbox, -1) # [num_gt_bbox, fut_ts*2] + traj_masks[pos_inds] = gt_fut_masks[sampling_result.pos_assigned_gt_inds] + traj_weights = traj_weights * traj_masks + + # Extra future timestamp mask for controlling pred horizon + fut_ts_mask = torch.zeros((num_bboxes, self.fut_ts, 2), + dtype=torch.float32, device=bbox_pred.device) + fut_ts_mask[:, :self.valid_fut_ts, :] = 1.0 + fut_ts_mask = fut_ts_mask.view(num_bboxes, -1) + traj_weights = traj_weights * fut_ts_mask + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + + return ( + labels, label_weights, bbox_targets, bbox_weights, traj_targets, + traj_weights, traj_masks.view(-1, self.fut_ts, 2)[..., 0], + pos_inds, neg_inds + ) + + def _map_get_target_single(self, + cls_score, + bbox_pred, + pts_pred, + gt_labels, + gt_bboxes, + gt_shifts_pts, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. 
+ gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + assign_result, order_index = self.map_assigner.assign(bbox_pred, cls_score, pts_pred, + gt_bboxes, gt_labels, gt_shifts_pts, + gt_bboxes_ignore) + + sampling_result = self.map_sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.map_num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + # pts targets + if order_index is None: + assigned_shift = gt_labels[sampling_result.pos_assigned_gt_inds] + else: + assigned_shift = order_index[sampling_result.pos_inds, sampling_result.pos_assigned_gt_inds] + pts_targets = pts_pred.new_zeros((pts_pred.size(0), + pts_pred.size(1), pts_pred.size(2))) + pts_weights = torch.zeros_like(pts_targets) + pts_weights[pos_inds] = 1.0 + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + pts_targets[pos_inds] = gt_shifts_pts[sampling_result.pos_assigned_gt_inds,assigned_shift,:,:] + return (labels, label_weights, bbox_targets, bbox_weights, + pts_targets, pts_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. 
+ - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, traj_targets_list, traj_weights_list, + gt_fut_masks_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_attr_labels_list, gt_bboxes_ignore_list + ) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, num_total_pos, num_total_neg) + + def map_get_targets(self, + cls_scores_list, + bbox_preds_list, + pts_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + pos_inds_list, neg_inds_list) = multi_apply( + self._map_get_target_single, cls_scores_list, bbox_preds_list,pts_preds_list, + gt_labels_list, gt_bboxes_list, gt_shifts_pts_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) + + + + def loss_planning(self, + ego_fut_preds, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + lane_preds, + lane_score_preds, + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds): + """"Loss function for ego vehicle planning. 
+ Args: + ego_fut_preds (Tensor): [B, ego_fut_mode, fut_ts, 2] + ego_fut_gt (Tensor): [B, fut_ts, 2] + ego_fut_masks (Tensor): [B, fut_ts] + ego_fut_cmd (Tensor): [B, ego_fut_mode] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + agent_preds (Tensor): [B, num_agent, 2] + agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2] + agent_score_preds (Tensor): [B, num_agent, 10] + agent_fut_cls_scores (Tensor): [B, num_agent, fut_mode] + Returns: + loss_plan_reg (Tensor): planning reg loss. + loss_plan_bound (Tensor): planning map boundary constraint loss. + loss_plan_col (Tensor): planning col constraint loss. + loss_plan_dir (Tensor): planning directional constraint loss. + """ + + ego_fut_gt = ego_fut_gt.unsqueeze(1).repeat(1, self.ego_fut_mode, 1, 1) + loss_plan_l1_weight = ego_fut_cmd[..., None, None] * ego_fut_masks[:, None, :, None] + loss_plan_l1_weight = loss_plan_l1_weight.repeat(1, 1, 1, 2) + + loss_plan_l1 = self.loss_plan_reg( + ego_fut_preds, + ego_fut_gt, + loss_plan_l1_weight + ) + + loss_plan_bound = self.loss_plan_bound( + ego_fut_preds[ego_fut_cmd==1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + loss_plan_col = self.loss_plan_col( + ego_fut_preds[ego_fut_cmd==1], + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds, + weight=ego_fut_masks[:, :, None].repeat(1, 1, 2) + ) + + loss_plan_dir = self.loss_plan_dir( + ego_fut_preds[ego_fut_cmd==1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_plan_l1 = torch.nan_to_num(loss_plan_l1) + loss_plan_bound = torch.nan_to_num(loss_plan_bound) + loss_plan_col = torch.nan_to_num(loss_plan_col) + loss_plan_dir = torch.nan_to_num(loss_plan_dir) + + loss_plan_dict = dict() + loss_plan_dict['loss_plan_reg'] = loss_plan_l1 + loss_plan_dict['loss_plan_bound'] = loss_plan_bound + loss_plan_dict['loss_plan_col'] = loss_plan_col + loss_plan_dict['loss_plan_dir'] = loss_plan_dir + + return loss_plan_dict + + def loss_single(self, + cls_scores, + bbox_preds, + traj_preds, + traj_cls_preds, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
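            (In this implementation the loss components are returned as a
            tuple (loss_cls, loss_bbox, loss_traj, loss_traj_cls) rather
            than a dict.)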
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_attr_labels_list, gt_bboxes_ignore_list) + + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + traj_targets = torch.cat(traj_targets_list, 0) + traj_weights = torch.cat(traj_weights_list, 0) + gt_fut_masks = torch.cat(gt_fut_masks_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], + normalized_bbox_targets[isnotnan, :10], + bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + + # traj regression loss + best_traj_preds = self.get_best_fut_preds( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), gt_fut_masks) + + neg_inds = (bbox_weights[:, 0] == 0) + traj_labels = self.get_traj_cls_target( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), + gt_fut_masks, neg_inds) + + loss_traj = self.loss_traj( + best_traj_preds[isnotnan], + traj_targets[isnotnan], + traj_weights[isnotnan], + avg_factor=num_total_pos) + + if self.use_traj_lr_warmup: + loss_scale_factor = get_traj_warmup_loss_weight(self.epoch, self.tot_epoch) + loss_traj = loss_scale_factor * loss_traj + + # traj classification loss + traj_cls_scores = traj_cls_preds.reshape(-1, self.fut_mode) + # construct weighted avg_factor to match with the official DETR repo + traj_cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.traj_bg_cls_weight + if self.sync_cls_avg_factor: + traj_cls_avg_factor = reduce_mean( + traj_cls_scores.new_tensor([traj_cls_avg_factor])) + + traj_cls_avg_factor = max(traj_cls_avg_factor, 1) + loss_traj_cls = self.loss_traj_cls( + traj_cls_scores, traj_labels, label_weights, avg_factor=traj_cls_avg_factor + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_traj = torch.nan_to_num(loss_traj) + loss_traj_cls = torch.nan_to_num(loss_traj_cls) + + return loss_cls, loss_bbox, loss_traj, loss_traj_cls + + def get_best_fut_preds(self, + traj_preds, + 
traj_targets, + gt_fut_masks): + """"Choose best preds among all modes. + Args: + traj_preds (Tensor): MultiModal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2). + traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2). + gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts). + pred_box_centers (Tensor): Pred box centers with shape (num_box_preds, 2). + gt_box_centers (Tensor): Ground truth box centers with shape (num_box_preds, 2). + + Returns: + best_traj_preds (Tensor): best traj preds (min displacement error with gt) + with shape (num_box_preds, fut_ts*2). + """ + + cum_traj_preds = traj_preds.cumsum(dim=-2) + cum_traj_targets = traj_targets.cumsum(dim=-2) + + # Get min pred mode indices. + # (num_box_preds, fut_mode, fut_ts) + dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1) + dist = dist * gt_fut_masks[:, None, :] + dist = dist[..., -1] + dist[torch.isnan(dist)] = dist[torch.isnan(dist)] * 0 + min_mode_idxs = torch.argmin(dist, dim=-1).tolist() + box_idxs = torch.arange(traj_preds.shape[0]).tolist() + best_traj_preds = traj_preds[box_idxs, min_mode_idxs, :, :].reshape(-1, self.fut_ts*2) + + return best_traj_preds + + def get_traj_cls_target(self, + traj_preds, + traj_targets, + gt_fut_masks, + neg_inds): + """"Get Trajectory mode classification target. + Args: + traj_preds (Tensor): MultiModal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2). + traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2). + gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts). + neg_inds (Tensor): Negtive indices with shape (num_box_preds,) + + Returns: + traj_labels (Tensor): traj cls labels (num_box_preds,). + """ + + cum_traj_preds = traj_preds.cumsum(dim=-2) + cum_traj_targets = traj_targets.cumsum(dim=-2) + + # Get min pred mode indices. + # (num_box_preds, fut_mode, fut_ts) + dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1) + dist = dist * gt_fut_masks[:, None, :] + dist = dist[..., -1] + dist[torch.isnan(dist)] = dist[torch.isnan(dist)] * 0 + traj_labels = torch.argmin(dist, dim=-1) + traj_labels[neg_inds] = self.fut_mode + + return traj_labels + + def map_loss_single(self, + cls_scores, + bbox_preds, + pts_preds, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_pts_list (list[Tensor]): Ground truth pts for each image + with shape (num_gts, fixed_num, 2) in [x,y] format. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
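+            Note: in this implementation the per-layer map losses are returned
+            as a tuple (loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir); the
+            loss dict is assembled later in loss().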
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + pts_preds_list = [pts_preds[i] for i in range(num_imgs)] + + cls_reg_targets = self.map_get_targets(cls_scores_list, bbox_preds_list,pts_preds_list, + gt_bboxes_list, gt_labels_list,gt_shifts_pts_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + pts_targets = torch.cat(pts_targets_list, 0) + pts_weights = torch.cat(pts_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.map_cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.map_bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_map_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_2d_bbox(bbox_targets, self.pc_range) + # normalized_bbox_targets = bbox_targets + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.map_code_weights + + loss_bbox = self.loss_map_bbox( + bbox_preds[isnotnan, :4], + normalized_bbox_targets[isnotnan,:4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + # regression pts CD loss + # num_samples, num_order, num_pts, num_coords + normalized_pts_targets = normalize_2d_pts(pts_targets, self.pc_range) + + # num_samples, num_pts, num_coords + pts_preds = pts_preds.reshape(-1, pts_preds.size(-2), pts_preds.size(-1)) + if self.map_num_pts_per_vec != self.map_num_pts_per_gt_vec: + pts_preds = pts_preds.permute(0,2,1) + pts_preds = F.interpolate(pts_preds, size=(self.map_num_pts_per_gt_vec), mode='linear', + align_corners=True) + pts_preds = pts_preds.permute(0,2,1).contiguous() + + loss_pts = self.loss_map_pts( + pts_preds[isnotnan,:,:], + normalized_pts_targets[isnotnan,:,:], + pts_weights[isnotnan,:,:], + avg_factor=num_total_pos) + + dir_weights = pts_weights[:, :-self.map_dir_interval,0] + denormed_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) + denormed_pts_preds_dir = denormed_pts_preds[:,self.map_dir_interval:,:] - \ + denormed_pts_preds[:,:-self.map_dir_interval,:] + pts_targets_dir = pts_targets[:, self.map_dir_interval:,:] - pts_targets[:,:-self.map_dir_interval,:] + + loss_dir = self.loss_map_dir( + denormed_pts_preds_dir[isnotnan,:,:], + pts_targets_dir[isnotnan,:,:], + dir_weights[isnotnan,:], + avg_factor=num_total_pos) + + bboxes = denormalize_2d_bbox(bbox_preds, self.pc_range) + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_map_iou( + bboxes[isnotnan, :4], + bbox_targets[isnotnan, :4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + 
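+            # torch.nan_to_num is only available from PyTorch 1.8 onwards; it
+            # replaces NaN/Inf loss values (e.g. from empty targets or fp16
+            # overflow) so a single bad term does not poison the backward pass.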
loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_iou = torch.nan_to_num(loss_iou) + loss_pts = torch.nan_to_num(loss_pts) + loss_dir = torch.nan_to_num(loss_dir) + + return loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir + + + + def distribution_loss(self, output): + kl_loss = self.loss_vae_gen(output) + return kl_loss + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + map_gt_bboxes_list, + map_gt_labels_list, + preds_dicts, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + gt_attr_labels, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
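+        # Rough outline of what follows: per-decoder-layer detection + motion
+        # losses, per-layer vectorized-map losses, planning losses taken from
+        # the last decoder layer only, and the latent-distribution (KL) loss
+        # from the generative branch.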
+ + map_gt_vecs_list = copy.deepcopy(map_gt_bboxes_list) + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + all_traj_preds = preds_dicts['all_traj_preds'] + all_traj_cls_scores = preds_dicts['all_traj_cls_scores'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + map_all_cls_scores = preds_dicts['map_all_cls_scores'] + map_all_bbox_preds = preds_dicts['map_all_bbox_preds'] + map_all_pts_preds = preds_dicts['map_all_pts_preds'] + map_enc_cls_scores = preds_dicts['map_enc_cls_scores'] + map_enc_bbox_preds = preds_dicts['map_enc_bbox_preds'] + map_enc_pts_preds = preds_dicts['map_enc_pts_preds'] + ego_fut_preds = preds_dicts['ego_fut_preds'] + distribution_pred = preds_dicts['loss_vae_gen'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_attr_labels_list = [gt_attr_labels for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox, loss_traj, loss_traj_cls = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, all_traj_preds, + all_traj_cls_scores, all_gt_bboxes_list, all_gt_labels_list, + all_gt_attr_labels_list, all_gt_bboxes_ignore_list) + + + num_dec_layers = len(map_all_cls_scores) + device = map_gt_labels_list[0].device + + map_gt_bboxes_list = [ + map_gt_bboxes.bbox.to(device) for map_gt_bboxes in map_gt_vecs_list] + map_gt_pts_list = [ + map_gt_bboxes.fixed_num_sampled_points.to(device) for map_gt_bboxes in map_gt_vecs_list] + if self.map_gt_shift_pts_pattern == 'v0': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v1': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v1.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v2': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v2.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v3': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v3.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v4': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v4.to(device) for gt_bboxes in map_gt_vecs_list] + else: + raise NotImplementedError + map_all_gt_bboxes_list = [map_gt_bboxes_list for _ in range(num_dec_layers)] + map_all_gt_labels_list = [map_gt_labels_list for _ in range(num_dec_layers)] + map_all_gt_pts_list = [map_gt_pts_list for _ in range(num_dec_layers)] + map_all_gt_shifts_pts_list = [map_gt_shifts_pts_list for _ in range(num_dec_layers)] + map_all_gt_bboxes_ignore_list = [ + map_gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + map_losses_cls, map_losses_bbox, map_losses_iou, \ + map_losses_pts, map_losses_dir = multi_apply( + self.map_loss_single, map_all_cls_scores, map_all_bbox_preds, + map_all_pts_preds, map_all_gt_bboxes_list, map_all_gt_labels_list, + map_all_gt_shifts_pts_list, map_all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + 
loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_traj'] = loss_traj[-1] + loss_dict['loss_traj_cls'] = loss_traj_cls[-1] + # loss from the last decoder layer + loss_dict['loss_map_cls'] = map_losses_cls[-1] + loss_dict['loss_map_bbox'] = map_losses_bbox[-1] + loss_dict['loss_map_iou'] = map_losses_iou[-1] + loss_dict['loss_map_pts'] = map_losses_pts[-1] + loss_dict['loss_map_dir'] = map_losses_dir[-1] + + # Planning Loss + ego_fut_gt = ego_fut_gt.squeeze(1) + ego_fut_masks = ego_fut_masks.squeeze(1).squeeze(1) + ego_fut_cmd = ego_fut_cmd.squeeze(1).squeeze(1) + + batch, num_agent = all_traj_preds[-1].shape[:2] + agent_fut_preds = all_traj_preds[-1].view(batch, num_agent, self.fut_mode, self.fut_ts, 2) + agent_fut_cls_preds = all_traj_cls_scores[-1].view(batch, num_agent, self.fut_mode) + loss_plan_input = [ego_fut_preds, ego_fut_gt, ego_fut_masks, ego_fut_cmd, + map_all_pts_preds[-1], map_all_cls_scores[-1].sigmoid(), + all_bbox_preds[-1][..., 0:2], agent_fut_preds, + all_cls_scores[-1].sigmoid(), agent_fut_cls_preds.sigmoid()] + + loss_planning_dict = self.loss_planning(*loss_plan_input) + loss_dict['loss_plan_reg'] = loss_planning_dict['loss_plan_reg'] + loss_dict['loss_plan_bound'] = loss_planning_dict['loss_plan_bound'] + loss_dict['loss_plan_col'] = loss_planning_dict['loss_plan_col'] + loss_dict['loss_plan_dir'] = loss_planning_dict['loss_plan_dir'] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + # loss from other decoder layers + num_dec_layer = 0 + for map_loss_cls_i, map_loss_bbox_i, map_loss_iou_i, map_loss_pts_i, map_loss_dir_i in zip( + map_losses_cls[:-1], + map_losses_bbox[:-1], + map_losses_iou[:-1], + map_losses_pts[:-1], + map_losses_dir[:-1] + ): + loss_dict[f'd{num_dec_layer}.loss_map_cls'] = map_loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_map_bbox'] = map_loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_map_iou'] = map_loss_iou_i + loss_dict[f'd{num_dec_layer}.loss_map_pts'] = map_loss_pts_i + loss_dict[f'd{num_dec_layer}.loss_map_dir'] = map_loss_dir_i + num_dec_layer += 1 + + # loss of proposal generated from encode feature map. 
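+        # The enc_* predictions are only provided in the two-stage setting
+        # (see the docstring above); otherwise they are None and these
+        # branches are skipped.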
+ if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, + gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + if map_enc_cls_scores is not None: + map_binary_labels_list = [ + torch.zeros_like(map_gt_labels_list[i]) + for i in range(len(map_all_gt_labels_list)) + ] + # TODO bug here, but we dont care enc_loss now + map_enc_loss_cls, map_enc_loss_bbox, map_enc_loss_iou, \ + map_enc_loss_pts, map_enc_loss_dir = \ + self.map_loss_single( + map_enc_cls_scores, map_enc_bbox_preds, + map_enc_pts_preds, map_gt_bboxes_list, + map_binary_labels_list, map_gt_pts_list, + map_gt_bboxes_ignore + ) + loss_dict['enc_loss_map_cls'] = map_enc_loss_cls + loss_dict['enc_loss_map_bbox'] = map_enc_loss_bbox + loss_dict['enc_loss_map_iou'] = map_enc_loss_iou + loss_dict['enc_loss_map_pts'] = map_enc_loss_pts + loss_dict['enc_loss_map_dir'] = map_enc_loss_dir + + loss_dict['loss_vae_gen'] = self.loss_vae_gen(distribution_pred) + + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. + """ + + det_preds_dicts = self.bbox_coder.decode(preds_dicts) + # map_bboxes: xmin, ymin, xmax, ymax + map_preds_dicts = self.map_bbox_coder.decode(preds_dicts) + + num_samples = len(det_preds_dicts) + assert len(det_preds_dicts) == len(map_preds_dicts), \ + 'len(preds_dict) should be equal to len(map_preds_dicts)' + ret_list = [] + for i in range(num_samples): + preds = det_preds_dicts[i] + bboxes = preds['bboxes'] + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + trajs = preds['trajs'] + + map_preds = map_preds_dicts[i] + map_bboxes = map_preds['map_bboxes'] + map_scores = map_preds['map_scores'] + map_labels = map_preds['map_labels'] + map_pts = map_preds['map_pts'] + + ret_list.append([bboxes, scores, labels, trajs, map_bboxes, + map_scores, map_labels, map_pts]) + + return ret_list + + def select_and_pad_pred_map( + self, + motion_pos, + map_query, + map_score, + map_pos, + map_thresh=0.5, + dis_thresh=None, + pe_normalization=True, + use_fix_pad=False + ): + """select_and_pad_pred_map. + Args: + motion_pos: [B, A, 2] + map_query: [B, P, D]. + map_score: [B, P, 3]. + map_pos: [B, P, pts, 2]. + map_thresh: map confidence threshold for filtering low-confidence preds + dis_thresh: distance threshold for masking far maps for each agent in cross-attn + use_fix_pad: always pad one lane instance for each batch + Returns: + selected_map_query: [B*A, P1(+1), D], P1 is the max inst num after filter and pad. 
+ selected_map_pos: [B*A, P1(+1), 2] + selected_padding_mask: [B*A, P1(+1)] + """ + + if dis_thresh is None: + raise NotImplementedError('Not implement yet') + + # use the most close pts pos in each map inst as the inst's pos + batch, num_map = map_pos.shape[:2] + map_dis = torch.sqrt(map_pos[..., 0]**2 + map_pos[..., 1]**2) + min_map_pos_idx = map_dis.argmin(dim=-1).flatten() # [B*P] + min_map_pos = map_pos.flatten(0, 1) # [B*P, pts, 2] + min_map_pos = min_map_pos[range(min_map_pos.shape[0]), min_map_pos_idx] # [B*P, 2] + min_map_pos = min_map_pos.view(batch, num_map, 2) # [B, P, 2] + + # select & pad map vectors for different batch using map_thresh + map_score = map_score.sigmoid() + map_max_score = map_score.max(dim=-1)[0] + map_idx = map_max_score > map_thresh + batch_max_pnum = 0 + for i in range(map_score.shape[0]): + pnum = map_idx[i].sum() + if pnum > batch_max_pnum: + batch_max_pnum = pnum + + selected_map_query, selected_map_pos, selected_padding_mask = [], [], [] + for i in range(map_score.shape[0]): + dim = map_query.shape[-1] + valid_pnum = map_idx[i].sum() + valid_map_query = map_query[i, map_idx[i]] + valid_map_pos = min_map_pos[i, map_idx[i]] + pad_pnum = batch_max_pnum - valid_pnum + padding_mask = torch.tensor([False], device=map_score.device).repeat(batch_max_pnum) + if pad_pnum != 0: + valid_map_query = torch.cat([valid_map_query, torch.zeros((pad_pnum, dim), device=map_score.device)], dim=0) + valid_map_pos = torch.cat([valid_map_pos, torch.zeros((pad_pnum, 2), device=map_score.device)], dim=0) + padding_mask[valid_pnum:] = True + selected_map_query.append(valid_map_query) + selected_map_pos.append(valid_map_pos) + selected_padding_mask.append(padding_mask) + + selected_map_query = torch.stack(selected_map_query, dim=0) + selected_map_pos = torch.stack(selected_map_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + # generate different pe for map vectors for each agent + num_agent = motion_pos.shape[1] + selected_map_query = selected_map_query.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, D] + selected_map_pos = selected_map_pos.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, 2] + selected_padding_mask = selected_padding_mask.unsqueeze(1).repeat(1, num_agent, 1) # [B, A, max_P] + # move lane to per-car coords system + selected_map_dist = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + if pe_normalization: + selected_map_pos = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + + # filter far map inst for each agent + map_dis = torch.sqrt(selected_map_dist[..., 0]**2 + selected_map_dist[..., 1]**2) + valid_map_inst = (map_dis <= dis_thresh) # [B, A, max_P] + invalid_map_inst = (valid_map_inst == False) + selected_padding_mask = selected_padding_mask + invalid_map_inst + + selected_map_query = selected_map_query.flatten(0, 1) + selected_map_pos = selected_map_pos.flatten(0, 1) + selected_padding_mask = selected_padding_mask.flatten(0, 1) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_map_query.shape[-1] + if use_fix_pad: + pad_map_query = torch.zeros((num_batch, 1, feat_dim), device=selected_map_query.device) + pad_map_pos = torch.ones((num_batch, 1, 2), device=selected_map_pos.device) + pad_lane_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_map_query = torch.cat([selected_map_query, pad_map_query], dim=1) + selected_map_pos = torch.cat([selected_map_pos, pad_map_pos], dim=1) + selected_padding_mask = 
torch.cat([selected_padding_mask, pad_lane_mask], dim=1) + + return selected_map_query, selected_map_pos, selected_padding_mask + + + def select_and_pad_query( + self, + query, + query_pos, + query_score, + score_thresh=0.5, + use_fix_pad=True + ): + """select_and_pad_query. + Args: + query: [B, Q, D]. + query_pos: [B, Q, 2] + query_score: [B, Q, C]. + score_thresh: confidence threshold for filtering low-confidence query + use_fix_pad: always pad one query instance for each batch + Returns: + selected_query: [B, Q', D] + selected_query_pos: [B, Q', 2] + selected_padding_mask: [B, Q'] + """ + + # select & pad query for different batch using score_thresh + query_score = query_score.sigmoid() + query_score = query_score.max(dim=-1)[0] + query_idx = query_score > score_thresh + batch_max_qnum = 0 + for i in range(query_score.shape[0]): + qnum = query_idx[i].sum() + if qnum > batch_max_qnum: + batch_max_qnum = qnum + + selected_query, selected_query_pos, selected_padding_mask = [], [], [] + for i in range(query_score.shape[0]): + dim = query.shape[-1] + valid_qnum = query_idx[i].sum() + valid_query = query[i, query_idx[i]] + valid_query_pos = query_pos[i, query_idx[i]] + pad_qnum = batch_max_qnum - valid_qnum + padding_mask = torch.tensor([False], device=query_score.device).repeat(batch_max_qnum) + if pad_qnum != 0: + valid_query = torch.cat([valid_query, torch.zeros((pad_qnum, dim), device=query_score.device)], dim=0) + valid_query_pos = torch.cat([valid_query_pos, torch.zeros((pad_qnum, 2), device=query_score.device)], dim=0) + padding_mask[valid_qnum:] = True + selected_query.append(valid_query) + selected_query_pos.append(valid_query_pos) + selected_padding_mask.append(padding_mask) + + selected_query = torch.stack(selected_query, dim=0) + selected_query_pos = torch.stack(selected_query_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_query.shape[-1] + if use_fix_pad: + pad_query = torch.zeros((num_batch, 1, feat_dim), device=selected_query.device) + pad_query_pos = torch.ones((num_batch, 1, 2), device=selected_query_pos.device) + pad_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_query = torch.cat([selected_query, pad_query], dim=1) + selected_query_pos = torch.cat([selected_query_pos, pad_query_pos], dim=1) + selected_padding_mask = torch.cat([selected_padding_mask, pad_mask], dim=1) + + return selected_query, selected_query_pos, selected_padding_mask + + + + def distribution_forward(self, present_features, future_distribution_inputs=None, noise=None): + """ + Parameters + ---------- + present_features: 5-D output from dynamics module with shape (b, 1, c, h, w) + future_distribution_inputs: 5-D tensor containing labels shape (b, s, cfg.PROB_FUTURE_DIM, h, w) + noise: a sample from a (0, 1) gaussian with shape (b, s, latent_dim). 
If None, will sample in function + + Returns + ------- + sample: sample taken from present/future distribution, broadcast to shape (b, s, latent_dim, h, w) + present_distribution_mu: shape (b, s, latent_dim) + present_distribution_log_sigma: shape (b, s, latent_dim) + future_distribution_mu: shape (b, s, latent_dim) + future_distribution_log_sigma: shape (b, s, latent_dim) + """ + + b = present_features.shape[0] + c = present_features.shape[1] + present_mu, present_log_sigma = self.present_distribution(present_features) + + future_mu, future_log_sigma = None, None + if future_distribution_inputs is not None: + # Concatenate future labels to z_t + # future_features = future_distribution_inputs[:, 1:].contiguous().view(b, 1, -1, h, w) + future_features = torch.cat([present_features, future_distribution_inputs], dim=2) + future_mu, future_log_sigma = self.future_distribution(future_features) + + if noise is None: + if self.training: + noise = torch.randn_like(present_mu) + else: + noise = torch.zeros_like(present_mu) + if self.training: + mu = future_mu + sigma = torch.exp(future_log_sigma) + else: + mu = present_mu + sigma = torch.exp(present_log_sigma) + sample = mu + sigma * noise + + # Spatially broadcast sample to the dimensions of present_features + sample = sample.permute(0, 2, 1).expand(b, self.latent_dim, c) + + output_distribution = { + 'present_mu': present_mu, + 'present_log_sigma': present_log_sigma, + 'future_mu': future_mu, + 'future_log_sigma': future_log_sigma, + } + + return sample, output_distribution + + def get_future_labels(self, gt_labels_3d, gt_attr_labels, ego_fut_trajs, device): + + agent_dim = 300 + veh_list = [0,1,3,4] + mapped_class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', + 'trailer', 'barrier', 'motorcycle', 'bicycle', + 'pedestrian', 'traffic_cone' + ] + ignore_list = ['construction_vehicle', 'barrier', + 'traffic_cone', 'motorcycle', 'bicycle'] + + batch_size = len(gt_labels_3d) + + # gt_label = gt_labels_3d[0] + # gt_attr_label = gt_attr_labels[0] + + gt_fut_trajs_bz_list = [] + + for bz in range(batch_size): + gt_fut_trajs_list = [] + gt_label = gt_labels_3d[bz] + gt_attr_label = gt_attr_labels[bz] + for i in range(gt_label.shape[0]): + gt_label[i] = 0 if gt_label[i] in veh_list else gt_label[i] + box_name = mapped_class_names[gt_label[i]] + if box_name in ignore_list: + continue + gt_fut_masks = gt_attr_label[i][self.fut_ts * 2:self.fut_ts * 3] + num_valid_ts = sum(gt_fut_masks == 1) + gt_fut_traj = gt_attr_label[i][:self.fut_ts * 2].reshape(-1, 2) + gt_fut_traj = gt_fut_traj[:num_valid_ts] + if gt_fut_traj.shape[0] == 0: + gt_fut_traj = torch.zeros([self.fut_ts - gt_fut_traj.shape[0], 2], device=device) + if gt_fut_traj.shape[0] < self.fut_ts: + gt_fut_traj = torch.cat((gt_fut_traj, torch.zeros([self.fut_ts - gt_fut_traj.shape[0], 2], device=device)), 0) + gt_fut_trajs_list.append(gt_fut_traj) + + + if len(gt_fut_trajs_list) != 0 & len(gt_fut_trajs_list) < agent_dim: + gt_fut_trajs = torch.cat( + (torch.stack(gt_fut_trajs_list), torch.zeros([agent_dim - len(gt_fut_trajs_list), self.fut_ts, 2], device=device)), 0) + else: + gt_fut_trajs = torch.zeros([agent_dim, self.fut_ts, 2], device=device) + + gt_fut_trajs_bz_list.append(gt_fut_trajs) + + if len(gt_fut_trajs_bz_list) != 0: + gt_trajs = torch.cat((torch.stack(gt_fut_trajs_bz_list).repeat(1, 6, 1, 1), ego_fut_trajs), dim=1) + else: + gt_trajs = ego_fut_trajs + #future_states = gt_trajs.reshape(batch_size, gt_trajs.shape[1], -1) + + + + # [bz, a, t, 2] + return 
gt_trajs.reshape(batch_size, gt_trajs.shape[1], -1) + + + def future_states_predict(self, batch_size, sample, hidden_states, current_states): + + future_prediction_input = sample.unsqueeze(0).expand(self.fut_ts, -1, -1, -1) + # + # future_states = self.future_prediction(future_prediction_input, hidden_state) + future_prediction_input = future_prediction_input.reshape(self.fut_ts, -1, self.latent_dim) + + hidden_state = hidden_states.reshape(self.layer_dim, -1, int(self.embed_dims/2)) + # future_states, future_hidden = self.state_gru(future_prediction_input, hidden_state) + future_states = self.predict_model(future_prediction_input, hidden_state) + + current_states_hs = current_states.unsqueeze(0).repeat(6, 1, 1, 1) + future_states_hs = future_states.reshape(self.fut_ts, batch_size, -1, future_states.shape[2]) + states_hs = torch.cat((current_states_hs, future_states_hs), dim=-1) + + return states_hs, future_states_hs + + + + + diff --git a/GenAD-main/projects/mmdet3d_plugin/models/backbones/__init__.py b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cea72f55c89cbc3d57bc9ae58e74144b27cc0530 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__init__.py @@ -0,0 +1,3 @@ +from .vovnet import VoVNet + +__all__ = ['VoVNet'] \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91c9b0f6b6a7c9524a3e8f960a2ec1b4fadbaa56 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/vovnet.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/vovnet.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae0009ed23e90f10d68e2d26b85cc595938a2ae2 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/backbones/__pycache__/vovnet.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/backbones/vovnet.py b/GenAD-main/projects/mmdet3d_plugin/models/backbones/vovnet.py new file mode 100644 index 0000000000000000000000000000000000000000..879d186a37b49addaf27362cc6ae1e5465b2168e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/backbones/vovnet.py @@ -0,0 +1,375 @@ + +from collections import OrderedDict +from mmcv.runner import BaseModule +from mmdet.models.builder import BACKBONES +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.batchnorm import _BatchNorm + + +VoVNet19_slim_dw_eSE = { + 'stem': [64, 64, 64], + 'stage_conv_ch': [64, 80, 96, 112], + 'stage_out_ch': [112, 256, 384, 512], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": True +} + +VoVNet19_dw_eSE = { + 'stem': [64, 64, 64], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": True +} + +VoVNet19_slim_eSE = { + 'stem': [64, 64, 128], + 'stage_conv_ch': [64, 80, 96, 112], + 'stage_out_ch': [112, 256, 384, 512], + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + "dw": False +} + +VoVNet19_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + 
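+    # Each spec describes: stem channel widths ("stem"), per-stage OSA conv
+    # channels ("stage_conv_ch"), per-stage concatenated output channels
+    # ("stage_out_ch"), conv layers per OSA block, OSA blocks per stage, and
+    # whether eSE attention ("eSE") and depthwise convs ("dw") are used.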
"stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": False +} + +VoVNet39_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 1, 2, 2], + "eSE": True, + "dw": False +} + +VoVNet57_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 1, 4, 3], + "eSE": True, + "dw": False +} + +VoVNet99_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 3, 9, 3], + "eSE": True, + "dw": False +} + +_STAGE_SPECS = { + "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE, + "V-19-dw-eSE": VoVNet19_dw_eSE, + "V-19-slim-eSE": VoVNet19_slim_eSE, + "V-19-eSE": VoVNet19_eSE, + "V-39-eSE": VoVNet39_eSE, + "V-57-eSE": VoVNet57_eSE, + "V-99-eSE": VoVNet99_eSE, +} + + +def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): + """3x3 convolution with padding""" + return [ + ( + '{}_{}/dw_conv3x3'.format(module_name, postfix), + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=out_channels, + bias=False + ) + ), + ( + '{}_{}/pw_conv1x1'.format(module_name, postfix), + nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) + ), + ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), + ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), + ] + + +def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): + """3x3 convolution with padding""" + return [ + ( + f"{module_name}_{postfix}/conv", + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ), + (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), + (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + ] + + +def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): + """1x1 convolution with padding""" + return [ + ( + f"{module_name}_{postfix}/conv", + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ), + (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), + (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + ] + + +class Hsigmoid(nn.Module): + def __init__(self, inplace=True): + super(Hsigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + return F.relu6(x + 3.0, inplace=self.inplace) / 6.0 + + +class eSEModule(nn.Module): + def __init__(self, channel, reduction=4): + super(eSEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0) + self.hsigmoid = Hsigmoid() + + def forward(self, x): + input = x + x = self.avg_pool(x) + x = self.fc(x) + x = self.hsigmoid(x) + return input * x + + +class _OSA_module(nn.Module): + def __init__( + self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False + ): + + super(_OSA_module, self).__init__() + + self.identity = identity + self.depthwise = depthwise + self.isReduced = False + self.layers = 
nn.ModuleList() + in_channel = in_ch + if self.depthwise and in_channel != stage_ch: + self.isReduced = True + self.conv_reduction = nn.Sequential( + OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0")) + ) + for i in range(layer_per_block): + if self.depthwise: + self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i)))) + else: + self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) + in_channel = stage_ch + + # feature aggregation + in_channel = in_ch + layer_per_block * stage_ch + self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) + + self.ese = eSEModule(concat_ch) + + def forward(self, x): + + identity_feat = x + + output = [] + output.append(x) + if self.depthwise and self.isReduced: + x = self.conv_reduction(x) + for layer in self.layers: + x = layer(x) + output.append(x) + + x = torch.cat(output, dim=1) + xt = self.concat(x) + + xt = self.ese(xt) + + if self.identity: + xt = xt + identity_feat + + return xt + + +class _OSA_stage(nn.Sequential): + def __init__( + self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False + ): + + super(_OSA_stage, self).__init__() + + if not stage_num == 2: + self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) + + if block_per_stage != 1: + SE = False + module_name = f"OSA{stage_num}_1" + self.add_module( + module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) + ) + for i in range(block_per_stage - 1): + if i != block_per_stage - 2: # last block + SE = False + module_name = f"OSA{stage_num}_{i + 2}" + self.add_module( + module_name, + _OSA_module( + concat_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE, + identity=True, + depthwise=depthwise + ), + ) + + +@BACKBONES.register_module() +class VoVNet(BaseModule): + def __init__(self, spec_name, input_ch=3, out_features=None, + frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): + """ + Args: + input_ch(int) : the number of input channel + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "stage2" ... 
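+            frozen_stages (int): Stages to be frozen (stop grad and set eval
+                mode). -1 means not freezing any parameters. Default: -1.
+            norm_eval (bool): Whether to set norm layers to eval mode, i.e.
+                freeze running stats (mean and var). Default: True.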
+ """ + super(VoVNet, self).__init__(init_cfg) + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + stage_specs = _STAGE_SPECS[spec_name] + + stem_ch = stage_specs["stem"] + config_stage_ch = stage_specs["stage_conv_ch"] + config_concat_ch = stage_specs["stage_out_ch"] + block_per_stage = stage_specs["block_per_stage"] + layer_per_block = stage_specs["layer_per_block"] + SE = stage_specs["eSE"] + depthwise = stage_specs["dw"] + + self._out_features = out_features + + # Stem module + conv_type = dw_conv3x3 if depthwise else conv3x3 + stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) + stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) + stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) + self.add_module("stem", nn.Sequential((OrderedDict(stem)))) + current_stirde = 4 + self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} + self._out_feature_channels = {"stem": stem_ch[2]} + + stem_out_ch = [stem_ch[2]] + in_ch_list = stem_out_ch + config_concat_ch[:-1] + # OSA stages + self.stage_names = [] + for i in range(4): # num_stages + name = "stage%d" % (i + 2) # stage 2 ... stage 5 + self.stage_names.append(name) + self.add_module( + name, + _OSA_stage( + in_ch_list[i], + config_stage_ch[i], + config_concat_ch[i], + block_per_stage[i], + layer_per_block, + i + 2, + SE, + depthwise, + ), + ) + + self._out_feature_channels[name] = config_concat_ch[i] + if not i == 0: + self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) + + # initialize weights + # self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + + def forward(self, x): + outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name in self.stage_names: + x = getattr(self, name)(x) + if name in self._out_features: + outputs[name] = x + + return outputs + + def _freeze_stages(self): + if self.frozen_stages >= 0: + m = getattr(self, 'stem') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'stage{i+1}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(VoVNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/hooks/__init__.py b/GenAD-main/projects/mmdet3d_plugin/models/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..93b13c9c853d6f7eece8ae2dc7aa67d4e87db68b --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/hooks/__init__.py @@ -0,0 +1 @@ +from .hooks import GradChecker \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/hooks/hooks.py b/GenAD-main/projects/mmdet3d_plugin/models/hooks/hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..56ff7fd575c890e60ce49eb618df157b2cc2ca37 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/hooks/hooks.py @@ -0,0 +1,13 @@ +from 
mmcv.runner.hooks.hook import HOOKS, Hook +from projects.mmdet3d_plugin.models.utils import run_time + + +@HOOKS.register_module() +class GradChecker(Hook): + + def after_train_iter(self, runner): + for key, val in runner.model.named_parameters(): + if val.grad == None and val.requires_grad: + print('WARNNING: {key}\'s parameters are not be used!!!!'.format(key=key)) + + diff --git a/GenAD-main/projects/mmdet3d_plugin/models/opt/__init__.py b/GenAD-main/projects/mmdet3d_plugin/models/opt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7dd426868a61772bbe0926e435ce89f15009805 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/opt/__init__.py @@ -0,0 +1 @@ +from .adamw import AdamW2 \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0caad3f875c313e457a7bf915c801f2de5548680 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/adamw.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/adamw.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..114bab1eb6063f7479fc71e7c37748b6f26e792b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/opt/__pycache__/adamw.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/opt/adamw.py b/GenAD-main/projects/mmdet3d_plugin/models/opt/adamw.py new file mode 100644 index 0000000000000000000000000000000000000000..c890aeaf04721580c11ca329f2be09a6a280f773 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/opt/adamw.py @@ -0,0 +1,131 @@ +try: + from torch.optim import _functional as F +except: + print('WARNING!!!, I recommend using torch>=1.8') + +import torch +from torch.optim.optimizer import Optimizer +from mmcv.runner.optimizer.builder import OPTIMIZERS + +@OPTIMIZERS.register_module() +class AdamW2(Optimizer): + r"""Implements AdamW algorithm. Solve the bug of torch 1.8 + + The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. + The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 1e-2) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. 
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW2, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW2, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + + # put this line here for solving bug + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + F.adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad, + beta1, + beta2, + group['lr'], + group['weight_decay'], + group['eps']) + + return loss \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__init__.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..96e2423a7dff3be63b65178827c2bab4dd5c398d --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/__init__.py @@ -0,0 +1,6 @@ + +from .bricks import run_time +from .grid_mask import GridMask +from .position_embedding import RelPositionEmbedding +from .visual import save_tensor +from .embed import PatchEmbed \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/__init__.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81f413b0efef621e6f19ab2a48ac4bb93040df7a Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/bricks.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/bricks.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..828028b1afbc40652c78b6546de08207f09a0b05 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/bricks.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/embed.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/embed.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55363d515578bb6d682a1258873e479f2e73f197 Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/embed.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/grid_mask.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/grid_mask.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4e75ae91e3296dd493acd20964edec62477c26b Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/grid_mask.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/position_embedding.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/position_embedding.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec80fc4ccae70264b92ea6ac20b2fd80984913ad Binary files /dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/position_embedding.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/visual.cpython-38.pyc b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/visual.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6376205ba2361a2131109d43e0ce73eca0fe87c Binary files 
/dev/null and b/GenAD-main/projects/mmdet3d_plugin/models/utils/__pycache__/visual.cpython-38.pyc differ diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/bricks.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/bricks.py new file mode 100644 index 0000000000000000000000000000000000000000..fd458813d9ffced23b79799daa84150ba887774e --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/bricks.py @@ -0,0 +1,20 @@ +import functools +import time +from collections import defaultdict +import torch +time_maps = defaultdict(lambda :0.) +count_maps = defaultdict(lambda :0.) +def run_time(name): + def middle(fn): + def wrapper(*args, **kwargs): + torch.cuda.synchronize() + start = time.time() + res = fn(*args, **kwargs) + torch.cuda.synchronize() + time_maps['%s : %s'%(name, fn.__name__) ] += time.time()-start + count_maps['%s : %s'%(name, fn.__name__) ] +=1 + print("%s : %s takes up %f "% (name, fn.__name__,time_maps['%s : %s'%(name, fn.__name__) ] /count_maps['%s : %s'%(name, fn.__name__) ] )) + return res + return wrapper + return middle + \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/embed.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/embed.py new file mode 100644 index 0000000000000000000000000000000000000000..2dbebfe41d7138baaaf112b815b44143fce7a170 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/embed.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner.base_module import BaseModule +from torch.nn.modules.utils import _pair as to_2tuple + + +# Modified from Pytorch-Image-Models +class PatchEmbed(BaseModule): + """Image to Patch Embedding V2. + + We use a conv layer to implement PatchEmbed. + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (dict, optional): The config dict for conv layers type + selection. Default: None. + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: None (Default to be equal with kernel_size). + padding (int): The padding length of embedding conv. Default: 0. + dilation (int): The dilation rate of embedding conv. Default: 1. + pad_to_patch_size (bool, optional): Whether to pad feature map shape + to multiple patch size. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type=None, + kernel_size=16, + stride=16, + padding=0, + dilation=1, + pad_to_patch_size=True, + norm_cfg=None, + init_cfg=None): + super(PatchEmbed, self).__init__() + + self.embed_dims = embed_dims + self.init_cfg = init_cfg + + if stride is None: + stride = kernel_size + + self.pad_to_patch_size = pad_to_patch_size + + # The default setting of patch size is equal to kernel size. 
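+        # patch_size is normalized to an (h, w) tuple below; when
+        # pad_to_patch_size is True, forward() pads the input so that H and W
+        # become multiples of the patch size before projection.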
+ patch_size = kernel_size + if isinstance(patch_size, int): + patch_size = to_2tuple(patch_size) + elif isinstance(patch_size, tuple): + if len(patch_size) == 1: + patch_size = to_2tuple(patch_size[0]) + assert len(patch_size) == 2, \ + f'The size of patch should have length 1 or 2, ' \ + f'but got {len(patch_size)}' + + self.patch_size = patch_size + + # Use conv layer to embed + conv_type = conv_type or 'Conv2d' + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + def forward(self, x): + H, W = x.shape[2], x.shape[3] + + # TODO: Process overlapping op + if self.pad_to_patch_size: + # Modify H, W to multiple of patch size. + if H % self.patch_size[0] != 0: + x = F.pad( + x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + if W % self.patch_size[1] != 0: + x = F.pad( + x, (0, self.patch_size[1] - W % self.patch_size[1], 0, 0)) + + x = self.projection(x) + self.DH, self.DW = x.shape[2], x.shape[3] + x = x.flatten(2).transpose(1, 2) + + if self.norm is not None: + x = self.norm(x) + + return x \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/grid_mask.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/grid_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..77f77b2314176bf416c447913bcfb482baab02e8 --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/grid_mask.py @@ -0,0 +1,125 @@ +import torch +import torch.nn as nn +import numpy as np +from PIL import Image +from mmcv.runner import force_fp32, auto_fp16 + +class Grid(object): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode=mode + self.st_prob = prob + self.prob = prob + + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch + + def __call__(self, img, label): + if np.random.rand() > self.prob: + return img, label + h = img.size(1) + w = img.size(2) + self.d1 = 2 + self.d2 = min(h, w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(self.d1, self.d2) + if self.ratio == 1: + self.l = np.random.randint(1, d) + else: + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + mask = torch.from_numpy(mask).float() + if self.mode == 1: + mask = 1-mask + + mask = mask.expand_as(img) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() + offset = (1 - mask) * offset + img = img * mask + offset + else: + img = img * mask + + return img, label + + +class GridMask(nn.Module): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = 
offset + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + self.fp16_enable = False + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 + @auto_fp16() + def forward(self, x): + if np.random.rand() > self.prob or not self.training: + return x + n,c,h,w = x.size() + x = x.view(-1,h,w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(2, h) + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + # mask = torch.from_numpy(mask).to(x.dtype).cuda() + mask = torch.from_numpy(mask).to(x.dtype).to(x.device) + if self.mode == 1: + mask = 1-mask + mask = mask.expand_as(x) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).to(x.dtype).cuda() + x = x * mask + offset * (1 - mask) + else: + x = x * mask + + return x.view(n,c,h,w) \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/position_embedding.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/position_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..7cb9309104cccf0586010de222e4cc307c3c848b --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/position_embedding.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn +import math + +class RelPositionEmbedding(nn.Module): + def __init__(self, num_pos_feats=64, pos_norm=True): + super().__init__() + self.num_pos_feats = num_pos_feats + self.fc = nn.Linear(4, self.num_pos_feats,bias=False) + #nn.init.orthogonal_(self.fc.weight) + #self.fc.weight.requires_grad = False + self.pos_norm = pos_norm + if self.pos_norm: + self.norm = nn.LayerNorm(self.num_pos_feats) + def forward(self, tensor): + #mask = nesttensor.mask + B,C,H,W = tensor.shape + #print('tensor.shape', tensor.shape) + y_range = (torch.arange(H) / float(H - 1)).to(tensor.device) + #y_axis = torch.stack((y_range, 1-y_range),dim=1) + y_axis = torch.stack((torch.cos(y_range * math.pi), torch.sin(y_range * math.pi)), dim=1) + y_axis = y_axis.reshape(H, 1, 2).repeat(1, W, 1).reshape(H * W, 2) + + x_range = (torch.arange(W) / float(W - 1)).to(tensor.device) + #x_axis =torch.stack((x_range,1-x_range),dim=1) + x_axis = torch.stack((torch.cos(x_range * math.pi), torch.sin(x_range * math.pi)), dim=1) + x_axis = x_axis.reshape(1, W, 2).repeat(H, 1, 1).reshape(H * W, 2) + x_pos = torch.cat((y_axis, x_axis), dim=1) + x_pos = self.fc(x_pos) + + if self.pos_norm: + x_pos = self.norm(x_pos) + #print('xpos,', x_pos.max(),x_pos.min()) + return x_pos + + +class SineEmbedding(nn.Module): + def __init__(self, in_channels, N_freqs, logscale=True): + """ + Defines a function that embeds x to (x, sin(2^k x), cos(2^k x), ...) 
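+        (note: as implemented, only the sin/cos terms are returned; the raw x
+        term is not appended, so out_channels = in_channels * 2 * N_freqs)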
+ in_channels: number of input channels + """ + super(SineEmbedding, self).__init__() + self.N_freqs = N_freqs + self.in_channels = in_channels + self.funcs = [torch.sin, torch.cos] + self.out_channels = in_channels*(len(self.funcs)*N_freqs) + + if logscale: + self.freq_bands = 2**torch.linspace(0, N_freqs-1, N_freqs) + else: + self.freq_bands = torch.linspace(1, 2**(N_freqs-1), N_freqs) + + def forward(self, x): + """ + Embeds x to (sin(2^k x), cos(2^k x), ...) + Inputs: + x: (B, self.in_channels) + Outputs: + out: (B, self.out_channels) + """ + out = [] + for freq in self.freq_bands: + for func in self.funcs: + out += [func(freq*x)] + + return torch.cat(out, -1) + + +# if __name__ == '__main__': +# pe = Embedding(in_channels=2, N_freqs=64) +# x_pe = pe(torch.randn(1, 4, 2)) +# a = 0 \ No newline at end of file diff --git a/GenAD-main/projects/mmdet3d_plugin/models/utils/visual.py b/GenAD-main/projects/mmdet3d_plugin/models/utils/visual.py new file mode 100644 index 0000000000000000000000000000000000000000..f9718afea9e67199c77da8ecf33249a28197082a --- /dev/null +++ b/GenAD-main/projects/mmdet3d_plugin/models/utils/visual.py @@ -0,0 +1,24 @@ +import torch +from torchvision.utils import make_grid +import torchvision +import matplotlib.pyplot as plt +import cv2 + + +def convert_color(img_path): + plt.figure() + img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) + plt.imsave(img_path, img, cmap=plt.get_cmap('viridis')) + plt.close() + + +def save_tensor(tensor, path, pad_value=254.0,): + print('save_tensor', path) + tensor = tensor.to(torch.float).detach().cpu() + if tensor.type() == 'torch.BoolTensor': + tensor = tensor*255 + if len(tensor.shape) == 3: + tensor = tensor.unsqueeze(1) + tensor = make_grid(tensor, pad_value=pad_value, normalize=False).permute(1, 2, 0).numpy().copy() + torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path) + convert_color(path) diff --git a/GenAD-main/requirements.txt b/GenAD-main/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3b8914733d4daf2461a7f2c2a087398894bf539 --- /dev/null +++ b/GenAD-main/requirements.txt @@ -0,0 +1,221 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +_libgcc_mutex=0.1=main +_openmp_mutex=5.1=1_gnu +absl-py=1.4.0=pypi_0 +addict=2.4.0=pypi_0 +aliyun-python-sdk-core=2.14.0=pypi_0 +aliyun-python-sdk-kms=2.16.2=pypi_0 +anyio=4.0.0=pypi_0 +argon2-cffi=23.1.0=pypi_0 +argon2-cffi-bindings=21.2.0=pypi_0 +arrow=1.2.3=pypi_0 +asttokens=2.4.0=pypi_0 +async-lru=2.0.4=pypi_0 +attrs=23.1.0=pypi_0 +babel=2.12.1=pypi_0 +backcall=0.2.0=pypi_0 +beautifulsoup4=4.12.2=pypi_0 +black=23.7.0=pypi_0 +bleach=6.0.0=pypi_0 +ca-certificates=2023.08.22=h06a4308_0 +cachetools=5.3.1=pypi_0 +certifi=2023.7.22=pypi_0 +cffi=1.15.1=pypi_0 +charset-normalizer=3.2.0=pypi_0 +click=8.1.7=pypi_0 +colorama=0.4.6=pypi_0 +comm=0.1.4=pypi_0 +contourpy=1.1.0=pypi_0 +crcmod=1.7=pypi_0 +cryptography=42.0.0=pypi_0 +cycler=0.11.0=pypi_0 +cython=3.0.2=pypi_0 +debugpy=1.7.0=pypi_0 +decorator=5.1.1=pypi_0 +defusedxml=0.7.1=pypi_0 +descartes=1.1.0=pypi_0 +exceptiongroup=1.1.3=pypi_0 +executing=1.2.0=pypi_0 +fastjsonschema=2.18.0=pypi_0 +filelock=3.12.3=pypi_0 +fire=0.5.0=pypi_0 +flake8=6.1.0=pypi_0 +fonttools=4.42.1=pypi_0 +fqdn=1.5.1=pypi_0 +fsspec=2023.9.0=pypi_0 +google-auth=2.22.0=pypi_0 +google-auth-oauthlib=1.0.0=pypi_0 +grpcio=1.58.0=pypi_0 +huggingface-hub=0.16.4=pypi_0 +idna=3.4=pypi_0 +imageio=2.31.3=pypi_0 +importlib-metadata=6.8.0=pypi_0 
+importlib-resources=6.0.1=pypi_0 +iniconfig=2.0.0=pypi_0 +ipykernel=6.25.2=pypi_0 +ipython=8.12.2=pypi_0 +ipython-genutils=0.2.0=pypi_0 +ipywidgets=8.1.0=pypi_0 +isoduration=20.11.0=pypi_0 +jedi=0.19.0=pypi_0 +jinja2=3.1.2=pypi_0 +jmespath=0.10.0=pypi_0 +joblib=1.3.2=pypi_0 +json5=0.9.14=pypi_0 +jsonpointer=2.4=pypi_0 +jsonschema=4.19.0=pypi_0 +jsonschema-specifications=2023.7.1=pypi_0 +jupyter=1.0.0=pypi_0 +jupyter-client=8.3.1=pypi_0 +jupyter-console=6.6.3=pypi_0 +jupyter-core=5.3.1=pypi_0 +jupyter-events=0.7.0=pypi_0 +jupyter-lsp=2.2.0=pypi_0 +jupyter-server=2.7.3=pypi_0 +jupyter-server-terminals=0.4.4=pypi_0 +jupyterlab=4.0.5=pypi_0 +jupyterlab-pygments=0.2.2=pypi_0 +jupyterlab-server=2.24.0=pypi_0 +jupyterlab-widgets=3.0.8=pypi_0 +kiwisolver=1.4.5=pypi_0 +ld_impl_linux-64=2.38=h1181459_1 +libffi=3.4.4=h6a678d5_0 +libgcc-ng=11.2.0=h1234567_1 +libgomp=11.2.0=h1234567_1 +libstdcxx-ng=11.2.0=h1234567_1 +llvmlite=0.31.0=pypi_0 +lyft-dataset-sdk=0.0.8=pypi_0 +markdown=3.4.4=pypi_0 +markdown-it-py=3.0.0=pypi_0 +markupsafe=2.1.3=pypi_0 +matplotlib=3.5.2=pypi_0 +matplotlib-inline=0.1.6=pypi_0 +mccabe=0.7.0=pypi_0 +mdurl=0.1.2=pypi_0 +mistune=3.0.1=pypi_0 +mmcv-full=1.4.0=pypi_0 +mmdet=2.14.0=pypi_0 +mmdet3d=0.17.1=dev_0 +mmsegmentation=0.14.1=pypi_0 +model-index=0.1.11=pypi_0 +mypy-extensions=1.0.0=pypi_0 +nbclient=0.8.0=pypi_0 +nbconvert=7.8.0=pypi_0 +nbformat=5.9.2=pypi_0 +ncurses=6.4=h6a678d5_0 +nest-asyncio=1.5.7=pypi_0 +networkx=2.2=pypi_0 +notebook=7.0.3=pypi_0 +notebook-shim=0.2.3=pypi_0 +numba=0.48.0=pypi_0 +numpy=1.19.5=pypi_0 +nuscenes-devkit=1.1.9=pypi_0 +oauthlib=3.2.2=pypi_0 +opencv-python=4.8.0.76=pypi_0 +opendatalab=0.0.10=pypi_0 +openmim=0.3.9=pypi_0 +openssl=3.0.10=h7f8727e_2 +openxlab=0.0.34=pypi_0 +ordered-set=4.1.0=pypi_0 +oss2=2.17.0=pypi_0 +overrides=7.4.0=pypi_0 +packaging=23.1=pypi_0 +pandas=1.4.4=pypi_0 +pandocfilters=1.5.0=pypi_0 +parso=0.8.3=pypi_0 +pathspec=0.11.2=pypi_0 +pexpect=4.8.0=pypi_0 +pickleshare=0.7.5=pypi_0 +pillow=10.0.0=pypi_0 +pip=22.1=pypi_0 +pkgutil-resolve-name=1.3.10=pypi_0 +platformdirs=3.10.0=pypi_0 +plotly=5.16.1=pypi_0 +pluggy=1.3.0=pypi_0 +plyfile=1.0.1=pypi_0 +prettytable=3.8.0=pypi_0 +prometheus-client=0.17.1=pypi_0 +prompt-toolkit=3.0.39=pypi_0 +protobuf=4.24.3=pypi_0 +psutil=5.9.5=pypi_0 +ptyprocess=0.7.0=pypi_0 +pure-eval=0.2.2=pypi_0 +pyasn1=0.5.0=pypi_0 +pyasn1-modules=0.3.0=pypi_0 +pycocotools=2.0.7=pypi_0 +pycodestyle=2.11.0=pypi_0 +pycparser=2.21=pypi_0 +pycryptodome=3.20.0=pypi_0 +pyflakes=3.1.0=pypi_0 +pygments=2.16.1=pypi_0 +pyparsing=3.0.9=pypi_0 +pyquaternion=0.9.9=pypi_0 +pytest=7.4.2=pypi_0 +python=3.8.17=h955ad1f_0 +python-dateutil=2.8.2=pypi_0 +python-json-logger=2.0.7=pypi_0 +pytz=2023.3.post1=pypi_0 +pywavelets=1.4.1=pypi_0 +pyyaml=6.0.1=pypi_0 +pyzmq=25.1.1=pypi_0 +qtconsole=5.4.4=pypi_0 +qtpy=2.4.0=pypi_0 +readline=8.2=h5eee18b_0 +referencing=0.30.2=pypi_0 +requests=2.28.2=pypi_0 +requests-oauthlib=1.3.1=pypi_0 +rfc3339-validator=0.1.4=pypi_0 +rfc3986-validator=0.1.1=pypi_0 +rich=13.4.2=pypi_0 +rpds-py=0.10.2=pypi_0 +rsa=4.9=pypi_0 +safetensors=0.3.3=pypi_0 +scikit-image=0.19.3=pypi_0 +scikit-learn=1.3.0=pypi_0 +scipy=1.10.1=pypi_0 +send2trash=1.8.2=pypi_0 +setuptools=59.5.0=pypi_0 +shapely=1.8.5=pypi_0 +similaritymeasures=1.0.0=pypi_0 +six=1.16.0=pypi_0 +sniffio=1.3.0=pypi_0 +soupsieve=2.5=pypi_0 +sqlite=3.41.2=h5eee18b_0 +stack-data=0.6.2=pypi_0 +tabulate=0.9.0=pypi_0 +tenacity=8.2.3=pypi_0 +tensorboard=2.14.0=pypi_0 +tensorboard-data-server=0.7.1=pypi_0 +termcolor=2.3.0=pypi_0 +terminado=0.17.1=pypi_0 
+terminaltables=3.1.10=pypi_0 +threadpoolctl=3.2.0=pypi_0 +tifffile=2023.7.10=pypi_0 +timm=0.9.7=pypi_0 +tinycss2=1.2.1=pypi_0 +tk=8.6.12=h1ccaba5_0 +tomli=2.0.1=pypi_0 +torch=1.9.1+cu111=pypi_0 +torchaudio=0.9.1=pypi_0 +torchstat=0.0.7=pypi_0 +torchvision=0.10.1+cu111=pypi_0 +tornado=6.3.3=pypi_0 +tqdm=4.65.2=pypi_0 +traitlets=5.9.0=pypi_0 +trimesh=2.35.39=pypi_0 +typing-extensions=4.7.1=pypi_0 +uri-template=1.3.0=pypi_0 +urllib3=1.26.16=pypi_0 +wcwidth=0.2.6=pypi_0 +webcolors=1.13=pypi_0 +webencodings=0.5.1=pypi_0 +websocket-client=1.6.2=pypi_0 +werkzeug=2.3.7=pypi_0 +wheel=0.38.4=py38h06a4308_0 +widgetsnbextension=4.0.8=pypi_0 +xz=5.4.2=h5eee18b_0 +yapf=0.40.1=pypi_0 +zipp=3.16.2=pypi_0 +zlib=1.2.13=h5eee18b_0 diff --git a/GenAD-main/tools/analysis_tools/__init__.py b/GenAD-main/tools/analysis_tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/GenAD-main/tools/analysis_tools/analyze_logs.py b/GenAD-main/tools/analysis_tools/analyze_logs.py new file mode 100644 index 0000000000000000000000000000000000000000..806175f34c0ce6c535167cc7db8470c69a6e243d --- /dev/null +++ b/GenAD-main/tools/analysis_tools/analyze_logs.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import numpy as np +import seaborn as sns +from collections import defaultdict +from matplotlib import pyplot as plt + + +def cal_train_time(log_dicts, args): + for i, log_dict in enumerate(log_dicts): + print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}') + all_times = [] + for epoch in log_dict.keys(): + if args.include_outliers: + all_times.append(log_dict[epoch]['time']) + else: + all_times.append(log_dict[epoch]['time'][1:]) + all_times = np.array(all_times) + epoch_ave_time = all_times.mean(-1) + slowest_epoch = epoch_ave_time.argmax() + fastest_epoch = epoch_ave_time.argmin() + std_over_epoch = epoch_ave_time.std() + print(f'slowest epoch {slowest_epoch + 1}, ' + f'average time is {epoch_ave_time[slowest_epoch]:.4f}') + print(f'fastest epoch {fastest_epoch + 1}, ' + f'average time is {epoch_ave_time[fastest_epoch]:.4f}') + print(f'time std over epochs is {std_over_epoch:.4f}') + print(f'average iter time: {np.mean(all_times):.4f} s/iter') + print() + + +def plot_curve(log_dicts, args): + if args.backend is not None: + plt.switch_backend(args.backend) + sns.set_style(args.style) + # if legend is None, use {filename}_{key} as legend + legend = args.legend + if legend is None: + legend = [] + for json_log in args.json_logs: + for metric in args.keys: + legend.append(f'{json_log}_{metric}') + assert len(legend) == (len(args.json_logs) * len(args.keys)) + metrics = args.keys + + num_metrics = len(metrics) + for i, log_dict in enumerate(log_dicts): + epochs = list(log_dict.keys()) + for j, metric in enumerate(metrics): + print(f'plot curve of {args.json_logs[i]}, metric is {metric}') + if metric not in log_dict[epochs[args.interval - 1]]: + raise KeyError( + f'{args.json_logs[i]} does not contain metric {metric}') + + if args.mode == 'eval': + if min(epochs) == args.interval: + x0 = args.interval + else: + # if current training is resumed from previous checkpoint + # we lost information in early epochs + # `xs` should start according to `min(epochs)` + if min(epochs) % args.interval == 0: + x0 = min(epochs) + else: + # find the first epoch that do eval + x0 = min(epochs) + args.interval - \ + min(epochs) % args.interval + xs = np.arange(x0, max(epochs) + 1, args.interval) + 
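+                # collect the eval metric logged at each eval epoch; if training
+                # stopped before the last eval, the trailing x tick is dropped below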
ys = [] + for epoch in epochs[args.interval - 1::args.interval]: + ys += log_dict[epoch][metric] + + # if training is aborted before eval of the last epoch + # `xs` and `ys` will have different length and cause an error + # check if `ys[-1]` is empty here + if not log_dict[epoch][metric]: + xs = xs[:-1] + + ax = plt.gca() + ax.set_xticks(xs) + plt.xlabel('epoch') + plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o') + else: + xs = [] + ys = [] + num_iters_per_epoch = \ + log_dict[epochs[args.interval-1]]['iter'][-1] + for epoch in epochs[args.interval - 1::args.interval]: + iters = log_dict[epoch]['iter'] + if log_dict[epoch]['mode'][-1] == 'val': + iters = iters[:-1] + xs.append( + np.array(iters) + (epoch - 1) * num_iters_per_epoch) + ys.append(np.array(log_dict[epoch][metric][:len(iters)])) + xs = np.concatenate(xs) + ys = np.concatenate(ys) + plt.xlabel('iter') + plt.plot( + xs, ys, label=legend[i * num_metrics + j], linewidth=0.5) + plt.legend() + if args.title is not None: + plt.title(args.title) + if args.out is None: + plt.show() + else: + print(f'save curve to: {args.out}') + plt.savefig(args.out) + plt.cla() + + +def add_plot_parser(subparsers): + parser_plt = subparsers.add_parser( + 'plot_curve', help='parser for plotting curves') + parser_plt.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_plt.add_argument( + '--keys', + type=str, + nargs='+', + default=['mAP_0.25'], + help='the metric that you want to plot') + parser_plt.add_argument('--title', type=str, help='title of figure') + parser_plt.add_argument( + '--legend', + type=str, + nargs='+', + default=None, + help='legend of each plot') + parser_plt.add_argument( + '--backend', type=str, default=None, help='backend of plt') + parser_plt.add_argument( + '--style', type=str, default='dark', help='style of plt') + parser_plt.add_argument('--out', type=str, default=None) + parser_plt.add_argument('--mode', type=str, default='train') + parser_plt.add_argument('--interval', type=int, default=1) + + +def add_time_parser(subparsers): + parser_time = subparsers.add_parser( + 'cal_train_time', + help='parser for computing the average time per training iteration') + parser_time.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_time.add_argument( + '--include-outliers', + action='store_true', + help='include the first value of every epoch when computing ' + 'the average time') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Analyze Json Log') + # currently only support plot curve and calculate average train time + subparsers = parser.add_subparsers(dest='task', help='task parser') + add_plot_parser(subparsers) + add_time_parser(subparsers) + args = parser.parse_args() + return args + + +def load_json_logs(json_logs): + # load and convert json_logs to log_dict, key is epoch, value is a sub dict + # keys of sub dict is different metrics, e.g. 
memory, bbox_mAP + # value of sub dict is a list of corresponding values of all iterations + log_dicts = [dict() for _ in json_logs] + for json_log, log_dict in zip(json_logs, log_dicts): + with open(json_log, 'r') as log_file: + for line in log_file: + log = json.loads(line.strip()) + # skip lines without `epoch` field + if 'epoch' not in log: + continue + epoch = log.pop('epoch') + if epoch not in log_dict: + log_dict[epoch] = defaultdict(list) + for k, v in log.items(): + log_dict[epoch][k].append(v) + return log_dicts + + +def main(): + args = parse_args() + + json_logs = args.json_logs + for json_log in json_logs: + assert json_log.endswith('.json') + + log_dicts = load_json_logs(json_logs) + + eval(args.task)(log_dicts, args) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/analysis_tools/benchmark.py b/GenAD-main/tools/analysis_tools/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..487a348935e3c949a8cde2c90a1747db769964c9 --- /dev/null +++ b/GenAD-main/tools/analysis_tools/benchmark.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import time +import torch +from mmcv import Config +from mmcv.parallel import MMDataParallel +from mmcv.runner import load_checkpoint, wrap_fp16_model +import sys +sys.path.append('.') +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from projects.mmdet3d_plugin.datasets import custom_build_dataset +# from mmdet3d.datasets import build_dataloader, build_dataset +from mmdet3d.models import build_detector +#from tools.misc.fuse_conv_bn import fuse_module + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMDet benchmark a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', default=None, help='checkpoint file') + parser.add_argument('--samples', default=2000, help='samples to benchmark') + parser.add_argument( + '--log-interval', default=50, help='interval of logging') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.pretrained = None + cfg.data.test.test_mode = True + + # build the dataloader + # TODO: support multiple images per gpu (only minor changes are needed) + print(cfg.data.test) + dataset = custom_build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=False, + shuffle=False) + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + if args.checkpoint is not None: + load_checkpoint(model, args.checkpoint, map_location='cpu') + #if args.fuse_conv_bn: + # model = fuse_module(model) + + model = MMDataParallel(model, device_ids=[0]) + + model.eval() + + # the first several iterations may be very slow so skip them + num_warmup = 5 + pure_inf_time = 0 + + # benchmark with several samples and take the average + for i, data in enumerate(data_loader): + torch.cuda.synchronize() + start_time = time.perf_counter() + with torch.no_grad(): + model(return_loss=False, rescale=True, 
**data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= num_warmup: + pure_inf_time += elapsed + if (i + 1) % args.log_interval == 0: + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Done image [{i + 1:<3}/ {args.samples}], ' + f'fps: {fps:.1f} img / s') + + if (i + 1) == args.samples: + pure_inf_time += elapsed + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Overall fps: {fps:.1f} img / s') + break + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/analysis_tools/get_flops.py b/GenAD-main/tools/analysis_tools/get_flops.py new file mode 100644 index 0000000000000000000000000000000000000000..1b9fb0163b2f749108d41f11b332d2bda1e71879 --- /dev/null +++ b/GenAD-main/tools/analysis_tools/get_flops.py @@ -0,0 +1,747 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import argparse + +import torch +from mmcv import Config, DictAction + +from mmdet3d.models import build_model +from mmdet3d.datasets import build_dataset +from projects.mmdet3d_plugin.datasets.builder import build_dataloader + +# try: +# from mmcv.cnn import get_model_complexity_info +# except ImportError: +# raise ImportError('Please upgrade mmcv to >0.6.2') + +import sys +sys.path.append('.') + + +from functools import partial + +import numpy as np +import torch +import torch.nn as nn + +import mmcv + + +def get_model_complexity_info(model, + data, + input_shape=(1280, 720), + print_per_layer_stat=True, + as_strings=True, + input_constructor=None, + flush=False, + ost=sys.stdout): + """Get complexity information of a model. + + This method can calculate FLOPs and parameter counts of a model with + corresponding input shape. It can also print complexity information for + each layer in a model. + + Supported layers are listed as below: + - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, + ``nn.ReLU6``. + - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. + - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + - Linear: ``nn.Linear``. + - Deconvolution: ``nn.ConvTranspose2d``. + - Upsample: ``nn.Upsample``. + + Args: + model (nn.Module): The model for complexity calculation. + input_shape (tuple): Input shape used for calculation. + print_per_layer_stat (bool): Whether to print complexity information + for each layer in a model. Default: True. + as_strings (bool): Output FLOPs and params counts in a string form. + Default: True. + input_constructor (None | callable): If specified, it takes a callable + method that generates input. otherwise, it will generate a random + tensor with input shape to calculate FLOPs. Default: None. + flush (bool): same as that in :func:`print`. Default: False. + ost (stream): same as ``file`` param in :func:`print`. + Default: sys.stdout. + + Returns: + tuple[float | str]: If ``as_strings`` is set to True, it will return + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. 
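+        Note: unlike ``mmcv.cnn.get_model_complexity_info``, this local copy
+        takes a prepared ``data`` batch and forwards the detector with its
+        ``img``/``img_metas``; the random tensor built from ``input_shape`` is
+        only used to pick the dtype and device.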
+ """ + + assert isinstance(model, nn.Module) + flops_model = add_flops_counting_methods(model) + flops_model.eval() + flops_model.start_flops_count() + if input_constructor: + input = input_constructor(input_shape) + _ = flops_model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (1, 6, 3, *input_shape), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device) + except StopIteration: + # Avoid StopIteration for models which have no parameters, + # like `nn.Relu()`, `nn.AvgPool2d`, etc. + batch = torch.ones(()).new_empty((1, 6, 3, *input_shape)) + + # img_metas = [data['img_metas'][0].data[0]] + # img = data['img'][0].data[0] + # points = data['points'][0].data[0][0] + # fut_valid_flag = data['fut_valid_flag'][0].data[0] + # img = img.to(batch.device) + # points = [points.to(batch.device)] + # ego_his_trajs = data['ego_his_trajs'][0].data[0].to(batch.device) + # ego_lcf_feat = data['ego_lcf_feat'][0].data[0].to(batch.device).unsqueeze(0) + + # _ = flops_model(rescale=True, img=img, img_metas=img_metas, points=points, + # fut_valid_flag=fut_valid_flag, ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat) + + img_metas = [data['img_metas'][0].data[0]] + img = data['img'][0].data[0] + img = img.to(batch.device) + + _ = flops_model(rescale=True, img=img, img_metas=img_metas) + + flops_count, params_count = flops_model.compute_average_flops_cost() + if print_per_layer_stat: + print_model_with_flops( + flops_model, flops_count, params_count, ost=ost, flush=flush) + flops_model.stop_flops_count() + + if as_strings: + return flops_to_string(flops_count), params_to_string(params_count) + + return flops_count, params_count + + +def flops_to_string(flops, units='GFLOPs', precision=2): + """Convert FLOPs number into a string. + + Note that Here we take a multiply-add counts as one FLOP. + + Args: + flops (float): FLOPs number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', + 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically + choose the most suitable unit for FLOPs. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted FLOPs number with units. + + Examples: + >>> flops_to_string(1e9) + '1.0 GFLOPs' + >>> flops_to_string(2e5, 'MFLOPs') + '0.2 MFLOPs' + >>> flops_to_string(3e-9, None) + '3e-09 FLOPs' + """ + if units is None: + if flops // 10**9 > 0: + return str(round(flops / 10.**9, precision)) + ' GFLOPs' + elif flops // 10**6 > 0: + return str(round(flops / 10.**6, precision)) + ' MFLOPs' + elif flops // 10**3 > 0: + return str(round(flops / 10.**3, precision)) + ' KFLOPs' + else: + return str(flops) + ' FLOPs' + else: + if units == 'GFLOPs': + return str(round(flops / 10.**9, precision)) + ' ' + units + elif units == 'MFLOPs': + return str(round(flops / 10.**6, precision)) + ' ' + units + elif units == 'KFLOPs': + return str(round(flops / 10.**3, precision)) + ' ' + units + else: + return str(flops) + ' FLOPs' + + +def params_to_string(num_params, units=None, precision=2): + """Convert parameter number into a string. + + Args: + num_params (float): Parameter number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'M', + 'K' and ''. If set to None, it will automatically choose the most + suitable unit for Parameter number. Default: None. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted parameter number with units. 
+ + Examples: + >>> params_to_string(1e9) + '1000.0 M' + >>> params_to_string(2e5) + '200.0 k' + >>> params_to_string(3e-9) + '3e-09' + """ + if units is None: + if num_params // 10**6 > 0: + return str(round(num_params / 10**6, precision)) + ' M' + elif num_params // 10**3: + return str(round(num_params / 10**3, precision)) + ' k' + else: + return str(num_params) + else: + if units == 'M': + return str(round(num_params / 10.**6, precision)) + ' ' + units + elif units == 'K': + return str(round(num_params / 10.**3, precision)) + ' ' + units + else: + return str(num_params) + + +def print_model_with_flops(model, + total_flops, + total_params, + units='GFLOPs', + precision=3, + ost=sys.stdout, + flush=False): + """Print a model with FLOPs for each layer. + + Args: + model (nn.Module): The model to be printed. + total_flops (float): Total FLOPs of the model. + total_params (float): Total parameter counts of the model. + units (str | None): Converted FLOPs units. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 3. + ost (stream): same as `file` param in :func:`print`. + Default: sys.stdout. + flush (bool): same as that in :func:`print`. Default: False. + + Example: + >>> class ExampleModel(nn.Module): + + >>> def __init__(self): + >>> super().__init__() + >>> self.conv1 = nn.Conv2d(3, 8, 3) + >>> self.conv2 = nn.Conv2d(8, 256, 3) + >>> self.conv3 = nn.Conv2d(256, 8, 3) + >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + >>> self.flatten = nn.Flatten() + >>> self.fc = nn.Linear(8, 1) + + >>> def forward(self, x): + >>> x = self.conv1(x) + >>> x = self.conv2(x) + >>> x = self.conv3(x) + >>> x = self.avg_pool(x) + >>> x = self.flatten(x) + >>> x = self.fc(x) + >>> return x + + >>> model = ExampleModel() + >>> x = (3, 16, 16) + to print the complexity information state for each layer, you can use + >>> get_model_complexity_info(model, x) + or directly use + >>> print_model_with_flops(model, 4579784.0, 37361) + ExampleModel( + 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, + (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 + (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) + (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) + (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) + (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) + (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) + ) + """ + + def accumulate_params(self): + if is_supported_instance(self): + return self.__params__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_params() + return sum + + def accumulate_flops(self): + if is_supported_instance(self): + return self.__flops__ / model.__batch_counter__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_flops() + return sum + + def flops_repr(self): + accumulated_num_params = self.accumulate_params() + accumulated_flops_cost = self.accumulate_flops() + return ', '.join([ + params_to_string( + accumulated_num_params, units='M', precision=precision), + '{:.3%} Params'.format(accumulated_num_params / total_params), + flops_to_string( + accumulated_flops_cost, units=units, precision=precision), + '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), + self.original_extra_repr() + ]) + + def 
add_extra_repr(m): + m.accumulate_flops = accumulate_flops.__get__(m) + m.accumulate_params = accumulate_params.__get__(m) + flops_extra_repr = flops_repr.__get__(m) + if m.extra_repr != flops_extra_repr: + m.original_extra_repr = m.extra_repr + m.extra_repr = flops_extra_repr + assert m.extra_repr != m.original_extra_repr + + def del_extra_repr(m): + if hasattr(m, 'original_extra_repr'): + m.extra_repr = m.original_extra_repr + del m.original_extra_repr + if hasattr(m, 'accumulate_flops'): + del m.accumulate_flops + + model.apply(add_extra_repr) + print(model, file=ost, flush=flush) + model.apply(del_extra_repr) + + +def get_model_parameters_number(model): + """Calculate parameter number of a model. + + Args: + model (nn.module): The model for parameter number calculation. + + Returns: + float: Parameter number of the model. + """ + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + return num_params + + +def add_flops_counting_methods(net_main_module): + # adding additional methods to the existing module object, + # this is done this way so that each function has access to self object + net_main_module.start_flops_count = start_flops_count.__get__( + net_main_module) + net_main_module.stop_flops_count = stop_flops_count.__get__( + net_main_module) + net_main_module.reset_flops_count = reset_flops_count.__get__( + net_main_module) + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 + net_main_module) + + net_main_module.reset_flops_count() + + return net_main_module + + +def compute_average_flops_cost(self): + """Compute average FLOPs cost. + + A method to compute average FLOPs cost, which will be available after + `add_flops_counting_methods()` is called on a desired net object. + + Returns: + float: Current mean flops consumption per image. + """ + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + params_sum = get_model_parameters_number(self) + return flops_sum / batches_count, params_sum + + +def start_flops_count(self): + """Activate the computation of mean flops consumption per image. + + A method to activate the computation of mean flops consumption per image. + which will be available after ``add_flops_counting_methods()`` is called on + a desired net object. It should be called before running the network. + """ + add_batch_counter_hook_function(self) + + def add_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + return + + else: + handle = module.register_forward_hook( + get_modules_mapping()[type(module)]) + + module.__flops_handle__ = handle + + self.apply(partial(add_flops_counter_hook_function)) + + +def stop_flops_count(self): + """Stop computing the mean flops consumption per image. + + A method to stop computing the mean flops consumption per image, which will + be available after ``add_flops_counting_methods()`` is called on a desired + net object. It can be called to pause the computation whenever. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self): + """Reset statistics computed so far. + + A method to Reset computed statistics, which will be available after + `add_flops_counting_methods()` is called on a desired net object. 
+ """ + add_batch_counter_variables_or_reset(self) + self.apply(add_flops_counter_variable_or_reset) + + +# ---- Internal functions +def empty_flops_counter_hook(module, input, output): + module.__flops__ += 0 + + +def upsample_flops_counter_hook(module, input, output): + output_size = output[0] + batch_size = output_size.shape[0] + output_elements_count = batch_size + for val in output_size.shape[1:]: + output_elements_count *= val + module.__flops__ += int(output_elements_count) + + +def relu_flops_counter_hook(module, input, output): + active_elements_count = output.numel() + module.__flops__ += int(active_elements_count) + + +def linear_flops_counter_hook(module, input, output): + input = input[0] + output_last_dim = output.shape[ + -1] # pytorch checks dimensions, so here we don't care much + module.__flops__ += int(np.prod(input.shape) * output_last_dim) + + +def pool_flops_counter_hook(module, input, output): + input = input[0] + module.__flops__ += int(np.prod(input.shape)) + + +def norm_flops_counter_hook(module, input, output): + input = input[0] + + batch_flops = np.prod(input.shape) + if (getattr(module, 'affine', False) + or getattr(module, 'elementwise_affine', False)): + batch_flops *= 2 + module.__flops__ += int(batch_flops) + + +def deconv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + input_height, input_width = input.shape[2:] + + kernel_height, kernel_width = conv_module.kernel_size + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = ( + kernel_height * kernel_width * in_channels * filters_per_channel) + + active_elements_count = batch_size * input_height * input_width + overall_conv_flops = conv_per_position_flops * active_elements_count + bias_flops = 0 + if conv_module.bias is not None: + output_height, output_width = output.shape[2:] + bias_flops = out_channels * batch_size * output_height * output_height + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def conv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + output_dims = list(output.shape[2:]) + + kernel_dims = list(conv_module.kernel_size) + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = int( + np.prod(kernel_dims)) * in_channels * filters_per_channel + + active_elements_count = batch_size * int(np.prod(output_dims)) + + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + + if conv_module.bias is not None: + + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def batch_counter_hook(module, input, output): + batch_size = 1 + if len(input) > 0: + # Can have multiple inputs, getting the first one + input = input[0] + batch_size = len(input) + else: + pass + print('Warning! 
No positional inputs found for a module, ' + 'assuming batch size is 1.') + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module): + + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module): + if is_supported_instance(module): + if hasattr(module, '__flops__') or hasattr(module, '__params__'): + print('Warning: variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') + module.__flops__ = 0 + module.__params__ = get_model_parameters_number(module) + + +def is_supported_instance(module): + if type(module) in get_modules_mapping(): + return True + return False + + +def remove_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +def get_modules_mapping(): + return { + # convolutions + nn.Conv1d: conv_flops_counter_hook, + nn.Conv2d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook, + nn.Conv3d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook, + # activations + nn.ReLU: relu_flops_counter_hook, + nn.PReLU: relu_flops_counter_hook, + nn.ELU: relu_flops_counter_hook, + nn.LeakyReLU: relu_flops_counter_hook, + nn.ReLU6: relu_flops_counter_hook, + # poolings + nn.MaxPool1d: pool_flops_counter_hook, + nn.AvgPool1d: pool_flops_counter_hook, + nn.AvgPool2d: pool_flops_counter_hook, + nn.MaxPool2d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook, + nn.MaxPool3d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook, + nn.AvgPool3d: pool_flops_counter_hook, + nn.AdaptiveMaxPool1d: pool_flops_counter_hook, + nn.AdaptiveAvgPool1d: pool_flops_counter_hook, + nn.AdaptiveMaxPool2d: pool_flops_counter_hook, + nn.AdaptiveAvgPool2d: pool_flops_counter_hook, + nn.AdaptiveMaxPool3d: pool_flops_counter_hook, + nn.AdaptiveAvgPool3d: pool_flops_counter_hook, + # normalizations + nn.BatchNorm1d: norm_flops_counter_hook, + nn.BatchNorm2d: norm_flops_counter_hook, + nn.BatchNorm3d: norm_flops_counter_hook, + nn.GroupNorm: norm_flops_counter_hook, + nn.InstanceNorm1d: norm_flops_counter_hook, + nn.InstanceNorm2d: norm_flops_counter_hook, + nn.InstanceNorm3d: norm_flops_counter_hook, + nn.LayerNorm: norm_flops_counter_hook, + # FC + nn.Linear: linear_flops_counter_hook, + mmcv.cnn.bricks.Linear: linear_flops_counter_hook, + # Upscale + nn.Upsample: upsample_flops_counter_hook, + # Deconvolution + nn.ConvTranspose2d: deconv_flops_counter_hook, + mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook, + } + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[40000, 4], + help='input point cloud size') + parser.add_argument( + '--modality', + type=str, + default='point', + choices=['point', 'image', 'multi'], + help='input data modality') + parser.add_argument( + '--cfg-options', + 
nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + + args = parse_args() + + if args.modality == 'point': + assert len(args.shape) == 2, 'invalid input shape' + input_shape = tuple(args.shape) + elif args.modality == 'image': + if len(args.shape) == 1: + input_shape = (3, args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = (3, ) + tuple(args.shape) + else: + raise ValueError('invalid input shape') + elif args.modality == 'multi': + raise NotImplementedError( + 'FLOPs counter is currently not supported for models with ' + 'multi-modality input') + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + samples_per_gpu = 1 + from mmdet.datasets import replace_ImageToTensor + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor( + cfg.data.test.pipeline) + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + if samples_per_gpu > 1: + for ds_cfg in cfg.data.test: + ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) + + dataset = build_dataset(cfg.data.test) + dataset.is_vis_on_test = True #TODO, this is a hack + data_loader = build_dataloader( + dataset, + samples_per_gpu=1, + workers_per_gpu=0, + dist=False, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + for i, data in enumerate(data_loader): + # if ~(data['map_gt_labels_3d'].data[0][0] != -1).any(): + # continue + img = data['img'][0].data[0] + img_metas = data['img_metas'][0].data[0] + break + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + if torch.cuda.is_available(): + model.cuda() + model.eval() + + if hasattr(model, 'forward_dummy'): + model.forward = model.forward_dummy + else: + raise NotImplementedError( + 'FLOPs counter is currently not supported for {}'.format( + model.__class__.__name__)) + + flops, params = get_model_complexity_info(model, data) + split_line = '=' * 30 + print(f'{split_line}\nInput shape: {input_shape}\n' + f'Flops: {flops}\nParams: {params}\n{split_line}') + print('!!!Please be cautious if you use the results 
in papers. ' + 'You may need to check if all ops are supported and verify that the ' + 'flops computation is correct.') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/GenAD-main/tools/analysis_tools/get_params.py b/GenAD-main/tools/analysis_tools/get_params.py new file mode 100644 index 0000000000000000000000000000000000000000..6bf4ecf3c7599bfe310fbfd59c2efdfd8d695303 --- /dev/null +++ b/GenAD-main/tools/analysis_tools/get_params.py @@ -0,0 +1,8 @@ +import torch +YOUR_CKPT_PATH = None +file_path = YOUR_CKPT_PATH +model = torch.load(file_path, map_location='cpu') +all = 0 +for key in list(model['state_dict'].keys()): + all += model['state_dict'][key].nelement() +print(all) diff --git a/GenAD-main/tools/analysis_tools/visualization.py b/GenAD-main/tools/analysis_tools/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..cde62913c43eaa5114431614c4f37537f3bff76a --- /dev/null +++ b/GenAD-main/tools/analysis_tools/visualization.py @@ -0,0 +1,939 @@ +import sys +sys.path.append('') +import os +import argparse +import os.path as osp +from PIL import Image +from tqdm import tqdm +from typing import List, Dict + +import cv2 +import mmcv +import torch +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import rcParams +from pyquaternion import Quaternion +from nuscenes.nuscenes import NuScenes +from mmdet.datasets.pipelines import to_tensor +from matplotlib.collections import LineCollection +from nuscenes.utils.data_classes import LidarPointCloud, Box +from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility + +from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox, CustomDetectionBox, color_map +from projects.mmdet3d_plugin.datasets.nuscenes_vad_dataset import VectorizedLocalMap, LiDARInstanceLines +import matplotlib.cm as cm +from matplotlib.colors import LinearSegmentedColormap + + +cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + +def render_annotation( + anntoken: str, + margin: float = 10, + view: np.ndarray = np.eye(4), + box_vis_level: BoxVisibility = BoxVisibility.ANY, + out_path: str = 'render.png', + extra_info: bool = False) -> None: + """ + Render selected annotation. + :param anntoken: Sample_annotation token. + :param margin: How many meters in each direction to include in LIDAR view. + :param view: LIDAR view point. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param out_path: Optional path to save the rendered figure to disk. + :param extra_info: Whether to render extra information below camera view. + """ + ann_record = nusc.get('sample_annotation', anntoken) + sample_record = nusc.get('sample', ann_record['sample_token']) + assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.' + + # Figure out which camera the object is fully visible in (this may return nothing). + boxes, cam = [], [] + cams = [key for key in sample_record['data'].keys() if 'CAM' in key] + all_bboxes = [] + select_cams = [] + for cam in cams: + _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level, + selected_anntokens=[anntoken]) + if len(boxes) > 0: + all_bboxes.append(boxes) + select_cams.append(cam) + # We found an image that matches. 
Let's abort. + # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \ + # 'Try using e.g. BoxVisibility.ANY.' + # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!' + + num_cam = len(all_bboxes) + + fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9)) + select_cams = [sample_record['data'][cam] for cam in select_cams] + print('bbox in cams:', select_cams) + # Plot LIDAR view. + lidar = sample_record['data']['LIDAR_TOP'] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken]) + LidarPointCloud.from_file(data_path).render_height(axes[0], view=view) + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[0], view=view, colors=(c, c, c)) + corners = view_points(boxes[0].corners(), view, False)[:2, :] + axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin]) + axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin]) + axes[0].axis('off') + axes[0].set_aspect('equal') + + # Plot CAMERA view. + for i in range(1, num_cam + 1): + cam = select_cams[i - 1] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken]) + im = Image.open(data_path) + axes[i].imshow(im) + axes[i].set_title(nusc.get('sample_data', cam)['channel']) + axes[i].axis('off') + axes[i].set_aspect('equal') + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Print extra information about the annotation below the camera view. + axes[i].set_xlim(0, im.size[0]) + axes[i].set_ylim(im.size[1], 0) + + if extra_info: + rcParams['font.family'] = 'monospace' + + w, l, h = ann_record['size'] + category = ann_record['category_name'] + lidar_points = ann_record['num_lidar_pts'] + radar_points = ann_record['num_radar_pts'] + + sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) + dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation'])) + + information = ' \n'.join(['category: {}'.format(category), + '', + '# lidar points: {0:>4}'.format(lidar_points), + '# radar points: {0:>4}'.format(radar_points), + '', + 'distance: {:>7.3f}m'.format(dist), + '', + 'width: {:>7.3f}m'.format(w), + 'length: {:>7.3f}m'.format(l), + 'height: {:>7.3f}m'.format(h)]) + + plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top') + + if out_path is not None: + plt.savefig(out_path) + + +def get_sample_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. 
+ :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + if selected_anntokens is not None: + boxes = list(map(nusc.get_box, selected_anntokens)) + else: + boxes = nusc.get_boxes(sample_data_token) + + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def get_predicted_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False, + pred_anns=None + ): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + # if selected_anntokens is not None: + # boxes = list(map(nusc.get_box, selected_anntokens)) + # else: + # boxes = nusc.get_boxes(sample_data_token) + boxes = pred_anns + # Make list of Box objects including coord system transforms. 
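+    # Each predicted box is chained from global coordinates into the ego frame
+    # (via the ego pose record) and then into the sensor frame (via the
+    # calibrated-sensor record); camera boxes outside the image are then skipped.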
+ box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def lidiar_render(sample_token, data, out_path=None, out_name=None, traj_use_perstep_offset=True): + bbox_gt_list = [] + bbox_pred_list = [] + sample_rec = nusc.get('sample', sample_token) + anns = sample_rec['anns'] + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + for ann in anns: + content = nusc.get('sample_annotation', ann) + gt_fut_trajs, gt_fut_masks = get_gt_fut_trajs( + nusc=nusc, anno=content, cs_record=cs_record, + pose_record=pose_record, fut_ts=6 + ) + try: + bbox_gt_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=nusc.box_velocity(content['token'])[:2], + fut_trajs=tuple(gt_fut_trajs), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=category_to_detection_name(content['category_name']), + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name='')) + except: + pass + + bbox_anns = data['results'][sample_token] + for content in bbox_anns: + bbox_pred_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + fut_trajs=tuple(content['fut_traj']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'])) + gt_annotations = EvalBoxes() + pred_annotations = EvalBoxes() + gt_annotations.add_boxes(sample_token, bbox_gt_list) + pred_annotations.add_boxes(sample_token, bbox_pred_list) + # print('green is ground truth') + # print('blue is the predited result') + visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, + savepath=out_path, traj_use_perstep_offset=traj_use_perstep_offset, pred_data=data) + + +def get_color(category_name: str): + """ + Provides the default colors based on the category names. + This method works for the general nuScenes categories, as well as the nuScenes detection categories. 
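+    Unknown category names fall back to black ([0, 0, 0]).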
+    """
+    a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+         'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+         'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+         'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+         'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+         'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+         'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+         'vehicle.ego']
+    class_names = [
+        'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+        'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+    ]
+    #print(category_name)
+    if category_name == 'bicycle':
+        return nusc.colormap['vehicle.bicycle']
+    elif category_name == 'construction_vehicle':
+        return nusc.colormap['vehicle.construction']
+    elif category_name == 'traffic_cone':
+        return nusc.colormap['movable_object.trafficcone']
+
+    for key in nusc.colormap.keys():
+        if category_name in key:
+            return nusc.colormap[key]
+    return [0, 0, 0]
+
+# TODO: whether to rotate traj
+def boxes_to_sensor(boxes: List[EvalBox], pose_record: Dict, cs_record: Dict):
+    """
+    Map boxes from global coordinates to the vehicle's sensor coordinate system.
+    :param boxes: The boxes in global coordinates.
+    :param pose_record: The pose record of the vehicle at the current timestamp.
+    :param cs_record: The calibrated sensor record of the sensor.
+    :return: The transformed boxes.
+    """
+    boxes_out = []
+    for box in boxes:
+        # Create Box instance.
+        box = CustomNuscenesBox(
+            box.translation, box.size, Quaternion(box.rotation), box.fut_trajs, name=box.detection_name
+        )
+        # Move box to ego vehicle coord system.
+        box.translate(-np.array(pose_record['translation']))
+        box.rotate(Quaternion(pose_record['rotation']).inverse)
+        # Move box to sensor coord system.
+        box.translate(-np.array(cs_record['translation']))
+        box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        boxes_out.append(box)
+
+    return boxes_out
+
+
+def get_gt_fut_trajs(nusc: NuScenes,
+                     anno,
+                     cs_record,
+                     pose_record,
+                     fut_ts):
+    """
+    Compute per-step future trajectory offsets (in the lidar frame) and their
+    validity masks for a single annotation by following its 'next' links.
+    :param nusc: NuScenes object.
+    :param anno: Sample_annotation record at the current timestamp.
+    :param cs_record: Calibrated sensor record of the reference LiDAR.
+    :param pose_record: Ego pose record at the current timestamp.
+    :param fut_ts: Number of future timesteps.
+    :return: (gt_fut_trajs, gt_fut_masks) as flat lists.
+    """
+    box = Box(anno['translation'], anno['size'], Quaternion(anno['rotation']))
+    # Move box to ego vehicle coord system.
+    box.translate(-np.array(pose_record['translation']))
+    box.rotate(Quaternion(pose_record['rotation']).inverse)
+    # Move box to sensor coord system.
+    box.translate(-np.array(cs_record['translation']))
+    box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+    # get future trajectory coords for each box
+    gt_fut_trajs = np.zeros((fut_ts, 2)) # [fut_ts*2]
+    gt_fut_masks = np.zeros((fut_ts)) # [fut_ts]
+    gt_fut_trajs[:] = box.center[:2]
+    cur_box = box
+    cur_anno = anno
+    for i in range(fut_ts):
+        if cur_anno['next'] != '':
+            anno_next = nusc.get('sample_annotation', cur_anno['next'])
+            box_next = Box(
+                anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation'])
+            )
+            # Move box to ego vehicle coord system.
+            box_next.translate(-np.array(pose_record['translation']))
+            box_next.rotate(Quaternion(pose_record['rotation']).inverse)
+            # Move box to sensor coord system.
+ box_next.translate(-np.array(cs_record['translation'])) + box_next.rotate(Quaternion(cs_record['rotation']).inverse) + # gt_fut_trajs[i] = box_next.center[:2] + gt_fut_trajs[i] = box_next.center[:2] - cur_box.center[:2] + gt_fut_masks[i] = 1 + cur_anno = anno_next + cur_box = box_next + else: + # gt_fut_trajs[i:] = gt_fut_trajs[i-1] + gt_fut_trajs[i:] = 0 + break + + return gt_fut_trajs.reshape(-1).tolist(), gt_fut_masks.reshape(-1).tolist() + +def get_gt_vec_maps( + sample_token, + data_root='data/nuscenes/', + pc_range=[-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20 +) -> None: + """ + Get gt vec map for a given sample. + """ + sample_rec = nusc.get('sample', sample_token) + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + lidar2ego_translation = cs_record['translation'], + lidar2ego_rotation = cs_record['rotation'], + ego2global_translation = pose_record['translation'], + ego2global_rotation = pose_record['rotation'], + map_location = nusc.get('log', nusc.get('scene', sample_rec['scene_token'])['log_token'])['location'] + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(cs_record['rotation']).rotation_matrix + lidar2ego[:3, 3] = cs_record['translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(pose_record['rotation']).rotation_matrix + ego2global[:3, 3] = pose_record['translation'] + lidar2global = ego2global @ lidar2ego + lidar2global_translation = list(lidar2global[:3,3]) + lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + patch_size = (patch_h, patch_w) + + vector_map = VectorizedLocalMap(data_root, patch_size=patch_size, + map_classes=map_classes, + fixed_ptsnum_per_line=map_fixed_ptsnum_per_line, + padding_value=padding_value) + + + anns_results = vector_map.gen_vectorized_samples( + map_location, lidar2global_translation, lidar2global_rotation + ) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except: + gt_vecs_pts_loc = gt_vecs_pts_loc + + return gt_vecs_pts_loc, gt_vecs_label + + +def visualize_sample(nusc: NuScenes, + sample_token: str, + gt_boxes: EvalBoxes, + pred_boxes: EvalBoxes, + nsweeps: int = 1, + conf_th: float = 0.4, + pc_range: list = [-30.0, -30.0, -4.0, 30.0, 30.0, 4.0], + verbose: bool = True, + savepath: str = None, + traj_use_perstep_offset: bool = True, + data_root='data/nuscenes/', + map_pc_range: list = [-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20, + gt_format=['fixed_num_pts'], + colors_plt = ['red', 'green', 'blue'], #['cornflowerblue', 'royalblue', 'slategrey'], + pred_data = None) -> None: + """ + Visualizes a sample from BEV with annotations and detection results. 
+    :param nusc: NuScenes object.
+    :param sample_token: The nuScenes sample token.
+    :param gt_boxes: Ground truth boxes grouped by sample.
+    :param pred_boxes: Prediction grouped by sample.
+    :param nsweeps: Number of sweeps used for lidar visualization.
+    :param conf_th: The confidence threshold used to filter negatives.
+    :param pc_range: Point-cloud range in meters beyond which boxes are ignored.
+    :param verbose: Whether to print to stdout.
+    :param savepath: If given, saves the rendering here instead of displaying.
+    """
+    # Retrieve sensor & pose records.
+    sample_rec = nusc.get('sample', sample_token)
+    sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+    cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+    # Get boxes.
+    boxes_gt_global = gt_boxes[sample_token]
+    boxes_est_global = pred_boxes[sample_token]
+    # Map GT boxes to lidar.
+    boxes_gt = boxes_to_sensor(boxes_gt_global, pose_record, cs_record)
+    # Map EST boxes to lidar.
+    boxes_est = boxes_to_sensor(boxes_est_global, pose_record, cs_record)
+    # Add scores to EST boxes.
+    for box_est, box_est_global in zip(boxes_est, boxes_est_global):
+        box_est.score = box_est_global.detection_score
+
+    # Init axes.
+    fig, axes = plt.subplots(1, 1, figsize=(4, 4))
+    plt.xlim(xmin=-30, xmax=30)
+    plt.ylim(ymin=-30, ymax=30)
+
+    # Show Pred Map
+
+    result_dic = pred_data['map_results'][sample_token]['vectors']
+
+    for vector in result_dic:
+        if vector['confidence_level'] < 0.6:
+            continue
+        pred_pts_3d = vector['pts']
+        pred_label_3d = vector['type']
+        pts_x = np.array([pt[0] for pt in pred_pts_3d])
+        pts_y = np.array([pt[1] for pt in pred_pts_3d])
+
+        axes.plot(pts_x, pts_y, color=colors_plt[pred_label_3d],linewidth=2,alpha=0.8,zorder=-1)
+        axes.scatter(pts_x, pts_y, color=colors_plt[pred_label_3d],s=1,alpha=0.8,zorder=-1)
+
+
+    # ignore_list = ['barrier', 'motorcycle', 'bicycle', 'traffic_cone']
+    ignore_list = ['barrier', 'bicycle', 'traffic_cone']
+
+    # Show Pred boxes.
+    color_list = ['salmon', 'darkcyan', 'orange', 'red', 'lightcoral', 'deepskyblue', 'gold', 'seagreen', 'deeppink',
+                  'dodgerblue', 'royalblue', 'yellow', 'violet', 'peru', 'palegreen', 'slateblue']
+    # color_list = ['Blues', 'PiYG']
+
+    for i, box in enumerate(boxes_est):
+        if box.name in ignore_list:
+            continue
+        # Show only predictions with a high score.
+        assert not np.isnan(box.score), 'Error: Box score cannot be NaN!'
+        if box.name in ['pedestrian']:
+            continue
+        if box.score < conf_th or abs(box.center[0]) > 15 or abs(box.center[1]) > 30:
+            continue
+
+        # colors = color_map(, cmap)
+        if i < 16:
+            color_box = color_list[i]
+        else:
+            color_box = color_list[-1]
+        # box.render(axes, view=np.eye(4), colors=('darkcyan', 'darkcyan', 'darkcyan'), linewidth=3, box_idx=None)
+
+        box.render(axes, view=np.eye(4), colors=(color_box, color_box, color_box), linewidth=3, box_idx=None)
+
+        if traj_use_perstep_offset:
+            # mode_idx = [0, 1, 2, 3, 4, 5]
+            mode_idx = [0]
+            # box.render_fut_trajs_grad_color(axes, linewidth=4, mode_idx=mode_idx, fut_ts=6, cmap='autumn')
+            box.render_fut_trajs_grad_color(axes, linewidth=6, mode_idx=mode_idx, fut_ts=3, cmap="autumn")
+            #cmap = LinearSegmentedColormap.from_list("mycmap", color_box)
+
+            if box.name in ['pedestrian']:
+                continue
+
+        else:
+            box.render_fut_trajs_coords(axes, color='tomato', linewidth=1)
+
+    # Show Planning.
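+    # The mediumseagreen segments below sketch the ego-vehicle footprint (about
+    # 1.8 m x 4 m in BEV) plus a short heading tick pointing forward. plan_cmd picks
+    # the driving command with the highest predicted score; its per-step offsets are
+    # accumulated (cumsum) into absolute waypoints, the origin is prepended, consecutive
+    # waypoints are densified into short segments, and the result is drawn as a
+    # color-graded polyline using the 'summer' colormap.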
+    axes.plot([-0.9, -0.9], [-2, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([-0.9, 0.9], [2, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.9, 0.9], [2, -2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.9, -0.9], [-2, -2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.0, 0.0], [0.0, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    plan_cmd = np.argmax(pred_data['plan_results'][sample_token][1][0,0,0])
+    plan_traj = pred_data['plan_results'][sample_token][0][plan_cmd]
+    plan_traj[abs(plan_traj) < 0.01] = 0.0
+    plan_traj = plan_traj.cumsum(axis=0)
+    plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0)
+    plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1)
+
+    plan_vecs = None
+    for i in range(plan_traj.shape[0]):
+        plan_vec_i = plan_traj[i]
+        x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51)
+        y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51)
+        xy = np.stack((x_linspace, y_linspace), axis=1)
+        xy = np.stack((xy[:-1], xy[1:]), axis=1)
+        if plan_vecs is None:
+            plan_vecs = xy
+        else:
+            plan_vecs = np.concatenate((plan_vecs, xy), axis=0)
+
+    cmap = 'summer'
+    y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301))
+    colors = color_map(y[:-1], cmap)
+    line_segments = LineCollection(plan_vecs, colors=colors, linewidths=6, linestyles='solid', cmap=cmap)
+    axes.add_collection(line_segments)
+
+
+    axes.axes.xaxis.set_ticks([])
+    axes.axes.yaxis.set_ticks([])
+    axes.axis('off')
+    fig.set_tight_layout(True)
+    fig.canvas.draw()
+    plt.savefig(savepath+'/bev_pred.png', bbox_inches='tight', dpi=200)
+    plt.close()
+
+
+def obtain_sensor2top(nusc,
+                      sensor_token,
+                      l2e_t,
+                      l2e_r_mat,
+                      e2g_t,
+                      e2g_r_mat,
+                      sensor_type='lidar'):
+    """Obtain the rotation and translation (RT) from a general sensor to the Top LiDAR.
+
+    Args:
+        nusc (class): Dataset class in the nuScenes dataset.
+        sensor_token (str): Sample data token corresponding to the
+            specific sensor type.
+        l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+        l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+            in shape (3, 3).
+        e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+        e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+            in shape (3, 3).
+        sensor_type (str): Sensor to calibrate. Default: 'lidar'.
+
+    Returns:
+        tuple: (sensor2lidar_rotation, sensor2lidar_translation) mapping points
+            from the given sensor frame to the Top LiDAR frame.
+ """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sensor2lidar_rotation = R.T # points @ R.T + T + sensor2lidar_translation = T + + return sensor2lidar_rotation, sensor2lidar_translation + +def render_sample_data( + sample_toekn: str, + with_anns: bool = True, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + axes_limit: float = 40, + ax=None, + nsweeps: int = 1, + out_path: str = None, + out_name: str = None, + underlay_map: bool = True, + use_flat_vehicle_coordinates: bool = True, + show_lidarseg: bool = False, + show_lidarseg_legend: bool = False, + filter_lidarseg_labels=None, + lidarseg_preds_bin_path: str = None, + verbose: bool = True, + show_panoptic: bool = False, + pred_data=None, + traj_use_perstep_offset: bool = True + ) -> None: + """ + Render sample data onto axis. + :param sample_data_token: Sample_data token. + :param with_anns: Whether to draw box annotations. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param axes_limit: Axes limit for lidar and radar (measured in meters). + :param ax: Axes onto which to render. + :param nsweeps: Number of sweeps for lidar and radar. + :param out_path: Optional path to save the rendered figure to disk. + :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which + can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new + setting is more correct and rotates the plot by ~90 degrees. + :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame. + :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. 
If None + or the list is empty, all classes will be displayed. + :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation + predictions for the sample. + :param verbose: Whether to display the image after it is rendered. + :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + If show_lidarseg is True, show_panoptic will be set to False. + """ + lidiar_render(sample_toekn, pred_data, out_path=out_path, + out_name=out_name, traj_use_perstep_offset=traj_use_perstep_offset) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize VAD predictions') + parser.add_argument('--result-path', help='inference result file path') + parser.add_argument('--save-path', help='the dir to save visualization results') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + inference_result_path = args.result_path + out_path = args.save_path + bevformer_results = mmcv.load(inference_result_path) + sample_token_list = list(bevformer_results['results'].keys()) + + nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) + + imgs = [] + fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') + video_path = osp.join(out_path, 'tiny.mp4') + video = cv2.VideoWriter(video_path, fourcc, 10, (2933, 800), True) + for id in tqdm(range(len(sample_token_list))): + # for id in tqdm(range(25)): + #3025 1140 + # id = id + 3025 + mmcv.mkdir_or_exist(out_path) + render_sample_data(sample_token_list[id], + pred_data=bevformer_results, + out_path=out_path) + pred_path = osp.join(out_path, 'bev_pred.png') + pred_img = cv2.imread(pred_path) + os.remove(pred_path) + + sample_token = sample_token_list[id] + sample = nusc.get('sample', sample_token) + # sample = data['results'][sample_token_list[0]][0] + cams = [ + 'CAM_FRONT_LEFT', + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_LEFT', + 'CAM_BACK', + 'CAM_BACK_RIGHT', + ] + + cam_imgs = [] + for cam in cams: + sample_data_token = sample['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + sensor_modality = sd_record['sensor_modality'] + if sensor_modality in ['lidar', 'radar']: + assert False + elif sensor_modality == 'camera': + boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']), + name=record['detection_name'], token='predicted') for record in + bevformer_results['results'][sample_token]] + data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token, + box_vis_level=BoxVisibility.ANY, + pred_anns=boxes) + _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=BoxVisibility.ANY) + + data = Image.open(data_path) + + # Show image. 
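+                # For CAM_FRONT the planned trajectory (given as per-step offsets in the
+                # lidar frame) is also drawn: the offsets are accumulated, lifted to
+                # homogeneous points at z = -1 (near the ground), mapped through the
+                # lidar-to-image matrix (padded intrinsics @ lidar-to-camera extrinsics),
+                # and divided by the depth (clamped to 1e-5) to obtain pixel coordinates.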
+ _, ax = plt.subplots(1, 1, figsize=(6, 12)) + ax.imshow(data) + + if cam == 'CAM_FRONT': + lidar_sd_record = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + lidar_cs_record = nusc.get('calibrated_sensor', lidar_sd_record['calibrated_sensor_token']) + lidar_pose_record = nusc.get('ego_pose', lidar_sd_record['ego_pose_token']) + + # get plan traj [x,y,z,w] quaternion, w=1 + # we set z=-1 to get points near the ground in lidar coord system + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + plan_traj = bevformer_results['plan_results'][sample_token][0][plan_cmd] + plan_traj[abs(plan_traj) < 0.01] = 0.0 + plan_traj = plan_traj.cumsum(axis=0) + + plan_traj = np.concatenate(( + plan_traj[:, [0]], + plan_traj[:, [1]], + -1.0*np.ones((plan_traj.shape[0], 1)), + np.ones((plan_traj.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj[0, 0] = 0.3 + plan_traj[0, 2] = -1.0 + plan_traj[0, 3] = 1.0 + + l2e_r = lidar_cs_record['rotation'] + l2e_t = lidar_cs_record['translation'] + e2g_r = lidar_pose_record['rotation'] + e2g_t = lidar_pose_record['translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + s2l_r, s2l_t = obtain_sensor2top(nusc, sample_data_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(s2l_r) + lidar2cam_t = s2l_t @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + viewpad = np.eye(4) + viewpad[:camera_intrinsic.shape[0], :camera_intrinsic.shape[1]] = camera_intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + plan_traj = lidar2img_rt @ plan_traj.T + plan_traj = plan_traj[0:2, ...] / np.maximum( + plan_traj[2:3, ...], np.ones_like(plan_traj[2:3, ...]) * 1e-5) + plan_traj = plan_traj.T + plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1) + + plan_vecs = None + for i in range(plan_traj.shape[0]): + plan_vec_i = plan_traj[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs is None: + plan_vecs = xy + else: + plan_vecs = np.concatenate((plan_vecs, xy), axis=0) + + cmap = 'summer' + y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(plan_vecs, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments) + + ax.set_xlim(0, data.size[0]) + ax.set_ylim(data.size[1], 0) + ax.axis('off') + if out_path is not None: + savepath = osp.join(out_path, f'{cam}_PRED') + plt.savefig(savepath, bbox_inches='tight', dpi=200, pad_inches=0.0) + plt.close() + + # Load boxes and image. 
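+                # The per-camera rendering saved above is re-read with OpenCV and the
+                # camera name is stamped on it with cv2.putText; the six annotated views
+                # are later tiled 3x2, resized, and concatenated with the BEV panel
+                # before being written to the video.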
+ data_path = osp.join(out_path, f'{cam}_PRED.png') + cam_img = cv2.imread(data_path) + lw = 6 + tf = max(lw - 3, 1) + w, h = cv2.getTextSize(cam, 0, fontScale=lw / 6, thickness=tf)[0] # text width, height + # color=(0, 0, 0) + txt_color=(255, 255, 255) + cv2.putText(cam_img, + cam, (10, h + 10), + 0, + lw / 6, + txt_color, + thickness=tf, + lineType=cv2.LINE_AA) + cam_imgs.append(cam_img) + else: + raise ValueError("Error: Unknown sensor modality!") + + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + cmd_list = ['Turn Right', 'Turn Left', 'Go Straight'] + plan_cmd_str = cmd_list[plan_cmd] + pred_img = cv2.copyMakeBorder(pred_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0) + # font + font = cv2.FONT_HERSHEY_SIMPLEX + # fontScale + fontScale = 1 + # Line thickness of 2 px + thickness = 3 + # org + org = (20, 40) + # Blue color in BGR + color = (0, 0, 0) + # Using cv2.putText() method + # pred_img = cv2.putText(pred_img, 'BEV', org, font, + # fontScale, color, thickness, cv2.LINE_AA) + # pred_img = cv2.putText(pred_img, plan_cmd_str, (20, 770), font, + # fontScale, color, thickness, cv2.LINE_AA) + + sample_img = pred_img + cam_img_top = cv2.hconcat([cam_imgs[0], cam_imgs[1], cam_imgs[2]]) + cam_img_down = cv2.hconcat([cam_imgs[3], cam_imgs[4], cam_imgs[5]]) + cam_img = cv2.vconcat([cam_img_top, cam_img_down]) + size = (2133, 800) + cam_img = cv2.resize(cam_img, size) + vis_img = cv2.hconcat([cam_img, sample_img]) + + video.write(vis_img) + + video.release() + cv2.destroyAllWindows() diff --git a/GenAD-main/tools/analysis_tools/visualization_div.py b/GenAD-main/tools/analysis_tools/visualization_div.py new file mode 100644 index 0000000000000000000000000000000000000000..b7e2fd43f448a6742d4acb6abac64c804918d637 --- /dev/null +++ b/GenAD-main/tools/analysis_tools/visualization_div.py @@ -0,0 +1,1124 @@ +import sys +sys.path.append('') +import os +import argparse +import os.path as osp +from PIL import Image +from tqdm import tqdm +from typing import List, Dict +import random + +import cv2 +import mmcv +import torch +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import rcParams +from pyquaternion import Quaternion +from nuscenes.nuscenes import NuScenes +from mmdet.datasets.pipelines import to_tensor +from matplotlib.collections import LineCollection +from nuscenes.utils.data_classes import LidarPointCloud, Box +from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility + +from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox, CustomDetectionBox, color_map +from projects.mmdet3d_plugin.datasets.nuscenes_vad_dataset import VectorizedLocalMap, LiDARInstanceLines +import matplotlib.cm as cm +from matplotlib.colors import LinearSegmentedColormap + + +cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + +def render_annotation( + anntoken: str, + margin: float = 10, + view: np.ndarray = np.eye(4), + box_vis_level: BoxVisibility = BoxVisibility.ANY, + out_path: str = 'render.png', + extra_info: bool = False) -> None: + """ + Render selected annotation. + :param anntoken: Sample_annotation token. + :param margin: How many meters in each direction to include in LIDAR view. + :param view: LIDAR view point. 
+ :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param out_path: Optional path to save the rendered figure to disk. + :param extra_info: Whether to render extra information below camera view. + """ + ann_record = nusc.get('sample_annotation', anntoken) + sample_record = nusc.get('sample', ann_record['sample_token']) + assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.' + + # Figure out which camera the object is fully visible in (this may return nothing). + boxes, cam = [], [] + cams = [key for key in sample_record['data'].keys() if 'CAM' in key] + all_bboxes = [] + select_cams = [] + for cam in cams: + _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level, + selected_anntokens=[anntoken]) + if len(boxes) > 0: + all_bboxes.append(boxes) + select_cams.append(cam) + # We found an image that matches. Let's abort. + # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \ + # 'Try using e.g. BoxVisibility.ANY.' + # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!' + + num_cam = len(all_bboxes) + + fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9)) + select_cams = [sample_record['data'][cam] for cam in select_cams] + print('bbox in cams:', select_cams) + # Plot LIDAR view. + lidar = sample_record['data']['LIDAR_TOP'] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken]) + LidarPointCloud.from_file(data_path).render_height(axes[0], view=view) + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[0], view=view, colors=(c, c, c)) + corners = view_points(boxes[0].corners(), view, False)[:2, :] + axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin]) + axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin]) + axes[0].axis('off') + axes[0].set_aspect('equal') + + # Plot CAMERA view. + for i in range(1, num_cam + 1): + cam = select_cams[i - 1] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken]) + im = Image.open(data_path) + axes[i].imshow(im) + axes[i].set_title(nusc.get('sample_data', cam)['channel']) + axes[i].axis('off') + axes[i].set_aspect('equal') + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Print extra information about the annotation below the camera view. 
+ axes[i].set_xlim(0, im.size[0]) + axes[i].set_ylim(im.size[1], 0) + + if extra_info: + rcParams['font.family'] = 'monospace' + + w, l, h = ann_record['size'] + category = ann_record['category_name'] + lidar_points = ann_record['num_lidar_pts'] + radar_points = ann_record['num_radar_pts'] + + sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) + dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation'])) + + information = ' \n'.join(['category: {}'.format(category), + '', + '# lidar points: {0:>4}'.format(lidar_points), + '# radar points: {0:>4}'.format(radar_points), + '', + 'distance: {:>7.3f}m'.format(dist), + '', + 'width: {:>7.3f}m'.format(w), + 'length: {:>7.3f}m'.format(l), + 'height: {:>7.3f}m'.format(h)]) + + plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top') + + if out_path is not None: + plt.savefig(out_path) + + +def get_sample_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + if selected_anntokens is not None: + boxes = list(map(nusc.get_box, selected_anntokens)) + else: + boxes = nusc.get_boxes(sample_data_token) + + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
+ box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def get_predicted_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False, + pred_anns=None + ): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + # if selected_anntokens is not None: + # boxes = list(map(nusc.get_box, selected_anntokens)) + # else: + # boxes = nusc.get_boxes(sample_data_token) + boxes = pred_anns + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
+ box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def lidiar_render(sample_token, data, out_path=None, out_name=None, traj_use_perstep_offset=True): + bbox_gt_list = [] + bbox_pred_list = [] + sample_rec = nusc.get('sample', sample_token) + anns = sample_rec['anns'] + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + for ann in anns: + content = nusc.get('sample_annotation', ann) + gt_fut_trajs, gt_fut_masks = get_gt_fut_trajs( + nusc=nusc, anno=content, cs_record=cs_record, + pose_record=pose_record, fut_ts=6 + ) + try: + bbox_gt_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=nusc.box_velocity(content['token'])[:2], + fut_trajs=tuple(gt_fut_trajs), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=category_to_detection_name(content['category_name']), + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name='')) + except: + pass + + bbox_anns = data['results'][sample_token] + for content in bbox_anns: + bbox_pred_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + fut_trajs=tuple(content['fut_traj']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'])) + gt_annotations = EvalBoxes() + pred_annotations = EvalBoxes() + gt_annotations.add_boxes(sample_token, bbox_gt_list) + pred_annotations.add_boxes(sample_token, bbox_pred_list) + # print('green is ground truth') + # print('blue is the predited result') + visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, + savepath=out_path, traj_use_perstep_offset=traj_use_perstep_offset, pred_data=data) + + +def get_color(category_name: str): + """ + Provides the default colors based on the category names. + This method works for the general nuScenes categories, as well as the nuScenes detection categories. 
+    """
+    a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+         'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+         'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+         'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+         'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+         'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+         'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+         'vehicle.ego']
+    class_names = [
+        'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+        'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+    ]
+    #print(category_name)
+    if category_name == 'bicycle':
+        return nusc.colormap['vehicle.bicycle']
+    elif category_name == 'construction_vehicle':
+        return nusc.colormap['vehicle.construction']
+    elif category_name == 'traffic_cone':
+        return nusc.colormap['movable_object.trafficcone']
+
+    for key in nusc.colormap.keys():
+        if category_name in key:
+            return nusc.colormap[key]
+    return [0, 0, 0]
+
+# TODO: whether to rotate traj
+def boxes_to_sensor(boxes: List[EvalBox], pose_record: Dict, cs_record: Dict):
+    """
+    Map boxes from global coordinates to the vehicle's sensor coordinate system.
+    :param boxes: The boxes in global coordinates.
+    :param pose_record: The pose record of the vehicle at the current timestamp.
+    :param cs_record: The calibrated sensor record of the sensor.
+    :return: The transformed boxes.
+    """
+    boxes_out = []
+    for box in boxes:
+        # Create Box instance.
+        box = CustomNuscenesBox(
+            box.translation, box.size, Quaternion(box.rotation), box.fut_trajs, name=box.detection_name
+        )
+        # Move box to ego vehicle coord system.
+        box.translate(-np.array(pose_record['translation']))
+        box.rotate(Quaternion(pose_record['rotation']).inverse)
+        # Move box to sensor coord system.
+        box.translate(-np.array(cs_record['translation']))
+        box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        boxes_out.append(box)
+
+    return boxes_out
+
+
+def get_gt_fut_trajs(nusc: NuScenes,
+                     anno,
+                     cs_record,
+                     pose_record,
+                     fut_ts):
+    """
+    Compute per-step future trajectory offsets (in the lidar frame) and their
+    validity masks for a single annotation by following its 'next' links.
+    :param nusc: NuScenes object.
+    :param anno: Sample_annotation record at the current timestamp.
+    :param cs_record: Calibrated sensor record of the reference LiDAR.
+    :param pose_record: Ego pose record at the current timestamp.
+    :param fut_ts: Number of future timesteps.
+    :return: (gt_fut_trajs, gt_fut_masks) as flat lists.
+    """
+    box = Box(anno['translation'], anno['size'], Quaternion(anno['rotation']))
+    # Move box to ego vehicle coord system.
+    box.translate(-np.array(pose_record['translation']))
+    box.rotate(Quaternion(pose_record['rotation']).inverse)
+    # Move box to sensor coord system.
+    box.translate(-np.array(cs_record['translation']))
+    box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+    # get future trajectory coords for each box
+    gt_fut_trajs = np.zeros((fut_ts, 2)) # [fut_ts*2]
+    gt_fut_masks = np.zeros((fut_ts)) # [fut_ts]
+    gt_fut_trajs[:] = box.center[:2]
+    cur_box = box
+    cur_anno = anno
+    for i in range(fut_ts):
+        if cur_anno['next'] != '':
+            anno_next = nusc.get('sample_annotation', cur_anno['next'])
+            box_next = Box(
+                anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation'])
+            )
+            # Move box to ego vehicle coord system.
+            box_next.translate(-np.array(pose_record['translation']))
+            box_next.rotate(Quaternion(pose_record['rotation']).inverse)
+            # Move box to sensor coord system.
+ box_next.translate(-np.array(cs_record['translation'])) + box_next.rotate(Quaternion(cs_record['rotation']).inverse) + # gt_fut_trajs[i] = box_next.center[:2] + gt_fut_trajs[i] = box_next.center[:2] - cur_box.center[:2] + gt_fut_masks[i] = 1 + cur_anno = anno_next + cur_box = box_next + else: + # gt_fut_trajs[i:] = gt_fut_trajs[i-1] + gt_fut_trajs[i:] = 0 + break + + return gt_fut_trajs.reshape(-1).tolist(), gt_fut_masks.reshape(-1).tolist() + +def get_gt_vec_maps( + sample_token, + data_root='data/nuscenes/', + pc_range=[-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20 +) -> None: + """ + Get gt vec map for a given sample. + """ + sample_rec = nusc.get('sample', sample_token) + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + lidar2ego_translation = cs_record['translation'], + lidar2ego_rotation = cs_record['rotation'], + ego2global_translation = pose_record['translation'], + ego2global_rotation = pose_record['rotation'], + map_location = nusc.get('log', nusc.get('scene', sample_rec['scene_token'])['log_token'])['location'] + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(cs_record['rotation']).rotation_matrix + lidar2ego[:3, 3] = cs_record['translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(pose_record['rotation']).rotation_matrix + ego2global[:3, 3] = pose_record['translation'] + lidar2global = ego2global @ lidar2ego + lidar2global_translation = list(lidar2global[:3,3]) + lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + patch_size = (patch_h, patch_w) + + vector_map = VectorizedLocalMap(data_root, patch_size=patch_size, + map_classes=map_classes, + fixed_ptsnum_per_line=map_fixed_ptsnum_per_line, + padding_value=padding_value) + + + anns_results = vector_map.gen_vectorized_samples( + map_location, lidar2global_translation, lidar2global_rotation + ) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except: + gt_vecs_pts_loc = gt_vecs_pts_loc + + return gt_vecs_pts_loc, gt_vecs_label + + +def visualize_sample(nusc: NuScenes, + sample_token: str, + gt_boxes: EvalBoxes, + pred_boxes: EvalBoxes, + nsweeps: int = 1, + conf_th: float = 0.4, + pc_range: list = [-30.0, -30.0, -4.0, 30.0, 30.0, 4.0], + verbose: bool = True, + savepath: str = None, + traj_use_perstep_offset: bool = True, + data_root='data/nuscenes/', + map_pc_range: list = [-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20, + gt_format=['fixed_num_pts'], + colors_plt = ['red', 'green', 'blue'], #['cornflowerblue', 'royalblue', 'slategrey'], + pred_data = None) -> None: + """ + Visualizes a sample from BEV with annotations and detection results. 
+    :param nusc: NuScenes object.
+    :param sample_token: The nuScenes sample token.
+    :param gt_boxes: Ground truth boxes grouped by sample.
+    :param pred_boxes: Prediction grouped by sample.
+    :param nsweeps: Number of sweeps used for lidar visualization.
+    :param conf_th: The confidence threshold used to filter negatives.
+    :param pc_range: Point-cloud range in meters beyond which boxes are ignored.
+    :param verbose: Whether to print to stdout.
+    :param savepath: If given, saves the rendering here instead of displaying.
+    """
+    # Retrieve sensor & pose records.
+    sample_rec = nusc.get('sample', sample_token)
+    sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP'])
+    cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token'])
+    pose_record = nusc.get('ego_pose', sd_record['ego_pose_token'])
+    # Get boxes.
+    boxes_gt_global = gt_boxes[sample_token]
+    boxes_est_global = pred_boxes[sample_token]
+    # Map GT boxes to lidar.
+    boxes_gt = boxes_to_sensor(boxes_gt_global, pose_record, cs_record)
+    # Map EST boxes to lidar.
+    boxes_est = boxes_to_sensor(boxes_est_global, pose_record, cs_record)
+    # Add scores to EST boxes.
+    for box_est, box_est_global in zip(boxes_est, boxes_est_global):
+        box_est.score = box_est_global.detection_score
+
+    # Init axes.
+    fig, axes = plt.subplots(1, 1, figsize=(4, 4))
+    plt.xlim(xmin=-30, xmax=30)
+    plt.ylim(ymin=-30, ymax=30)
+
+    # Show Pred Map
+
+    result_dic = pred_data['map_results'][sample_token]['vectors']
+
+    for vector in result_dic:
+        if vector['confidence_level'] < 0.6:
+            continue
+        pred_pts_3d = vector['pts']
+        pred_label_3d = vector['type']
+        pts_x = np.array([pt[0] for pt in pred_pts_3d])
+        pts_y = np.array([pt[1] for pt in pred_pts_3d])
+
+        axes.plot(pts_x, pts_y, color=colors_plt[pred_label_3d],linewidth=2,alpha=0.8,zorder=-1)
+        axes.scatter(pts_x, pts_y, color=colors_plt[pred_label_3d],s=1,alpha=0.8,zorder=-1)
+
+
+    # ignore_list = ['barrier', 'motorcycle', 'bicycle', 'traffic_cone']
+    ignore_list = ['barrier', 'bicycle', 'traffic_cone']
+
+    # Show Pred boxes.
+    color_list = ['salmon', 'darkcyan', 'orange', 'red', 'lightcoral', 'deepskyblue', 'gold', 'seagreen', 'deeppink',
+                  'dodgerblue', 'royalblue', 'yellow', 'violet', 'peru', 'palegreen', 'slateblue']
+    # color_list = ['Blues', 'PiYG']
+
+    for i, box in enumerate(boxes_est):
+        if box.name in ignore_list:
+            continue
+        # Show only predictions with a high score.
+        assert not np.isnan(box.score), 'Error: Box score cannot be NaN!'
+        if box.name in ['pedestrian']:
+            continue
+        if box.score < conf_th or abs(box.center[0]) > 15 or abs(box.center[1]) > 30:
+            continue
+
+        # colors = color_map(, cmap)
+        if i < 16:
+            color_box = color_list[i]
+        else:
+            color_box = color_list[-1]
+        # box.render(axes, view=np.eye(4), colors=('darkcyan', 'darkcyan', 'darkcyan'), linewidth=3, box_idx=None)
+
+        box.render(axes, view=np.eye(4), colors=(color_box, color_box, color_box), linewidth=3, box_idx=None)
+
+        if traj_use_perstep_offset:
+            # mode_idx = [0, 1, 2, 3, 4, 5]
+            mode_idx = [0]
+            # box.render_fut_trajs_grad_color(axes, linewidth=4, mode_idx=mode_idx, fut_ts=6, cmap='autumn')
+            box.render_fut_trajs_grad_color(axes, linewidth=6, mode_idx=mode_idx, fut_ts=3, cmap="autumn")
+            #cmap = LinearSegmentedColormap.from_list("mycmap", color_box)
+
+            if box.name in ['pedestrian']:
+                continue
+
+        else:
+            box.render_fut_trajs_coords(axes, color='tomato', linewidth=1)
+
+    # Show Planning.
+    axes.plot([-0.9, -0.9], [-2, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([-0.9, 0.9], [2, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.9, 0.9], [2, -2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.9, -0.9], [-2, -2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    axes.plot([0.0, 0.0], [0.0, 2], color='mediumseagreen', linewidth=3, alpha=0.8)
+    plan_cmd = np.argmax(pred_data['plan_results'][sample_token][1][0,0,0])
+    plan_traj = pred_data['plan_results'][sample_token][0][plan_cmd]
+    plan_traj[abs(plan_traj) < 0.01] = 0.0
+    plan_traj = plan_traj.cumsum(axis=0)
+    plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0)
+    plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1)
+
+    plan_vecs = None
+    for i in range(plan_traj.shape[0]):
+        plan_vec_i = plan_traj[i]
+        x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51)
+        y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51)
+        xy = np.stack((x_linspace, y_linspace), axis=1)
+        xy = np.stack((xy[:-1], xy[1:]), axis=1)
+        if plan_vecs is None:
+            plan_vecs = xy
+        else:
+            plan_vecs = np.concatenate((plan_vecs, xy), axis=0)
+
+    cmap = 'summer'
+    y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301))
+    colors = color_map(y[:-1], cmap)
+    line_segments = LineCollection(plan_vecs, colors=colors, linewidths=6, linestyles='solid', cmap=cmap)
+    axes.add_collection(line_segments)
+
+
+    axes.axes.xaxis.set_ticks([])
+    axes.axes.yaxis.set_ticks([])
+    axes.axis('off')
+    fig.set_tight_layout(True)
+    fig.canvas.draw()
+    plt.savefig(savepath+'/bev_pred.png', bbox_inches='tight', dpi=200)
+    plt.close()
+
+
+def obtain_sensor2top(nusc,
+                      sensor_token,
+                      l2e_t,
+                      l2e_r_mat,
+                      e2g_t,
+                      e2g_r_mat,
+                      sensor_type='lidar'):
+    """Obtain the rotation and translation (RT) from a general sensor to the Top LiDAR.
+
+    Args:
+        nusc (class): Dataset class in the nuScenes dataset.
+        sensor_token (str): Sample data token corresponding to the
+            specific sensor type.
+        l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
+        l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
+            in shape (3, 3).
+        e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
+        e2g_r_mat (np.ndarray): Rotation matrix from ego to global
+            in shape (3, 3).
+        sensor_type (str): Sensor to calibrate. Default: 'lidar'.
+
+    Returns:
+        tuple: (sensor2lidar_rotation, sensor2lidar_translation) mapping points
+            from the given sensor frame to the Top LiDAR frame.
+ """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sensor2lidar_rotation = R.T # points @ R.T + T + sensor2lidar_translation = T + + return sensor2lidar_rotation, sensor2lidar_translation + +def render_sample_data( + sample_toekn: str, + with_anns: bool = True, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + axes_limit: float = 40, + ax=None, + nsweeps: int = 1, + out_path: str = None, + out_name: str = None, + underlay_map: bool = True, + use_flat_vehicle_coordinates: bool = True, + show_lidarseg: bool = False, + show_lidarseg_legend: bool = False, + filter_lidarseg_labels=None, + lidarseg_preds_bin_path: str = None, + verbose: bool = True, + show_panoptic: bool = False, + pred_data=None, + traj_use_perstep_offset: bool = True + ) -> None: + """ + Render sample data onto axis. + :param sample_data_token: Sample_data token. + :param with_anns: Whether to draw box annotations. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param axes_limit: Axes limit for lidar and radar (measured in meters). + :param ax: Axes onto which to render. + :param nsweeps: Number of sweeps for lidar and radar. + :param out_path: Optional path to save the rendered figure to disk. + :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which + can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new + setting is more correct and rotates the plot by ~90 degrees. + :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame. + :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. 
If None + or the list is empty, all classes will be displayed. + :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation + predictions for the sample. + :param verbose: Whether to display the image after it is rendered. + :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + If show_lidarseg is True, show_panoptic will be set to False. + """ + lidiar_render(sample_toekn, pred_data, out_path=out_path, + out_name=out_name, traj_use_perstep_offset=traj_use_perstep_offset) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize VAD predictions') + parser.add_argument('--result-path', help='inference result file path') + parser.add_argument('--save-path', help='the dir to save visualization results') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + inference_result_path_0 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/00/pts_bbox/results_nusc.pkl' + inference_result_path_1 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e/Wed_Nov_15_10_59_19_2023/pts_bbox/results_nusc.pkl' + inference_result_path_2 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_base_e2e_vae/Wed_Nov_15_14_18_16_2023/pts_bbox/results_nusc.pkl' + inference_result_path_3 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/00/pts_bbox/results_nusc.pkl' + inference_result_path_4 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/00/pts_bbox/results_nusc.pkl' + # inference_result_path_0 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/00/pts_bbox/results_nusc.pkl' + # inference_result_path_1 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/01/pts_bbox/results_nusc.pkl' + # inference_result_path_2 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/02/pts_bbox/results_nusc.pkl' + # inference_result_path_3 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/03/pts_bbox/results_nusc.pkl' + # inference_result_path_4 = '/home/ubuntu/phd/unity/vad/VAD/test/VAD_tiny_e2e_generator/04/pts_bbox/results_nusc.pkl' + + out_path = args.save_path + bevformer_results = mmcv.load(inference_result_path_0) + bevformer_results_1 = mmcv.load(inference_result_path_1) + bevformer_results_2 = mmcv.load(inference_result_path_2) + bevformer_results_3 = mmcv.load(inference_result_path_3) + bevformer_results_4 = mmcv.load(inference_result_path_4) + sample_token_list = list(bevformer_results['results'].keys()) + + nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) + + imgs = [] + fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') + video_path = osp.join(out_path, 'tiny.mp4') + video = cv2.VideoWriter(video_path, fourcc, 10, (2933, 800), True) + # for id in tqdm(range(len(sample_token_list))): + for id in tqdm(range(200)): + # 3025 1140 + id = id + 3000 + mmcv.mkdir_or_exist(out_path) + render_sample_data(sample_token_list[id], + pred_data=bevformer_results, + out_path=out_path) + pred_path = osp.join(out_path, 'bev_pred.png') + pred_img = cv2.imread(pred_path) + os.remove(pred_path) + + sample_token = sample_token_list[id] + sample = nusc.get('sample', sample_token) + # sample = data['results'][sample_token_list[0]][0] + cams = [ + 'CAM_FRONT_LEFT', + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_LEFT', + 'CAM_BACK', + 'CAM_BACK_RIGHT', + ] + + cam_imgs = [] + for cam in cams: + 
sample_data_token = sample['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + sensor_modality = sd_record['sensor_modality'] + if sensor_modality in ['lidar', 'radar']: + assert False + elif sensor_modality == 'camera': + boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']), + name=record['detection_name'], token='predicted') for record in + bevformer_results['results'][sample_token]] + data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token, + box_vis_level=BoxVisibility.ANY, + pred_anns=boxes) + _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=BoxVisibility.ANY) + + data = Image.open(data_path) + + # Show image. + _, ax = plt.subplots(1, 1, figsize=(6, 12)) + ax.imshow(data) + + if cam == 'CAM_FRONT': + lidar_sd_record = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + lidar_cs_record = nusc.get('calibrated_sensor', lidar_sd_record['calibrated_sensor_token']) + lidar_pose_record = nusc.get('ego_pose', lidar_sd_record['ego_pose_token']) + + # get plan traj [x,y,z,w] quaternion, w=1 + # we set z=-1 to get points near the ground in lidar coord system + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + plan_traj = bevformer_results['plan_results'][sample_token][0][plan_cmd] + + ###### + plan_cmd_1 = np.argmax(bevformer_results_1['plan_results'][sample_token][1][0, 0, 0]) + plan_traj_1 = bevformer_results_1['plan_results'][sample_token][0][plan_cmd_1] + plan_cmd_2 = np.argmax(bevformer_results_2['plan_results'][sample_token][1][0, 0, 0]) + plan_traj_2 = bevformer_results_2['plan_results'][sample_token][0][plan_cmd_2] + plan_cmd_3 = np.argmax(bevformer_results_3['plan_results'][sample_token][1][0, 0, 0]) + plan_traj_3 = bevformer_results_3['plan_results'][sample_token][0][plan_cmd_3] + plan_cmd_4 = np.argmax(bevformer_results_4['plan_results'][sample_token][1][0, 0, 0]) + plan_traj_4 = bevformer_results_4['plan_results'][sample_token][0][plan_cmd_4] + + plan_traj[abs(plan_traj) < 0.01] = 0.0 + plan_traj = plan_traj.cumsum(axis=0) + + plan_traj = np.concatenate(( + plan_traj[:, [0]], + plan_traj[:, [1]], + -1.0*np.ones((plan_traj.shape[0], 1)), + np.ones((plan_traj.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj[0, 0] = 0.3 + plan_traj[0, 2] = -1.0 + plan_traj[0, 3] = 1.0 + + + ############### 1 + plan_traj_1[abs(plan_traj_1) < 0.01] = 0.0 + plan_traj_1 = plan_traj_1.cumsum(axis=0) + + plan_traj_1 = np.concatenate(( + plan_traj_1[:, [0]], + plan_traj_1[:, [1]], + -1.0*np.ones((plan_traj_1.shape[0], 1)), + np.ones((plan_traj_1.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj_1 = np.concatenate((np.zeros((1, plan_traj_1.shape[1])), plan_traj_1), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj_1[0, 0] = 0.3 + plan_traj_1[0, 2] = -1.0 + plan_traj_1[0, 3] = 1.0 + + ############### 2 + plan_traj_2[abs(plan_traj_2) < 0.01] = 0.0 + plan_traj_2 = plan_traj_2.cumsum(axis=0) + + plan_traj_2 = np.concatenate(( + plan_traj_2[:, [0]], + plan_traj_2[:, [1]], + -1.0 * np.ones((plan_traj_2.shape[0], 1)), + np.ones((plan_traj_2.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj_2 = np.concatenate((np.zeros((1, plan_traj_2.shape[1])), plan_traj_2), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj_2[0, 0] = 0.3 
+ plan_traj_2[0, 2] = -1.0 + plan_traj_2[0, 3] = 1.0 + + ############### 3 + plan_traj_3[abs(plan_traj_3) < 0.01] = 0.0 + plan_traj_3 = plan_traj_3.cumsum(axis=0) + + plan_traj_3 = np.concatenate(( + plan_traj_3[:, [0]], + plan_traj_3[:, [1]], + -1.0*np.ones((plan_traj_3.shape[0], 1)), + np.ones((plan_traj_3.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj_3 = np.concatenate((np.zeros((1, plan_traj_3.shape[1])), plan_traj_3), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj_3[0, 0] = 0.3 + plan_traj_3[0, 2] = -1.0 + plan_traj_3[0, 3] = 1.0 + + ############### 4 + plan_traj_4[abs(plan_traj_4) < 0.01] = 0.0 + plan_traj_4 = plan_traj_4.cumsum(axis=0) + + plan_traj_4 = np.concatenate(( + plan_traj_4[:, [0]], + plan_traj_4[:, [1]], + -1.0*np.ones((plan_traj_4.shape[0], 1)), + np.ones((plan_traj_4.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj_4 = np.concatenate((np.zeros((1, plan_traj_4.shape[1])), plan_traj_4), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj_4[0, 0] = 0.3 + plan_traj_4[0, 2] = -1.0 + plan_traj_4[0, 3] = 1.0 + + l2e_r = lidar_cs_record['rotation'] + l2e_t = lidar_cs_record['translation'] + e2g_r = lidar_pose_record['rotation'] + e2g_t = lidar_pose_record['translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + s2l_r, s2l_t = obtain_sensor2top(nusc, sample_data_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(s2l_r) + lidar2cam_t = s2l_t @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + viewpad = np.eye(4) + viewpad[:camera_intrinsic.shape[0], :camera_intrinsic.shape[1]] = camera_intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + plan_traj = lidar2img_rt @ plan_traj.T + plan_traj = plan_traj[0:2, ...] / np.maximum( + plan_traj[2:3, ...], np.ones_like(plan_traj[2:3, ...]) * 1e-5) + plan_traj = plan_traj.T + plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1) + + plan_vecs = None + for i in range(plan_traj.shape[0]): + plan_vec_i = plan_traj[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs is None: + plan_vecs = xy + else: + plan_vecs = np.concatenate((plan_vecs, xy), axis=0) + + ##############1 + plan_traj_1 = lidar2img_rt @ plan_traj_1.T + plan_traj_1 = plan_traj_1[0:2, ...] / np.maximum( + plan_traj_1[2:3, ...], np.ones_like(plan_traj_1[2:3, ...]) * 1e-5) + plan_traj_1 = plan_traj_1.T + plan_traj_1 = np.stack((plan_traj_1[:-1], plan_traj_1[1:]), axis=1) + + plan_vecs_1 = None + for i in range(plan_traj_1.shape[0]): + plan_vec_i = plan_traj_1[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs_1 is None: + plan_vecs_1 = xy + else: + plan_vecs_1 = np.concatenate((plan_vecs_1, xy), axis=0) + + ##############2 + plan_traj_2 = lidar2img_rt @ plan_traj_2.T + plan_traj_2 = plan_traj_2[0:2, ...] 
/ np.maximum( + plan_traj_2[2:3, ...], np.ones_like(plan_traj_2[2:3, ...]) * 1e-5) + plan_traj_2 = plan_traj_2.T + plan_traj_2 = np.stack((plan_traj_2[:-1], plan_traj_2[1:]), axis=1) + + plan_vecs_2 = None + for i in range(plan_traj_2.shape[0]): + plan_vec_i = plan_traj_2[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs_2 is None: + plan_vecs_2 = xy + else: + plan_vecs_2 = np.concatenate((plan_vecs_2, xy), axis=0) + + ##############3 + plan_traj_3 = lidar2img_rt @ plan_traj_3.T + plan_traj_3 = plan_traj_3[0:2, ...] / np.maximum( + plan_traj_3[2:3, ...], np.ones_like(plan_traj_3[2:3, ...]) * 1e-5) + plan_traj_3 = plan_traj_3.T + plan_traj_3 = np.stack((plan_traj_3[:-1], plan_traj_3[1:]), axis=1) + + plan_vecs_3 = None + for i in range(plan_traj_3.shape[0]): + plan_vec_i = plan_traj_3[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs_3 is None: + plan_vecs_3 = xy + else: + plan_vecs_3 = np.concatenate((plan_vecs_3, xy), axis=0) + + ##############4 + plan_traj_4 = lidar2img_rt @ plan_traj_4.T + plan_traj_4 = plan_traj_4[0:2, ...] / np.maximum( + plan_traj_4[2:3, ...], np.ones_like(plan_traj_4[2:3, ...]) * 1e-5) + plan_traj_4 = plan_traj_4.T + plan_traj_4 = np.stack((plan_traj_4[:-1], plan_traj_4[1:]), axis=1) + + plan_vecs_4 = None + for i in range(plan_traj_4.shape[0]): + plan_vec_i = plan_traj_4[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs_4 is None: + plan_vecs_4 = xy + else: + plan_vecs_4 = np.concatenate((plan_vecs_4, xy), axis=0) + + cmap = 'summer' + cmap_1 = 'autumn' + cmap_2 = 'winter' + cmap_3 = 'spring' + + + y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(plan_vecs, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments) + + line_segments_1 = LineCollection(plan_vecs_1, colors=colors, linewidths=2, linestyles='solid', cmap=cmap_1) + ax.add_collection(line_segments_1) + line_segments_2 = LineCollection(plan_vecs_2, colors=colors, linewidths=2, linestyles='solid', cmap=cmap_2) + ax.add_collection(line_segments_2) + line_segments_3 = LineCollection(plan_vecs_3, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments_3) + line_segments_4 = LineCollection(plan_vecs_4, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments_4) + + ax.set_xlim(0, data.size[0]) + ax.set_ylim(data.size[1], 0) + ax.axis('off') + if out_path is not None: + savepath = osp.join(out_path, f'{cam}_PRED') + plt.savefig(savepath, bbox_inches='tight', dpi=200, pad_inches=0.0) + plt.close() + + # Load boxes and image. 
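
The five near-identical blocks above and below all follow one recipe: per-step planning offsets are accumulated into absolute waypoints, lifted to homogeneous coordinates near the ground plane (z = -1 in the lidar frame), pushed through the 4x4 lidar-to-image matrix, and the resulting pixel coordinates are densified into short segments for LineCollection. A minimal sketch of the offset-to-pixel part of that recipe, shown for illustration only (the function and argument names are not part of this diff):

import numpy as np

def project_plan_to_image(step_offsets, lidar2img, z_ground=-1.0, eps=1e-5):
    """Turn per-step (dx, dy) planning offsets into pixel-space waypoints."""
    waypoints = np.cumsum(np.asarray(step_offsets, dtype=float), axis=0)  # absolute (T, 2)
    ones = np.ones((waypoints.shape[0], 1))
    pts = np.concatenate([waypoints, z_ground * ones, ones], axis=1)      # homogeneous (T, 4)
    # prepend a start point just ahead of the ego origin, as the code above does
    pts = np.concatenate([np.array([[0.3, 0.0, z_ground, 1.0]]), pts], axis=0)
    cam = lidar2img @ pts.T                                               # (4, T + 1)
    uv = cam[:2] / np.maximum(cam[2:3], eps)                              # clamp depth at eps
    return uv.T                                                           # (T + 1, 2) pixel coords

# e.g. uv = project_plan_to_image(plan_offsets, lidar2img_rt)
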
+ data_path = osp.join(out_path, f'{cam}_PRED.png') + cam_img = cv2.imread(data_path) + lw = 6 + tf = max(lw - 3, 1) + w, h = cv2.getTextSize(cam, 0, fontScale=lw / 6, thickness=tf)[0] # text width, height + # color=(0, 0, 0) + txt_color=(255, 255, 255) + cv2.putText(cam_img, + cam, (10, h + 10), + 0, + lw / 6, + txt_color, + thickness=tf, + lineType=cv2.LINE_AA) + cam_imgs.append(cam_img) + else: + raise ValueError("Error: Unknown sensor modality!") + + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + cmd_list = ['Turn Right', 'Turn Left', 'Go Straight'] + plan_cmd_str = cmd_list[plan_cmd] + pred_img = cv2.copyMakeBorder(pred_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0) + # font + font = cv2.FONT_HERSHEY_SIMPLEX + # fontScale + fontScale = 1 + # Line thickness of 2 px + thickness = 3 + # org + org = (20, 40) + # Blue color in BGR + color = (0, 0, 0) + # Using cv2.putText() method + # pred_img = cv2.putText(pred_img, 'BEV', org, font, + # fontScale, color, thickness, cv2.LINE_AA) + # pred_img = cv2.putText(pred_img, plan_cmd_str, (20, 770), font, + # fontScale, color, thickness, cv2.LINE_AA) + + sample_img = pred_img + cam_img_top = cv2.hconcat([cam_imgs[0], cam_imgs[1], cam_imgs[2]]) + cam_img_down = cv2.hconcat([cam_imgs[3], cam_imgs[4], cam_imgs[5]]) + cam_img = cv2.vconcat([cam_img_top, cam_img_down]) + size = (2133, 800) + cam_img = cv2.resize(cam_img, size) + vis_img = cv2.hconcat([cam_img, sample_img]) + + video.write(vis_img) + + video.release() + cv2.destroyAllWindows() diff --git a/GenAD-main/tools/create_data.py b/GenAD-main/tools/create_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f2b0cc10f1fafa77a39cd8fbd9c1ac9386d2af72 --- /dev/null +++ b/GenAD-main/tools/create_data.py @@ -0,0 +1,305 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +from data_converter.create_gt_database import create_groundtruth_database +from data_converter import nuscenes_converter as nuscenes_converter +from data_converter import lyft_converter as lyft_converter +from data_converter import kitti_converter as kitti +from data_converter import indoor_converter as indoor +import argparse +from os import path as osp +import sys +sys.path.append('.') + + +def kitti_data_prep(root_path, info_prefix, version, out_dir): + """Prepare data related to Kitti dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + out_dir (str): Output directory of the groundtruth database info. 
+ """ + kitti.create_kitti_info_file(root_path, info_prefix) + kitti.create_reduced_point_cloud(root_path, info_prefix) + + info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') + info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') + info_trainval_path = osp.join(root_path, + f'{info_prefix}_infos_trainval.pkl') + info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') + kitti.export_2d_annotation(root_path, info_train_path) + kitti.export_2d_annotation(root_path, info_val_path) + kitti.export_2d_annotation(root_path, info_trainval_path) + kitti.export_2d_annotation(root_path, info_test_path) + + create_groundtruth_database( + 'KittiDataset', + root_path, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + mask_anno_path='instances_train.json', + with_mask=(version == 'mask')) + + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. Default: 10 + """ + nuscenes_converter.create_nuscenes_infos( + root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + if version == 'v1.0-test': + info_test_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_test.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_test_path, version=version) + else: + info_train_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_train.pkl') + info_val_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_val.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_train_path, version=version) + nuscenes_converter.export_2d_annotation( + root_path, info_val_path, version=version) + # create_groundtruth_database(dataset_name, root_path, info_prefix, + # f'{out_dir}/{info_prefix}_infos_train.pkl') + + +def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10): + """Prepare data related to Lyft dataset. + + Related data consists of '.pkl' files recording basic infos. + Although the ground truth database and 2D annotations are not used in + Lyft, it can also be generated like nuScenes. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + max_sweeps (int, optional): Number of input consecutive frames. + Defaults to 10. + """ + lyft_converter.create_lyft_infos( + root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + +def scannet_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for scannet dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def s3dis_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for s3dis dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. 
+ out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for sunrgbd dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def waymo_data_prep(root_path, + info_prefix, + version, + out_dir, + workers, + max_sweeps=5): + """Prepare the info file for waymo dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + max_sweeps (int): Number of input consecutive frames. Default: 5 \ + Here we store pose information of these frames for later use. + """ + from tools.data_converter import waymo_converter as waymo + + splits = ['training', 'validation', 'testing'] + + for i, split in enumerate(splits): + load_dir = osp.join(root_path, 'waymo_format', split) + if split == 'validation': + save_dir = osp.join(out_dir, 'kitti_format', 'training') + else: + save_dir = osp.join(out_dir, 'kitti_format', split) + converter = waymo.Waymo2KITTI( + load_dir, + save_dir, + prefix=str(i), + workers=workers, + test_mode=(split == 'test')) + converter.convert() + # Generate waymo infos + out_dir = osp.join(out_dir, 'kitti_format') + kitti.create_waymo_info_file(out_dir, info_prefix, max_sweeps=max_sweeps) + + create_groundtruth_database( + 'WaymoDataset', + out_dir, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + with_mask=False) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-path', + type=str, + default='./data/kitti', + help='specify the root path of dataset') +parser.add_argument( + '--canbus', + type=str, + default='./data', + help='specify the root path of nuScenes canbus') +parser.add_argument( + '--version', + type=str, + default='v1.0', + required=False, + help='specify the dataset version, no need for kitti') +parser.add_argument( + '--max-sweeps', + type=int, + default=10, + required=False, + help='specify sweeps of lidar per example') +parser.add_argument( + '--out-dir', + type=str, + default='./data/kitti', + required='False', + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='kitti') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + if args.dataset == 'kitti': + kitti_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir) + elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini': + train_version = f'{args.version}-trainval' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + 
info_prefix=args.extra_tag, + version=test_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': + train_version = f'{args.version}' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'lyft': + train_version = f'{args.version}-train' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=train_version, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=test_version, + max_sweeps=args.max_sweeps) + elif args.dataset == 'waymo': + waymo_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir, + workers=args.workers, + max_sweeps=args.max_sweeps) + elif args.dataset == 'scannet': + scannet_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 's3dis': + s3dis_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 'sunrgbd': + sunrgbd_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) diff --git a/GenAD-main/tools/data_converter/__init__.py b/GenAD-main/tools/data_converter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/GenAD-main/tools/data_converter/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/GenAD-main/tools/data_converter/create_gt_database.py b/GenAD-main/tools/data_converter/create_gt_database.py new file mode 100644 index 0000000000000000000000000000000000000000..7317cedd08377643018b7d4a72f7b5c96397b59c --- /dev/null +++ b/GenAD-main/tools/data_converter/create_gt_database.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
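
Before the ground-truth-database helper that follows: the create_data.py entry point above calls parser.parse_args() at module level, so it is meant to be launched from the command line rather than imported. The nuScenes conversion it wraps can also be driven directly from Python; a minimal sketch, assuming the working directory is GenAD-main with tools/ added to sys.path and the default ./data/nuscenes layout. The 'vad_nuscenes' prefix is only a placeholder:

import sys
sys.path.append('tools')  # make the data_converter package importable (assumed checkout layout)
from data_converter import vad_nuscenes_converter as converter

converter.create_nuscenes_infos(
    root_path='./data/nuscenes',    # nuScenes dataset root (assumed layout)
    out_path='./data/nuscenes',     # where *_infos_temporal_{train,val}.pkl are written
    can_bus_root_path='./data',     # directory containing the CAN bus expansion
    info_prefix='vad_nuscenes',     # placeholder prefix for the generated info files
    version='v1.0-trainval',
    max_sweeps=10,
)
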
+import mmcv +import numpy as np +import pickle +from mmcv import track_iter_progress +from mmcv.ops import roi_align +from os import path as osp +from pycocotools import mask as maskUtils +from pycocotools.coco import COCO + +from mmdet3d.core.bbox import box_np_ops as box_np_ops +from mmdet3d.datasets import build_dataset +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps + + +def _poly2mask(mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + +def _parse_coco_ann_info(ann_info): + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + if ann['area'] <= 0: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_masks_ann.append(ann['segmentation']) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict( + bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann) + + return ann + + +def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks): + import torch + from torch.nn.modules.utils import _pair + device = pos_proposals.device + num_pos = pos_proposals.size(0) + fake_inds = ( + torch.arange(num_pos, + device=device).to(dtype=pos_proposals.dtype)[:, None]) + rois = torch.cat([fake_inds, pos_proposals], dim=1) # Nx5 + mask_size = _pair(28) + rois = rois.to(device=device) + gt_masks_th = ( + torch.from_numpy(gt_masks).to(device).index_select( + 0, pos_assigned_gt_inds).to(dtype=rois.dtype)) + # Use RoIAlign could apparently accelerate the training (~0.1s/iter) + targets = ( + roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1)) + return targets + + +def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img): + num_pos = pos_proposals.shape[0] + masks = [] + img_patches = [] + for i in range(num_pos): + gt_mask = gt_masks[pos_assigned_gt_inds[i]] + bbox = pos_proposals[i, :].astype(np.int32) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1 + 1, 1) + h = np.maximum(y2 - y1 + 1, 1) + + mask_patch = gt_mask[y1:y1 + h, x1:x1 + w] + masked_img = gt_mask[..., None] * org_img + img_patch = masked_img[y1:y1 + h, x1:x1 + w] + + img_patches.append(img_patch) + masks.append(mask_patch) + return img_patches, masks + + +def create_groundtruth_database(dataset_class_name, + data_path, + info_prefix, + info_path=None, + mask_anno_path=None, + used_classes=None, + database_save_path=None, + db_info_save_path=None, + relative_path=True, + add_rgb=False, + lidar_only=False, + bev_only=False, + coors_range=None, + with_mask=False): + """Given the raw data, generate the ground truth database. + + Args: + dataset_class_name (str): Name of the input dataset. + data_path (str): Path of the data. 
+ info_prefix (str): Prefix of the info file. + info_path (str): Path of the info file. + Default: None. + mask_anno_path (str): Path of the mask_anno. + Default: None. + used_classes (list[str]): Classes have been used. + Default: None. + database_save_path (str): Path to save database. + Default: None. + db_info_save_path (str): Path to save db_info. + Default: None. + relative_path (bool): Whether to use relative path. + Default: True. + with_mask (bool): Whether to use mask. + Default: False. + """ + print(f'Create GT Database of {dataset_class_name}') + dataset_cfg = dict( + type=dataset_class_name, data_root=data_path, ann_file=info_path) + if dataset_class_name == 'KittiDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=with_mask, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + elif dataset_class_name == 'NuScenesDataset': + dataset_cfg.update( + use_valid_flag=True, + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True) + ]) + + elif dataset_class_name == 'WaymoDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + dataset = build_dataset(dataset_cfg) + + if database_save_path is None: + database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') + if db_info_save_path is None: + db_info_save_path = osp.join(data_path, + f'{info_prefix}_dbinfos_train.pkl') + mmcv.mkdir_or_exist(database_save_path) + all_db_infos = dict() + if with_mask: + coco = COCO(osp.join(data_path, mask_anno_path)) + imgIds = coco.getImgIds() + file2id = dict() + for i in imgIds: + info = coco.loadImgs([i])[0] + file2id.update({info['file_name']: i}) + + group_counter = 0 + for j in track_iter_progress(list(range(len(dataset)))): + input_dict = dataset.get_data_info(j) + dataset.pre_pipeline(input_dict) + example = dataset.pipeline(input_dict) + annos = example['ann_info'] + image_idx = example['sample_idx'] + points = example['points'].tensor.numpy() + gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() + names = annos['gt_names'] + group_dict = dict() + if 'group_ids' in annos: + group_ids = annos['group_ids'] + else: + group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) + difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) + if 'difficulty' in annos: + difficulty = annos['difficulty'] + + num_obj = gt_boxes_3d.shape[0] + point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) + + if with_mask: + # prepare masks + gt_boxes = annos['gt_bboxes'] + img_path = osp.split(example['img_info']['filename'])[-1] + if img_path not in 
file2id.keys(): + print(f'skip image {img_path} for empty mask') + continue + img_id = file2id[img_path] + kins_annIds = coco.getAnnIds(imgIds=img_id) + kins_raw_info = coco.loadAnns(kins_annIds) + kins_ann_info = _parse_coco_ann_info(kins_raw_info) + h, w = annos['img_shape'][:2] + gt_masks = [ + _poly2mask(mask, h, w) for mask in kins_ann_info['masks'] + ] + # get mask inds based on iou mapping + bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes) + mask_inds = bbox_iou.argmax(axis=0) + valid_inds = (bbox_iou.max(axis=0) > 0.5) + + # mask the image + # use more precise crop when it is ready + # object_img_patches = np.ascontiguousarray( + # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2)) + # crop image patches using roi_align + # object_img_patches = crop_image_patch_v2( + # torch.Tensor(gt_boxes), + # torch.Tensor(mask_inds).long(), object_img_patches) + object_img_patches, object_masks = crop_image_patch( + gt_boxes, gt_masks, mask_inds, annos['img']) + + for i in range(num_obj): + filename = f'{image_idx}_{names[i]}_{i}.bin' + abs_filepath = osp.join(database_save_path, filename) + rel_filepath = osp.join(f'{info_prefix}_gt_database', filename) + + # save point clouds and image patches for each object + gt_points = points[point_indices[:, i]] + gt_points[:, :3] -= gt_boxes_3d[i, :3] + + if with_mask: + if object_masks[i].sum() == 0 or not valid_inds[i]: + # Skip object for empty or invalid mask + continue + img_patch_path = abs_filepath + '.png' + mask_patch_path = abs_filepath + '.mask.png' + mmcv.imwrite(object_img_patches[i], img_patch_path) + mmcv.imwrite(object_masks[i], mask_patch_path) + + with open(abs_filepath, 'w') as f: + gt_points.tofile(f) + + if (used_classes is None) or names[i] in used_classes: + db_info = { + 'name': names[i], + 'path': rel_filepath, + 'image_idx': image_idx, + 'gt_idx': i, + 'box3d_lidar': gt_boxes_3d[i], + 'num_points_in_gt': gt_points.shape[0], + 'difficulty': difficulty[i], + } + local_group_id = group_ids[i] + # if local_group_id >= 0: + if local_group_id not in group_dict: + group_dict[local_group_id] = group_counter + group_counter += 1 + db_info['group_id'] = group_dict[local_group_id] + if 'score' in annos: + db_info['score'] = annos['score'][i] + if with_mask: + db_info.update({'box2d_camera': gt_boxes[i]}) + if names[i] in all_db_infos: + all_db_infos[names[i]].append(db_info) + else: + all_db_infos[names[i]] = [db_info] + + for k, v in all_db_infos.items(): + print(f'load {len(v)} {k} database infos') + + with open(db_info_save_path, 'wb') as f: + pickle.dump(all_db_infos, f) diff --git a/GenAD-main/tools/data_converter/vad_nuscenes_converter.py b/GenAD-main/tools/data_converter/vad_nuscenes_converter.py new file mode 100644 index 0000000000000000000000000000000000000000..338051cbc544f6860fc3ad6296b1271b037d1bd5 --- /dev/null +++ b/GenAD-main/tools/data_converter/vad_nuscenes_converter.py @@ -0,0 +1,1005 @@ +import os +import math +import copy +import argparse +from os import path as osp +from collections import OrderedDict +from typing import List, Tuple, Union + +import mmcv +import numpy as np +from pyquaternion import Quaternion +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.data_classes import Box +from shapely.geometry import MultiPoint, box +from mmdet3d.datasets import NuScenesDataset +from nuscenes.utils.geometry_utils import view_points +from mmdet3d.core.bbox.box_np_ops import points_cam2img +from nuscenes.utils.geometry_utils import transform_matrix + + +nus_categories = ('car', 'truck', 
'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + +nus_attributes = ('cycle.with_rider', 'cycle.without_rider', + 'pedestrian.moving', 'pedestrian.standing', + 'pedestrian.sitting_lying_down', 'vehicle.moving', + 'vehicle.parked', 'vehicle.stopped', 'None') + +ego_width, ego_length = 1.85, 4.084 + +def quart_to_rpy(qua): + x, y, z, w = qua + roll = math.atan2(2 * (w * x + y * z), 1 - 2 * (x * x + y * y)) + pitch = math.asin(2 * (w * y - x * z)) + yaw = math.atan2(2 * (w * z + x * y), 1 - 2 * (z * z + y * y)) + return roll, pitch, yaw + +def locate_message(utimes, utime): + i = np.searchsorted(utimes, utime) + if i == len(utimes) or (i > 0 and utime - utimes[i-1] < utimes[i] - utime): + i -= 1 + return i + + +def create_nuscenes_infos(root_path, + out_path, + can_bus_root_path, + info_prefix, + version='v1.0-trainval', + max_sweeps=10): + """Create info file of nuscene dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + root_path (str): Path of the data root. + info_prefix (str): Prefix of the info file to be generated. + version (str): Version of the data. + Default: 'v1.0-trainval' + max_sweeps (int): Max number of sweeps. + Default: 10 + """ + from nuscenes.nuscenes import NuScenes + from nuscenes.can_bus.can_bus_api import NuScenesCanBus + print(version, root_path) + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path) + from nuscenes.utils import splits + available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] + assert version in available_vers + if version == 'v1.0-trainval': + train_scenes = splits.train + val_scenes = splits.val + elif version == 'v1.0-test': + train_scenes = splits.test + val_scenes = [] + elif version == 'v1.0-mini': + train_scenes = splits.mini_train + val_scenes = splits.mini_val + else: + raise ValueError('unknown') + + # filter existing scenes. 
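
For orientation before the scene filtering below: the files this function finally writes (via mmcv.dump further down) are plain pickled dicts with an 'infos' list, one entry per key frame, and a small 'metadata' dict. A hedged sketch of loading one back; the path and prefix are placeholders:

import mmcv

data = mmcv.load('./data/nuscenes/vad_nuscenes_infos_temporal_train.pkl')  # placeholder path
print(data['metadata']['version'])      # e.g. 'v1.0-trainval'
print(len(data['infos']))               # number of key-frame samples
info = data['infos'][0]
print(info['token'], info['lidar_path'])
print(info['gt_ego_fut_cmd'])           # one-hot drive command: [right, left, straight]
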
+ available_scenes = get_available_scenes(nusc) + available_scene_names = [s['name'] for s in available_scenes] + train_scenes = list( + filter(lambda x: x in available_scene_names, train_scenes)) + val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) + train_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in train_scenes + ]) + val_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in val_scenes + ]) + + test = 'test' in version + if test: + print('test scene: {}'.format(len(train_scenes))) + else: + print('train scene: {}, val scene: {}'.format( + len(train_scenes), len(val_scenes))) + + train_nusc_infos, val_nusc_infos = _fill_trainval_infos( + nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps) + + metadata = dict(version=version) + if test: + print('test sample: {}'.format(len(train_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_test.pkl'.format(info_prefix)) + mmcv.dump(data, info_path) + else: + print('train sample: {}, val sample: {}'.format( + len(train_nusc_infos), len(val_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_train.pkl'.format(info_prefix)) + mmcv.dump(data, info_path) + data['infos'] = val_nusc_infos + info_val_path = osp.join(out_path, + '{}_infos_temporal_val.pkl'.format(info_prefix)) + mmcv.dump(data, info_val_path) + + +def get_available_scenes(nusc): + """Get available scenes from the input nuscenes class. + + Given the raw data, get the information of available scenes for + further info generation. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + + Returns: + available_scenes (list[dict]): List of basic information for the + available scenes. + """ + available_scenes = [] + print('total scene num: {}'.format(len(nusc.scene))) + for scene in nusc.scene: + scene_token = scene['token'] + scene_rec = nusc.get('scene', scene_token) + sample_rec = nusc.get('sample', scene_rec['first_sample_token']) + sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + has_more_frames = True + scene_not_exist = False + while has_more_frames: + lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) + lidar_path = str(lidar_path) + if os.getcwd() in lidar_path: + # path from lyftdataset is absolute path + lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] + # relative path + if not mmcv.is_filepath(lidar_path): + scene_not_exist = True + break + else: + break + if scene_not_exist: + continue + available_scenes.append(scene) + print('exist scene num: {}'.format(len(available_scenes))) + return available_scenes + + +def _get_can_bus_info(nusc, nusc_can_bus, sample): + scene_name = nusc.get('scene', sample['scene_token'])['name'] + sample_timestamp = sample['timestamp'] + try: + pose_list = nusc_can_bus.get_messages(scene_name, 'pose') + except: + return np.zeros(18) # server scenes do not have can bus information. 
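
The zero-vector fallback above has the same length as the feature assembled next: a CAN-bus 'pose' record contributes position (3) and an orientation quaternion (4), the remaining pose fields (accel, rotation_rate, vel; 3 values each) bring the count to 16, and two trailing placeholder slots are appended, giving 18. A compact restatement for a single pose message; the explicit field ordering is an assumption about the devkit's pose schema:

import numpy as np

def pack_can_bus(pose_msg):
    """Flatten one CAN-bus pose message into the 18-value layout used below."""
    vec = []
    vec.extend(pose_msg['pos'])            # 3: global x, y, z
    vec.extend(pose_msg['orientation'])    # 4: quaternion
    vec.extend(pose_msg['accel'])          # 3
    vec.extend(pose_msg['rotation_rate'])  # 3
    vec.extend(pose_msg['vel'])            # 3
    vec.extend([0.0, 0.0])                 # 2 trailing placeholder slots
    return np.array(vec)                   # 18 values, matching np.zeros(18) above
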
+ can_bus = [] + # during each scene, the first timestamp of can_bus may be large than the first sample's timestamp + last_pose = pose_list[0] + for i, pose in enumerate(pose_list): + if pose['utime'] > sample_timestamp: + break + last_pose = pose + _ = last_pose.pop('utime') # useless + pos = last_pose.pop('pos') + rotation = last_pose.pop('orientation') + can_bus.extend(pos) + can_bus.extend(rotation) + for key in last_pose.keys(): + can_bus.extend(pose[key]) # 16 elements + can_bus.extend([0., 0.]) + return np.array(can_bus) + + +def _fill_trainval_infos(nusc, + nusc_can_bus, + train_scenes, + val_scenes, + test=False, + max_sweeps=10, + fut_ts=6, + his_ts=2): + """Generate the train/val infos from the raw data. + + Args: + nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. + train_scenes (list[str]): Basic information of training scenes. + val_scenes (list[str]): Basic information of validation scenes. + test (bool): Whether use the test mode. In the test mode, no + annotations can be accessed. Default: False. + max_sweeps (int): Max number of sweeps. Default: 10. + + Returns: + tuple[list[dict]]: Information of training set and validation set + that will be saved to the info file. + """ + train_nusc_infos = [] + val_nusc_infos = [] + frame_idx = 0 + cat2idx = {} + for idx, dic in enumerate(nusc.category): + cat2idx[dic['name']] = idx + + for sample in mmcv.track_iter_progress(nusc.sample): + map_location = nusc.get('log', nusc.get('scene', sample['scene_token'])['log_token'])['location'] + lidar_token = sample['data']['LIDAR_TOP'] + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + if sample['prev'] != '': + sample_prev = nusc.get('sample', sample['prev']) + sd_rec_prev = nusc.get('sample_data', sample_prev['data']['LIDAR_TOP']) + pose_record_prev = nusc.get('ego_pose', sd_rec_prev['ego_pose_token']) + else: + pose_record_prev = None + if sample['next'] != '': + sample_next = nusc.get('sample', sample['next']) + sd_rec_next = nusc.get('sample_data', sample_next['data']['LIDAR_TOP']) + pose_record_next = nusc.get('ego_pose', sd_rec_next['ego_pose_token']) + else: + pose_record_next = None + + lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) + + mmcv.check_file_exist(lidar_path) + can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample) + fut_valid_flag = True + test_sample = copy.deepcopy(sample) + for i in range(fut_ts): + if test_sample['next'] != '': + test_sample = nusc.get('sample', test_sample['next']) + else: + fut_valid_flag = False + ## + info = { + 'lidar_path': lidar_path, + 'token': sample['token'], + 'prev': sample['prev'], + 'next': sample['next'], + 'can_bus': can_bus, + 'frame_idx': frame_idx, # temporal related info + 'sweeps': [], + 'cams': dict(), + 'scene_token': sample['scene_token'], # temporal related info + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sample['timestamp'], + 'fut_valid_flag': fut_valid_flag, + 'map_location': map_location + } + + if sample['next'] == '': + frame_idx = 0 + else: + frame_idx += 1 + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + 
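
These rotation matrices feed obtain_sensor2top below, whose R/T expressions are the row-vector ("points @ R.T + T") form of chaining sensor -> its own ego frame -> global -> the key frame's ego -> lidar. The same result can be cross-checked with 4x4 homogeneous matrices; a sketch for verification only, with illustrative helper names (nuScenes stores quaternions as [w, x, y, z], which is what pyquaternion expects):

import numpy as np
from pyquaternion import Quaternion

def rt(rotation, translation):
    """4x4 rigid transform from a [w, x, y, z] quaternion and a translation."""
    m = np.eye(4)
    m[:3, :3] = Quaternion(rotation).rotation_matrix
    m[:3, 3] = translation
    return m

def sensor2lidar(cs_sensor, pose_sensor, cs_lidar, pose_lidar):
    """T_lidar<-sensor = inv(T_ego_key<-lidar) @ inv(T_global<-ego_key) @ T_global<-ego_sweep @ T_ego_sweep<-sensor."""
    t = (np.linalg.inv(rt(cs_lidar['rotation'], cs_lidar['translation'])) @
         np.linalg.inv(rt(pose_lidar['rotation'], pose_lidar['translation'])) @
         rt(pose_sensor['rotation'], pose_sensor['translation']) @
         rt(cs_sensor['rotation'], cs_sensor['translation']))
    # t[:3, :3] and t[:3, 3] should agree, up to numerical noise, with the
    # sensor2lidar_rotation / sensor2lidar_translation produced by obtain_sensor2top.
    return t[:3, :3], t[:3, 3]
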
e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + # obtain 6 image's information per frame + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + for cam in camera_types: + cam_token = sample['data'][cam] + cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token) + cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat, + e2g_t, e2g_r_mat, cam) + cam_info.update(cam_intrinsic=cam_intrinsic) + info['cams'].update({cam: cam_info}) + + # obtain sweeps for a single key-frame + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + sweeps = [] + while len(sweeps) < max_sweeps: + if not sd_rec['prev'] == '': + sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t, + l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') + sweeps.append(sweep) + sd_rec = nusc.get('sample_data', sd_rec['prev']) + else: + break + info['sweeps'] = sweeps + # obtain annotation + if not test: + annotations = [ + nusc.get('sample_annotation', token) + for token in sample['anns'] + ] + locs = np.array([b.center for b in boxes]).reshape(-1, 3) + dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) + rots = np.array([b.orientation.yaw_pitch_roll[0] + for b in boxes]).reshape(-1, 1) + velocity = np.array( + [nusc.box_velocity(token)[:2] for token in sample['anns']]) + valid_flag = np.array( + [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0 + for anno in annotations], + dtype=bool).reshape(-1) + # convert velo from global to lidar + for i in range(len(boxes)): + velo = np.array([*velocity[i], 0.0]) + velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv( + l2e_r_mat).T + velocity[i] = velo[:2] + + names = [b.name for b in boxes] + for i in range(len(names)): + if names[i] in NuScenesDataset.NameMapping: + names[i] = NuScenesDataset.NameMapping[names[i]] + names = np.array(names) + # we need to convert rot to SECOND format. + gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1) + assert len(gt_boxes) == len( + annotations), f'{len(gt_boxes)}, {len(annotations)}' + + # get future coords for each box + # [num_box, fut_ts*2] + num_box = len(boxes) + gt_fut_trajs = np.zeros((num_box, fut_ts, 2)) + gt_fut_yaw = np.zeros((num_box, fut_ts)) + gt_fut_masks = np.zeros((num_box, fut_ts)) + gt_boxes_yaw = -(gt_boxes[:,6] + np.pi / 2) + # agent lcf feat (x, y, yaw, vx, vy, width, length, height, type) + agent_lcf_feat = np.zeros((num_box, 9)) + gt_fut_goal = np.zeros((num_box)) + for i, anno in enumerate(annotations): + cur_box = boxes[i] + cur_anno = anno + agent_lcf_feat[i, 0:2] = cur_box.center[:2] + agent_lcf_feat[i, 2] = gt_boxes_yaw[i] + agent_lcf_feat[i, 3:5] = velocity[i] + agent_lcf_feat[i, 5:8] = anno['size'] # width,length,height + agent_lcf_feat[i, 8] = cat2idx[anno['category_name']] if anno['category_name'] in cat2idx.keys() else -1 + for j in range(fut_ts): + if cur_anno['next'] != '': + anno_next = nusc.get('sample_annotation', cur_anno['next']) + box_next = Box( + anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation']) + ) + # Move box to ego vehicle coord system. + box_next.translate(-np.array(pose_record['translation'])) + box_next.rotate(Quaternion(pose_record['rotation']).inverse) + # Move box to sensor coord system. 
+ box_next.translate(-np.array(cs_record['translation'])) + box_next.rotate(Quaternion(cs_record['rotation']).inverse) + gt_fut_trajs[i, j] = box_next.center[:2] - cur_box.center[:2] + gt_fut_masks[i, j] = 1 + # add yaw diff + _, _, box_yaw = quart_to_rpy([cur_box.orientation.x, cur_box.orientation.y, + cur_box.orientation.z, cur_box.orientation.w]) + _, _, box_yaw_next = quart_to_rpy([box_next.orientation.x, box_next.orientation.y, + box_next.orientation.z, box_next.orientation.w]) + gt_fut_yaw[i, j] = box_yaw_next - box_yaw + cur_anno = anno_next + cur_box = box_next + else: + gt_fut_trajs[i, j:] = 0 + break + # get agent goal + gt_fut_coords = np.cumsum(gt_fut_trajs[i], axis=-2) + coord_diff = gt_fut_coords[-1] - gt_fut_coords[0] + if coord_diff.max() < 1.0: # static + gt_fut_goal[i] = 9 + else: + box_mot_yaw = np.arctan2(coord_diff[1], coord_diff[0]) + np.pi + gt_fut_goal[i] = box_mot_yaw // (np.pi / 4) # 0-8: goal direction class + + # get ego history traj (offset format) + ego_his_trajs = np.zeros((his_ts+1, 3)) + ego_his_trajs_diff = np.zeros((his_ts+1, 3)) + sample_cur = sample + for i in range(his_ts, -1, -1): + if sample_cur is not None: + pose_mat = get_global_sensor_pose(sample_cur, nusc, inverse=False) + ego_his_trajs[i] = pose_mat[:3, 3] + has_prev = sample_cur['prev'] != '' + has_next = sample_cur['next'] != '' + if has_next: + sample_next = nusc.get('sample', sample_cur['next']) + pose_mat_next = get_global_sensor_pose(sample_next, nusc, inverse=False) + ego_his_trajs_diff[i] = pose_mat_next[:3, 3] - ego_his_trajs[i] + sample_cur = nusc.get('sample', sample_cur['prev']) if has_prev else None + else: + ego_his_trajs[i] = ego_his_trajs[i+1] - ego_his_trajs_diff[i+1] + ego_his_trajs_diff[i] = ego_his_trajs_diff[i+1] + + # global to ego at lcf + ego_his_trajs = ego_his_trajs - np.array(pose_record['translation']) + rot_mat = Quaternion(pose_record['rotation']).inverse.rotation_matrix + ego_his_trajs = np.dot(rot_mat, ego_his_trajs.T).T + # ego to lidar at lcf + ego_his_trajs = ego_his_trajs - np.array(cs_record['translation']) + rot_mat = Quaternion(cs_record['rotation']).inverse.rotation_matrix + ego_his_trajs = np.dot(rot_mat, ego_his_trajs.T).T + ego_his_trajs = ego_his_trajs[1:] - ego_his_trajs[:-1] + + # get ego futute traj (offset format) + ego_fut_trajs = np.zeros((fut_ts+1, 3)) + ego_fut_masks = np.zeros((fut_ts+1)) + sample_cur = sample + for i in range(fut_ts+1): + pose_mat = get_global_sensor_pose(sample_cur, nusc, inverse=False) + ego_fut_trajs[i] = pose_mat[:3, 3] + ego_fut_masks[i] = 1 + if sample_cur['next'] == '': + ego_fut_trajs[i+1:] = ego_fut_trajs[i] + break + else: + sample_cur = nusc.get('sample', sample_cur['next']) + # global to ego at lcf + ego_fut_trajs = ego_fut_trajs - np.array(pose_record['translation']) + rot_mat = Quaternion(pose_record['rotation']).inverse.rotation_matrix + ego_fut_trajs = np.dot(rot_mat, ego_fut_trajs.T).T + # ego to lidar at lcf + ego_fut_trajs = ego_fut_trajs - np.array(cs_record['translation']) + rot_mat = Quaternion(cs_record['rotation']).inverse.rotation_matrix + ego_fut_trajs = np.dot(rot_mat, ego_fut_trajs.T).T + # drive command according to final fut step offset from lcf + if ego_fut_trajs[-1][0] >= 2: + command = np.array([1, 0, 0]) # Turn Right + elif ego_fut_trajs[-1][0] <= -2: + command = np.array([0, 1, 0]) # Turn Left + else: + command = np.array([0, 0, 1]) # Go Straight + # offset from lcf -> per-step offset + ego_fut_trajs = ego_fut_trajs[1:] - ego_fut_trajs[:-1] + + ### ego lcf feat (vx, vy, ax, ay, w, length, 
width, vel, steer), w: yaw角速度 + ego_lcf_feat = np.zeros(9) + # 根据odom推算自车速度及加速度 + _, _, ego_yaw = quart_to_rpy(pose_record['rotation']) + ego_pos = np.array(pose_record['translation']) + if pose_record_prev is not None: + _, _, ego_yaw_prev = quart_to_rpy(pose_record_prev['rotation']) + ego_pos_prev = np.array(pose_record_prev['translation']) + if pose_record_next is not None: + _, _, ego_yaw_next = quart_to_rpy(pose_record_next['rotation']) + ego_pos_next = np.array(pose_record_next['translation']) + assert (pose_record_prev is not None) or (pose_record_next is not None), 'prev token and next token all empty' + if pose_record_prev is not None: + ego_w = (ego_yaw - ego_yaw_prev) / 0.5 + ego_v = np.linalg.norm(ego_pos[:2] - ego_pos_prev[:2]) / 0.5 + ego_vx, ego_vy = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2) + else: + ego_w = (ego_yaw_next - ego_yaw) / 0.5 + ego_v = np.linalg.norm(ego_pos_next[:2] - ego_pos[:2]) / 0.5 + ego_vx, ego_vy = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2) + + ref_scene = nusc.get("scene", sample['scene_token']) + try: + pose_msgs = nusc_can_bus.get_messages(ref_scene['name'],'pose') + steer_msgs = nusc_can_bus.get_messages(ref_scene['name'], 'steeranglefeedback') + pose_uts = [msg['utime'] for msg in pose_msgs] + steer_uts = [msg['utime'] for msg in steer_msgs] + ref_utime = sample['timestamp'] + pose_index = locate_message(pose_uts, ref_utime) + pose_data = pose_msgs[pose_index] + steer_index = locate_message(steer_uts, ref_utime) + steer_data = steer_msgs[steer_index] + # initial speed + v0 = pose_data["vel"][0] # [0] means longitudinal velocity m/s + # curvature (positive: turn left) + steering = steer_data["value"] + # flip x axis if in left-hand traffic (singapore) + flip_flag = True if map_location.startswith('singapore') else False + if flip_flag: + steering *= -1 + Kappa = 2 * steering / 2.588 + except: + delta_x = ego_his_trajs[-1, 0] + ego_fut_trajs[0, 0] + delta_y = ego_his_trajs[-1, 1] + ego_fut_trajs[0, 1] + v0 = np.sqrt(delta_x**2 + delta_y**2) + Kappa = 0 + + ego_lcf_feat[:2] = np.array([ego_vx, ego_vy]) #can_bus[13:15] + ego_lcf_feat[2:4] = can_bus[7:9] + ego_lcf_feat[4] = ego_w #can_bus[12] + ego_lcf_feat[5:7] = np.array([ego_length, ego_width]) + ego_lcf_feat[7] = v0 + ego_lcf_feat[8] = Kappa + + info['gt_boxes'] = gt_boxes + info['gt_names'] = names + info['gt_velocity'] = velocity.reshape(-1, 2) + info['num_lidar_pts'] = np.array( + [a['num_lidar_pts'] for a in annotations]) + info['num_radar_pts'] = np.array( + [a['num_radar_pts'] for a in annotations]) + info['valid_flag'] = valid_flag + info['gt_agent_fut_trajs'] = gt_fut_trajs.reshape(-1, fut_ts*2).astype(np.float32) + info['gt_agent_fut_masks'] = gt_fut_masks.reshape(-1, fut_ts).astype(np.float32) + info['gt_agent_lcf_feat'] = agent_lcf_feat.astype(np.float32) + info['gt_agent_fut_yaw'] = gt_fut_yaw.astype(np.float32) + info['gt_agent_fut_goal'] = gt_fut_goal.astype(np.float32) + info['gt_ego_his_trajs'] = ego_his_trajs[:, :2].astype(np.float32) + info['gt_ego_fut_trajs'] = ego_fut_trajs[:, :2].astype(np.float32) + info['gt_ego_fut_masks'] = ego_fut_masks[1:].astype(np.float32) + info['gt_ego_fut_cmd'] = command.astype(np.float32) + info['gt_ego_lcf_feat'] = ego_lcf_feat.astype(np.float32) + + if sample['scene_token'] in train_scenes: + train_nusc_infos.append(info) + else: + val_nusc_infos.append(info) + + return train_nusc_infos, val_nusc_infos + +def get_global_sensor_pose(rec, nusc, inverse=False): + lidar_sample_data = 
nusc.get('sample_data', rec['data']['LIDAR_TOP']) + + sd_ep = nusc.get("ego_pose", lidar_sample_data["ego_pose_token"]) + sd_cs = nusc.get("calibrated_sensor", lidar_sample_data["calibrated_sensor_token"]) + if inverse is False: + global_from_ego = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=False) + ego_from_sensor = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=False) + pose = global_from_ego.dot(ego_from_sensor) + # translation equivalent writing + # pose_translation = np.array(sd_cs["translation"]) + # rot_mat = Quaternion(sd_ep['rotation']).rotation_matrix + # pose_translation = np.dot(rot_mat, pose_translation) + # # pose_translation = pose[:3, 3] + # pose_translation = pose_translation + np.array(sd_ep["translation"]) + else: + sensor_from_ego = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=True) + ego_from_global = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=True) + pose = sensor_from_ego.dot(ego_from_global) + return pose + +def obtain_sensor2top(nusc, + sensor_token, + l2e_t, + l2e_r_mat, + e2g_t, + e2g_r_mat, + sensor_type='lidar'): + """Obtain the info with RT matric from general sensor to Top LiDAR. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + sensor_token (str): Sample data token corresponding to the + specific sensor type. + l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3). + l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego + in shape (3, 3). + e2g_t (np.ndarray): Translation from ego to global in shape (1, 3). + e2g_r_mat (np.ndarray): Rotation matrix from ego to global + in shape (3, 3). + sensor_type (str): Sensor to calibrate. Default: 'lidar'. + + Returns: + sweep (dict): Sweep information after transformation. + """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T + sweep['sensor2lidar_translation'] = T + return sweep + + +def export_2d_annotation(root_path, info_path, version, mono3d=False): + """Export 2d annotation from the info file and raw data. 
+ + Args: + root_path (str): Root path of the raw data. + info_path (str): Path of the info file. + version (str): Dataset version. + mono3d (bool): Whether to export mono3d annotation. Default: False. + """ + # get bbox annotations for camera + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + nusc_infos = mmcv.load(info_path)['infos'] + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + # info_2d_list = [] + cat2Ids = [ + dict(id=nus_categories.index(cat_name), name=cat_name) + for cat_name in nus_categories + ] + coco_ann_id = 0 + coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) + for info in mmcv.track_iter_progress(nusc_infos): + for cam in camera_types: + cam_info = info['cams'][cam] + coco_infos = get_2d_boxes( + nusc, + cam_info['sample_data_token'], + visibilities=['', '1', '2', '3', '4'], + mono3d=mono3d) + (height, width, _) = mmcv.imread(cam_info['data_path']).shape + coco_2d_dict['images'].append( + dict( + file_name=cam_info['data_path'].split('data/nuscenes/') + [-1], + id=cam_info['sample_data_token'], + token=info['token'], + cam2ego_rotation=cam_info['sensor2ego_rotation'], + cam2ego_translation=cam_info['sensor2ego_translation'], + ego2global_rotation=info['ego2global_rotation'], + ego2global_translation=info['ego2global_translation'], + cam_intrinsic=cam_info['cam_intrinsic'], + width=width, + height=height)) + for coco_info in coco_infos: + if coco_info is None: + continue + # add an empty key for coco format + coco_info['segmentation'] = [] + coco_info['id'] = coco_ann_id + coco_2d_dict['annotations'].append(coco_info) + coco_ann_id += 1 + if mono3d: + json_prefix = f'{info_path[:-4]}_mono3d' + else: + json_prefix = f'{info_path[:-4]}' + mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json') + + +def get_2d_boxes(nusc, + sample_data_token: str, + visibilities: List[str], + mono3d=True): + """Get the 2D annotation records for a given `sample_data_token`. + + Args: + sample_data_token (str): Sample data token belonging to a camera \ + keyframe. + visibilities (list[str]): Visibility filter. + mono3d (bool): Whether to get boxes with mono3d annotation. + + Return: + list[dict]: List of 2D annotation record that belongs to the input + `sample_data_token`. + """ + + # Get the sample data and the sample corresponding to that sample data. + sd_rec = nusc.get('sample_data', sample_data_token) + + assert sd_rec[ + 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \ + ' for camera sample_data!' + if not sd_rec['is_key_frame']: + raise ValueError( + 'The 2D re-projections are available only for keyframes.') + + s_rec = nusc.get('sample', sd_rec['sample_token']) + + # Get the calibrated sensor and ego pose + # record to get the transformation matrices. + cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) + pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token']) + camera_intrinsic = np.array(cs_rec['camera_intrinsic']) + + # Get all the annotation with the specified visibilties. + ann_recs = [ + nusc.get('sample_annotation', token) for token in s_rec['anns'] + ] + ann_recs = [ + ann_rec for ann_rec in ann_recs + if (ann_rec['visibility_token'] in visibilities) + ] + + repro_recs = [] + + for ann_rec in ann_recs: + # Augment sample_annotation with token information. + ann_rec['sample_annotation_token'] = ann_rec['token'] + ann_rec['sample_data_token'] = sample_data_token + + # Get the box in global coordinates. 
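
Each annotation below is walked from global coordinates into the ego frame, then the camera frame, projected with view_points, and finally clipped to the image canvas by post_process_coords (defined after this function). A toy illustration of that last clipping step; the corner values are made up:

corners = [(-50.0, 100.0), (200.0, 80.0), (180.0, 400.0), (-20.0, 420.0)]
print(post_process_coords(corners))  # hull clipped to the 1600x900 canvas -> roughly (0, 80, 200, 418)
print(post_process_coords([(1700.0, 950.0), (1800.0, 990.0), (1750.0, 1000.0)]))  # fully outside -> None
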
+ box = nusc.get_box(ann_rec['token']) + + # Move them to the ego-pose frame. + box.translate(-np.array(pose_rec['translation'])) + box.rotate(Quaternion(pose_rec['rotation']).inverse) + + # Move them to the calibrated sensor frame. + box.translate(-np.array(cs_rec['translation'])) + box.rotate(Quaternion(cs_rec['rotation']).inverse) + + # Filter out the corners that are not in front of the calibrated + # sensor. + corners_3d = box.corners() + in_front = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_front] + + # Project 3d box to 2d. + corner_coords = view_points(corners_3d, camera_intrinsic, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. + final_coords = post_process_coords(corner_coords) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. + if final_coords is None: + continue + else: + min_x, min_y, max_x, max_y = final_coords + + # Generate dictionary record to be included in the .json file. + repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, + sample_data_token, sd_rec['filename']) + + # If mono3d=True, add 3D annotations in camera coordinates + if mono3d and (repro_rec is not None): + loc = box.center.tolist() + + dim = box.wlh + dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw + dim = dim.tolist() + + rot = box.orientation.yaw_pitch_roll[0] + rot = [-rot] # convert the rot to our cam coordinate + + global_velo2d = nusc.box_velocity(box.token)[:2] + global_velo3d = np.array([*global_velo2d, 0.0]) + e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix + c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix + cam_velo3d = global_velo3d @ np.linalg.inv( + e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T + velo = cam_velo3d[0::2].tolist() + + repro_rec['bbox_cam3d'] = loc + dim + rot + repro_rec['velo_cam3d'] = velo + + center3d = np.array(loc).reshape([1, 3]) + center2d = points_cam2img( + center3d, camera_intrinsic, with_depth=True) + repro_rec['center2d'] = center2d.squeeze().tolist() + # normalized center2D + depth + # if samples with depth < 0 will be removed + if repro_rec['center2d'][2] <= 0: + continue + + ann_token = nusc.get('sample_annotation', + box.token)['attribute_tokens'] + if len(ann_token) == 0: + attr_name = 'None' + else: + attr_name = nusc.get('attribute', ann_token[0])['name'] + attr_id = nus_attributes.index(attr_name) + repro_rec['attribute_name'] = attr_name + repro_rec['attribute_id'] = attr_id + + repro_recs.append(repro_rec) + + return repro_recs + + +def post_process_coords( + corner_coords: List, imsize: Tuple[int, int] = (1600, 900) +) -> Union[Tuple[float, float, float, float], None]: + """Get the intersection of the convex hull of the reprojected bbox corners + and the image canvas, return None if no intersection. + + Args: + corner_coords (list[int]): Corner coordinates of reprojected + bounding box. + imsize (tuple[int]): Size of the image canvas. + + Return: + tuple [float]: Intersection of the convex hull of the 2D box + corners and the image canvas. 
+ """ + polygon_from_2d_box = MultiPoint(corner_coords).convex_hull + img_canvas = box(0, 0, imsize[0], imsize[1]) + + if polygon_from_2d_box.intersects(img_canvas): + img_intersection = polygon_from_2d_box.intersection(img_canvas) + intersection_coords = np.array( + [coord for coord in img_intersection.exterior.coords]) + + min_x = min(intersection_coords[:, 0]) + min_y = min(intersection_coords[:, 1]) + max_x = max(intersection_coords[:, 0]) + max_y = max(intersection_coords[:, 1]) + + return min_x, min_y, max_x, max_y + else: + return None + + +def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, + sample_data_token: str, filename: str) -> OrderedDict: + """Generate one 2D annotation record given various informations on top of + the 2D bounding box coordinates. + + Args: + ann_rec (dict): Original 3d annotation record. + x1 (float): Minimum value of the x coordinate. + y1 (float): Minimum value of the y coordinate. + x2 (float): Maximum value of the x coordinate. + y2 (float): Maximum value of the y coordinate. + sample_data_token (str): Sample data token. + filename (str):The corresponding image file where the annotation + is present. + + Returns: + dict: A sample 2D annotation record. + - file_name (str): flie name + - image_id (str): sample data token + - area (float): 2d box area + - category_name (str): category name + - category_id (int): category id + - bbox (list[float]): left x, top y, dx, dy of 2d box + - iscrowd (int): whether the area is crowd + """ + repro_rec = OrderedDict() + repro_rec['sample_data_token'] = sample_data_token + coco_rec = dict() + + relevant_keys = [ + 'attribute_tokens', + 'category_name', + 'instance_token', + 'next', + 'num_lidar_pts', + 'num_radar_pts', + 'prev', + 'sample_annotation_token', + 'sample_data_token', + 'visibility_token', + ] + + for key, value in ann_rec.items(): + if key in relevant_keys: + repro_rec[key] = value + + repro_rec['bbox_corners'] = [x1, y1, x2, y2] + repro_rec['filename'] = filename + + coco_rec['file_name'] = filename + coco_rec['image_id'] = sample_data_token + coco_rec['area'] = (y2 - y1) * (x2 - x1) + + if repro_rec['category_name'] not in NuScenesDataset.NameMapping: + return None + cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']] + coco_rec['category_name'] = cat_name + coco_rec['category_id'] = nus_categories.index(cat_name) + coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] + coco_rec['iscrowd'] = 0 + + return coco_rec + + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. 
Default: 10
+    """
+    create_nuscenes_infos(
+        root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps)
+
+
+parser = argparse.ArgumentParser(description='Data converter arg parser')
+parser.add_argument('dataset', metavar='kitti', help='name of the dataset')
+parser.add_argument(
+    '--root-path',
+    type=str,
+    default='./data/kitti',
+    help='specify the root path of dataset')
+parser.add_argument(
+    '--canbus',
+    type=str,
+    default='./data',
+    help='specify the root path of nuScenes canbus')
+parser.add_argument(
+    '--version',
+    type=str,
+    default='v1.0',
+    required=False,
+    help='specify the dataset version, no need for kitti')
+parser.add_argument(
+    '--max-sweeps',
+    type=int,
+    default=10,
+    required=False,
+    help='specify sweeps of lidar per example')
+parser.add_argument(
+    '--out-dir',
+    type=str,
+    default='./data/kitti',
+    required=False,
+    help='output directory of the generated info files')
+parser.add_argument('--extra-tag', type=str, default='kitti')
+parser.add_argument(
+    '--workers', type=int, default=4, help='number of threads to be used')
+args = parser.parse_args()
+
+if __name__ == '__main__':
+    if args.dataset == 'nuscenes' and args.version != 'v1.0-mini':
+        train_version = f'{args.version}-trainval'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=train_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
+        test_version = f'{args.version}-test'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=test_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
+    elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini':
+        train_version = f'{args.version}'
+        nuscenes_data_prep(
+            root_path=args.root_path,
+            can_bus_root_path=args.canbus,
+            info_prefix=args.extra_tag,
+            version=train_version,
+            dataset_name='NuScenesDataset',
+            out_dir=args.out_dir,
+            max_sweeps=args.max_sweeps)
diff --git a/GenAD-main/tools/dist_test.sh b/GenAD-main/tools/dist_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3e2ec3007b1d5927a5bc5a63140ee7e11f500142
--- /dev/null
+++ b/GenAD-main/tools/dist_test.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+CHECKPOINT=$2
+GPUS=$3
+PORT=${PORT:-29503}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval bbox
diff --git a/GenAD-main/tools/dist_train.sh b/GenAD-main/tools/dist_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..141b284d5e80e42dc66424c4c4900394413bc7fb
--- /dev/null
+++ b/GenAD-main/tools/dist_train.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-28509}
+
+PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
+python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
+    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
diff --git a/GenAD-main/tools/dist_train_multi_nodes.sh b/GenAD-main/tools/dist_train_multi_nodes.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c3bcc56f6a1dc5994311a6959ceb24c97094028d
--- /dev/null
+++ b/GenAD-main/tools/dist_train_multi_nodes.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+set -x
+CONFIG=$1
+GPUS=$2
+PORT=${PORT:-28509}
+
+if ! command -v nslookup &> /dev/null; then
+    apt update
+    apt install dnsutils -y
+fi
+
+output=$(nslookup $MY_APP_NAME)
+addresses=$(echo "$output" | awk '/^Name:/ { name=$2; next } name && /^Address:/ { print $2 }')
+sorted_address_list=($(printf '%s\n' "${addresses[@]}" | sort))
+
+i=0
+IFS=' ' read -ra addresses <<< "${sorted_address_list[@]}"
+
+for address in "${addresses[@]}"; do
+    POD_IPs[$i]=$address
+    i=$((i+1))
+done
+
+length=${#POD_IPs[@]}
+
+local_ip=$(hostname -I | grep -oP '\d+\.\d+\.\d+\.\d+')
+echo "local ip is $local_ip"
+echo "master ip is ${POD_IPs[0]}"
+
+if [ "$local_ip" == ${POD_IPs[0]} ]; then
+    #python -m torch.distributed.run --nproc_per_node=8 --master_port=2333 tools/train.py projects/configs/VAD/VAD_tiny_e2e.py --launcher pytorch --deterministic --work-dir ./outputs/VAD_tiny_e2e_v1_ar_test
+    source /remote-home/share/miniconda3/bin/activate && conda activate vad && python -m torch.distributed.launch --nproc_per_node=$GPUS --nnodes=2 --node_rank=0 --master_addr=${POD_IPs[0]} --master_port=$PORT $(dirname "$0")/train.py projects/configs/VAD/VAD_tiny_e2e.py --launcher pytorch ${@:3} --deterministic --work-dir ./outputs/VAD_tiny_e2e_v1_ar_test
+
+fi
+
+if [ "$local_ip" == ${POD_IPs[1]} ]; then
+    source /remote-home/share/miniconda3/bin/activate && conda activate vad && python -m torch.distributed.launch --nproc_per_node=$GPUS --nnodes=2 --node_rank=1 --master_addr=${POD_IPs[0]} --master_port=$PORT $(dirname "$0")/train.py projects/configs/VAD/VAD_tiny_e2e.py --launcher pytorch ${@:3} --deterministic --work-dir ./outputs/VAD_tiny_e2e_v1_ar_test  # second node must use node_rank=1 (was 0, which collides with the master node)
+# command="python -m torch.distributed.launch --nproc_per_node=$GPUS --nnodes 2 --node_rank 1 --master_addr=${POD_IPs[0]} --master_port=$PORT $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic"
+fi
diff --git a/GenAD-main/tools/exp/data_analysis.py b/GenAD-main/tools/exp/data_analysis.py
new file mode 100644
index 0000000000000000000000000000000000000000..8480cff0dd17fe3cbf1430b0a4f045c284e5e999
--- /dev/null
+++ b/GenAD-main/tools/exp/data_analysis.py
@@ -0,0 +1,22 @@
+import pickle
+
+train = open(r'/home/ubuntu/data/nuscenes/vad_nuscenes_infos_temporal_train.pkl', 'rb')
+val = open(r'/home/ubuntu/data/nuscenes/vad_nuscenes_infos_temporal_val.pkl', 'rb')
+
+content_train = pickle.load(train)
+content_val = pickle.load(val)
+
+train_len = len(content_train['infos'])
+val_len = len(content_val['infos'])
+
+# brute-force check that no validation sample (keyed by lidar_path) leaks into the training split
+for i in range(val_len):
+    val_id = content_val['infos'][i]['lidar_path']
+    for j in range(train_len):
+        train_id = content_train['infos'][j]['lidar_path']
+
+        if val_id == train_id:
+            print("*************** there is val sample in training set ****************: ", j)
+
+
+print('train/val overlap check finished')
\ No newline at end of file
diff --git a/GenAD-main/tools/misc/browse_dataset.py b/GenAD-main/tools/misc/browse_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3419f66df56679088469a842cd62e31906df8a1
--- /dev/null
+++ b/GenAD-main/tools/misc/browse_dataset.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
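+#
+# [Editor's note] Added usage sketch, not part of the original tool. The config
+# path below is illustrative (it is the one used elsewhere in this repo) and the
+# output directory is a placeholder:
+#
+#     python tools/misc/browse_dataset.py projects/configs/VAD/VAD_tiny_e2e.py \
+#         --task multi_modality-det --output-dir ./vis_dataset --online
+#
+# `--task` must be one of {det, seg, multi_modality-det, mono-det}; `--online`
+# additionally displays the visualizations and therefore needs a monitor.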
+import argparse +import numpy as np +import warnings +from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress +from os import path as osp + +from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, + DepthInstance3DBoxes, LiDARInstance3DBoxes) +from mmdet3d.core.visualizer import (show_multi_modality_result, show_result, + show_seg_result) +from mmdet3d.datasets import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--skip-type', + type=str, + nargs='+', + default=['Normalize'], + help='skip some useless pipeline') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument( + '--task', + type=str, + choices=['det', 'seg', 'multi_modality-det', 'mono-det'], + help='Determine the visualization method depending on the task.') + parser.add_argument( + '--online', + action='store_true', + help='Whether to perform online visualization. Note that you often ' + 'need a monitor to do so.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def build_data_cfg(config_path, skip_type, cfg_options): + """Build data config for loading visualization data.""" + cfg = Config.fromfile(config_path) + if cfg_options is not None: + cfg.merge_from_dict(cfg_options) + # import modules from string list. 
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + # extract inner dataset of `RepeatDataset` as `cfg.data.train` + # so we don't need to worry about it later + if cfg.data.train['type'] == 'RepeatDataset': + cfg.data.train = cfg.data.train.dataset + # use only first dataset for `ConcatDataset` + if cfg.data.train['type'] == 'ConcatDataset': + cfg.data.train = cfg.data.train.datasets[0] + train_data_cfg = cfg.data.train + # eval_pipeline purely consists of loading functions + # use eval_pipeline for data loading + train_data_cfg['pipeline'] = [ + x for x in cfg.eval_pipeline if x['type'] not in skip_type + ] + + return cfg + + +def to_depth_mode(points, bboxes): + """Convert points and bboxes to Depth Coord and Depth Box mode.""" + if points is not None: + points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + if bboxes is not None: + bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR, + Box3DMode.DEPTH) + return points, bboxes + + +def show_det_data(idx, dataset, out_dir, filename, show=False): + """Visualize 3D point cloud and 3D bboxes.""" + example = dataset.prepare_train_data(idx) + points = example['points']._data.numpy() + gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor + if dataset.box_mode_3d != Box3DMode.DEPTH: + points, gt_bboxes = to_depth_mode(points, gt_bboxes) + show_result( + points, + gt_bboxes.clone(), + None, + out_dir, + filename, + show=show, + snapshot=True) + + +def show_seg_data(idx, dataset, out_dir, filename, show=False): + """Visualize 3D point cloud and segmentation mask.""" + example = dataset.prepare_train_data(idx) + points = example['points']._data.numpy() + gt_seg = example['pts_semantic_mask']._data.numpy() + show_seg_result( + points, + gt_seg.copy(), + None, + out_dir, + filename, + np.array(dataset.PALETTE), + dataset.ignore_index, + show=show, + snapshot=True) + + +def show_proj_bbox_img(idx, + dataset, + out_dir, + filename, + show=False, + is_nus_mono=False): + """Visualize 3D bboxes on 2D image by projection.""" + try: + example = dataset.prepare_train_data(idx) + except AttributeError: # for Mono-3D datasets + example = dataset.prepare_train_img(idx) + gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'] + img_metas = example['img_metas']._data + img = example['img']._data.numpy() + # need to transpose channel to first dim + img = img.transpose(1, 2, 0) + # no 3D gt bboxes, just show img + if gt_bboxes.tensor.shape[0] == 0: + gt_bboxes = None + if isinstance(gt_bboxes, DepthInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + None, + out_dir, + filename, + box_mode='depth', + img_metas=img_metas, + show=show) + elif isinstance(gt_bboxes, LiDARInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + img_metas['lidar2img'], + out_dir, + filename, + box_mode='lidar', + img_metas=img_metas, + show=show) + elif isinstance(gt_bboxes, CameraInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + img_metas['cam2img'], + out_dir, + filename, + box_mode='camera', + img_metas=img_metas, + show=show) + else: + # can't project, just show img + warnings.warn( + f'unrecognized gt box type {type(gt_bboxes)}, only show image') + show_multi_modality_result( + img, None, None, None, out_dir, filename, show=show) + + +def main(): + args = parse_args() + + if args.output_dir is not None: + mkdir_or_exist(args.output_dir) + + cfg = 
build_data_cfg(args.config, args.skip_type, args.cfg_options) + try: + dataset = build_dataset( + cfg.data.train, default_args=dict(filter_empty_gt=False)) + except TypeError: # seg dataset doesn't have `filter_empty_gt` key + dataset = build_dataset(cfg.data.train) + data_infos = dataset.data_infos + dataset_type = cfg.dataset_type + + # configure visualization mode + vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' + + for idx, data_info in enumerate(track_iter_progress(data_infos)): + if dataset_type in ['KittiDataset', 'WaymoDataset']: + data_path = data_info['point_cloud']['velodyne_path'] + elif dataset_type in [ + 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset', + 'S3DISSegDataset', 'S3DISDataset' + ]: + data_path = data_info['pts_path'] + elif dataset_type in ['NuScenesDataset', 'LyftDataset']: + data_path = data_info['lidar_path'] + elif dataset_type in ['NuScenesMonoDataset']: + data_path = data_info['file_name'] + else: + raise NotImplementedError( + f'unsupported dataset type {dataset_type}') + + file_name = osp.splitext(osp.basename(data_path))[0] + + if vis_task in ['det', 'multi_modality-det']: + # show 3D bboxes on 3D point clouds + show_det_data( + idx, dataset, args.output_dir, file_name, show=args.online) + if vis_task in ['multi_modality-det', 'mono-det']: + # project 3D bboxes to 2D image + show_proj_bbox_img( + idx, + dataset, + args.output_dir, + file_name, + show=args.online, + is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) + elif vis_task in ['seg']: + # show 3D segmentation mask on 3D point clouds + show_seg_data( + idx, dataset, args.output_dir, file_name, show=args.online) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/misc/fuse_conv_bn.py b/GenAD-main/tools/misc/fuse_conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..d4e22018d66d3bd47119522e9da2ea6676ba5760 --- /dev/null +++ b/GenAD-main/tools/misc/fuse_conv_bn.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import torch +from mmcv.runner import save_checkpoint +from torch import nn as nn + +from mmdet.apis import init_model + + +def fuse_conv_bn(conv, bn): + """During inference, the functionary of batch norm layers is turned off but + only the mean and var alone channels are used, which exposes the chance to + fuse it with the preceding conv layers to save computations and simplify + network structures.""" + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_module(m): + last_conv = None + last_conv_name = None + + for name, child in m.named_children(): + if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = fuse_conv_bn(last_conv, child) + m._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. 
+ m._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_module(child) + return m + + +def parse_args(): + parser = argparse.ArgumentParser( + description='fuse Conv and BN layers in a model') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file path') + parser.add_argument('out', help='output path of the converted model') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + # build the model from a config file and a checkpoint file + model = init_model(args.config, args.checkpoint) + # fuse conv and bn layers of the model + fused_model = fuse_module(model) + save_checkpoint(fused_model, args.out) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/misc/print_config.py b/GenAD-main/tools/misc/print_config.py new file mode 100644 index 0000000000000000000000000000000000000000..3100fc324b375330ba10316d71405c535d91fb7b --- /dev/null +++ b/GenAD-main/tools/misc/print_config.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from mmcv import Config, DictAction + + +def parse_args(): + parser = argparse.ArgumentParser(description='Print the whole config') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--options', nargs='+', action=DictAction, help='arguments in dict') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.options is not None: + cfg.merge_from_dict(args.options) + print(f'Config:\n{cfg.pretty_text}') + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/misc/visualize_results.py b/GenAD-main/tools/misc/visualize_results.py new file mode 100644 index 0000000000000000000000000000000000000000..302adc50eca960a6660104b33521d438cf54faa0 --- /dev/null +++ b/GenAD-main/tools/misc/visualize_results.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
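+#
+# [Editor's note] Added usage sketch, not part of the original tool. The results
+# file is the pickle produced by `tools/test.py ... --out results.pkl`; the
+# config and paths below are placeholders:
+#
+#     python tools/misc/visualize_results.py path/to/config.py \
+#         --result results.pkl --show-dir ./vis_results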
+import argparse +import mmcv +from mmcv import Config + +from mmdet3d.datasets import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet3D visualize the results') + parser.add_argument('config', help='test config file path') + parser.add_argument('--result', help='results file in pickle format') + parser.add_argument( + '--show-dir', help='directory where visualize results will be saved') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + if args.result is not None and \ + not args.result.endswith(('.pkl', '.pickle')): + raise ValueError('The results file must be a pkl file.') + + cfg = Config.fromfile(args.config) + cfg.data.test.test_mode = True + + # build the dataset + dataset = build_dataset(cfg.data.test) + results = mmcv.load(args.result) + + if getattr(dataset, 'show', None) is not None: + # data loading pipeline for showing + eval_pipeline = cfg.get('eval_pipeline', {}) + if eval_pipeline: + dataset.show(results, args.show_dir, pipeline=eval_pipeline) + else: + dataset.show(results, args.show_dir) # use default pipeline + else: + raise NotImplementedError( + 'Show is not implemented for dataset {}!'.format( + type(dataset).__name__)) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/model_converters/convert_votenet_checkpoints.py b/GenAD-main/tools/model_converters/convert_votenet_checkpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..33792b00ddd96790acdcdf6ba9d8caf9da39b637 --- /dev/null +++ b/GenAD-main/tools/model_converters/convert_votenet_checkpoints.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import tempfile +import torch +from mmcv import Config +from mmcv.runner import load_state_dict + +from mmdet3d.models import build_detector + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet3D upgrade model version(before v0.6.0) of VoteNet') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--out', help='path of the output checkpoint file') + args = parser.parse_args() + return args + + +def parse_config(config_strings): + """Parse config from strings. + + Args: + config_strings (string): strings of model config. 
+ + Returns: + Config: model config + """ + temp_file = tempfile.NamedTemporaryFile() + config_path = f'{temp_file.name}.py' + with open(config_path, 'w') as f: + f.write(config_strings) + + config = Config.fromfile(config_path) + + # Update backbone config + if 'pool_mod' in config.model.backbone: + config.model.backbone.pop('pool_mod') + + if 'sa_cfg' not in config.model.backbone: + config.model.backbone['sa_cfg'] = dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True) + + if 'type' not in config.model.bbox_head.vote_aggregation_cfg: + config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule' + + # Update bbox_head config + if 'pred_layer_cfg' not in config.model.bbox_head: + config.model.bbox_head['pred_layer_cfg'] = dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True) + + if 'feat_channels' in config.model.bbox_head: + config.model.bbox_head.pop('feat_channels') + + if 'vote_moudule_cfg' in config.model.bbox_head: + config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop( + 'vote_moudule_cfg') + + if config.model.bbox_head.vote_aggregation_cfg.use_xyz: + config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3 + + temp_file.close() + + return config + + +def main(): + """Convert keys in checkpoints for VoteNet. + + There can be some breaking changes during the development of mmdetection3d, + and this tool is used for upgrading checkpoints trained with old versions + (before v0.6.0) to the latest one. + """ + args = parse_args() + checkpoint = torch.load(args.checkpoint) + cfg = parse_config(checkpoint['meta']['config']) + # Build the model and load checkpoint + model = build_detector( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + orig_ckpt = checkpoint['state_dict'] + converted_ckpt = orig_ckpt.copy() + + if cfg['dataset_type'] == 'ScanNetDataset': + NUM_CLASSES = 18 + elif cfg['dataset_type'] == 'SUNRGBDDataset': + NUM_CLASSES = 10 + else: + raise NotImplementedError + + RENAME_PREFIX = { + 'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0', + 'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1' + } + + DEL_KEYS = [ + 'bbox_head.conv_pred.0.bn.num_batches_tracked', + 'bbox_head.conv_pred.1.bn.num_batches_tracked' + ] + + EXTRACT_KEYS = { + 'bbox_head.conv_pred.conv_cls.weight': + ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_cls.bias': + ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_reg.weight': + ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]), + 'bbox_head.conv_pred.conv_reg.bias': + ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)]) + } + + # Delete some useless keys + for key in DEL_KEYS: + converted_ckpt.pop(key) + + # Rename keys with specific prefix + RENAME_KEYS = dict() + for old_key in converted_ckpt.keys(): + for rename_prefix in RENAME_PREFIX.keys(): + if rename_prefix in old_key: + new_key = old_key.replace(rename_prefix, + RENAME_PREFIX[rename_prefix]) + RENAME_KEYS[new_key] = old_key + for new_key, old_key in RENAME_KEYS.items(): + converted_ckpt[new_key] = converted_ckpt.pop(old_key) + + # Extract weights and rename the keys + for new_key, (old_key, indices) in EXTRACT_KEYS.items(): + cur_layers = orig_ckpt[old_key] + converted_layers = [] + for (start, end) in indices: + if end != -1: + converted_layers.append(cur_layers[start:end]) + else: + converted_layers.append(cur_layers[start:]) + 
converted_layers = torch.cat(converted_layers, 0)
+        converted_ckpt[new_key] = converted_layers
+        if old_key in converted_ckpt.keys():
+            converted_ckpt.pop(old_key)
+
+    # Check the converted checkpoint by loading to the model
+    load_state_dict(model, converted_ckpt, strict=True)
+    checkpoint['state_dict'] = converted_ckpt
+    torch.save(checkpoint, args.out)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/GenAD-main/tools/model_converters/publish_model.py b/GenAD-main/tools/model_converters/publish_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..318fd46a65894575f5f3e915672b18d24ba133d8
--- /dev/null
+++ b/GenAD-main/tools/model_converters/publish_model.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import subprocess
+import torch
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Process a checkpoint to be published')
+    parser.add_argument('in_file', help='input checkpoint filename')
+    parser.add_argument('out_file', help='output checkpoint filename')
+    args = parser.parse_args()
+    return args
+
+
+def process_checkpoint(in_file, out_file):
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # remove optimizer for smaller file size
+    if 'optimizer' in checkpoint:
+        del checkpoint['optimizer']
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    torch.save(checkpoint, out_file)
+    sha = subprocess.check_output(['sha256sum', out_file]).decode()
+    # str.rstrip('.pth') strips characters, not the suffix, so slice it off instead
+    assert out_file.endswith('.pth'), 'out_file must end with .pth'
+    final_file = out_file[:-4] + '-{}.pth'.format(sha[:8])
+    subprocess.Popen(['mv', out_file, final_file])
+
+
+def main():
+    args = parse_args()
+    process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/GenAD-main/tools/model_converters/regnet2mmdet.py b/GenAD-main/tools/model_converters/regnet2mmdet.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dee3c878abc94c1298dcea6856e432a77339665
--- /dev/null
+++ b/GenAD-main/tools/model_converters/regnet2mmdet.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
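+#
+# [Editor's note] Added usage sketch, not part of the original script. Both
+# checkpoint paths are placeholders:
+#
+#     python tools/model_converters/regnet2mmdet.py regnet_pycls.pyth regnet_mmdet.pth
+#
+# The conversion only renames keys (stem -> conv1/bn1, head.fc -> fc, and the
+# s<N> stage blocks -> layer<N> modules); tensor values are copied unchanged.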
+import argparse +import torch +from collections import OrderedDict + + +def convert_stem(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('stem.conv', 'conv1') + new_key = new_key.replace('stem.bn', 'bn1') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_head(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('head.fc', 'fc') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_reslayer(model_key, model_weight, state_dict, converted_names): + split_keys = model_key.split('.') + layer, block, module = split_keys[:3] + block_id = int(block[1:]) + layer_name = f'layer{int(layer[1:])}' + block_name = f'{block_id - 1}' + + if block_id == 1 and module == 'bn': + new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}' + elif block_id == 1 and module == 'proj': + new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}' + elif module == 'f': + if split_keys[3] == 'a_bn': + module_name = 'bn1' + elif split_keys[3] == 'b_bn': + module_name = 'bn2' + elif split_keys[3] == 'c_bn': + module_name = 'bn3' + elif split_keys[3] == 'a': + module_name = 'conv1' + elif split_keys[3] == 'b': + module_name = 'conv2' + elif split_keys[3] == 'c': + module_name = 'conv3' + new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}' + else: + raise ValueError(f'Unsupported conversion of key {model_key}') + print(f'Convert {model_key} to {new_key}') + state_dict[new_key] = model_weight + converted_names.add(model_key) + + +def convert(src, dst): + """Convert keys in pycls pretrained RegNet models to mmdet style.""" + # load caffe model + regnet_model = torch.load(src) + blobs = regnet_model['model_state'] + # convert to pytorch style + state_dict = OrderedDict() + converted_names = set() + for key, weight in blobs.items(): + if 'stem' in key: + convert_stem(key, weight, state_dict, converted_names) + elif 'head' in key: + convert_head(key, weight, state_dict, converted_names) + elif key.startswith('s'): + convert_reslayer(key, weight, state_dict, converted_names) + + # check if all layers are converted + for key in blobs: + if key not in converted_names: + print(f'not converted: {key}') + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument('src', help='src detectron model path') + parser.add_argument('dst', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/GenAD-main/tools/test.py b/GenAD-main/tools/test.py new file mode 100644 index 0000000000000000000000000000000000000000..e7a92af0068459608b66e3d6e39bde6d5c9caf15 --- /dev/null +++ b/GenAD-main/tools/test.py @@ -0,0 +1,298 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import sys +sys.path.append('') +import numpy as np +import argparse +import mmcv +import os +import copy +import torch +torch.multiprocessing.set_sharing_strategy('file_system') +import warnings +from mmcv import Config, DictAction +from mmcv.cnn import fuse_conv_bn +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, + wrap_fp16_model) + +from mmdet3d.apis import single_gpu_test +from mmdet3d.datasets import build_dataset +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from mmdet3d.models import build_model +from mmdet.apis import set_random_seed +# from projects.mmdet3d_plugin.bevformer.apis.test import custom_multi_gpu_test +from projects.mmdet3d_plugin.VAD.apis.test import custom_multi_gpu_test +from mmdet.datasets import replace_ImageToTensor +import time +import os.path as osp +import json + +import warnings +warnings.filterwarnings("ignore") + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--json_dir', help='json parent dir name file') # NOTE: json file parent folder name + parser.add_argument('--out', help='output result file in pickle format') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--format-only', + action='store_true', + help='Format the output results without perform evaluation. It is' + 'useful when you want to format the result to a specific format and ' + 'submit it to the test server') + parser.add_argument( + '--eval', + type=str, + nargs='+', + help='evaluation metrics, which depends on the dataset, e.g., "bbox",' + ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument( + '--show-dir', help='directory where results will be saved') + parser.add_argument( + '--gpu-collect', + action='store_true', + help='whether to use gpu to collect results.') + parser.add_argument( + '--tmpdir', + help='tmp directory used for collecting results from multiple ' + 'workers, available when gpu-collect is not specified') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function (deprecate), ' + 'change to --eval-options instead.') + parser.add_argument( + '--eval-options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.eval_options: + raise ValueError( + '--options and --eval-options cannot be both specified, ' + '--options is deprecated in favor of --eval-options') + if args.options: + warnings.warn('--options is deprecated in favor of --eval-options') + args.eval_options = args.options + return args + + +def main(): + args = parse_args() + + assert args.out or args.eval or args.format_only or args.show \ + or args.show_dir, \ + ('Please specify at least one operation (save/eval/format/show the ' + 'results / save the results) with the argument "--out", "--eval"' + ', "--format-only", "--show" or "--show-dir"') + + if args.eval and args.format_only: + raise ValueError('--eval and --format_only cannot be both specified') + + if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): + raise ValueError('The output file must be a pkl file.') + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # import modules from plguin/xx, registry will be updated + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' 
+ m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + cfg.model.pretrained = None + # in case the test dataset is concatenated + samples_per_gpu = 1 + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor( + cfg.data.test.pipeline) + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + if samples_per_gpu > 1: + for ds_cfg in cfg.data.test: + ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # set random seeds + # args.seed = None + if args.seed is not None: + set_random_seed(args.seed, deterministic=args.deterministic) + + # build the dataloader + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + + + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + # old versions did not save class info in checkpoints, this walkaround is + # for backward compatibility + if 'CLASSES' in checkpoint.get('meta', {}): + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + # palette for visualization in segmentation tasks + if 'PALETTE' in checkpoint.get('meta', {}): + model.PALETTE = checkpoint['meta']['PALETTE'] + elif hasattr(dataset, 'PALETTE'): + # segmentation dataset has `PALETTE` attribute + model.PALETTE = dataset.PALETTE + + if not distributed: + # assert False + model = MMDataParallel(model, device_ids=[0]) + outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) + else: + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect) + + tmp = {} + tmp['bbox_results'] = outputs + outputs = tmp + rank, _ = get_dist_info() + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + # assert False + if isinstance(outputs, list): + mmcv.dump(outputs, args.out) + else: + mmcv.dump(outputs['bbox_results'], args.out) + kwargs = {} if args.eval_options is None else args.eval_options + kwargs['jsonfile_prefix'] = osp.join('test', args.config.split( + '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_')) + if args.format_only: + dataset.format_results(outputs['bbox_results'], **kwargs) + + if args.eval: + eval_kwargs = cfg.get('evaluation', {}).copy() + # hard-code way to remove EvalHook args + for key in [ + 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', + 'rule' + ]: + eval_kwargs.pop(key, 
None) + eval_kwargs.update(dict(metric=args.eval, **kwargs)) + + print(dataset.evaluate(outputs['bbox_results'], **eval_kwargs)) + + # # # NOTE: record to json + # json_path = args.json_dir + # if not os.path.exists(json_path): + # os.makedirs(json_path) + + # metric_all = [] + # for res in outputs['bbox_results']: + # for k in res['metric_results'].keys(): + # if type(res['metric_results'][k]) is np.ndarray: + # res['metric_results'][k] = res['metric_results'][k].tolist() + # metric_all.append(res['metric_results']) + + # print('start saving to json done') + # with open(json_path+'/metric_record.json', "w", encoding="utf-8") as f2: + # json.dump(metric_all, f2, indent=4) + # print('save to json done') + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/GenAD-main/tools/train.py b/GenAD-main/tools/train.py new file mode 100644 index 0000000000000000000000000000000000000000..cc4568bcd8559df6653c33e2954875a759bdd629 --- /dev/null +++ b/GenAD-main/tools/train.py @@ -0,0 +1,328 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from __future__ import division + +import argparse +import copy +import mmcv +import os +import time +import torch +import warnings +from mmcv import Config, DictAction +from mmcv.runner import get_dist_info, init_dist +from os import path as osp + +from mmdet import __version__ as mmdet_version +from mmdet3d import __version__ as mmdet3d_version +#from mmdet3d.apis import train_model + +from mmdet3d.datasets import build_dataset +from mmdet3d.models import build_model +from mmdet3d.utils import collect_env, get_root_logger +from mmdet.apis import set_random_seed +from mmseg import __version__ as mmseg_version + +from mmcv.utils import TORCH_VERSION, digit_version + +from torchstat import stat + +import cv2 +cv2.setNumThreads(1) + +import sys +sys.path.append('') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file (deprecate), ' + 'change to --cfg-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. 
If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument( + '--autoscale-lr', + action='store_true', + help='automatically scale lr with the number of gpus') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.cfg_options: + raise ValueError( + '--options and --cfg-options cannot be both specified, ' + '--options is deprecated in favor of --cfg-options') + if args.options: + warnings.warn('--options is deprecated in favor of --cfg-options') + args.cfg_options = args.options + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # import modules from plguin/xx, registry will be updated + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + # from projects.mmdet3d_plugin.bevformer.apis import custom_train_model + from projects.mmdet3d_plugin.VAD.apis.train import custom_train_model + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + # if args.resume_from is not None: + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW': + cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. 
+    if args.launcher == 'none':
+        distributed = False
+    else:
+        distributed = True
+        init_dist(args.launcher, **cfg.dist_params)
+        # re-set gpu_ids with distributed training mode
+        _, world_size = get_dist_info()
+        cfg.gpu_ids = range(world_size)
+
+    # create work_dir
+    mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
+    # dump config
+    cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
+    # init the logger before other steps
+    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+    log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
+    # specify logger name, if we still use 'mmdet', the output info will be
+    # filtered and won't be saved in the log_file
+    # TODO: ugly workaround to judge whether we are training det or seg model
+    if cfg.model.type in ['EncoderDecoder3D']:
+        logger_name = 'mmseg'
+    else:
+        logger_name = 'mmdet'
+    logger = get_root_logger(
+        log_file=log_file, log_level=cfg.log_level, name=logger_name)
+
+    # init the meta dict to record some important information such as
+    # environment info and seed, which will be logged
+    meta = dict()
+    # log env info
+    env_info_dict = collect_env()
+    env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+    dash_line = '-' * 60 + '\n'
+    logger.info('Environment info:\n' + dash_line + env_info + '\n' +
+                dash_line)
+    meta['env_info'] = env_info
+    meta['config'] = cfg.pretty_text
+
+    # log some basic info
+    logger.info(f'Distributed training: {distributed}')
+    logger.info(f'Config:\n{cfg.pretty_text}')
+
+    # set random seeds
+    if args.seed is not None:
+        logger.info(f'Set random seed to {args.seed}, '
+                    f'deterministic: {args.deterministic}')
+        set_random_seed(args.seed, deterministic=args.deterministic)
+    cfg.seed = args.seed
+    meta['seed'] = args.seed
+    meta['exp_name'] = osp.basename(args.config)
+
+    model = build_model(
+        cfg.model,
+        train_cfg=cfg.get('train_cfg'),
+        test_cfg=cfg.get('test_cfg'))
+    model.init_weights()
+
+    # model.load_state_dict(torch.load('/home/ubuntu/phd/unity/vad/models/cvpr/vad_tiny_pl_vae_gru/epoch_60.pth'))
+
+    # count the total number of model parameters
+    total_params = sum(p.numel() for p in model.parameters())
+    print(f"Total number of parameters: {total_params}")
+
+    # stat(model,(3,384,640))
+    # print("params")
+
+
+
+    # saved_state_dict = torch.load('/home/ubuntu/phd/unity/vad/VAD/ckpts/epoch_9.pth')
+    #
+    # # training_layer = ['pts_bbox_head.predict_model',
+    # #                   'pts_bbox_head.future_prediction',
+    # #                   'pts_bbox_head.present_distribution',
+    # #                   'pts_bbox_head.future_distribution']
+    # training_layer = ['pts_bbox_head.ego_fut_decoder',
+    #                   'pts_bbox_head.traj_branches',
+    #                   'pts_bbox_head.predict_model',
+    #                   'pts_bbox_head.future_prediction',
+    #                   'pts_bbox_head.present_distribution',
+    #                   'pts_bbox_head.future_distribution']
+    # specific_layers = {k: v for k, v in saved_state_dict['state_dict'].items() if training_layer[0] in k or
+    #                    training_layer[1] in k or
+    #                    training_layer[2] in k or
+    #                    training_layer[3] in k or
+    #                    training_layer[4] in k or
+    #                    training_layer[5] in k}
+    #
+    # model.load_state_dict(specific_layers, strict=False)
+    #
+    # for name, param in model.named_parameters():
+    #     for i in range(len(training_layer)):
+    #         if training_layer[i] not in name:
+    #             param.requires_grad = True
+    #         else:
+    #             print(name)
+    #             param.requires_grad = False
+
+    # training_layer = ['pts_bbox_head.ego_fut_decoder',
+    #                   'pts_bbox_head.traj_branches',
+    #                   'pts_bbox_head.predict_model',
+    #                   'pts_bbox_head.future_prediction',
+    #                   'pts_bbox_head.present_distribution',
+    #                   'pts_bbox_head.state_gru',
+    #
'pts_bbox_head.future_distribution'] + # + # for name, param in model.named_parameters(): + # for i in range(len(training_layer)): + # if training_layer[i] not in name: + # param.requires_grad = False + # else: + # print(name) + # param.requires_grad = True + + + + + + logger.info(f'Model:\n{model}') + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + # in case we use a dataset wrapper + if 'dataset' in cfg.data.train: + val_dataset.pipeline = cfg.data.train.dataset.pipeline + else: + val_dataset.pipeline = cfg.data.train.pipeline + # set test_mode=False here in deep copied config + # which do not affect AP/AR calculation later + # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa + val_dataset.test_mode = False + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=mmdet_version, + mmseg_version=mmseg_version, + mmdet3d_version=mmdet3d_version, + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE # for segmentors + if hasattr(datasets[0], 'PALETTE') else None) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + custom_train_model( + model, + datasets, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main()
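+
+# [Editor's note] Added launch sketch, not part of the original script. The
+# config path, GPU count and work dir are illustrative; the flags match the
+# parser above and tools/dist_train.sh in this repo:
+#
+#     python tools/train.py projects/configs/VAD/VAD_tiny_e2e.py \
+#         --work-dir ./outputs/genad_example --deterministic
+#
+#     ./tools/dist_train.sh projects/configs/VAD/VAD_tiny_e2e.py 8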