hocherie committed
Commit e150a4c · 1 Parent(s): a293d60

upload files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. Dockerfile +35 -0
  2. LICENSE +433 -0
  3. app.py +147 -0
  4. config.yaml +36 -0
  5. get_weights.sh +9 -0
  6. mapper/__init__.py +30 -0
  7. mapper/callbacks.py +105 -0
  8. mapper/conf/data/kitti.yaml +40 -0
  9. mapper/conf/data/mia.yaml +44 -0
  10. mapper/conf/data/nuscenes.yaml +38 -0
  11. mapper/conf/mapper_kitti.yaml +23 -0
  12. mapper/conf/mapper_nuscenes.yaml +26 -0
  13. mapper/conf/model/image_encoder/dino.yaml +5 -0
  14. mapper/conf/model/image_encoder/resnet.yaml +12 -0
  15. mapper/conf/model/mapper.yaml +15 -0
  16. mapper/conf/pretrain.yaml +24 -0
  17. mapper/conf/pretrain_resnet.yaml +26 -0
  18. mapper/conf/training.yaml +30 -0
  19. mapper/data/__init__.py +7 -0
  20. mapper/data/base.py +19 -0
  21. mapper/data/image.py +140 -0
  22. mapper/data/kitti/data_module.py +32 -0
  23. mapper/data/kitti/dataset.py +317 -0
  24. mapper/data/kitti/transform.py +149 -0
  25. mapper/data/mapillary/data_module.py +317 -0
  26. mapper/data/mapillary/dataset.py +255 -0
  27. mapper/data/module.py +64 -0
  28. mapper/data/nuscenes/data_module.py +33 -0
  29. mapper/data/nuscenes/dataset.py +207 -0
  30. mapper/data/nuscenes/splits_roddick.py +197 -0
  31. mapper/data/nuscenes/utils.py +214 -0
  32. mapper/data/schema.py +75 -0
  33. mapper/data/sequential.py +45 -0
  34. mapper/data/torch.py +102 -0
  35. mapper/data/utils.py +21 -0
  36. mapper/mapper.py +112 -0
  37. mapper/models/__init__.py +28 -0
  38. mapper/models/base.py +59 -0
  39. mapper/models/bev_projection.py +95 -0
  40. mapper/models/dinov2/__init__.py +6 -0
  41. mapper/models/dinov2/configs/__init__.py +22 -0
  42. mapper/models/dinov2/configs/eval/vitb14_pretrain.yaml +6 -0
  43. mapper/models/dinov2/configs/eval/vitg14_pretrain.yaml +7 -0
  44. mapper/models/dinov2/configs/eval/vitl14_pretrain.yaml +6 -0
  45. mapper/models/dinov2/configs/eval/vits14_pretrain.yaml +6 -0
  46. mapper/models/dinov2/configs/eval/vits14_reg4_pretrain.yaml +9 -0
  47. mapper/models/dinov2/configs/ssl_default_config.yaml +118 -0
  48. mapper/models/dinov2/configs/train/vitg14.yaml +26 -0
  49. mapper/models/dinov2/configs/train/vitl14.yaml +26 -0
  50. mapper/models/dinov2/configs/train/vitl16_short.yaml +6 -0
Dockerfile ADDED
@@ -0,0 +1,35 @@
+ FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
+
+ # Install dependencies
+ RUN apt-get update && apt-get install -y \
+     git \
+     wget \
+     unzip \
+     vim \
+     ffmpeg \
+     libsm6 \
+     libxext6
+
+ RUN apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR $HOME/mapper
+
+ RUN pip install --no-cache-dir gradio[oauth]==4.44.0 "uvicorn>=0.14.0" spaces
+
+ COPY --chown=user . $HOME/mapper
+
+ # Install Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+ RUN pip install fastapi==0.115.0
+ # Get Weights
+ RUN bash get_weights.sh
+
+ # Start the app
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,433 @@
1
+ Attribution-ShareAlike 4.0 International License
2
+
3
+ Copyright (c) 2024,
4
+ Cherie Ho* · Jiaye (Tony) Zou* · Omar Alama*
5
+ Sai Mitheran Jagadesh Kumar · Benjamin Chiang · Taneesh Gupta · Chen Wang
6
+ Nikhil Keetha · Katia Sycara · Sebastian Scherer
7
+ Carnegie Mellon University
8
+
9
+ =======================================================================
10
+
11
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
12
+ does not provide legal services or legal advice. Distribution of
13
+ Creative Commons public licenses does not create a lawyer-client or
14
+ other relationship. Creative Commons makes its licenses and related
15
+ information available on an "as-is" basis. Creative Commons gives no
16
+ warranties regarding its licenses, any material licensed under their
17
+ terms and conditions, or any related information. Creative Commons
18
+ disclaims all liability for damages resulting from their use to the
19
+ fullest extent possible.
20
+
21
+ Using Creative Commons Public Licenses
22
+
23
+ Creative Commons public licenses provide a standard set of terms and
24
+ conditions that creators and other rights holders may use to share
25
+ original works of authorship and other material subject to copyright
26
+ and certain other rights specified in the public license below. The
27
+ following considerations are for informational purposes only, are not
28
+ exhaustive, and do not form part of our licenses.
29
+
30
+ Considerations for licensors: Our public licenses are
31
+ intended for use by those authorized to give the public
32
+ permission to use material in ways otherwise restricted by
33
+ copyright and certain other rights. Our licenses are
34
+ irrevocable. Licensors should read and understand the terms
35
+ and conditions of the license they choose before applying it.
36
+ Licensors should also secure all rights necessary before
37
+ applying our licenses so that the public can reuse the
38
+ material as expected. Licensors should clearly mark any
39
+ material not subject to the license. This includes other CC-
40
+ licensed material, or material used under an exception or
41
+ limitation to copyright. More considerations for licensors:
42
+ wiki.creativecommons.org/Considerations_for_licensors
43
+
44
+ Considerations for the public: By using one of our public
45
+ licenses, a licensor grants the public permission to use the
46
+ licensed material under specified terms and conditions. If
47
+ the licensor's permission is not necessary for any reason--for
48
+ example, because of any applicable exception or limitation to
49
+ copyright--then that use is not regulated by the license. Our
50
+ licenses grant only permissions under copyright and certain
51
+ other rights that a licensor has authority to grant. Use of
52
+ the licensed material may still be restricted for other
53
+ reasons, including because others have copyright or other
54
+ rights in the material. A licensor may make special requests,
55
+ such as asking that all changes be marked or described.
56
+ Although not required by our licenses, you are encouraged to
57
+ respect those requests where reasonable. More_considerations
58
+ for the public:
59
+ wiki.creativecommons.org/Considerations_for_licensees
60
+
61
+ =======================================================================
62
+
63
+ Creative Commons Attribution-ShareAlike 4.0 International Public
64
+ License
65
+
66
+ By exercising the Licensed Rights (defined below), You accept and agree
67
+ to be bound by the terms and conditions of this Creative Commons
68
+ Attribution-ShareAlike 4.0 International Public License ("Public
69
+ License"). To the extent this Public License may be interpreted as a
70
+ contract, You are granted the Licensed Rights in consideration of Your
71
+ acceptance of these terms and conditions, and the Licensor grants You
72
+ such rights in consideration of benefits the Licensor receives from
73
+ making the Licensed Material available under these terms and
74
+ conditions.
75
+
76
+
77
+ Section 1 -- Definitions.
78
+
79
+ a. Adapted Material means material subject to Copyright and Similar
80
+ Rights that is derived from or based upon the Licensed Material
81
+ and in which the Licensed Material is translated, altered,
82
+ arranged, transformed, or otherwise modified in a manner requiring
83
+ permission under the Copyright and Similar Rights held by the
84
+ Licensor. For purposes of this Public License, where the Licensed
85
+ Material is a musical work, performance, or sound recording,
86
+ Adapted Material is always produced where the Licensed Material is
87
+ synched in timed relation with a moving image.
88
+
89
+ b. Adapter's License means the license You apply to Your Copyright
90
+ and Similar Rights in Your contributions to Adapted Material in
91
+ accordance with the terms and conditions of this Public License.
92
+
93
+ c. BY-SA Compatible License means a license listed at
94
+ creativecommons.org/compatiblelicenses, approved by Creative
95
+ Commons as essentially the equivalent of this Public License.
96
+
97
+ d. Copyright and Similar Rights means copyright and/or similar rights
98
+ closely related to copyright including, without limitation,
99
+ performance, broadcast, sound recording, and Sui Generis Database
100
+ Rights, without regard to how the rights are labeled or
101
+ categorized. For purposes of this Public License, the rights
102
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
103
+ Rights.
104
+
105
+ e. Effective Technological Measures means those measures that, in the
106
+ absence of proper authority, may not be circumvented under laws
107
+ fulfilling obligations under Article 11 of the WIPO Copyright
108
+ Treaty adopted on December 20, 1996, and/or similar international
109
+ agreements.
110
+
111
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
112
+ any other exception or limitation to Copyright and Similar Rights
113
+ that applies to Your use of the Licensed Material.
114
+
115
+ g. License Elements means the license attributes listed in the name
116
+ of a Creative Commons Public License. The License Elements of this
117
+ Public License are Attribution and ShareAlike.
118
+
119
+ h. Licensed Material means the artistic or literary work, database,
120
+ or other material to which the Licensor applied this Public
121
+ License.
122
+
123
+ i. Licensed Rights means the rights granted to You subject to the
124
+ terms and conditions of this Public License, which are limited to
125
+ all Copyright and Similar Rights that apply to Your use of the
126
+ Licensed Material and that the Licensor has authority to license.
127
+
128
+ j. Licensor means the individual(s) or entity(ies) granting rights
129
+ under this Public License.
130
+
131
+ k. Share means to provide material to the public by any means or
132
+ process that requires permission under the Licensed Rights, such
133
+ as reproduction, public display, public performance, distribution,
134
+ dissemination, communication, or importation, and to make material
135
+ available to the public including in ways that members of the
136
+ public may access the material from a place and at a time
137
+ individually chosen by them.
138
+
139
+ l. Sui Generis Database Rights means rights other than copyright
140
+ resulting from Directive 96/9/EC of the European Parliament and of
141
+ the Council of 11 March 1996 on the legal protection of databases,
142
+ as amended and/or succeeded, as well as other essentially
143
+ equivalent rights anywhere in the world.
144
+
145
+ m. You means the individual or entity exercising the Licensed Rights
146
+ under this Public License. Your has a corresponding meaning.
147
+
148
+
149
+ Section 2 -- Scope.
150
+
151
+ a. License grant.
152
+
153
+ 1. Subject to the terms and conditions of this Public License,
154
+ the Licensor hereby grants You a worldwide, royalty-free,
155
+ non-sublicensable, non-exclusive, irrevocable license to
156
+ exercise the Licensed Rights in the Licensed Material to:
157
+
158
+ a. reproduce and Share the Licensed Material, in whole or
159
+ in part; and
160
+
161
+ b. produce, reproduce, and Share Adapted Material.
162
+
163
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
164
+ Exceptions and Limitations apply to Your use, this Public
165
+ License does not apply, and You do not need to comply with
166
+ its terms and conditions.
167
+
168
+ 3. Term. The term of this Public License is specified in Section
169
+ 6(a).
170
+
171
+ 4. Media and formats; technical modifications allowed. The
172
+ Licensor authorizes You to exercise the Licensed Rights in
173
+ all media and formats whether now known or hereafter created,
174
+ and to make technical modifications necessary to do so. The
175
+ Licensor waives and/or agrees not to assert any right or
176
+ authority to forbid You from making technical modifications
177
+ necessary to exercise the Licensed Rights, including
178
+ technical modifications necessary to circumvent Effective
179
+ Technological Measures. For purposes of this Public License,
180
+ simply making modifications authorized by this Section 2(a)
181
+ (4) never produces Adapted Material.
182
+
183
+ 5. Downstream recipients.
184
+
185
+ a. Offer from the Licensor -- Licensed Material. Every
186
+ recipient of the Licensed Material automatically
187
+ receives an offer from the Licensor to exercise the
188
+ Licensed Rights under the terms and conditions of this
189
+ Public License.
190
+
191
+ b. Additional offer from the Licensor -- Adapted Material.
192
+ Every recipient of Adapted Material from You
193
+ automatically receives an offer from the Licensor to
194
+ exercise the Licensed Rights in the Adapted Material
195
+ under the conditions of the Adapter's License You apply.
196
+
197
+ c. No downstream restrictions. You may not offer or impose
198
+ any additional or different terms or conditions on, or
199
+ apply any Effective Technological Measures to, the
200
+ Licensed Material if doing so restricts exercise of the
201
+ Licensed Rights by any recipient of the Licensed
202
+ Material.
203
+
204
+ 6. No endorsement. Nothing in this Public License constitutes or
205
+ may be construed as permission to assert or imply that You
206
+ are, or that Your use of the Licensed Material is, connected
207
+ with, or sponsored, endorsed, or granted official status by,
208
+ the Licensor or others designated to receive attribution as
209
+ provided in Section 3(a)(1)(A)(i).
210
+
211
+ b. Other rights.
212
+
213
+ 1. Moral rights, such as the right of integrity, are not
214
+ licensed under this Public License, nor are publicity,
215
+ privacy, and/or other similar personality rights; however, to
216
+ the extent possible, the Licensor waives and/or agrees not to
217
+ assert any such rights held by the Licensor to the limited
218
+ extent necessary to allow You to exercise the Licensed
219
+ Rights, but not otherwise.
220
+
221
+ 2. Patent and trademark rights are not licensed under this
222
+ Public License.
223
+
224
+ 3. To the extent possible, the Licensor waives any right to
225
+ collect royalties from You for the exercise of the Licensed
226
+ Rights, whether directly or through a collecting society
227
+ under any voluntary or waivable statutory or compulsory
228
+ licensing scheme. In all other cases the Licensor expressly
229
+ reserves any right to collect such royalties.
230
+
231
+
232
+ Section 3 -- License Conditions.
233
+
234
+ Your exercise of the Licensed Rights is expressly made subject to the
235
+ following conditions.
236
+
237
+ a. Attribution.
238
+
239
+ 1. If You Share the Licensed Material (including in modified
240
+ form), You must:
241
+
242
+ a. retain the following if it is supplied by the Licensor
243
+ with the Licensed Material:
244
+
245
+ i. identification of the creator(s) of the Licensed
246
+ Material and any others designated to receive
247
+ attribution, in any reasonable manner requested by
248
+ the Licensor (including by pseudonym if
249
+ designated);
250
+
251
+ ii. a copyright notice;
252
+
253
+ iii. a notice that refers to this Public License;
254
+
255
+ iv. a notice that refers to the disclaimer of
256
+ warranties;
257
+
258
+ v. a URI or hyperlink to the Licensed Material to the
259
+ extent reasonably practicable;
260
+
261
+ b. indicate if You modified the Licensed Material and
262
+ retain an indication of any previous modifications; and
263
+
264
+ c. indicate the Licensed Material is licensed under this
265
+ Public License, and include the text of, or the URI or
266
+ hyperlink to, this Public License.
267
+
268
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
269
+ reasonable manner based on the medium, means, and context in
270
+ which You Share the Licensed Material. For example, it may be
271
+ reasonable to satisfy the conditions by providing a URI or
272
+ hyperlink to a resource that includes the required
273
+ information.
274
+
275
+ 3. If requested by the Licensor, You must remove any of the
276
+ information required by Section 3(a)(1)(A) to the extent
277
+ reasonably practicable.
278
+
279
+ b. ShareAlike.
280
+
281
+ In addition to the conditions in Section 3(a), if You Share
282
+ Adapted Material You produce, the following conditions also apply.
283
+
284
+ 1. The Adapter's License You apply must be a Creative Commons
285
+ license with the same License Elements, this version or
286
+ later, or a BY-SA Compatible License.
287
+
288
+ 2. You must include the text of, or the URI or hyperlink to, the
289
+ Adapter's License You apply. You may satisfy this condition
290
+ in any reasonable manner based on the medium, means, and
291
+ context in which You Share Adapted Material.
292
+
293
+ 3. You may not offer or impose any additional or different terms
294
+ or conditions on, or apply any Effective Technological
295
+ Measures to, Adapted Material that restrict exercise of the
296
+ rights granted under the Adapter's License You apply.
297
+
298
+
299
+ Section 4 -- Sui Generis Database Rights.
300
+
301
+ Where the Licensed Rights include Sui Generis Database Rights that
302
+ apply to Your use of the Licensed Material:
303
+
304
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
305
+ to extract, reuse, reproduce, and Share all or a substantial
306
+ portion of the contents of the database;
307
+
308
+ b. if You include all or a substantial portion of the database
309
+ contents in a database in which You have Sui Generis Database
310
+ Rights, then the database in which You have Sui Generis Database
311
+ Rights (but not its individual contents) is Adapted Material,
312
+
313
+ including for purposes of Section 3(b); and
314
+ c. You must comply with the conditions in Section 3(a) if You Share
315
+ all or a substantial portion of the contents of the database.
316
+
317
+ For the avoidance of doubt, this Section 4 supplements and does not
318
+ replace Your obligations under this Public License where the Licensed
319
+ Rights include other Copyright and Similar Rights.
320
+
321
+
322
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
323
+
324
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
325
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
326
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
327
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
328
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
329
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
330
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
331
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
332
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
333
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
334
+
335
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
336
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
337
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
338
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
339
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
340
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
341
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
342
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
343
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
344
+
345
+ c. The disclaimer of warranties and limitation of liability provided
346
+ above shall be interpreted in a manner that, to the extent
347
+ possible, most closely approximates an absolute disclaimer and
348
+ waiver of all liability.
349
+
350
+
351
+ Section 6 -- Term and Termination.
352
+
353
+ a. This Public License applies for the term of the Copyright and
354
+ Similar Rights licensed here. However, if You fail to comply with
355
+ this Public License, then Your rights under this Public License
356
+ terminate automatically.
357
+
358
+ b. Where Your right to use the Licensed Material has terminated under
359
+ Section 6(a), it reinstates:
360
+
361
+ 1. automatically as of the date the violation is cured, provided
362
+ it is cured within 30 days of Your discovery of the
363
+ violation; or
364
+
365
+ 2. upon express reinstatement by the Licensor.
366
+
367
+ For the avoidance of doubt, this Section 6(b) does not affect any
368
+ right the Licensor may have to seek remedies for Your violations
369
+ of this Public License.
370
+
371
+ c. For the avoidance of doubt, the Licensor may also offer the
372
+ Licensed Material under separate terms or conditions or stop
373
+ distributing the Licensed Material at any time; however, doing so
374
+ will not terminate this Public License.
375
+
376
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
377
+ License.
378
+
379
+
380
+ Section 7 -- Other Terms and Conditions.
381
+
382
+ a. The Licensor shall not be bound by any additional or different
383
+ terms or conditions communicated by You unless expressly agreed.
384
+
385
+ b. Any arrangements, understandings, or agreements regarding the
386
+ Licensed Material not stated herein are separate from and
387
+ independent of the terms and conditions of this Public License.
388
+
389
+
390
+ Section 8 -- Interpretation.
391
+
392
+ a. For the avoidance of doubt, this Public License does not, and
393
+ shall not be interpreted to, reduce, limit, restrict, or impose
394
+ conditions on any use of the Licensed Material that could lawfully
395
+ be made without permission under this Public License.
396
+
397
+ b. To the extent possible, if any provision of this Public License is
398
+ deemed unenforceable, it shall be automatically reformed to the
399
+ minimum extent necessary to make it enforceable. If the provision
400
+ cannot be reformed, it shall be severed from this Public License
401
+ without affecting the enforceability of the remaining terms and
402
+ conditions.
403
+
404
+ c. No term or condition of this Public License will be waived and no
405
+ failure to comply consented to unless expressly agreed to by the
406
+ Licensor.
407
+
408
+ d. Nothing in this Public License constitutes or may be interpreted
409
+ as a limitation upon, or waiver of, any privileges and immunities
410
+ that apply to the Licensor or You, including from the legal
411
+ processes of any jurisdiction or authority.
412
+
413
+
414
+ =======================================================================
415
+
416
+ Creative Commons is not a party to its public
417
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
418
+ its public licenses to material it publishes and in those instances
419
+ will be considered the “Licensor.” The text of the Creative Commons
420
+ public licenses is dedicated to the public domain under the CC0 Public
421
+ Domain Dedication. Except for the limited purpose of indicating that
422
+ material is shared under a Creative Commons public license or as
423
+ otherwise permitted by the Creative Commons policies published at
424
+ creativecommons.org/policies, Creative Commons does not authorize the
425
+ use of the trademark "Creative Commons" or any other trademark or logo
426
+ of Creative Commons without its prior written consent including,
427
+ without limitation, in connection with any unauthorized modifications
428
+ to any of its public licenses or any other arrangements,
429
+ understandings, or agreements concerning use of licensed material. For
430
+ the avoidance of doubt, this paragraph does not form part of the
431
+ public licenses.
432
+
433
+ Creative Commons may be contacted at creativecommons.org.
app.py ADDED
@@ -0,0 +1,147 @@
1
+ import gradio as gr
2
+ from matplotlib import pyplot as plt
3
+ from mapper.utils.io import read_image
4
+ from mapper.utils.exif import EXIF
5
+ from mapper.utils.wrappers import Camera
6
+ from mapper.data.image import rectify_image, pad_image, resize_image
7
+ from mapper.utils.viz_2d import one_hot_argmax_to_rgb, plot_images
8
+ from mapper.module import GenericModule
9
+ from perspective2d import PerspectiveFields
10
+ import torch
11
+ import numpy as np
12
+ from typing import Optional, Tuple
13
+ from omegaconf import OmegaConf
14
+
15
+ description = """
16
+ <h1 align="center">
17
+ <ins>MapItAnywhere (MIA) </ins>
18
+ <br>
19
+ Empowering Bird’s Eye View Mapping using Large-scale Public Data
20
+ <br>
21
+ <h3 align="center">
22
+ <a href="https://mapitanywhere.github.io" target="_blank">Project Page</a> |
23
+ <a href="https://arxiv.org/abs/2109.08203" target="_blank">Paper</a> |
24
+ <a href="https://github.com/MapItAnywhere/MapItAnywhere" target="_blank">Code</a>
25
+ </h3>
26
+ <p align="center">
27
+ Mapper generates bird's-eye-view maps from in-the-wild monocular first-person-view images. You can try our demo by uploading your own images or using the examples provided. Tip: you can also try images from across the world using <a href="https://www.mapillary.com/app" target="_blank">Mapillary</a> &#128521; Also try some of the examples taken in cities we have not trained on!
28
+ </p>
29
+ """
30
+
31
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
32
+
33
+ cfg = OmegaConf.load("config.yaml")
34
+
35
+ class ImageCalibrator(PerspectiveFields):
36
+ def __init__(self, version: str = "Paramnet-360Cities-edina-centered"):
37
+ super().__init__(version)
38
+ self.eval()
39
+
40
+ def run(
41
+ self,
42
+ image_rgb: np.ndarray,
43
+ focal_length: Optional[float] = None,
44
+ exif: Optional[EXIF] = None,
45
+ ) -> Tuple[Tuple[float, float], Camera]:
46
+ h, w, *_ = image_rgb.shape
47
+ if focal_length is None and exif is not None:
48
+ _, focal_ratio = exif.extract_focal()
49
+ if focal_ratio != 0:
50
+ focal_length = focal_ratio * max(h, w)
51
+ calib = self.inference(img_bgr=image_rgb[..., ::-1])
52
+ roll_pitch = (calib["pred_roll"].item(), calib["pred_pitch"].item())
53
+ if focal_length is None:
54
+ vfov = calib["pred_vfov"].item()
55
+ focal_length = h / 2 / np.tan(np.deg2rad(vfov) / 2)
56
+
57
+ camera = Camera.from_dict(
58
+ {
59
+ "model": "SIMPLE_PINHOLE",
60
+ "width": w,
61
+ "height": h,
62
+ "params": [focal_length, w / 2 + 0.5, h / 2 + 0.5],
63
+ }
64
+ )
65
+ return roll_pitch, camera
66
+
67
+ def preprocess_pipeline(image, roll_pitch, camera):
68
+ image = torch.from_numpy(image).float() / 255
69
+ image = image.permute(2, 0, 1).to(device)
70
+ camera = camera.to(device)
71
+
72
+ image, valid = rectify_image(image, camera.float(), -roll_pitch[0], -roll_pitch[1])
73
+
74
+ roll_pitch *= 0
75
+
76
+ image, _, camera, valid = resize_image(
77
+ image=image,
78
+ size=512,
79
+ camera=camera,
80
+ fn=max,
81
+ valid=valid
82
+ )
83
+
84
+ # image, valid, camera = pad_image(
85
+ # image, 512, camera, valid
86
+ # )
87
+
88
+ camera = torch.stack([camera])
89
+
90
+ return {
91
+ "image": image.unsqueeze(0).to(device),
92
+ "valid": valid.unsqueeze(0).to(device),
93
+ "camera": camera.float().to(device),
94
+ }
95
+
96
+
97
+ calibrator = ImageCalibrator().to(device)
98
+ model = GenericModule(cfg)
99
+ model = model.load_from_checkpoint("trained_weights/mapper-excl-ood.ckpt", strict=False, cfg=cfg)
100
+ model = model.to(device)
101
+ model = model.eval()
102
+
103
+ def run(input_img):
104
+ image_path = input_img.name
105
+
106
+ image = read_image(image_path)
107
+ with open(image_path, "rb") as fid:
108
+ exif = EXIF(fid, lambda: image.shape[:2])
109
+
110
+ gravity, camera = calibrator.run(image, exif=exif)
111
+
112
+ data = preprocess_pipeline(image, gravity, camera)
113
+ res = model(data)
114
+
115
+ prediction = res['output']
116
+ rgb_prediction = one_hot_argmax_to_rgb(prediction, 6).squeeze(0).permute(1, 2, 0).cpu().long().numpy()
117
+ valid = res['valid_bev'].squeeze(0)[..., :-1]
118
+ rgb_prediction[~valid.cpu().numpy()] = 255
119
+
120
+ # TODO: add legend here
121
+
122
+ plot_images([image, rgb_prediction], titles=["Input Image", "Top-Down Prediction"], pad=2, adaptive=True)
123
+
124
+ return plt.gcf()
125
+
126
+
127
+ examples = [
128
+ ["examples/left_crossing.jpg"],
129
+ ["examples/crossing.jpg"],
130
+ ["examples/two_roads.jpg"],
131
+ ["examples/japan_narrow_road.jpeg"],
132
+ ["examples/zurich_crossing.jpg"],
133
+ ["examples/night_road.jpg"],
134
+ ["examples/night_crossing.jpg"],
135
+ ]
136
+
137
+ demo = gr.Interface(
138
+ fn=run,
139
+ inputs=[
140
+ gr.File(file_types=["image"], label="Input Image")
141
+ ],
142
+ outputs=[
143
+ gr.Plot(label="Prediction", format="png"),
144
+ ],
145
+ description=description,
146
+ examples=examples)
147
+ demo.launch(share=True, server_name="0.0.0.0")
config.yaml ADDED
@@ -0,0 +1,36 @@
+ model:
+ image_encoder:
+ backbone:
+ pretrained: true
+ frozen: true
+ output_dim: 128
+ name: feature_extractor_DPT
+ segmentation_head:
+ dropout_rate: 0.2
+ name: map_perception_net
+ num_classes: 6
+ latent_dim: 128
+ z_max: 50
+ x_max: 25
+ pixel_per_meter: 2
+ num_scale_bins: 32
+ loss:
+ num_classes: 6
+ xent_weight: 1.0
+ dice_weight: 1.0
+ focal_loss: false
+ focal_loss_gamma: 2.0
+ requires_frustrum: true
+ requires_flood_mask: false
+ class_weights:
+ - 1.00351229
+ - 4.34782609
+ - 1.00110121
+ - 1.03124678
+ - 6.69792364
+ - 7.55857899
+ label_smoothing: 0.1
+ scale_range:
+ - 0
+ - 9
+ z_min: null
get_weights.sh ADDED
@@ -0,0 +1,9 @@
+ #!/bin/bash
+
+ # URL of the file to download
+ ood_weights="https://huggingface.co/mapitanywhere/mapper/resolve/main/weights/mapper-excl-ood/model.ckpt"
+
+ mkdir -p trained_weights
+
+ # Download the file using wget
+ wget "$ood_weights" -O trained_weights/mapper-excl-ood.ckpt
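
For reference, the same checkpoint can also be fetched with the huggingface_hub client. This is a hypothetical alternative, not part of the commit; the repo id and filename are inferred from the URL above.

    from huggingface_hub import hf_hub_download

    # Downloads into the local HF cache and returns the path; copy it to
    # trained_weights/mapper-excl-ood.ckpt if app.py should find it there.
    ckpt_path = hf_hub_download(
        repo_id="mapitanywhere/mapper",
        filename="weights/mapper-excl-ood/model.ckpt",
    )
    print(ckpt_path)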
mapper/__init__.py ADDED
@@ -0,0 +1,30 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ import os, sys
+
+ sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+ from pathlib import Path
+ import logging
+
+ import pytorch_lightning  # noqa: F401
+
+
+ formatter = logging.Formatter(
+     fmt="[%(asctime)s %(name)s %(levelname)s] %(message)s",
+     datefmt="%Y-%m-%d %H:%M:%S",
+ )
+ handler = logging.StreamHandler()
+ handler.setFormatter(formatter)
+ handler.setLevel(logging.INFO)
+
+ logger = logging.getLogger("mapper")
+ logger.setLevel(logging.INFO)
+ logger.addHandler(handler)
+ logger.propagate = False
+
+ pl_logger = logging.getLogger("pytorch_lightning")
+ if len(pl_logger.handlers):
+     pl_logger.handlers[0].setFormatter(formatter)
+
+ repo_dir = Path(__file__).parent.parent
+ EXPERIMENTS_PATH = repo_dir / "experiments/"
+ DATASETS_PATH = repo_dir / "datasets/"
mapper/callbacks.py ADDED
@@ -0,0 +1,105 @@
1
+ import torch
2
+ import pytorch_lightning as pl
3
+ from pathlib import Path
4
+ from typing import Any
5
+ import torchvision
6
+ import wandb
7
+
8
+
9
+ class EvalSaveCallback(pl.Callback):
10
+
11
+ def __init__(self, save_dir: Path) -> None:
12
+ super().__init__()
13
+ self.save_dir = save_dir
14
+
15
+ def save(self, outputs, batch, batch_idx):
16
+ name = batch['name']
17
+
18
+ filename = self.save_dir / f"{batch_idx:06d}_{name[0]}.pt"
19
+ torch.save({
20
+ "fpv": batch['image'],
21
+ "seg_masks": batch['seg_masks'],
22
+ 'name': name,
23
+ "output": outputs["output"],
24
+ "valid_bev": outputs["valid_bev"],
25
+ }, filename)
26
+
27
+ def on_test_batch_end(self, trainer: pl.Trainer,
28
+ pl_module: pl.LightningModule,
29
+ outputs: torch.Tensor | Any | None,
30
+ batch: Any,
31
+ batch_idx: int,
32
+ dataloader_idx: int = 0) -> None:
33
+ if not outputs:
34
+ return
35
+
36
+ self.save(outputs, batch, batch_idx)
37
+
38
+ def on_validation_batch_end(self, trainer: pl.Trainer,
39
+ pl_module: pl.LightningModule,
40
+ outputs: torch.Tensor | Any | None,
41
+ batch: Any,
42
+ batch_idx: int,
43
+ dataloader_idx: int = 0) -> None:
44
+ if not outputs:
45
+
46
+ return
47
+
48
+ self.save(outputs, batch, batch_idx)
49
+
50
+
51
+ class ImageLoggerCallback(pl.Callback):
52
+ def __init__(self, num_classes):
53
+ super().__init__()
54
+ self.num_classes = num_classes
55
+
56
+ def log_image(self, trainer, pl_module, outputs, batch, batch_idx, mode="train"):
57
+ fpv_rgb = batch["image"]
58
+ fpv_grid = torchvision.utils.make_grid(
59
+ fpv_rgb, nrow=8, normalize=False)
60
+ images = [
61
+ wandb.Image(fpv_grid, caption="fpv")
62
+ ]
63
+
64
+ pred = outputs['output'].permute(0, 2, 3, 1)
65
+ pred[outputs["valid_bev"][..., :-1] == 0] = 0
66
+ pred = (pred > 0.5).float()
67
+ pred = pred.permute(0, 3, 1, 2)
68
+
69
+ for i in range(self.num_classes):
70
+ gt_class_i = batch['seg_masks'][..., i]
71
+ gt_class_i_grid = torchvision.utils.make_grid(
72
+ gt_class_i.unsqueeze(1), nrow=8, normalize=False, pad_value=0)
73
+ pred_class_i = pred[:, i]
74
+ pred_class_i_grid = torchvision.utils.make_grid(
75
+ pred_class_i.unsqueeze(1), nrow=8, normalize=False, pad_value=0)
76
+
77
+ images += [
78
+ wandb.Image(gt_class_i_grid, caption=f"gt_class_{i}"),
79
+ wandb.Image(pred_class_i_grid, caption=f"pred_class_{i}")
80
+ ]
81
+
82
+ trainer.logger.experiment.log(
83
+ {
84
+ "{}/images".format(mode): images
85
+ }
86
+ )
87
+
88
+ def on_validation_batch_end(self, trainer, pl_module: pl.LightningModule, outputs, batch, batch_idx):
89
+ if batch_idx == 0:
90
+ with torch.no_grad():
91
+ outputs = pl_module(batch)
92
+ self.log_image(trainer, pl_module, outputs,
93
+ batch, batch_idx, mode="val")
94
+
95
+ def on_train_batch_end(self, trainer, pl_module: pl.LightningModule, outputs, batch, batch_idx):
96
+ if batch_idx == 0:
97
+ pl_module.eval()
98
+
99
+ with torch.no_grad():
100
+ outputs = pl_module(batch)
101
+
102
+ self.log_image(trainer, pl_module, outputs,
103
+ batch, batch_idx, mode="train")
104
+
105
+ pl_module.train()
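
Neither callback is wired up in the files shown here; below is a minimal sketch of how they would typically be attached to a pytorch_lightning Trainer. The trainer arguments and the W&B logger setup are assumptions, not part of this commit.

    from pathlib import Path
    import pytorch_lightning as pl
    from pytorch_lightning.loggers import WandbLogger
    from mapper.callbacks import EvalSaveCallback, ImageLoggerCallback

    # EvalSaveCallback dumps per-batch predictions to disk during val/test;
    # ImageLoggerCallback logs FPV/GT/prediction grids and expects a W&B logger.
    trainer = pl.Trainer(
        logger=WandbLogger(project="mapper"),
        callbacks=[
            EvalSaveCallback(save_dir=Path("eval_results")),
            ImageLoggerCallback(num_classes=6),
        ],
    )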
mapper/conf/data/kitti.yaml ADDED
@@ -0,0 +1,40 @@
1
+ name: kitti
2
+ seam_root_dir: /path/to/generated/seam
3
+ dataset_root_dir: /path/to/kitti/dataset
4
+ bev_percentage: 100
5
+ pixel_per_meter: 2
6
+ crop_size_meters: 50
7
+ target_focal_length: 256
8
+ resize_image: null
9
+ pad_to_multiple: 14
10
+ num_classes: 8
11
+ loading:
12
+ train:
13
+ batch_size: 32
14
+ num_workers: 32
15
+ val:
16
+ batch_size: 32
17
+ num_workers: 32
18
+ test:
19
+ batch_size: 32
20
+ num_workers: 32
21
+ pad_to_square: true
22
+ rectify_pitch: true
23
+ gravity_align: false
24
+ class_mapping: [0, 0, 1, 2, 0, 3]
25
+ augmentations:
26
+ enabled: True
27
+ brightness: 0.5
28
+ contrast: 0.5
29
+ saturation: 0.5
30
+ random_flip: 0.5
31
+ hue: 0.5
32
+ random_resized_crop: False
33
+ gaussian_noise:
34
+ enabled: False
35
+ mean: 0.0
36
+ std: 0.1
37
+ brightness_contrast:
38
+ enabled: True
39
+ brightness_factor: 0.2
40
+ contrast_factor: 0.2
mapper/conf/data/mia.yaml ADDED
@@ -0,0 +1,44 @@
1
+ name: mapillary
2
+ scenes:
3
+ - chicago
4
+ - new_york
5
+ - los_angeles
6
+ - san_francisco
7
+ split: /path/to/split/file
8
+ data_dir: /path/to/mia/dataset
9
+ loading:
10
+ train:
11
+ batch_size: 128
12
+ num_workers: 30
13
+ val:
14
+ batch_size: 128
15
+ num_workers: 30
16
+ test:
17
+ batch_size: 1
18
+ num_workers: 0
19
+ testsmall:
20
+ batch_size: 1
21
+ num_workers: 0
22
+ num_classes: 6
23
+ pixel_per_meter: 2
24
+ crop_size_meters: 64
25
+ resize_image: 512
26
+ pad_to_square: true
27
+ rectify_pitch: true
28
+ gravity_align: true
29
+ augmentations:
30
+ enabled: True
31
+ brightness: 0.5
32
+ contrast: 0.5
33
+ saturation: 0.5
34
+ random_flip: 0.5
35
+ hue: 0.5
36
+ random_resized_crop: False
37
+ gaussian_noise:
38
+ enabled: False
39
+ mean: 0.0
40
+ std: 0.1
41
+ brightness_contrast:
42
+ enabled: True
43
+ brightness_factor: 0.2
44
+ contrast_factor: 0.2
mapper/conf/data/nuscenes.yaml ADDED
@@ -0,0 +1,38 @@
1
+ name: nuscenes
2
+ data_dir: /path/to/nuscenes/data
3
+ map_dir: /path/to/generated/maps
4
+ version: v1.0-trainval
5
+ pixel_per_meter: 2
6
+ crop_size_meters: 50
7
+ resize_image: 512
8
+ percentage: 1.0
9
+ class_mapping: [0, 1, 2, 0, 0, 3]
10
+ num_classes: 14
11
+ loading:
12
+ train:
13
+ batch_size: 128
14
+ num_workers: 10
15
+ val:
16
+ batch_size: 128
17
+ num_workers: 10
18
+ test:
19
+ batch_size: 128
20
+ num_workers: 10
21
+ pad_to_square: true
22
+ rectify_pitch: true
23
+ gravity_align: true
24
+ augmentations:
25
+ enabled: True
26
+ brightness: 0.5
27
+ contrast: 0.5
28
+ saturation: 0.5
29
+ hue: 0.5
30
+ random_resized_crop: False
31
+ gaussian_noise:
32
+ enabled: False
33
+ mean: 0.0
34
+ std: 0.1
35
+ brightness_contrast:
36
+ enabled: True
37
+ brightness_factor: 0.2
38
+ contrast_factor: 0.2
mapper/conf/mapper_kitti.yaml ADDED
@@ -0,0 +1,23 @@
1
+ defaults:
2
+ - schema/data: kitti
3
+ - data: kitti
4
+ - model: mapper
5
+ - training
6
+ - _self_
7
+
8
+ experiment:
9
+ name: MIA_DINOv2_Mapper_KITTI
10
+
11
+ model:
12
+ loss:
13
+ xent_weight: 1.0
14
+ dice_weight: 1.0
15
+ focal_loss: false
16
+ focal_loss_gamma: 2.0
17
+ requires_frustrum: true
18
+ requires_flood_mask: true
19
+ class_weights: null
20
+ label_smoothing: 0.1
21
+
22
+ training:
23
+ checkpoint: /path/to/checkpoint
mapper/conf/mapper_nuscenes.yaml ADDED
@@ -0,0 +1,26 @@
1
+ defaults:
2
+ - schema/data: nuscenes
3
+ - data: nuscenes
4
+ - model: mapper
5
+ - training
6
+ - _self_
7
+
8
+ experiment:
9
+ name: MIA_DINOv2_Mapper_NuScenes
10
+
11
+ model:
12
+ loss:
13
+ xent_weight: 1.0
14
+ dice_weight: 1.0
15
+ focal_loss: false
16
+ focal_loss_gamma: 2.0
17
+ class_weights: [1.00060036, 1.85908161, 1.0249052, 0., 0., 2.57267816]
18
+ requires_frustrum: true
19
+ label_smoothing: 0.1
20
+
21
+ training:
22
+ checkpoint: /path/to/checkpoint
23
+ finetune: true
24
+ lr: 0.0001
25
+ trainer:
26
+ max_epochs: 50
mapper/conf/model/image_encoder/dino.yaml ADDED
@@ -0,0 +1,5 @@
+ name: feature_extractor_DPT
+ backbone:
+   pretrained: true
+   frozen: true
+   output_dim: ${model.latent_dim} # Match Latent Dimension
mapper/conf/model/image_encoder/resnet.yaml ADDED
@@ -0,0 +1,12 @@
+ name: feature_extractor_resnet
+ backbone:
+   pretrained: true
+   frozen: true
+   output_dim: ${model.latent_dim} # Match Latent Dimension
+   input_dim: 3
+   encoder: resnet50
+   num_downsample: null
+   remove_stride_from_first_conv: false
+   decoder_norm: "nn.BatchNorm2d"
+   do_average_pooling: false
+   checkpointed: false
mapper/conf/model/mapper.yaml ADDED
@@ -0,0 +1,15 @@
+ defaults:
+ - schema/backbone: dino
+ - image_encoder: dino
+
+ segmentation_head:
+ dropout_rate: 0.2
+ name: map_perception_net
+ num_classes: 6
+ latent_dim: 128
+ z_max: 50
+ x_max: 25
+ pixel_per_meter: ${data.pixel_per_meter}
+ num_scale_bins: 32
+ loss:
+ num_classes: ${..num_classes}
mapper/conf/pretrain.yaml ADDED
@@ -0,0 +1,24 @@
1
+ defaults:
2
+ - schema/data: mia
3
+ - data: mia
4
+ - model: mapper
5
+ - training
6
+ - _self_
7
+
8
+ experiment:
9
+ name: MIA_DINOv2_Pretrain
10
+
11
+ model:
12
+ loss:
13
+ xent_weight: 1.0
14
+ dice_weight: 1.0
15
+ focal_loss: false
16
+ focal_loss_gamma: 2.0
17
+ requires_frustrum: true
18
+ class_weights: [ 1.00351229, 4.34782609, 1.00110121, 1.03124678,
19
+ 6.69792364, 7.55857899 ]
20
+ label_smoothing: 0.1
21
+
22
+ training:
23
+ trainer:
24
+ max_epochs: 15
mapper/conf/pretrain_resnet.yaml ADDED
@@ -0,0 +1,26 @@
1
+ defaults:
2
+ - schema/data: mia
3
+ - data: mia
4
+ - model: mapper
5
+ - training
6
+ - _self_
7
+ - override model/schema/backbone: resnet
8
+ - override model/image_encoder: resnet
9
+
10
+ experiment:
11
+ name: MIA_DINOv2_Pretrain
12
+
13
+ model:
14
+ loss:
15
+ xent_weight: 1.0
16
+ dice_weight: 1.0
17
+ focal_loss: false
18
+ focal_loss_gamma: 2.0
19
+ requires_frustrum: true
20
+ class_weights: [ 1.00351229, 4.34782609, 1.00110121, 1.03124678,
21
+ 6.69792364, 7.55857899 ]
22
+
23
+ training:
24
+ trainer:
25
+ max_steps: 10
26
+ max_epochs: 15
mapper/conf/training.yaml ADDED
@@ -0,0 +1,30 @@
1
+ experiment:
2
+ name: MGL_DINOv2_v4-baseline-less-class
3
+ seed: 42
4
+ training:
5
+ num_classes: ${model.num_classes}
6
+ lr: 0.001
7
+ lr_scheduler:
8
+ name: "CosineAnnealingLR"
9
+ args:
10
+ T_max: $total_epochs
11
+ eta_min: 0.0000001
12
+ checkpoint: null
13
+ finetune: false
14
+ eval: false
15
+ save_dir: eval_results
16
+ trainer:
17
+ # val_check_interval: 250
18
+ # log_every_n_steps: 100
19
+ # limit_val_batches: 0
20
+ # max_steps: 500000
21
+ # num_epochs: 15
22
+ precision: bf16-mixed
23
+ accelerator: gpu
24
+ strategy: ddp_find_unused_parameters_true
25
+ checkpointing:
26
+ dirpath: checkpoints/
27
+ monitor: val/total/loss
28
+ save_top_k: -1
29
+ mode: min
30
+ save_last: True
mapper/data/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from .mapillary.data_module import MapillaryDataModule
+ from .nuscenes.data_module import NuScenesData
+
+ modules = {
+     "mapillary": MapillaryDataModule,
+     "nuscenes": NuScenesData
+ }
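
A short, illustrative sketch of how this registry is likely consumed; the constructor argument is an assumption (each data module presumably takes its data config), so check the actual signatures before relying on it.

    from omegaconf import OmegaConf
    from mapper.data import modules

    cfg = OmegaConf.load("mapper/conf/data/mia.yaml")   # paths inside must be edited first
    DataModuleCls = modules["mapillary"]                # key matches the config's "name: mapillary"
    datamodule = DataModuleCls(cfg)                     # assumed to take the data config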
mapper/data/base.py ADDED
@@ -0,0 +1,19 @@
+ from abc import abstractmethod
+ from typing import Optional
+
+
+ class DataBase():
+     def __init__(self) -> None:
+         raise NotImplementedError
+
+     @abstractmethod
+     def prepare_data(self) -> None:
+         raise NotImplementedError
+
+     @abstractmethod
+     def setup(self, stage: Optional[str] = None):
+         raise NotImplementedError
+
+     @abstractmethod
+     def dataset(self, stage: str):
+         raise NotImplementedError
mapper/data/image.py ADDED
@@ -0,0 +1,140 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Callable, Optional, Union, Sequence
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torchvision.transforms.functional as tvf
8
+ import collections
9
+ from scipy.spatial.transform import Rotation
10
+
11
+ from ..utils.geometry import from_homogeneous, to_homogeneous
12
+ from ..utils.wrappers import Camera
13
+
14
+
15
+ def rectify_image(
16
+ image: torch.Tensor,
17
+ cam: Camera,
18
+ roll: float,
19
+ pitch: Optional[float] = None,
20
+ valid: Optional[torch.Tensor] = None,
21
+ ):
22
+ *_, h, w = image.shape
23
+ grid = torch.meshgrid(
24
+ [torch.arange(w, device=image.device), torch.arange(h, device=image.device)],
25
+ indexing="xy",
26
+ )
27
+ grid = torch.stack(grid, -1).to(image.dtype)
28
+
29
+ if pitch is not None:
30
+ args = ("ZX", (roll, pitch))
31
+ else:
32
+ args = ("Z", roll)
33
+ R = Rotation.from_euler(*args, degrees=True).as_matrix()
34
+ R = torch.from_numpy(R).to(image)
35
+
36
+ grid_rect = to_homogeneous(cam.normalize(grid)) @ R.T
37
+ grid_rect = cam.denormalize(from_homogeneous(grid_rect))
38
+ grid_norm = (grid_rect + 0.5) / grid.new_tensor([w, h]) * 2 - 1
39
+ rectified = torch.nn.functional.grid_sample(
40
+ image[None],
41
+ grid_norm[None],
42
+ align_corners=False,
43
+ mode="bilinear",
44
+ ).squeeze(0)
45
+ if valid is None:
46
+ valid = torch.all((grid_norm >= -1) & (grid_norm <= 1), -1)
47
+ else:
48
+ valid = (
49
+ torch.nn.functional.grid_sample(
50
+ valid[None, None].float(),
51
+ grid_norm[None],
52
+ align_corners=False,
53
+ mode="nearest",
54
+ )[0, 0]
55
+ > 0
56
+ )
57
+ return rectified, valid
58
+
59
+
60
+ def resize_image(
61
+ image: torch.Tensor,
62
+ size: Union[int, Sequence, np.ndarray],
63
+ fn: Optional[Callable] = None,
64
+ camera: Optional[Camera] = None,
65
+ valid: np.ndarray = None,
66
+ ):
67
+ """Resize an image to a fixed size, or according to max or min edge."""
68
+ *_, h, w = image.shape
69
+ if fn is not None:
70
+ assert isinstance(size, int)
71
+ scale = size / fn(h, w)
72
+ h_new, w_new = int(round(h * scale)), int(round(w * scale))
73
+ scale = (scale, scale)
74
+ else:
75
+ if isinstance(size, (collections.abc.Sequence, np.ndarray)):
76
+ w_new, h_new = size
77
+ elif isinstance(size, int):
78
+ w_new = h_new = size
79
+ else:
80
+ raise ValueError(f"Incorrect new size: {size}")
81
+ scale = (w_new / w, h_new / h)
82
+ if (w, h) != (w_new, h_new):
83
+ mode = tvf.InterpolationMode.BILINEAR
84
+ image = tvf.resize(image, (int(h_new), int(w_new)), interpolation=mode, antialias=True)
85
+ image.clip_(0, 1)
86
+ if camera is not None:
87
+ camera = camera.scale(scale)
88
+ if valid is not None:
89
+ valid = tvf.resize(
90
+ valid.unsqueeze(0),
91
+ (int(h_new), int(w_new)),
92
+ interpolation=tvf.InterpolationMode.NEAREST,
93
+ ).squeeze(0)
94
+ ret = [image, scale]
95
+ if camera is not None:
96
+ ret.append(camera)
97
+ if valid is not None:
98
+ ret.append(valid)
99
+ return ret
100
+
101
+
102
+ def pad_image(
103
+ image: torch.Tensor,
104
+ size: Union[int, Sequence, np.ndarray],
105
+ camera: Optional[Camera] = None,
106
+ valid: torch.Tensor = None,
107
+ crop_and_center: bool = False,
108
+ ):
109
+ if isinstance(size, int):
110
+ w_new = h_new = size
111
+ elif isinstance(size, (collections.abc.Sequence, np.ndarray)):
112
+ w_new, h_new = size
113
+ else:
114
+ raise ValueError(f"Incorrect new size: {size}")
115
+ *c, h, w = image.shape
116
+ if crop_and_center:
117
+ diff = np.array([w - w_new, h - h_new])
118
+ left, top = left_top = np.round(diff / 2).astype(int)
119
+ right, bottom = diff - left_top
120
+ else:
121
+ assert h <= h_new
122
+ assert w <= w_new
123
+ top = bottom = left = right = 0
124
+ slice_out = np.s_[..., : min(h, h_new), : min(w, w_new)]
125
+ slice_in = np.s_[
126
+ ..., max(top, 0) : h - max(bottom, 0), max(left, 0) : w - max(right, 0)
127
+ ]
128
+ if (w, h) == (w_new, h_new):
129
+ out = image
130
+ else:
131
+ out = torch.zeros((*c, h_new, w_new), dtype=image.dtype)
132
+ out[slice_out] = image[slice_in]
133
+ if camera is not None:
134
+ camera = camera.crop((max(left, 0), max(top, 0)), (w_new, h_new))
135
+ out_valid = torch.zeros((h_new, w_new), dtype=torch.bool)
136
+ out_valid[slice_out] = True if valid is None else valid[slice_in]
137
+ if camera is not None:
138
+ return out, out_valid, camera
139
+ else:
140
+ return out, out_valid
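
A small usage sketch of the three helpers above, mirroring the calls made in app.py; the dummy tensor and camera intrinsics are made up for illustration.

    import torch
    from mapper.utils.wrappers import Camera
    from mapper.data.image import rectify_image, resize_image, pad_image

    img = torch.rand(3, 480, 640)  # dummy RGB image in [0, 1]
    cam = Camera.from_dict({
        "model": "SIMPLE_PINHOLE", "width": 640, "height": 480,
        "params": [500.0, 320.5, 240.5],  # illustrative focal length and principal point
    })
    # Undo roll/pitch, resize the longest edge to 512, then pad to a 512x512 square.
    img, valid = rectify_image(img, cam.float(), roll=-2.0, pitch=1.5)
    img, _, cam, valid = resize_image(image=img, size=512, camera=cam, fn=max, valid=valid)
    img, valid, cam = pad_image(img, 512, cam, valid)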
mapper/data/kitti/data_module.py ADDED
@@ -0,0 +1,32 @@
+ from ..base import DataBase
+ from .dataset import BEVKitti360Dataset
+ from ..schema import KITTIDataConfiguration
+
+ class BEVKitti360Data(DataBase):
+     def __init__(self, cfg: KITTIDataConfiguration) -> None:
+         self.cfg = cfg
+         self._dataset = {}
+
+     def prepare_data(self) -> None:
+         return
+
+     def setup(self, stage: str) -> None:
+         split = {
+             'fit': 'train',
+             'val': 'val',
+             'validate': 'val',
+             'test': 'val',
+             "train": "train"
+         }[stage]
+
+         self._dataset[stage] = BEVKitti360Dataset(
+             cfg=self.cfg,
+             split_name=split
+         )
+
+     def dataset(self, stage: str):
+         if self._dataset.get(stage) is None:
+             self.setup(stage)
+
+         return self._dataset[stage]
+
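
A hedged sketch of driving this data module directly. It assumes the paths in mapper/conf/data/kitti.yaml point at real KITTI-360/seamless data and that a plain OmegaConf config is accepted in place of KITTIDataConfiguration.

    from omegaconf import OmegaConf
    from mapper.data.kitti.data_module import BEVKitti360Data

    cfg = OmegaConf.load("mapper/conf/data/kitti.yaml")  # edit seam_root_dir / dataset_root_dir first
    data = BEVKitti360Data(cfg)
    train_set = data.dataset("train")  # lazily calls setup("train")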
mapper/data/kitti/dataset.py ADDED
@@ -0,0 +1,317 @@
1
+ import os
2
+ import numpy as np
3
+ import torch.utils.data as data
4
+ import umsgpack
5
+ from PIL import Image
6
+ import json
7
+ import torchvision.transforms as tvf
8
+
9
+ from .transform import BEVTransform
10
+ from ..schema import KITTIDataConfiguration
11
+
12
+ class BEVKitti360Dataset(data.Dataset):
13
+ _IMG_DIR = "img"
14
+ _BEV_MSK_DIR = "bev_msk"
15
+ _BEV_PLABEL_DIR = "bev_plabel_dynamic"
16
+ _FV_MSK_DIR = "front_msk_seam"
17
+ _BEV_DIR = "bev_ortho"
18
+ _LST_DIR = "split"
19
+ _PERCENTAGES_DIR = "percentages"
20
+ _BEV_METADATA_FILE = "metadata_ortho.bin"
21
+ _FV_METADATA_FILE = "metadata_front.bin"
22
+
23
+ def __init__(self, cfg: KITTIDataConfiguration, split_name="train"):
24
+ super(BEVKitti360Dataset, self).__init__()
25
+ self.cfg = cfg
26
+ self.seam_root_dir = cfg.seam_root_dir # Directory of seamless data
27
+ self.kitti_root_dir = cfg.dataset_root_dir # Directory of the KITTI360 data
28
+ self.split_name = split_name
29
+
30
+ self.rgb_cameras = ['front']
31
+ if cfg.bev_percentage < 1:
32
+ self.bev_percentage = cfg.bev_percentage
33
+ else:
34
+ self.bev_percentage = int(cfg.bev_percentage)
35
+
36
+ # Folders
37
+ self._img_dir = os.path.join(self.seam_root_dir, BEVKitti360Dataset._IMG_DIR)
38
+ self._bev_msk_dir = os.path.join(self.seam_root_dir, BEVKitti360Dataset._BEV_MSK_DIR, BEVKitti360Dataset._BEV_DIR)
39
+ self._bev_plabel_dir = os.path.join(self.seam_root_dir, BEVKitti360Dataset._BEV_PLABEL_DIR, BEVKitti360Dataset._BEV_DIR)
40
+ self._fv_msk_dir = os.path.join(self.seam_root_dir, BEVKitti360Dataset._FV_MSK_DIR, "front")
41
+ self._lst_dir = os.path.join(self.seam_root_dir, BEVKitti360Dataset._LST_DIR)
42
+ self._percentages_dir = os.path.join(self.seam_root_dir, BEVKitti360Dataset._LST_DIR, BEVKitti360Dataset._PERCENTAGES_DIR)
43
+
44
+ # Load meta-data and split
45
+ self._bev_meta, self._bev_images, self._bev_images_all, self._fv_meta, self._fv_images, self._fv_images_all,\
46
+ self._img_map, self.bev_percent_split = self._load_split()
47
+
48
+ self.tfs = self.get_augmentations() if split_name == "train" else tvf.Compose([])
49
+ self.transform = BEVTransform(cfg, self.tfs)
50
+
51
+ def get_augmentations(self):
52
+
53
+ print(f"Augmentation!", "\n" * 10)
54
+ augmentations = [
55
+ tvf.ColorJitter(
56
+ brightness=self.cfg.augmentations.brightness,
57
+ contrast=self.cfg.augmentations.contrast,
58
+ saturation=self.cfg.augmentations.saturation,
59
+ hue=self.cfg.augmentations.hue,
60
+ )
61
+ ]
62
+
63
+ if self.cfg.augmentations.random_resized_crop:
64
+ augmentations.append(
65
+ tvf.RandomResizedCrop(scale=(0.8, 1.0))
66
+ ) # RandomResizedCrop
67
+
68
+ if self.cfg.augmentations.gaussian_noise.enabled:
69
+ augmentations.append(
70
+ tvf.GaussianNoise(
71
+ mean=self.cfg.augmentations.gaussian_noise.mean,
72
+ std=self.cfg.augmentations.gaussian_noise.std,
73
+ )
74
+ ) # Gaussian noise
75
+
76
+ if self.cfg.augmentations.brightness_contrast.enabled:
77
+ augmentations.append(
78
+ tvf.ColorJitter(
79
+ brightness=self.cfg.augmentations.brightness_contrast.brightness_factor,
80
+ contrast=self.cfg.augmentations.brightness_contrast.contrast_factor,
81
+ saturation=0, # Keep saturation at 0 for brightness and contrast adjustment
82
+ hue=0,
83
+ )
84
+ ) # Brightness and contrast adjustment
85
+
86
+ return tvf.Compose(augmentations)
87
+
88
+ # Load the train or the validation split
89
+ def _load_split(self):
90
+ with open(os.path.join(self.seam_root_dir, BEVKitti360Dataset._BEV_METADATA_FILE), "rb") as fid:
91
+ bev_metadata = umsgpack.unpack(fid, encoding="utf-8")
92
+
93
+ with open(os.path.join(self.seam_root_dir, BEVKitti360Dataset._FV_METADATA_FILE), 'rb') as fid:
94
+ fv_metadata = umsgpack.unpack(fid, encoding="utf-8")
95
+
96
+ # Read the files for this split
97
+ with open(os.path.join(self._lst_dir, self.split_name + ".txt"), "r") as fid:
98
+ lst = fid.readlines()
99
+ lst = [line.strip() for line in lst]
100
+
101
+ if self.split_name == "train":
102
+ # Get all the frames in the train dataset. This will be used for generating samples for temporal consistency.
103
+ with open(os.path.join(self._lst_dir, "{}_all.txt".format(self.split_name)), 'r') as fid:
104
+ lst_all = fid.readlines()
105
+ lst_all = [line.strip() for line in lst_all]
106
+
107
+ # Get all the samples for which the BEV plabels have to be loaded.
108
+ percentage_file = os.path.join(self._percentages_dir, "{}_{}.txt".format(self.split_name, self.bev_percentage))
109
+ print("Loading {}% file".format(self.bev_percentage))
110
+ with open(percentage_file, 'r') as fid:
111
+ lst_percent = fid.readlines()
112
+ lst_percent = [line.strip() for line in lst_percent]
113
+ else:
114
+ lst_all = lst
115
+ lst_percent = lst
116
+
117
+ # Remove elements from lst if they are not in _FRONT_MSK_DIR
118
+ fv_msk_frames = os.listdir(self._fv_msk_dir)
119
+ fv_msk_frames = [frame.split(".")[0] for frame in fv_msk_frames]
120
+ fv_msk_frames_exist_map = {entry: True for entry in fv_msk_frames} # This is to speed-up the dataloader
121
+ lst = [entry for entry in lst if entry in fv_msk_frames_exist_map]
122
+ lst_all = [entry for entry in lst_all if entry in fv_msk_frames_exist_map]
123
+
124
+ # Filter based on the samples plabels
125
+ if self.bev_percentage < 100:
126
+ lst_filt = [entry for entry in lst if entry in lst_percent]
127
+ lst = lst_filt
128
+
129
+ # Remove any potential duplicates
130
+ lst = set(lst)
131
+ lst_percent = set(lst_percent)
132
+
133
+ img_map = {}
134
+ for camera in self.rgb_cameras:
135
+ with open(os.path.join(self._img_dir, "{}.json".format(camera))) as fp:
136
+ map_list = json.load(fp)
137
+ map_dict = {k: v for d in map_list for k, v in d.items()}
138
+ img_map[camera] = map_dict
139
+
140
+ bev_meta = bev_metadata["meta"]
141
+ bev_images = [img_desc for img_desc in bev_metadata["images"] if img_desc["id"] in lst]
142
+ fv_meta = fv_metadata["meta"]
143
+ fv_images = [img_desc for img_desc in fv_metadata['images'] if img_desc['id'] in lst]
144
+
145
+ # Check for inconsistency due to inconsistencies in the input files or dataset
146
+ bev_images_ids = [bev_img["id"] for bev_img in bev_images]
147
+ fv_images_ids = [fv_img["id"] for fv_img in fv_images]
148
+ assert set(bev_images_ids) == set(fv_images_ids) and len(bev_images_ids) == len(fv_images_ids), 'Inconsistency between fv_images and bev_images detected'
149
+
150
+ if lst_all is not None:
151
+ bev_images_all = [img_desc for img_desc in bev_metadata['images'] if img_desc['id'] in lst_all]
152
+ fv_images_all = [img_desc for img_desc in fv_metadata['images'] if img_desc['id'] in lst_all]
153
+ else:
154
+ bev_images_all, fv_images_all = None, None
155
+
156
+ return bev_meta, bev_images, bev_images_all, fv_meta, fv_images, fv_images_all, img_map, lst_percent
157
+
158
+ def _find_index(self, list, key, value):
159
+ for i, dic in enumerate(list):
160
+ if dic[key] == value:
161
+ return i
162
+ return None
163
+
164
+ def _load_item(self, item_idx):
165
+ # Find the index of the element in the list containing all elements
166
+ all_idx = self._find_index(self._fv_images_all, "id", self._fv_images[item_idx]['id'])
167
+ if all_idx is None:
168
+ raise IOError("Required index not found!")
169
+
170
+ bev_img_desc = self._bev_images[item_idx]
171
+ fv_img_desc = self._fv_images[item_idx]
172
+
173
+ scene, frame_id = self._bev_images[item_idx]["id"].split(";")
174
+
175
+ # Get the RGB file names
176
+ img_file = os.path.join(
177
+ self.kitti_root_dir,
178
+ self._img_map["front"]["{}.png"
179
+ .format(bev_img_desc['id'])]
180
+ )
181
+
182
+ if not os.path.exists(img_file):
183
+ raise IOError(
184
+ "RGB image not found! Scene: {}, Frame: {}".format(scene, frame_id)
185
+ )
186
+
187
+ # Load the images
188
+ img = Image.open(img_file).convert(mode="RGB")
189
+
190
+ # Load the BEV mask
191
+ bev_msk_file = os.path.join(
192
+ self._bev_msk_dir,
193
+ "{}.png".format(bev_img_desc['id'])
194
+ )
195
+ bev_msk = Image.open(bev_msk_file)
196
+ bev_plabel = None
197
+
198
+ # Load the front mask
199
+ fv_msk_file = os.path.join(
200
+ self._fv_msk_dir,
201
+ "{}.png".format(fv_img_desc['id'])
202
+ )
203
+ fv_msk = Image.open(fv_msk_file)
204
+
205
+
206
+ bev_weights_msk_combined = None
207
+
208
+ # Get the other information
209
+ bev_cat = bev_img_desc["cat"]
210
+ bev_iscrowd = bev_img_desc["iscrowd"]
211
+ fv_cat = fv_img_desc['cat']
212
+ fv_iscrowd = fv_img_desc['iscrowd']
213
+ fv_intrinsics = fv_img_desc["cam_intrinsic"]
214
+ ego_pose = fv_img_desc['ego_pose'] # This loads the cam0 pose
215
+
216
+ # Get the ids of all the frames
217
+ frame_ids = bev_img_desc["id"]
218
+
219
+ return img, bev_msk, bev_plabel, fv_msk, bev_weights_msk_combined, bev_cat, \
220
+ bev_iscrowd, fv_cat, fv_iscrowd, fv_intrinsics, ego_pose, frame_ids
221
+
222
+ @property
223
+ def fv_categories(self):
224
+ """Category names"""
225
+ return self._fv_meta["categories"]
226
+
227
+ @property
228
+ def fv_num_categories(self):
229
+ """Number of categories"""
230
+ return len(self.fv_categories)
231
+
232
+ @property
233
+ def fv_num_stuff(self):
234
+ """Number of "stuff" categories"""
235
+ return self._fv_meta["num_stuff"]
236
+
237
+ @property
238
+ def fv_num_thing(self):
239
+ """Number of "thing" categories"""
240
+ return self.fv_num_categories - self.fv_num_stuff
241
+
242
+ @property
243
+ def bev_categories(self):
244
+ """Category names"""
245
+ return self._bev_meta["categories"]
246
+
247
+ @property
248
+ def bev_num_categories(self):
249
+ """Number of categories"""
250
+ return len(self.bev_categories)
251
+
252
+ @property
253
+ def bev_num_stuff(self):
254
+ """Number of "stuff" categories"""
255
+ return self._bev_meta["num_stuff"]
256
+
257
+ @property
258
+ def bev_num_thing(self):
259
+ """Number of "thing" categories"""
260
+ return self.bev_num_categories - self.bev_num_stuff
261
+
262
+ @property
263
+ def original_ids(self):
264
+ """Original class id of each category"""
265
+ return self._fv_meta["original_ids"]
266
+
267
+ @property
268
+ def palette(self):
269
+ """Default palette to be used when color-coding semantic labels"""
270
+ return np.array(self._fv_meta["palette"], dtype=np.uint8)
271
+
272
+ @property
273
+ def img_sizes(self):
274
+ """Size of each image of the dataset"""
275
+ return [img_desc["size"] for img_desc in self._fv_images]
276
+
277
+ @property
278
+ def img_categories(self):
279
+ """Categories present in each image of the dataset"""
280
+ return [img_desc["cat"] for img_desc in self._fv_images]
281
+
282
+ @property
283
+ def dataset_name(self):
284
+ return "Kitti360"
285
+
286
+ def __len__(self):
287
+ if self.cfg.percentage < 1:
288
+ return int(len(self._fv_images) * self.cfg.percentage)
289
+
290
+ return len(self._fv_images)
291
+
292
+ def __getitem__(self, item):
293
+ img, bev_msk, bev_plabel, fv_msk, bev_weights_msk, bev_cat, bev_iscrowd, fv_cat, fv_iscrowd, fv_intrinsics, ego_pose, idx = self._load_item(item)
294
+
295
+ rec = self.transform(img=img, bev_msk=bev_msk, bev_plabel=bev_plabel, fv_msk=fv_msk, bev_weights_msk=bev_weights_msk, bev_cat=bev_cat,
296
+ bev_iscrowd=bev_iscrowd, fv_cat=fv_cat, fv_iscrowd=fv_iscrowd, fv_intrinsics=fv_intrinsics,
297
+ ego_pose=ego_pose)
298
+ size = (img.size[1], img.size[0])
299
+
300
+ # Close the files
301
+ img.close()
302
+ bev_msk.close()
303
+ fv_msk.close()
304
+
305
+ rec["index"] = idx
306
+ rec["size"] = size
307
+ rec['name'] = idx
308
+
309
+ return rec
310
+
311
+ def get_image_desc(self, idx):
312
+ """Look up an image descriptor given the id"""
313
+ matching = [img_desc for img_desc in self._images if img_desc["id"] == idx]
314
+ if len(matching) == 1:
315
+ return matching[0]
316
+ else:
317
+ raise ValueError("No image found with id %s" % idx)
mapper/data/kitti/transform.py ADDED
@@ -0,0 +1,149 @@
1
+ import numpy as np
2
+ import torch
3
+ from torchvision.transforms import functional as tfn
4
+ import torchvision.transforms.functional as tvf
5
+
6
+ from ..utils import decompose_rotmat
7
+ from ..image import pad_image, rectify_image, resize_image
8
+ from ...utils.wrappers import Camera
9
+ from ..schema import KITTIDataConfiguration
10
+
11
+
12
+ class BEVTransform:
13
+ def __init__(self,
14
+ cfg: KITTIDataConfiguration, augmentations):
15
+ self.cfg = cfg
16
+ self.augmentations = augmentations
17
+
18
+ @staticmethod
19
+ def _compact_labels(msk, cat, iscrowd):
20
+ ids = np.unique(msk)
21
+ if 0 not in ids:
22
+ ids = np.concatenate((np.array([0], dtype=np.int32), ids), axis=0)
23
+
24
+ ids_to_compact = np.zeros((ids.max() + 1,), dtype=np.int32)
25
+ ids_to_compact[ids] = np.arange(0, ids.size, dtype=np.int32)
26
+
27
+ msk = ids_to_compact[msk]
28
+ cat = cat[ids]
29
+ iscrowd = iscrowd[ids]
30
+
31
+ return msk, cat, iscrowd
32
+
33
+ def __call__(self, img, bev_msk=None, bev_plabel=None, fv_msk=None, bev_weights_msk=None,
34
+ bev_cat=None, bev_iscrowd=None, fv_cat=None, fv_iscrowd=None,
35
+ fv_intrinsics=None, ego_pose=None):
36
+ # Wrap in np.array
37
+ if bev_cat is not None:
38
+ bev_cat = np.array(bev_cat, dtype=np.int32)
39
+ if bev_iscrowd is not None:
40
+ bev_iscrowd = np.array(bev_iscrowd, dtype=np.uint8)
41
+
42
+ if ego_pose is not None:
43
+ ego_pose = np.array(ego_pose, dtype=np.float32)
44
+
45
+ roll, pitch, yaw = decompose_rotmat(ego_pose[:3, :3])
46
+
47
+ # Image transformations
48
+ img = tfn.to_tensor(img)
49
+ # img = [self._normalize_image(rgb) for rgb in img]
50
+ fx = fv_intrinsics[0][0]
51
+ fy = fv_intrinsics[1][1]
52
+ cx = fv_intrinsics[0][2]
53
+ cy = fv_intrinsics[1][2]
54
+ width = img.shape[2]
55
+ height = img.shape[1]
56
+
57
+ cam = Camera(torch.tensor(
58
+ [width, height, fx, fy, cx - 0.5, cy - 0.5])).float()
59
+
60
+ if not self.cfg.gravity_align:
61
+ # Turn off gravity alignment
62
+ roll = 0.0
63
+ pitch = 0.0
64
+ img, valid = rectify_image(img, cam, roll, pitch)
65
+ else:
66
+ img, valid = rectify_image(
67
+ img, cam, roll, pitch if self.cfg.rectify_pitch else None
68
+ )
69
+ roll = 0.0
70
+ if self.cfg.rectify_pitch:
71
+ pitch = 0.0
72
+
73
+ if self.cfg.target_focal_length is not None:
74
+ # Resize to a canonical focal length
75
+ factor = self.cfg.target_focal_length / cam.f.numpy()
76
+ size = (np.array(img.shape[-2:][::-1]) * factor).astype(int)
77
+ img, _, cam, valid = resize_image(img, size, camera=cam, valid=valid)
78
+ size_out = self.cfg.resize_image
79
+ if size_out is None:
80
+ # Round the edges up such that they are multiples of a factor
81
+ stride = self.cfg.pad_to_multiple
82
+ size_out = (np.ceil((size / stride)) * stride).astype(int)
83
+ # Crop or pad such that both edges are of the given size
84
+ img, valid, cam = pad_image(
85
+ img, size_out, cam, valid, crop_and_center=False
86
+ )
87
+ elif self.cfg.resize_image is not None:
88
+ img, _, cam, valid = resize_image(
89
+ img, self.cfg.resize_image, fn=max, camera=cam, valid=valid
90
+ )
91
+ if self.cfg.pad_to_square:
92
+ # Pad such that both edges are of the given size
93
+ img, valid, cam = pad_image(img, self.cfg.resize_image, cam, valid)
94
+
95
+ # Label transformations
96
+ if bev_msk is not None:
97
+ bev_msk = np.expand_dims(
98
+ np.array(bev_msk, dtype=np.int32, copy=False),
99
+ axis=0
100
+ )
101
+ bev_msk, bev_cat, bev_iscrowd = self._compact_labels(
102
+ bev_msk, bev_cat, bev_iscrowd
103
+ )
104
+
105
+ bev_msk = torch.from_numpy(bev_msk)
106
+ bev_cat = torch.from_numpy(bev_cat)
107
+
108
+ rotated_mask = torch.rot90(bev_msk, dims=(1, 2))
109
+ cropped_mask = rotated_mask[:, :672, (rotated_mask.size(2) - 672) // 2:-(rotated_mask.size(2) - 672) // 2]
110
+
111
+ bev_msk = cropped_mask.squeeze(0)
112
+ seg_masks = bev_cat[bev_msk]
113
+
114
+ seg_masks_onehot = seg_masks.clone()
115
+ seg_masks_onehot[seg_masks_onehot == 255] = 0
116
+ seg_masks_onehot = torch.nn.functional.one_hot(
117
+ seg_masks_onehot.to(torch.int64),
118
+ num_classes=self.cfg.num_classes
119
+ )
120
+ seg_masks_onehot[seg_masks == 255] = 0
121
+
122
+ seg_masks_onehot = seg_masks_onehot.permute(2, 0, 1)
123
+
124
+ seg_masks_down = tvf.resize(seg_masks_onehot, (100, 100))
125
+
126
+ seg_masks_down = seg_masks_down.permute(1, 2, 0)
127
+
128
+ if self.cfg.class_mapping is not None:
129
+ seg_masks_down = seg_masks_down[:, :, self.cfg.class_mapping]
130
+
131
+ img = self.augmentations(img)
132
+ flood_masks = torch.all(seg_masks_down == 0, dim=2).float()
133
+
134
+
135
+ ret = {
136
+ "image": img,
137
+ "valid": valid,
138
+ "camera": cam,
139
+ "seg_masks": (seg_masks_down).float().contiguous(),
140
+ "flood_masks": flood_masks,
141
+ "roll_pitch_yaw": torch.tensor((roll, pitch, yaw)).float(),
142
+ "confidence_map": flood_masks,
143
+ }
144
+
145
+ for key, value in ret.items():
146
+ if isinstance(value, np.ndarray):
147
+ ret[key] = torch.from_numpy(value)
148
+
149
+ return ret
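The one-hot conversion in BEVTransform.__call__ treats 255 as an ignore label: it is first mapped to class 0 so one_hot does not fail, then those pixels are zeroed in every channel. A self-contained sketch with hypothetical values:

    import torch
    import torch.nn.functional as F

    seg_masks = torch.tensor([[0, 2, 255],
                              [1, 255, 2]])   # hypothetical per-pixel class ids, 255 = unlabeled
    num_classes = 3

    onehot = seg_masks.clone()
    onehot[onehot == 255] = 0                       # temporarily map the ignore label to class 0
    onehot = F.one_hot(onehot.to(torch.int64), num_classes=num_classes)
    onehot[seg_masks == 255] = 0                    # zero out ignored pixels in all channels
    onehot = onehot.permute(2, 0, 1)                # H x W x C -> C x H x W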
mapper/data/mapillary/data_module.py ADDED
@@ -0,0 +1,317 @@
1
+ import json
2
+ from collections import defaultdict
3
+ import os
4
+ import shutil
5
+ import tarfile
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import numpy as np
10
+ import pytorch_lightning as pl
11
+ import torch
12
+ import torch.utils.data as torchdata
13
+ from omegaconf import DictConfig
14
+
15
+ from ... import logger
16
+ from .dataset import MapLocDataset
17
+ from ..sequential import chunk_sequence
18
+ from ..torch import collate, worker_init_fn
19
+ from ..schema import MIADataConfiguration
20
+
21
+ def pack_dump_dict(dump):
22
+ for per_seq in dump.values():
23
+ if "points" in per_seq:
24
+ for chunk in list(per_seq["points"]):
25
+ points = per_seq["points"].pop(chunk)
26
+ if points is not None:
27
+ per_seq["points"][chunk] = np.array(
28
+ per_seq["points"][chunk], np.float64
29
+ )
30
+ for view in per_seq["views"].values():
31
+ for k in ["R_c2w", "roll_pitch_yaw"]:
32
+ view[k] = np.array(view[k], np.float32)
33
+ for k in ["chunk_id"]:
34
+ if k in view:
35
+ view.pop(k)
36
+ if "observations" in view:
37
+ view["observations"] = np.array(view["observations"])
38
+ for camera in per_seq["cameras"].values():
39
+ for k in ["params"]:
40
+ camera[k] = np.array(camera[k], np.float32)
41
+ return dump
42
+
43
+
44
+ class MapillaryDataModule(pl.LightningDataModule):
45
+ dump_filename = "dump.json"
46
+ images_archive = "images.tar.gz"
47
+ images_dirname = "images/"
48
+ semantic_masks_dirname = "semantic_masks/"
49
+ flood_dirname = "flood_fill/"
50
+
51
+ def __init__(self, cfg: MIADataConfiguration):
52
+ super().__init__()
53
+ self.cfg = cfg
54
+ self.root = self.cfg.data_dir
55
+ self.local_dir = None
56
+
57
+ def prepare_data(self):
58
+ for scene in self.cfg.scenes:
59
+ dump_dir = self.root / scene
60
+ assert (dump_dir / self.dump_filename).exists(), dump_dir
61
+ # assert (dump_dir / self.cfg.tiles_filename).exists(), dump_dir
62
+ if self.local_dir is None:
63
+ assert (dump_dir / self.images_dirname).exists(), dump_dir
64
+ continue
65
+ assert (dump_dir / self.semantic_masks_dirname).exists(), dump_dir
66
+ assert (dump_dir / self.flood_dirname).exists(), dump_dir
67
+ # Cache the folder of images locally to speed up reading
68
+ local_dir = self.local_dir / scene
69
+ if local_dir.exists():
70
+ shutil.rmtree(local_dir)
71
+ local_dir.mkdir(exist_ok=True, parents=True)
72
+ images_archive = dump_dir / self.images_archive
73
+ logger.info("Extracting the image archive %s.", images_archive)
74
+ with tarfile.open(images_archive) as fp:
75
+ fp.extractall(local_dir)
76
+
77
+ def setup(self, stage: Optional[str] = None):
78
+ self.dumps = {}
79
+ # self.tile_managers = {}
80
+ self.image_dirs = {}
81
+ self.seg_masks_dir = {}
82
+ self.flood_masks_dir = {}
83
+ names = []
84
+
85
+ for scene in self.cfg.scenes:
86
+ logger.info("Loading scene %s.", scene)
87
+ dump_dir = self.root / scene
88
+
89
+ logger.info("Loading dump json file %s.", self.dump_filename)
90
+ with (dump_dir / self.dump_filename).open("r") as fp:
91
+ self.dumps[scene] = pack_dump_dict(json.load(fp))
92
+ for seq, per_seq in self.dumps[scene].items():
93
+ for cam_id, cam_dict in per_seq["cameras"].items():
94
+ if cam_dict["model"] != "PINHOLE":
95
+ raise ValueError(
96
+ f"Unsupported camera model: {cam_dict['model']} for {scene},{seq},{cam_id}"
97
+ )
98
+
99
+ self.image_dirs[scene] = (
100
+ (self.local_dir or self.root) / scene / self.images_dirname
101
+ )
102
+ assert self.image_dirs[scene].exists(), self.image_dirs[scene]
103
+
104
+ self.seg_masks_dir[scene] = (
105
+ (self.local_dir or self.root) / scene / self.semantic_masks_dirname
106
+ )
107
+ assert self.seg_masks_dir[scene].exists(), self.seg_masks_dir[scene]
108
+
109
+ self.flood_masks_dir[scene] = (
110
+ (self.local_dir or self.root) / scene / self.flood_dirname
111
+ )
112
+ assert self.flood_masks_dir[scene].exists(), self.flood_masks_dir[scene]
113
+
114
+ images = set(x.split('.')[0] for x in os.listdir(self.image_dirs[scene]))
115
+ flood_masks = set(x.split('.')[0] for x in os.listdir(self.flood_masks_dir[scene]))
116
+ semantic_masks = set(x.split('.')[0] for x in os.listdir(self.seg_masks_dir[scene]))
117
+
118
+ for seq, data in self.dumps[scene].items():
119
+ for name in data["views"]:
120
+ if name in images and name.split("_")[0] in flood_masks and name.split("_")[0] in semantic_masks:
121
+ names.append((scene, seq, name))
122
+
123
+ self.parse_splits(self.cfg.split, names)
124
+ if self.cfg.filter_for is not None:
125
+ self.filter_elements()
126
+ self.pack_data()
127
+
128
+ def pack_data(self):
129
+ # We pack the data into compact tensors that can be shared across processes without copying
130
+ exclude = {
131
+ "compass_angle",
132
+ "compass_accuracy",
133
+ "gps_accuracy",
134
+ "chunk_key",
135
+ "panorama_offset",
136
+ }
137
+ cameras = {
138
+ scene: {seq: per_seq["cameras"] for seq, per_seq in per_scene.items()}
139
+ for scene, per_scene in self.dumps.items()
140
+ }
141
+ points = {
142
+ scene: {
143
+ seq: {
144
+ i: torch.from_numpy(p) for i, p in per_seq.get("points", {}).items()
145
+ }
146
+ for seq, per_seq in per_scene.items()
147
+ }
148
+ for scene, per_scene in self.dumps.items()
149
+ }
150
+ self.data = {}
151
+
152
+ # TODO: remove
153
+ if self.cfg.split == "splits_MGL_13loc.json":
154
+ # Use Last 20% as Val
155
+ num_samples_to_move = int(len(self.splits['train']) * 0.2)
156
+ samples_to_move = self.splits['train'][-num_samples_to_move:]
157
+ self.splits['val'].extend(samples_to_move)
158
+ self.splits['train'] = self.splits['train'][:-num_samples_to_move]
159
+ print(f"Dataset Len: {len(self.splits['train']), len(self.splits['val'])}\n\n\n\n")
160
+ elif self.cfg.split == "splits_MGL_soma_70k_mappred_random.json":
161
+ for stage, names in self.splits.items():
162
+ print("Length of splits {}: ".format(stage), len(self.splits[stage]))
163
+ for stage, names in self.splits.items():
164
+ view = self.dumps[names[0][0]][names[0][1]]["views"][names[0][2]]
165
+ data = {k: [] for k in view.keys() - exclude}
166
+ for scene, seq, name in names:
167
+ for k in data:
168
+ data[k].append(self.dumps[scene][seq]["views"][name].get(k, None))
169
+ for k in data:
170
+ v = np.array(data[k])
171
+ if np.issubdtype(v.dtype, np.integer) or np.issubdtype(
172
+ v.dtype, np.floating
173
+ ):
174
+ v = torch.from_numpy(v)
175
+ data[k] = v
176
+ data["cameras"] = cameras
177
+ data["points"] = points
178
+ self.data[stage] = data
179
+ self.splits[stage] = np.array(names)
180
+
181
+ def filter_elements(self):
182
+ for stage, names in self.splits.items():
183
+ names_select = []
184
+ for scene, seq, name in names:
185
+ view = self.dumps[scene][seq]["views"][name]
186
+ if self.cfg.filter_for == "ground_plane":
187
+ if not (1.0 <= view["height"] <= 3.0):
188
+ continue
189
+ planes = self.dumps[scene][seq].get("plane")
190
+ if planes is not None:
191
+ inliers = planes[str(view["chunk_id"])][-1]
192
+ if inliers < 10:
193
+ continue
194
+ if self.cfg.filter_by_ground_angle is not None:
195
+ plane = np.array(view["plane_params"])
196
+ normal = plane[:3] / np.linalg.norm(plane[:3])
197
+ angle = np.rad2deg(np.arccos(np.abs(normal[-1])))
198
+ if angle > self.cfg.filter_by_ground_angle:
199
+ continue
200
+ elif self.cfg.filter_for == "pointcloud":
201
+ if len(view["observations"]) < self.cfg.min_num_points:
202
+ continue
203
+ elif self.cfg.filter_for is not None:
204
+ raise ValueError(f"Unknown filtering: {self.cfg.filter_for}")
205
+ names_select.append((scene, seq, name))
206
+ logger.info(
207
+ "%s: Keep %d/%d images after filtering for %s.",
208
+ stage,
209
+ len(names_select),
210
+ len(names),
211
+ self.cfg.filter_for,
212
+ )
213
+ self.splits[stage] = names_select
214
+
215
+ def parse_splits(self, split_arg, names):
216
+ if split_arg is None:
217
+ self.splits = {
218
+ "train": names,
219
+ "val": names,
220
+ }
221
+ elif isinstance(split_arg, int):
222
+ names = np.random.RandomState(self.cfg.seed).permutation(names).tolist()
223
+ self.splits = {
224
+ "train": names[split_arg:],
225
+ "val": names[:split_arg],
226
+ }
227
+ elif isinstance(split_arg, float):
228
+ names = np.random.RandomState(self.cfg.seed).permutation(names).tolist()
229
+ self.splits = {
230
+ "train": names[int(split_arg * len(names)) :],
231
+ "val": names[: int(split_arg * len(names))],
232
+ }
233
+ elif isinstance(split_arg, DictConfig):
234
+ scenes_val = set(split_arg.val)
235
+ scenes_train = set(split_arg.train)
236
+ assert len(scenes_val - set(self.cfg.scenes)) == 0
237
+ assert len(scenes_train - set(self.cfg.scenes)) == 0
238
+ self.splits = {
239
+ "train": [n for n in names if n[0] in scenes_train],
240
+ "val": [n for n in names if n[0] in scenes_val],
241
+ }
242
+ elif isinstance(split_arg, str):
243
+
244
+ if "/" in split_arg:
245
+ split_path = self.root / split_arg
246
+ else:
247
+ split_path = Path(split_arg)
248
+
249
+ with split_path.open("r") as fp:
250
+ splits = json.load(fp)
251
+ splits = {
252
+ k: {loc: set(ids) for loc, ids in split.items()}
253
+ for k, split in splits.items()
254
+ }
255
+ self.splits = {}
256
+
257
+ for k, split in splits.items():
258
+ self.splits[k] = [
259
+ n
260
+ for n in names
261
+ if n[0] in split and int(n[-1].rsplit("_", 1)[0]) in split[n[0]]
262
+ ]
263
+ else:
264
+ raise ValueError(split_arg)
265
+
266
+ def dataset(self, stage: str):
267
+ return MapLocDataset(
268
+ stage,
269
+ self.cfg,
270
+ self.splits[stage],
271
+ self.data[stage],
272
+ self.image_dirs,
273
+ self.seg_masks_dir,
274
+ self.flood_masks_dir,
275
+
276
+ image_ext=".jpg",
277
+ )
278
+
279
+ def sequence_dataset(self, stage: str, **kwargs):
280
+ keys = self.splits[stage]
281
+ seq2indices = defaultdict(list)
282
+ for index, (_, seq, _) in enumerate(keys):
283
+ seq2indices[seq].append(index)
284
+ # chunk the sequences to the required length
285
+ chunk2indices = {}
286
+ for seq, indices in seq2indices.items():
287
+ chunks = chunk_sequence(self.data[stage], indices, **kwargs)
288
+ for i, sub_indices in enumerate(chunks):
289
+ chunk2indices[seq, i] = sub_indices
290
+ # store the index of each chunk in its sequence
291
+ chunk_indices = torch.full((len(keys),), -1)
292
+ for (_, chunk_index), idx in chunk2indices.items():
293
+ chunk_indices[idx] = chunk_index
294
+ self.data[stage]["chunk_index"] = chunk_indices
295
+ dataset = self.dataset(stage)
296
+ return dataset, chunk2indices
297
+
298
+ def sequence_dataloader(self, stage: str, shuffle: bool = False, **kwargs):
299
+ dataset, chunk2idx = self.sequence_dataset(stage, **kwargs)
300
+ chunk_keys = sorted(chunk2idx)
301
+ if shuffle:
302
+ perm = torch.randperm(len(chunk_keys))
303
+ chunk_keys = [chunk_keys[i] for i in perm]
304
+ key_indices = [i for key in chunk_keys for i in chunk2idx[key]]
305
+ num_workers = self.cfg.loading[stage]["num_workers"]
306
+ loader = torchdata.DataLoader(
307
+ dataset,
308
+ batch_size=None,
309
+ sampler=key_indices,
310
+ num_workers=num_workers,
311
+ shuffle=False,
312
+ pin_memory=True,
313
+ persistent_workers=num_workers > 0,
314
+ worker_init_fn=worker_init_fn,
315
+ collate_fn=collate,
316
+ )
317
+ return loader, chunk_keys, chunk2idx
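For reference, when cfg.split is a float, parse_splits shuffles the (scene, sequence, name) keys with the configured seed and holds out that fraction for validation; roughly:

    import numpy as np

    # Hypothetical keys and a 20% validation fraction.
    names = [("sceneA", "seq0", f"frame_{i:03d}") for i in range(10)]
    split_arg, seed = 0.2, 0

    names = np.random.RandomState(seed).permutation(names).tolist()
    n_val = int(split_arg * len(names))
    splits = {"train": names[n_val:], "val": names[:n_val]}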
mapper/data/mapillary/dataset.py ADDED
@@ -0,0 +1,255 @@
1
+ from copy import deepcopy
2
+ from pathlib import Path
3
+ from typing import Any, Dict, List
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.utils.data as torchdata
8
+ import torchvision.transforms as tvf
9
+ from PIL import Image
10
+ from pathlib import Path
11
+
12
+ from ...models.utils import deg2rad, rotmat2d
13
+ from ...utils.io import read_image
14
+ from ...utils.wrappers import Camera
15
+ from ..image import pad_image, rectify_image, resize_image
16
+ from ..utils import decompose_rotmat
17
+ from ..schema import MIADataConfiguration
18
+
19
+
20
+ class MapLocDataset(torchdata.Dataset):
21
+ def __init__(
22
+ self,
23
+ stage: str,
24
+ cfg: MIADataConfiguration,
25
+ names: List[str],
26
+ data: Dict[str, Any],
27
+ image_dirs: Dict[str, Path],
28
+ seg_mask_dirs: Dict[str, Path],
29
+ flood_masks_dirs: Dict[str, Path],
30
+ image_ext: str = "",
31
+ ):
32
+ self.stage = stage
33
+ self.cfg = deepcopy(cfg)
34
+ self.data = data
35
+ self.image_dirs = image_dirs
36
+ self.seg_mask_dirs = seg_mask_dirs
37
+ self.flood_masks_dirs = flood_masks_dirs
38
+ self.names = names
39
+ self.image_ext = image_ext
40
+
41
+ tfs = []
42
+ self.tfs = tvf.Compose(tfs)
43
+ self.augmentations = self.get_augmentations()
44
+
45
+ def __len__(self):
46
+ return len(self.names)
47
+
48
+ def __getitem__(self, idx):
49
+ if self.stage == "train" and self.cfg.random:
50
+ seed = None
51
+ else:
52
+ seed = [self.cfg.seed, idx]
53
+ (seed,) = np.random.SeedSequence(seed).generate_state(1)
54
+
55
+ scene, seq, name = self.names[idx]
56
+
57
+ view = self.get_view(
58
+ idx, scene, seq, name, seed
59
+ )
60
+
61
+ return view
62
+
63
+ def get_augmentations(self):
64
+ if self.stage != "train" or not self.cfg.augmentations.enabled:
65
+ print(f"No Augmentation!", "\n" * 10)
66
+ self.cfg.augmentations.random_flip = 0.0
67
+ return tvf.Compose([])
68
+
69
+ print(f"Augmentation!", "\n" * 10)
70
+ augmentations = [
71
+ tvf.ColorJitter(
72
+ brightness=self.cfg.augmentations.brightness,
73
+ contrast=self.cfg.augmentations.contrast,
74
+ saturation=self.cfg.augmentations.saturation,
75
+ hue=self.cfg.augmentations.hue,
76
+ )
77
+ ]
78
+
79
+ if self.cfg.augmentations.random_resized_crop:
80
+ augmentations.append(
81
+ tvf.RandomResizedCrop(scale=(0.8, 1.0))
82
+ ) # RandomResizedCrop
83
+
84
+ if self.cfg.augmentations.gaussian_noise.enabled:
85
+ augmentations.append(
86
+ tvf.GaussianNoise(
87
+ mean=self.cfg.augmentations.gaussian_noise.mean,
88
+ std=self.cfg.augmentations.gaussian_noise.std,
89
+ )
90
+ ) # Gaussian noise
91
+
92
+ if self.cfg.augmentations.brightness_contrast.enabled:
93
+ augmentations.append(
94
+ tvf.ColorJitter(
95
+ brightness=self.cfg.augmentations.brightness_contrast.brightness_factor,
96
+ contrast=self.cfg.augmentations.brightness_contrast.contrast_factor,
97
+ saturation=0, # Keep saturation at 0 for brightness and contrast adjustment
98
+ hue=0,
99
+ )
100
+ ) # Brightness and contrast adjustment
101
+
102
+ return tvf.Compose(augmentations)
103
+
104
+ def random_flip(self, image, cam, valid, seg_mask, flood_mask, conf_mask):
105
+ if torch.rand(1) < self.cfg.augmentations.random_flip:
106
+ image = torch.flip(image, [-1])
107
+ cam = cam.flip()
108
+ valid = torch.flip(valid, [-1])
109
+ seg_mask = torch.flip(seg_mask, [1])
110
+ flood_mask = torch.flip(flood_mask, [-1])
111
+ conf_mask = torch.flip(conf_mask, [-1])
112
+
113
+ return image, cam, valid, seg_mask, flood_mask, conf_mask
114
+
115
+ def get_view(self, idx, scene, seq, name, seed):
116
+ data = {
117
+ "index": idx,
118
+ "name": name,
119
+ "scene": scene,
120
+ "sequence": seq,
121
+ }
122
+ cam_dict = self.data["cameras"][scene][seq][self.data["camera_id"][idx]]
123
+ cam = Camera.from_dict(cam_dict).float()
124
+
125
+ if "roll_pitch_yaw" in self.data:
126
+ roll, pitch, yaw = self.data["roll_pitch_yaw"][idx].numpy()
127
+ else:
128
+ roll, pitch, yaw = decompose_rotmat(
129
+ self.data["R_c2w"][idx].numpy())
130
+
131
+ image = read_image(self.image_dirs[scene] / (name + self.image_ext))
132
+ image = Image.fromarray(image)
133
+ image = self.augmentations(image)
134
+ image = np.array(image)
135
+
136
+ if "plane_params" in self.data:
137
+ # transform the plane parameters from world to camera frames
138
+ plane_w = self.data["plane_params"][idx]
139
+ data["ground_plane"] = torch.cat(
140
+ [rotmat2d(deg2rad(torch.tensor(yaw)))
141
+ @ plane_w[:2], plane_w[2:]]
142
+ )
143
+
144
+ image, valid, cam, roll, pitch = self.process_image(
145
+ image, cam, roll, pitch, seed
146
+ )
147
+
148
+ if "chunk_index" in self.data: # TODO: (cherie) do we need this?
149
+ data["chunk_id"] = (scene, seq, self.data["chunk_index"][idx])
150
+
151
+ # Semantic map extraction
152
+ seg_mask_path = self.seg_mask_dirs[scene] / \
153
+ (name.split("_")[0] + ".npy")
154
+ seg_masks_ours = np.load(seg_mask_path)
155
+ mask_center = (
156
+ seg_masks_ours.shape[0] // 2, seg_masks_ours.shape[1] // 2)
157
+
158
+ seg_masks_ours = seg_masks_ours[mask_center[0] -
159
+ 100:mask_center[0], mask_center[1] - 50: mask_center[1] + 50]
160
+
161
+ if self.cfg.num_classes == 6:
162
+ seg_masks_ours = seg_masks_ours[..., [0, 1, 2, 4, 6, 7]]
163
+
164
+ flood_mask_path = self.flood_masks_dirs[scene] / \
165
+ (name.split("_")[0] + ".npy")
166
+ flood_mask = np.load(flood_mask_path)
167
+
168
+ flood_mask = flood_mask[mask_center[0]-100:mask_center[0],
169
+ mask_center[1] - 50: mask_center[1] + 50]
170
+
171
+ confidence_map = flood_mask.copy()
172
+ confidence_map = (confidence_map - confidence_map.min()) / \
173
+ (confidence_map.max() - confidence_map.min() + 1e-6)
174
+
175
+ seg_masks_ours = torch.from_numpy(seg_masks_ours).float()
176
+ flood_mask = torch.from_numpy(flood_mask).float()
177
+ confidence_map = torch.from_numpy(confidence_map).float()
178
+
179
+ # Map Augmentations
180
+ with torch.random.fork_rng(devices=[]):
181
+ torch.manual_seed(seed)
182
+ image, cam, valid, seg_masks_ours, flood_mask, confidence_map = self.random_flip(
183
+ image, cam, valid, seg_masks_ours, flood_mask, confidence_map)
184
+
185
+ return {
186
+ **data,
187
+ "image": image,
188
+ "valid": valid,
189
+ "camera": cam,
190
+ "seg_masks": seg_masks_ours,
191
+ "flood_masks": flood_mask,
192
+ "roll_pitch_yaw": torch.tensor((roll, pitch, yaw)).float(),
193
+ "confidence_map": confidence_map
194
+ # "pixels_per_meter": torch.tensor(canvas.ppm).float(),
195
+ }
196
+
197
+ def process_image(self, image, cam, roll, pitch, seed):
198
+ image = (
199
+ torch.from_numpy(np.ascontiguousarray(image))
200
+ .permute(2, 0, 1)
201
+ .float()
202
+ .div_(255)
203
+ )
204
+
205
+ if not self.cfg.gravity_align:
206
+ # Turn off gravity alignment
207
+ roll = 0.0
208
+ pitch = 0.0
209
+ image, valid = rectify_image(image, cam, roll, pitch)
210
+ else:
211
+ image, valid = rectify_image(
212
+ image, cam, roll, pitch if self.cfg.rectify_pitch else None
213
+ )
214
+ roll = 0.0
215
+ if self.cfg.rectify_pitch:
216
+ pitch = 0.0
217
+
218
+ if self.cfg.target_focal_length is not None:
219
+ # Resize to a canonical focal length
220
+ factor = self.cfg.target_focal_length / cam.f.numpy()
221
+ size = (np.array(image.shape[-2:][::-1]) * factor).astype(int)
222
+ image, _, cam, valid = resize_image(
223
+ image, size, camera=cam, valid=valid)
224
+ size_out = self.cfg.resize_image
225
+ if size_out is None:
226
+ # Round the edges up such that they are multiples of a factor
227
+ stride = self.cfg.pad_to_multiple
228
+ size_out = (np.ceil((size / stride)) * stride).astype(int)
229
+ # Crop or pad such that both edges are of the given size
230
+ image, valid, cam = pad_image(
231
+ image, size_out, cam, valid, crop_and_center=True
232
+ )
233
+ elif self.cfg.resize_image is not None:
234
+ image, _, cam, valid = resize_image(
235
+ image, self.cfg.resize_image, fn=max, camera=cam, valid=valid
236
+ )
237
+ if self.cfg.pad_to_square:
238
+ # Pad such that both edges are of the given size
239
+ image, valid, cam = pad_image(
240
+ image, self.cfg.resize_image, cam, valid)
241
+
242
+ if self.cfg.reduce_fov is not None:
243
+ h, w = image.shape[-2:]
244
+ f = float(cam.f[0])
245
+ fov = np.arctan(w / f / 2)
246
+ w_new = round(2 * f * np.tan(self.cfg.reduce_fov * fov))
247
+ image, valid, cam = pad_image(
248
+ image, (w_new, h), cam, valid, crop_and_center=True
249
+ )
250
+
251
+ with torch.random.fork_rng(devices=[]):
252
+ torch.manual_seed(seed)
253
+ image = self.tfs(image)
254
+
255
+ return image, valid, cam, roll, pitch
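In process_image, resizing to a canonical focal length scales the image by target_focal_length / f and, when resize_image is unset, rounds the output size up to a multiple of pad_to_multiple. A numeric sketch with hypothetical values:

    import numpy as np

    target_focal_length = 256.0
    f = np.array([512.0, 512.0])           # current (fx, fy)
    width, height, stride = 1024, 768, 32  # stride = pad_to_multiple

    factor = target_focal_length / f
    size = (np.array([width, height]) * factor).astype(int)     # -> [512, 384]
    size_out = (np.ceil(size / stride) * stride).astype(int)    # -> [512, 384], already multiples of 32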
mapper/data/module.py ADDED
@@ -0,0 +1,64 @@
1
+ from typing import Optional
2
+ from omegaconf import DictConfig
3
+ import pytorch_lightning as L
4
+ import torch.utils.data as torchdata
5
+ from .torch import collate, worker_init_fn
6
+
7
+
8
+ def get_dataset(name):
9
+ if name == "mapillary":
10
+ from .mapillary.data_module import MapillaryDataModule
11
+ return MapillaryDataModule
12
+ elif name == "nuscenes":
13
+ from .nuscenes.data_module import NuScenesData
14
+ return NuScenesData
15
+ elif name == "kitti":
16
+ from .kitti.data_module import BEVKitti360Data
17
+ return BEVKitti360Data
18
+ else:
19
+ raise NotImplementedError(f"Dataset {name} not implemented.")
20
+
21
+
22
+ class GenericDataModule(L.LightningDataModule):
23
+ def __init__(self, cfg: DictConfig):
24
+ super().__init__()
25
+ self.cfg = cfg
26
+ self.data_module = get_dataset(cfg.name)(cfg)
27
+
28
+ def prepare_data(self) -> None:
29
+ self.data_module.prepare_data()
30
+
31
+ def setup(self, stage: Optional[str] = None):
32
+ self.data_module.setup(stage)
33
+
34
+ def dataloader(
35
+ self,
36
+ stage: str,
37
+ shuffle: bool = False,
38
+ num_workers: int = None,
39
+ sampler: Optional[torchdata.Sampler] = None,
40
+ ):
41
+ dataset = self.data_module.dataset(stage)
42
+ cfg = self.cfg["loading"][stage]
43
+ num_workers = cfg["num_workers"] if num_workers is None else num_workers
44
+ loader = torchdata.DataLoader(
45
+ dataset,
46
+ batch_size=cfg["batch_size"],
47
+ num_workers=num_workers,
48
+ shuffle=shuffle or (stage == "train"),
49
+ pin_memory=True,
50
+ persistent_workers=num_workers > 0,
51
+ worker_init_fn=worker_init_fn,
52
+ collate_fn=collate,
53
+ sampler=sampler,
54
+ )
55
+ return loader
56
+
57
+ def train_dataloader(self, **kwargs):
58
+ return self.dataloader("train", **kwargs)
59
+
60
+ def val_dataloader(self, **kwargs):
61
+ return self.dataloader("val", **kwargs)
62
+
63
+ def test_dataloader(self, **kwargs):
64
+ return self.dataloader("test", **kwargs)
mapper/data/nuscenes/data_module.py ADDED
@@ -0,0 +1,33 @@
1
+ from ..base import DataBase
2
+ from .dataset import NuScenesDataset
3
+ from ..schema import NuScenesDataConfiguration
4
+
5
+ class NuScenesData(DataBase):
6
+ def __init__(self, cfg: NuScenesDataConfiguration):
7
+ self.cfg = cfg
8
+ self._dataset = {}
9
+
10
+ def prepare_data(self):
11
+ pass
12
+
13
+ def setup(self, stage):
14
+ if stage is None:
15
+ stage = 'fit'
16
+
17
+ split = {
18
+ 'fit': 'train',
19
+ 'val': 'val',
20
+ 'validate': 'val',
21
+ 'test': 'test'
22
+ }[stage]
23
+
24
+ self._dataset[split] = NuScenesDataset(
25
+ split=split,
26
+ cfg=self.cfg
27
+ )
28
+
29
+ def dataset(self, stage):
30
+ if self._dataset.get(stage) is None:
31
+ self.setup(stage)
32
+
33
+ return self._dataset[stage]
mapper/data/nuscenes/dataset.py ADDED
@@ -0,0 +1,207 @@
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from pyquaternion import Quaternion
5
+ from nuscenes.nuscenes import NuScenes
6
+ from itertools import chain
7
+ from PIL import Image
8
+ from torchvision import transforms as T
9
+ import torchvision.transforms as tvf
10
+ from torchvision.transforms.functional import to_tensor
11
+
12
+ from .splits_roddick import create_splits_scenes_roddick
13
+ from ..image import pad_image, rectify_image, resize_image
14
+ from .utils import decode_binary_labels
15
+ from ..utils import decompose_rotmat
16
+ from ...utils.io import read_image
17
+ from ...utils.wrappers import Camera
18
+ from ..schema import NuScenesDataConfiguration
19
+
20
+
21
+ class NuScenesDataset(torch.utils.data.Dataset):
22
+ def __init__(self, cfg: NuScenesDataConfiguration, split="train"):
23
+
24
+ self.cfg = cfg
25
+ self.nusc = NuScenes(version=cfg.version, dataroot=str(cfg.data_dir))
26
+ self.map_data_root = cfg.map_dir
27
+ self.split = split
28
+
29
+ self.scenes = create_splits_scenes_roddick() # custom based on Roddick et al.
30
+
31
+ scene_split = {
32
+ 'v1.0-trainval': {'train': 'train', 'val': 'val', 'test': 'val'},
33
+ 'v1.0-mini': {'train': 'mini_train', 'val': 'mini_val'},
34
+ }[cfg.version][split]
35
+ self.scenes = self.scenes[scene_split]
36
+ self.sample = list(filter(lambda sample: self.nusc.get(
37
+ 'scene', sample['scene_token'])['name'] in self.scenes, self.nusc.sample))
38
+
39
+ self.tfs = self.get_augmentations() if split == "train" else T.Compose([])
40
+
41
+ data_tokens = []
42
+ for sample in self.sample:
43
+ data_token = sample['data']
44
+ data_token = [v for k,v in data_token.items() if k == "CAM_FRONT"]
45
+
46
+ data_tokens.append(data_token)
47
+
48
+ data_tokens = list(chain.from_iterable(data_tokens))
49
+ data = [self.nusc.get('sample_data', token) for token in data_tokens]
50
+
51
+ self.data = []
52
+ for d in data:
53
+ sample = self.nusc.get('sample', d['sample_token'])
54
+ scene = self.nusc.get('scene', sample['scene_token'])
55
+ location = self.nusc.get('log', scene['log_token'])['location']
56
+
57
+ file_name = d['filename']
58
+ ego_pose = self.nusc.get('ego_pose', d['ego_pose_token'])
59
+ calibrated_sensor = self.nusc.get(
60
+ "calibrated_sensor", d['calibrated_sensor_token'])
61
+
62
+ ego2global = np.eye(4).astype(np.float32)
63
+ ego2global[:3, :3] = Quaternion(ego_pose['rotation']).rotation_matrix
64
+ ego2global[:3, 3] = ego_pose['translation']
65
+
66
+ sensor2ego = np.eye(4).astype(np.float32)
67
+ sensor2ego[:3, :3] = Quaternion(
68
+ calibrated_sensor['rotation']).rotation_matrix
69
+ sensor2ego[:3, 3] = calibrated_sensor['translation']
70
+
71
+ sensor2global = ego2global @ sensor2ego
72
+
73
+ rotation = sensor2global[:3, :3]
74
+ roll, pitch, yaw = decompose_rotmat(rotation)
75
+
76
+ fx = calibrated_sensor['camera_intrinsic'][0][0]
77
+ fy = calibrated_sensor['camera_intrinsic'][1][1]
78
+ cx = calibrated_sensor['camera_intrinsic'][0][2]
79
+ cy = calibrated_sensor['camera_intrinsic'][1][2]
80
+ width = d['width']
81
+ height = d['height']
82
+
83
+ cam = Camera(torch.tensor(
84
+ [width, height, fx, fy, cx - 0.5, cy - 0.5])).float()
85
+ self.data.append({
86
+ 'filename': file_name,
87
+ 'yaw': yaw,
88
+ 'pitch': pitch,
89
+ 'roll': roll,
90
+ 'cam': cam,
91
+ 'sensor2global': sensor2global,
92
+ 'token': d['token'],
93
+ 'sample_token': d['sample_token'],
94
+ 'location': location
95
+ })
96
+
97
+ if self.cfg.percentage < 1.0 and split == "train":
98
+ self.data = self.data[:int(len(self.data) * self.cfg.percentage)]
99
+
100
+ def get_augmentations(self):
101
+
102
+ print(f"Augmentation!", "\n" * 10)
103
+ augmentations = [
104
+ tvf.ColorJitter(
105
+ brightness=self.cfg.augmentations.brightness,
106
+ contrast=self.cfg.augmentations.contrast,
107
+ saturation=self.cfg.augmentations.saturation,
108
+ hue=self.cfg.augmentations.hue,
109
+ )
110
+ ]
111
+
112
+ if self.cfg.augmentations.random_resized_crop:
113
+ augmentations.append(
114
+ tvf.RandomResizedCrop(scale=(0.8, 1.0))
115
+ ) # RandomResizedCrop
116
+
117
+ if self.cfg.augmentations.gaussian_noise.enabled:
118
+ augmentations.append(
119
+ tvf.GaussianNoise(
120
+ mean=self.cfg.augmentations.gaussian_noise.mean,
121
+ std=self.cfg.augmentations.gaussian_noise.std,
122
+ )
123
+ ) # Gaussian noise
124
+
125
+ if self.cfg.augmentations.brightness_contrast.enabled:
126
+ augmentations.append(
127
+ tvf.ColorJitter(
128
+ brightness=self.cfg.augmentations.brightness_contrast.brightness_factor,
129
+ contrast=self.cfg.augmentations.brightness_contrast.contrast_factor,
130
+ saturation=0, # Keep saturation at 0 for brightness and contrast adjustment
131
+ hue=0,
132
+ )
133
+ ) # Brightness and contrast adjustment
134
+
135
+ return tvf.Compose(augmentations)
136
+
137
+ def __len__(self):
138
+ return len(self.data)
139
+
140
+ def __getitem__(self, idx):
141
+ d = self.data[idx]
142
+
143
+ image = read_image(os.path.join(self.nusc.dataroot, d['filename']))
144
+ image = np.array(image)
145
+ cam = d['cam']
146
+ roll = d['roll']
147
+ pitch = d['pitch']
148
+ yaw = d['yaw']
149
+
150
+ with Image.open(self.map_data_root / f"{d['token']}.png") as semantic_image:
151
+ semantic_mask = to_tensor(semantic_image)
152
+
153
+ semantic_mask = decode_binary_labels(semantic_mask, self.cfg.num_classes + 1)
154
+ semantic_mask = torch.nn.functional.max_pool2d(semantic_mask.float(), (2, 2), stride=2) # 2 times downsample
155
+ semantic_mask = semantic_mask.permute(1, 2, 0)
156
+ semantic_mask = torch.flip(semantic_mask, [0])
157
+
158
+ visibility_mask = semantic_mask[..., -1]
159
+ semantic_mask = semantic_mask[..., :-1]
160
+
161
+ if self.cfg.class_mapping is not None:
162
+ semantic_mask = semantic_mask[..., self.cfg.class_mapping]
163
+
164
+ image = (
165
+ torch.from_numpy(np.ascontiguousarray(image))
166
+ .permute(2, 0, 1)
167
+ .float()
168
+ .div_(255)
169
+ )
170
+
171
+ if not self.cfg.gravity_align:
172
+ # Turn off gravity alignment
173
+ roll = 0.0
174
+ pitch = 0.0
175
+ image, valid = rectify_image(image, cam, roll, pitch)
176
+
177
+ else:
178
+ image, valid = rectify_image(
179
+ image, cam, roll, pitch if self.cfg.rectify_pitch else None
180
+ )
181
+ roll = 0.0
182
+ if self.cfg.rectify_pitch:
183
+ pitch = 0.0
184
+ if self.cfg.resize_image is not None:
185
+ image, _, cam, valid = resize_image(
186
+ image, self.cfg.resize_image, fn=max, camera=cam, valid=valid
187
+ )
188
+ if self.cfg.pad_to_square:
189
+ image, valid, cam = pad_image(image, self.cfg.resize_image, cam, valid)
190
+ image = self.tfs(image)
191
+
192
+ confidence_map = visibility_mask.clone().float()
193
+ confidence_map = (confidence_map - confidence_map.min()) / (confidence_map.max() - confidence_map.min())
194
+
195
+ return {
196
+ "image": image,
197
+ "roll_pitch_yaw": torch.tensor([roll, pitch, yaw]).float(),
198
+ "camera": cam,
199
+ "valid": valid,
200
+ "seg_masks": semantic_mask.float(),
201
+ "token": d['token'],
202
+ "sample_token": d['sample_token'],
203
+ 'location': d['location'],
204
+ 'flood_masks': visibility_mask.float(),
205
+ "confidence_map": confidence_map,
206
+ 'name': d['sample_token']
207
+ }
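The per-sample label PNGs are bit-packed: decode_binary_labels (defined in mapper/data/nuscenes/utils.py below) unpacks each pixel into one boolean channel per class. A small sketch with hypothetical values:

    import torch

    packed = torch.tensor([[[0b0000, 0b0101],
                            [0b0011, 0b1000]]])    # 1 x 2 x 2 bit-packed label image
    nclass = 4

    bits = torch.pow(2, torch.arange(nclass))
    decoded = (packed & bits.view(-1, 1, 1)) > 0    # -> nclass x 2 x 2 boolean masks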
mapper/data/nuscenes/splits_roddick.py ADDED
@@ -0,0 +1,197 @@
1
+ def create_splits_scenes_roddick():
2
+ train_roddick_scenes = [
3
+ "scene-0002", "scene-0003", "scene-0004", "scene-0005", "scene-0006",
4
+ "scene-0007", "scene-0008", "scene-0009", "scene-0012", "scene-0013",
5
+ "scene-0014", "scene-0015", "scene-0016", "scene-0017", "scene-0018",
6
+ "scene-0019", "scene-0021", "scene-0022", "scene-0023", "scene-0024",
7
+ "scene-0025", "scene-0026", "scene-0027", "scene-0028", "scene-0029",
8
+ "scene-0030", "scene-0031", "scene-0032", "scene-0033", "scene-0034",
9
+ "scene-0035", "scene-0036", "scene-0039", "scene-0042", "scene-0043",
10
+ "scene-0044", "scene-0045", "scene-0046", "scene-0047", "scene-0048",
11
+ "scene-0049", "scene-0050", "scene-0051", "scene-0052", "scene-0055",
12
+ "scene-0056", "scene-0057", "scene-0058", "scene-0059", "scene-0060",
13
+ "scene-0061", "scene-0062", "scene-0063", "scene-0064", "scene-0065",
14
+ "scene-0066", "scene-0067", "scene-0068", "scene-0069", "scene-0070",
15
+ "scene-0071", "scene-0072", "scene-0073", "scene-0074", "scene-0075",
16
+ "scene-0076", "scene-0092", "scene-0093", "scene-0094", "scene-0095",
17
+ "scene-0096", "scene-0097", "scene-0098", "scene-0099", "scene-0100",
18
+ "scene-0101", "scene-0102", "scene-0103", "scene-0104", "scene-0105",
19
+ "scene-0106", "scene-0107", "scene-0108", "scene-0109", "scene-0110",
20
+ "scene-0120", "scene-0123", "scene-0124", "scene-0125", "scene-0126",
21
+ "scene-0127", "scene-0128", "scene-0129", "scene-0130", "scene-0131",
22
+ "scene-0132", "scene-0133", "scene-0134", "scene-0135", "scene-0138",
23
+ "scene-0149", "scene-0150", "scene-0151", "scene-0154", "scene-0155",
24
+ "scene-0157", "scene-0158", "scene-0159", "scene-0161", "scene-0162",
25
+ "scene-0163", "scene-0164", "scene-0165", "scene-0166", "scene-0167",
26
+ "scene-0168", "scene-0170", "scene-0171", "scene-0172", "scene-0173",
27
+ "scene-0174", "scene-0175", "scene-0176", "scene-0177", "scene-0178",
28
+ "scene-0179", "scene-0180", "scene-0181", "scene-0182", "scene-0183",
29
+ "scene-0185", "scene-0187", "scene-0188", "scene-0190", "scene-0191",
30
+ "scene-0192", "scene-0193", "scene-0194", "scene-0195", "scene-0196",
31
+ "scene-0199", "scene-0200", "scene-0202", "scene-0203", "scene-0204",
32
+ "scene-0206", "scene-0207", "scene-0208", "scene-0209", "scene-0210",
33
+ "scene-0211", "scene-0212", "scene-0213", "scene-0214", "scene-0218",
34
+ "scene-0219", "scene-0220", "scene-0221", "scene-0222", "scene-0224",
35
+ "scene-0225", "scene-0226", "scene-0227", "scene-0228", "scene-0229",
36
+ "scene-0230", "scene-0231", "scene-0232", "scene-0233", "scene-0234",
37
+ "scene-0235", "scene-0236", "scene-0237", "scene-0238", "scene-0239",
38
+ "scene-0240", "scene-0241", "scene-0242", "scene-0243", "scene-0244",
39
+ "scene-0245", "scene-0246", "scene-0247", "scene-0248", "scene-0249",
40
+ "scene-0250", "scene-0251", "scene-0252", "scene-0253", "scene-0254",
41
+ "scene-0255", "scene-0256", "scene-0257", "scene-0258", "scene-0259",
42
+ "scene-0260", "scene-0261", "scene-0262", "scene-0263", "scene-0264",
43
+ "scene-0268", "scene-0270", "scene-0271", "scene-0272", "scene-0273",
44
+ "scene-0274", "scene-0275", "scene-0276", "scene-0277", "scene-0278",
45
+ "scene-0283", "scene-0284", "scene-0285", "scene-0286", "scene-0287",
46
+ "scene-0288", "scene-0289", "scene-0290", "scene-0291", "scene-0292",
47
+ "scene-0293", "scene-0294", "scene-0295", "scene-0296", "scene-0297",
48
+ "scene-0298", "scene-0299", "scene-0300", "scene-0301", "scene-0302",
49
+ "scene-0303", "scene-0304", "scene-0305", "scene-0306", "scene-0315",
50
+ "scene-0316", "scene-0317", "scene-0318", "scene-0321", "scene-0323",
51
+ "scene-0324", "scene-0328", "scene-0329", "scene-0330", "scene-0331",
52
+ "scene-0332", "scene-0344", "scene-0345", "scene-0346", "scene-0349",
53
+ "scene-0350", "scene-0351", "scene-0352", "scene-0353", "scene-0354",
54
+ "scene-0355", "scene-0356", "scene-0357", "scene-0358", "scene-0359",
55
+ "scene-0360", "scene-0361", "scene-0362", "scene-0363", "scene-0364",
56
+ "scene-0365", "scene-0367", "scene-0370", "scene-0371", "scene-0372",
57
+ "scene-0373", "scene-0374", "scene-0375", "scene-0376", "scene-0377",
58
+ "scene-0379", "scene-0380", "scene-0381", "scene-0382", "scene-0383",
59
+ "scene-0384", "scene-0385", "scene-0386", "scene-0388", "scene-0399",
60
+ "scene-0400", "scene-0401", "scene-0402", "scene-0403", "scene-0405",
61
+ "scene-0406", "scene-0407", "scene-0408", "scene-0420", "scene-0421",
62
+ "scene-0422", "scene-0423", "scene-0424", "scene-0425", "scene-0426",
63
+ "scene-0427", "scene-0428", "scene-0429", "scene-0430", "scene-0431",
64
+ "scene-0432", "scene-0433", "scene-0434", "scene-0435", "scene-0436",
65
+ "scene-0437", "scene-0438", "scene-0439", "scene-0440", "scene-0441",
66
+ "scene-0442", "scene-0443", "scene-0444", "scene-0445", "scene-0446",
67
+ "scene-0447", "scene-0448", "scene-0449", "scene-0450", "scene-0451",
68
+ "scene-0452", "scene-0453", "scene-0454", "scene-0455", "scene-0456",
69
+ "scene-0457", "scene-0458", "scene-0459", "scene-0461", "scene-0462",
70
+ "scene-0463", "scene-0464", "scene-0465", "scene-0467", "scene-0468",
71
+ "scene-0469", "scene-0471", "scene-0472", "scene-0474", "scene-0475",
72
+ "scene-0476", "scene-0477", "scene-0478", "scene-0479", "scene-0480",
73
+ "scene-0499", "scene-0500", "scene-0501", "scene-0502", "scene-0504",
74
+ "scene-0505", "scene-0506", "scene-0507", "scene-0508", "scene-0509",
75
+ "scene-0510", "scene-0511", "scene-0512", "scene-0513", "scene-0514",
76
+ "scene-0515", "scene-0517", "scene-0518", "scene-0519", "scene-0520",
77
+ "scene-0521", "scene-0522", "scene-0523", "scene-0524", "scene-0552",
78
+ "scene-0553", "scene-0554", "scene-0555", "scene-0559", "scene-0560",
79
+ "scene-0561", "scene-0562", "scene-0563", "scene-0564", "scene-0565",
80
+ "scene-0584", "scene-0585", "scene-0586", "scene-0587", "scene-0588",
81
+ "scene-0589", "scene-0590", "scene-0591", "scene-0592", "scene-0593",
82
+ "scene-0594", "scene-0595", "scene-0596", "scene-0597", "scene-0598",
83
+ "scene-0599", "scene-0600", "scene-0625", "scene-0626", "scene-0627",
84
+ "scene-0629", "scene-0630", "scene-0632", "scene-0633", "scene-0634",
85
+ "scene-0635", "scene-0636", "scene-0637", "scene-0638", "scene-0639",
86
+ "scene-0640", "scene-0652", "scene-0653", "scene-0654", "scene-0655",
87
+ "scene-0656", "scene-0657", "scene-0658", "scene-0659", "scene-0660",
88
+ "scene-0661", "scene-0662", "scene-0663", "scene-0664", "scene-0665",
89
+ "scene-0666", "scene-0667", "scene-0668", "scene-0669", "scene-0670",
90
+ "scene-0671", "scene-0672", "scene-0673", "scene-0674", "scene-0675",
91
+ "scene-0676", "scene-0677", "scene-0678", "scene-0679", "scene-0681",
92
+ "scene-0683", "scene-0684", "scene-0685", "scene-0686", "scene-0687",
93
+ "scene-0688", "scene-0689", "scene-0695", "scene-0696", "scene-0697",
94
+ "scene-0698", "scene-0700", "scene-0701", "scene-0703", "scene-0704",
95
+ "scene-0705", "scene-0706", "scene-0707", "scene-0708", "scene-0709",
96
+ "scene-0710", "scene-0711", "scene-0712", "scene-0713", "scene-0714",
97
+ "scene-0715", "scene-0716", "scene-0717", "scene-0718", "scene-0719",
98
+ "scene-0726", "scene-0727", "scene-0728", "scene-0730", "scene-0731",
99
+ "scene-0733", "scene-0734", "scene-0735", "scene-0736", "scene-0737",
100
+ "scene-0738", "scene-0780", "scene-0781", "scene-0782", "scene-0783",
101
+ "scene-0784", "scene-0786", "scene-0787", "scene-0789", "scene-0790",
102
+ "scene-0791", "scene-0792", "scene-0802", "scene-0806", "scene-0808",
103
+ "scene-0809", "scene-0810", "scene-0811", "scene-0812", "scene-0813",
104
+ "scene-0815", "scene-0816", "scene-0817", "scene-0819", "scene-0820",
105
+ "scene-0821", "scene-0822", "scene-0847", "scene-0848", "scene-0849",
106
+ "scene-0850", "scene-0851", "scene-0852", "scene-0853", "scene-0854",
107
+ "scene-0855", "scene-0856", "scene-0858", "scene-0860", "scene-0861",
108
+ "scene-0862", "scene-0863", "scene-0864", "scene-0865", "scene-0866",
109
+ "scene-0868", "scene-0869", "scene-0870", "scene-0871", "scene-0872",
110
+ "scene-0873", "scene-0875", "scene-0876", "scene-0877", "scene-0878",
111
+ "scene-0880", "scene-0882", "scene-0883", "scene-0884", "scene-0885",
112
+ "scene-0886", "scene-0887", "scene-0888", "scene-0889", "scene-0890",
113
+ "scene-0891", "scene-0892", "scene-0893", "scene-0894", "scene-0895",
114
+ "scene-0896", "scene-0897", "scene-0898", "scene-0899", "scene-0900",
115
+ "scene-0901", "scene-0902", "scene-0903", "scene-0904", "scene-0905",
116
+ "scene-0906", "scene-0907", "scene-0908", "scene-0909", "scene-0916",
117
+ "scene-0917", "scene-0921", "scene-0922", "scene-0923", "scene-0925",
118
+ "scene-0926", "scene-0927", "scene-0928", "scene-0929", "scene-0930",
119
+ "scene-0931", "scene-0945", "scene-0947", "scene-0949", "scene-0952",
120
+ "scene-0953", "scene-0955", "scene-0956", "scene-0957", "scene-0958",
121
+ "scene-0959", "scene-0960", "scene-0961", "scene-0966", "scene-0967",
122
+ "scene-0968", "scene-0969", "scene-0971", "scene-0972", "scene-0975",
123
+ "scene-0976", "scene-0977", "scene-0978", "scene-0979", "scene-0980",
124
+ "scene-0981", "scene-0982", "scene-0983", "scene-0984", "scene-0988",
125
+ "scene-0989", "scene-0990", "scene-0991", "scene-0992", "scene-0994",
126
+ "scene-0995", "scene-0996", "scene-0997", "scene-0998", "scene-0999",
127
+ "scene-1000", "scene-1001", "scene-1004", "scene-1005", "scene-1006",
128
+ "scene-1007", "scene-1008", "scene-1009", "scene-1010", "scene-1011",
129
+ "scene-1012", "scene-1013", "scene-1014", "scene-1015", "scene-1019",
130
+ "scene-1020", "scene-1021", "scene-1022", "scene-1023", "scene-1024",
131
+ "scene-1025", "scene-1044", "scene-1045", "scene-1046", "scene-1047",
132
+ "scene-1048", "scene-1049", "scene-1050", "scene-1051", "scene-1052",
133
+ "scene-1053", "scene-1054", "scene-1064", "scene-1065", "scene-1066",
134
+ "scene-1067", "scene-1068", "scene-1069", "scene-1070", "scene-1071",
135
+ "scene-1072", "scene-1073", "scene-1074", "scene-1075", "scene-1076",
136
+ "scene-1077", "scene-1078", "scene-1079", "scene-1080", "scene-1081",
137
+ "scene-1082", "scene-1083", "scene-1084", "scene-1085", "scene-1086",
138
+ "scene-1087", "scene-1088", "scene-1089", "scene-1090", "scene-1091",
139
+ "scene-1092", "scene-1093", "scene-1094", "scene-1095", "scene-1096",
140
+ "scene-1097", "scene-1098", "scene-1099", "scene-1100", "scene-1101",
141
+ "scene-1102", "scene-1104", "scene-1105", "scene-1106", "scene-1107",
142
+ "scene-1108", "scene-1109", "scene-1110"]
143
+
144
+ val_roddick_scenes = [
145
+ "scene-0001", "scene-0010", "scene-0011", "scene-0020", "scene-0038",
146
+ "scene-0041", "scene-0053", "scene-0054", "scene-0121", "scene-0122",
147
+ "scene-0139", "scene-0152", "scene-0160", "scene-0184", "scene-0269",
148
+ "scene-0347", "scene-0348", "scene-0366", "scene-0368", "scene-0369",
149
+ "scene-0378", "scene-0389", "scene-0390", "scene-0391", "scene-0392",
150
+ "scene-0393", "scene-0394", "scene-0395", "scene-0396", "scene-0397",
151
+ "scene-0398", "scene-0411", "scene-0412", "scene-0413", "scene-0414",
152
+ "scene-0415", "scene-0416", "scene-0417", "scene-0418", "scene-0419",
153
+ "scene-0525", "scene-0526", "scene-0527", "scene-0528", "scene-0529",
154
+ "scene-0530", "scene-0531", "scene-0532", "scene-0533", "scene-0534",
155
+ "scene-0535", "scene-0536", "scene-0537", "scene-0538", "scene-0539",
156
+ "scene-0541", "scene-0542", "scene-0543", "scene-0544", "scene-0545",
157
+ "scene-0546", "scene-0556", "scene-0557", "scene-0558", "scene-0566",
158
+ "scene-0568", "scene-0570", "scene-0571", "scene-0572", "scene-0573",
159
+ "scene-0574", "scene-0575", "scene-0576", "scene-0577", "scene-0578",
160
+ "scene-0580", "scene-0582", "scene-0583", "scene-0642", "scene-0643",
161
+ "scene-0644", "scene-0645", "scene-0646", "scene-0647", "scene-0648",
162
+ "scene-0649", "scene-0650", "scene-0651", "scene-0739", "scene-0740",
163
+ "scene-0741", "scene-0744", "scene-0746", "scene-0747", "scene-0749",
164
+ "scene-0750", "scene-0751", "scene-0752", "scene-0757", "scene-0758",
165
+ "scene-0759", "scene-0760", "scene-0761", "scene-0762", "scene-0763",
166
+ "scene-0764", "scene-0765", "scene-0767", "scene-0768", "scene-0769",
167
+ "scene-0770", "scene-0771", "scene-0775", "scene-0777", "scene-0778",
168
+ "scene-0794", "scene-0795", "scene-0796", "scene-0797", "scene-0798",
169
+ "scene-0799", "scene-0800", "scene-0803", "scene-0804", "scene-0911",
170
+ "scene-0912", "scene-0913", "scene-0914", "scene-0915", "scene-0919",
171
+ "scene-0920", "scene-0924", "scene-0962", "scene-0963", "scene-1002",
172
+ "scene-1003", "scene-1016", "scene-1017", "scene-1018", "scene-1055",
173
+ "scene-1056", "scene-1057", "scene-1058", "scene-1059", "scene-1060",
174
+ "scene-1061", "scene-1062", "scene-1063"]
175
+
176
+
177
+ calibration_roddick_scenes = [
178
+ "scene-0852", "scene-0429", "scene-0956", "scene-0194", "scene-0811",
179
+ "scene-1110", "scene-1107", "scene-0294", "scene-0900", "scene-0596",
180
+ "scene-0296", "scene-0885", "scene-0866", "scene-0105", "scene-0782",
181
+ "scene-0191", "scene-0876", "scene-0133", "scene-0231", "scene-0847",
182
+ "scene-0363", "scene-0026", "scene-0791", "scene-0909", "scene-0002",
183
+ "scene-0283", "scene-0007", "scene-0251", "scene-1100", "scene-0668",
184
+ "scene-0584", "scene-0287", "scene-0260", "scene-0171", "scene-0789",
185
+ "scene-0108", "scene-0190", "scene-0206", "scene-0635", "scene-0815",
186
+ "scene-0058", "scene-0710", "scene-0302", "scene-0639", "scene-0166",
187
+ "scene-0094", "scene-0735", "scene-0321", "scene-1091", "scene-0344"
188
+ ]
189
+
190
+
191
+ scenes_dict = {
192
+ "train": train_roddick_scenes,
193
+ "val": val_roddick_scenes,
194
+ "calibration": calibration_roddick_scenes
195
+ }
196
+
197
+ return scenes_dict
mapper/data/nuscenes/utils.py ADDED
@@ -0,0 +1,214 @@
1
+ import os
2
+ import numpy as np
3
+ from shapely import geometry, affinity
4
+ from pyquaternion import Quaternion
5
+ import cv2
6
+
7
+ from nuscenes.eval.detection.utils import category_to_detection_name
8
+ from nuscenes.eval.detection.constants import DETECTION_NAMES
9
+ from nuscenes.utils.data_classes import LidarPointCloud
10
+
11
+ from nuscenes.map_expansion.map_api import NuScenesMap
12
+ from shapely.strtree import STRtree
13
+ from collections import OrderedDict
14
+ import torch
15
+
16
+ def decode_binary_labels(labels, nclass):
17
+ bits = torch.pow(2, torch.arange(nclass))
18
+ return (labels & bits.view(-1, 1, 1)) > 0
19
+
20
+ def transform_polygon(polygon, affine):
21
+ """
22
+ Transform a 2D polygon
23
+ """
24
+ a, b, tx, c, d, ty = affine.flatten()[:6]
25
+ return affinity.affine_transform(polygon, [a, b, c, d, tx, ty])
26
+
27
+
28
+ def render_polygon(mask, polygon, extents, resolution, value=1):
29
+ if len(polygon) == 0:
30
+ return
31
+ polygon = (polygon - np.array(extents[:2])) / resolution
32
+ polygon = np.ascontiguousarray(polygon).round().astype(np.int32)
33
+ cv2.fillConvexPoly(mask, polygon, value)
34
+
35
+ def transform(matrix, vectors):
36
+ vectors = np.dot(matrix[:-1, :-1], vectors.T)
37
+ vectors = vectors.T + matrix[:-1, -1]
38
+ return vectors
39
+
40
+ CAMERA_NAMES = ['CAM_FRONT', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT',
41
+ 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', 'CAM_BACK']
42
+
43
+ NUSCENES_CLASS_NAMES = [
44
+ 'drivable_area', 'ped_crossing', 'walkway', 'carpark', 'car', 'truck',
45
+ 'bus', 'trailer', 'construction_vehicle', 'pedestrian', 'motorcycle',
46
+ 'bicycle', 'traffic_cone', 'barrier'
47
+ ]
48
+
49
+ STATIC_CLASSES = ['drivable_area', 'ped_crossing', 'walkway', 'carpark_area']
50
+
51
+ LOCATIONS = ['boston-seaport', 'singapore-onenorth', 'singapore-queenstown',
52
+ 'singapore-hollandvillage']
53
+
54
+ def load_map_data(dataroot, location):
55
+
56
+ # Load the NuScenes map object
57
+ nusc_map = NuScenesMap(dataroot, location)
58
+
59
+ map_data = OrderedDict()
60
+ for layer in STATIC_CLASSES:
61
+
62
+ # Retrieve all data associated with the current layer
63
+ records = getattr(nusc_map, layer)
64
+ polygons = list()
65
+
66
+ # Drivable area records can contain multiple polygons
67
+ if layer == 'drivable_area':
68
+ for record in records:
69
+
70
+ # Convert each entry in the record into a shapely object
71
+ for token in record['polygon_tokens']:
72
+ poly = nusc_map.extract_polygon(token)
73
+ if poly.is_valid:
74
+ polygons.append(poly)
75
+ else:
76
+ for record in records:
77
+
78
+ # Convert each entry in the record into a shapely object
79
+ poly = nusc_map.extract_polygon(record['polygon_token'])
80
+ if poly.is_valid:
81
+ polygons.append(poly)
82
+
83
+
84
+ # Store as an R-Tree for fast intersection queries
85
+ map_data[layer] = STRtree(polygons)
86
+
87
+ return map_data
88
+
89
+ def iterate_samples(nuscenes, start_token):
90
+ sample_token = start_token
91
+ while sample_token != '':
92
+ sample = nuscenes.get('sample', sample_token)
93
+ yield sample
94
+ sample_token = sample['next']
95
+
96
+
97
+ def get_map_masks(nuscenes, map_data, sample_data, extents, resolution):
98
+
99
+ # Render each layer sequentially
100
+ layers = [get_layer_mask(nuscenes, polys, sample_data, extents,
101
+ resolution) for layer, polys in map_data.items()]
102
+
103
+ return np.stack(layers, axis=0)
104
+
105
+
106
+ def get_layer_mask(nuscenes, polygons, sample_data, extents, resolution):
107
+
108
+ # Get the 2D affine transform from bev coords to map coords
109
+ tfm = get_sensor_transform(nuscenes, sample_data)[[0, 1, 3]][:, [0, 2, 3]]
110
+ inv_tfm = np.linalg.inv(tfm)
111
+
112
+ # Create a patch representing the birds-eye-view region in map coordinates
113
+ map_patch = geometry.box(*extents)
114
+ map_patch = transform_polygon(map_patch, tfm)
115
+
116
+ # Initialise the map mask
117
+ x1, z1, x2, z2 = extents
118
+ mask = np.zeros((int((z2 - z1) / resolution), int((x2 - x1) / resolution)),
119
+ dtype=np.uint8)
120
+
121
+ # Find all polygons which intersect with the area of interest
122
+ for polygon in polygons.query(map_patch):
123
+
124
+ polygon = polygon.intersection(map_patch)
125
+
126
+ # Transform into map coordinates
127
+ polygon = transform_polygon(polygon, inv_tfm)
128
+
129
+ # Render the polygon to the mask
130
+ render_shapely_polygon(mask, polygon, extents, resolution)
131
+
132
+ return mask
133
+
134
+
135
+
136
+
137
+ def get_object_masks(nuscenes, sample_data, extents, resolution):
138
+
139
+ # Initialize object masks
140
+ nclass = len(DETECTION_NAMES) + 1 # extra channel collects boxes outside the detection classes
141
+ grid_width = int((extents[2] - extents[0]) / resolution)
142
+ grid_height = int((extents[3] - extents[1]) / resolution)
143
+ masks = np.zeros((nclass, grid_height, grid_width), dtype=np.uint8)
144
+
145
+ # Get the 2D affine transform from bev coords to map coords
146
+ tfm = get_sensor_transform(nuscenes, sample_data)[[0, 1, 3]][:, [0, 2, 3]]
147
+ inv_tfm = np.linalg.inv(tfm)
148
+
149
+ for box in nuscenes.get_boxes(sample_data['token']):
150
+
151
+ # Get the index of the class
152
+ det_name = category_to_detection_name(box.name)
153
+ if det_name not in DETECTION_NAMES:
154
+ class_id = -1
155
+ else:
156
+ class_id = DETECTION_NAMES.index(det_name)
157
+
158
+ # Get bounding box coordinates in the grid coordinate frame
159
+ bbox = box.bottom_corners()[:2]
160
+ local_bbox = np.dot(inv_tfm[:2, :2], bbox).T + inv_tfm[:2, 2]
161
+
162
+ # Render the rotated bounding box to the mask
163
+ render_polygon(masks[class_id], local_bbox, extents, resolution)
164
+
165
+ return masks.astype(bool)
166
+
167
+
168
+ def get_sensor_transform(nuscenes, sample_data):
169
+
170
+ # Load sensor transform data
171
+ sensor = nuscenes.get(
172
+ 'calibrated_sensor', sample_data['calibrated_sensor_token'])
173
+ sensor_tfm = make_transform_matrix(sensor)
174
+
175
+ # Load ego pose data
176
+ pose = nuscenes.get('ego_pose', sample_data['ego_pose_token'])
177
+ pose_tfm = make_transform_matrix(pose)
178
+
179
+ return np.dot(pose_tfm, sensor_tfm)
180
+
181
+
182
+ def load_point_cloud(nuscenes, sample_data):
183
+
184
+ # Load point cloud
185
+ lidar_path = os.path.join(nuscenes.dataroot, sample_data['filename'])
186
+ pcl = LidarPointCloud.from_file(lidar_path)
187
+ return pcl.points[:3, :].T
188
+
189
+
190
+ def make_transform_matrix(record):
191
+ """
192
+ Create a 4x4 transform matrix from a calibrated_sensor or ego_pose record
193
+ """
194
+ transform = np.eye(4)
195
+ transform[:3, :3] = Quaternion(record['rotation']).rotation_matrix
196
+ transform[:3, 3] = np.array(record['translation'])
197
+ return transform
198
+
199
+
200
+ def render_shapely_polygon(mask, polygon, extents, resolution):
201
+
202
+ if polygon.geom_type == 'Polygon':
203
+
204
+ # Render exteriors
205
+ render_polygon(mask, polygon.exterior.coords, extents, resolution, 1)
206
+
207
+ # Render interiors
208
+ for hole in polygon.interiors:
209
+ render_polygon(mask, hole.coords, extents, resolution, 0)
210
+
211
+ # Handle the case of compound shapes
212
+ else:
213
+ for poly in polygon.geoms: # MultiPolygon / GeometryCollection
214
+ render_shapely_polygon(mask, poly, extents, resolution)
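Taken together, these helpers are meant to be chained per camera frame: build the STRtree index once per map location with load_map_data, then rasterise the static map layers and the object boxes for each sample_data record. A minimal usage sketch, assuming a local nuScenes install with the map expansion pack and the standard devkit API; paths, extents and resolution below are placeholders:

    from nuscenes.nuscenes import NuScenes
    from mapper.data.nuscenes.utils import (
        load_map_data, get_map_masks, get_object_masks)

    dataroot = "/data/nuscenes"              # placeholder path
    extents = [-25.0, 1.0, 25.0, 50.0]       # x1, z1, x2, z2 in metres
    resolution = 0.25                        # metres per BEV cell

    nusc = NuScenes(version="v1.0-mini", dataroot=dataroot)
    sample = nusc.sample[0]
    sample_data = nusc.get("sample_data", sample["data"]["CAM_FRONT"])

    # Static layers are indexed per map location (boston-seaport, ...)
    scene = nusc.get("scene", sample["scene_token"])
    location = nusc.get("log", scene["log_token"])["location"]
    map_data = load_map_data(dataroot, location)

    static_masks = get_map_masks(nusc, map_data, sample_data, extents, resolution)
    object_masks = get_object_masks(nusc, sample_data, extents, resolution)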
mapper/data/schema.py ADDED
@@ -0,0 +1,75 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Any, Dict
3
+ from pathlib import Path
4
+
5
+ @dataclass
6
+ class AugmentationConfiguration:
7
+ gaussian_noise: dict
8
+ brightness_contrast: dict
9
+
10
+ enabled: bool = False
11
+ brightness: float = 0.5
12
+ contrast: float = 0.5
13
+ saturation: float = 0.5
14
+ hue: float = 0.5
15
+ random_resized_crop: Any = False
16
+ random_flip: float = 0.5
17
+
18
+
19
+ @dataclass(kw_only=True)
20
+ class DataConfiguration:
21
+ augmentations: AugmentationConfiguration
22
+
23
+ loading: Dict[str, Dict[str, Any]]
24
+
25
+ target_focal_length: Optional[int] = None
26
+ reduce_fov: Optional[bool] = None
27
+ resize_image: Optional[Any] = None
28
+ pad_to_square: Optional[bool] = None
29
+ pad_to_multiple: Optional[int] = None
30
+ gravity_align: Optional[bool] = None
31
+ rectify_pitch: Optional[bool] = True
32
+ num_classes: int
33
+
34
+ name: str
35
+ seed: Optional[int] = 0
36
+ random: Optional[bool] = True
37
+ num_threads: Optional[int] = None
38
+
39
+ @dataclass(kw_only=True)
40
+ class MIADataConfiguration(DataConfiguration):
41
+
42
+ scenes: list[str]
43
+ split: Any
44
+ data_dir: Path
45
+ pixel_per_meter: int
46
+ crop_size_meters: int
47
+
48
+ name: str = "mapillary"
49
+ filter_for: Optional[str] = None
50
+ filter_by_ground_angle: Optional[float] = None
51
+ min_num_points: int = 0
52
+
53
+ @dataclass(kw_only=True)
54
+ class KITTIDataConfiguration(DataConfiguration):
55
+ seam_root_dir: Path
56
+ dataset_root_dir: Path
57
+ bev_percentage: float
58
+
59
+ pixel_per_meter: int
60
+ crop_size_meters: int
61
+
62
+ class_mapping: Optional[Any] = None
63
+ percentage: float = 1.0
64
+
65
+ @dataclass(kw_only=True)
66
+ class NuScenesDataConfiguration(DataConfiguration):
67
+ data_dir: Path
68
+ map_dir: Path
69
+ pixel_per_meter: int
70
+ crop_size_meters: int
71
+
72
+ percentage: float = 1.0
73
+ class_mapping: Optional[Any] = None
74
+ version: str = "v1.0-trainval"
75
+
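In the training pipeline these dataclasses are filled in by Hydra from the YAML files under mapper/conf/data (see the ConfigStore registrations in mapper/mapper.py), but they can also be built by hand, e.g. for tests. A small sketch with illustrative values only:

    from pathlib import Path
    from mapper.data.schema import (
        AugmentationConfiguration, NuScenesDataConfiguration)

    aug = AugmentationConfiguration(gaussian_noise={}, brightness_contrast={})
    cfg = NuScenesDataConfiguration(
        augmentations=aug,
        loading={"train": {"batch_size": 4, "num_workers": 4}},
        num_classes=14,
        name="nuscenes",
        data_dir=Path("/data/nuscenes"),        # placeholder paths
        map_dir=Path("/data/nuscenes_maps"),
        pixel_per_meter=2,
        crop_size_meters=50,
    )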
mapper/data/sequential.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+
7
+ def chunk_sequence(
8
+ data,
9
+ indices,
10
+ *,
11
+ names=None,
12
+ max_length=100,
13
+ min_length=1,
14
+ max_delay_s=None,
15
+ max_inter_dist=None,
16
+ max_total_dist=None,
17
+ ):
18
+ sort_array = data.get("capture_time", data.get("index"))
19
+ if sort_array is None:
20
+ sort_array = indices if names is None else names
21
+ indices = sorted(indices, key=lambda i: sort_array[i].tolist())
22
+ centers = torch.stack([data["t_c2w"][i][:2] for i in indices]).numpy()
23
+ dists = np.linalg.norm(np.diff(centers, axis=0), axis=-1)
24
+ if "capture_time" in data:
25
+ times = torch.stack([data["capture_time"][i] for i in indices])
26
+ times = times.double() / 1e3 # ms to s
27
+ delays = np.diff(times, axis=0)
28
+ else:
29
+ delays = np.zeros_like(dists)
30
+ chunks = [[indices[0]]]
31
+ dist_total = 0
32
+ for dist, delay, idx in zip(dists, delays, indices[1:]):
33
+ dist_total += dist
34
+ if (
35
+ (max_inter_dist is not None and dist > max_inter_dist)
36
+ or (max_total_dist is not None and dist_total > max_total_dist)
37
+ or (max_delay_s is not None and delay > max_delay_s)
38
+ or len(chunks[-1]) >= max_length
39
+ ):
40
+ chunks.append([])
41
+ dist_total = 0
42
+ chunks[-1].append(idx)
43
+ chunks = list(filter(lambda c: len(c) >= min_length, chunks))
44
+ chunks = sorted(chunks, key=len, reverse=True)
45
+ return chunks
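chunk_sequence groups already-localised frames into contiguous sub-sequences, starting a new chunk whenever the inter-frame distance, accumulated distance, capture delay or length budget is exceeded. A self-contained toy example, with values chosen only to trigger one split:

    import torch
    from mapper.data.sequential import chunk_sequence

    data = {
        # camera-to-world translations; the jump to x=80 starts a new chunk
        "t_c2w": torch.tensor([[0.0, 0.0, 0.0],
                               [5.0, 0.0, 0.0],
                               [10.0, 0.0, 0.0],
                               [80.0, 0.0, 0.0],
                               [85.0, 0.0, 0.0]]),
        "capture_time": torch.tensor([0, 1000, 2000, 3000, 4000]),  # ms
    }
    chunks = chunk_sequence(data, list(range(5)), max_inter_dist=20.0)
    print(chunks)  # [[0, 1, 2], [3, 4]]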
mapper/data/torch.py ADDED
@@ -0,0 +1,102 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import collections
4
+ import os
5
+
6
+ import torch
7
+ from torch.utils.data import get_worker_info
8
+ from torch.utils.data._utils.collate import (
9
+ default_collate_err_msg_format,
10
+ np_str_obj_array_pattern,
11
+ )
12
+ from lightning_fabric.utilities.seed import pl_worker_init_function
13
+
14
+ def collate(batch):
15
+ """Difference with PyTorch default_collate: it can stack other tensor-like objects.
16
+ Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
17
+ https://github.com/cvg/pixloc
18
+ Released under the Apache License 2.0
19
+ """
20
+ if not isinstance(batch, list): # no batching
21
+ return batch
22
+
23
+ # Filter None Elements
24
+ batch = [elem for elem in batch if elem is not None]
25
+ elem = batch[0]
26
+ elem_type = type(elem)
27
+ if isinstance(elem, torch.Tensor):
28
+ out = None
29
+ if torch.utils.data.get_worker_info() is not None:
30
+ # If we're in a background process, concatenate directly into a
31
+ # shared memory tensor to avoid an extra copy
32
+ numel = sum(x.numel() for x in batch)
33
+ storage = elem.storage()._new_shared(numel, device=elem.device)
34
+ out = elem.new(storage).resize_(len(batch), *list(elem.size()))
35
+ return torch.stack(batch, 0, out=out)
36
+ elif (
37
+ elem_type.__module__ == "numpy"
38
+ and elem_type.__name__ != "str_"
39
+ and elem_type.__name__ != "string_"
40
+ ):
41
+ if elem_type.__name__ == "ndarray" or elem_type.__name__ == "memmap":
42
+ # array of string classes and object
43
+ if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
44
+ raise TypeError(default_collate_err_msg_format.format(elem.dtype))
45
+
46
+ return collate([torch.as_tensor(b) for b in batch])
47
+ elif elem.shape == (): # scalars
48
+ return torch.as_tensor(batch)
49
+ elif isinstance(elem, float):
50
+ return torch.tensor(batch, dtype=torch.float64)
51
+ elif isinstance(elem, int):
52
+ return torch.tensor(batch)
53
+ elif isinstance(elem, (str, bytes)):
54
+ return batch
55
+ elif isinstance(elem, collections.abc.Mapping):
56
+ return {key: collate([d[key] for d in batch]) for key in elem}
57
+ elif isinstance(elem, tuple) and hasattr(elem, "_fields"): # namedtuple
58
+ return elem_type(*(collate(samples) for samples in zip(*batch)))
59
+ elif isinstance(elem, collections.abc.Sequence):
60
+ # check to make sure that the elements in batch have consistent size
61
+ it = iter(batch)
62
+ elem_size = len(next(it))
63
+ if not all(len(elem) == elem_size for elem in it):
64
+ raise RuntimeError("each element in list of batch should be of equal size")
65
+ transposed = zip(*batch)
66
+ return [collate(samples) for samples in transposed]
67
+ else:
68
+ # try to stack anyway in case the object implements stacking.
69
+ try:
70
+ return torch.stack(batch, 0)
71
+ except TypeError as e:
72
+ if "expected Tensor as element" in str(e):
73
+ return batch
74
+ else:
75
+ raise e
76
+
77
+
78
+ def set_num_threads(nt):
79
+ """Force numpy and other libraries to use a limited number of threads."""
80
+ try:
81
+ import mkl
82
+ except ImportError:
83
+ pass
84
+ else:
85
+ mkl.set_num_threads(nt)
86
+ torch.set_num_threads(1)
87
+ os.environ["IPC_ENABLE"] = "1"
88
+ for o in [
89
+ "OPENBLAS_NUM_THREADS",
90
+ "NUMEXPR_NUM_THREADS",
91
+ "OMP_NUM_THREADS",
92
+ "MKL_NUM_THREADS",
93
+ ]:
94
+ os.environ[o] = str(nt)
95
+
96
+
97
+ def worker_init_fn(i):
98
+ info = get_worker_info()
99
+ pl_worker_init_function(info.id)
100
+ num_threads = info.dataset.cfg.get("num_threads")
101
+ if num_threads is not None:
102
+ set_num_threads(num_threads)
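The custom collate differs from torch.utils.data.default_collate mainly in that it can stack arbitrary tensor-like objects and silently drops None samples, which is convenient when a dataset occasionally fails to load a frame. A quick illustration:

    import torch
    from mapper.data.torch import collate

    batch = [
        {"image": torch.rand(3, 64, 64), "name": "frame_000"},
        {"image": torch.rand(3, 64, 64), "name": "frame_001"},
        None,  # a failed sample is dropped instead of crashing the loader
    ]
    out = collate(batch)
    print(out["image"].shape)  # torch.Size([2, 3, 64, 64])
    print(out["name"])         # ['frame_000', 'frame_001']

When used with a DataLoader, collate is passed as collate_fn and worker_init_fn as worker_init_fn; the latter assumes the dataset exposes a cfg mapping with an optional num_threads entry.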
mapper/data/utils.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import numpy as np
4
+ from scipy.spatial.transform import Rotation
5
+
6
+
7
+ def crop_map(raster, xy, size, seed=None):
8
+ h, w = raster.shape[-2:]
9
+ state = np.random.RandomState(seed)
10
+ top = state.randint(0, h - size + 1)
11
+ left = state.randint(0, w - size + 1)
12
+ raster = raster[..., top : top + size, left : left + size]
13
+ xy -= np.array([left, top])
14
+ return raster, xy
15
+
16
+
17
+ def decompose_rotmat(R_c2w):
18
+ R_cv2xyz = Rotation.from_euler("X", -90, degrees=True)
19
+ rot_w2c = R_cv2xyz * Rotation.from_matrix(R_c2w).inv()
20
+ roll, pitch, yaw = rot_w2c.as_euler("YXZ", degrees=True)
21
+ return roll, pitch, yaw
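crop_map takes a random fixed-size window from a C x H x W raster and shifts the query point into the crop's local pixel frame (note that xy is modified in place); decompose_rotmat converts a camera-to-world rotation into roll/pitch/yaw in the convention used by the loaders. A small sketch:

    import numpy as np
    from mapper.data.utils import crop_map, decompose_rotmat

    raster = np.zeros((3, 256, 256), dtype=np.uint8)   # C x H x W map tile
    xy = np.array([128.0, 128.0])                      # point in tile pixels
    cropped, xy_local = crop_map(raster, xy, size=128, seed=0)
    print(cropped.shape)                               # (3, 128, 128)

    roll, pitch, yaw = decompose_rotmat(np.eye(3))     # toy rotation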
mapper/mapper.py ADDED
@@ -0,0 +1,112 @@
1
+ import time
2
+ import torch
3
+ import hydra
4
+ import pytorch_lightning as pl
5
+ from typing import Any
6
+
7
+ from hydra.core.config_store import ConfigStore
8
+ from omegaconf import OmegaConf
9
+ from pytorch_lightning.loggers import WandbLogger
10
+ from pytorch_lightning.callbacks import ModelCheckpoint
11
+
12
+ from pathlib import Path
13
+ from dataclasses import dataclass
14
+
15
+ from .module import GenericModule
16
+ from .data.module import GenericDataModule
17
+ from .callbacks import EvalSaveCallback, ImageLoggerCallback
18
+ from .models.schema import ModelConfiguration, DINOConfiguration, ResNetConfiguration
19
+ from .data.schema import MIADataConfiguration, KITTIDataConfiguration, NuScenesDataConfiguration
20
+
21
+
22
+ @dataclass
23
+ class ExperimentConfiguration:
24
+ name: str
25
+
26
+ @dataclass
27
+ class Configuration:
28
+ model: ModelConfiguration
29
+ experiment: ExperimentConfiguration
30
+ data: Any
31
+ training: Any
32
+
33
+
34
+ cs = ConfigStore.instance()
35
+
36
+ # Store root configuration schema
37
+ cs.store(name="pretrain", node=Configuration)
38
+ cs.store(name="mapper_nuscenes", node=Configuration)
39
+ cs.store(name="mapper_kitti", node=Configuration)
40
+
41
+ # Store data configuration schema
42
+ cs.store(group="schema/data", name="mia",
43
+ node=MIADataConfiguration, package="data")
44
+ cs.store(group="schema/data", name="kitti", node=KITTIDataConfiguration, package="data")
45
+ cs.store(group="schema/data", name="nuscenes", node=NuScenesDataConfiguration, package="data")
46
+
47
+ cs.store(group="model/schema/backbone", name="dino", node=DINOConfiguration, package="model.image_encoder.backbone")
48
+ cs.store(group="model/schema/backbone", name="resnet", node=ResNetConfiguration, package="model.image_encoder.backbone")
49
+
50
+
51
+ @hydra.main(version_base=None, config_path="conf", config_name="pretrain")
52
+ def train(cfg: Configuration):
53
+ OmegaConf.resolve(cfg)
54
+
55
+ dm = GenericDataModule(cfg.data)
56
+
57
+ model = GenericModule(cfg)
58
+
59
+ exp_name_with_time = cfg.experiment.name + \
60
+ "_" + time.strftime("%Y-%m-%d_%H-%M-%S")
61
+
62
+ callbacks: list[pl.Callback]
63
+
64
+ if cfg.training.eval:
65
+ save_dir = Path(cfg.training.save_dir)
66
+ save_dir.mkdir(parents=True, exist_ok=True)
67
+
68
+ callbacks = [
69
+ EvalSaveCallback(save_dir=save_dir)
70
+ ]
71
+
72
+ logger = None
73
+ else:
74
+ callbacks = [
75
+ ImageLoggerCallback(num_classes=cfg.training.num_classes),
76
+ ModelCheckpoint(
77
+ monitor=cfg.training.checkpointing.monitor,
78
+ save_last=cfg.training.checkpointing.save_last,
79
+ save_top_k=cfg.training.checkpointing.save_top_k,
80
+ )
81
+ ]
82
+
83
+ logger = WandbLogger(
84
+ name=exp_name_with_time,
85
+ id=exp_name_with_time,
86
+ entity="mappred-large",
87
+ project="map-pred-full-v3",
88
+ )
89
+
90
+ logger.watch(model, log="all", log_freq=500)
91
+
92
+ if cfg.training.checkpoint is not None:
93
+ state_dict = torch.load(cfg.training.checkpoint)['state_dict']
94
+ model.load_state_dict(state_dict, strict=False)
95
+
96
+ trainer_args = OmegaConf.to_container(cfg.training.trainer)
97
+ trainer_args['callbacks'] = callbacks
98
+ trainer_args['logger'] = logger
99
+
100
+ trainer = pl.Trainer(**trainer_args)
101
+
102
+ if cfg.training.eval:
103
+ trainer.test(model, datamodule=dm)
104
+ else:
105
+ trainer.fit(model, datamodule=dm)
106
+
107
+
108
+ if __name__ == "__main__":
109
+ pl.seed_everything(42)
110
+ torch.set_float32_matmul_precision("high")
111
+
112
+ train()
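Besides the command-line entry point (python -m mapper.mapper, with Hydra overrides such as data=... or training.eval=...), the same configuration tree can be composed programmatically, which is handy for notebooks and tests. A hedged sketch using Hydra's compose API; the config path is relative to the calling module and the override key is only an example:

    from hydra import compose, initialize
    from omegaconf import OmegaConf

    with initialize(version_base=None, config_path="conf"):
        cfg = compose(config_name="pretrain",
                      overrides=["experiment.name=debug_run"])
    print(OmegaConf.to_yaml(cfg.experiment))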
mapper/models/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
4
+ # https://github.com/cvg/pixloc
5
+ # Released under the Apache License 2.0
6
+
7
+ import inspect
8
+
9
+ from .base import BaseModel
10
+
11
+
12
+ def get_class(mod_name, base_path, BaseClass):
13
+ """Get the class object which inherits from BaseClass and is defined in
14
+ the module named mod_name, child of base_path.
15
+ """
16
+ mod_path = "{}.{}".format(base_path, mod_name)
17
+ mod = __import__(mod_path, fromlist=[""])
18
+ classes = inspect.getmembers(mod, inspect.isclass)
19
+ # Filter classes defined in the module
20
+ classes = [c for c in classes if c[1].__module__ == mod_path]
21
+ # Filter classes inherited from BaseModel
22
+ classes = [c for c in classes if issubclass(c[1], BaseClass)]
23
+ assert len(classes) == 1, classes
24
+ return classes[0][1]
25
+
26
+
27
+ def get_model(name):
28
+ return get_class(name, __name__, BaseModel)
mapper/models/base.py ADDED
@@ -0,0 +1,59 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
4
+ # https://github.com/cvg/pixloc
5
+ # Released under the Apache License 2.0
6
+
7
+ """
8
+ Base class for trainable models.
9
+ """
10
+
11
+ from abc import ABCMeta, abstractmethod
12
+ from copy import copy
13
+
14
+ from omegaconf import OmegaConf
15
+ from torch import nn
16
+
17
+
18
+ class BaseModel(nn.Module, metaclass=ABCMeta):
19
+
20
+ required_data_keys = []
21
+ strict_conf = True
22
+
23
+ def __init__(self, conf):
24
+ """Perform some logic and call the _init method of the child model."""
25
+ super().__init__()
26
+ self.conf = conf
27
+ OmegaConf.set_readonly(conf, True)
28
+ OmegaConf.set_struct(conf, True)
29
+ self.required_data_keys = copy(self.required_data_keys)
30
+ self._init(conf)
31
+
32
+ def forward(self, data):
33
+ """Check the data and call the _forward method of the child model."""
34
+
35
+ def recursive_key_check(expected, given):
36
+ for key in expected:
37
+ assert key in given, f"Missing key {key} in data"
38
+ if isinstance(expected, dict):
39
+ recursive_key_check(expected[key], given[key])
40
+
41
+ recursive_key_check(self.required_data_keys, data)
42
+ return self._forward(data)
43
+
44
+ @abstractmethod
45
+ def _init(self, conf):
46
+ """To be implemented by the child class."""
47
+ raise NotImplementedError
48
+
49
+ @abstractmethod
50
+ def _forward(self, data):
51
+ """To be implemented by the child class."""
52
+ raise NotImplementedError
53
+
54
+ def loss(self, pred, data):
55
+ """To be implemented by the child class."""
56
+ raise NotImplementedError
57
+
58
+ def metrics(self):
59
+ return {} # no metrics
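Concrete models implement _init and _forward and list the inputs they expect in required_data_keys; BaseModel then locks the configuration and validates incoming batches before dispatching. A toy subclass, for illustration only:

    import torch
    from omegaconf import OmegaConf
    from mapper.models.base import BaseModel

    class ConstantBias(BaseModel):
        """Adds a learned bias to an input feature map."""
        required_data_keys = ["features"]

        def _init(self, conf):
            self.bias = torch.nn.Parameter(torch.zeros(1))

        def _forward(self, data):
            return {"features": data["features"] + self.bias}

    model = ConstantBias(OmegaConf.create({}))
    out = model({"features": torch.rand(2, 8, 16, 16)})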
mapper/models/bev_projection.py ADDED
@@ -0,0 +1,95 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import torch
4
+ from torch.nn.functional import grid_sample
5
+
6
+ from ..utils.geometry import from_homogeneous
7
+ from .utils import make_grid
8
+
9
+
10
+ class PolarProjectionDepth(torch.nn.Module):
11
+ def __init__(self, z_max, ppm, scale_range, z_min=None):
12
+ super().__init__()
13
+ self.z_max = z_max
14
+ self.Δ = Δ = 1 / ppm
15
+ self.z_min = z_min = Δ if z_min is None else z_min
16
+ self.scale_range = scale_range
17
+ z_steps = torch.arange(z_min, z_max + Δ, Δ)
18
+ self.register_buffer("depth_steps", z_steps, persistent=False)
19
+
20
+ def sample_depth_scores(self, pixel_scales, camera):
21
+ scale_steps = camera.f[..., None, 1] / self.depth_steps.flip(-1)
22
+ log_scale_steps = torch.log2(scale_steps)
23
+ scale_min, scale_max = self.scale_range
24
+ log_scale_norm = (log_scale_steps - scale_min) / \
25
+ (scale_max - scale_min)
26
+ log_scale_norm = log_scale_norm * 2 - 1 # in [-1, 1]
27
+
28
+ values = pixel_scales.flatten(1, 2).unsqueeze(-1)
29
+ indices = log_scale_norm.unsqueeze(-1)
30
+ indices = torch.stack([torch.zeros_like(indices), indices], -1)
31
+ depth_scores = grid_sample(values, indices, align_corners=True)
32
+ depth_scores = depth_scores.reshape(
33
+ pixel_scales.shape[:-1] + (len(self.depth_steps),)
34
+ )
35
+ return depth_scores
36
+
37
+ def forward(
38
+ self,
39
+ image,
40
+ pixel_scales,
41
+ camera,
42
+ return_total_score=False,
43
+ ):
44
+ depth_scores = self.sample_depth_scores(pixel_scales, camera)
45
+ depth_prob = torch.softmax(depth_scores, dim=1)
46
+ image_polar = torch.einsum("...dhw,...hwz->...dzw", image, depth_prob)
47
+ if return_total_score:
48
+ cell_score = torch.logsumexp(depth_scores, dim=1, keepdim=True)
49
+ return image_polar, cell_score.squeeze(1)
50
+ return image_polar
51
+
52
+
53
+ class CartesianProjection(torch.nn.Module):
54
+ def __init__(self, z_max, x_max, ppm, z_min=None):
55
+ super().__init__()
56
+ self.z_max = z_max
57
+ self.x_max = x_max
58
+ self.Δ = Δ = 1 / ppm
59
+ self.z_min = z_min = Δ if z_min is None else z_min
60
+
61
+ grid_xz = make_grid(
62
+ x_max * 2 + Δ, z_max, step_y=Δ, step_x=Δ, orig_y=Δ, orig_x=-x_max, y_up=True
63
+ )
64
+ self.register_buffer("grid_xz", grid_xz, persistent=False)
65
+
66
+ def grid_to_polar(self, cam):
67
+ f, c = cam.f[..., 0][..., None, None], cam.c[..., 0][..., None, None]
68
+ u = from_homogeneous(self.grid_xz).squeeze(-1) * f + c
69
+ z_idx = (self.grid_xz[..., 1] - self.z_min) / \
70
+ self.Δ # convert z value to index
71
+ z_idx = z_idx[None].expand_as(u)
72
+ grid_polar = torch.stack([u, z_idx], -1)
73
+ return grid_polar
74
+
75
+ def sample_from_polar(self, image_polar, valid_polar, grid_uz):
76
+ size = grid_uz.new_tensor(image_polar.shape[-2:][::-1])
77
+ grid_uz_norm = (grid_uz + 0.5) / size * 2 - 1
78
+ grid_uz_norm = grid_uz_norm * \
79
+ grid_uz.new_tensor([1, -1]) # y axis is up
80
+ image_bev = grid_sample(image_polar, grid_uz_norm, align_corners=False)
81
+
82
+ if valid_polar is None:
83
+ valid = torch.ones_like(image_polar[..., :1, :, :])
84
+ else:
85
+ valid = valid_polar.to(image_polar)[:, None]
86
+ valid = grid_sample(valid, grid_uz_norm, align_corners=False)
87
+ valid = valid.squeeze(1) > (1 - 1e-4)
88
+
89
+ return image_bev, valid
90
+
91
+ def forward(self, image_polar, valid_polar, cam):
92
+ grid_uz = self.grid_to_polar(cam)
93
+ image, valid = self.sample_from_polar(
94
+ image_polar, valid_polar, grid_uz)
95
+ return image, valid, grid_uz
mapper/models/dinov2/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ __version__ = "0.0.1"
mapper/models/dinov2/configs/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import pathlib
7
+
8
+ from omegaconf import OmegaConf
9
+
10
+
11
+ def load_config(config_name: str):
12
+ config_filename = config_name + ".yaml"
13
+ return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename)
14
+
15
+
16
+ dinov2_default_config = load_config("ssl_default_config")
17
+
18
+
19
+ def load_and_merge_config(config_name: str):
20
+ default_config = OmegaConf.create(dinov2_default_config)
21
+ loaded_config = load_config(config_name)
22
+ return OmegaConf.merge(default_config, loaded_config)
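For example, merging one of the eval presets (added below) over the defaults:

    from mapper.models.dinov2.configs import load_and_merge_config

    cfg = load_and_merge_config("eval/vits14_pretrain")
    print(cfg.student.arch, cfg.student.patch_size)  # vit_small 14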
mapper/models/dinov2/configs/eval/vitb14_pretrain.yaml ADDED
@@ -0,0 +1,6 @@
1
+ student:
2
+ arch: vit_base
3
+ patch_size: 14
4
+ crops:
5
+ global_crops_size: 518 # this is to set up the position embeddings properly
6
+ local_crops_size: 98
mapper/models/dinov2/configs/eval/vitg14_pretrain.yaml ADDED
@@ -0,0 +1,7 @@
1
+ student:
2
+ arch: vit_giant2
3
+ patch_size: 14
4
+ ffn_layer: swiglufused
5
+ crops:
6
+ global_crops_size: 518 # this is to set up the position embeddings properly
7
+ local_crops_size: 98
mapper/models/dinov2/configs/eval/vitl14_pretrain.yaml ADDED
@@ -0,0 +1,6 @@
1
+ student:
2
+ arch: vit_large
3
+ patch_size: 14
4
+ crops:
5
+ global_crops_size: 518 # this is to set up the position embeddings properly
6
+ local_crops_size: 98
mapper/models/dinov2/configs/eval/vits14_pretrain.yaml ADDED
@@ -0,0 +1,6 @@
1
+ student:
2
+ arch: vit_small
3
+ patch_size: 14
4
+ crops:
5
+ global_crops_size: 518 # this is to set up the position embeddings properly
6
+ local_crops_size: 98
mapper/models/dinov2/configs/eval/vits14_reg4_pretrain.yaml ADDED
@@ -0,0 +1,9 @@
1
+ student:
2
+ arch: vit_small
3
+ patch_size: 14
4
+ num_register_tokens: 4
5
+ interpolate_antialias: true
6
+ interpolate_offset: 0.0
7
+ crops:
8
+ global_crops_size: 518 # this is to set up the position embeddings properly
9
+ local_crops_size: 98
mapper/models/dinov2/configs/ssl_default_config.yaml ADDED
@@ -0,0 +1,118 @@
1
+ MODEL:
2
+ WEIGHTS: ''
3
+ compute_precision:
4
+ grad_scaler: true
5
+ teacher:
6
+ backbone:
7
+ sharding_strategy: SHARD_GRAD_OP
8
+ mixed_precision:
9
+ param_dtype: fp16
10
+ reduce_dtype: fp16
11
+ buffer_dtype: fp32
12
+ dino_head:
13
+ sharding_strategy: SHARD_GRAD_OP
14
+ mixed_precision:
15
+ param_dtype: fp16
16
+ reduce_dtype: fp16
17
+ buffer_dtype: fp32
18
+ ibot_head:
19
+ sharding_strategy: SHARD_GRAD_OP
20
+ mixed_precision:
21
+ param_dtype: fp16
22
+ reduce_dtype: fp16
23
+ buffer_dtype: fp32
24
+ student:
25
+ backbone:
26
+ sharding_strategy: SHARD_GRAD_OP
27
+ mixed_precision:
28
+ param_dtype: fp16
29
+ reduce_dtype: fp16
30
+ buffer_dtype: fp32
31
+ dino_head:
32
+ sharding_strategy: SHARD_GRAD_OP
33
+ mixed_precision:
34
+ param_dtype: fp16
35
+ reduce_dtype: fp32
36
+ buffer_dtype: fp32
37
+ ibot_head:
38
+ sharding_strategy: SHARD_GRAD_OP
39
+ mixed_precision:
40
+ param_dtype: fp16
41
+ reduce_dtype: fp32
42
+ buffer_dtype: fp32
43
+ dino:
44
+ loss_weight: 1.0
45
+ head_n_prototypes: 65536
46
+ head_bottleneck_dim: 256
47
+ head_nlayers: 3
48
+ head_hidden_dim: 2048
49
+ koleo_loss_weight: 0.1
50
+ ibot:
51
+ loss_weight: 1.0
52
+ mask_sample_probability: 0.5
53
+ mask_ratio_min_max:
54
+ - 0.1
55
+ - 0.5
56
+ separate_head: false
57
+ head_n_prototypes: 65536
58
+ head_bottleneck_dim: 256
59
+ head_nlayers: 3
60
+ head_hidden_dim: 2048
61
+ train:
62
+ batch_size_per_gpu: 64
63
+ dataset_path: ImageNet:split=TRAIN
64
+ output_dir: .
65
+ saveckp_freq: 20
66
+ seed: 0
67
+ num_workers: 10
68
+ OFFICIAL_EPOCH_LENGTH: 1250
69
+ cache_dataset: true
70
+ centering: "centering" # or "sinkhorn_knopp"
71
+ student:
72
+ arch: vit_large
73
+ patch_size: 16
74
+ drop_path_rate: 0.3
75
+ layerscale: 1.0e-05
76
+ drop_path_uniform: true
77
+ pretrained_weights: ''
78
+ ffn_layer: "mlp"
79
+ block_chunks: 0
80
+ qkv_bias: true
81
+ proj_bias: true
82
+ ffn_bias: true
83
+ num_register_tokens: 0
84
+ interpolate_antialias: false
85
+ interpolate_offset: 0.1
86
+ teacher:
87
+ momentum_teacher: 0.992
88
+ final_momentum_teacher: 1
89
+ warmup_teacher_temp: 0.04
90
+ teacher_temp: 0.07
91
+ warmup_teacher_temp_epochs: 30
92
+ optim:
93
+ epochs: 100
94
+ weight_decay: 0.04
95
+ weight_decay_end: 0.4
96
+ base_lr: 0.004 # learning rate for a batch size of 1024
97
+ lr: 0. # will be set after applying scaling rule
98
+ warmup_epochs: 10
99
+ min_lr: 1.0e-06
100
+ clip_grad: 3.0
101
+ freeze_last_layer_epochs: 1
102
+ scaling_rule: sqrt_wrt_1024
103
+ patch_embed_lr_mult: 0.2
104
+ layerwise_decay: 0.9
105
+ adamw_beta1: 0.9
106
+ adamw_beta2: 0.999
107
+ crops:
108
+ global_crops_scale:
109
+ - 0.32
110
+ - 1.0
111
+ local_crops_number: 8
112
+ local_crops_scale:
113
+ - 0.05
114
+ - 0.32
115
+ global_crops_size: 224
116
+ local_crops_size: 96
117
+ evaluation:
118
+ eval_period_iterations: 12500
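The optim block leaves lr at 0 because the effective learning rate is derived at runtime from base_lr via the sqrt_wrt_1024 scaling rule. A rough sketch of that rule, assuming it scales with the square root of the global batch size relative to 1024 (the batch and GPU counts below are illustrative):

    import math

    base_lr = 0.004
    global_batch = 64 * 8          # batch_size_per_gpu x number of GPUs (example)
    lr = base_lr * math.sqrt(global_batch / 1024.0)
    print(round(lr, 5))            # 0.00283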
mapper/models/dinov2/configs/train/vitg14.yaml ADDED
@@ -0,0 +1,26 @@
1
+ dino:
2
+ head_n_prototypes: 131072
3
+ head_bottleneck_dim: 384
4
+ ibot:
5
+ separate_head: true
6
+ head_n_prototypes: 131072
7
+ train:
8
+ batch_size_per_gpu: 12
9
+ dataset_path: ImageNet22k
10
+ centering: sinkhorn_knopp
11
+ student:
12
+ arch: vit_giant2
13
+ patch_size: 14
14
+ drop_path_rate: 0.4
15
+ ffn_layer: swiglufused
16
+ block_chunks: 4
17
+ teacher:
18
+ momentum_teacher: 0.994
19
+ optim:
20
+ epochs: 500
21
+ weight_decay_end: 0.2
22
+ base_lr: 2.0e-04 # learning rate for a batch size of 1024
23
+ warmup_epochs: 80
24
+ layerwise_decay: 1.0
25
+ crops:
26
+ local_crops_size: 98
mapper/models/dinov2/configs/train/vitl14.yaml ADDED
@@ -0,0 +1,26 @@
1
+ dino:
2
+ head_n_prototypes: 131072
3
+ head_bottleneck_dim: 384
4
+ ibot:
5
+ separate_head: true
6
+ head_n_prototypes: 131072
7
+ train:
8
+ batch_size_per_gpu: 32
9
+ dataset_path: ImageNet22k
10
+ centering: sinkhorn_knopp
11
+ student:
12
+ arch: vit_large
13
+ patch_size: 14
14
+ drop_path_rate: 0.4
15
+ ffn_layer: swiglufused
16
+ block_chunks: 4
17
+ teacher:
18
+ momentum_teacher: 0.994
19
+ optim:
20
+ epochs: 500
21
+ weight_decay_end: 0.2
22
+ base_lr: 2.0e-04 # learning rate for a batch size of 1024
23
+ warmup_epochs: 80
24
+ layerwise_decay: 1.0
25
+ crops:
26
+ local_crops_size: 98
mapper/models/dinov2/configs/train/vitl16_short.yaml ADDED
@@ -0,0 +1,6 @@
1
+ # this corresponds to the default config
2
+ train:
3
+ dataset_path: ImageNet:split=TRAIN
4
+ batch_size_per_gpu: 64
5
+ student:
6
+ block_chunks: 4