Spaces: Running
Realcat committed · Commit 9cde3b4 · Parent(s): d64a873
update: roma
This view is limited to 50 files because it contains too many changes. See raw diff.
- build_docker.sh +1 -0
- hloc/matchers/roma.py +3 -1
- third_party/{Roma → RoMa}/.gitignore +0 -0
- third_party/RoMa/LICENSE +21 -0
- third_party/{Roma → RoMa}/README.md +44 -15
- third_party/{Roma → RoMa}/assets/sacre_coeur_A.jpg +0 -0
- third_party/{Roma → RoMa}/assets/sacre_coeur_B.jpg +0 -0
- third_party/RoMa/assets/toronto_A.jpg +3 -0
- third_party/RoMa/assets/toronto_B.jpg +3 -0
- third_party/{Roma → RoMa}/data/.gitignore +0 -0
- third_party/RoMa/demo/demo_3D_effect.py +46 -0
- third_party/{Roma → RoMa}/demo/demo_fundamental.py +5 -10
- third_party/{Roma → RoMa}/demo/demo_match.py +11 -14
- third_party/RoMa/demo/demo_match_opencv_sift.py +43 -0
- third_party/RoMa/demo/gif/.gitignore +2 -0
- third_party/{Roma → RoMa}/pretrained/dinov2_vitl14_pretrain.pth +0 -0
- third_party/{Roma → RoMa}/pretrained/roma_outdoor.pth +0 -0
- third_party/{Roma → RoMa}/requirements.txt +1 -1
- third_party/{Roma → RoMa}/roma/__init__.py +2 -2
- third_party/{Roma → RoMa}/roma/benchmarks/__init__.py +0 -0
- third_party/{Roma → RoMa}/roma/benchmarks/hpatches_sequences_homog_benchmark.py +7 -5
- third_party/{Roma → RoMa}/roma/benchmarks/megadepth_dense_benchmark.py +11 -27
- third_party/{Roma → RoMa}/roma/benchmarks/megadepth_pose_estimation_benchmark.py +19 -49
- third_party/{Roma → RoMa}/roma/benchmarks/scannet_benchmark.py +30 -27
- third_party/{Roma → RoMa}/roma/checkpointing/__init__.py +0 -0
- third_party/{Roma → RoMa}/roma/checkpointing/checkpoint.py +4 -5
- third_party/{Roma → RoMa}/roma/datasets/__init__.py +1 -1
- third_party/{Roma → RoMa}/roma/datasets/megadepth.py +42 -81
- third_party/{Roma → RoMa}/roma/datasets/scannet.py +72 -103
- third_party/RoMa/roma/losses/__init__.py +1 -0
- third_party/{Roma → RoMa}/roma/losses/robust_loss.py +54 -119
- third_party/RoMa/roma/models/__init__.py +1 -0
- third_party/{Roma → RoMa}/roma/models/encoders.py +7 -15
- third_party/{Roma → RoMa}/roma/models/matcher.py +100 -21
- third_party/RoMa/roma/models/model_zoo/__init__.py +53 -0
- third_party/{Roma → RoMa}/roma/models/model_zoo/roma_models.py +69 -84
- third_party/{Roma → RoMa}/roma/models/transformer/__init__.py +14 -46
- third_party/{Roma → RoMa}/roma/models/transformer/dinov2.py +23 -71
- third_party/{Roma → RoMa}/roma/models/transformer/layers/__init__.py +0 -0
- third_party/{Roma → RoMa}/roma/models/transformer/layers/attention.py +1 -5
- third_party/{Roma → RoMa}/roma/models/transformer/layers/block.py +13 -45
- third_party/{Roma → RoMa}/roma/models/transformer/layers/dino_head.py +2 -11
- third_party/{Roma → RoMa}/roma/models/transformer/layers/drop_path.py +1 -3
- third_party/{Roma → RoMa}/roma/models/transformer/layers/layer_scale.py +0 -0
- third_party/{Roma → RoMa}/roma/models/transformer/layers/mlp.py +0 -0
- third_party/{Roma → RoMa}/roma/models/transformer/layers/patch_embed.py +4 -16
- third_party/{Roma → RoMa}/roma/models/transformer/layers/swiglu_ffn.py +0 -0
- third_party/{Roma → RoMa}/roma/train/__init__.py +0 -0
- third_party/{Roma → RoMa}/roma/train/train.py +15 -39
- third_party/{Roma → RoMa}/roma/utils/__init__.py +0 -0
build_docker.sh
CHANGED
@@ -1,3 +1,4 @@
 docker build -t image-matching-webui:latest . --no-cache
 docker tag image-matching-webui:latest vincentqin/image-matching-webui:latest
 docker push vincentqin/image-matching-webui:latest
+
hloc/matchers/roma.py
CHANGED
@@ -6,7 +6,7 @@ from PIL import Image
 from ..utils.base_model import BaseModel
 from .. import logger
 
-roma_path = Path(__file__).parent / "../../third_party/Roma"
+roma_path = Path(__file__).parent / "../../third_party/RoMa"
 sys.path.append(str(roma_path))
 
 from roma.models.model_zoo.roma_models import roma_model
@@ -63,6 +63,8 @@ class Roma(BaseModel):
             weights=weights,
             dinov2_weights=dinov2_weights,
             device=device,
+            # temp fix issue: https://github.com/Parskatt/RoMa/issues/26
+            amp_dtype=torch.float32,
         )
         logger.info(f"Load Roma model done.")
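For context, a minimal sketch of how the new keyword ends up in the RoMa constructor call. Only `weights`, `dinov2_weights`, `device`, and `amp_dtype` come from the diff above; the remaining argument names and values are illustrative assumptions, not part of this commit.

```python
# Sketch only: mirrors the constructor call in hloc/matchers/roma.py after this commit.
# weights_sd / dinov2_sd stand for state dicts loaded from the pretrained checkpoints;
# resolution and upsample_preds are assumed defaults, not taken from the diff.
import torch
from roma.models.model_zoo.roma_models import roma_model

def build_roma_fp32(weights_sd, dinov2_sd, device="cuda"):
    return roma_model(
        resolution=(560, 560),   # assumed coarse resolution
        upsample_preds=False,    # assumed
        weights=weights_sd,
        dinov2_weights=dinov2_sd,
        device=device,
        # temp fix for https://github.com/Parskatt/RoMa/issues/26:
        # keep matching in full precision instead of half-precision autocast
        amp_dtype=torch.float32,
    )
```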
third_party/{Roma → RoMa}/.gitignore
RENAMED
File without changes
third_party/RoMa/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Johan Edstedt
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
third_party/{Roma → RoMa}/README.md
RENAMED
@@ -1,14 +1,29 @@
-#
-
+#
+<p align="center">
+  <h1 align="center"> <ins>RoMa</ins> 🏛️:<br> Robust Dense Feature Matching <br> ⭐CVPR 2024⭐</h1>
+  <p align="center">
+    <a href="https://scholar.google.com/citations?user=Ul-vMR0AAAAJ">Johan Edstedt</a>
+    ·
+    <a href="https://scholar.google.com/citations?user=HS2WuHkAAAAJ">Qiyu Sun</a>
+    ·
+    <a href="https://scholar.google.com/citations?user=FUE3Wd0AAAAJ">Georg Bökman</a>
+    ·
+    <a href="https://scholar.google.com/citations?user=6WRQpCQAAAAJ">Mårten Wadenbäck</a>
+    ·
+    <a href="https://scholar.google.com/citations?user=lkWfR08AAAAJ">Michael Felsberg</a>
+  </p>
+  <h2 align="center"><p>
+    <a href="https://arxiv.org/abs/2305.15404" align="center">Paper</a> |
+    <a href="https://parskatt.github.io/RoMa" align="center">Project Page</a>
+  </p></h2>
+  <div align="center"></div>
+</p>
 <br/>
-
->
->
-
-**NOTE!!! Very early code, there might be bugs**
-
-The codebase is in the [roma folder](roma).
+<p align="center">
+  <img src="https://github.com/Parskatt/RoMa/assets/22053118/15d8fea7-aa6d-479f-8a93-350d950d006b" alt="example" width=80%>
+  <br>
+  <em>RoMa is the robust dense feature matcher capable of estimating pixel-dense warps and reliable certainties for almost any image pair.</em>
+</p>
 
 ## Setup/Install
 In your python environment (tested on Linux python 3.10), run:
@@ -32,6 +47,19 @@ F, mask = cv2.findFundamentalMat(
     kptsA.cpu().numpy(), kptsB.cpu().numpy(), ransacReprojThreshold=0.2, method=cv2.USAC_MAGSAC, confidence=0.999999, maxIters=10000
 )
 ```
+
+**New**: You can also match arbitrary keypoints with RoMa. A demo for this will be added soon.
+## Settings
+
+### Resolution
+By default RoMa uses an initial resolution of (560,560) which is then upsampled to (864,864).
+You can change this at construction (see roma_outdoor kwargs).
+You can also change this later, by changing the roma_model.w_resized, roma_model.h_resized, and roma_model.upsample_res.
+
+### Sampling
+roma_model.sample_thresh controls the thresholding used when sampling matches for estimation. In certain cases a lower or higher threshold may improve results.
+
+
 ## Reproducing Results
 The experiments in the paper are provided in the [experiments folder](experiments).
@@ -46,7 +74,8 @@ torchrun --nproc_per_node=4 --nnodes=1 --rdzv_backend=c10d experiments/roma_outd
 python experiments/roma_outdoor.py --only_test --benchmark mega-1500
 ```
 ## License
-
+All our code except DINOv2 is MIT license.
+DINOv2 has an Apache 2 license [DINOv2](https://github.com/facebookresearch/dinov2/blob/main/LICENSE).
 
 ## Acknowledgement
 Our codebase builds on the code in [DKM](https://github.com/Parskatt/DKM).
@@ -54,10 +83,10 @@ Our codebase builds on the code in [DKM](https://github.com/Parskatt/DKM).
 ## BibTeX
 If you find our models useful, please consider citing our paper!
 ```
-@article{
-title={{RoMa
+@article{edstedt2024roma,
+title={{RoMa: Robust Dense Feature Matching}},
 author={Edstedt, Johan and Sun, Qiyu and Bökman, Georg and Wadenbäck, Mårten and Felsberg, Michael},
-journal={
-year={
+journal={IEEE Conference on Computer Vision and Pattern Recognition},
+year={2024}
 }
 ```
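The knobs described in the new Settings section can be exercised roughly as below. `roma_outdoor`, `coarse_res`, `upsample_res`, `w_resized`, `h_resized`, `sample_thresh`, `match`, and `sample` are taken from the README text and demos in this commit; the concrete values set here are illustrative assumptions.

```python
# Minimal sketch of the resolution / sampling settings described above.
import torch
from roma import roma_outdoor

device = "cuda" if torch.cuda.is_available() else "cpu"

# Set resolutions at construction (README defaults: 560 coarse, 864 upsampled).
roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=(864, 864))

# ...or change them later on the model object, as the Settings section notes.
roma_model.h_resized, roma_model.w_resized = 560, 560
roma_model.upsample_res = (864, 864)

# Threshold used when sampling matches for estimation; 0.05 is an assumed example value.
roma_model.sample_thresh = 0.05

warp, certainty = roma_model.match("assets/toronto_A.jpg", "assets/toronto_B.jpg", device=device)
matches, certainty = roma_model.sample(warp, certainty)
```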
third_party/{Roma → RoMa}/assets/sacre_coeur_A.jpg
RENAMED
File without changes

third_party/{Roma → RoMa}/assets/sacre_coeur_B.jpg
RENAMED
File without changes

third_party/RoMa/assets/toronto_A.jpg
ADDED
Git LFS Details

third_party/RoMa/assets/toronto_B.jpg
ADDED
Git LFS Details

third_party/{Roma → RoMa}/data/.gitignore
RENAMED
File without changes
third_party/RoMa/demo/demo_3D_effect.py
ADDED
@@ -0,0 +1,46 @@
+from PIL import Image
+import torch
+import torch.nn.functional as F
+import numpy as np
+from roma.utils.utils import tensor_to_pil
+
+from roma import roma_outdoor
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    parser = ArgumentParser()
+    parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str)
+    parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str)
+    parser.add_argument("--save_path", default="demo/gif/roma_warp_toronto", type=str)
+
+    args, _ = parser.parse_known_args()
+    im1_path = args.im_A_path
+    im2_path = args.im_B_path
+    save_path = args.save_path
+
+    # Create model
+    roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=(864, 1152))
+    roma_model.symmetric = False
+
+    H, W = roma_model.get_output_resolution()
+
+    im1 = Image.open(im1_path).resize((W, H))
+    im2 = Image.open(im2_path).resize((W, H))
+
+    # Match
+    warp, certainty = roma_model.match(im1_path, im2_path, device=device)
+    # Sampling not needed, but can be done with model.sample(warp, certainty)
+    x1 = (torch.tensor(np.array(im1)) / 255).to(device).permute(2, 0, 1)
+    x2 = (torch.tensor(np.array(im2)) / 255).to(device).permute(2, 0, 1)
+
+    coords_A, coords_B = warp[...,:2], warp[...,2:]
+    for i, x in enumerate(np.linspace(0,2*np.pi,200)):
+        t = (1 + np.cos(x))/2
+        interp_warp = (1-t)*coords_A + t*coords_B
+        im2_transfer_rgb = F.grid_sample(
+        x2[None], interp_warp[None], mode="bilinear", align_corners=False
+        )[0]
+        tensor_to_pil(im2_transfer_rgb, unnormalize=False).save(f"{save_path}_{i:03d}.jpg")
third_party/{Roma → RoMa}/demo/demo_fundamental.py
RENAMED
@@ -3,12 +3,11 @@ import torch
 import cv2
 from roma import roma_outdoor
 
-device = torch.device(
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
 if __name__ == "__main__":
     from argparse import ArgumentParser
-
     parser = ArgumentParser()
     parser.add_argument("--im_A_path", default="assets/sacre_coeur_A.jpg", type=str)
     parser.add_argument("--im_B_path", default="assets/sacre_coeur_B.jpg", type=str)
@@ -20,6 +19,7 @@ if __name__ == "__main__":
     # Create model
     roma_model = roma_outdoor(device=device)
 
+
     W_A, H_A = Image.open(im1_path).size
     W_B, H_B = Image.open(im2_path).size
 
@@ -27,12 +27,7 @@ if __name__ == "__main__":
     warp, certainty = roma_model.match(im1_path, im2_path, device=device)
     # Sample matches for estimation
     matches, certainty = roma_model.sample(warp, certainty)
-    kpts1, kpts2 = roma_model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B)
+    kpts1, kpts2 = roma_model.to_pixel_coordinates(matches, H_A, W_A, H_B, W_B)
     F, mask = cv2.findFundamentalMat(
-        kpts1.cpu().numpy(),
-        kpts2.cpu().numpy(),
-        ransacReprojThreshold=0.2,
-        method=cv2.USAC_MAGSAC,
-        confidence=0.999999,
-        maxIters=10000,
-    )
+        kpts1.cpu().numpy(), kpts2.cpu().numpy(), ransacReprojThreshold=0.2, method=cv2.USAC_MAGSAC, confidence=0.999999, maxIters=10000
+    )
third_party/{Roma → RoMa}/demo/demo_match.py
RENAMED
@@ -4,20 +4,17 @@ import torch.nn.functional as F
 import numpy as np
 from roma.utils.utils import tensor_to_pil
 
-from roma import
+from roma import roma_outdoor
 
-device = torch.device(
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
 if __name__ == "__main__":
     from argparse import ArgumentParser
-
     parser = ArgumentParser()
-    parser.add_argument("--im_A_path", default="assets/
-    parser.add_argument("--im_B_path", default="assets/
-    parser.add_argument(
-        "--save_path", default="demo/dkmv3_warp_sacre_coeur.jpg", type=str
-    )
+    parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str)
+    parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str)
+    parser.add_argument("--save_path", default="demo/roma_warp_toronto.jpg", type=str)
 
     args, _ = parser.parse_known_args()
     im1_path = args.im_A_path
@@ -25,7 +22,7 @@ if __name__ == "__main__":
     save_path = args.save_path
 
     # Create model
-    roma_model =
+    roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=(864, 1152))
 
     H, W = roma_model.get_output_resolution()
 
@@ -39,12 +36,12 @@ if __name__ == "__main__":
     x2 = (torch.tensor(np.array(im2)) / 255).to(device).permute(2, 0, 1)
 
     im2_transfer_rgb = F.grid_sample(
-
+    x2[None], warp[:,:W, 2:][None], mode="bilinear", align_corners=False
     )[0]
     im1_transfer_rgb = F.grid_sample(
-
+    x1[None], warp[:, W:, :2][None], mode="bilinear", align_corners=False
     )[0]
-    warp_im = torch.cat((im2_transfer_rgb,
-    white_im = torch.ones((H,
+    warp_im = torch.cat((im2_transfer_rgb,im1_transfer_rgb),dim=2)
+    white_im = torch.ones((H,2*W),device=device)
     vis_im = certainty * warp_im + (1 - certainty) * white_im
-    tensor_to_pil(vis_im, unnormalize=False).save(save_path)
+    tensor_to_pil(vis_im, unnormalize=False).save(save_path)
third_party/RoMa/demo/demo_match_opencv_sift.py
ADDED
@@ -0,0 +1,43 @@
+from PIL import Image
+import numpy as np
+
+import numpy as np
+import cv2 as cv
+import matplotlib.pyplot as plt
+
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+    parser = ArgumentParser()
+    parser.add_argument("--im_A_path", default="assets/toronto_A.jpg", type=str)
+    parser.add_argument("--im_B_path", default="assets/toronto_B.jpg", type=str)
+    parser.add_argument("--save_path", default="demo/roma_warp_toronto.jpg", type=str)
+
+    args, _ = parser.parse_known_args()
+    im1_path = args.im_A_path
+    im2_path = args.im_B_path
+    save_path = args.save_path
+
+    img1 = cv.imread(im1_path,cv.IMREAD_GRAYSCALE)          # queryImage
+    img2 = cv.imread(im2_path,cv.IMREAD_GRAYSCALE)          # trainImage
+    # Initiate SIFT detector
+    sift = cv.SIFT_create()
+    # find the keypoints and descriptors with SIFT
+    kp1, des1 = sift.detectAndCompute(img1,None)
+    kp2, des2 = sift.detectAndCompute(img2,None)
+    # BFMatcher with default params
+    bf = cv.BFMatcher()
+    matches = bf.knnMatch(des1,des2,k=2)
+    # Apply ratio test
+    good = []
+    for m,n in matches:
+        if m.distance < 0.75*n.distance:
+            good.append([m])
+    # cv.drawMatchesKnn expects list of lists as matches.
+    draw_params = dict(matchColor = (255,0,0), # draw matches in red color
+                       singlePointColor = None,
+                       flags = 2)
+
+    img3 = cv.drawMatchesKnn(img1,kp1,img2,kp2,good,None,**draw_params)
+    Image.fromarray(img3).save("demo/sift_matches.png")
third_party/RoMa/demo/gif/.gitignore
ADDED
@@ -0,0 +1,2 @@
+*
+!.gitignore
third_party/{Roma → RoMa}/pretrained/dinov2_vitl14_pretrain.pth
RENAMED
File without changes

third_party/{Roma → RoMa}/pretrained/roma_outdoor.pth
RENAMED
File without changes
third_party/{Roma → RoMa}/requirements.txt
RENAMED
@@ -10,4 +10,4 @@ matplotlib
 h5py
 wandb
 timm
-xformers # Optional, used for memefficient attention
+#xformers # Optional, used for memefficient attention
third_party/{Roma → RoMa}/roma/__init__.py
RENAMED
@@ -2,7 +2,7 @@ import os
 from .models import roma_outdoor, roma_indoor
 
 DEBUG_MODE = False
-RANK = int(os.environ.get(
+RANK = int(os.environ.get('RANK', default = 0))
 GLOBAL_STEP = 0
 STEP_SIZE = 1
-LOCAL_RANK = -1
+LOCAL_RANK = -1
third_party/{Roma → RoMa}/roma/benchmarks/__init__.py
RENAMED
File without changes
third_party/{Roma → RoMa}/roma/benchmarks/hpatches_sequences_homog_benchmark.py
RENAMED
@@ -53,7 +53,7 @@ class HpatchesHomogBenchmark:
         )
         return im_A_coords, im_A_to_im_B
 
-    def benchmark(self, model, model_name=None):
+    def benchmark(self, model, model_name = None):
         n_matches = []
         homog_dists = []
         for seq_idx, seq_name in tqdm(
@@ -69,7 +69,9 @@ class HpatchesHomogBenchmark:
                 H = np.loadtxt(
                     os.path.join(self.seqs_path, seq_name, "H_1_" + str(im_idx))
                 )
-                dense_matches, dense_certainty = model.match(
+                dense_matches, dense_certainty = model.match(
+                    im_A_path, im_B_path
+                )
                 good_matches, _ = model.sample(dense_matches, dense_certainty, 5000)
                 pos_a, pos_b = self.convert_coordinates(
                     good_matches[:, :2], good_matches[:, 2:], w1, h1, w2, h2
@@ -78,9 +80,9 @@ class HpatchesHomogBenchmark:
                 H_pred, inliers = cv2.findHomography(
                     pos_a,
                     pos_b,
-                    method=cv2.RANSAC,
-                    confidence=0.99999,
-                    ransacReprojThreshold=3 * min(w2, h2) / 480,
+                    method = cv2.RANSAC,
+                    confidence = 0.99999,
+                    ransacReprojThreshold = 3 * min(w2, h2) / 480,
                 )
             except:
                 H_pred = None
third_party/{Roma → RoMa}/roma/benchmarks/megadepth_dense_benchmark.py
RENAMED
@@ -6,11 +6,8 @@ from roma.utils import warp_kpts
 from torch.utils.data import ConcatDataset
 import roma
 
-
 class MegadepthDenseBenchmark:
-    def __init__(
-        self, data_root="data/megadepth", h=384, w=512, num_samples=2000
-    ) -> None:
+    def __init__(self, data_root="data/megadepth", h = 384, w = 512, num_samples = 2000) -> None:
         mega = MegadepthBuilder(data_root=data_root)
         self.dataset = ConcatDataset(
             mega.build_scenes(split="test_loftr", ht=h, wt=w)
@@ -52,15 +49,13 @@ class MegadepthDenseBenchmark:
         pck_3_tot = 0.0
         pck_5_tot = 0.0
         sampler = torch.utils.data.WeightedRandomSampler(
-            torch.ones(len(self.dataset)),
-            replacement=False,
-            num_samples=self.num_samples,
+            torch.ones(len(self.dataset)), replacement=False, num_samples=self.num_samples
         )
         B = batch_size
         dataloader = torch.utils.data.DataLoader(
             self.dataset, batch_size=B, num_workers=batch_size, sampler=sampler
         )
-        for idx, data in tqdm.tqdm(enumerate(dataloader), disable=roma.RANK > 0):
+        for idx, data in tqdm.tqdm(enumerate(dataloader), disable = roma.RANK > 0):
             im_A, im_B, depth1, depth2, T_1to2, K1, K2 = (
                 data["im_A"],
                 data["im_B"],
@@ -77,36 +72,25 @@ class MegadepthDenseBenchmark:
             if roma.DEBUG_MODE:
                 from roma.utils.utils import tensor_to_pil
                 import torch.nn.functional as F
-
                 path = "vis"
                 H, W = model.get_output_resolution()
-                white_im = torch.ones((B,
+                white_im = torch.ones((B,1,H,W),device="cuda")
                 im_B_transfer_rgb = F.grid_sample(
-                    im_B.cuda(),
-                    matches[:, :, :W, 2:],
-                    mode="bilinear",
-                    align_corners=False,
+                    im_B.cuda(), matches[:,:,:W, 2:], mode="bilinear", align_corners=False
                 )
                 warp_im = im_B_transfer_rgb
-                c_b = certainty[
-                    :, None
-                ]  # (certainty*0.9 + 0.1*torch.ones_like(certainty))[:,None]
+                c_b = certainty[:,None]#(certainty*0.9 + 0.1*torch.ones_like(certainty))[:,None]
                 vis_im = c_b * warp_im + (1 - c_b) * white_im
                 for b in range(B):
                     import os
-
-                    os.makedirs(
-                        f"{path}/{model.name}/{idx}_{b}_{H}_{W}", exist_ok=True
-                    )
+                    os.makedirs(f"{path}/{model.name}/{idx}_{b}_{H}_{W}",exist_ok=True)
                     tensor_to_pil(vis_im[b], unnormalize=True).save(
-                        f"{path}/{model.name}/{idx}_{b}_{H}_{W}/warp.jpg"
-                    )
+                        f"{path}/{model.name}/{idx}_{b}_{H}_{W}/warp.jpg")
                     tensor_to_pil(im_A[b].cuda(), unnormalize=True).save(
-                        f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_A.jpg"
-                    )
+                        f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_A.jpg")
                     tensor_to_pil(im_B[b].cuda(), unnormalize=True).save(
-                        f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_B.jpg"
-                    )
+                        f"{path}/{model.name}/{idx}_{b}_{H}_{W}/im_B.jpg")
 
             gd_tot, pck_1_tot, pck_3_tot, pck_5_tot = (
                 gd_tot + gd.mean(),
third_party/{Roma → RoMa}/roma/benchmarks/megadepth_pose_estimation_benchmark.py
RENAMED
@@ -7,9 +7,8 @@ import torch.nn.functional as F
 import roma
 import kornia.geometry.epipolar as kepi
 
-
 class MegaDepthPoseEstimationBenchmark:
-    def __init__(self, data_root="data/megadepth", scene_names=None) -> None:
+    def __init__(self, data_root="data/megadepth", scene_names = None) -> None:
         if scene_names is None:
             self.scene_names = [
                 "0015_0.1_0.3.npz",
@@ -26,22 +25,13 @@ class MegaDepthPoseEstimationBenchmark:
         ]
         self.data_root = data_root
 
-    def benchmark(
-        self,
-        model,
-        model_name=None,
-        resolution=None,
-        scale_intrinsics=True,
-        calibrated=True,
-    ):
-        H, W = model.get_output_resolution()
+    def benchmark(self, model, model_name = None):
         with torch.no_grad():
             data_root = self.data_root
             tot_e_t, tot_e_R, tot_e_pose = [], [], []
             thresholds = [5, 10, 20]
             for scene_ind in range(len(self.scenes)):
                 import os
-
                 scene_name = os.path.splitext(self.scene_names[scene_ind])[0]
                 scene = self.scenes[scene_ind]
                 pairs = scene["pair_infos"]
@@ -58,22 +48,21 @@ class MegaDepthPoseEstimationBenchmark:
                     T2 = poses[idx2].copy()
                     R2, t2 = T2[:3, :3], T2[:3, 3]
                     R, t = compute_relative_pose(R1, t1, R2, t2)
-                    T1_to_2 = np.concatenate((R,
+                    T1_to_2 = np.concatenate((R,t[:,None]), axis=-1)
                     im_A_path = f"{data_root}/{im_paths[idx1]}"
                     im_B_path = f"{data_root}/{im_paths[idx2]}"
                     dense_matches, dense_certainty = model.match(
                         im_A_path, im_B_path, K1.copy(), K2.copy(), T1_to_2.copy()
                     )
-                    sparse_matches,
-                        dense_matches, dense_certainty,
+                    sparse_matches,_ = model.sample(
+                        dense_matches, dense_certainty, 5_000
                     )
 
                     im_A = Image.open(im_A_path)
                     w1, h1 = im_A.size
                     im_B = Image.open(im_B_path)
                     w2, h2 = im_B.size
-
-                    if scale_intrinsics:
+                    if True: # Note: we keep this true as it was used in DKM/RoMa papers. There is very little difference compared to setting to False.
                         scale1 = 1200 / max(w1, h1)
                         scale2 = 1200 / max(w2, h2)
                         w1, h1 = scale1 * w1, scale1 * h1
@@ -82,42 +71,23 @@ class MegaDepthPoseEstimationBenchmark:
                         K1[:2] = K1[:2] * scale1
                         K2[:2] = K2[:2] * scale2
 
-                    kpts1 = sparse_matches[:, :2]
-                    kpts1 = np.stack(
-                        (
-                            w1 * (kpts1[:, 0] + 1) / 2,
-                            h1 * (kpts1[:, 1] + 1) / 2,
-                        ),
-                        axis=-1,
-                    )
-                    kpts2 = sparse_matches[:, 2:]
-                    kpts2 = np.stack(
-                        (
-                            w2 * (kpts2[:, 0] + 1) / 2,
-                            h2 * (kpts2[:, 1] + 1) / 2,
-                        ),
-                        axis=-1,
-                    )
-
+                    kpts1, kpts2 = model.to_pixel_coordinates(sparse_matches, h1, w1, h2, w2)
+                    kpts1, kpts2 = kpts1.cpu().numpy(), kpts2.cpu().numpy()
                     for _ in range(5):
                         shuffling = np.random.permutation(np.arange(len(kpts1)))
                         kpts1 = kpts1[shuffling]
                        kpts2 = kpts2[shuffling]
                        try:
-                            threshold = 0.5
-                            K2,
-                            norm_threshold,
-                            conf=0.99999,
-                            )
+                            threshold = 0.5
+                            norm_threshold = threshold / (np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2])))
+                            R_est, t_est, mask = estimate_pose(
+                                kpts1,
+                                kpts2,
+                                K1,
+                                K2,
+                                norm_threshold,
+                                conf=0.99999,
+                            )
                             T1_to_2_est = np.concatenate((R_est, t_est), axis=-1)  #
                             e_t, e_R = compute_pose_error(T1_to_2_est, R, t)
                             e_pose = max(e_t, e_R)
third_party/{Roma → RoMa}/roma/benchmarks/scannet_benchmark.py
RENAMED
@@ -10,7 +10,7 @@ class ScanNetBenchmark:
     def __init__(self, data_root="data/scannet") -> None:
         self.data_root = data_root
 
-    def benchmark(self, model, model_name=None):
+    def benchmark(self, model, model_name = None):
         model.train(False)
         with torch.no_grad():
             data_root = self.data_root
@@ -24,20 +24,20 @@ class ScanNetBenchmark:
                 scene = pairs[pairind]
                 scene_name = f"scene0{scene[0]}_00"
                 im_A_path = osp.join(
+                    self.data_root,
+                    "scans_test",
+                    scene_name,
+                    "color",
+                    f"{scene[2]}.jpg",
+                )
                 im_A = Image.open(im_A_path)
                 im_B_path = osp.join(
+                    self.data_root,
+                    "scans_test",
+                    scene_name,
+                    "color",
+                    f"{scene[3]}.jpg",
+                )
                 im_B = Image.open(im_B_path)
                 T_gt = rel_pose[pairind].reshape(3, 4)
                 R, t = T_gt[:3, :3], T_gt[:3, 3]
@@ -76,20 +76,24 @@ class ScanNetBenchmark:
 
                 offset = 0.5
                 kpts1 = sparse_matches[:, :2]
-                kpts1 =
-                    (
+                kpts1 = (
+                    np.stack(
+                        (
+                            w1 * (kpts1[:, 0] + 1) / 2 - offset,
+                            h1 * (kpts1[:, 1] + 1) / 2 - offset,
+                        ),
+                        axis=-1,
+                    )
                 )
                 kpts2 = sparse_matches[:, 2:]
-                kpts2 =
-                    (
+                kpts2 = (
+                    np.stack(
+                        (
+                            w2 * (kpts2[:, 0] + 1) / 2 - offset,
+                            h2 * (kpts2[:, 1] + 1) / 2 - offset,
+                        ),
+                        axis=-1,
+                    )
                 )
                 for _ in range(5):
                     shuffling = np.random.permutation(np.arange(len(kpts1)))
@@ -97,8 +101,7 @@ class ScanNetBenchmark:
                     kpts2 = kpts2[shuffling]
                     try:
                         norm_threshold = 0.5 / (
-
-                        )
+                            np.mean(np.abs(K1[:2, :2])) + np.mean(np.abs(K2[:2, :2])))
                         R_est, t_est, mask = estimate_pose(
                             kpts1,
                             kpts2,
third_party/{Roma → RoMa}/roma/checkpointing/__init__.py
RENAMED
File without changes
third_party/{Roma → RoMa}/roma/checkpointing/checkpoint.py
RENAMED
@@ -7,7 +7,6 @@ import gc
 
 import roma
 
-
 class CheckPoint:
     def __init__(self, dir=None, name="tmp"):
         self.name = name
@@ -20,7 +19,7 @@ class CheckPoint:
         optimizer,
         lr_scheduler,
         n,
+    ):
         if roma.RANK == 0:
             assert model is not None
             if isinstance(model, (DataParallel, DistributedDataParallel)):
@@ -33,14 +32,14 @@ class CheckPoint:
             }
             torch.save(states, self.dir + self.name + f"_latest.pth")
             logger.info(f"Saved states {list(states.keys())}, at step {n}")
+
     def load(
         self,
         model,
         optimizer,
         lr_scheduler,
         n,
+    ):
         if os.path.exists(self.dir + self.name + f"_latest.pth") and roma.RANK == 0:
             states = torch.load(self.dir + self.name + f"_latest.pth")
             if "model" in states:
@@ -58,4 +57,4 @@ class CheckPoint:
         del states
         gc.collect()
         torch.cuda.empty_cache()
-        return model, optimizer, lr_scheduler, n
+        return model, optimizer, lr_scheduler, n
third_party/{Roma → RoMa}/roma/datasets/__init__.py
RENAMED
@@ -1,2 +1,2 @@
 from .megadepth import MegadepthBuilder
-from .scannet import ScanNetBuilder
+from .scannet import ScanNetBuilder
third_party/{Roma → RoMa}/roma/datasets/megadepth.py
RENAMED
@@ -10,7 +10,6 @@ import roma
 from roma.utils import *
 import math
 
-
 class MegadepthScene:
     def __init__(
         self,
@@ -23,20 +22,18 @@ class MegadepthScene:
         shake_t=0,
         rot_prob=0.0,
         normalize=True,
-        max_num_pairs=100_000,
-        scene_name=None,
-        use_horizontal_flip_aug=False,
-        use_single_horizontal_flip_aug=False,
-        colorjiggle_params=None,
-        random_eraser=None,
-        use_randaug=False,
-        randaug_params=None,
-        randomize_size=False,
+        max_num_pairs = 100_000,
+        scene_name = None,
+        use_horizontal_flip_aug = False,
+        use_single_horizontal_flip_aug = False,
+        colorjiggle_params = None,
+        random_eraser = None,
+        use_randaug = False,
+        randaug_params = None,
+        randomize_size = False,
     ) -> None:
         self.data_root = data_root
-        self.scene_name = (
-            os.path.splitext(scene_name)[0] + f"_{min_overlap}_{max_overlap}"
-        )
+        self.scene_name = os.path.splitext(scene_name)[0]+f"_{min_overlap}_{max_overlap}"
         self.image_paths = scene_info["image_paths"]
         self.depth_paths = scene_info["depth_paths"]
         self.intrinsics = scene_info["intrinsics"]
@@ -54,18 +51,18 @@ class MegadepthScene:
         self.overlaps = self.overlaps[pairinds]
         if randomize_size:
             area = ht * wt
-            s = int(16 * (math.sqrt(area)
-            sizes = ((ht,
+            s = int(16 * (math.sqrt(area)//16))
+            sizes = ((ht,wt), (s,s), (wt,ht))
             choice = roma.RANK % 3
-            ht, wt = sizes[choice]
+            ht, wt = sizes[choice]
         # counts, bins = np.histogram(self.overlaps,20)
         # print(counts)
         self.im_transform_ops = get_tuple_transform_ops(
-            resize=(ht, wt),
-            normalize=normalize,
-            colorjiggle_params=colorjiggle_params,
+            resize=(ht, wt), normalize=normalize, colorjiggle_params = colorjiggle_params,
         )
-        self.depth_transform_ops = get_depth_tuple_transform_ops(
+        self.depth_transform_ops = get_depth_tuple_transform_ops(
+            resize=(ht, wt)
+        )
         self.wt, self.ht = wt, ht
         self.shake_t = shake_t
         self.random_eraser = random_eraser
@@ -78,19 +75,17 @@ class MegadepthScene:
     def load_im(self, im_path):
         im = Image.open(im_path)
         return im
-
-    def horizontal_flip(self, im_A, im_B, depth_A, depth_B,
+
+    def horizontal_flip(self, im_A, im_B, depth_A, depth_B, K_A, K_B):
         im_A = im_A.flip(-1)
         im_B = im_B.flip(-1)
-        depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1)
-        flip_mat = torch.tensor([[-1, 0, self.wt],
-        K_B = flip_mat @ K_B
+        depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1)
+        flip_mat = torch.tensor([[-1, 0, self.wt],[0,1,0],[0,0,1.]]).to(K_A.device)
+        K_A = flip_mat@K_A
+        K_B = flip_mat@K_B
+
         return im_A, im_B, depth_A, depth_B, K_A, K_B
-
+
     def load_depth(self, depth_ref, crop=None):
         depth = np.array(h5py.File(depth_ref, "r")["depth"])
         return torch.from_numpy(depth)
@@ -145,31 +140,29 @@ class MegadepthScene:
         depth_A, depth_B = self.depth_transform_ops(
             (depth_A[None, None], depth_B[None, None])
         )
-
-        [im_A, im_B, depth_A, depth_B], t = self.rand_shake(
-            im_A, im_B, depth_A, depth_B
-        )
+
+        [im_A, im_B, depth_A, depth_B], t = self.rand_shake(im_A, im_B, depth_A, depth_B)
         K1[:2, 2] += t
         K2[:2, 2] += t
 
         im_A, im_B = im_A[None], im_B[None]
         if self.random_eraser is not None:
             im_A, depth_A = self.random_eraser(im_A, depth_A)
             im_B, depth_B = self.random_eraser(im_B, depth_B)
 
         if self.use_horizontal_flip_aug:
             if np.random.rand() > 0.5:
-                im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(
-                    im_A, im_B, depth_A, depth_B, K1, K2
-                )
+                im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(im_A, im_B, depth_A, depth_B, K1, K2)
         if self.use_single_horizontal_flip_aug:
             if np.random.rand() > 0.5:
                 im_B, depth_B, K2 = self.single_horizontal_flip(im_B, depth_B, K2)
 
         if roma.DEBUG_MODE:
-            tensor_to_pil(im_A[0], unnormalize=True).save(
+            tensor_to_pil(im_A[0], unnormalize=True).save(
+                f"vis/im_A.jpg")
+            tensor_to_pil(im_B[0], unnormalize=True).save(
+                f"vis/im_B.jpg")
+
         data_dict = {
             "im_A": im_A[0],
             "im_A_identifier": self.image_paths[idx1].split("/")[-1].split(".jpg")[0],
@@ -182,53 +175,25 @@ class MegadepthScene:
             "T_1to2": T_1to2,
             "im_A_path": im_A_ref,
             "im_B_path": im_B_ref,
+
         }
         return data_dict
 
 
 class MegadepthBuilder:
-    def __init__(
-        self, data_root="data/megadepth", loftr_ignore=True, imc21_ignore=True
-    ) -> None:
+    def __init__(self, data_root="data/megadepth", loftr_ignore=True, imc21_ignore = True) -> None:
         self.data_root = data_root
         self.scene_info_root = os.path.join(data_root, "prep_scene_info")
         self.all_scenes = os.listdir(self.scene_info_root)
         self.test_scenes = ["0017.npy", "0004.npy", "0048.npy", "0013.npy"]
         # LoFTR did the D2-net preprocessing differently than we did and got more ignore scenes, can optionially ignore those
-        self.loftr_ignore_scenes = set(
-            "0121.npy",
-            "0133.npy",
-            "0168.npy",
-            "0178.npy",
-            "0229.npy",
-            "0349.npy",
-            "0412.npy",
-            "0430.npy",
-            "0443.npy",
-            "1001.npy",
-            "5014.npy",
-            "5015.npy",
-            "5016.npy",
-            ]
-        )
-        self.imc21_scenes = set(
-            [
-                "0008.npy",
-                "0019.npy",
-                "0021.npy",
-                "0024.npy",
-                "0025.npy",
-                "0032.npy",
-                "0063.npy",
-                "1589.npy",
-            ]
-        )
+        self.loftr_ignore_scenes = set(['0121.npy', '0133.npy', '0168.npy', '0178.npy', '0229.npy', '0349.npy', '0412.npy', '0430.npy', '0443.npy', '1001.npy', '5014.npy', '5015.npy', '5016.npy'])
+        self.imc21_scenes = set(['0008.npy', '0019.npy', '0021.npy', '0024.npy', '0025.npy', '0032.npy', '0063.npy', '1589.npy'])
         self.test_scenes_loftr = ["0015.npy", "0022.npy"]
         self.loftr_ignore = loftr_ignore
         self.imc21_ignore = imc21_ignore
 
-    def build_scenes(self, split="train", min_overlap=0.0, scene_names=None, **kwargs):
+    def build_scenes(self, split="train", min_overlap=0.0, scene_names = None, **kwargs):
         if split == "train":
             scene_names = set(self.all_scenes) - set(self.test_scenes)
         elif split == "train_loftr":
@@ -252,11 +217,7 @@ class MegadepthBuilder:
             ).item()
             scenes.append(
                 MegadepthScene(
-                    self.data_root,
-                    scene_info,
-                    min_overlap=min_overlap,
-                    scene_name=scene_name,
-                    **kwargs,
+                    self.data_root, scene_info, min_overlap=min_overlap,scene_name = scene_name, **kwargs
                 )
             )
         return scenes
third_party/{Roma → RoMa}/roma/datasets/scannet.py
RENAMED
@@ -5,7 +5,10 @@ import cv2
 import h5py
 import numpy as np
 import torch
-from torch.utils.data import
+from torch.utils.data import (
+    Dataset,
+    DataLoader,
+    ConcatDataset)
 
 import torchvision.transforms.functional as tvf
 import kornia.augmentation as K
@@ -16,36 +19,22 @@ from roma.utils import get_depth_tuple_transform_ops, get_tuple_transform_ops
 from roma.utils.transforms import GeometricSequential
 from tqdm import tqdm
 
-
 class ScanNetScene:
-    def __init__(
-        self,
-        data_root,
-        scene_info,
-        wt=512,
-        min_overlap=0.0,
-        shake_t=0,
-        rot_prob=0.0,
-        use_horizontal_flip_aug=False,
-    ) -> None:
-        self.scene_root = osp.join(data_root, "scans", "scans_train")
-        self.data_names = scene_info["name"]
-        self.overlaps = scene_info["score"]
+    def __init__(self, data_root, scene_info, ht = 384, wt = 512, min_overlap=0., shake_t = 0, rot_prob=0.,use_horizontal_flip_aug = False,
+                 ) -> None:
+        self.scene_root = osp.join(data_root,"scans","scans_train")
+        self.data_names = scene_info['name']
+        self.overlaps = scene_info['score']
         # Only sample 10s
-        valid = (self.data_names[
+        valid = (self.data_names[:,-2:] % 10).sum(axis=-1) == 0
         self.overlaps = self.overlaps[valid]
         self.data_names = self.data_names[valid]
         if len(self.data_names) > 10000:
-            pairinds = np.random.choice(
-                np.arange(0, len(self.data_names)), 10000, replace=False
-            )
+            pairinds = np.random.choice(np.arange(0,len(self.data_names)),10000,replace=False)
             self.data_names = self.data_names[pairinds]
             self.overlaps = self.overlaps[pairinds]
         self.im_transform_ops = get_tuple_transform_ops(resize=(ht, wt), normalize=True)
-        self.depth_transform_ops = get_depth_tuple_transform_ops(
-            resize=(ht, wt), normalize=False
-        )
+        self.depth_transform_ops = get_depth_tuple_transform_ops(resize=(ht, wt), normalize=False)
         self.wt, self.ht = wt, ht
         self.shake_t = shake_t
         self.H_generator = GeometricSequential(K.RandomAffine(degrees=90, p=rot_prob))
@@ -54,7 +43,7 @@ class ScanNetScene:
     def load_im(self, im_B, crop=None):
         im = Image.open(im_B)
         return im
-
+
     def load_depth(self, depth_ref, crop=None):
         depth = cv2.imread(str(depth_ref), cv2.IMREAD_UNCHANGED)
         depth = depth / 1000
@@ -63,73 +52,64 @@ class ScanNetScene:
 
     def __len__(self):
         return len(self.data_names)
-
+
     def scale_intrinsic(self, K, wi, hi):
-        sx, sy = self.wt / wi, self.ht /
-        sK = torch.tensor([[sx, 0, 0],
+        sx, sy = self.wt / wi, self.ht / hi
+        sK = torch.tensor([[sx, 0, 0],
+                           [0, sy, 0],
+                           [0, 0, 1]])
+        return sK@K
 
-    def horizontal_flip(self, im_A, im_B, depth_A, depth_B,
+    def horizontal_flip(self, im_A, im_B, depth_A, depth_B, K_A, K_B):
         im_A = im_A.flip(-1)
         im_B = im_B.flip(-1)
-        depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1)
-        flip_mat = torch.tensor([[-1, 0, self.wt],
-        K_B = flip_mat @ K_B
+        depth_A, depth_B = depth_A.flip(-1), depth_B.flip(-1)
+        flip_mat = torch.tensor([[-1, 0, self.wt],[0,1,0],[0,0,1.]]).to(K_A.device)
+        K_A = flip_mat@K_A
+        K_B = flip_mat@K_B
+
         return im_A, im_B, depth_A, depth_B, K_A, K_B
-
+    def read_scannet_pose(self,path):
+        """ Read ScanNet's Camera2World pose and transform it to World2Camera.
+
         Returns:
             pose_w2c (np.ndarray): (4, 4)
         """
-        cam2world = np.loadtxt(path, delimiter=
+        cam2world = np.loadtxt(path, delimiter=' ')
         world2cam = np.linalg.inv(cam2world)
         return world2cam
 
-        intrinsic
+
+    def read_scannet_intrinsic(self,path):
+        """ Read ScanNet's intrinsic matrix and return the 3x3 matrix.
+        """
+        intrinsic = np.loadtxt(path, delimiter=' ')
+        return torch.tensor(intrinsic[:-1, :-1], dtype = torch.float)
 
     def __getitem__(self, pair_idx):
         # read intrinsics of original size
         data_name = self.data_names[pair_idx]
         scene_name, scene_sub_name, stem_name_1, stem_name_2 = data_name
-        scene_name = f
-
+        scene_name = f'scene{scene_name:04d}_{scene_sub_name:02d}'
+
         # read the intrinsic of depthmap
-        K1 = K2 =
+        K1 = K2 = self.read_scannet_intrinsic(osp.join(self.scene_root,
+                scene_name,
+                'intrinsic', 'intrinsic_color.txt'))#the depth K is not the same, but doesnt really matter
         # read and compute relative poses
-        T1 =
-        T2 =
-        T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[
-            :4, :4
-        ] # (4, 4)
+        T1 = self.read_scannet_pose(osp.join(self.scene_root,
+                scene_name,
+                'pose', f'{stem_name_1}.txt'))
+        T2 = self.read_scannet_pose(osp.join(self.scene_root,
+                scene_name,
+                'pose', f'{stem_name_2}.txt'))
+        T_1to2 = torch.tensor(np.matmul(T2, np.linalg.inv(T1)), dtype=torch.float)[:4, :4]  # (4, 4)
 
         # Load positive pair data
-        im_A_ref = os.path.join(
-        )
-            self.scene_root, scene_name, "color", f"{stem_name_2}.jpg"
-        )
-        depth_A_ref = os.path.join(
-            self.scene_root, scene_name, "depth", f"{stem_name_1}.png"
-        )
-        depth_B_ref = os.path.join(
-            self.scene_root, scene_name, "depth", f"{stem_name_2}.png"
-        )
+        im_A_ref = os.path.join(self.scene_root, scene_name, 'color', f'{stem_name_1}.jpg')
+        im_B_ref = os.path.join(self.scene_root, scene_name, 'color', f'{stem_name_2}.jpg')
+        depth_A_ref = os.path.join(self.scene_root, scene_name, 'depth', f'{stem_name_1}.png')
+        depth_B_ref = os.path.join(self.scene_root, scene_name, 'depth', f'{stem_name_2}.png')
 
         im_A = self.load_im(im_A_ref)
         im_B = self.load_im(im_B_ref)
@@ -141,51 +121,40 @@ class ScanNetScene:
         K2 = self.scale_intrinsic(K2, im_B.width, im_B.height)
         # Process images
         im_A, im_B = self.im_transform_ops((im_A, im_B))
-        depth_A, depth_B = self.depth_transform_ops(
-            (depth_A[None, None], depth_B[None, None])
-        )
+        depth_A, depth_B = self.depth_transform_ops((depth_A[None,None], depth_B[None,None]))
         if self.use_horizontal_flip_aug:
             if np.random.rand() > 0.5:
-                im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(
+                im_A, im_B, depth_A, depth_B, K1, K2 = self.horizontal_flip(im_A, im_B, depth_A, depth_B, K1, K2)
+
+        data_dict = {'im_A': im_A,
+                     'im_B': im_B,
+                     'im_A_depth': depth_A[0,0],
+                     'im_B_depth': depth_B[0,0],
+                     'K1': K1,
-                     "K2": K2,
-                     "T_1to2": T_1to2,
-                     }
+                     'K2': K2,
+                     'T_1to2':T_1to2,
+                     }
         return data_dict
 
 
 class ScanNetBuilder:
-    def __init__(self, data_root=
         self.data_root = data_root
-        self.scene_info_root = os.path.join(data_root,
        self.all_scenes = os.listdir(self.scene_info_root)
-
-    def build_scenes(self, split=
         # Note: split doesn't matter here as we always use same scannet_train scenes
         scene_names = self.all_scenes
         scenes = []
-        for scene_name in tqdm(scene_names, disable=roma.RANK > 0):
-            scene_info = np.load(
-            )
-            scenes.append(
-                ScanNetScene(
-                    self.data_root, scene_info, min_overlap=min_overlap, **kwargs
-                )
-            )
         return scenes
-
-    def weight_scenes(self, concat_dataset, alpha
         ns = []
         for d in concat_dataset.datasets:
             ns.append(len(d))
-        ws = torch.cat([torch.ones(n)
        return ws
class ScanNetBuilder:
|
141 |
+
def __init__(self, data_root = 'data/scannet') -> None:
|
142 |
self.data_root = data_root
|
143 |
+
self.scene_info_root = os.path.join(data_root,'scannet_indices')
|
144 |
self.all_scenes = os.listdir(self.scene_info_root)
|
145 |
+
|
146 |
+
def build_scenes(self, split = 'train', min_overlap=0., **kwargs):
|
147 |
# Note: split doesn't matter here as we always use same scannet_train scenes
|
148 |
scene_names = self.all_scenes
|
149 |
scenes = []
|
150 |
+
for scene_name in tqdm(scene_names, disable = roma.RANK > 0):
|
151 |
+
scene_info = np.load(os.path.join(self.scene_info_root,scene_name), allow_pickle=True)
|
152 |
+
scenes.append(ScanNetScene(self.data_root, scene_info, min_overlap=min_overlap, **kwargs))
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
return scenes
|
154 |
+
|
155 |
+
def weight_scenes(self, concat_dataset, alpha=.5):
|
156 |
ns = []
|
157 |
for d in concat_dataset.datasets:
|
158 |
ns.append(len(d))
|
159 |
+
ws = torch.cat([torch.ones(n)/n**alpha for n in ns])
|
160 |
return ws
|
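For orientation, here is a minimal sketch of how this dataset builder is typically driven; the data root, the keyword arguments and the batch size are illustrative assumptions and not part of the commit:

# Hypothetical usage sketch of the ScanNet loader (assumes ScanNet has been
# preprocessed under data/scannet with pair files in data/scannet/scannet_indices).
from torch.utils.data import ConcatDataset, DataLoader
from roma.datasets.scannet import ScanNetBuilder

builder = ScanNetBuilder(data_root="data/scannet")
scenes = builder.build_scenes(split="train", min_overlap=0.0, ht=384, wt=512)
dataset = ConcatDataset(scenes)

# Every sample is a dict with im_A, im_B, im_A_depth, im_B_depth, K1, K2 and T_1to2.
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)
batch = next(iter(loader))
print(batch["im_A"].shape, batch["T_1to2"].shape)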
third_party/RoMa/roma/losses/__init__.py  ADDED
@@ -0,0 +1 @@
from .robust_loss import RobustLosses
third_party/{Roma → RoMa}/roma/losses/robust_loss.py  RENAMED
Mostly a formatting revert to the compact upstream style; updated contents (new side of the diff):

...
import roma
import math

class RobustLosses(nn.Module):
    def __init__(
        self,
...
        local_loss=True,
        local_dist=4.0,
        local_largest_scale=8,
        smooth_mask = False,
        depth_interpolation_mode = "bilinear",
        mask_depth_loss = False,
        relative_depth_error_threshold = 0.05,
        alpha = 1.,
        c = 1e-3,
    ):
        super().__init__()
        self.robust = robust  # measured in pixels
...
        B, C, H, W = scale_gm_cls.shape
        device = x2.device
        cls_res = round(math.sqrt(C))
        G = torch.meshgrid(*[torch.linspace(-1+1/cls_res, 1 - 1/cls_res, steps = cls_res, device = device) for _ in range(2)])
        G = torch.stack((G[1], G[0]), dim = -1).reshape(C, 2)
        GT = (G[None, :, None, None, :] - x2[:, None]).norm(dim=-1).min(dim=1).indices
        cls_loss = F.cross_entropy(scale_gm_cls, GT, reduction = 'none')[prob > 0.99]
        if not torch.any(cls_loss):
            cls_loss = (certainty_loss * 0.0)  # Prevent issues where prob is 0 everywhere

        certainty_loss = F.binary_cross_entropy_with_logits(gm_certainty[:, 0], prob)
        losses = {
            f"gm_certainty_loss_{scale}": certainty_loss.mean(),
            f"gm_cls_loss_{scale}": cls_loss.mean(),
        }
        wandb.log(losses, step = roma.GLOBAL_STEP)
        return losses

    def delta_cls_loss(self, x2, prob, flow_pre_delta, delta_cls, certainty, scale, offset_scale):
        with torch.no_grad():
            B, C, H, W = delta_cls.shape
            device = x2.device
            cls_res = round(math.sqrt(C))
            G = torch.meshgrid(*[torch.linspace(-1+1/cls_res, 1 - 1/cls_res, steps = cls_res, device = device) for _ in range(2)])
            G = torch.stack((G[1], G[0]), dim = -1).reshape(C, 2) * offset_scale
            GT = (G[None, :, None, None, :] + flow_pre_delta[:, None] - x2[:, None]).norm(dim=-1).min(dim=1).indices
        cls_loss = F.cross_entropy(delta_cls, GT, reduction = 'none')[prob > 0.99]
        if not torch.any(cls_loss):
            cls_loss = (certainty_loss * 0.0)  # Prevent issues where prob is 0 everywhere
        certainty_loss = F.binary_cross_entropy_with_logits(certainty[:, 0], prob)
        losses = {
            f"delta_certainty_loss_{scale}": certainty_loss.mean(),
            f"delta_cls_loss_{scale}": cls_loss.mean(),
        }
        wandb.log(losses, step = roma.GLOBAL_STEP)
        return losses

    def regression_loss(self, x2, prob, flow, certainty, scale, eps=1e-8, mode = "delta"):
        epe = (flow.permute(0, 2, 3, 1) - x2).norm(dim=-1)
        if scale == 1:
            pck_05 = (epe[prob > 0.99] < 0.5 * (2/512)).float().mean()
            wandb.log({"train_pck_05": pck_05}, step = roma.GLOBAL_STEP)

        ce_loss = F.binary_cross_entropy_with_logits(certainty[:, 0], prob)
        a = self.alpha
        cs = self.c * scale
        x = epe[prob > 0.99]
        reg_loss = cs**a * ((x/(cs))**2 + 1**2)**(a/2)
        if not torch.any(reg_loss):
            reg_loss = (ce_loss * 0.0)  # Prevent issues where prob is 0 everywhere
        losses = {
            f"{mode}_certainty_loss_{scale}": ce_loss.mean(),
            f"{mode}_regression_loss_{scale}": reg_loss.mean(),
        }
        wandb.log(losses, step = roma.GLOBAL_STEP)
        return losses

    def forward(self, corresps, batch):
        scales = list(corresps.keys())
        tot_loss = 0.0
        # scale_weights due to differences in scale for regression gradients and classification gradients
        scale_weights = {1: 1, 2: 1, 4: 1, 8: 1, 16: 1}
        for scale in scales:
            scale_corresps = corresps[scale]
            scale_certainty, flow_pre_delta, delta_cls, offset_scale, scale_gm_cls, scale_gm_certainty, flow, scale_gm_flow = (
                scale_corresps["certainty"],
                scale_corresps["flow_pre_delta"],
                scale_corresps.get("delta_cls"),
...
                scale_corresps.get("gm_certainty"),
                scale_corresps["flow"],
                scale_corresps.get("gm_flow"),
            )
            flow_pre_delta = rearrange(flow_pre_delta, "b d h w -> b h w d")
            b, h, w, d = flow_pre_delta.shape
            gt_warp, gt_prob = get_gt_warp(
                batch["im_A_depth"],
                batch["im_B_depth"],
                batch["T_1to2"],
                batch["K1"],
                batch["K2"],
                H=h,
                W=w,
            )
            x2 = gt_warp.float()
            prob = gt_prob

            if self.local_largest_scale >= scale:
                prob = prob * (
                    F.interpolate(prev_epe[:, None], size=(h, w), mode="nearest-exact")[:, 0]
                    < (2 / 512) * (self.local_dist[scale] * scale))

            if scale_gm_cls is not None:
                gm_cls_losses = self.gm_cls_loss(x2, prob, scale_gm_cls, scale_gm_certainty, scale)
                gm_loss = self.ce_weight * gm_cls_losses[f"gm_certainty_loss_{scale}"] + gm_cls_losses[f"gm_cls_loss_{scale}"]
                tot_loss = tot_loss + scale_weights[scale] * gm_loss
            elif scale_gm_flow is not None:
                gm_flow_losses = self.regression_loss(x2, prob, scale_gm_flow, scale_gm_certainty, scale, mode = "gm")
                gm_loss = self.ce_weight * gm_flow_losses[f"gm_certainty_loss_{scale}"] + gm_flow_losses[f"gm_regression_loss_{scale}"]
                tot_loss = tot_loss + scale_weights[scale] * gm_loss

            if delta_cls is not None:
                delta_cls_losses = self.delta_cls_loss(x2, prob, flow_pre_delta, delta_cls, scale_certainty, scale, offset_scale)
                delta_cls_loss = self.ce_weight * delta_cls_losses[f"delta_certainty_loss_{scale}"] + delta_cls_losses[f"delta_cls_loss_{scale}"]
                tot_loss = tot_loss + scale_weights[scale] * delta_cls_loss
            else:
                delta_regression_losses = self.regression_loss(x2, prob, flow, scale_certainty, scale)
                reg_loss = self.ce_weight * delta_regression_losses[f"delta_certainty_loss_{scale}"] + delta_regression_losses[f"delta_regression_loss_{scale}"]
                tot_loss = tot_loss + scale_weights[scale] * reg_loss
            prev_epe = (flow.permute(0, 2, 3, 1) - x2).norm(dim=-1).detach()
        return tot_loss
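For reference, the regression term kept above is essentially a generalized Charbonnier penalty evaluated only where the ground-truth warp probability exceeds 0.99. With a = alpha, c = self.c and the pyramid level s (my rendering of the code line reg_loss = cs**a * ((x/cs)**2 + 1)**(a/2), not a formula taken from the commit):

$$\rho_s(x) = (c\,s)^{a}\left(\Big(\tfrac{x}{c\,s}\Big)^{2} + 1\right)^{a/2} = \left(x^{2} + (c\,s)^{2}\right)^{a/2},$$

where x is the end-point error between the predicted flow and the ground-truth warp.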
third_party/RoMa/roma/models/__init__.py  ADDED
@@ -0,0 +1 @@
from .model_zoo import roma_outdoor, roma_indoor
third_party/{Roma → RoMa}/roma/models/encoders.py  RENAMED
The hard-coded bf16/fp32 selection is replaced by an explicit amp_dtype argument (default torch.float16); updated contents (new side of the diff):

...
class ResNet50(nn.Module):
    def __init__(self, pretrained=False, high_res = False, weights = None,
                 dilation = None, freeze_bn = True, anti_aliased = False, early_exit = False, amp = False, amp_dtype = torch.float16) -> None:
        super().__init__()
        if dilation is None:
            dilation = [False, False, False]
...
        self.freeze_bn = freeze_bn
        self.early_exit = early_exit
        self.amp = amp
        self.amp_dtype = amp_dtype

    def forward(self, x, **kwargs):
        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
...
class VGG19(nn.Module):
    def __init__(self, pretrained=False, amp = False, amp_dtype = torch.float16) -> None:
        super().__init__()
        self.layers = nn.ModuleList(tvm.vgg19_bn(pretrained=pretrained).features[:40])
        self.amp = amp
        self.amp_dtype = amp_dtype

    def forward(self, x, **kwargs):
        with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
...
class CNNandDinov2(nn.Module):
    def __init__(self, cnn_kwargs = None, amp = False, use_vgg = False, dinov2_weights = None, amp_dtype = torch.float16):
        super().__init__()
        if dinov2_weights is None:
            dinov2_weights = torch.hub.load_state_dict_from_url("https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth", map_location="cpu")
...
        else:
            self.cnn = VGG19(**cnn_kwargs)
        self.amp = amp
        self.amp_dtype = amp_dtype
        if self.amp:
            dinov2_vitl14 = dinov2_vitl14.to(self.amp_dtype)
        self.dinov2_vitl14 = [dinov2_vitl14]  # ugly hack to not show parameters to DDP
third_party/{Roma → RoMa}/roma/models/matcher.py  RENAMED
Adds an explicit amp_dtype argument, pixel/normalized coordinate converters, match_keypoints, an (unfinished) recrop path, and visualize_warp; updated contents (new side of the diff):

...
from einops import rearrange
import warnings
from warnings import warn
from PIL import Image

import roma
from roma.utils import get_tuple_transform_ops
...
        sample_mode = "bilinear",
        norm_type = nn.BatchNorm2d,
        bn_momentum = 0.1,
        amp_dtype = torch.float16,
    ):
        super().__init__()
        self.bn_momentum = bn_momentum
...
        self.disable_local_corr_grad = disable_local_corr_grad
        self.is_classifier = is_classifier
        self.sample_mode = sample_mode
        self.amp_dtype = amp_dtype

    def create_block(
        self,
        in_dim,
...
        if self.has_displacement_emb:
            im_A_coords = torch.meshgrid(
                (
                    torch.linspace(-1 + 1 / hs, 1 - 1 / hs, hs, device=x.device),
                    torch.linspace(-1 + 1 / ws, 1 - 1 / ws, ws, device=x.device),
                )
            )
            im_A_coords = torch.stack((im_A_coords[1], im_A_coords[0]))
...
    def __init__(
        self, embedding_decoder, gps, proj, conv_refiner, detach=False, scales="all", pos_embeddings = None,
        num_refinement_steps_per_scale = 1, warp_noise_std = 0.0, displacement_dropout_p = 0.0, gm_warp_dropout_p = 0.0,
        flow_upsample_mode = "bilinear", amp_dtype = torch.float16,
    ):
        super().__init__()
        self.embedding_decoder = embedding_decoder
...
        self.displacement_dropout_p = displacement_dropout_p
        self.gm_warp_dropout_p = gm_warp_dropout_p
        self.flow_upsample_mode = flow_upsample_mode
        self.amp_dtype = amp_dtype

    def get_placeholder_flow(self, b, h, w, device):
        coarse_coords = torch.meshgrid(
            (
...
            corresps[ins] = {}
            f1_s, f2_s = f1[ins], f2[ins]
            if new_scale in self.proj:
                with torch.autocast("cuda", dtype = self.amp_dtype):
                    f1_s, f2_s = self.proj[new_scale](f1_s), self.proj[new_scale](f2_s)

            if ins in coarse_scales:
...
        decoder,
        h=448,
        w=448,
        sample_mode = "threshold_balanced",
        upsample_preds = False,
        symmetric = False,
        name = None,
        attenuate_cert = None,
        recrop_upsample = False,
    ):
        super().__init__()
        self.attenuate_cert = attenuate_cert
...
        self.upsample_res = (14*16*6, 14*16*6)
        self.symmetric = symmetric
        self.sample_thresh = 0.05
        self.recrop_upsample = recrop_upsample

    def get_output_resolution(self):
        if not self.upsample_preds:
...
            scale_factor=scale_factor)
        return corresps

    def to_pixel_coordinates(self, coords, H_A, W_A, H_B, W_B):
        if isinstance(coords, (list, tuple)):
            kpts_A, kpts_B = coords[0], coords[1]
        else:
            kpts_A, kpts_B = coords[..., :2], coords[..., 2:]
        kpts_A = torch.stack((W_A/2 * (kpts_A[..., 0]+1), H_A/2 * (kpts_A[..., 1]+1)), axis=-1)
        kpts_B = torch.stack((W_B/2 * (kpts_B[..., 0]+1), H_B/2 * (kpts_B[..., 1]+1)), axis=-1)
        return kpts_A, kpts_B

    def to_normalized_coordinates(self, coords, H_A, W_A, H_B, W_B):
        if isinstance(coords, (list, tuple)):
            kpts_A, kpts_B = coords[0], coords[1]
        else:
            kpts_A, kpts_B = coords[..., :2], coords[..., 2:]
        kpts_A = torch.stack((2/W_A * kpts_A[..., 0] - 1, 2/H_A * kpts_A[..., 1] - 1), axis=-1)
        kpts_B = torch.stack((2/W_B * kpts_B[..., 0] - 1, 2/H_B * kpts_B[..., 1] - 1), axis=-1)
        return kpts_A, kpts_B

    def match_keypoints(self, x_A, x_B, warp, certainty, return_tuple = True, return_inds = False):
        x_A_to_B = F.grid_sample(warp[..., -2:].permute(2, 0, 1)[None], x_A[None, None], align_corners = False, mode = "bilinear")[0, :, 0].mT
        cert_A_to_B = F.grid_sample(certainty[None, None, ...], x_A[None, None], align_corners = False, mode = "bilinear")[0, 0, 0]
        D = torch.cdist(x_A_to_B, x_B)
        inds_A, inds_B = torch.nonzero((D == D.min(dim=-1, keepdim = True).values) * (D == D.min(dim=-2, keepdim = True).values) * (cert_A_to_B[:, None] > self.sample_thresh), as_tuple = True)

        if return_tuple:
            if return_inds:
                return inds_A, inds_B
            else:
                return x_A[inds_A], x_B[inds_B]
        else:
            if return_inds:
                return torch.cat((inds_A, inds_B), dim=-1)
            else:
                return torch.cat((x_A[inds_A], x_B[inds_B]), dim=-1)

    def get_roi(self, certainty, W, H, thr = 0.025):
        raise NotImplementedError("WIP, disable for now")
        hs, ws = certainty.shape
        certainty = certainty/certainty.sum(dim=(-1, -2))
        cum_certainty_w = certainty.cumsum(dim=-1).sum(dim=-2)
        cum_certainty_h = certainty.cumsum(dim=-2).sum(dim=-1)
        print(cum_certainty_w)
        print(torch.min(torch.nonzero(cum_certainty_w > thr)))
        print(torch.min(torch.nonzero(cum_certainty_w < thr)))
        left = int(W/ws * torch.min(torch.nonzero(cum_certainty_w > thr)))
        right = int(W/ws * torch.max(torch.nonzero(cum_certainty_w < 1 - thr)))
        top = int(H/hs * torch.min(torch.nonzero(cum_certainty_h > thr)))
        bottom = int(H/hs * torch.max(torch.nonzero(cum_certainty_h < 1 - thr)))
        print(left, right, top, bottom)
        return left, top, right, bottom

    def recrop(self, certainty, image_path):
        roi = self.get_roi(certainty, *Image.open(image_path).size)
        return Image.open(image_path).convert("RGB").crop(roi)

    @torch.inference_mode()
    def match(
        self,
        im_A_path,
...
    ):
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if isinstance(im_A_path, (str, os.PathLike)):
            im_A, im_B = Image.open(im_A_path).convert("RGB"), Image.open(im_B_path).convert("RGB")
        else:
            # Assume its not a path
            im_A, im_B = im_A_path, im_B_path
...
            test_transform = get_tuple_transform_ops(
                resize=(hs, ws), normalize=True
            )
            if self.recrop_upsample:
                certainty = corresps[finest_scale]["certainty"]
                print(certainty.shape)
                im_A = self.recrop(certainty[0, 0], im_A_path)
                im_B = self.recrop(certainty[1, 0], im_B_path)
                # TODO: need to adjust corresps when doing this
            else:
                im_A, im_B = Image.open(im_A_path).convert("RGB"), Image.open(im_B_path).convert("RGB")
            im_A, im_B = test_transform((im_A, im_B))
            im_A, im_B = im_A[None].to(device), im_B[None].to(device)
            scale_factor = math.sqrt(self.upsample_res[0] * self.upsample_res[1] / (self.w_resized * self.h_resized))
...
                warp[0],
                certainty[0, 0],
            )

    def visualize_warp(self, warp, certainty, im_A = None, im_B = None, im_A_path = None, im_B_path = None, device = "cuda", symmetric = True, save_path = None):
        assert symmetric == True, "Currently assuming bidirectional warp, might update this if someone complains ;)"
        H, W2, _ = warp.shape
        W = W2//2 if symmetric else W2
        if im_A is None:
            from PIL import Image
            im_A, im_B = Image.open(im_A_path).convert("RGB"), Image.open(im_B_path).convert("RGB")
        im_A = im_A.resize((W, H))
        im_B = im_B.resize((W, H))

        x_A = (torch.tensor(np.array(im_A)) / 255).to(device).permute(2, 0, 1)
        x_B = (torch.tensor(np.array(im_B)) / 255).to(device).permute(2, 0, 1)

        im_A_transfer_rgb = F.grid_sample(
            x_B[None], warp[:, :W, 2:][None], mode="bilinear", align_corners=False
        )[0]
        im_B_transfer_rgb = F.grid_sample(
            x_A[None], warp[:, W:, :2][None], mode="bilinear", align_corners=False
        )[0]
        warp_im = torch.cat((im_A_transfer_rgb, im_B_transfer_rgb), dim=2)
        white_im = torch.ones((H, 2*W), device=device)
        vis_im = certainty * warp_im + (1 - certainty) * white_im
        if save_path is not None:
            from roma.utils import tensor_to_pil
            tensor_to_pil(vis_im, unnormalize=False).save(save_path)
        return vis_im
third_party/RoMa/roma/models/model_zoo/__init__.py  ADDED
@@ -0,0 +1,53 @@
from typing import Union
import torch
from .roma_models import roma_model

weight_urls = {
    "roma": {
        "outdoor": "https://github.com/Parskatt/storage/releases/download/roma/roma_outdoor.pth",
        "indoor": "https://github.com/Parskatt/storage/releases/download/roma/roma_indoor.pth",
    },
    "dinov2": "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth",  # hopefully this doesnt change :D
}

def roma_outdoor(device, weights=None, dinov2_weights=None, coarse_res: Union[int, tuple[int, int]] = 560, upsample_res: Union[int, tuple[int, int]] = 864, amp_dtype: torch.dtype = torch.float16):
    if isinstance(coarse_res, int):
        coarse_res = (coarse_res, coarse_res)
    if isinstance(upsample_res, int):
        upsample_res = (upsample_res, upsample_res)

    assert coarse_res[0] % 14 == 0, "Needs to be multiple of 14 for backbone"
    assert coarse_res[1] % 14 == 0, "Needs to be multiple of 14 for backbone"

    if weights is None:
        weights = torch.hub.load_state_dict_from_url(weight_urls["roma"]["outdoor"],
                                                     map_location=device)
    if dinov2_weights is None:
        dinov2_weights = torch.hub.load_state_dict_from_url(weight_urls["dinov2"],
                                                            map_location=device)
    model = roma_model(resolution=coarse_res, upsample_preds=True,
                       weights=weights, dinov2_weights = dinov2_weights, device=device, amp_dtype=amp_dtype)
    model.upsample_res = upsample_res
    print(f"Using coarse resolution {coarse_res}, and upsample res {model.upsample_res}")
    return model

def roma_indoor(device, weights=None, dinov2_weights=None, coarse_res: Union[int, tuple[int, int]] = 560, upsample_res: Union[int, tuple[int, int]] = 864, amp_dtype: torch.dtype = torch.float16):
    if isinstance(coarse_res, int):
        coarse_res = (coarse_res, coarse_res)
    if isinstance(upsample_res, int):
        upsample_res = (upsample_res, upsample_res)

    assert coarse_res[0] % 14 == 0, "Needs to be multiple of 14 for backbone"
    assert coarse_res[1] % 14 == 0, "Needs to be multiple of 14 for backbone"

    if weights is None:
        weights = torch.hub.load_state_dict_from_url(weight_urls["roma"]["indoor"],
                                                     map_location=device)
    if dinov2_weights is None:
        dinov2_weights = torch.hub.load_state_dict_from_url(weight_urls["dinov2"],
                                                            map_location=device)
    model = roma_model(resolution=coarse_res, upsample_preds=True,
                       weights=weights, dinov2_weights = dinov2_weights, device=device, amp_dtype=amp_dtype)
    model.upsample_res = upsample_res
    print(f"Using coarse resolution {coarse_res}, and upsample res {model.upsample_res}")
    return model
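Combined with the matcher utilities added above, the new entry points can be exercised roughly as follows. This is a sketch, not code from the commit: the top-level import path and the sample() call follow the upstream RoMa README rather than this diff, and the image paths and sizes are placeholders.

# Hypothetical end-to-end sketch (assumptions noted above).
import torch
from roma import roma_outdoor  # assumed re-export; roma.models.roma_outdoor is what this diff adds

device = "cuda" if torch.cuda.is_available() else "cpu"
roma_model = roma_outdoor(device=device, coarse_res=560, upsample_res=864)

# Dense warp and per-pixel certainty between two images (paths are placeholders).
warp, certainty = roma_model.match("im_A.jpg", "im_B.jpg", device=device)

# Sparse correspondences, converted to pixel coordinates of the original images.
matches, match_certainty = roma_model.sample(warp, certainty)
kpts_A, kpts_B = roma_model.to_pixel_coordinates(matches, 480, 640, 480, 640)
print(kpts_A.shape, kpts_B.shape)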
third_party/{Roma → RoMa}/roma/models/model_zoo/roma_models.py  RENAMED
roma_model now takes an amp_dtype argument that is threaded to the encoder, and the TF32 toggles are commented out; updated contents (new side of the diff):

import warnings
import torch.nn as nn
import torch
from roma.models.matcher import *
from roma.models.transformer import Block, TransformerDecoder, MemEffAttention
from roma.models.encoders import *

def roma_model(resolution, upsample_preds, device = None, weights=None, dinov2_weights=None, amp_dtype: torch.dtype=torch.float16, **kwargs):
    # roma weights and dinov2 weights are loaded seperately, as dinov2 weights are not parameters
    #torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul TODO: these probably ruin stuff, should be careful
    #torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
    warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
    gp_dim = 512
    feat_dim = 512
    decoder_dim = gp_dim + feat_dim
    cls_to_coord_res = 64
    coordinate_decoder = TransformerDecoder(
        nn.Sequential(*[Block(decoder_dim, 8, attn_class=MemEffAttention) for _ in range(5)]),
        decoder_dim,
        cls_to_coord_res**2 + 1,
        is_classifier=True,
        amp = True,
        pos_enc = False,)
    dw = True
    hidden_blocks = 8
    kernel_size = 5
    displacement_emb = "linear"
    disable_local_corr_grad = True

    conv_refiner = nn.ModuleDict(
        {
            "16": ConvRefiner(
                2 * 512+128+(2*7+1)**2,
                2 * 512+128+(2*7+1)**2,
                2 + 1,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=128,
                local_corr_radius = 7,
                corr_in_other = True,
                amp = True,
                disable_local_corr_grad = disable_local_corr_grad,
                bn_momentum = 0.01,
            ),
            "8": ConvRefiner(
                2 * 512+64+(2*3+1)**2,
                2 * 512+64+(2*3+1)**2,
                2 + 1,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=64,
                local_corr_radius = 3,
                corr_in_other = True,
                amp = True,
                disable_local_corr_grad = disable_local_corr_grad,
                bn_momentum = 0.01,
            ),
            "4": ConvRefiner(
                2 * 256+32+(2*2+1)**2,
                2 * 256+32+(2*2+1)**2,
                2 + 1,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=32,
                local_corr_radius = 2,
                corr_in_other = True,
                amp = True,
                disable_local_corr_grad = disable_local_corr_grad,
                bn_momentum = 0.01,
            ),
            "2": ConvRefiner(
                2 * 64+16,
                128+16,
                2 + 1,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks=hidden_blocks,
                displacement_emb=displacement_emb,
                displacement_emb_dim=16,
                amp = True,
                disable_local_corr_grad = disable_local_corr_grad,
                bn_momentum = 0.01,
            ),
            "1": ConvRefiner(
                2 * 9 + 6,
...
                2 + 1,
                kernel_size=kernel_size,
                dw=dw,
                hidden_blocks = hidden_blocks,
                displacement_emb = displacement_emb,
                displacement_emb_dim = 6,
                amp = True,
                disable_local_corr_grad = disable_local_corr_grad,
                bn_momentum = 0.01,
            ),
        }
    )
...
    proj4 = nn.Sequential(nn.Conv2d(256, 256, 1, 1), nn.BatchNorm2d(256))
    proj2 = nn.Sequential(nn.Conv2d(128, 64, 1, 1), nn.BatchNorm2d(64))
    proj1 = nn.Sequential(nn.Conv2d(64, 9, 1, 1), nn.BatchNorm2d(9))
    proj = nn.ModuleDict({
        "16": proj16,
        "8": proj8,
        "4": proj4,
        "2": proj2,
        "1": proj1,
    })
    displacement_dropout_p = 0.0
    gm_warp_dropout_p = 0.0
    decoder = Decoder(coordinate_decoder,
                      gps,
                      proj,
                      conv_refiner,
                      detach=True,
                      scales=["16", "8", "4", "2", "1"],
                      displacement_dropout_p = displacement_dropout_p,
                      gm_warp_dropout_p = gm_warp_dropout_p)

    encoder = CNNandDinov2(
        cnn_kwargs = dict(
            pretrained=False,
            amp = True),
        amp = True,
        use_vgg = True,
        dinov2_weights = dinov2_weights,
        amp_dtype=amp_dtype,
    )
    h, w = resolution
    symmetric = True
    attenuate_cert = True
    sample_mode = "threshold_balanced"
    matcher = RegressionMatcher(encoder, decoder, h=h, w=w, upsample_preds=upsample_preds,
                                symmetric = symmetric, attenuate_cert = attenuate_cert, sample_mode = sample_mode, **kwargs).to(device)
    matcher.load_state_dict(weights)
    return matcher
third_party/{Roma → RoMa}/roma/models/transformer/__init__.py  RENAMED
The module-level device constant and the bf16 auto-detection are dropped in favour of an amp_dtype argument; updated contents (new side of the diff):

...
from .layers.attention import MemEffAttention
from .dinov2 import vit_large

class TransformerDecoder(nn.Module):
    def __init__(self, blocks, hidden_dim, out_dim, is_classifier = False, *args,
                 amp = False, pos_enc = True, learned_embeddings = False, embedding_dim = None, amp_dtype = torch.float16, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.blocks = blocks
        self.to_out = nn.Linear(hidden_dim, out_dim)
...
        self._scales = [16]
        self.is_classifier = is_classifier
        self.amp = amp
        self.amp_dtype = amp_dtype
        self.pos_enc = pos_enc
        self.learned_embeddings = learned_embeddings
        if self.learned_embeddings:
            self.learned_pos_embeddings = nn.Parameter(nn.init.kaiming_normal_(torch.empty((1, hidden_dim, embedding_dim, embedding_dim))))

    def scales(self):
        return self._scales.copy()

    def forward(self, gp_posterior, features, old_stuff, new_scale):
        with torch.autocast("cuda", dtype=self.amp_dtype, enabled=self.amp):
            B, C, H, W = gp_posterior.shape
            x = torch.cat((gp_posterior, features), dim = 1)
            B, C, H, W = x.shape
            grid = get_grid(B, H, W, x.device).reshape(B, H*W, 2)
            if self.learned_embeddings:
                pos_enc = F.interpolate(self.learned_pos_embeddings, size = (H, W), mode = 'bilinear', align_corners = False).permute(0, 2, 3, 1).reshape(1, H*W, C)
            else:
                pos_enc = 0
            tokens = x.reshape(B, C, H*W).permute(0, 2, 1) + pos_enc
            z = self.blocks(tokens)
            out = self.to_out(z)
            out = out.permute(0, 2, 1).reshape(B, self.out_dim, H, W)
            warp, certainty = out[:, :-1], out[:, -1:]
            return warp, certainty, None
RENAMED
@@ -18,29 +18,16 @@ import torch.nn as nn
|
|
18 |
import torch.utils.checkpoint
|
19 |
from torch.nn.init import trunc_normal_
|
20 |
|
21 |
-
from .layers import
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
NestedTensorBlock as Block,
|
27 |
-
)
|
28 |
-
|
29 |
-
|
30 |
-
def named_apply(
|
31 |
-
fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False
|
32 |
-
) -> nn.Module:
|
33 |
if not depth_first and include_root:
|
34 |
fn(module=module, name=name)
|
35 |
for child_name, child_module in module.named_children():
|
36 |
child_name = ".".join((name, child_name)) if name else child_name
|
37 |
-
named_apply(
|
38 |
-
fn=fn,
|
39 |
-
module=child_module,
|
40 |
-
name=child_name,
|
41 |
-
depth_first=depth_first,
|
42 |
-
include_root=True,
|
43 |
-
)
|
44 |
if depth_first and include_root:
|
45 |
fn(module=module, name=name)
|
46 |
return module
|
@@ -100,33 +87,22 @@ class DinoVisionTransformer(nn.Module):
|
|
100 |
super().__init__()
|
101 |
norm_layer = partial(nn.LayerNorm, eps=1e-6)
|
102 |
|
103 |
-
self.num_features =
|
104 |
-
self.embed_dim
|
105 |
-
) = embed_dim # num_features for consistency with other models
|
106 |
self.num_tokens = 1
|
107 |
self.n_blocks = depth
|
108 |
self.num_heads = num_heads
|
109 |
self.patch_size = patch_size
|
110 |
|
111 |
-
self.patch_embed = embed_layer(
|
112 |
-
img_size=img_size,
|
113 |
-
patch_size=patch_size,
|
114 |
-
in_chans=in_chans,
|
115 |
-
embed_dim=embed_dim,
|
116 |
-
)
|
117 |
num_patches = self.patch_embed.num_patches
|
118 |
|
119 |
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
120 |
-
self.pos_embed = nn.Parameter(
|
121 |
-
torch.zeros(1, num_patches + self.num_tokens, embed_dim)
|
122 |
-
)
|
123 |
|
124 |
if drop_path_uniform is True:
|
125 |
dpr = [drop_path_rate] * depth
|
126 |
else:
|
127 |
-
dpr = [
|
128 |
-
x.item() for x in torch.linspace(0, drop_path_rate, depth)
|
129 |
-
            ]  # stochastic depth decay rule
 
        if ffn_layer == "mlp":
            ffn_layer = Mlp
@@ -163,9 +139,7 @@ class DinoVisionTransformer(nn.Module):
             chunksize = depth // block_chunks
             for i in range(0, depth, chunksize):
                 # this is to keep the block index consistent if we chunk the block list
-                chunked_blocks.append(
-                    [nn.Identity()] * i + blocks_list[i : i + chunksize]
-                )
             self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
         else:
             self.chunked_blocks = False
@@ -179,7 +153,7 @@ class DinoVisionTransformer(nn.Module):
         self.init_weights()
         for param in self.parameters():
             param.requires_grad = False
-
     @property
     def device(self):
         return self.cls_token.device
@@ -206,29 +180,20 @@ class DinoVisionTransformer(nn.Module):
         w0, h0 = w0 + 0.1, h0 + 0.1
 
         patch_pos_embed = nn.functional.interpolate(
-            patch_pos_embed.reshape(
-                1, int(math.sqrt(N)), int(math.sqrt(N)), dim
-            ).permute(0, 3, 1, 2),
             scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
             mode="bicubic",
         )
 
-        assert (
-            int(w0) == patch_pos_embed.shape[-2]
-            and int(h0) == patch_pos_embed.shape[-1]
-        )
         patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
-        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(
-            previous_dtype
-        )
 
     def prepare_tokens_with_masks(self, x, masks=None):
         B, nc, w, h = x.shape
         x = self.patch_embed(x)
         if masks is not None:
-            x = torch.where(
-                masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x
-            )
 
         x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
         x = x + self.interpolate_pos_encoding(x, w, h)
@@ -236,10 +201,7 @@ class DinoVisionTransformer(nn.Module):
         return x
 
     def forward_features_list(self, x_list, masks_list):
-        x = [
-            self.prepare_tokens_with_masks(x, masks)
-            for x, masks in zip(x_list, masks_list)
-        ]
         for blk in self.blocks:
             x = blk(x)
 
@@ -278,34 +240,26 @@ class DinoVisionTransformer(nn.Module):
         x = self.prepare_tokens_with_masks(x)
         # If n is an int, take the n last blocks. If it's a list, take them
         output, total_block_len = [], len(self.blocks)
-        blocks_to_take = (
-            range(total_block_len - n, total_block_len) if isinstance(n, int) else n
-        )
         for i, blk in enumerate(self.blocks):
             x = blk(x)
             if i in blocks_to_take:
                 output.append(x)
-        assert len(output) == len(
-            blocks_to_take
-        ), f"only {len(output)} / {len(blocks_to_take)} blocks found"
         return output
 
     def _get_intermediate_layers_chunked(self, x, n=1):
         x = self.prepare_tokens_with_masks(x)
         output, i, total_block_len = [], 0, len(self.blocks[-1])
         # If n is an int, take the n last blocks. If it's a list, take them
-        blocks_to_take = (
-            range(total_block_len - n, total_block_len) if isinstance(n, int) else n
-        )
         for block_chunk in self.blocks:
             for blk in block_chunk[i:]:  # Passing the nn.Identity()
                 x = blk(x)
                 if i in blocks_to_take:
                     output.append(x)
                 i += 1
-        assert len(output) == len(
-            blocks_to_take
-        ), f"only {len(output)} / {len(blocks_to_take)} blocks found"
         return output
 
     def get_intermediate_layers(
@@ -327,9 +281,7 @@ class DinoVisionTransformer(nn.Module):
         if reshape:
             B, _, w, h = x.shape
             outputs = [
-                out.reshape(B, w // self.patch_size, h // self.patch_size, -1)
-                .permute(0, 3, 1, 2)
-                .contiguous()
                 for out in outputs
             ]
             if return_class_token:
@@ -404,4 +356,4 @@ def vit_giant2(patch_size=16, **kwargs):
         block_fn=partial(Block, attn_class=MemEffAttention),
         **kwargs,
     )
-    return model
 import torch.utils.checkpoint
 from torch.nn.init import trunc_normal_
 
+from .layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+
+
+
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
     if not depth_first and include_root:
         fn(module=module, name=name)
     for child_name, child_module in module.named_children():
         child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
     if depth_first and include_root:
         fn(module=module, name=name)
     return module
 
         super().__init__()
         norm_layer = partial(nn.LayerNorm, eps=1e-6)
 
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
         self.num_tokens = 1
         self.n_blocks = depth
         self.num_heads = num_heads
         self.patch_size = patch_size
 
+        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
         num_patches = self.patch_embed.num_patches
 
         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
 
         if drop_path_uniform is True:
             dpr = [drop_path_rate] * depth
         else:
+            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
 
         if ffn_layer == "mlp":
             ffn_layer = Mlp
 
             chunksize = depth // block_chunks
             for i in range(0, depth, chunksize):
                 # this is to keep the block index consistent if we chunk the block list
+                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
             self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
         else:
             self.chunked_blocks = False
 
         self.init_weights()
         for param in self.parameters():
             param.requires_grad = False
+
     @property
     def device(self):
         return self.cls_token.device
 
         w0, h0 = w0 + 0.1, h0 + 0.1
 
         patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
             scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
             mode="bicubic",
         )
 
+        assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
         patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
 
     def prepare_tokens_with_masks(self, x, masks=None):
         B, nc, w, h = x.shape
         x = self.patch_embed(x)
         if masks is not None:
+            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
 
         x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
         x = x + self.interpolate_pos_encoding(x, w, h)
 
         return x
 
     def forward_features_list(self, x_list, masks_list):
+        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
         for blk in self.blocks:
             x = blk(x)
 
         x = self.prepare_tokens_with_masks(x)
         # If n is an int, take the n last blocks. If it's a list, take them
         output, total_block_len = [], len(self.blocks)
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
         for i, blk in enumerate(self.blocks):
             x = blk(x)
             if i in blocks_to_take:
                 output.append(x)
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
         return output
 
     def _get_intermediate_layers_chunked(self, x, n=1):
         x = self.prepare_tokens_with_masks(x)
         output, i, total_block_len = [], 0, len(self.blocks[-1])
         # If n is an int, take the n last blocks. If it's a list, take them
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
         for block_chunk in self.blocks:
             for blk in block_chunk[i:]:  # Passing the nn.Identity()
                 x = blk(x)
                 if i in blocks_to_take:
                     output.append(x)
                 i += 1
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
         return output
 
     def get_intermediate_layers(
 
         if reshape:
             B, _, w, h = x.shape
             outputs = [
+                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
                 for out in outputs
             ]
             if return_class_token:
 
         block_fn=partial(Block, attn_class=MemEffAttention),
         **kwargs,
     )
+    return model
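The `interpolate_pos_encoding` lines above were only rewrapped, but the operation they implement is worth spelling out: the square grid of patch position embeddings is resampled with bicubic interpolation so it matches the patch grid of the current input. A self-contained sketch of that resampling, with made-up sizes and a hypothetical helper name (not code from the diff):

import math
import torch
import torch.nn.functional as F

def resize_patch_pos_embed(pos_embed: torch.Tensor, w0: int, h0: int) -> torch.Tensor:
    # pos_embed: (1, N, dim) patch position embeddings trained on a sqrt(N) x sqrt(N) grid
    N, dim = pos_embed.shape[1], pos_embed.shape[2]
    side = int(math.sqrt(N))
    grid = pos_embed.reshape(1, side, side, dim).permute(0, 3, 1, 2)   # (1, dim, side, side)
    grid = F.interpolate(grid, size=(w0, h0), mode="bicubic", align_corners=False)
    return grid.permute(0, 2, 3, 1).reshape(1, w0 * h0, dim)           # back to (1, w0*h0, dim)

# e.g. embeddings trained on a 37x37 patch grid, evaluated on a 40x30 grid
pos = torch.randn(1, 37 * 37, 1024)
print(resize_patch_pos_embed(pos, 40, 30).shape)  # torch.Size([1, 1200, 1024])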
third_party/{Roma → RoMa}/roma/models/transformer/layers/__init__.py
RENAMED
File without changes
third_party/{Roma → RoMa}/roma/models/transformer/layers/attention.py
RENAMED
@@ -48,11 +48,7 @@ class Attention(nn.Module):
 
     def forward(self, x: Tensor) -> Tensor:
         B, N, C = x.shape
-        qkv = (
-            self.qkv(x)
-            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
-            .permute(2, 0, 3, 1, 4)
-        )
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
 
         q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
         attn = q @ k.transpose(-2, -1)
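The collapsed `qkv` expression performs the usual fused-projection split into per-head queries, keys and values; a small sketch of the shape flow it relies on (sizes are arbitrary, and `qkv_out` stands in for the output of a fused `nn.Linear(C, 3 * C)`):

import torch

B, N, C, num_heads = 2, 197, 768, 12   # batch, tokens, channels, heads (illustrative sizes)
qkv_out = torch.randn(B, N, 3 * C)     # what a fused qkv projection would produce

# reshape to (3, B, heads, N, head_dim) so q, k, v can be taken along dim 0
qkv = qkv_out.reshape(B, N, 3, num_heads, C // num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2]
attn = (q * (C // num_heads) ** -0.5) @ k.transpose(-2, -1)
print(q.shape, attn.shape)  # torch.Size([2, 12, 197, 64]) torch.Size([2, 12, 197, 197])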
third_party/{Roma → RoMa}/roma/models/transformer/layers/block.py
RENAMED
@@ -62,9 +62,7 @@ class Block(nn.Module):
             attn_drop=attn_drop,
             proj_drop=drop,
         )
-        self.ls1 = (
-            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
-        )
+        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
         self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
 
         self.norm2 = norm_layer(dim)
@@ -76,9 +74,7 @@ class Block(nn.Module):
             drop=drop,
             bias=ffn_bias,
         )
-        self.ls2 = (
-            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
-        )
+        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
         self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
 
         self.sample_drop_ratio = drop_path
@@ -131,9 +127,7 @@ def drop_add_residual_stochastic_depth(
     residual_scale_factor = b / sample_subset_size
 
     # 3) add the residual
-    x_plus_residual = torch.index_add(
-        x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
-    )
+    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
     return x_plus_residual.view_as(x)
 
 
@@ -149,16 +143,10 @@ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None
     if scaling_vector is None:
         x_flat = x.flatten(1)
         residual = residual.flatten(1)
-        x_plus_residual = torch.index_add(
-            x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
-        )
+        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
     else:
         x_plus_residual = scaled_index_add(
-            x,
-            brange,
-            residual.to(dtype=x.dtype),
-            scaling=scaling_vector,
-            alpha=residual_scale_factor,
+            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
         )
     return x_plus_residual
 
@@ -170,11 +158,7 @@ def get_attn_bias_and_cat(x_list, branges=None):
     """
     this will perform the index select, cat the tensors, and provide the attn_bias from cache
     """
-    batch_sizes = (
-        [b.shape[0] for b in branges]
-        if branges is not None
-        else [x.shape[0] for x in x_list]
-    )
+    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
     all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
     if all_shapes not in attn_bias_cache.keys():
         seqlens = []
@@ -186,9 +170,7 @@ def get_attn_bias_and_cat(x_list, branges=None):
         attn_bias_cache[all_shapes] = attn_bias
 
     if branges is not None:
-        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(
-            1, -1, x_list[0].shape[-1]
-        )
+        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
     else:
         tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
         cat_tensors = torch.cat(tensors_bs1, dim=1)
@@ -203,9 +185,7 @@ def drop_add_residual_stochastic_depth_list(
     scaling_vector=None,
 ) -> Tensor:
     # 1) generate random set of indices for dropping samples in the batch
-    branges_scales = [
-        get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list
-    ]
+    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
     branges = [s[0] for s in branges_scales]
     residual_scale_factors = [s[1] for s in branges_scales]
 
@@ -216,14 +196,8 @@ def drop_add_residual_stochastic_depth_list(
     residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
 
     outputs = []
-    for x, brange, residual, residual_scale_factor in zip(
-        x_list, branges, residual_list, residual_scale_factors
-    ):
-        outputs.append(
-            add_residual(
-                x, brange, residual, residual_scale_factor, scaling_vector
-            ).view_as(x)
-        )
+    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
     return outputs
 
 
@@ -246,17 +220,13 @@ class NestedTensorBlock(Block):
                 x_list,
                 residual_func=attn_residual_func,
                 sample_drop_ratio=self.sample_drop_ratio,
-                scaling_vector=self.ls1.gamma
-                if isinstance(self.ls1, LayerScale)
-                else None,
+                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
             )
             x_list = drop_add_residual_stochastic_depth_list(
                 x_list,
                 residual_func=ffn_residual_func,
                 sample_drop_ratio=self.sample_drop_ratio,
-                scaling_vector=self.ls2.gamma
-                if isinstance(self.ls1, LayerScale)
-                else None,
+                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
             )
             return x_list
         else:
@@ -276,9 +246,7 @@ class NestedTensorBlock(Block):
         if isinstance(x_or_x_list, Tensor):
             return super().forward(x_or_x_list)
         elif isinstance(x_or_x_list, list):
-            assert (
-                XFORMERS_AVAILABLE
-            ), "Please install xFormers for nested tensors usage"
+            assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
             return self.forward_nested(x_or_x_list)
         else:
             raise AssertionError
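`drop_add_residual_stochastic_depth` runs the residual branch on a random subset of the batch and scatters the rescaled result back with `torch.index_add`; a toy sketch of that scatter-add step with invented shapes (not the library code itself):

import torch

B, N, C = 8, 16, 32                       # illustrative batch / tokens / channels
x = torch.randn(B, N, C)
keep = 4                                   # samples that actually ran the residual branch
brange = torch.randperm(B)[:keep]          # indices of the kept samples
residual = torch.randn(keep, N, C)         # residual branch output for the kept subset

# scale by B / keep so the expected update matches running the branch on every sample
scale = B / keep
x_plus_residual = torch.index_add(
    x.flatten(1), 0, brange, residual.flatten(1), alpha=scale
).view_as(x)
print(x_plus_residual.shape)  # torch.Size([8, 16, 32])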
third_party/{Roma → RoMa}/roma/models/transformer/layers/dino_head.py
RENAMED
@@ -23,14 +23,7 @@ class DINOHead(nn.Module):
     ):
         super().__init__()
         nlayers = max(nlayers, 1)
-        self.mlp = _build_mlp(
-            nlayers,
-            in_dim,
-            bottleneck_dim,
-            hidden_dim=hidden_dim,
-            use_bn=use_bn,
-            bias=mlp_bias,
-        )
+        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
         self.apply(self._init_weights)
         self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
         self.last_layer.weight_g.data.fill_(1)
@@ -49,9 +42,7 @@ class DINOHead(nn.Module):
         return x
 
 
-def _build_mlp(
-    nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True
-):
+def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
     if nlayers == 1:
         return nn.Linear(in_dim, bottleneck_dim, bias=bias)
     else:
third_party/{Roma → RoMa}/roma/models/transformer/layers/drop_path.py
RENAMED
@@ -16,9 +16,7 @@ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
     if drop_prob == 0.0 or not training:
         return x
     keep_prob = 1 - drop_prob
-    shape = (x.shape[0],) + (1,) * (
-        x.ndim - 1
-    )  # work with diff dim tensors, not just 2D ConvNets
+    shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
     random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
     if keep_prob > 0.0:
         random_tensor.div_(keep_prob)
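The rewrapped line builds a per-sample mask shape of `(B, 1, ..., 1)`, so a single Bernoulli draw decides whether each sample's branch is kept; a minimal sketch of that behaviour (sizes are illustrative):

import torch

x = torch.randn(4, 3, 8, 8)                  # any (B, ...) tensor
drop_prob, keep_prob = 0.25, 0.75
shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # (4, 1, 1, 1): one keep/drop decision per sample
mask = x.new_empty(shape).bernoulli_(keep_prob) / keep_prob
out = x * mask                               # dropped samples become all-zero, kept ones are rescaled
print(shape, out.shape)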
third_party/{Roma → RoMa}/roma/models/transformer/layers/layer_scale.py
RENAMED
File without changes
third_party/{Roma → RoMa}/roma/models/transformer/layers/mlp.py
RENAMED
File without changes
third_party/{Roma → RoMa}/roma/models/transformer/layers/patch_embed.py
RENAMED
@@ -63,21 +63,15 @@ class PatchEmbed(nn.Module):
 
         self.flatten_embedding = flatten_embedding
 
-        self.proj = nn.Conv2d(
-            in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW
-        )
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
         self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
 
     def forward(self, x: Tensor) -> Tensor:
         _, _, H, W = x.shape
         patch_H, patch_W = self.patch_size
 
-        assert (
-            H % patch_H == 0
-        ), f"Input image height {H} is not a multiple of patch height {patch_H}"
-        assert (
-            W % patch_W == 0
-        ), f"Input image width {W} is not a multiple of patch width: {patch_W}"
+        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
 
         x = self.proj(x)  # B C H W
         H, W = x.size(2), x.size(3)
@@ -89,13 +83,7 @@ class PatchEmbed(nn.Module):
 
     def flops(self) -> float:
         Ho, Wo = self.patches_resolution
-        flops = (
-            Ho
-            * Wo
-            * self.embed_dim
-            * self.in_chans
-            * (self.patch_size[0] * self.patch_size[1])
-        )
+        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
         if self.norm is not None:
             flops += Ho * Wo * self.embed_dim
         return flops
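The reflowed `flops` expression is simply (number of patches) x (cost of the per-patch projection); a quick check of the arithmetic for one hypothetical ViT-L/14-style configuration:

# Hypothetical patch embedding: 518x518 input, 14x14 patches, 3 -> 1024 channels
H = W = 518
patch = 14
in_chans, embed_dim = 3, 1024

Ho, Wo = H // patch, W // patch             # 37 x 37 patches
flops = Ho * Wo * embed_dim * in_chans * (patch * patch)
print(Ho, Wo, flops)                        # 37 37 824291328 (~0.82 GFLOPs for the projection alone)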
third_party/{Roma → RoMa}/roma/models/transformer/layers/swiglu_ffn.py
RENAMED
File without changes
third_party/{Roma → RoMa}/roma/train/__init__.py
RENAMED
File without changes
third_party/{Roma → RoMa}/roma/train/train.py
RENAMED
@@ -4,62 +4,41 @@ import roma
 import torch
 import wandb
 
-
-def log_param_statistics(named_parameters, norm_type=2):
+def log_param_statistics(named_parameters, norm_type = 2):
     named_parameters = list(named_parameters)
     grads = [p.grad for n, p in named_parameters if p.grad is not None]
-    weight_norms = [
-        p.norm(p=norm_type) for n, p in named_parameters if p.grad is not None
-    ]
-    names = [n for n, p in named_parameters if p.grad is not None]
+    weight_norms = [p.norm(p=norm_type) for n, p in named_parameters if p.grad is not None]
+    names = [n for n,p in named_parameters if p.grad is not None]
     param_norm = torch.stack(weight_norms).norm(p=norm_type)
     device = grads[0].device
-    grad_norms = torch.stack(
-        [torch.norm(g.detach(), norm_type).to(device) for g in grads]
-    )
+    grad_norms = torch.stack([torch.norm(g.detach(), norm_type).to(device) for g in grads])
     nans_or_infs = torch.isinf(grad_norms) | torch.isnan(grad_norms)
     nan_inf_names = [name for name, naninf in zip(names, nans_or_infs) if naninf]
     total_grad_norm = torch.norm(grad_norms, norm_type)
     if torch.any(nans_or_infs):
         print(f"These params have nan or inf grads: {nan_inf_names}")
-    wandb.log({"grad_norm": total_grad_norm.item()}, step=roma.GLOBAL_STEP)
-    wandb.log({"param_norm": param_norm.item()}, step=roma.GLOBAL_STEP)
-
+    wandb.log({"grad_norm": total_grad_norm.item()}, step = roma.GLOBAL_STEP)
+    wandb.log({"param_norm": param_norm.item()}, step = roma.GLOBAL_STEP)
 
-def train_step(
-    train_batch, model, objective, optimizer, grad_scaler, grad_clip_norm=1.0, **kwargs
-):
+def train_step(train_batch, model, objective, optimizer, grad_scaler, grad_clip_norm = 1.,**kwargs):
     optimizer.zero_grad()
     out = model(train_batch)
     l = objective(out, train_batch)
     grad_scaler.scale(l).backward()
     grad_scaler.unscale_(optimizer)
     log_param_statistics(model.named_parameters())
-    torch.nn.utils.clip_grad_norm_(
-        model.parameters(), grad_clip_norm
-    )  # what should max norm be?
+    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_norm) # what should max norm be?
     grad_scaler.step(optimizer)
     grad_scaler.update()
-    wandb.log({"grad_scale": grad_scaler._scale.item()}, step=roma.GLOBAL_STEP)
-    if grad_scaler._scale < 1:
-        grad_scaler._scale = torch.tensor(1.0).to(grad_scaler._scale)
-    roma.GLOBAL_STEP = roma.GLOBAL_STEP + roma.STEP_SIZE
+    wandb.log({"grad_scale": grad_scaler._scale.item()}, step = roma.GLOBAL_STEP)
+    if grad_scaler._scale < 1.:
+        grad_scaler._scale = torch.tensor(1.).to(grad_scaler._scale)
+    roma.GLOBAL_STEP = roma.GLOBAL_STEP + roma.STEP_SIZE # increment global step
     return {"train_out": out, "train_loss": l.item()}
 
 
 def train_k_steps(
-    n_0,
-    k,
-    dataloader,
-    model,
-    objective,
-    optimizer,
-    lr_scheduler,
-    grad_scaler,
-    progress_bar=True,
-    grad_clip_norm=1.0,
-    warmup=None,
-    ema_model=None,
+    n_0, k, dataloader, model, objective, optimizer, lr_scheduler, grad_scaler, progress_bar=True, grad_clip_norm = 1., warmup = None, ema_model = None,
 ):
     for n in tqdm(range(n_0, n_0 + k), disable=(not progress_bar) or roma.RANK > 0):
         batch = next(dataloader)
@@ -73,7 +52,7 @@ def train_k_steps(
             lr_scheduler=lr_scheduler,
             grad_scaler=grad_scaler,
            n=n,
-            grad_clip_norm=grad_clip_norm,
+            grad_clip_norm = grad_clip_norm,
         )
         if ema_model is not None:
             ema_model.update()
@@ -82,10 +61,7 @@ def train_k_steps(
                 lr_scheduler.step()
         else:
             lr_scheduler.step()
-        [
-            wandb.log({f"lr_group_{grp}": lr})
-            for grp, lr in enumerate(lr_scheduler.get_last_lr())
-        ]
+        [wandb.log({f"lr_group_{grp}": lr}) for grp, lr in enumerate(lr_scheduler.get_last_lr())]
 
 
 def train_epoch(
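The reformatted `train_step` keeps the standard AMP ordering: scale the loss, unscale before gradient clipping, then step and update the scaler. A self-contained sketch of that ordering with a dummy model (illustrative only, not RoMa's actual training loop):

import torch

model = torch.nn.Linear(10, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
grad_scaler = torch.cuda.amp.GradScaler(enabled=False)  # enabled=False so the sketch also runs on CPU

x, y = torch.randn(32, 10), torch.randn(32, 1)
optimizer.zero_grad()
loss = torch.nn.functional.mse_loss(model(x), y)
grad_scaler.scale(loss).backward()
grad_scaler.unscale_(optimizer)                   # so the clip threshold applies to the true gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
grad_scaler.step(optimizer)
grad_scaler.update()
print(loss.item())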
third_party/{Roma → RoMa}/roma/utils/__init__.py
RENAMED
File without changes