Spaces:

zino36/Image-Matching-app

Sleeping

App Files Community

zino36 committed on Jul 23, 2024

Commit

20a4a01

verified ·

1 Parent(s): fc5081c

Upload 7 files

Browse files

Files changed (7) hide show

demo-2.py +283 -0
demo.py +29 -312
device.py +76 -0
image.py +126 -0
image_pairs.py +104 -0
path_to_dust3r.py +19 -0
viz.py +381 -0

demo-2.py ADDED Viewed

	@@ -0,0 +1,283 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# gradio demo
+# --------------------------------------------------------
+import argparse
+import math
+import builtins
+import datetime
+import gradio
+import os
+import torch
+import numpy as np
+import functools
+import trimesh
+import copy
+from scipy.spatial.transform import Rotation
+from dust3r.inference import inference
+from dust3r.image_pairs import make_pairs
+from dust3r.utils.image import load_images, rgb
+from dust3r.utils.device import to_numpy
+from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
+from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
+import matplotlib.pyplot as pl
+def get_args_parser():
+    parser = argparse.ArgumentParser()
+    parser_url = parser.add_mutually_exclusive_group()
+    parser_url.add_argument("--local_network", action='store_true', default=False,
+                            help="make app accessible on local network: address will be set to 0.0.0.0")
+    parser_url.add_argument("--server_name", type=str, default=None, help="server url, default is 127.0.0.1")
+    parser.add_argument("--image_size", type=int, default=512, choices=[512, 224], help="image size")
+    parser.add_argument("--server_port", type=int, help=("will start gradio app on this port (if available). "
+                                                         "If None, will search for an available port starting at 7860."),
+                        default=None)
+    parser_weights = parser.add_mutually_exclusive_group(required=True)
+    parser_weights.add_argument("--weights", type=str, help="path to the model weights", default=None)
+    parser_weights.add_argument("--model_name", type=str, help="name of the model weights",
+                                choices=["DUSt3R_ViTLarge_BaseDecoder_512_dpt",
+                                         "DUSt3R_ViTLarge_BaseDecoder_512_linear",
+                                         "DUSt3R_ViTLarge_BaseDecoder_224_linear"])
+    parser.add_argument("--device", type=str, default='cuda', help="pytorch device")
+    parser.add_argument("--tmp_dir", type=str, default=None, help="value for tempfile.tempdir")
+    parser.add_argument("--silent", action='store_true', default=False,
+                        help="silence logs")
+    return parser
+def set_print_with_timestamp(time_format="%Y-%m-%d %H:%M:%S"):
+    builtin_print = builtins.print
+    def print_with_timestamp(*args, **kwargs):
+        now = datetime.datetime.now()
+        formatted_date_time = now.strftime(time_format)
+        builtin_print(f'[{formatted_date_time}] ', end='')  # print with time stamp
+        builtin_print(*args, **kwargs)
+    builtins.print = print_with_timestamp
+def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
+                                 cam_color=None, as_pointcloud=False,
+                                 transparent_cams=False, silent=False):
+    assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
+    pts3d = to_numpy(pts3d)
+    imgs = to_numpy(imgs)
+    focals = to_numpy(focals)
+    cams2world = to_numpy(cams2world)
+    scene = trimesh.Scene()
+    # full pointcloud
+    if as_pointcloud:
+        pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
+        col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
+        pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
+        scene.add_geometry(pct)
+    else:
+        meshes = []
+        for i in range(len(imgs)):
+            meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i]))
+        mesh = trimesh.Trimesh(**cat_meshes(meshes))
+        scene.add_geometry(mesh)
+    # add each camera
+    for i, pose_c2w in enumerate(cams2world):
+        if isinstance(cam_color, list):
+            camera_edge_color = cam_color[i]
+        else:
+            camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
+        add_scene_cam(scene, pose_c2w, camera_edge_color,
+                      None if transparent_cams else imgs[i], focals[i],
+                      imsize=imgs[i].shape[1::-1], screen_width=cam_size)
+    rot = np.eye(4)
+    rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
+    scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
+    outfile = os.path.join(outdir, 'scene.glb')
+    if not silent:
+        print('(exporting 3D scene to', outfile, ')')
+    scene.export(file_obj=outfile)
+    return outfile
+def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False,
+                            clean_depth=False, transparent_cams=False, cam_size=0.05):
+    """
+    extract 3D_model (glb file) from a reconstructed scene
+    """
+    if scene is None:
+        return None
+    # post processes
+    if clean_depth:
+        scene = scene.clean_pointcloud()
+    if mask_sky:
+        scene = scene.mask_sky()
+    # get optimized values from scene
+    rgbimg = scene.imgs
+    focals = scene.get_focals().cpu()
+    cams2world = scene.get_im_poses().cpu()
+    # 3D pointcloud from depthmap, poses and intrinsics
+    pts3d = to_numpy(scene.get_pts3d())
+    scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
+    msk = to_numpy(scene.get_masks())
+    return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
+                                        transparent_cams=transparent_cams, cam_size=cam_size, silent=silent)
+def get_reconstructed_scene(outdir, model, device, silent, image_size, filelist, schedule, niter, min_conf_thr,
+                            as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
+                            scenegraph_type, winsize, refid):
+    """
+    from a list of images, run dust3r inference, global aligner.
+    then run get_3D_model_from_scene
+    """
+    imgs = load_images(filelist, size=image_size, verbose=not silent)
+    if len(imgs) == 1:
+        imgs = [imgs[0], copy.deepcopy(imgs[0])]
+        imgs[1]['idx'] = 1
+    if scenegraph_type == "swin":
+        scenegraph_type = scenegraph_type + "-" + str(winsize)
+    elif scenegraph_type == "oneref":
+        scenegraph_type = scenegraph_type + "-" + str(refid)
+    pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
+    output = inference(pairs, model, device, batch_size=1, verbose=not silent)
+    mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer
+    scene = global_aligner(output, device=device, mode=mode, verbose=not silent)
+    lr = 0.01
+    if mode == GlobalAlignerMode.PointCloudOptimizer:
+        loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr)
+    outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky,
+                                      clean_depth, transparent_cams, cam_size)
+    # also return rgb, depth and confidence imgs
+    # depth is normalized with the max value for all images
+    # we apply the jet colormap on the confidence maps
+    rgbimg = scene.imgs
+    depths = to_numpy(scene.get_depthmaps())
+    confs = to_numpy([c for c in scene.im_conf])
+    cmap = pl.get_cmap('jet')
+    depths_max = max([d.max() for d in depths])
+    depths = [d / depths_max for d in depths]
+    confs_max = max([d.max() for d in confs])
+    confs = [cmap(d / confs_max) for d in confs]
+    imgs = []
+    for i in range(len(rgbimg)):
+        imgs.append(rgbimg[i])
+        imgs.append(rgb(depths[i]))
+        imgs.append(rgb(confs[i]))
+    return scene, outfile, imgs
+def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type):
+    num_files = len(inputfiles) if inputfiles is not None else 1
+    max_winsize = max(1, math.ceil((num_files - 1) / 2))
+    if scenegraph_type == "swin":
+        winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+                                minimum=1, maximum=max_winsize, step=1, visible=True)
+        refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+                              maximum=num_files - 1, step=1, visible=False)
+    elif scenegraph_type == "oneref":
+        winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+                                minimum=1, maximum=max_winsize, step=1, visible=False)
+        refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+                              maximum=num_files - 1, step=1, visible=True)
+    else:
+        winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
+                                minimum=1, maximum=max_winsize, step=1, visible=False)
+        refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
+                              maximum=num_files - 1, step=1, visible=False)
+    return winsize, refid
+def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False):
+    recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, model, device, silent, image_size)
+    model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname, silent)
+    with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="DUSt3R Demo") as demo:
+        # scene state is save so that you can change conf_thr, cam_size... without rerunning the inference
+        scene = gradio.State(None)
+        gradio.HTML('<h2 style="text-align: center;">DUSt3R Demo</h2>')
+        with gradio.Column():
+            inputfiles = gradio.File(file_count="multiple")
+            with gradio.Row():
+                schedule = gradio.Dropdown(["linear", "cosine"],
+                                           value='linear', label="schedule", info="For global alignment!")
+                niter = gradio.Number(value=300, precision=0, minimum=0, maximum=5000,
+                                      label="num_iterations", info="For global alignment!")
+                scenegraph_type = gradio.Dropdown([("complete: all possible image pairs", "complete"),
+                                                   ("swin: sliding window", "swin"),
+                                                   ("oneref: match one image with all", "oneref")],
+                                                  value='complete', label="Scenegraph",
+                                                  info="Define how to make pairs",
+                                                  interactive=True)
+                winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
+                                        minimum=1, maximum=1, step=1, visible=False)
+                refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False)
+            run_btn = gradio.Button("Run")
+            with gradio.Row():
+                # adjust the confidence threshold
+                min_conf_thr = gradio.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1)
+                # adjust the camera size in the output pointcloud
+                cam_size = gradio.Slider(label="cam_size", value=0.05, minimum=0.001, maximum=0.1, step=0.001)
+            with gradio.Row():
+                as_pointcloud = gradio.Checkbox(value=False, label="As pointcloud")
+                # two post process implemented
+                mask_sky = gradio.Checkbox(value=False, label="Mask sky")
+                clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps")
+                transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras")
+            outmodel = gradio.Model3D()
+            outgallery = gradio.Gallery(label='rgb,depth,confidence', columns=3, height="100%")
+            # events
+            scenegraph_type.change(set_scenegraph_options,
+                                   inputs=[inputfiles, winsize, refid, scenegraph_type],
+                                   outputs=[winsize, refid])
+            inputfiles.change(set_scenegraph_options,
+                              inputs=[inputfiles, winsize, refid, scenegraph_type],
+                              outputs=[winsize, refid])
+            run_btn.click(fn=recon_fun,
+                          inputs=[inputfiles, schedule, niter, min_conf_thr, as_pointcloud,
+                                  mask_sky, clean_depth, transparent_cams, cam_size,
+                                  scenegraph_type, winsize, refid],
+                          outputs=[scene, outmodel, outgallery])
+            min_conf_thr.release(fn=model_from_scene_fun,
+                                 inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+                                         clean_depth, transparent_cams, cam_size],
+                                 outputs=outmodel)
+            cam_size.change(fn=model_from_scene_fun,
+                            inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+                                    clean_depth, transparent_cams, cam_size],
+                            outputs=outmodel)
+            as_pointcloud.change(fn=model_from_scene_fun,
+                                 inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+                                         clean_depth, transparent_cams, cam_size],
+                                 outputs=outmodel)
+            mask_sky.change(fn=model_from_scene_fun,
+                            inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+                                    clean_depth, transparent_cams, cam_size],
+                            outputs=outmodel)
+            clean_depth.change(fn=model_from_scene_fun,
+                               inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+                                       clean_depth, transparent_cams, cam_size],
+                               outputs=outmodel)
+            transparent_cams.change(model_from_scene_fun,
+                                    inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
+                                            clean_depth, transparent_cams, cam_size],
+                                    outputs=outmodel)
+    demo.launch(share=False, server_name=server_name, server_port=server_port)

demo.py CHANGED Viewed

@@ -3,328 +3,45 @@
 # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
 #
 # --------------------------------------------------------
-# sparse gradio demo functions
 # --------------------------------------------------------
-import math
-import gradio
 import os
-import numpy as np
-import functools
-import trimesh
-import copy
-from scipy.spatial.transform import Rotation
 import tempfile
-import shutil
-from sparse_ga import sparse_global_alignment
-from tsdf_optimizer import TSDFPostProcess
-import path_to_dust3r  # noqa
-from image_pairs import make_pairs
-from image import load_images
-from device import to_numpy
-from viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
-from demo import get_args_parser as dust3r_get_args_parser
 import matplotlib.pyplot as pl
-class SparseGAState():
-    def __init__(self, sparse_ga, should_delete=False, cache_dir=None, outfile_name=None):
-        self.sparse_ga = sparse_ga
-        self.cache_dir = cache_dir
-        self.outfile_name = outfile_name
-        self.should_delete = should_delete
-    def __del__(self):
-        if not self.should_delete:
-            return
-        if self.cache_dir is not None and os.path.isdir(self.cache_dir):
-            shutil.rmtree(self.cache_dir)
-        self.cache_dir = None
-        if self.outfile_name is not None and os.path.isfile(self.outfile_name):
-            os.remove(self.outfile_name)
-        self.outfile_name = None
-def get_args_parser():
-    parser = dust3r_get_args_parser()
-    parser.add_argument('--share', action='store_true')
-    parser.add_argument('--gradio_delete_cache', default=None, type=int,
-                        help='age/frequency at which gradio removes the file. If >0, matching cache is purged')
-    actions = parser._actions
-    for action in actions:
-        if action.dest == 'model_name':
-            action.choices = ["MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric"]
-    # change defaults
-    parser.prog = 'mast3r demo'
-    return parser
-def _convert_scene_output_to_glb(outfile, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
-                                 cam_color=None, as_pointcloud=False,
-                                 transparent_cams=False, silent=False):
-    assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
-    pts3d = to_numpy(pts3d)
-    imgs = to_numpy(imgs)
-    focals = to_numpy(focals)
-    cams2world = to_numpy(cams2world)
-    scene = trimesh.Scene()
-    # full pointcloud
-    if as_pointcloud:
-        pts = np.concatenate([p[m.ravel()] for p, m in zip(pts3d, mask)]).reshape(-1, 3)
-        col = np.concatenate([p[m] for p, m in zip(imgs, mask)]).reshape(-1, 3)
-        valid_msk = np.isfinite(pts.sum(axis=1))
-        pct = trimesh.PointCloud(pts[valid_msk], colors=col[valid_msk])
-        scene.add_geometry(pct)
-    else:
-        meshes = []
-        for i in range(len(imgs)):
-            pts3d_i = pts3d[i].reshape(imgs[i].shape)
-            msk_i = mask[i] & np.isfinite(pts3d_i.sum(axis=-1))
-            meshes.append(pts3d_to_trimesh(imgs[i], pts3d_i, msk_i))
-        mesh = trimesh.Trimesh(**cat_meshes(meshes))
-        scene.add_geometry(mesh)
-    # add each camera
-    for i, pose_c2w in enumerate(cams2world):
-        if isinstance(cam_color, list):
-            camera_edge_color = cam_color[i]
-        else:
-            camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
-        add_scene_cam(scene, pose_c2w, camera_edge_color,
-                      None if transparent_cams else imgs[i], focals[i],
-                      imsize=imgs[i].shape[1::-1], screen_width=cam_size)
-    rot = np.eye(4)
-    rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
-    scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
-    if not silent:
-        print('(exporting 3D scene to', outfile, ')')
-    scene.export(file_obj=outfile)
-    return outfile
-def get_3D_model_from_scene(silent, scene_state, min_conf_thr=2, as_pointcloud=False, mask_sky=False,
-                            clean_depth=False, transparent_cams=False, cam_size=0.05, TSDF_thresh=0):
-    """
-    extract 3D_model (glb file) from a reconstructed scene
-    """
-    if scene_state is None:
-        return None
-    outfile = scene_state.outfile_name
-    if outfile is None:
-        return None
-    # get optimized values from scene
-    scene = scene_state.sparse_ga
-    rgbimg = scene.imgs
-    focals = scene.get_focals().cpu()
-    cams2world = scene.get_im_poses().cpu()
-    # 3D pointcloud from depthmap, poses and intrinsics
-    if TSDF_thresh > 0:
-        tsdf = TSDFPostProcess(scene, TSDF_thresh=TSDF_thresh)
-        pts3d, _, confs = to_numpy(tsdf.get_dense_pts3d(clean_depth=clean_depth))
     else:
-        pts3d, _, confs = to_numpy(scene.get_dense_pts3d(clean_depth=clean_depth))
-    msk = to_numpy([c > min_conf_thr for c in confs])
-    return _convert_scene_output_to_glb(outfile, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
-                                        transparent_cams=transparent_cams, cam_size=cam_size, silent=silent)
-def get_reconstructed_scene(outdir, gradio_delete_cache, model, device, silent, image_size, current_scene_state,
-                            filelist, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr,
-                            as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size, scenegraph_type, winsize,
-                            win_cyclic, refid, TSDF_thresh, shared_intrinsics, **kw):
-    """
-    from a list of images, run mast3r inference, sparse global aligner.
-    then run get_3D_model_from_scene
-    """
-    imgs = load_images(filelist, size=image_size, verbose=not silent)
-    if len(imgs) == 1:
-        imgs = [imgs[0], copy.deepcopy(imgs[0])]
-        imgs[1]['idx'] = 1
-        filelist = [filelist[0], filelist[0] + '_2']
-    scene_graph_params = [scenegraph_type]
-    if scenegraph_type in ["swin", "logwin"]:
-        scene_graph_params.append(str(winsize))
-    elif scenegraph_type == "oneref":
-        scene_graph_params.append(str(refid))
-    if scenegraph_type in ["swin", "logwin"] and not win_cyclic:
-        scene_graph_params.append('noncyclic')
-    scene_graph = '-'.join(scene_graph_params)
-    pairs = make_pairs(imgs, scene_graph=scene_graph, prefilter=None, symmetrize=True)
-    if optim_level == 'coarse':
-        niter2 = 0
-    # Sparse GA (forward mast3r -> matching -> 3D optim -> 2D refinement -> triangulation)
-    if current_scene_state is not None and \
-        not current_scene_state.should_delete and \
-            current_scene_state.cache_dir is not None:
-        cache_dir = current_scene_state.cache_dir
-    elif gradio_delete_cache:
-        cache_dir = tempfile.mkdtemp(suffix='_cache', dir=outdir)
     else:
-        cache_dir = os.path.join(outdir, 'cache')
-    scene = sparse_global_alignment(filelist, pairs, cache_dir,
-                                    model, lr1=lr1, niter1=niter1, lr2=lr2, niter2=niter2, device=device,
-                                    opt_depth='depth' in optim_level, shared_intrinsics=shared_intrinsics,
-                                    matching_conf_thr=matching_conf_thr, **kw)
-    if current_scene_state is not None and \
-        not current_scene_state.should_delete and \
-            current_scene_state.outfile_name is not None:
-        outfile_name = current_scene_state.outfile_name
-    else:
-        outfile_name = tempfile.mktemp(suffix='_scene.glb', dir=outdir)
-    scene_state = SparseGAState(scene, gradio_delete_cache, cache_dir, outfile_name)
-    outfile = get_3D_model_from_scene(silent, scene_state, min_conf_thr, as_pointcloud, mask_sky,
-                                      clean_depth, transparent_cams, cam_size, TSDF_thresh)
-    return scene_state, outfile
-def set_scenegraph_options(inputfiles, win_cyclic, refid, scenegraph_type):
-    num_files = len(inputfiles) if inputfiles is not None else 1
-    show_win_controls = scenegraph_type in ["swin", "logwin"]
-    show_winsize = scenegraph_type in ["swin", "logwin"]
-    show_cyclic = scenegraph_type in ["swin", "logwin"]
-    max_winsize, min_winsize = 1, 1
-    if scenegraph_type == "swin":
-        if win_cyclic:
-            max_winsize = max(1, math.ceil((num_files - 1) / 2))
-        else:
-            max_winsize = num_files - 1
-    elif scenegraph_type == "logwin":
-        if win_cyclic:
-            half_size = math.ceil((num_files - 1) / 2)
-            max_winsize = max(1, math.ceil(math.log(half_size, 2)))
-        else:
-            max_winsize = max(1, math.ceil(math.log(num_files, 2)))
-    winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize,
-                            minimum=min_winsize, maximum=max_winsize, step=1, visible=show_winsize)
-    win_cyclic = gradio.Checkbox(value=win_cyclic, label="Cyclic sequence", visible=show_cyclic)
-    win_col = gradio.Column(visible=show_win_controls)
-    refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0,
-                          maximum=num_files - 1, step=1, visible=scenegraph_type == 'oneref')
-    return win_col, winsize, win_cyclic, refid
-def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False,
-              share=False, gradio_delete_cache=False):
-    if not silent:
-        print('Outputing stuff in', tmpdirname)
-    recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, gradio_delete_cache, model, device,
-                                  silent, image_size)
-    model_from_scene_fun = functools.partial(get_3D_model_from_scene, silent)
-    def get_context(delete_cache):
-        css = """.gradio-container {margin: 0 !important; min-width: 100%};"""
-        title = "MASt3R Demo"
-        if delete_cache:
-            return gradio.Blocks(css=css, title=title, delete_cache=(delete_cache, delete_cache))
-        else:
-            return gradio.Blocks(css=css, title="MASt3R Demo")  # for compatibility with older versions
-    with get_context(gradio_delete_cache) as demo:
-        # scene state is save so that you can change conf_thr, cam_size... without rerunning the inference
-        scene = gradio.State(None)
-        gradio.HTML('<h2 style="text-align: center;">MASt3R Demo</h2>')
-        with gradio.Column():
-            inputfiles = gradio.File(file_count="multiple")
-            with gradio.Row():
-                with gradio.Column():
-                    with gradio.Row():
-                        lr1 = gradio.Slider(label="Coarse LR", value=0.07, minimum=0.01, maximum=0.2, step=0.01)
-                        niter1 = gradio.Number(value=500, precision=0, minimum=0, maximum=10_000,
-                                               label="num_iterations", info="For coarse alignment!")
-                        lr2 = gradio.Slider(label="Fine LR", value=0.014, minimum=0.005, maximum=0.05, step=0.001)
-                        niter2 = gradio.Number(value=200, precision=0, minimum=0, maximum=100_000,
-                                               label="num_iterations", info="For refinement!")
-                        optim_level = gradio.Dropdown(["coarse", "refine", "refine+depth"],
-                                                      value='refine', label="OptLevel",
-                                                      info="Optimization level")
-                    with gradio.Row():
-                        matching_conf_thr = gradio.Slider(label="Matching Confidence Thr", value=5.,
-                                                          minimum=0., maximum=30., step=0.1,
-                                                          info="Before Fallback to Regr3D!")
-                        shared_intrinsics = gradio.Checkbox(value=False, label="Shared intrinsics",
-                                                            info="Only optimize one set of intrinsics for all views")
-                        scenegraph_type = gradio.Dropdown([("complete: all possible image pairs", "complete"),
-                                                           ("swin: sliding window", "swin"),
-                                                           ("logwin: sliding window with long range", "logwin"),
-                                                           ("oneref: match one image with all", "oneref")],
-                                                          value='complete', label="Scenegraph",
-                                                          info="Define how to make pairs",
-                                                          interactive=True)
-                        with gradio.Column(visible=False) as win_col:
-                            winsize = gradio.Slider(label="Scene Graph: Window Size", value=1,
-                                                    minimum=1, maximum=1, step=1)
-                            win_cyclic = gradio.Checkbox(value=False, label="Cyclic sequence")
-                        refid = gradio.Slider(label="Scene Graph: Id", value=0,
-                                              minimum=0, maximum=0, step=1, visible=False)
-            run_btn = gradio.Button("Run")
-            with gradio.Row():
-                # adjust the confidence threshold
-                min_conf_thr = gradio.Slider(label="min_conf_thr", value=1.5, minimum=0.0, maximum=10, step=0.1)
-                # adjust the camera size in the output pointcloud
-                cam_size = gradio.Slider(label="cam_size", value=0.2, minimum=0.001, maximum=1.0, step=0.001)
-                TSDF_thresh = gradio.Slider(label="TSDF Threshold", value=0., minimum=0., maximum=1., step=0.01)
-            with gradio.Row():
-                as_pointcloud = gradio.Checkbox(value=True, label="As pointcloud")
-                # two post process implemented
-                mask_sky = gradio.Checkbox(value=False, label="Mask sky")
-                clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps")
-                transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras")
-            outmodel = gradio.Model3D()
-            # events
-            scenegraph_type.change(set_scenegraph_options,
-                                   inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
-                                   outputs=[win_col, winsize, win_cyclic, refid])
-            inputfiles.change(set_scenegraph_options,
-                              inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
-                              outputs=[win_col, winsize, win_cyclic, refid])
-            win_cyclic.change(set_scenegraph_options,
-                              inputs=[inputfiles, win_cyclic, refid, scenegraph_type],
-                              outputs=[win_col, winsize, win_cyclic, refid])
-            run_btn.click(fn=recon_fun,
-                          inputs=[scene, inputfiles, optim_level, lr1, niter1, lr2, niter2, min_conf_thr, matching_conf_thr,
-                                  as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
-                                  scenegraph_type, winsize, win_cyclic, refid, TSDF_thresh, shared_intrinsics],
-                          outputs=[scene, outmodel])
-            min_conf_thr.release(fn=model_from_scene_fun,
-                                 inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
-                                         clean_depth, transparent_cams, cam_size, TSDF_thresh],
-                                 outputs=outmodel)
-            cam_size.change(fn=model_from_scene_fun,
-                            inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
-                                    clean_depth, transparent_cams, cam_size, TSDF_thresh],
-                            outputs=outmodel)
-            TSDF_thresh.change(fn=model_from_scene_fun,
-                               inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
-                                       clean_depth, transparent_cams, cam_size, TSDF_thresh],
-                               outputs=outmodel)
-            as_pointcloud.change(fn=model_from_scene_fun,
-                                 inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
-                                         clean_depth, transparent_cams, cam_size, TSDF_thresh],
-                                 outputs=outmodel)
-            mask_sky.change(fn=model_from_scene_fun,
-                            inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
-                                    clean_depth, transparent_cams, cam_size, TSDF_thresh],
-                            outputs=outmodel)
-            clean_depth.change(fn=model_from_scene_fun,
-                               inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
-                                       clean_depth, transparent_cams, cam_size, TSDF_thresh],
-                               outputs=outmodel)
-            transparent_cams.change(model_from_scene_fun,
-                                    inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
-                                            clean_depth, transparent_cams, cam_size, TSDF_thresh],
-                                    outputs=outmodel)
-    demo.launch(share=share, server_name=server_name, server_port=server_port)

 # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
 #
 # --------------------------------------------------------
+# gradio demo executable
 # --------------------------------------------------------
 import os
+import torch
 import tempfile
+from contextlib import nullcontext
+from mast3r.demo import get_args_parser, main_demo
+from mast3r.model import AsymmetricMASt3R
+from mast3r.utils.misc import hash_md5
 import matplotlib.pyplot as pl
# Switch matplotlib to interactive mode.
pl.ion()
# for gpu >= Ampere and pytorch >= 1.12
torch.backends.cuda.matmul.allow_tf32 = True

if __name__ == '__main__':
    args = get_args_parser().parse_args()

    # An explicit --server_name wins; otherwise --local_network selects the bind address.
    server_name = args.server_name
    if server_name is None:
        server_name = '0.0.0.0' if args.local_network else '127.0.0.1'

    # Explicit weights take precedence over the named "naver/<model_name>" checkpoint.
    weights_path = args.weights if args.weights is not None else "naver/" + args.model_name
    model = AsymmetricMASt3R.from_pretrained(weights_path).to(args.device)
    chkpt_tag = hash_md5(weights_path)

    # Use a throw-away temporary directory unless the user supplied one.
    if args.tmp_dir is None:
        tmp_context = tempfile.TemporaryDirectory(suffix='_mast3r_gradio_demo')
    else:
        tmp_context = nullcontext(args.tmp_dir)

    with tmp_context as tmpdirname:
        # One cache sub-directory per checkpoint tag.
        cache_path = os.path.join(tmpdirname, chkpt_tag)
        os.makedirs(cache_path, exist_ok=True)
        main_demo(cache_path, model, args.device, args.image_size, server_name, args.server_port,
                  silent=args.silent, share=args.share, gradio_delete_cache=args.gradio_delete_cache)

device.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# utility functions for DUSt3R
+# --------------------------------------------------------
+import numpy as np
+import torch
def todevice(batch, device, callback=None, non_blocking=False):
    ''' Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy).

    batch: list, tuple, dict of tensors or other things
    device: pytorch device or 'numpy'
    callback: function that would be called on every sub-elements.
    non_blocking: forwarded to `Tensor.to` for every tensor that is moved.

    Returns a structure of the same container types as `batch`. With
    device == 'numpy', torch tensors become numpy arrays and everything else is
    left untouched; otherwise numpy arrays are wrapped as tensors and every
    tensor is moved to `device`. `None` and non-tensor leaves pass through.
    '''
    if callback:
        batch = callback(batch)

    # Recurse into containers, forwarding `callback` and `non_blocking` so that
    # nested elements are treated exactly like top-level ones.
    if isinstance(batch, dict):
        return {k: todevice(v, device, callback=callback, non_blocking=non_blocking)
                for k, v in batch.items()}

    if isinstance(batch, (tuple, list)):
        return type(batch)(todevice(x, device, callback=callback, non_blocking=non_blocking)
                           for x in batch)

    x = batch
    if device == 'numpy':
        if isinstance(x, torch.Tensor):
            x = x.detach().cpu().numpy()
    elif x is not None:
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x)
        if torch.is_tensor(x):
            x = x.to(device, non_blocking=non_blocking)
    return x
to_device = todevice  # alias


def to_numpy(x):
    """Run `todevice` with the 'numpy' target on `x`."""
    return todevice(x, 'numpy')


def to_cpu(x):
    """Run `todevice` with the 'cpu' target on `x`."""
    return todevice(x, 'cpu')


def to_cuda(x):
    """Run `todevice` with the 'cuda' target on `x`."""
    return todevice(x, 'cuda')
def collate_with_cat(whatever, lists=False):
    """Recursively merge a list/tuple (or dict of them) of per-sample outputs.

    The type of the first element decides how a sequence is merged:
      * None                  -> None
      * bool/float/int/str    -> the sequence is returned unchanged
      * tuple                 -> element-wise (transposed) collation
      * dict                  -> key-wise collation
      * tensor / numpy array  -> `torch.cat`, or a flat list when `lists` is true
      * anything else         -> the elements are chained with `+`
    Inputs that are neither dict, tuple nor list yield None.
    """
    if isinstance(whatever, dict):
        collated = {}
        for key, vals in whatever.items():
            collated[key] = collate_with_cat(vals, lists=lists)
        return collated

    if not isinstance(whatever, (tuple, list)):
        return None

    if len(whatever) == 0:
        return whatever

    container = type(whatever)
    first = whatever[0]

    if first is None:
        return None
    if isinstance(first, (bool, float, int, str)):
        return whatever
    if isinstance(first, tuple):
        return container(collate_with_cat(column, lists=lists) for column in zip(*whatever))
    if isinstance(first, dict):
        return {key: collate_with_cat([entry[key] for entry in whatever], lists=lists)
                for key in first}
    if isinstance(first, torch.Tensor):
        if lists:
            return listify(whatever)
        return torch.cat(whatever)
    if isinstance(first, np.ndarray):
        if lists:
            return listify(whatever)
        return torch.cat([torch.from_numpy(arr) for arr in whatever])

    # otherwise, we just chain lists
    return sum(whatever, container())
def listify(elems):
    """Flatten one nesting level: return the items of every element of `elems` in a single list."""
    flat = []
    for group in elems:
        for item in group:
            flat.append(item)
    return flat

image.py ADDED Viewed

	@@ -0,0 +1,126 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# utility functions about images (loading/converting...)
+# --------------------------------------------------------
+import os
+import torch
+import numpy as np
+import PIL.Image
+from PIL.ImageOps import exif_transpose
+import torchvision.transforms as tvf
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2  # noqa
+try:
+    from pillow_heif import register_heif_opener  # noqa
+    register_heif_opener()
+    heif_support_enabled = True
+except ImportError:
+    heif_support_enabled = False
+ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
def img_to_arr(img):
    """Return `img` unchanged unless it is a string, in which case it is read with `imread_cv2`."""
    return imread_cv2(img) if isinstance(img, str) else img
def imread_cv2(path, options=cv2.IMREAD_COLOR):
    """ Open an image or a depthmap with opencv-python.

    Paths ending in '.exr' or 'EXR' are always read with cv2.IMREAD_ANYDEPTH,
    whatever `options` says. Three-dimensional results are converted from BGR
    to RGB before being returned.

    Raises IOError when OpenCV returns no image.
    """
    is_exr = path.endswith(('.exr', 'EXR'))
    if is_exr:
        options = cv2.IMREAD_ANYDEPTH

    img = cv2.imread(path, options)
    if img is None:
        raise IOError(f'Could not load image={path} with {options=}')

    if img.ndim != 3:
        return img
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
def rgb(ftensor, true_shape=None):
    """Turn an image array or tensor into a float image clipped to [0, 1].

    Lists are converted element by element. Torch tensors are detached and moved
    to numpy. Arrays with 3 entries on the leading axis (3-D) or on axis 1 (4-D)
    are transposed so that this axis comes last. If `true_shape` = (H, W) is
    given, the array is cropped with `[:H, :W]`. uint8 data is divided by 255;
    any other dtype is mapped with `x * 0.5 + 0.5`.
    """
    if isinstance(ftensor, list):
        return [rgb(item, true_shape=true_shape) for item in ftensor]

    arr = ftensor.detach().cpu().numpy() if isinstance(ftensor, torch.Tensor) else ftensor

    # move a leading size-3 axis to the end
    if arr.ndim == 3 and arr.shape[0] == 3:
        arr = np.transpose(arr, (1, 2, 0))
    elif arr.ndim == 4 and arr.shape[1] == 3:
        arr = np.transpose(arr, (0, 2, 3, 1))

    if true_shape is not None:
        height, width = true_shape
        arr = arr[:height, :width]

    if arr.dtype == np.uint8:
        img = np.float32(arr) / 255
    else:
        img = (arr * 0.5) + 0.5
    return img.clip(min=0, max=1)
def _resize_pil_image(img, long_edge_size):
    """Resize `img` so that its longest side becomes `long_edge_size`, keeping the aspect ratio.

    LANCZOS resampling is used when the image gets smaller, BICUBIC otherwise.
    """
    longest = max(img.size)
    resample = PIL.Image.LANCZOS if longest > long_edge_size else PIL.Image.BICUBIC
    target = tuple(int(round(side*long_edge_size/longest)) for side in img.size)
    return img.resize(target, resample)
def load_images(folder_or_list, size, square_ok=False, verbose=True):
    """ open and convert all images in a list or folder to proper input format for DUSt3R

    folder_or_list: a directory path (its sorted listing is used) or a list of image paths.
    size: 224 resizes the short side to 224 and center-crops a square; any other
          value resizes the long side to `size` and center-crops so that both
          half-sizes are multiples of 8.
    square_ok: when false, an image that is square after resizing is cropped to 4:3.
    verbose: print progress information.

    Returns a list of dicts with keys 'img' (normalized tensor with a leading
    batch axis), 'true_shape' (int32 array [[H, W]]), 'idx' and 'instance'.
    Files whose extension is not supported are skipped.

    Raises ValueError for an unsupported `folder_or_list` type and
    AssertionError when no image could be loaded.
    """
    if isinstance(folder_or_list, str):
        if verbose:
            print(f'>> Loading images from {folder_or_list}')
        root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))

    elif isinstance(folder_or_list, list):
        if verbose:
            print(f'>> Loading a list of {len(folder_or_list)} images')
        root, folder_content = '', folder_or_list

    else:
        raise ValueError(f'bad {folder_or_list=} ({type(folder_or_list)})')

    supported_images_extensions = ['.jpg', '.jpeg', '.png']
    if heif_support_enabled:
        supported_images_extensions += ['.heic', '.heif']
    supported_images_extensions = tuple(supported_images_extensions)

    imgs = []
    for path in folder_content:
        if not path.lower().endswith(supported_images_extensions):
            continue
        # Close the file handle as soon as the pixels are decoded: the transposed
        # and converted image is an in-memory copy, so it outlives the `with`.
        with PIL.Image.open(os.path.join(root, path)) as raw:
            img = exif_transpose(raw).convert('RGB')
        W1, H1 = img.size
        if size == 224:
            # resize short side to 224 (then crop)
            img = _resize_pil_image(img, round(size * max(W1/H1, H1/W1)))
        else:
            # resize long side to 512
            img = _resize_pil_image(img, size)
        W, H = img.size
        cx, cy = W//2, H//2
        if size == 224:
            half = min(cx, cy)
            img = img.crop((cx-half, cy-half, cx+half, cy+half))
        else:
            # half-sizes rounded down to multiples of 8
            halfw, halfh = ((2*cx)//16)*8, ((2*cy)//16)*8
            if not (square_ok) and W == H:
                # square results are not wanted: crop to a 4:3 aspect ratio
                halfh = 3*halfw/4
            img = img.crop((cx-halfw, cy-halfh, cx+halfw, cy+halfh))

        W2, H2 = img.size
        if verbose:
            print(f' - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}')
        imgs.append(dict(img=ImgNorm(img)[None], true_shape=np.int32(
            [img.size[::-1]]), idx=len(imgs), instance=str(len(imgs))))

    assert imgs, 'no images foud at '+root
    if verbose:
        print(f' (Found {len(imgs)} images)')
    return imgs

image_pairs.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# utilities needed to load image pairs
+# --------------------------------------------------------
+import numpy as np
+import torch
def _parse_winsize(scene_graph, default=3):
    """Return the integer following the first '-' in `scene_graph`, or `default` if absent/invalid."""
    try:
        return int(scene_graph.split('-')[1])
    except (IndexError, ValueError):
        return default


def make_pairs(imgs, scene_graph='complete', prefilter=None, symmetrize=True):
    """Build the list of image pairs to process.

    imgs: sequence of images (any objects; with `prefilter`, each needs an 'idx' entry).
    scene_graph: pairing strategy
        'complete'                  every unordered pair once
        'swin[-W][-noncyclic]'      each image with its W following images (W defaults to 3),
                                    wrapping around unless the name ends with 'noncyclic'
        'logwin[-W][-noncyclic]'    each image with neighbours at offsets 1, 2, 4, ..., 2**(W-1)
        'oneref[-K]'                image K (default 0) with every other image
        Any other value produces no pair.
    prefilter: None, 'seq<N>' or 'cyc<N>' to drop pairs more than N frames apart
        (cyclic distance for 'cyc').
    symmetrize: also add every pair in reversed order.

    Returns a list of (img1, img2) tuples. An image is never paired with itself.
    """
    pairs = []
    n_imgs = len(imgs)
    if scene_graph == 'complete':  # complete graph
        for i in range(n_imgs):
            for j in range(i):
                pairs.append((imgs[i], imgs[j]))
    elif scene_graph.startswith('swin'):
        iscyclic = not scene_graph.endswith('noncyclic')
        winsize = _parse_winsize(scene_graph)
        pairsid = set()
        for i in range(n_imgs):
            for j in range(1, winsize + 1):
                idx = (i + j)
                if iscyclic:
                    idx = idx % n_imgs  # explicit loop closure
                # skip out-of-range neighbours, and self-pairs that appear when the
                # cyclic window wraps all the way around (winsize >= number of images)
                if idx >= n_imgs or idx == i:
                    continue
                pairsid.add((i, idx) if i < idx else (idx, i))
        for i, j in pairsid:
            pairs.append((imgs[i], imgs[j]))
    elif scene_graph.startswith('logwin'):
        iscyclic = not scene_graph.endswith('noncyclic')
        winsize = _parse_winsize(scene_graph)
        offsets = [2**i for i in range(winsize)]
        pairsid = set()
        for i in range(n_imgs):
            ixs_l = [i - off for off in offsets]
            ixs_r = [i + off for off in offsets]
            for j in ixs_l + ixs_r:
                if iscyclic:
                    j = j % n_imgs  # Explicit loop closure
                if j < 0 or j >= n_imgs or j == i:
                    continue
                pairsid.add((i, j) if i < j else (j, i))
        for i, j in pairsid:
            pairs.append((imgs[i], imgs[j]))
    elif scene_graph.startswith('oneref'):
        refid = int(scene_graph.split('-')[1]) if '-' in scene_graph else 0
        for j in range(n_imgs):
            if j != refid:
                pairs.append((imgs[refid], imgs[j]))

    if symmetrize:
        pairs += [(img2, img1) for img1, img2 in pairs]

    # now, remove edges
    if isinstance(prefilter, str) and prefilter.startswith('seq'):
        pairs = filter_pairs_seq(pairs, int(prefilter[3:]))

    if isinstance(prefilter, str) and prefilter.startswith('cyc'):
        pairs = filter_pairs_seq(pairs, int(prefilter[3:]), cyclic=True)

    return pairs
def sel(x, kept):
    """Keep only the entries of `x` indexed by `kept`.

    Dicts are processed value by value, tensors and arrays are indexed with
    `kept`, and tuples/lists are rebuilt from the selected items. Any other
    type yields None.
    """
    if isinstance(x, dict):
        picked = {}
        for key, value in x.items():
            picked[key] = sel(value, kept)
        return picked
    if isinstance(x, (torch.Tensor, np.ndarray)):
        return x[kept]
    if isinstance(x, (tuple, list)):
        return type(x)(x[k] for k in kept)
    return None
+def _filter_edges_seq(edges, seq_dis_thr, cyclic=False):
+    # number of images
+    n = max(max(e) for e in edges) + 1
+    kept = []
+    for e, (i, j) in enumerate(edges):
+        dis = abs(i - j)
+        if cyclic:
+            dis = min(dis, abs(i + n - j), abs(i - n - j))
+        if dis <= seq_dis_thr:
+            kept.append(e)
+    return kept
def filter_pairs_seq(pairs, seq_dis_thr, cyclic=False):
    """Keep the image pairs whose 'idx' entries are at most `seq_dis_thr` apart (see `_filter_edges_seq`)."""
    edges = []
    for first, second in pairs:
        edges.append((first['idx'], second['idx']))
    kept_positions = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic)
    return [pairs[pos] for pos in kept_positions]
def filter_edges_seq(view1, view2, pred1, pred2, seq_dis_thr, cyclic=False):
    """Drop from views and predictions every edge whose two 'idx' values are more than `seq_dis_thr` apart.

    Returns the filtered (view1, view2, pred1, pred2), each reduced with `sel`.
    """
    edges = []
    for i, j in zip(view1['idx'], view2['idx']):
        edges.append((int(i), int(j)))
    kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic)
    print(f'>> Filtering edges more than {seq_dis_thr} frames apart: kept {len(kept)}/{len(edges)} edges')
    return tuple(sel(item, kept) for item in (view1, view2, pred1, pred2))

path_to_dust3r.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# dust3r submodule import
+# --------------------------------------------------------
+import sys
+import os.path as path
# Directory holding this file, and the dust3r checkout expected two levels above it.
HERE_PATH = path.normpath(path.dirname(__file__))
DUSt3R_REPO_PATH = path.normpath(path.join(HERE_PATH, '../../dust3r'))
DUSt3R_LIB_PATH = path.join(DUSt3R_REPO_PATH, 'dust3r')

# check the presence of models directory in repo to be sure its cloned
if not path.isdir(DUSt3R_LIB_PATH):
    raise ImportError(f"dust3r is not initialized, could not find: {DUSt3R_LIB_PATH}.\n "
                      "Did you forget to run 'git submodule update --init --recursive' ?")

# workaround for sibling import
sys.path.insert(0, DUSt3R_REPO_PATH)

viz.py ADDED Viewed

	@@ -0,0 +1,381 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# Visualization utilities using trimesh
+# --------------------------------------------------------
+import PIL.Image
+import numpy as np
+from scipy.spatial.transform import Rotation
+import torch
+from dust3r.utils.geometry import geotrf, get_med_dist_between_poses, depthmap_to_absolute_camera_coordinates
+from dust3r.utils.device import to_numpy
+from dust3r.utils.image import rgb, img_to_arr
+try:
+    import trimesh
+except ImportError:
+    print('/!\\ module trimesh is not installed, cannot visualize results /!\\')
def cat_3d(vecs):
    """Stack one array/tensor, or a collection of them, into a single (N, 3) numpy array."""
    if isinstance(vecs, (np.ndarray, torch.Tensor)):
        vecs = [vecs]
    flattened = []
    for chunk in to_numpy(vecs):
        flattened.append(chunk.reshape(-1, 3))
    return np.concatenate(flattened)
def show_raw_pointcloud(pts3d, colors, point_size=2):
    """Open a trimesh viewer on a colored point cloud built from `pts3d` and `colors`."""
    points = cat_3d(pts3d)
    point_colors = cat_3d(colors)
    cloud = trimesh.PointCloud(points, colors=point_colors)

    scene = trimesh.Scene()
    scene.add_geometry(cloud)
    scene.show(line_settings={'point_size': point_size})
def pts3d_to_trimesh(img, pts3d, valid=None):
    """Triangulate an (H, W, 3) point map into a mesh description.

    img:   (H, W, 3) colors, same shape as `pts3d`.
    pts3d: (H, W, 3) point coordinates; each pixel becomes one vertex.
    valid: optional (H, W) boolean array; faces touching an invalid pixel are dropped.

    Returns a dict with 'vertices' (H*W, 3), 'faces' (F, 3) and 'face_colors' (F, 3).
    Every triangle is emitted twice with opposite winding.
    """
    H, W, THREE = img.shape
    assert THREE == 3
    assert img.shape == pts3d.shape

    vertices = pts3d.reshape(-1, 3)

    # make squares: each pixel == 2 triangles
    grid = np.arange(len(vertices)).reshape(H, W)
    top_left = grid[:-1, :-1].ravel()
    top_right = grid[:-1, +1:].ravel()
    bottom_left = grid[+1:, :-1].ravel()
    bottom_right = grid[+1:, +1:].ravel()
    faces = np.concatenate((
        np.column_stack((top_left, top_right, bottom_left)),
        # same triangle, but backward (cheap solution to cancel face culling)
        np.column_stack((bottom_left, top_right, top_left)),
        np.column_stack((top_right, bottom_left, bottom_right)),
        # same triangle, but backward (cheap solution to cancel face culling)
        np.column_stack((bottom_right, bottom_left, top_right)),
    ), axis=0)

    # prepare triangle colors: the first triangle pair takes the top-left pixel
    # color, the second pair the bottom-right pixel color
    upper_colors = img[:-1, :-1].reshape(-1, 3)
    lower_colors = img[+1:, +1:].reshape(-1, 3)
    face_colors = np.concatenate((upper_colors, upper_colors, lower_colors, lower_colors), axis=0)

    # remove invalid faces
    if valid is not None:
        assert valid.shape == (H, W)
        keep = valid.ravel()[faces].all(axis=-1)
        faces = faces[keep]
        face_colors = face_colors[keep]

    assert len(faces) == len(face_colors)
    return dict(vertices=vertices, face_colors=face_colors, faces=faces)
def cat_meshes(meshes):
    """Merge several mesh dicts into one.

    meshes: non-empty sequence of dicts with 'vertices', 'faces' and 'face_colors'
            arrays (the format returned by `pts3d_to_trimesh`).

    Returns a dict with the concatenated 'vertices', 'face_colors' and 'faces',
    where the face indices of each mesh are shifted by the number of vertices
    that precede it. The input meshes are left unmodified.
    """
    vertices, faces, colors = zip(*[(m['vertices'], m['faces'], m['face_colors']) for m in meshes])

    # vertex offset of each mesh inside the merged vertex array
    n_vertices = np.cumsum([0]+[len(v) for v in vertices])
    # Shift out of place: an in-place `+=` would corrupt the caller's face arrays
    # and make a second call on the same meshes return wrong indices.
    shifted_faces = [np.asarray(f) + int(offset) for f, offset in zip(faces, n_vertices)]

    vertices = np.concatenate(vertices)
    colors = np.concatenate(colors)
    faces = np.concatenate(shifted_faces)
    return dict(vertices=vertices, face_colors=colors, faces=faces)
def show_duster_pairs(view1, view2, pred1, pred2):
    """Interactively display every predicted pair: both images and both confidence maps.

    For each pair the function prints a score (product of the two mean
    confidences), draws a 2x2 figure, then asks on stdin whether the joint
    point cloud should be opened in a viewer.
    """
    import matplotlib.pyplot as pl
    pl.ion()

    n_pairs = len(view1['instance'])
    for e in range(n_pairs):
        i, j = view1['idx'][e], view2['idx'][e]
        img1, img2 = rgb(view1['img'][e]), rgb(view2['img'][e])
        conf1 = pred1['conf'][e].squeeze()
        conf2 = pred2['conf'][e].squeeze()
        score = conf1.mean()*conf2.mean()
        print(f">> Showing pair #{e} {i}-{j} {score=:g}")

        # left column: images, right column: confidence maps
        pl.clf()
        for position, picture in ((221, img1), (223, img2)):
            pl.subplot(position).imshow(picture)
        for position, confidence in ((222, conf1), (224, conf2)):
            pl.subplot(position).imshow(confidence, vmin=1, vmax=30)

        pts1 = pred1['pts3d'][e]
        pts2 = pred2['pts3d_in_other_view'][e]
        pl.subplots_adjust(0, 0, 1, 1, 0, 0)

        answer = input('show pointcloud? (y/n) ')
        if answer == 'y':
            show_raw_pointcloud(cat(pts1, pts2), cat(img1, img2), point_size=5)
def auto_cam_size(im_poses):
    """Return a camera display size: one tenth of `get_med_dist_between_poses(im_poses)`."""
    reference_distance = get_med_dist_between_poses(im_poses)
    return 0.1 * reference_distance
class SceneViz:
    """Incrementally build a `trimesh.Scene` out of point clouds and camera frustums.

    All `add_*` methods return `self` so that calls can be chained;
    `show()` opens the viewer.
    """

    def __init__(self):
        self.scene = trimesh.Scene()

    def add_pointcloud(self, pts3d, color=(0, 0, 0), mask=None, denoise=False):
        """Add one or several point maps to the scene.

        pts3d: a single array/tensor, or a list of them, reshapeable to (-1, 3).
        color: a single 3-value tuple applied to all points, or per-point colors
               (an array matching `pts3d`, or a list of arrays).
        mask: optional boolean selection (one array, or a list matching `pts3d`).
        denoise: drop the points whose distance to the median point is at or
                 above the 0.99 quantile.
        """
        pts3d = to_numpy(pts3d)
        mask = to_numpy(mask)
        if not isinstance(pts3d, list):
            pts3d = [pts3d.reshape(-1, 3)]
            if mask is not None:
                mask = [mask.ravel()]
        if not isinstance(color, (tuple, list)):
            color = [color.reshape(-1, 3)]
        if mask is None:
            mask = [slice(None)] * len(pts3d)

        pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
        pct = trimesh.PointCloud(pts)

        if isinstance(color, (list, np.ndarray, torch.Tensor)):
            color = to_numpy(color)
            col = np.concatenate([p[m] for p, m in zip(color, mask)])
            assert col.shape == pts.shape, f'colors {col.shape} do not match points {pts.shape}'
            pct.visual.vertex_colors = uint8(col.reshape(-1, 3))
        else:
            assert len(color) == 3
            pct.visual.vertex_colors = np.broadcast_to(uint8(color), pts.shape)

        if denoise:
            # remove points which are noisy
            centroid = np.median(pct.vertices, axis=0)
            dist_to_centroid = np.linalg.norm(pct.vertices - centroid, axis=-1)
            dist_thr = np.quantile(dist_to_centroid, 0.99)
            valid = (dist_to_centroid < dist_thr)
            # new cleaned pointcloud (the keyword is `colors`; `color` would be ignored)
            pct = trimesh.PointCloud(pct.vertices[valid], colors=pct.visual.vertex_colors[valid])

        self.scene.add_geometry(pct)
        return self

    def add_rgbd(self, image, depth, intrinsics=None, cam2world=None, zfar=np.inf, mask=None):
        """Back-project a depth map and add it as a point cloud colored by `image`.

        image: image array, or a path string (loaded through `img_to_arr`).
        depth: depth map; pixels with depth >= `zfar` are discarded.
        intrinsics: 3x3 matrix; when None, a focal of max(H, W) and a centered
                    principal point are made up.
        cam2world: camera pose forwarded to `depthmap_to_absolute_camera_coordinates`.
        mask: optional extra boolean mask combined with the validity mask.
        """
        image = img_to_arr(image)
        # make up some intrinsics
        if intrinsics is None:
            H, W, THREE = image.shape
            focal = max(H, W)
            intrinsics = np.float32([[focal, 0, W/2], [0, focal, H/2], [0, 0, 1]])

        # compute 3d points
        pts3d, mask2 = depthmap_to_absolute_camera_coordinates(depth, intrinsics, cam2world)
        mask2 &= (depth < zfar)

        # combine with provided mask if any
        if mask is not None:
            mask2 &= mask

        return self.add_pointcloud(pts3d, image, mask=mask2)

    def add_camera(self, pose_c2w, focal=None, color=(0, 0, 0), image=None, imsize=None, cam_size=0.03):
        """Add one camera frustum (see `add_scene_cam`).

        `focal` may be a scalar or a 3x3 intrinsics matrix; in the latter case the
        focal is the geometric mean of its two diagonal focals, and `imsize`
        defaults to twice the principal point.
        """
        pose_c2w, focal, color, image = to_numpy((pose_c2w, focal, color, image))
        image = img_to_arr(image)
        if color is None:
            # a missing color (e.g. forwarded by add_cameras) falls back to the default
            color = (0, 0, 0)
        if isinstance(focal, np.ndarray) and focal.shape == (3, 3):
            intrinsics = focal
            focal = (intrinsics[0, 0] * intrinsics[1, 1]) ** 0.5
            if imsize is None:
                imsize = (2*intrinsics[0, 2], 2*intrinsics[1, 2])

        add_scene_cam(self.scene, pose_c2w, color, image, focal, imsize=imsize, screen_width=cam_size, marker=None)
        return self

    def add_cameras(self, poses, focals=None, images=None, imsizes=None, colors=None, **kw):
        """Add one camera per pose; the optional per-camera sequences are indexed like `poses`."""
        def get(arr, idx):
            return None if arr is None else arr[idx]

        for i, pose_c2w in enumerate(poses):
            self.add_camera(pose_c2w, get(focals, i), image=get(images, i), color=get(colors, i),
                            imsize=get(imsizes, i), **kw)
        return self

    def show(self, point_size=2):
        """Open the trimesh viewer on the current scene."""
        self.scene.show(line_settings={'point_size': point_size})
def show_raw_pointcloud_with_cams(imgs, pts3d, mask, focals, cams2world,
                                  point_size=2, cam_size=0.05, cam_color=None):
    """ Visualization of a pointcloud with cameras
        imgs = (N, H, W, 3) or N-size list of [(H,W,3), ...]
        pts3d = (N, H, W, 3) or N-size list of [(H,W,3), ...]
        focals = (N,) or N-size list of [focal, ...]
        cams2world = (N,4,4) or N-size list of [(4,4), ...]

        cam_color may be a list (one color per camera) or a single color; when
        falsy, colors are taken from CAM_COLORS in a cycle.
    """
    assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
    pts3d, imgs, focals, cams2world = (to_numpy(item) for item in (pts3d, imgs, focals, cams2world))

    # full pointcloud
    selected_pts = [p[m] for p, m in zip(pts3d, mask)]
    selected_col = [p[m] for p, m in zip(imgs, mask)]
    pts = np.concatenate(selected_pts)
    col = np.concatenate(selected_col)

    scene = trimesh.Scene()
    scene.add_geometry(trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3)))

    # add each camera
    per_camera_colors = isinstance(cam_color, list)
    for i, pose_c2w in enumerate(cams2world):
        if per_camera_colors:
            camera_edge_color = cam_color[i]
        else:
            camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
        camera_image = imgs[i] if i < len(imgs) else None
        add_scene_cam(scene, pose_c2w, camera_edge_color,
                      camera_image, focals[i], screen_width=cam_size)

    scene.show(line_settings={'point_size': point_size})
def add_scene_cam(scene, pose_c2w, edge_color, image=None, focal=None, imsize=None,
                  screen_width=0.03, marker=None):
    """Add a camera, drawn as a 4-sided cone, to a trimesh scene.

    scene: scene receiving the geometry.
    pose_c2w: 4x4 camera-to-world matrix.
    edge_color: RGB color written to the camera mesh faces.
    image: optional (H, W, 3) image textured onto the camera screen; non-uint8
           images are scaled by 255.
    focal: optional focal length (scalar, or an array whose first entry is used);
           defaults to 1.1 * min(H, W).
    imsize: optional (W, H), used only when `image` is None.
    screen_width: size of the drawn camera.
    marker: when 'o', an icosphere is added at the camera position.
    """
    # Reduce array-valued focals to a scalar up front. `reshape(-1)[0]` also
    # accepts 0-d arrays (where `focal[0]` raises IndexError), and doing it
    # first keeps H and W scalar when they are derived from the focal.
    if isinstance(focal, np.ndarray):
        focal = focal.reshape(-1)[0]

    if image is not None:
        image = np.asarray(image)
        H, W, THREE = image.shape
        assert THREE == 3
        if image.dtype != np.uint8:
            image = np.uint8(255*image)
    elif imsize is not None:
        W, H = imsize
    elif focal is not None:
        H = W = focal / 1.1
    else:
        H = W = 1

    if not focal:
        focal = min(H, W) * 1.1  # default value

    # create fake camera
    height = max(screen_width/10, focal * screen_width / H)
    width = screen_width * 0.5**0.5
    rot45 = np.eye(4)
    rot45[:3, :3] = Rotation.from_euler('z', np.deg2rad(45)).as_matrix()
    rot45[2, 3] = -height  # set the tip of the cone = optical center
    aspect_ratio = np.eye(4)
    aspect_ratio[0, 0] = W/H
    transform = pose_c2w @ OPENGL @ aspect_ratio @ rot45
    cam = trimesh.creation.cone(width, height, sections=4)  # , transform=transform)

    # this is the image
    if image is not None:
        vertices = geotrf(transform, cam.vertices[[4, 5, 1, 3]])
        faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]])
        img = trimesh.Trimesh(vertices=vertices, faces=faces)
        uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]])
        img.visual = trimesh.visual.TextureVisuals(uv_coords, image=PIL.Image.fromarray(image))
        scene.add_geometry(img)

    # this is the camera mesh: the cone vertices plus a 0.95-scaled copy and a
    # copy rotated by 2 degrees, used to draw thin pseudo-edges
    rot2 = np.eye(4)
    rot2[:3, :3] = Rotation.from_euler('z', np.deg2rad(2)).as_matrix()
    vertices = np.r_[cam.vertices, 0.95*cam.vertices, geotrf(rot2, cam.vertices)]
    vertices = geotrf(transform, vertices)
    faces = []
    for face in cam.faces:
        if 0 in face:
            continue
        a, b, c = face
        a2, b2, c2 = face + len(cam.vertices)
        a3, b3, c3 = face + 2*len(cam.vertices)

        # add 3 pseudo-edges
        faces.append((a, b, b2))
        faces.append((a, a2, c))
        faces.append((c2, b, c))

        faces.append((a, b, b3))
        faces.append((a, a3, c))
        faces.append((c3, b, c))

    # no culling
    faces += [(c, b, a) for a, b, c in faces]

    cam = trimesh.Trimesh(vertices=vertices, faces=faces)
    cam.visual.face_colors[:, :3] = edge_color
    scene.add_geometry(cam)

    if marker == 'o':
        marker = trimesh.creation.icosphere(3, radius=screen_width/4)
        marker.vertices += pose_c2w[:3, 3]
        marker.visual.face_colors[:, :3] = edge_color
        scene.add_geometry(marker)
def cat(a, b):
    """Reshape `a` and `b` to (-1, 3) and stack them into one numpy array."""
    rows_a = a.reshape(-1, 3)
    rows_b = b.reshape(-1, 3)
    return np.concatenate((rows_a, rows_b))
# 4x4 homogeneous matrix that negates the Y and Z axes.
OPENGL = np.diag([1, -1, -1, 1])

# RGB colors assigned to cameras in turn.
CAM_COLORS = [
    (255, 0, 0),
    (0, 0, 255),
    (0, 255, 0),
    (255, 0, 255),
    (255, 204, 0),
    (0, 204, 204),
    (128, 255, 255),
    (255, 128, 255),
    (255, 255, 128),
    (0, 0, 0),
    (128, 128, 128),
]
def uint8(colors):
    """Convert colors to a uint8 numpy array.

    colors: array-like. Floating-point input is multiplied by 255 first;
            other dtypes are converted as they are.

    The input is never modified. Raises AssertionError when a (scaled) value
    falls outside [0, 256). Empty input is returned as an empty uint8 array.
    """
    if not isinstance(colors, np.ndarray):
        colors = np.array(colors)
    if np.issubdtype(colors.dtype, np.floating):
        # out-of-place scaling: `colors *= 255` would overwrite the caller's array
        colors = colors * 255
    # min()/max() are undefined on empty arrays, which are trivially in range
    assert colors.size == 0 or (0 <= colors.min() and colors.max() < 256)
    return np.uint8(colors)
def segment_sky(image):
    """Heuristic sky segmentation based on HSV thresholds.

    image: (H, W, 3) image as array or tensor; floating-point input is clipped
           to [0, 1] and scaled to uint8.

    Returns a boolean torch tensor of shape (H, W). Pixels are first selected by
    a hue/value range plus three "luminous gray" rules, cleaned with a 5x5
    binary opening, and only the largest connected components are kept (every
    component larger than half the biggest one).
    """
    import cv2
    from scipy import ndimage

    # Convert to HSV
    # NOTE(review): the conversion flag is BGR2HSV; if callers pass RGB images the
    # channel swap changes which hues fall in the range below - confirm intent.
    image = to_numpy(image)
    if np.issubdtype(image.dtype, np.floating):
        image = np.uint8(255*image.clip(min=0, max=1))
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Define range for blue color and create mask
    lower_blue = np.array([0, 0, 100])
    upper_blue = np.array([30, 255, 255])
    mask = cv2.inRange(hsv, lower_blue, upper_blue).view(bool)

    # add luminous gray
    mask |= (hsv[:, :, 1] < 10) & (hsv[:, :, 2] > 150)
    mask |= (hsv[:, :, 1] < 30) & (hsv[:, :, 2] > 180)
    mask |= (hsv[:, :, 1] < 50) & (hsv[:, :, 2] > 220)

    # Morphological operations
    kernel = np.ones((5, 5), np.uint8)
    mask2 = ndimage.binary_opening(mask, structure=kernel)

    # keep only largest CC
    _, labels, stats, _ = cv2.connectedComponentsWithStats(mask2.view(np.uint8), connectivity=8)
    cc_sizes = stats[1:, cv2.CC_STAT_AREA]  # skip label 0 (background)
    order = cc_sizes.argsort()[::-1]  # bigger first
    i = 0
    selection = []
    while i < len(order) and cc_sizes[order[i]] > cc_sizes[order[0]] / 2:
        selection.append(1 + order[i])
        i += 1
    # np.isin keeps the shape of `labels` and replaces the deprecated np.in1d
    mask3 = np.isin(labels, selection)

    # Apply mask
    return torch.from_numpy(mask3)