kxhit committed
Commit a4e1ae5 · 1 Parent(s): 319afdb

space gpu for carvekit

app.py CHANGED
@@ -131,6 +131,7 @@ def sam_init():
     predictor = SamPredictor(sam)
     return predictor
 
+@spaces.GPU
 def create_carvekit_interface():
     # Check doc strings for more information
     interface = HiInterface(object_type="object",  # Can be "object" or "hairs-like".
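For context on the change above: `@spaces.GPU` is the Hugging Face ZeroGPU decorator (from the `spaces` package already imported in this repo) that attaches a GPU only while the decorated function runs. Below is a minimal sketch of the usage pattern; the function names and bodies are illustrative placeholders, not the Space's actual code.

```python
# Minimal sketch, assuming a Hugging Face ZeroGPU Space with the `spaces` package available.
import spaces
import torch

@spaces.GPU  # GPU is attached only for the duration of this call (default duration)
def init_background_remover():
    # illustrative stand-in for create_carvekit_interface(); the real code builds a CarveKit HiInterface
    return torch.nn.Identity().to("cuda")

@spaces.GPU(duration=120)  # longer allocation, matching the decorator used on run_eschernet in this repo
def heavy_inference(batch):
    # illustrative placeholder for the diffusion-pipeline call
    return batch
```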
app_bk.py DELETED
@@ -1,786 +0,0 @@
1
- import spaces
2
- import torch
3
- print("cuda is available: ", torch.cuda.is_available())
4
-
5
- import gradio as gr
6
- import os
7
- import shutil
8
- import rembg
9
- import numpy as np
10
- import math
11
- import open3d as o3d
12
- from PIL import Image
13
- import torchvision
14
- import trimesh
15
- from skimage.io import imsave
16
- import imageio
17
- import cv2
18
- import matplotlib.pyplot as pl
19
- pl.ion()
20
-
21
- CaPE_TYPE = "6DoF"
22
- device = 'cuda' #if torch.cuda.is_available() else 'cpu'
23
- weight_dtype = torch.float16
24
- torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
25
-
26
- # EscherNet
27
- # create angles in archimedean spiral with N steps
28
- def get_archimedean_spiral(sphere_radius, num_steps=250):
29
- # x-z plane, around upper y
30
- '''
31
- https://en.wikipedia.org/wiki/Spiral, section "Spherical spiral". c = a / pi
32
- '''
33
- a = 40
34
- r = sphere_radius
35
-
36
- translations = []
37
- angles = []
38
-
39
- # i = a / 2
40
- i = 0.01
41
- while i < a:
42
- theta = i / a * math.pi
43
- x = r * math.sin(theta) * math.cos(-i)
44
- z = r * math.sin(-theta + math.pi) * math.sin(-i)
45
- y = r * - math.cos(theta)
46
-
47
- # translations.append((x, y, z)) # origin
48
- translations.append((x, z, -y))
49
- angles.append([np.rad2deg(-i), np.rad2deg(theta)])
50
-
51
- # i += a / (2 * num_steps)
52
- i += a / (1 * num_steps)
53
-
54
- return np.array(translations), np.stack(angles)
55
-
56
- def look_at(origin, target, up):
57
- forward = (target - origin)
58
- forward = forward / np.linalg.norm(forward)
59
- right = np.cross(up, forward)
60
- right = right / np.linalg.norm(right)
61
- new_up = np.cross(forward, right)
62
- rotation_matrix = np.column_stack((right, new_up, -forward, target))
63
- matrix = np.row_stack((rotation_matrix, [0, 0, 0, 1]))
64
- return matrix
65
-
66
- import einops
67
- import sys
68
-
69
- sys.path.insert(0, "./6DoF/") # TODO change it when deploying
70
- # use the customized diffusers modules
71
- from diffusers import DDIMScheduler
72
- from dataset import get_pose
73
- from CN_encoder import CN_encoder
74
- from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline
75
- from segment_anything import sam_model_registry, SamPredictor
76
-
77
- # import rembg
78
- from carvekit.api.high import HiInterface
79
-
80
-
81
- pretrained_model_name_or_path = "kxic/EscherNet_demo"
82
- resolution = 256
83
- h,w = resolution,resolution
84
- guidance_scale = 3.0
85
- radius = 2.2
86
- bg_color = [1., 1., 1., 1.]
87
- image_transforms = torchvision.transforms.Compose(
88
- [
89
- torchvision.transforms.Resize((resolution, resolution)), # 256, 256
90
- torchvision.transforms.ToTensor(),
91
- torchvision.transforms.Normalize([0.5], [0.5])
92
- ]
93
- )
94
- xyzs_spiral, angles_spiral = get_archimedean_spiral(1.5, 200)
95
- # only half toop
96
- xyzs_spiral = xyzs_spiral[:100]
97
- angles_spiral = angles_spiral[:100]
98
-
99
- # Init pipeline
100
- scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler", revision=None)
101
- image_encoder = CN_encoder.from_pretrained(pretrained_model_name_or_path, subfolder="image_encoder", revision=None)
102
- pipeline = Zero1to3StableDiffusionPipeline.from_pretrained(
103
- pretrained_model_name_or_path,
104
- revision=None,
105
- scheduler=scheduler,
106
- image_encoder=None,
107
- safety_checker=None,
108
- feature_extractor=None,
109
- torch_dtype=weight_dtype,
110
- )
111
- pipeline.image_encoder = image_encoder.to(weight_dtype)
112
-
113
- pipeline.set_progress_bar_config(disable=False)
114
-
115
- pipeline = pipeline.to(device)
116
-
117
- # pipeline.enable_xformers_memory_efficient_attention()
118
- # enable vae slicing
119
- pipeline.enable_vae_slicing()
120
- # pipeline.enable_xformers_memory_efficient_attention()
121
-
122
-
123
- #### object segmentation
124
- def sam_init():
125
- sam_checkpoint = os.path.join("./sam_pt/sam_vit_h_4b8939.pth")
126
- if os.path.exists(sam_checkpoint) is False:
127
- os.system("wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth -P ./sam_pt/")
128
- model_type = "vit_h"
129
-
130
- sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device=device)
131
- predictor = SamPredictor(sam)
132
- return predictor
133
-
134
- def create_carvekit_interface():
135
- # Check doc strings for more information
136
- interface = HiInterface(object_type="object", # Can be "object" or "hairs-like".
137
- batch_size_seg=6,
138
- batch_size_matting=1,
139
- device="cpu",
140
- seg_mask_size=640, # Use 640 for Tracer B7 and 320 for U2Net
141
- matting_mask_size=2048,
142
- trimap_prob_threshold=231,
143
- trimap_dilation=30,
144
- trimap_erosion_iters=5,
145
- fp16=True)
146
-
147
- return interface
148
-
149
-
150
- # rembg_session = rembg.new_session()
151
- rembg_session = create_carvekit_interface()
152
- predictor = sam_init()
153
-
154
-
155
-
156
- @spaces.GPU(duration=120)
157
- def run_eschernet(eschernet_input_dict, sample_steps, sample_seed, nvs_num, nvs_mode):
158
- # set the random seed
159
- generator = torch.Generator(device=device).manual_seed(sample_seed)
160
- # generator = None
161
- T_out = nvs_num
162
- T_in = len(eschernet_input_dict['imgs'])
163
- ####### output pose
164
- # TODO choose T_out number of poses sequentially from the spiral
165
- xyzs = xyzs_spiral[::(len(xyzs_spiral) // T_out)]
166
- angles_out = angles_spiral[::(len(xyzs_spiral) // T_out)]
167
-
168
- ####### input's max radius for translation scaling
169
- radii = eschernet_input_dict['radii']
170
- max_t = np.max(radii)
171
- min_t = np.min(radii)
172
-
173
- ####### input pose
174
- pose_in = []
175
- for T_in_index in range(T_in):
176
- pose = get_pose(np.linalg.inv(eschernet_input_dict['poses'][T_in_index]))
177
- pose[1:3, :] *= -1 # coordinate system conversion
178
- pose[3, 3] *= 1. / max_t * radius # scale radius to [1.5, 2.2]
179
- pose_in.append(torch.from_numpy(pose))
180
-
181
- ####### input image
182
- img = eschernet_input_dict['imgs'] / 255.
183
- img[img[:, :, :, -1] == 0.] = bg_color
184
- # TODO batch image_transforms
185
- input_image = [image_transforms(Image.fromarray(np.uint8(im[:, :, :3] * 255.)).convert("RGB")) for im in img]
186
-
187
- ####### nvs pose
188
- pose_out = []
189
- for T_out_index in range(T_out):
190
- azimuth, polar = angles_out[T_out_index]
191
- if CaPE_TYPE == "4DoF":
192
- pose_out.append(torch.tensor([np.deg2rad(polar), np.deg2rad(azimuth), 0., 0.]))
193
- elif CaPE_TYPE == "6DoF":
194
- pose = look_at(origin=np.array([0, 0, 0]), target=xyzs[T_out_index], up=np.array([0, 0, 1]))
195
- pose = np.linalg.inv(pose)
196
- pose[2, :] *= -1
197
- pose_out.append(torch.from_numpy(get_pose(pose)))
198
-
199
-
200
-
201
- # [B, T, C, H, W]
202
- input_image = torch.stack(input_image, dim=0).to(device).to(weight_dtype).unsqueeze(0)
203
- # [B, T, 4]
204
- pose_in = np.stack(pose_in)
205
- pose_out = np.stack(pose_out)
206
-
207
- if CaPE_TYPE == "6DoF":
208
- pose_in_inv = np.linalg.inv(pose_in).transpose([0, 2, 1])
209
- pose_out_inv = np.linalg.inv(pose_out).transpose([0, 2, 1])
210
- pose_in_inv = torch.from_numpy(pose_in_inv).to(device).to(weight_dtype).unsqueeze(0)
211
- pose_out_inv = torch.from_numpy(pose_out_inv).to(device).to(weight_dtype).unsqueeze(0)
212
-
213
- pose_in = torch.from_numpy(pose_in).to(device).to(weight_dtype).unsqueeze(0)
214
- pose_out = torch.from_numpy(pose_out).to(device).to(weight_dtype).unsqueeze(0)
215
-
216
- input_image = einops.rearrange(input_image, "b t c h w -> (b t) c h w")
217
- assert T_in == input_image.shape[0]
218
- assert T_in == pose_in.shape[1]
219
- assert T_out == pose_out.shape[1]
220
-
221
- # run inference
222
- # pipeline.to(device)
223
- pipeline.enable_xformers_memory_efficient_attention()
224
- image = pipeline(input_imgs=input_image, prompt_imgs=input_image,
225
- poses=[[pose_out, pose_out_inv], [pose_in, pose_in_inv]],
226
- height=h, width=w, T_in=T_in, T_out=T_out,
227
- guidance_scale=guidance_scale, num_inference_steps=50, generator=generator,
228
- output_type="numpy").images
229
-
230
- # save output image
231
- output_dir = os.path.join(tmpdirname, "eschernet")
232
- if os.path.exists(output_dir):
233
- shutil.rmtree(output_dir)
234
- os.makedirs(output_dir, exist_ok=True)
235
- # # save to N imgs
236
- # for i in range(T_out):
237
- # imsave(os.path.join(output_dir, f'{i}.png'), (image[i] * 255).astype(np.uint8))
238
- # make a gif
239
- frames = [Image.fromarray((image[i] * 255).astype(np.uint8)) for i in range(T_out)]
240
- # frame_one = frames[0]
241
- # frame_one.save(os.path.join(output_dir, "output.gif"), format="GIF", append_images=frames,
242
- # save_all=True, duration=50, loop=1)
243
-
244
- # get a video
245
- video_path = os.path.join(output_dir, "output.mp4")
246
- imageio.mimwrite(video_path, np.stack(frames), fps=10, codec='h264')
247
-
248
-
249
- return video_path
250
-
251
- # TODO mesh it
252
- @spaces.GPU(duration=120)
253
- def make3d():
254
- pass
255
-
256
-
257
-
258
- ############################ Dust3r as Pose Estimation ############################
259
- from scipy.spatial.transform import Rotation
260
- import copy
261
-
262
- from dust3r.inference import inference
263
- from dust3r.model import AsymmetricCroCo3DStereo
264
- from dust3r.image_pairs import make_pairs
265
- from dust3r.utils.image import load_images, rgb
266
- from dust3r.utils.device import to_numpy
267
- from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
268
- from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
269
- import math
270
-
271
- @spaces.GPU(duration=120)
272
- def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
273
- cam_color=None, as_pointcloud=False,
274
- transparent_cams=False, silent=False, same_focals=False):
275
- assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world)
276
- if not same_focals:
277
- assert (len(cams2world) == len(focals))
278
- pts3d = to_numpy(pts3d)
279
- imgs = to_numpy(imgs)
280
- focals = to_numpy(focals)
281
- cams2world = to_numpy(cams2world)
282
-
283
- scene = trimesh.Scene()
284
-
285
- # add axes
286
- scene.add_geometry(trimesh.creation.axis(axis_length=0.5, axis_radius=0.001))
287
-
288
- # full pointcloud
289
- if as_pointcloud:
290
- pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
291
- col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
292
- pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
293
- scene.add_geometry(pct)
294
- else:
295
- meshes = []
296
- for i in range(len(imgs)):
297
- meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i]))
298
- mesh = trimesh.Trimesh(**cat_meshes(meshes))
299
- scene.add_geometry(mesh)
300
-
301
- # add each camera
302
- for i, pose_c2w in enumerate(cams2world):
303
- if isinstance(cam_color, list):
304
- camera_edge_color = cam_color[i]
305
- else:
306
- camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
307
- if same_focals:
308
- focal = focals[0]
309
- else:
310
- focal = focals[i]
311
- add_scene_cam(scene, pose_c2w, camera_edge_color,
312
- None if transparent_cams else imgs[i], focal,
313
- imsize=imgs[i].shape[1::-1], screen_width=cam_size)
314
-
315
- rot = np.eye(4)
316
- rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
317
- scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
318
- outfile = os.path.join(outdir, 'scene.glb')
319
- if not silent:
320
- print('(exporting 3D scene to', outfile, ')')
321
- scene.export(file_obj=outfile)
322
- return outfile
323
-
324
- @spaces.GPU(duration=120)
325
- def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False,
326
- clean_depth=False, transparent_cams=False, cam_size=0.05, same_focals=False):
327
- """
328
- extract 3D_model (glb file) from a reconstructed scene
329
- """
330
- if scene is None:
331
- return None
332
- # post processes
333
- if clean_depth:
334
- scene = scene.clean_pointcloud()
335
- if mask_sky:
336
- scene = scene.mask_sky()
337
-
338
- # get optimized values from scene
339
- rgbimg = to_numpy(scene.imgs)
340
- focals = to_numpy(scene.get_focals().cpu())
341
- # cams2world = to_numpy(scene.get_im_poses().cpu())
342
- # TODO use the vis_poses
343
- cams2world = scene.vis_poses
344
-
345
- # 3D pointcloud from depthmap, poses and intrinsics
346
- # pts3d = to_numpy(scene.get_pts3d())
347
- # TODO use the vis_poses
348
- pts3d = scene.vis_pts3d
349
- scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
350
- msk = to_numpy(scene.get_masks())
351
-
352
- return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
353
- transparent_cams=transparent_cams, cam_size=cam_size, silent=silent,
354
- same_focals=same_focals)
355
-
356
- @spaces.GPU(duration=120)
357
- def get_reconstructed_scene(filelist, schedule, niter, min_conf_thr,
358
- as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
359
- scenegraph_type, winsize, refid, same_focals):
360
- """
361
- from a list of images, run dust3r inference, global aligner.
362
- then run get_3D_model_from_scene
363
- """
364
- silent = False
365
- image_size = 224
366
- # remove the directory if it already exists
367
- outdir = tmpdirname
368
- if os.path.exists(outdir):
369
- shutil.rmtree(outdir)
370
- os.makedirs(outdir, exist_ok=True)
371
- imgs, imgs_rgba = load_images(filelist, size=image_size, verbose=not silent, do_remove_background=True, rembg_session=rembg_session, predictor=predictor)
372
- if len(imgs) == 1:
373
- imgs = [imgs[0], copy.deepcopy(imgs[0])]
374
- imgs[1]['idx'] = 1
375
- if scenegraph_type == "swin":
376
- scenegraph_type = scenegraph_type + "-" + str(winsize)
377
- elif scenegraph_type == "oneref":
378
- scenegraph_type = scenegraph_type + "-" + str(refid)
379
-
380
- pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True)
381
- output = inference(pairs, model, device, batch_size=1, verbose=not silent)
382
-
383
- mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer
384
- scene = global_aligner(output, device=device, mode=mode, verbose=not silent, same_focals=same_focals)
385
- lr = 0.01
386
-
387
- if mode == GlobalAlignerMode.PointCloudOptimizer:
388
- loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr)
389
-
390
- # outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky,
391
- # clean_depth, transparent_cams, cam_size, same_focals=same_focals)
392
-
393
- # also return rgb, depth and confidence imgs
394
- # depth is normalized with the max value for all images
395
- # we apply the jet colormap on the confidence maps
396
- rgbimg = scene.imgs
397
- # depths = to_numpy(scene.get_depthmaps())
398
- # confs = to_numpy([c for c in scene.im_conf])
399
- # cmap = pl.get_cmap('jet')
400
- # depths_max = max([d.max() for d in depths])
401
- # depths = [d / depths_max for d in depths]
402
- # confs_max = max([d.max() for d in confs])
403
- # confs = [cmap(d / confs_max) for d in confs]
404
-
405
- imgs = []
406
- rgbaimg = []
407
- for i in range(len(rgbimg)): # when only 1 image, scene.imgs is two
408
- imgs.append(rgbimg[i])
409
- # imgs.append(rgb(depths[i]))
410
- # imgs.append(rgb(confs[i]))
411
- # imgs.append(imgs_rgba[i])
412
- if len(imgs_rgba) == 1 and i == 1:
413
- imgs.append(imgs_rgba[0])
414
- rgbaimg.append(np.array(imgs_rgba[0]))
415
- else:
416
- imgs.append(imgs_rgba[i])
417
- rgbaimg.append(np.array(imgs_rgba[i]))
418
-
419
- rgbaimg = np.array(rgbaimg)
420
-
421
- # for eschernet
422
- # get optimized values from scene
423
- rgbimg = to_numpy(scene.imgs)
424
- # focals = to_numpy(scene.get_focals().cpu())
425
- cams2world = to_numpy(scene.get_im_poses().cpu())
426
-
427
- # 3D pointcloud from depthmap, poses and intrinsics
428
- pts3d = to_numpy(scene.get_pts3d())
429
- scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
430
- msk = to_numpy(scene.get_masks())
431
- obj_mask = rgbaimg[..., 3] > 0
432
-
433
- # TODO set global coordinate system at the center of the scene, z-axis is up
434
- pts = np.concatenate([p[m] for p, m in zip(pts3d, msk)]).reshape(-1, 3)
435
- pts_obj = np.concatenate([p[m&obj_m] for p, m, obj_m in zip(pts3d, msk, obj_mask)]).reshape(-1, 3)
436
- centroid = np.mean(pts_obj, axis=0) # obj center
437
- obj2world = np.eye(4)
438
- obj2world[:3, 3] = -centroid # T_wc
439
-
440
- # get z_up vector
441
- # TODO fit a plane and get the normal vector
442
- pcd = o3d.geometry.PointCloud()
443
- pcd.points = o3d.utility.Vector3dVector(pts)
444
- plane_model, inliers = pcd.segment_plane(distance_threshold=0.01, ransac_n=3, num_iterations=1000)
445
- # get the normalised normal vector dim = 3
446
- normal = plane_model[:3] / np.linalg.norm(plane_model[:3])
447
- # the normal direction should be pointing up
448
- if normal[1] < 0:
449
- normal = -normal
450
- # print("normal", normal)
451
-
452
- # # TODO z-up 180
453
- # z_up = np.array([[1,0,0,0],
454
- # [0,-1,0,0],
455
- # [0,0,-1,0],
456
- # [0,0,0,1]])
457
- # obj2world = z_up @ obj2world
458
-
459
- # # avg the y
460
- # z_up_avg = cams2world[:,:3,3].sum(0) / np.linalg.norm(cams2world[:,:3,3].sum(0), axis=-1) # average direction in cam coordinate
461
- # # import pdb; pdb.set_trace()
462
- # rot_axis = np.cross(np.array([0, 0, 1]), z_up_avg)
463
- # rot_angle = np.arccos(np.dot(np.array([0, 0, 1]), z_up_avg) / (np.linalg.norm(z_up_avg) + 1e-6))
464
- # rot = Rotation.from_rotvec(rot_angle * rot_axis)
465
- # z_up = np.eye(4)
466
- # z_up[:3, :3] = rot.as_matrix()
467
-
468
- # get the rotation matrix from normal to z-axis
469
- z_axis = np.array([0, 0, 1])
470
- rot_axis = np.cross(normal, z_axis)
471
- rot_angle = np.arccos(np.dot(normal, z_axis) / (np.linalg.norm(normal) + 1e-6))
472
- rot = Rotation.from_rotvec(rot_angle * rot_axis)
473
- z_up = np.eye(4)
474
- z_up[:3, :3] = rot.as_matrix()
475
- obj2world = z_up @ obj2world
476
- # flip 180
477
- flip_rot = np.array([[1, 0, 0, 0],
478
- [0, -1, 0, 0],
479
- [0, 0, -1, 0],
480
- [0, 0, 0, 1]])
481
- obj2world = flip_rot @ obj2world
482
-
483
- # get new cams2obj
484
- cams2obj = []
485
- for i, cam2world in enumerate(cams2world):
486
- cams2obj.append(obj2world @ cam2world)
487
- # TODO transform pts3d to the new coordinate system
488
- for i, pts in enumerate(pts3d):
489
- pts3d[i] = (obj2world @ np.concatenate([pts, np.ones_like(pts)[..., :1]], axis=-1).transpose(2, 0, 1).reshape(4,
490
- -1)) \
491
- .reshape(4, pts.shape[0], pts.shape[1]).transpose(1, 2, 0)[..., :3]
492
- cams2world = np.array(cams2obj)
493
- # TODO rewrite hack
494
- scene.vis_poses = cams2world.copy()
495
- scene.vis_pts3d = pts3d.copy()
496
-
497
- # TODO save cams2world and rgbimg to each file, file name "000.npy", "001.npy", ... and "000.png", "001.png", ...
498
- for i, (img, img_rgba, pose) in enumerate(zip(rgbimg, rgbaimg, cams2world)):
499
- np.save(os.path.join(outdir, f"{i:03d}.npy"), pose)
500
- pl.imsave(os.path.join(outdir, f"{i:03d}.png"), img)
501
- pl.imsave(os.path.join(outdir, f"{i:03d}_rgba.png"), img_rgba)
502
- # np.save(os.path.join(outdir, f"{i:03d}_focal.npy"), to_numpy(focal))
503
- # save the min/max radius of camera
504
- radii = np.linalg.norm(np.linalg.inv(cams2world)[..., :3, 3])
505
- np.save(os.path.join(outdir, "radii.npy"), radii)
506
-
507
- eschernet_input = {"poses": cams2world,
508
- "radii": radii,
509
- "imgs": rgbaimg}
510
- print("got eschernet input")
511
- outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky,
512
- clean_depth, transparent_cams, cam_size, same_focals=same_focals)
513
-
514
- return scene, outfile, imgs, eschernet_input
515
-
516
-
517
- def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type):
518
- num_files = len(inputfiles) if inputfiles is not None else 1
519
- max_winsize = max(1, math.ceil((num_files - 1) / 2))
520
- if scenegraph_type == "swin":
521
- winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
522
- minimum=1, maximum=max_winsize, step=1, visible=True)
523
- refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
524
- maximum=num_files - 1, step=1, visible=False)
525
- elif scenegraph_type == "oneref":
526
- winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
527
- minimum=1, maximum=max_winsize, step=1, visible=False)
528
- refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
529
- maximum=num_files - 1, step=1, visible=True)
530
- else:
531
- winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
532
- minimum=1, maximum=max_winsize, step=1, visible=False)
533
- refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
534
- maximum=num_files - 1, step=1, visible=False)
535
- return winsize, refid
536
-
537
-
538
- def get_examples(path):
539
- objs = []
540
- for obj_name in sorted(os.listdir(path)):
541
- img_files = []
542
- for img_file in sorted(os.listdir(os.path.join(path, obj_name))):
543
- img_files.append(os.path.join(path, obj_name, img_file))
544
- objs.append([img_files])
545
- print("objs = ", objs)
546
- return objs
547
-
548
- def preview_input(inputfiles):
549
- if inputfiles is None:
550
- return None
551
- imgs = []
552
- for img_file in inputfiles:
553
- img = pl.imread(img_file)
554
- imgs.append(img)
555
- return imgs
556
-
557
- # def main():
558
- # dustr init
559
- silent = False
560
- image_size = 224
561
- weights_path = 'checkpoints/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth'
562
- model = AsymmetricCroCo3DStereo.from_pretrained(weights_path).to(device)
563
- # dust3r will write the 3D model inside tmpdirname
564
- # with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname:
565
- tmpdirname = os.path.join('logs/user_object')
566
- # remove the directory if it already exists
567
- if os.path.exists(tmpdirname):
568
- shutil.rmtree(tmpdirname)
569
- os.makedirs(tmpdirname, exist_ok=True)
570
- if not silent:
571
- print('Outputing stuff in', tmpdirname)
572
-
573
- _HEADER_ = '''
574
- <h2><b>[CVPR'24 Oral] EscherNet: A Generative Model for Scalable View Synthesis</b></h2>
575
- <b>EscherNet</b> is a multiview diffusion model for scalable generative any-to-any number/pose novel view synthesis.
576
-
577
- Image views are treated as tokens and the camera pose is encoded by <b>CaPE (Camera Positional Encoding)</b>.
578
-
579
- <a href='https://kxhit.github.io/EscherNet' target='_blank'>Project</a> <b>|</b>
580
- <a href='https://github.com/kxhit/EscherNet' target='_blank'>GitHub</a> <b>|</b>
581
- <a href='https://arxiv.org/abs/2402.03908' target='_blank'>ArXiv</a>
582
-
583
- <h4><b>Tips:</b></h4>
584
-
585
- - Our model can take <b>any number input images</b>. The more images you provide <b>(>=3 for this demo)</b>, the better the results.
586
-
587
- - Our model can generate <b>any number and any pose</b> novel views. You can specify the number of views you want to generate. In this demo, we set novel views on an <b>archemedian spiral</b> for simplicity.
588
-
589
- - The pose estimation is done using <a href='https://github.com/naver/dust3r' target='_blank'>DUSt3R</a>. You can also provide your own poses or get pose via any SLAM system.
590
-
591
- - The current checkpoint supports 6DoF camera pose and is trained on 30k 3D <a href='https://objaverse.allenai.org/' target='_blank'>Objaverse</a> objects for demo. Scaling is on the roadmap!
592
-
593
- '''
594
-
595
- _CITE_ = r"""
596
- 📝 <b>Citation</b>:
597
- ```bibtex
598
- @article{kong2024eschernet,
599
- title={EscherNet: A Generative Model for Scalable View Synthesis},
600
- author={Kong, Xin and Liu, Shikun and Lyu, Xiaoyang and Taher, Marwan and Qi, Xiaojuan and Davison, Andrew J},
601
- journal={arXiv preprint arXiv:2402.03908},
602
- year={2024}
603
- }
604
- ```
605
- """
606
-
607
- with gr.Blocks() as demo:
608
- gr.Markdown(_HEADER_)
609
- # mv_images = gr.State()
610
- scene = gr.State(None)
611
- eschernet_input = gr.State(None)
612
- with gr.Row(variant="panel"):
613
- # left column
614
- with gr.Column():
615
- with gr.Row():
616
- input_image = gr.File(file_count="multiple")
617
- with gr.Row():
618
- run_dust3r = gr.Button("Get Pose!", elem_id="dust3r")
619
- with gr.Row():
620
- processed_image = gr.Gallery(label='Input Views', columns=2, height="100%")
621
- with gr.Row(variant="panel"):
622
- # input examples under "examples" folder
623
- gr.Examples(
624
- examples=get_examples('examples'),
625
- inputs=[input_image],
626
- label="Examples (click one set of images to start!)",
627
- examples_per_page=20
628
- )
629
-
630
-
631
-
632
-
633
-
634
- # right column
635
- with gr.Column():
636
-
637
- with gr.Row():
638
- outmodel = gr.Model3D()
639
-
640
- with gr.Row():
641
- gr.Markdown('''
642
- <h4><b>Check if the pose (blue is axis is estimated z-up direction) and segmentation looks correct. If not, remove the incorrect images and try again.</b></h4>
643
- ''')
644
-
645
- with gr.Row():
646
- with gr.Group():
647
- do_remove_background = gr.Checkbox(
648
- label="Remove Background", value=True
649
- )
650
- sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
651
-
652
- sample_steps = gr.Slider(
653
- label="Sample Steps",
654
- minimum=30,
655
- maximum=75,
656
- value=50,
657
- step=5,
658
- visible=False
659
- )
660
-
661
- nvs_num = gr.Slider(
662
- label="Number of Novel Views",
663
- minimum=5,
664
- maximum=100,
665
- value=30,
666
- step=1
667
- )
668
-
669
- nvs_mode = gr.Dropdown(["archimedes circle"], # "fixed 4 views", "fixed 8 views"
670
- value="archimedes circle", label="Novel Views Pose Chosen", visible=True)
671
-
672
- with gr.Row():
673
- gr.Markdown('''
674
- <h4><b>Choose your desired novel view poses number and generate! The more output images the longer it takes.</b></h4>
675
- ''')
676
-
677
- with gr.Row():
678
- submit = gr.Button("Submit", elem_id="eschernet", variant="primary")
679
-
680
- with gr.Row():
681
- with gr.Column():
682
- output_video = gr.Video(
683
- label="video", format="mp4",
684
- width=379,
685
- autoplay=True,
686
- interactive=False
687
- )
688
-
689
- with gr.Row():
690
- gr.Markdown('''
691
- <h4><b>The novel views are generated on an archimedean spiral (rotating around z-up axis and looking at the object center). You can download the video.</b></h4>
692
- ''')
693
-
694
- gr.Markdown(_CITE_)
695
-
696
- # set dust3r parameter invisible to be clean
697
- with gr.Column():
698
- with gr.Row():
699
- schedule = gr.Dropdown(["linear", "cosine"],
700
- value='linear', label="schedule", info="For global alignment!", visible=False)
701
- niter = gr.Number(value=300, precision=0, minimum=0, maximum=5000,
702
- label="num_iterations", info="For global alignment!", visible=False)
703
- scenegraph_type = gr.Dropdown(["complete", "swin", "oneref"],
704
- value='complete', label="Scenegraph",
705
- info="Define how to make pairs",
706
- interactive=True, visible=False)
707
- same_focals = gr.Checkbox(value=True, label="Focal", info="Use the same focal for all cameras", visible=False)
708
- winsize = gr.Slider(label="Scene Graph: Window Size", value=1,
709
- minimum=1, maximum=1, step=1, visible=False)
710
- refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False)
711
-
712
- with gr.Row():
713
- # adjust the confidence threshold
714
- min_conf_thr = gr.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1, visible=False)
715
- # adjust the camera size in the output pointcloud
716
- cam_size = gr.Slider(label="cam_size", value=0.05, minimum=0.01, maximum=0.5, step=0.001, visible=False)
717
- with gr.Row():
718
- as_pointcloud = gr.Checkbox(value=False, label="As pointcloud", visible=False)
719
- # two post process implemented
720
- mask_sky = gr.Checkbox(value=False, label="Mask sky", visible=False)
721
- clean_depth = gr.Checkbox(value=True, label="Clean-up depthmaps", visible=False)
722
- transparent_cams = gr.Checkbox(value=False, label="Transparent cameras", visible=False)
723
-
724
- # events
725
- # scenegraph_type.change(set_scenegraph_options,
726
- # inputs=[input_image, winsize, refid, scenegraph_type],
727
- # outputs=[winsize, refid])
728
- # min_conf_thr.release(fn=model_from_scene_fun,
729
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
730
- # clean_depth, transparent_cams, cam_size, same_focals],
731
- # outputs=outmodel)
732
- # cam_size.change(fn=model_from_scene_fun,
733
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
734
- # clean_depth, transparent_cams, cam_size, same_focals],
735
- # outputs=outmodel)
736
- # as_pointcloud.change(fn=model_from_scene_fun,
737
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
738
- # clean_depth, transparent_cams, cam_size, same_focals],
739
- # outputs=outmodel)
740
- # mask_sky.change(fn=model_from_scene_fun,
741
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
742
- # clean_depth, transparent_cams, cam_size, same_focals],
743
- # outputs=outmodel)
744
- # clean_depth.change(fn=model_from_scene_fun,
745
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
746
- # clean_depth, transparent_cams, cam_size, same_focals],
747
- # outputs=outmodel)
748
- # transparent_cams.change(model_from_scene_fun,
749
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
750
- # clean_depth, transparent_cams, cam_size, same_focals],
751
- # outputs=outmodel)
752
- # run_dust3r.click(fn=recon_fun,
753
- # inputs=[input_image, schedule, niter, min_conf_thr, as_pointcloud,
754
- # mask_sky, clean_depth, transparent_cams, cam_size,
755
- # scenegraph_type, winsize, refid, same_focals],
756
- # outputs=[scene, outmodel, processed_image, eschernet_input])
757
-
758
- # events
759
- input_image.change(set_scenegraph_options,
760
- inputs=[input_image, winsize, refid, scenegraph_type],
761
- outputs=[winsize, refid])
762
- run_dust3r.click(fn=get_reconstructed_scene,
763
- inputs=[input_image, schedule, niter, min_conf_thr, as_pointcloud,
764
- mask_sky, clean_depth, transparent_cams, cam_size,
765
- scenegraph_type, winsize, refid, same_focals],
766
- outputs=[scene, outmodel, processed_image, eschernet_input])
767
-
768
-
769
- # events
770
- input_image.change(fn=preview_input,
771
- inputs=[input_image],
772
- outputs=[processed_image])
773
-
774
- submit.click(fn=run_eschernet,
775
- inputs=[eschernet_input, sample_steps, sample_seed,
776
- nvs_num, nvs_mode],
777
- outputs=[output_video])
778
-
779
-
780
-
781
- # demo.queue(max_size=10)
782
- # demo.launch(share=True, server_name="0.0.0.0", server_port=None)
783
- demo.queue(max_size=10).launch()
784
-
785
- # if __name__ == '__main__':
786
- # main()
app_mini.py DELETED
@@ -1,773 +0,0 @@
1
- import spaces
2
- import torch
3
- print("cuda is available: ", torch.cuda.is_available())
4
-
5
- import gradio as gr
6
- import os
7
- import shutil
8
- import rembg
9
- import numpy as np
10
- import math
11
- import open3d as o3d
12
- from PIL import Image
13
- import torchvision
14
- import trimesh
15
- from skimage.io import imsave
16
- import imageio
17
- import cv2
18
- import matplotlib.pyplot as pl
19
- pl.ion()
20
-
21
- CaPE_TYPE = "6DoF"
22
- device = 'cuda' #if torch.cuda.is_available() else 'cpu'
23
- weight_dtype = torch.float16
24
- torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
25
-
26
- # EscherNet
27
- # create angles in archimedean spiral with N steps
28
- def get_archimedean_spiral(sphere_radius, num_steps=250):
29
- # x-z plane, around upper y
30
- '''
31
- https://en.wikipedia.org/wiki/Spiral, section "Spherical spiral". c = a / pi
32
- '''
33
- a = 40
34
- r = sphere_radius
35
-
36
- translations = []
37
- angles = []
38
-
39
- # i = a / 2
40
- i = 0.01
41
- while i < a:
42
- theta = i / a * math.pi
43
- x = r * math.sin(theta) * math.cos(-i)
44
- z = r * math.sin(-theta + math.pi) * math.sin(-i)
45
- y = r * - math.cos(theta)
46
-
47
- # translations.append((x, y, z)) # origin
48
- translations.append((x, z, -y))
49
- angles.append([np.rad2deg(-i), np.rad2deg(theta)])
50
-
51
- # i += a / (2 * num_steps)
52
- i += a / (1 * num_steps)
53
-
54
- return np.array(translations), np.stack(angles)
55
-
56
- def look_at(origin, target, up):
57
- forward = (target - origin)
58
- forward = forward / np.linalg.norm(forward)
59
- right = np.cross(up, forward)
60
- right = right / np.linalg.norm(right)
61
- new_up = np.cross(forward, right)
62
- rotation_matrix = np.column_stack((right, new_up, -forward, target))
63
- matrix = np.row_stack((rotation_matrix, [0, 0, 0, 1]))
64
- return matrix
65
-
66
- import einops
67
- import sys
68
-
69
- sys.path.insert(0, "./6DoF/") # TODO change it when deploying
70
- # use the customized diffusers modules
71
- from diffusers import DDIMScheduler
72
- from dataset import get_pose
73
- from CN_encoder import CN_encoder
74
- from pipeline_zero1to3 import Zero1to3StableDiffusionPipeline
75
- from segment_anything import sam_model_registry, SamPredictor
76
-
77
- # import rembg
78
- from carvekit.api.high import HiInterface
79
-
80
-
81
- pretrained_model_name_or_path = "kxic/EscherNet_demo"
82
- resolution = 256
83
- h,w = resolution,resolution
84
- guidance_scale = 3.0
85
- radius = 2.2
86
- bg_color = [1., 1., 1., 1.]
87
- image_transforms = torchvision.transforms.Compose(
88
- [
89
- torchvision.transforms.Resize((resolution, resolution)), # 256, 256
90
- torchvision.transforms.ToTensor(),
91
- torchvision.transforms.Normalize([0.5], [0.5])
92
- ]
93
- )
94
- xyzs_spiral, angles_spiral = get_archimedean_spiral(1.5, 200)
95
- # only half toop
96
- xyzs_spiral = xyzs_spiral[:100]
97
- angles_spiral = angles_spiral[:100]
98
-
99
- # Init pipeline
100
- scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder="scheduler", revision=None)
101
- image_encoder = CN_encoder.from_pretrained(pretrained_model_name_or_path, subfolder="image_encoder", revision=None)
102
- pipeline = Zero1to3StableDiffusionPipeline.from_pretrained(
103
- pretrained_model_name_or_path,
104
- revision=None,
105
- scheduler=scheduler,
106
- image_encoder=None,
107
- safety_checker=None,
108
- feature_extractor=None,
109
- torch_dtype=weight_dtype,
110
- )
111
- pipeline.image_encoder = image_encoder.to(weight_dtype)
112
-
113
- pipeline.set_progress_bar_config(disable=False)
114
-
115
- pipeline = pipeline.to(device)
116
-
117
- # pipeline.enable_xformers_memory_efficient_attention()
118
- # enable vae slicing
119
- pipeline.enable_vae_slicing()
120
- # pipeline.enable_xformers_memory_efficient_attention()
121
-
122
-
123
- #### object segmentation
124
- def sam_init():
125
- sam_checkpoint = os.path.join("./sam_pt/sam_vit_h_4b8939.pth")
126
- if os.path.exists(sam_checkpoint) is False:
127
- os.system("wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth -P ./sam_pt/")
128
- model_type = "vit_h"
129
-
130
- sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device=device)
131
- predictor = SamPredictor(sam)
132
- return predictor
133
-
134
- def create_carvekit_interface():
135
- # Check doc strings for more information
136
- interface = HiInterface(object_type="object", # Can be "object" or "hairs-like".
137
- batch_size_seg=6,
138
- batch_size_matting=1,
139
- device="cpu",
140
- seg_mask_size=640, # Use 640 for Tracer B7 and 320 for U2Net
141
- matting_mask_size=2048,
142
- trimap_prob_threshold=231,
143
- trimap_dilation=30,
144
- trimap_erosion_iters=5,
145
- fp16=True)
146
-
147
- return interface
148
-
149
-
150
- # rembg_session = rembg.new_session()
151
- rembg_session = create_carvekit_interface()
152
- predictor = sam_init()
153
-
154
-
155
-
156
- @spaces.GPU(duration=120)
157
- def run_eschernet(eschernet_input_dict, sample_steps, sample_seed, nvs_num, nvs_mode):
158
- # set the random seed
159
- generator = torch.Generator(device=device).manual_seed(sample_seed)
160
- # generator = None
161
- T_out = nvs_num
162
- T_in = len(eschernet_input_dict['imgs'])
163
- ####### output pose
164
- # TODO choose T_out number of poses sequentially from the spiral
165
- xyzs = xyzs_spiral[::(len(xyzs_spiral) // T_out)]
166
- angles_out = angles_spiral[::(len(xyzs_spiral) // T_out)]
167
-
168
- ####### input's max radius for translation scaling
169
- radii = eschernet_input_dict['radii']
170
- max_t = np.max(radii)
171
- min_t = np.min(radii)
172
-
173
- ####### input pose
174
- pose_in = []
175
- for T_in_index in range(T_in):
176
- pose = get_pose(np.linalg.inv(eschernet_input_dict['poses'][T_in_index]))
177
- pose[1:3, :] *= -1 # coordinate system conversion
178
- pose[3, 3] *= 1. / max_t * radius # scale radius to [1.5, 2.2]
179
- pose_in.append(torch.from_numpy(pose))
180
-
181
- ####### input image
182
- img = eschernet_input_dict['imgs'] / 255.
183
- img[img[:, :, :, -1] == 0.] = bg_color
184
- # TODO batch image_transforms
185
- input_image = [image_transforms(Image.fromarray(np.uint8(im[:, :, :3] * 255.)).convert("RGB")) for im in img]
186
-
187
- ####### nvs pose
188
- pose_out = []
189
- for T_out_index in range(T_out):
190
- azimuth, polar = angles_out[T_out_index]
191
- if CaPE_TYPE == "4DoF":
192
- pose_out.append(torch.tensor([np.deg2rad(polar), np.deg2rad(azimuth), 0., 0.]))
193
- elif CaPE_TYPE == "6DoF":
194
- pose = look_at(origin=np.array([0, 0, 0]), target=xyzs[T_out_index], up=np.array([0, 0, 1]))
195
- pose = np.linalg.inv(pose)
196
- pose[2, :] *= -1
197
- pose_out.append(torch.from_numpy(get_pose(pose)))
198
-
199
-
200
-
201
- # [B, T, C, H, W]
202
- input_image = torch.stack(input_image, dim=0).to(device).to(weight_dtype).unsqueeze(0)
203
- # [B, T, 4]
204
- pose_in = np.stack(pose_in)
205
- pose_out = np.stack(pose_out)
206
-
207
- if CaPE_TYPE == "6DoF":
208
- pose_in_inv = np.linalg.inv(pose_in).transpose([0, 2, 1])
209
- pose_out_inv = np.linalg.inv(pose_out).transpose([0, 2, 1])
210
- pose_in_inv = torch.from_numpy(pose_in_inv).to(device).to(weight_dtype).unsqueeze(0)
211
- pose_out_inv = torch.from_numpy(pose_out_inv).to(device).to(weight_dtype).unsqueeze(0)
212
-
213
- pose_in = torch.from_numpy(pose_in).to(device).to(weight_dtype).unsqueeze(0)
214
- pose_out = torch.from_numpy(pose_out).to(device).to(weight_dtype).unsqueeze(0)
215
-
216
- input_image = einops.rearrange(input_image, "b t c h w -> (b t) c h w")
217
- assert T_in == input_image.shape[0]
218
- assert T_in == pose_in.shape[1]
219
- assert T_out == pose_out.shape[1]
220
-
221
- # run inference
222
- # pipeline.to(device)
223
- pipeline.enable_xformers_memory_efficient_attention()
224
- image = pipeline(input_imgs=input_image, prompt_imgs=input_image,
225
- poses=[[pose_out, pose_out_inv], [pose_in, pose_in_inv]],
226
- height=h, width=w, T_in=T_in, T_out=T_out,
227
- guidance_scale=guidance_scale, num_inference_steps=50, generator=generator,
228
- output_type="numpy").images
229
-
230
- # save output image
231
- output_dir = os.path.join(tmpdirname, "eschernet")
232
- if os.path.exists(output_dir):
233
- shutil.rmtree(output_dir)
234
- os.makedirs(output_dir, exist_ok=True)
235
- # # save to N imgs
236
- # for i in range(T_out):
237
- # imsave(os.path.join(output_dir, f'{i}.png'), (image[i] * 255).astype(np.uint8))
238
- # make a gif
239
- frames = [Image.fromarray((image[i] * 255).astype(np.uint8)) for i in range(T_out)]
240
- # frame_one = frames[0]
241
- # frame_one.save(os.path.join(output_dir, "output.gif"), format="GIF", append_images=frames,
242
- # save_all=True, duration=50, loop=1)
243
-
244
- # get a video
245
- video_path = os.path.join(output_dir, "output.mp4")
246
- imageio.mimwrite(video_path, np.stack(frames), fps=10, codec='h264')
247
-
248
-
249
- return video_path
250
-
251
- # TODO mesh it
252
- @spaces.GPU(duration=120)
253
- def make3d():
254
- pass
255
-
256
-
257
-
258
- ############################ Dust3r as Pose Estimation ############################
259
- from scipy.spatial.transform import Rotation
260
- import copy
261
-
262
- from dust3r.inference import inference
263
- from dust3r.model import AsymmetricCroCo3DStereo
264
- from dust3r.image_pairs import make_pairs
265
- from dust3r.utils.image import load_images, rgb
266
- from dust3r.utils.device import to_numpy
267
- from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes
268
- from dust3r.cloud_opt import global_aligner, GlobalAlignerMode
269
- import math
270
-
271
- from mini_dust3r.api import OptimizedResult, inferece_dust3r, log_optimized_result
272
- from mini_dust3r.model import AsymmetricCroCo3DStereo
273
-
274
- # @spaces.GPU(duration=120)
275
- def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05,
276
- cam_color=None, as_pointcloud=False,
277
- transparent_cams=False, silent=False, same_focals=False):
278
- assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world)
279
- if not same_focals:
280
- assert (len(cams2world) == len(focals))
281
- pts3d = to_numpy(pts3d)
282
- imgs = to_numpy(imgs)
283
- focals = to_numpy(focals)
284
- cams2world = to_numpy(cams2world)
285
-
286
- scene = trimesh.Scene()
287
-
288
- # add axes
289
- scene.add_geometry(trimesh.creation.axis(axis_length=0.5, axis_radius=0.001))
290
-
291
- # full pointcloud
292
- if as_pointcloud:
293
- pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
294
- col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
295
- pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
296
- scene.add_geometry(pct)
297
- else:
298
- meshes = []
299
- for i in range(len(imgs)):
300
- meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i]))
301
- mesh = trimesh.Trimesh(**cat_meshes(meshes))
302
- scene.add_geometry(mesh)
303
-
304
- # add each camera
305
- for i, pose_c2w in enumerate(cams2world):
306
- if isinstance(cam_color, list):
307
- camera_edge_color = cam_color[i]
308
- else:
309
- camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
310
- if same_focals:
311
- focal = focals[0]
312
- else:
313
- focal = focals[i]
314
- add_scene_cam(scene, pose_c2w, camera_edge_color,
315
- None if transparent_cams else imgs[i], focal,
316
- imsize=imgs[i].shape[1::-1], screen_width=cam_size)
317
-
318
- rot = np.eye(4)
319
- rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
320
- scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot))
321
- outfile = os.path.join(outdir, 'scene.glb')
322
- if not silent:
323
- print('(exporting 3D scene to', outfile, ')')
324
- scene.export(file_obj=outfile)
325
- return outfile
326
-
327
- # @spaces.GPU(duration=120)
328
- def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False,
329
- clean_depth=False, transparent_cams=False, cam_size=0.05, same_focals=False):
330
- """
331
- extract 3D_model (glb file) from a reconstructed scene
332
- """
333
- if scene is None:
334
- return None
335
- # post processes
336
- if clean_depth:
337
- scene = scene.clean_pointcloud()
338
- if mask_sky:
339
- scene = scene.mask_sky()
340
-
341
- # get optimized values from scene
342
- rgbimg = to_numpy(scene.imgs)
343
- focals = to_numpy(scene.get_focals().cpu())
344
- # cams2world = to_numpy(scene.get_im_poses().cpu())
345
- # TODO use the vis_poses
346
- cams2world = scene.vis_poses
347
-
348
- # 3D pointcloud from depthmap, poses and intrinsics
349
- # pts3d = to_numpy(scene.get_pts3d())
350
- # TODO use the vis_poses
351
- pts3d = scene.vis_pts3d
352
- scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
353
- msk = to_numpy(scene.get_masks())
354
-
355
- return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud,
356
- transparent_cams=transparent_cams, cam_size=cam_size, silent=silent,
357
- same_focals=same_focals)
358
-
359
- @spaces.GPU(duration=120)
360
- def get_reconstructed_scene(filelist, schedule, niter, min_conf_thr,
361
- as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size,
362
- scenegraph_type, winsize, refid, same_focals):
363
- """
364
- from a list of images, run dust3r inference, global aligner.
365
- then run get_3D_model_from_scene
366
- """
367
- silent = False
368
- image_size = 224
369
- # remove the directory if it already exists
370
- outdir = tmpdirname
371
- if os.path.exists(outdir):
372
- shutil.rmtree(outdir)
373
- os.makedirs(outdir, exist_ok=True)
374
- # imgs, imgs_rgba = load_images(filelist, size=image_size, verbose=not silent, do_remove_background=True, rembg_session=rembg_session, predictor=predictor)
375
-
376
- optimized_results: OptimizedResult = inferece_dust3r(
377
- image_dir_or_list=filelist,
378
- model=model,
379
- device=device,
380
- batch_size=1,
381
- )
382
- rgbimg = optimized_results.rgb_hw3_list
383
- imgs_rgba = rgbimg
384
- cams2world = optimized_results.world_T_cam_b44
385
- pts3d = optimized_results.point_cloud
386
- pts_obj = pts3d
387
- outfile = os.path.join(outdir, 'scene.glb')
388
- # save point cloud trimesh.PointCloud to .ply
389
- pts3d.export(os.path.join(outdir, 'scene.glb'))
390
-
391
-
392
-
393
- # rgbimg = to_numpy(scene.imgs)
394
-
395
- imgs = []
396
- rgbaimg = []
397
- for i in range(len(rgbimg)): # when only 1 image, scene.imgs is two
398
- imgs.append(rgbimg[i])
399
- # imgs.append(rgb(depths[i]))
400
- # imgs.append(rgb(confs[i]))
401
- # imgs.append(imgs_rgba[i])
402
- if len(imgs_rgba) == 1 and i == 1:
403
- imgs.append(imgs_rgba[0])
404
- rgbaimg.append(np.array(imgs_rgba[0]))
405
- else:
406
- imgs.append(imgs_rgba[i])
407
- rgbaimg.append(np.array(imgs_rgba[i]))
408
-
409
- rgbaimg = np.array(rgbaimg)
410
-
411
- # for eschernet
412
- # cams2world = to_numpy(scene.get_im_poses().cpu())
413
- # pts3d = to_numpy(scene.get_pts3d())
414
- # scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr)))
415
- # msk = to_numpy(scene.get_masks())
416
- # obj_mask = rgbaimg[..., 3] > 0
417
-
418
- # # TODO set global coordinate system at the center of the scene, z-axis is up
419
- # # pts = np.concatenate([p[m] for p, m in zip(pts3d, msk)]).reshape(-1, 3)
420
- # # pts_obj = np.concatenate([p[m&obj_m] for p, m, obj_m in zip(pts3d, msk, obj_mask)]).reshape(-1, 3)
421
- # centroid = np.mean(pts_obj, axis=0) # obj center
422
- # obj2world = np.eye(4)
423
- # obj2world[:3, 3] = -centroid # T_wc
424
- #
425
- # # get z_up vector
426
- # # TODO fit a plane and get the normal vector
427
- # pcd = o3d.geometry.PointCloud()
428
- # pcd.points = o3d.utility.Vector3dVector(pts)
429
- # plane_model, inliers = pcd.segment_plane(distance_threshold=0.01, ransac_n=3, num_iterations=1000)
430
- # # get the normalised normal vector dim = 3
431
- # normal = plane_model[:3] / np.linalg.norm(plane_model[:3])
432
- # # the normal direction should be pointing up
433
- # if normal[1] < 0:
434
- # normal = -normal
435
- # # print("normal", normal)
436
- #
437
- # # # TODO z-up 180
438
- # # z_up = np.array([[1,0,0,0],
439
- # # [0,-1,0,0],
440
- # # [0,0,-1,0],
441
- # # [0,0,0,1]])
442
- # # obj2world = z_up @ obj2world
443
- #
444
- # # # avg the y
445
- # # z_up_avg = cams2world[:,:3,3].sum(0) / np.linalg.norm(cams2world[:,:3,3].sum(0), axis=-1) # average direction in cam coordinate
446
- # # # import pdb; pdb.set_trace()
447
- # # rot_axis = np.cross(np.array([0, 0, 1]), z_up_avg)
448
- # # rot_angle = np.arccos(np.dot(np.array([0, 0, 1]), z_up_avg) / (np.linalg.norm(z_up_avg) + 1e-6))
449
- # # rot = Rotation.from_rotvec(rot_angle * rot_axis)
450
- # # z_up = np.eye(4)
451
- # # z_up[:3, :3] = rot.as_matrix()
452
- #
453
- # # get the rotation matrix from normal to z-axis
454
- # z_axis = np.array([0, 0, 1])
455
- # rot_axis = np.cross(normal, z_axis)
456
- # rot_angle = np.arccos(np.dot(normal, z_axis) / (np.linalg.norm(normal) + 1e-6))
457
- # rot = Rotation.from_rotvec(rot_angle * rot_axis)
458
- # z_up = np.eye(4)
459
- # z_up[:3, :3] = rot.as_matrix()
460
- # obj2world = z_up @ obj2world
461
- # # flip 180
462
- # flip_rot = np.array([[1, 0, 0, 0],
463
- # [0, -1, 0, 0],
464
- # [0, 0, -1, 0],
465
- # [0, 0, 0, 1]])
466
- # obj2world = flip_rot @ obj2world
467
- #
468
- # # get new cams2obj
469
- # cams2obj = []
470
- # for i, cam2world in enumerate(cams2world):
471
- # cams2obj.append(obj2world @ cam2world)
472
- # # TODO transform pts3d to the new coordinate system
473
- # for i, pts in enumerate(pts3d):
474
- # pts3d[i] = (obj2world @ np.concatenate([pts, np.ones_like(pts)[..., :1]], axis=-1).transpose(2, 0, 1).reshape(4,
475
- # -1)) \
476
- # .reshape(4, pts.shape[0], pts.shape[1]).transpose(1, 2, 0)[..., :3]
477
- # cams2world = np.array(cams2obj)
478
- # # TODO rewrite hack
479
- # scene.vis_poses = cams2world.copy()
480
- # scene.vis_pts3d = pts3d.copy()
481
-
482
- # # TODO save cams2world and rgbimg to each file, file name "000.npy", "001.npy", ... and "000.png", "001.png", ...
483
- # for i, (img, img_rgba, pose) in enumerate(zip(rgbimg, rgbaimg, cams2world)):
484
- # np.save(os.path.join(outdir, f"{i:03d}.npy"), pose)
485
- # pl.imsave(os.path.join(outdir, f"{i:03d}.png"), img)
486
- # pl.imsave(os.path.join(outdir, f"{i:03d}_rgba.png"), img_rgba)
487
- # # np.save(os.path.join(outdir, f"{i:03d}_focal.npy"), to_numpy(focal))
488
- # save the min/max radius of camera
489
- radii = np.linalg.norm(np.linalg.inv(cams2world)[..., :3, 3])
490
- # np.save(os.path.join(outdir, "radii.npy"), radii)
491
-
492
- eschernet_input = {"poses": cams2world,
493
- "radii": radii,
494
- "imgs": rgbaimg}
495
- print("got eschernet input")
496
- # outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky,
497
- # clean_depth, transparent_cams, cam_size, same_focals=same_focals)
498
-
499
- return scene, outfile, imgs, eschernet_input
500
-
501
-
502
-
503
-
504
- def set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type):
505
- num_files = len(inputfiles) if inputfiles is not None else 1
506
- max_winsize = max(1, math.ceil((num_files - 1) / 2))
507
- if scenegraph_type == "swin":
508
- winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
509
- minimum=1, maximum=max_winsize, step=1, visible=True)
510
- refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
511
- maximum=num_files - 1, step=1, visible=False)
512
- elif scenegraph_type == "oneref":
513
- winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
514
- minimum=1, maximum=max_winsize, step=1, visible=False)
515
- refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
516
- maximum=num_files - 1, step=1, visible=True)
517
- else:
518
- winsize = gr.Slider(label="Scene Graph: Window Size", value=max_winsize,
519
- minimum=1, maximum=max_winsize, step=1, visible=False)
520
- refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0,
521
- maximum=num_files - 1, step=1, visible=False)
522
- return winsize, refid
523
-
524
-
525
- def get_examples(path):
526
- objs = []
527
- for obj_name in sorted(os.listdir(path)):
528
- img_files = []
529
- for img_file in sorted(os.listdir(os.path.join(path, obj_name))):
530
- img_files.append(os.path.join(path, obj_name, img_file))
531
- objs.append([img_files])
532
- print("objs = ", objs)
533
- return objs
534
-
535
- def preview_input(inputfiles):
536
- if inputfiles is None:
537
- return None
538
- imgs = []
539
- for img_file in inputfiles:
540
- img = pl.imread(img_file)
541
- imgs.append(img)
542
- return imgs
543
-
544
- # def main():
545
- # dustr init
546
- silent = False
547
- image_size = 224
548
- weights_path = 'checkpoints/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth'
549
- model = AsymmetricCroCo3DStereo.from_pretrained(weights_path).to(device)
550
- # dust3r will write the 3D model inside tmpdirname
551
- # with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname:
552
- tmpdirname = os.path.join('logs/user_object')
553
- # remove the directory if it already exists
554
- if os.path.exists(tmpdirname):
555
- shutil.rmtree(tmpdirname)
556
- os.makedirs(tmpdirname, exist_ok=True)
557
- if not silent:
558
- print('Outputing stuff in', tmpdirname)
559
-
560
- _HEADER_ = '''
561
- <h2><b>[CVPR'24 Oral] EscherNet: A Generative Model for Scalable View Synthesis</b></h2>
562
- <b>EscherNet</b> is a multiview diffusion model for scalable generative any-to-any number/pose novel view synthesis.
563
-
564
- Image views are treated as tokens and the camera pose is encoded by <b>CaPE (Camera Positional Encoding)</b>.
565
-
566
- <a href='https://kxhit.github.io/EscherNet' target='_blank'>Project</a> <b>|</b>
567
- <a href='https://github.com/kxhit/EscherNet' target='_blank'>GitHub</a> <b>|</b>
568
- <a href='https://arxiv.org/abs/2402.03908' target='_blank'>ArXiv</a>
569
-
570
- <h4><b>Tips:</b></h4>
571
-
572
- - Our model can take <b>any number input images</b>. The more images you provide <b>(>=3 for this demo)</b>, the better the results.
573
-
574
- - Our model can generate <b>any number and any pose</b> novel views. You can specify the number of views you want to generate. In this demo, we set novel views on an <b>archemedian spiral</b> for simplicity.
575
-
576
- - The pose estimation is done using <a href='https://github.com/naver/dust3r' target='_blank'>DUSt3R</a>. You can also provide your own poses or get pose via any SLAM system.
577
-
578
- - The current checkpoint supports 6DoF camera pose and is trained on 30k 3D <a href='https://objaverse.allenai.org/' target='_blank'>Objaverse</a> objects for demo. Scaling is on the roadmap!
579
-
580
- '''
581
-
582
- _CITE_ = r"""
583
- 📝 <b>Citation</b>:
584
- ```bibtex
585
- @article{kong2024eschernet,
586
- title={EscherNet: A Generative Model for Scalable View Synthesis},
587
- author={Kong, Xin and Liu, Shikun and Lyu, Xiaoyang and Taher, Marwan and Qi, Xiaojuan and Davison, Andrew J},
588
- journal={arXiv preprint arXiv:2402.03908},
589
- year={2024}
590
- }
591
- ```
592
- """
593
-
594
- with gr.Blocks() as demo:
595
- gr.Markdown(_HEADER_)
596
- # mv_images = gr.State()
597
- scene = gr.State(None)
598
- eschernet_input = gr.State(None)
599
- with gr.Row(variant="panel"):
600
- # left column
601
- with gr.Column():
602
- with gr.Row():
603
- input_image = gr.File(file_count="multiple")
604
- with gr.Row():
605
- run_dust3r = gr.Button("Get Pose!", elem_id="dust3r")
606
- with gr.Row():
607
- processed_image = gr.Gallery(label='Input Views', columns=2, height="100%")
608
- with gr.Row(variant="panel"):
609
- # input examples under "examples" folder
610
- gr.Examples(
611
- examples=get_examples('examples'),
612
- inputs=[input_image],
613
- label="Examples (click one set of images to start!)",
614
- examples_per_page=20
615
- )
616
-
617
-
618
-
619
-
620
-
621
- # right column
622
- with gr.Column():
623
-
624
- with gr.Row():
625
- outmodel = gr.Model3D()
626
-
627
- with gr.Row():
628
- gr.Markdown('''
629
- <h4><b>Check whether the pose (the blue axis is the estimated z-up direction) and segmentation look correct. If not, remove the incorrect images and try again.</b></h4>
630
- ''')
631
-
632
- with gr.Row():
633
- with gr.Group():
634
- do_remove_background = gr.Checkbox(
635
- label="Remove Background", value=True
636
- )
637
- sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
638
-
639
- sample_steps = gr.Slider(
640
- label="Sample Steps",
641
- minimum=30,
642
- maximum=75,
643
- value=50,
644
- step=5,
645
- visible=False
646
- )
647
-
648
- nvs_num = gr.Slider(
649
- label="Number of Novel Views",
650
- minimum=5,
651
- maximum=100,
652
- value=30,
653
- step=1
654
- )
655
-
656
- nvs_mode = gr.Dropdown(["archimedes circle"], # "fixed 4 views", "fixed 8 views"
657
- value="archimedes circle", label="Novel Views Pose Chosen", visible=True)
658
-
659
- with gr.Row():
660
- gr.Markdown('''
661
- <h4><b>Choose the desired number of novel view poses and generate! The more output images, the longer it takes.</b></h4>
662
- ''')
663
-
664
- with gr.Row():
665
- submit = gr.Button("Submit", elem_id="eschernet", variant="primary")
666
-
667
- with gr.Row():
668
- with gr.Column():
669
- output_video = gr.Video(
670
- label="video", format="mp4",
671
- width=379,
672
- autoplay=True,
673
- interactive=False
674
- )
675
-
676
- with gr.Row():
677
- gr.Markdown('''
678
- <h4><b>The novel views are generated on an archimedean spiral (rotating around the z-up axis and looking at the object center). You can download the video.</b></h4>
679
- ''')
680
-
681
- gr.Markdown(_CITE_)
682
-
683
- # set dust3r parameters to invisible to keep the UI clean
684
- with gr.Column():
685
- with gr.Row():
686
- schedule = gr.Dropdown(["linear", "cosine"],
687
- value='linear', label="schedule", info="For global alignment!", visible=False)
688
- niter = gr.Number(value=300, precision=0, minimum=0, maximum=5000,
689
- label="num_iterations", info="For global alignment!", visible=False)
690
- scenegraph_type = gr.Dropdown(["complete", "swin", "oneref"],
691
- value='complete', label="Scenegraph",
692
- info="Define how to make pairs",
693
- interactive=True, visible=False)
694
- same_focals = gr.Checkbox(value=True, label="Focal", info="Use the same focal for all cameras", visible=False)
695
- winsize = gr.Slider(label="Scene Graph: Window Size", value=1,
696
- minimum=1, maximum=1, step=1, visible=False)
697
- refid = gr.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False)
698
-
699
- with gr.Row():
700
- # adjust the confidence threshold
701
- min_conf_thr = gr.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1, visible=False)
702
- # adjust the camera size in the output pointcloud
703
- cam_size = gr.Slider(label="cam_size", value=0.05, minimum=0.01, maximum=0.5, step=0.001, visible=False)
704
- with gr.Row():
705
- as_pointcloud = gr.Checkbox(value=False, label="As pointcloud", visible=False)
706
- # two post process implemented
707
- mask_sky = gr.Checkbox(value=False, label="Mask sky", visible=False)
708
- clean_depth = gr.Checkbox(value=True, label="Clean-up depthmaps", visible=False)
709
- transparent_cams = gr.Checkbox(value=False, label="Transparent cameras", visible=False)
710
-
711
- # events
712
- # scenegraph_type.change(set_scenegraph_options,
713
- # inputs=[input_image, winsize, refid, scenegraph_type],
714
- # outputs=[winsize, refid])
715
- # min_conf_thr.release(fn=model_from_scene_fun,
716
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
717
- # clean_depth, transparent_cams, cam_size, same_focals],
718
- # outputs=outmodel)
719
- # cam_size.change(fn=model_from_scene_fun,
720
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
721
- # clean_depth, transparent_cams, cam_size, same_focals],
722
- # outputs=outmodel)
723
- # as_pointcloud.change(fn=model_from_scene_fun,
724
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
725
- # clean_depth, transparent_cams, cam_size, same_focals],
726
- # outputs=outmodel)
727
- # mask_sky.change(fn=model_from_scene_fun,
728
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
729
- # clean_depth, transparent_cams, cam_size, same_focals],
730
- # outputs=outmodel)
731
- # clean_depth.change(fn=model_from_scene_fun,
732
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
733
- # clean_depth, transparent_cams, cam_size, same_focals],
734
- # outputs=outmodel)
735
- # transparent_cams.change(model_from_scene_fun,
736
- # inputs=[scene, min_conf_thr, as_pointcloud, mask_sky,
737
- # clean_depth, transparent_cams, cam_size, same_focals],
738
- # outputs=outmodel)
739
- # run_dust3r.click(fn=recon_fun,
740
- # inputs=[input_image, schedule, niter, min_conf_thr, as_pointcloud,
741
- # mask_sky, clean_depth, transparent_cams, cam_size,
742
- # scenegraph_type, winsize, refid, same_focals],
743
- # outputs=[scene, outmodel, processed_image, eschernet_input])
744
-
745
- # events
746
- input_image.change(set_scenegraph_options,
747
- inputs=[input_image, winsize, refid, scenegraph_type],
748
- outputs=[winsize, refid])
749
- run_dust3r.click(fn=get_reconstructed_scene,
750
- inputs=[input_image, schedule, niter, min_conf_thr, as_pointcloud,
751
- mask_sky, clean_depth, transparent_cams, cam_size,
752
- scenegraph_type, winsize, refid, same_focals],
753
- outputs=[scene, outmodel, processed_image, eschernet_input])
754
-
755
-
756
- # events
757
- input_image.change(fn=preview_input,
758
- inputs=[input_image],
759
- outputs=[processed_image])
760
-
761
- submit.click(fn=run_eschernet,
762
- inputs=[eschernet_input, sample_steps, sample_seed,
763
- nvs_num, nvs_mode],
764
- outputs=[output_video])
765
-
766
-
767
-
768
- # demo.queue(max_size=10)
769
- # demo.launch(share=True, server_name="0.0.0.0", server_port=None)
770
- demo.queue(max_size=10).launch()
771
-
772
- # if __name__ == '__main__':
773
- # main()
mini_dust3r/__init__.py DELETED
File without changes
mini_dust3r/api/__init__.py DELETED
@@ -1,3 +0,0 @@
1
- from .inference import inferece_dust3r, OptimizedResult, log_optimized_result
2
-
3
- __all__ = ["inferece_dust3r", "OptimizedResult", "log_optimized_result"]
mini_dust3r/api/inference.py DELETED
@@ -1,225 +0,0 @@
1
- import rerun as rr
2
- from pathlib import Path
3
- from typing import Literal
4
- import copy
5
- import torch
6
- import numpy as np
7
- from jaxtyping import Float32, Bool
8
- import trimesh
9
- from tqdm import tqdm
10
-
11
- from mini_dust3r.utils.image import load_images, ImageDict
12
- from mini_dust3r.inference import inference, Dust3rResult
13
- from mini_dust3r.model import AsymmetricCroCo3DStereo
14
- from mini_dust3r.image_pairs import make_pairs
15
- from mini_dust3r.cloud_opt import global_aligner, GlobalAlignerMode
16
- from mini_dust3r.cloud_opt.base_opt import BasePCOptimizer
17
- from mini_dust3r.viz import pts3d_to_trimesh, cat_meshes
18
- from dataclasses import dataclass
19
-
20
-
21
- @dataclass
22
- class OptimizedResult:
23
- K_b33: Float32[np.ndarray, "b 3 3"]
24
- world_T_cam_b44: Float32[np.ndarray, "b 4 4"]
25
- rgb_hw3_list: list[Float32[np.ndarray, "h w 3"]]
26
- depth_hw_list: list[Float32[np.ndarray, "h w"]]
27
- conf_hw_list: list[Float32[np.ndarray, "h w"]]
28
- masks_list: Bool[np.ndarray, "h w"]
29
- point_cloud: trimesh.PointCloud
30
- mesh: trimesh.Trimesh
31
-
32
-
33
- def log_optimized_result(
34
- optimized_result: OptimizedResult, parent_log_path: Path
35
- ) -> None:
36
- rr.log(f"{parent_log_path}", rr.ViewCoordinates.RDF, timeless=True)
37
- # log pointcloud
38
- rr.log(
39
- f"{parent_log_path}/pointcloud",
40
- rr.Points3D(
41
- positions=optimized_result.point_cloud.vertices,
42
- colors=optimized_result.point_cloud.colors,
43
- ),
44
- timeless=True,
45
- )
46
-
47
- mesh = optimized_result.mesh
48
- rr.log(
49
- f"{parent_log_path}/mesh",
50
- rr.Mesh3D(
51
- vertex_positions=mesh.vertices,
52
- vertex_colors=mesh.visual.vertex_colors,
53
- indices=mesh.faces,
54
- ),
55
- timeless=True,
56
- )
57
- pbar = tqdm(
58
- zip(
59
- optimized_result.rgb_hw3_list,
60
- optimized_result.depth_hw_list,
61
- optimized_result.K_b33,
62
- optimized_result.world_T_cam_b44,
63
- ),
64
- total=len(optimized_result.rgb_hw3_list),
65
- )
66
- for i, (rgb_hw3, depth_hw, k_33, world_T_cam_44) in enumerate(pbar):
67
- camera_log_path = f"{parent_log_path}/camera_{i}"
68
- height, width, _ = rgb_hw3.shape
69
- rr.log(
70
- f"{camera_log_path}",
71
- rr.Transform3D(
72
- translation=world_T_cam_44[:3, 3],
73
- mat3x3=world_T_cam_44[:3, :3],
74
- from_parent=False,
75
- ),
76
- )
77
- rr.log(
78
- f"{camera_log_path}/pinhole",
79
- rr.Pinhole(
80
- image_from_camera=k_33,
81
- height=height,
82
- width=width,
83
- camera_xyz=rr.ViewCoordinates.RDF,
84
- ),
85
- )
86
- rr.log(
87
- f"{camera_log_path}/pinhole/rgb",
88
- rr.Image(rgb_hw3),
89
- )
90
- rr.log(
91
- f"{camera_log_path}/pinhole/depth",
92
- rr.DepthImage(depth_hw),
93
- )
94
-
95
-
96
- def scene_to_results(scene: BasePCOptimizer, min_conf_thr: int) -> OptimizedResult:
97
- ### get camera parameters K and T
98
- K_b33: Float32[np.ndarray, "b 3 3"] = scene.get_intrinsics().numpy(force=True)
99
- world_T_cam_b44: Float32[np.ndarray, "b 4 4"] = scene.get_im_poses().numpy(
100
- force=True
101
- )
102
- ### image, confidence, depths
103
- rgb_hw3_list: list[Float32[np.ndarray, "h w 3"]] = scene.imgs
104
- depth_hw_list: list[Float32[np.ndarray, "h w"]] = [
105
- depth.numpy(force=True) for depth in scene.get_depthmaps()
106
- ]
107
- # normalized depth
108
- # depth_hw_list = [depth_hw / depth_hw.max() for depth_hw in depth_hw_list]
109
-
110
- conf_hw_list: list[Float32[np.ndarray, "h w"]] = [
111
- c.numpy(force=True) for c in scene.im_conf
112
- ]
113
- # normalize confidence
114
- # conf_hw_list = [conf_hw / conf_hw.max() for conf_hw in conf_hw_list]
115
-
116
- # point cloud, mesh
117
- pts3d_list: list[Float32[np.ndarray, "h w 3"]] = [
118
- pt3d.numpy(force=True) for pt3d in scene.get_pts3d()
119
- ]
120
- # get log confidence
121
- log_conf_trf: Float32[torch.Tensor, ""] = scene.conf_trf(torch.tensor(min_conf_thr))
122
- # set the minimum confidence threshold
123
- scene.min_conf_thr = float(log_conf_trf)
124
- masks_list: Bool[np.ndarray, "h w"] = [
125
- mask.numpy(force=True) for mask in scene.get_masks()
126
- ]
127
-
128
- point_cloud: Float32[np.ndarray, "num_points 3"] = np.concatenate(
129
- [p[m] for p, m in zip(pts3d_list, masks_list)]
130
- )
131
- colors: Float32[np.ndarray, "num_points 3"] = np.concatenate(
132
- [p[m] for p, m in zip(rgb_hw3_list, masks_list)]
133
- )
134
- point_cloud = trimesh.PointCloud(
135
- point_cloud.reshape(-1, 3), colors=colors.reshape(-1, 3)
136
- )
137
-
138
- meshes = []
139
- pbar = tqdm(zip(rgb_hw3_list, pts3d_list, masks_list), total=len(rgb_hw3_list))
140
- for rgb_hw3, pts3d, mask in pbar:
141
- meshes.append(pts3d_to_trimesh(rgb_hw3, pts3d, mask))
142
-
143
- mesh = trimesh.Trimesh(**cat_meshes(meshes))
144
- optimised_result = OptimizedResult(
145
- K_b33=K_b33,
146
- world_T_cam_b44=world_T_cam_b44,
147
- rgb_hw3_list=rgb_hw3_list,
148
- depth_hw_list=depth_hw_list,
149
- conf_hw_list=conf_hw_list,
150
- masks_list=masks_list,
151
- point_cloud=point_cloud,
152
- mesh=mesh,
153
- )
154
- return optimised_result
155
-
156
-
157
- def inferece_dust3r(
158
- image_dir_or_list: Path | list[Path],
159
- model: AsymmetricCroCo3DStereo,
160
- device: Literal["cpu", "cuda", "mps"],
161
- batch_size: int = 1,
162
- image_size: Literal[224, 512] = 512,
163
- niter: int = 100,
164
- schedule: Literal["linear", "cosine"] = "linear",
165
- min_conf_thr: float = 10,
166
- ) -> OptimizedResult:
167
- """
168
- Perform inference using the Dust3r algorithm.
169
-
170
- Args:
171
- image_dir_or_list (Union[Path, List[Path]]): Path to the directory containing images or a list of image paths.
172
- model (AsymmetricCroCo3DStereo): The Dust3r model to use for inference.
173
- device (Literal["cpu", "cuda", "mps"]): The device to use for inference ("cpu", "cuda", or "mps").
174
- batch_size (int, optional): The batch size for inference. Defaults to 1.
175
- image_size (Literal[224, 512], optional): The size of the input images. Defaults to 512.
176
- niter (int, optional): The number of iterations for the global alignment optimization. Defaults to 100.
177
- schedule (Literal["linear", "cosine"], optional): The learning rate schedule for the global alignment optimization. Defaults to "linear".
178
- min_conf_thr (float, optional): The minimum confidence threshold for the optimized result. Defaults to 10.
179
-
180
- Returns:
181
- OptimizedResult: The optimized result containing the RGB, depth, and confidence images.
182
-
183
- Raises:
184
- ValueError: If `image_dir_or_list` is neither a list of paths nor a path.
185
- """
186
- if isinstance(image_dir_or_list, list):
187
- imgs: list[ImageDict] = load_images(
188
- folder_or_list=image_dir_or_list, size=image_size, verbose=True
189
- )
190
- elif isinstance(image_dir_or_list, Path):
191
- imgs: list[ImageDict] = load_images(
192
- folder_or_list=str(image_dir_or_list), size=image_size, verbose=True
193
- )
194
- else:
195
- raise ValueError("image_dir_or_list should be a list of paths or a path")
196
-
197
- # if only one image was loaded, duplicate it to feed into stereo network
198
- if len(imgs) == 1:
199
- imgs = [imgs[0], copy.deepcopy(imgs[0])]
200
- imgs[1]["idx"] = 1
201
-
202
- pairs: list[tuple[ImageDict, ImageDict]] = make_pairs(
203
- imgs, scene_graph="complete", prefilter=None, symmetrize=True
204
- )
205
- output: Dust3rResult = inference(pairs, model, device, batch_size=batch_size)
206
-
207
- mode = (
208
- GlobalAlignerMode.PointCloudOptimizer
209
- if len(imgs) > 2
210
- else GlobalAlignerMode.PairViewer
211
- )
212
- scene: BasePCOptimizer = global_aligner(
213
- dust3r_output=output, device=device, mode=mode
214
- )
215
-
216
- lr = 0.01
217
-
218
- if mode == GlobalAlignerMode.PointCloudOptimizer:
219
- loss = scene.compute_global_alignment(
220
- init="mst", niter=niter, schedule=schedule, lr=lr
221
- )
222
-
223
- # get the optimized result from the scene
224
- optimized_result: OptimizedResult = scene_to_results(scene, min_conf_thr)
225
- return optimized_result
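Taken together with the deleted app code, `inferece_dust3r` is the single entry point of this module: it loads images, runs pairwise inference, globally aligns the result, and packs everything into an `OptimizedResult`. A hedged usage sketch; the image folder below is illustrative, not taken from the repository layout:

```python
# Illustrative usage of the deleted helper (note the original spelling "inferece_dust3r").
from pathlib import Path
from mini_dust3r.model import AsymmetricCroCo3DStereo
from mini_dust3r.api import inferece_dust3r

model = AsymmetricCroCo3DStereo.from_pretrained(
    "checkpoints/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth"   # checkpoint used by the demo
).to("cuda")

result = inferece_dust3r(
    image_dir_or_list=Path("examples/obj"),   # or a list of image Paths
    model=model,
    device="cuda",
    image_size=224,
    niter=300,
    min_conf_thr=3.0,
)
print(result.world_T_cam_b44.shape)           # (n_views, 4, 4) cam-to-world poses
result.point_cloud.export("scene.ply")        # colored point cloud as a PLY file
```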
mini_dust3r/cloud_opt/__init__.py DELETED
@@ -1,44 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # global alignment optimization wrapper function
6
- # --------------------------------------------------------
7
- from enum import Enum
8
-
9
- from .optimizer import PointCloudOptimizer
10
- from .modular_optimizer import ModularPointCloudOptimizer
11
- from .pair_viewer import PairViewer
12
- from mini_dust3r.inference import Dust3rResult
13
- from typing import Literal
14
-
15
-
16
- class GlobalAlignerMode(Enum):
17
- PointCloudOptimizer = "PointCloudOptimizer"
18
- ModularPointCloudOptimizer = "ModularPointCloudOptimizer"
19
- PairViewer = "PairViewer"
20
-
21
-
22
- def global_aligner(
23
- dust3r_output: Dust3rResult,
24
- device: Literal["cpu", "cuda", "mps"],
25
- mode: GlobalAlignerMode = GlobalAlignerMode.PointCloudOptimizer,
26
- **optim_kw,
27
- ):
28
- # extract all inputs
29
- view1, view2, pred1, pred2 = [
30
- dust3r_output[k] for k in "view1 view2 pred1 pred2".split()
31
- ]
32
- # build the optimizer
33
- if mode == GlobalAlignerMode.PointCloudOptimizer:
34
- net = PointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device)
35
- elif mode == GlobalAlignerMode.ModularPointCloudOptimizer:
36
- net = ModularPointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(
37
- device
38
- )
39
- elif mode == GlobalAlignerMode.PairViewer:
40
- net = PairViewer(view1, view2, pred1, pred2, **optim_kw).to(device)
41
- else:
42
- raise NotImplementedError(f"Unknown mode {mode}")
43
-
44
- return net
mini_dust3r/cloud_opt/base_opt.py DELETED
@@ -1,390 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # Base class for the global alignment procedure
6
- # --------------------------------------------------------
7
- from copy import deepcopy
8
-
9
- import numpy as np
10
- import torch
11
- import torch.nn as nn
12
- import roma
13
- from copy import deepcopy
14
- import tqdm
15
-
16
- from mini_dust3r.utils.geometry import inv, geotrf
17
- from mini_dust3r.utils.device import to_numpy
18
- from mini_dust3r.utils.image import rgb
19
- from mini_dust3r.viz import SceneViz, segment_sky, auto_cam_size
20
- from mini_dust3r.optim_factory import adjust_learning_rate_by_lr
21
-
22
- from mini_dust3r.cloud_opt.commons import (edge_str, ALL_DISTS, NoGradParamDict, get_imshapes, signed_expm1, signed_log1p,
23
- cosine_schedule, linear_schedule, get_conf_trf)
24
- import mini_dust3r.cloud_opt.init_im_poses as init_fun
25
-
26
-
27
- class BasePCOptimizer (nn.Module):
28
- """ Optimize a global scene, given a list of pairwise observations.
29
- Graph node: images
30
- Graph edges: observations = (pred1, pred2)
31
- """
32
-
33
- def __init__(self, *args, **kwargs):
34
- if len(args) == 1 and len(kwargs) == 0:
35
- other = deepcopy(args[0])
36
- attrs = '''edges is_symmetrized dist n_imgs pred_i pred_j imshapes
37
- min_conf_thr conf_thr conf_i conf_j im_conf
38
- base_scale norm_pw_scale POSE_DIM pw_poses
39
- pw_adaptors pw_adaptors has_im_poses rand_pose imgs verbose'''.split()
40
- self.__dict__.update({k: other[k] for k in attrs})
41
- else:
42
- self._init_from_views(*args, **kwargs)
43
-
44
- def _init_from_views(self, view1, view2, pred1, pred2,
45
- dist='l1',
46
- conf='log',
47
- min_conf_thr=3,
48
- base_scale=0.5,
49
- allow_pw_adaptors=False,
50
- pw_break=20,
51
- rand_pose=torch.randn,
52
- iterationsCount=None,
53
- verbose=True):
54
- super().__init__()
55
- if not isinstance(view1['idx'], list):
56
- view1['idx'] = view1['idx'].tolist()
57
- if not isinstance(view2['idx'], list):
58
- view2['idx'] = view2['idx'].tolist()
59
- self.edges = [(int(i), int(j)) for i, j in zip(view1['idx'], view2['idx'])]
60
- self.is_symmetrized = set(self.edges) == {(j, i) for i, j in self.edges}
61
- self.dist = ALL_DISTS[dist]
62
- self.verbose = verbose
63
-
64
- self.n_imgs = self._check_edges()
65
-
66
- # input data
67
- pred1_pts = pred1['pts3d']
68
- pred2_pts = pred2['pts3d_in_other_view']
69
- self.pred_i = NoGradParamDict({ij: pred1_pts[n] for n, ij in enumerate(self.str_edges)})
70
- self.pred_j = NoGradParamDict({ij: pred2_pts[n] for n, ij in enumerate(self.str_edges)})
71
- self.imshapes = get_imshapes(self.edges, pred1_pts, pred2_pts)
72
-
73
- # work in log-scale with conf
74
- pred1_conf = pred1['conf']
75
- pred2_conf = pred2['conf']
76
- self.min_conf_thr = min_conf_thr
77
- self.conf_trf = get_conf_trf(conf)
78
-
79
- self.conf_i = NoGradParamDict({ij: pred1_conf[n] for n, ij in enumerate(self.str_edges)})
80
- self.conf_j = NoGradParamDict({ij: pred2_conf[n] for n, ij in enumerate(self.str_edges)})
81
- self.im_conf = self._compute_img_conf(pred1_conf, pred2_conf)
82
-
83
- # pairwise pose parameters
84
- self.base_scale = base_scale
85
- self.norm_pw_scale = True
86
- self.pw_break = pw_break
87
- self.POSE_DIM = 7
88
- self.pw_poses = nn.Parameter(rand_pose((self.n_edges, 1+self.POSE_DIM))) # pairwise poses
89
- self.pw_adaptors = nn.Parameter(torch.zeros((self.n_edges, 2))) # slight xy/z adaptation
90
- self.pw_adaptors.requires_grad_(allow_pw_adaptors)
91
- self.has_im_poses = False
92
- self.rand_pose = rand_pose
93
-
94
- # possibly store images for show_pointcloud
95
- self.imgs = None
96
- if 'img' in view1 and 'img' in view2:
97
- imgs = [torch.zeros((3,)+hw) for hw in self.imshapes]
98
- for v in range(len(self.edges)):
99
- idx = view1['idx'][v]
100
- imgs[idx] = view1['img'][v]
101
- idx = view2['idx'][v]
102
- imgs[idx] = view2['img'][v]
103
- self.imgs = rgb(imgs)
104
-
105
- @property
106
- def n_edges(self):
107
- return len(self.edges)
108
-
109
- @property
110
- def str_edges(self):
111
- return [edge_str(i, j) for i, j in self.edges]
112
-
113
- @property
114
- def imsizes(self):
115
- return [(w, h) for h, w in self.imshapes]
116
-
117
- @property
118
- def device(self):
119
- return next(iter(self.parameters())).device
120
-
121
- def state_dict(self, trainable=True):
122
- all_params = super().state_dict()
123
- return {k: v for k, v in all_params.items() if k.startswith(('_', 'pred_i.', 'pred_j.', 'conf_i.', 'conf_j.')) != trainable}
124
-
125
- def load_state_dict(self, data):
126
- return super().load_state_dict(self.state_dict(trainable=False) | data)
127
-
128
- def _check_edges(self):
129
- indices = sorted({i for edge in self.edges for i in edge})
130
- assert indices == list(range(len(indices))), 'bad pair indices: missing values '
131
- return len(indices)
132
-
133
- @torch.no_grad()
134
- def _compute_img_conf(self, pred1_conf, pred2_conf):
135
- im_conf = nn.ParameterList([torch.zeros(hw, device=self.device) for hw in self.imshapes])
136
- for e, (i, j) in enumerate(self.edges):
137
- im_conf[i] = torch.maximum(im_conf[i], pred1_conf[e])
138
- im_conf[j] = torch.maximum(im_conf[j], pred2_conf[e])
139
- return im_conf
140
-
141
- def get_adaptors(self):
142
- adapt = self.pw_adaptors
143
- adapt = torch.cat((adapt[:, 0:1], adapt), dim=-1) # (scale_xy, scale_xy, scale_z)
144
- if self.norm_pw_scale: # normalize so that the product == 1
145
- adapt = adapt - adapt.mean(dim=1, keepdim=True)
146
- return (adapt / self.pw_break).exp()
147
-
148
- def _get_poses(self, poses):
149
- # normalize rotation
150
- Q = poses[:, :4]
151
- T = signed_expm1(poses[:, 4:7])
152
- RT = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous()
153
- return RT
154
-
155
- def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
156
- # all poses == cam-to-world
157
- pose = poses[idx]
158
- if not (pose.requires_grad or force):
159
- return pose
160
-
161
- if R.shape == (4, 4):
162
- assert T is None
163
- T = R[:3, 3]
164
- R = R[:3, :3]
165
-
166
- if R is not None:
167
- pose.data[0:4] = roma.rotmat_to_unitquat(R)
168
- if T is not None:
169
- pose.data[4:7] = signed_log1p(T / (scale or 1)) # translation is function of scale
170
-
171
- if scale is not None:
172
- assert poses.shape[-1] in (8, 13)
173
- pose.data[-1] = np.log(float(scale))
174
- return pose
175
-
176
- def get_pw_norm_scale_factor(self):
177
- if self.norm_pw_scale:
178
- # normalize scales so that things cannot go south
179
- # we want that exp(scale) ~= self.base_scale
180
- return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
181
- else:
182
- return 1 # don't norm scale for known poses
183
-
184
- def get_pw_scale(self):
185
- scale = self.pw_poses[:, -1].exp() # (n_edges,)
186
- scale = scale * self.get_pw_norm_scale_factor()
187
- return scale
188
-
189
- def get_pw_poses(self): # cam to world
190
- RT = self._get_poses(self.pw_poses)
191
- scaled_RT = RT.clone()
192
- scaled_RT[:, :3] *= self.get_pw_scale().view(-1, 1, 1) # scale the rotation AND translation
193
- return scaled_RT
194
-
195
- def get_masks(self):
196
- return [(conf > self.min_conf_thr) for conf in self.im_conf]
197
-
198
- def depth_to_pts3d(self):
199
- raise NotImplementedError()
200
-
201
- def get_pts3d(self, raw=False):
202
- res = self.depth_to_pts3d()
203
- if not raw:
204
- res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
205
- return res
206
-
207
- def _set_focal(self, idx, focal, force=False):
208
- raise NotImplementedError()
209
-
210
- def get_focals(self):
211
- raise NotImplementedError()
212
-
213
- def get_known_focal_mask(self):
214
- raise NotImplementedError()
215
-
216
- def get_principal_points(self):
217
- raise NotImplementedError()
218
-
219
- def get_conf(self, mode=None):
220
- trf = self.conf_trf if mode is None else get_conf_trf(mode)
221
- return [trf(c) for c in self.im_conf]
222
-
223
- def get_im_poses(self):
224
- raise NotImplementedError()
225
-
226
- def _set_depthmap(self, idx, depth, force=False):
227
- raise NotImplementedError()
228
-
229
- def get_depthmaps(self, raw=False):
230
- raise NotImplementedError()
231
-
232
- @torch.no_grad()
233
- def clean_pointcloud(self, tol=0.001, max_bad_conf=0):
234
- """ Method:
235
- 1) express all 3d points in each camera coordinate frame
236
- 2) if they're in front of a depthmap --> then lower their confidence
237
- """
238
- assert 0 <= tol < 1
239
- cams = inv(self.get_im_poses())
240
- K = self.get_intrinsics()
241
- depthmaps = self.get_depthmaps()
242
- res = deepcopy(self)
243
-
244
- for i, pts3d in enumerate(self.depth_to_pts3d()):
245
- for j in range(self.n_imgs):
246
- if i == j:
247
- continue
248
-
249
- # project 3dpts in other view
250
- Hi, Wi = self.imshapes[i]
251
- Hj, Wj = self.imshapes[j]
252
- proj = geotrf(cams[j], pts3d[:Hi*Wi]).reshape(Hi, Wi, 3)
253
- proj_depth = proj[:, :, 2]
254
- u, v = geotrf(K[j], proj, norm=1, ncol=2).round().long().unbind(-1)
255
-
256
- # check which points are actually in the visible cone
257
- msk_i = (proj_depth > 0) & (0 <= u) & (u < Wj) & (0 <= v) & (v < Hj)
258
- msk_j = v[msk_i], u[msk_i]
259
-
260
- # find bad points = those in front but less confident
261
- bad_points = (proj_depth[msk_i] < (1-tol) * depthmaps[j][msk_j]
262
- ) & (res.im_conf[i][msk_i] < res.im_conf[j][msk_j])
263
-
264
- bad_msk_i = msk_i.clone()
265
- bad_msk_i[msk_i] = bad_points
266
- res.im_conf[i][bad_msk_i] = res.im_conf[i][bad_msk_i].clip_(max=max_bad_conf)
267
-
268
- return res
269
-
270
- def forward(self, ret_details=False):
271
- pw_poses = self.get_pw_poses() # cam-to-world
272
- pw_adapt = self.get_adaptors()
273
- proj_pts3d = self.get_pts3d()
274
- # pre-compute pixel weights
275
- weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
276
- weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}
277
-
278
- loss = 0
279
- if ret_details:
280
- details = -torch.ones((self.n_imgs, self.n_imgs))
281
-
282
- for e, (i, j) in enumerate(self.edges):
283
- i_j = edge_str(i, j)
284
- # distance in image i and j
285
- aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
286
- aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
287
- li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
288
- lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
289
- loss = loss + li + lj
290
-
291
- if ret_details:
292
- details[i, j] = li + lj
293
- loss /= self.n_edges # average over all pairs
294
-
295
- if ret_details:
296
- return loss, details
297
- return loss
298
-
299
- @torch.cuda.amp.autocast(enabled=False)
300
- def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
301
- if init is None:
302
- pass
303
- elif init == 'msp' or init == 'mst':
304
- init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
305
- elif init == 'known_poses':
306
- init_fun.init_from_known_poses(self, min_conf_thr=self.min_conf_thr,
307
- niter_PnP=niter_PnP)
308
- else:
309
- raise ValueError(f'bad value for {init=}')
310
-
311
- return global_alignment_loop(self, **kw)
312
-
313
- @torch.no_grad()
314
- def mask_sky(self):
315
- res = deepcopy(self)
316
- for i in range(self.n_imgs):
317
- sky = segment_sky(self.imgs[i])
318
- res.im_conf[i][sky] = 0
319
- return res
320
-
321
- def show(self, show_pw_cams=False, show_pw_pts3d=False, cam_size=None, **kw):
322
- viz = SceneViz()
323
- if self.imgs is None:
324
- colors = np.random.randint(0, 256, size=(self.n_imgs, 3))
325
- colors = list(map(tuple, colors.tolist()))
326
- for n in range(self.n_imgs):
327
- viz.add_pointcloud(self.get_pts3d()[n], colors[n], self.get_masks()[n])
328
- else:
329
- viz.add_pointcloud(self.get_pts3d(), self.imgs, self.get_masks())
330
- colors = np.random.randint(256, size=(self.n_imgs, 3))
331
-
332
- # camera poses
333
- im_poses = to_numpy(self.get_im_poses())
334
- if cam_size is None:
335
- cam_size = auto_cam_size(im_poses)
336
- viz.add_cameras(im_poses, self.get_focals(), colors=colors,
337
- images=self.imgs, imsizes=self.imsizes, cam_size=cam_size)
338
- if show_pw_cams:
339
- pw_poses = self.get_pw_poses()
340
- viz.add_cameras(pw_poses, color=(192, 0, 192), cam_size=cam_size)
341
-
342
- if show_pw_pts3d:
343
- pts = [geotrf(pw_poses[e], self.pred_i[edge_str(i, j)]) for e, (i, j) in enumerate(self.edges)]
344
- viz.add_pointcloud(pts, (128, 0, 128))
345
-
346
- viz.show(**kw)
347
- return viz
348
-
349
-
350
- def global_alignment_loop(net, lr=0.01, niter=300, schedule='cosine', lr_min=1e-6):
351
- params = [p for p in net.parameters() if p.requires_grad]
352
- if not params:
353
- return net
354
-
355
- verbose = net.verbose
356
- if verbose:
357
- print('Global alignment - optimizing for:')
358
- print([name for name, value in net.named_parameters() if value.requires_grad])
359
-
360
- lr_base = lr
361
- optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9))
362
-
363
- loss = float('inf')
364
- if verbose:
365
- with tqdm.tqdm(total=niter) as bar:
366
- while bar.n < bar.total:
367
- loss = global_alignment_iter(net, bar.n, niter, lr_base, lr_min, optimizer, schedule)
368
- bar.set_postfix_str(f'{lr=:g} loss={loss:g}')
369
- bar.update()
370
- else:
371
- for n in range(niter):
372
- loss = global_alignment_iter(net, n, niter, lr_base, lr_min, optimizer, schedule)
373
- return loss
374
-
375
-
376
- def global_alignment_iter(net, cur_iter, niter, lr_base, lr_min, optimizer, schedule):
377
- t = cur_iter / niter
378
- if schedule == 'cosine':
379
- lr = cosine_schedule(t, lr_base, lr_min)
380
- elif schedule == 'linear':
381
- lr = linear_schedule(t, lr_base, lr_min)
382
- else:
383
- raise ValueError(f'bad lr {schedule=}')
384
- adjust_learning_rate_by_lr(optimizer, lr)
385
- optimizer.zero_grad()
386
- loss = net()
387
- loss.backward()
388
- optimizer.step()
389
-
390
- return float(loss)
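For reference, the objective driven by `global_alignment_loop` is the one assembled in `BasePCOptimizer.forward` above; the notation below is mine, not from the source. With E the set of pairwise edges, X_i the current global 3D points of image i, x_i^e the pairwise prediction for edge e = (i, j), a_e the per-edge xy/z adaptors, P_e the scaled cam-to-world pose from `get_pw_poses()`, w the confidence weights `conf_trf(conf)`, and d the l1/l2 distance from commons.py:

    loss = (1/|E|) * sum over e=(i,j) in E of
           [ mean_p  w_i^e(p) * d( X_i(p), P_e(a_e * x_i^e(p)) )
           + mean_p  w_j^e(p) * d( X_j(p), P_e(a_e * x_j^e(p)) ) ]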
mini_dust3r/cloud_opt/commons.py DELETED
@@ -1,90 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # utility functions for global alignment
6
- # --------------------------------------------------------
7
- import torch
8
- import torch.nn as nn
9
- import numpy as np
10
-
11
-
12
- def edge_str(i, j):
13
- return f'{i}_{j}'
14
-
15
-
16
- def i_j_ij(ij):
17
- return edge_str(*ij), ij
18
-
19
-
20
- def edge_conf(conf_i, conf_j, edge):
21
- return float(conf_i[edge].mean() * conf_j[edge].mean())
22
-
23
-
24
- def compute_edge_scores(edges, conf_i, conf_j):
25
- return {(i, j): edge_conf(conf_i, conf_j, e) for e, (i, j) in edges}
26
-
27
-
28
- def NoGradParamDict(x):
29
- assert isinstance(x, dict)
30
- return nn.ParameterDict(x).requires_grad_(False)
31
-
32
-
33
- def get_imshapes(edges, pred_i, pred_j):
34
- n_imgs = max(max(e) for e in edges) + 1
35
- imshapes = [None] * n_imgs
36
- for e, (i, j) in enumerate(edges):
37
- shape_i = tuple(pred_i[e].shape[0:2])
38
- shape_j = tuple(pred_j[e].shape[0:2])
39
- if imshapes[i]:
40
- assert imshapes[i] == shape_i, f'incorrect shape for image {i}'
41
- if imshapes[j]:
42
- assert imshapes[j] == shape_j, f'incorrect shape for image {j}'
43
- imshapes[i] = shape_i
44
- imshapes[j] = shape_j
45
- return imshapes
46
-
47
-
48
- def get_conf_trf(mode):
49
- if mode == 'log':
50
- def conf_trf(x): return x.log()
51
- elif mode == 'sqrt':
52
- def conf_trf(x): return x.sqrt()
53
- elif mode == 'm1':
54
- def conf_trf(x): return x-1
55
- elif mode in ('id', 'none'):
56
- def conf_trf(x): return x
57
- else:
58
- raise ValueError(f'bad mode for {mode=}')
59
- return conf_trf
60
-
61
-
62
- def l2_dist(a, b, weight):
63
- return ((a - b).square().sum(dim=-1) * weight)
64
-
65
-
66
- def l1_dist(a, b, weight):
67
- return ((a - b).norm(dim=-1) * weight)
68
-
69
-
70
- ALL_DISTS = dict(l1=l1_dist, l2=l2_dist)
71
-
72
-
73
- def signed_log1p(x):
74
- sign = torch.sign(x)
75
- return sign * torch.log1p(torch.abs(x))
76
-
77
-
78
- def signed_expm1(x):
79
- sign = torch.sign(x)
80
- return sign * torch.expm1(torch.abs(x))
81
-
82
-
83
- def cosine_schedule(t, lr_start, lr_end):
84
- assert 0 <= t <= 1
85
- return lr_end + (lr_start - lr_end) * (1+np.cos(t * np.pi))/2
86
-
87
-
88
- def linear_schedule(t, lr_start, lr_end):
89
- assert 0 <= t <= 1
90
- return lr_start + (lr_end - lr_start) * t
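Both schedules above run from `lr_start` down to `lr_end`; the cosine variant simply stays near `lr_start` longer before dropping. A small self-contained check (the two functions are copied verbatim from above), using the defaults of `global_alignment_loop` (lr=0.01, lr_min=1e-6):

```python
import numpy as np

def cosine_schedule(t, lr_start, lr_end):
    return lr_end + (lr_start - lr_end) * (1 + np.cos(t * np.pi)) / 2

def linear_schedule(t, lr_start, lr_end):
    return lr_start + (lr_end - lr_start) * t

for t in (0.0, 0.25, 0.5, 1.0):
    print(t, linear_schedule(t, 1e-2, 1e-6), cosine_schedule(t, 1e-2, 1e-6))
# t=0.00 -> both 1e-2
# t=0.25 -> linear ~7.5e-3, cosine ~8.5e-3  (cosine decays later)
# t=0.50 -> both ~5.0e-3
# t=1.00 -> both 1e-6
```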
mini_dust3r/cloud_opt/init_im_poses.py DELETED
@@ -1,316 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # Initialization functions for global alignment
6
- # --------------------------------------------------------
7
- from functools import cache
8
-
9
- import numpy as np
10
- import scipy.sparse as sp
11
- import torch
12
- import cv2
13
- import roma
14
- from tqdm import tqdm
15
-
16
- from mini_dust3r.utils.geometry import geotrf, inv, get_med_dist_between_poses
17
- from mini_dust3r.post_process import estimate_focal_knowing_depth
18
- from mini_dust3r.viz import to_numpy
19
-
20
- from mini_dust3r.cloud_opt.commons import edge_str, i_j_ij, compute_edge_scores
21
-
22
-
23
- @torch.no_grad()
24
- def init_from_known_poses(self, niter_PnP=10, min_conf_thr=3):
25
- device = self.device
26
-
27
- # indices of known poses
28
- nkp, known_poses_msk, known_poses = get_known_poses(self)
29
- assert nkp == self.n_imgs, 'not all poses are known'
30
-
31
- # get all focals
32
- nkf, _, im_focals = get_known_focals(self)
33
- assert nkf == self.n_imgs
34
- im_pp = self.get_principal_points()
35
-
36
- best_depthmaps = {}
37
- # init all pairwise poses
38
- for e, (i, j) in enumerate(tqdm(self.edges, disable=not self.verbose)):
39
- i_j = edge_str(i, j)
40
-
41
- # find relative pose for this pair
42
- P1 = torch.eye(4, device=device)
43
- msk = self.conf_i[i_j] > min(min_conf_thr, self.conf_i[i_j].min() - 0.1)
44
- _, P2 = fast_pnp(self.pred_j[i_j], float(im_focals[i].mean()),
45
- pp=im_pp[i], msk=msk, device=device, niter_PnP=niter_PnP)
46
-
47
- # align the two predicted camera with the two gt cameras
48
- s, R, T = align_multiple_poses(torch.stack((P1, P2)), known_poses[[i, j]])
49
- # normally we have known_poses[i] ~= sRT_to_4x4(s,R,T,device) @ P1
50
- # and geotrf(sRT_to_4x4(1,R,T,device), s*P2[:3,3])
51
- self._set_pose(self.pw_poses, e, R, T, scale=s)
52
-
53
- # remember if this is a good depthmap
54
- score = float(self.conf_i[i_j].mean())
55
- if score > best_depthmaps.get(i, (0,))[0]:
56
- best_depthmaps[i] = score, i_j, s
57
-
58
- # init all image poses
59
- for n in range(self.n_imgs):
60
- assert known_poses_msk[n]
61
- _, i_j, scale = best_depthmaps[n]
62
- depth = self.pred_i[i_j][:, :, 2]
63
- self._set_depthmap(n, depth * scale)
64
-
65
-
66
- @torch.no_grad()
67
- def init_minimum_spanning_tree(self, **kw):
68
- """ Init all camera poses (image-wise and pairwise poses) given
69
- an initial set of pairwise estimations.
70
- """
71
- device = self.device
72
- pts3d, _, im_focals, im_poses = minimum_spanning_tree(self.imshapes, self.edges,
73
- self.pred_i, self.pred_j, self.conf_i, self.conf_j, self.im_conf, self.min_conf_thr,
74
- device, has_im_poses=self.has_im_poses, verbose=self.verbose,
75
- **kw)
76
-
77
- return init_from_pts3d(self, pts3d, im_focals, im_poses)
78
-
79
-
80
- def init_from_pts3d(self, pts3d, im_focals, im_poses):
81
- # init poses
82
- nkp, known_poses_msk, known_poses = get_known_poses(self)
83
- if nkp == 1:
84
- raise NotImplementedError("Would be simpler to just align everything afterwards on the single known pose")
85
- elif nkp > 1:
86
- # global rigid SE3 alignment
87
- s, R, T = align_multiple_poses(im_poses[known_poses_msk], known_poses[known_poses_msk])
88
- trf = sRT_to_4x4(s, R, T, device=known_poses.device)
89
-
90
- # rotate everything
91
- im_poses = trf @ im_poses
92
- im_poses[:, :3, :3] /= s # undo scaling on the rotation part
93
- for img_pts3d in pts3d:
94
- img_pts3d[:] = geotrf(trf, img_pts3d)
95
-
96
- # set all pairwise poses
97
- for e, (i, j) in enumerate(self.edges):
98
- i_j = edge_str(i, j)
99
- # compute transform that goes from cam to world
100
- s, R, T = rigid_points_registration(self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j])
101
- self._set_pose(self.pw_poses, e, R, T, scale=s)
102
-
103
- # take into account the scale normalization
104
- s_factor = self.get_pw_norm_scale_factor()
105
- im_poses[:, :3, 3] *= s_factor # apply downscaling factor
106
- for img_pts3d in pts3d:
107
- img_pts3d *= s_factor
108
-
109
- # init all image poses
110
- if self.has_im_poses:
111
- for i in range(self.n_imgs):
112
- cam2world = im_poses[i]
113
- depth = geotrf(inv(cam2world), pts3d[i])[..., 2]
114
- self._set_depthmap(i, depth)
115
- self._set_pose(self.im_poses, i, cam2world)
116
- if im_focals[i] is not None:
117
- self._set_focal(i, im_focals[i])
118
-
119
- if self.verbose:
120
- print(' init loss =', float(self()))
121
-
122
-
123
- def minimum_spanning_tree(imshapes, edges, pred_i, pred_j, conf_i, conf_j, im_conf, min_conf_thr,
124
- device, has_im_poses=True, niter_PnP=10, verbose=True):
125
- n_imgs = len(imshapes)
126
- sparse_graph = -dict_to_sparse_graph(compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j))
127
- msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()
128
-
129
- # temp variable to store 3d points
130
- pts3d = [None] * len(imshapes)
131
-
132
- todo = sorted(zip(-msp.data, msp.row, msp.col)) # sorted edges
133
- im_poses = [None] * n_imgs
134
- im_focals = [None] * n_imgs
135
-
136
- # init with strongest edge
137
- score, i, j = todo.pop()
138
- if verbose:
139
- print(f' init edge ({i}*,{j}*) {score=}')
140
- i_j = edge_str(i, j)
141
- pts3d[i] = pred_i[i_j].clone()
142
- pts3d[j] = pred_j[i_j].clone()
143
- done = {i, j}
144
- if has_im_poses:
145
- im_poses[i] = torch.eye(4, device=device)
146
- im_focals[i] = estimate_focal(pred_i[i_j])
147
-
148
- # set initial pointcloud based on pairwise graph
149
- msp_edges = [(i, j)]
150
- while todo:
151
- # each time, predict the next one
152
- score, i, j = todo.pop()
153
-
154
- if im_focals[i] is None:
155
- im_focals[i] = estimate_focal(pred_i[i_j])
156
-
157
- if i in done:
158
- if verbose:
159
- print(f' init edge ({i},{j}*) {score=}')
160
- assert j not in done
161
- # align pred[i] with pts3d[i], and then set j accordingly
162
- i_j = edge_str(i, j)
163
- s, R, T = rigid_points_registration(pred_i[i_j], pts3d[i], conf=conf_i[i_j])
164
- trf = sRT_to_4x4(s, R, T, device)
165
- pts3d[j] = geotrf(trf, pred_j[i_j])
166
- done.add(j)
167
- msp_edges.append((i, j))
168
-
169
- if has_im_poses and im_poses[i] is None:
170
- im_poses[i] = sRT_to_4x4(1, R, T, device)
171
-
172
- elif j in done:
173
- if verbose:
174
- print(f' init edge ({i}*,{j}) {score=}')
175
- assert i not in done
176
- i_j = edge_str(i, j)
177
- s, R, T = rigid_points_registration(pred_j[i_j], pts3d[j], conf=conf_j[i_j])
178
- trf = sRT_to_4x4(s, R, T, device)
179
- pts3d[i] = geotrf(trf, pred_i[i_j])
180
- done.add(i)
181
- msp_edges.append((i, j))
182
-
183
- if has_im_poses and im_poses[i] is None:
184
- im_poses[i] = sRT_to_4x4(1, R, T, device)
185
- else:
186
- # let's try again later
187
- todo.insert(0, (score, i, j))
188
-
189
- if has_im_poses:
190
- # complete all missing information
191
- pair_scores = list(sparse_graph.values()) # already negative scores: less is best
192
- edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[np.argsort(pair_scores)]
193
- for i, j in edges_from_best_to_worse.tolist():
194
- if im_focals[i] is None:
195
- im_focals[i] = estimate_focal(pred_i[edge_str(i, j)])
196
-
197
- for i in range(n_imgs):
198
- if im_poses[i] is None:
199
- msk = im_conf[i] > min_conf_thr
200
- res = fast_pnp(pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP)
201
- if res:
202
- im_focals[i], im_poses[i] = res
203
- if im_poses[i] is None:
204
- im_poses[i] = torch.eye(4, device=device)
205
- im_poses = torch.stack(im_poses)
206
- else:
207
- im_poses = im_focals = None
208
-
209
- return pts3d, msp_edges, im_focals, im_poses
210
-
211
-
212
- def dict_to_sparse_graph(dic):
213
- n_imgs = max(max(e) for e in dic) + 1
214
- res = sp.dok_array((n_imgs, n_imgs))
215
- for edge, value in dic.items():
216
- res[edge] = value
217
- return res
218
-
219
-
220
- def rigid_points_registration(pts1, pts2, conf):
221
- R, T, s = roma.rigid_points_registration(
222
- pts1.reshape(-1, 3), pts2.reshape(-1, 3), weights=conf.ravel(), compute_scaling=True)
223
- return s, R, T # return un-scaled (R, T)
224
-
225
-
226
- def sRT_to_4x4(scale, R, T, device):
227
- trf = torch.eye(4, device=device)
228
- trf[:3, :3] = R * scale
229
- trf[:3, 3] = T.ravel() # doesn't need scaling
230
- return trf
231
-
232
-
233
- def estimate_focal(pts3d_i, pp=None):
234
- if pp is None:
235
- H, W, THREE = pts3d_i.shape
236
- assert THREE == 3
237
- pp = torch.tensor((W/2, H/2), device=pts3d_i.device)
238
- focal = estimate_focal_knowing_depth(pts3d_i.unsqueeze(0), pp.unsqueeze(0), focal_mode='weiszfeld').ravel()
239
- return float(focal)
240
-
241
-
242
- @cache
243
- def pixel_grid(H, W):
244
- return np.mgrid[:W, :H].T.astype(np.float32)
245
-
246
-
247
- def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
248
- # extract camera poses and focals with RANSAC-PnP
249
- if msk.sum() < 4:
250
- return None # we need at least 4 points for PnP
251
- pts3d, msk = map(to_numpy, (pts3d, msk))
252
-
253
- H, W, THREE = pts3d.shape
254
- assert THREE == 3
255
- pixels = pixel_grid(H, W)
256
-
257
- if focal is None:
258
- S = max(W, H)
259
- tentative_focals = np.geomspace(S/2, S*3, 21)
260
- else:
261
- tentative_focals = [focal]
262
-
263
- if pp is None:
264
- pp = (W/2, H/2)
265
- else:
266
- pp = to_numpy(pp)
267
-
268
- best = 0,
269
- for focal in tentative_focals:
270
- K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])
271
-
272
- success, R, T, inliers = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None,
273
- iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)
274
- if not success:
275
- continue
276
-
277
- score = len(inliers)
278
- if success and score > best[0]:
279
- best = score, R, T, focal
280
-
281
- if not best[0]:
282
- return None
283
-
284
- _, R, T, best_focal = best
285
- R = cv2.Rodrigues(R)[0] # world to cam
286
- R, T = map(torch.from_numpy, (R, T))
287
- return best_focal, inv(sRT_to_4x4(1, R, T, device)) # cam to world
288
-
289
-
290
- def get_known_poses(self):
291
- if self.has_im_poses:
292
- known_poses_msk = torch.tensor([not (p.requires_grad) for p in self.im_poses])
293
- known_poses = self.get_im_poses()
294
- return known_poses_msk.sum(), known_poses_msk, known_poses
295
- else:
296
- return 0, None, None
297
-
298
-
299
- def get_known_focals(self):
300
- if self.has_im_poses:
301
- known_focal_msk = self.get_known_focal_mask()
302
- known_focals = self.get_focals()
303
- return known_focal_msk.sum(), known_focal_msk, known_focals
304
- else:
305
- return 0, None, None
306
-
307
-
308
- def align_multiple_poses(src_poses, target_poses):
309
- N = len(src_poses)
310
- assert src_poses.shape == target_poses.shape == (N, 4, 4)
311
-
312
- def center_and_z(poses):
313
- eps = get_med_dist_between_poses(poses) / 100
314
- return torch.cat((poses[:, :3, 3], poses[:, :3, 3] + eps*poses[:, :3, 2]))
315
- R, T, s = roma.rigid_points_registration(center_and_z(src_poses), center_and_z(target_poses), compute_scaling=True)
316
- return s, R, T
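When `fast_pnp` is called without a focal estimate, it sweeps a log-spaced grid of candidates between S/2 and 3S (S = max image side) and keeps whichever focal yields the most PnP-RANSAC inliers. A quick look at that grid for a 512-pixel-wide image (my illustration; values are approximate):

```python
import numpy as np

S = 512                                      # max(W, H) of the image
tentative_focals = np.geomspace(S / 2, S * 3, 21)
print(tentative_focals.round(1))
# 21 log-spaced candidates from 256 up to 1536,
# each about 9.4% larger than the previous one (ratio 6 ** (1/20)).
```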
mini_dust3r/cloud_opt/modular_optimizer.py DELETED
@@ -1,145 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # Slower implementation of the global alignment that allows freezing partial poses/intrinsics
6
- # --------------------------------------------------------
7
- import numpy as np
8
- import torch
9
- import torch.nn as nn
10
-
11
- from mini_dust3r.cloud_opt.base_opt import BasePCOptimizer
12
- from mini_dust3r.utils.geometry import geotrf
13
- from mini_dust3r.utils.device import to_cpu, to_numpy
14
- from mini_dust3r.utils.geometry import depthmap_to_pts3d
15
-
16
-
17
- class ModularPointCloudOptimizer (BasePCOptimizer):
18
- """ Optimize a global scene, given a list of pairwise observations.
19
- Unlike PointCloudOptimizer, you can fix parts of the optimization process (partial poses/intrinsics)
20
- Graph node: images
21
- Graph edges: observations = (pred1, pred2)
22
- """
23
-
24
- def __init__(self, *args, optimize_pp=False, fx_and_fy=False, focal_brake=20, **kwargs):
25
- super().__init__(*args, **kwargs)
26
- self.has_im_poses = True # by definition of this class
27
- self.focal_brake = focal_brake
28
-
29
- # adding thing to optimize
30
- self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes) # log(depth)
31
- self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)) # camera poses
32
- default_focals = [self.focal_brake * np.log(max(H, W)) for H, W in self.imshapes]
33
- self.im_focals = nn.ParameterList(torch.FloatTensor([f, f] if fx_and_fy else [
34
- f]) for f in default_focals) # camera intrinsics
35
- self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs)) # camera intrinsics
36
- self.im_pp.requires_grad_(optimize_pp)
37
-
38
- def preset_pose(self, known_poses, pose_msk=None): # cam-to-world
39
- if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2:
40
- known_poses = [known_poses]
41
- for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses):
42
- if self.verbose:
43
- print(f' (setting pose #{idx} = {pose[:3,3]})')
44
- self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose), force=True))
45
-
46
- # normalize scale if there's less than 1 known pose
47
- n_known_poses = sum((p.requires_grad is False) for p in self.im_poses)
48
- self.norm_pw_scale = (n_known_poses <= 1)
49
-
50
- def preset_intrinsics(self, known_intrinsics, msk=None):
51
- if isinstance(known_intrinsics, torch.Tensor) and known_intrinsics.ndim == 2:
52
- known_intrinsics = [known_intrinsics]
53
- for K in known_intrinsics:
54
- assert K.shape == (3, 3)
55
- self.preset_focal([K.diagonal()[:2].mean() for K in known_intrinsics], msk)
56
- self.preset_principal_point([K[:2, 2] for K in known_intrinsics], msk)
57
-
58
- def preset_focal(self, known_focals, msk=None):
59
- for idx, focal in zip(self._get_msk_indices(msk), known_focals):
60
- if self.verbose:
61
- print(f' (setting focal #{idx} = {focal})')
62
- self._no_grad(self._set_focal(idx, focal, force=True))
63
-
64
- def preset_principal_point(self, known_pp, msk=None):
65
- for idx, pp in zip(self._get_msk_indices(msk), known_pp):
66
- if self.verbose:
67
- print(f' (setting principal point #{idx} = {pp})')
68
- self._no_grad(self._set_principal_point(idx, pp, force=True))
69
-
70
- def _no_grad(self, tensor):
71
- return tensor.requires_grad_(False)
72
-
73
- def _get_msk_indices(self, msk):
74
- if msk is None:
75
- return range(self.n_imgs)
76
- elif isinstance(msk, int):
77
- return [msk]
78
- elif isinstance(msk, (tuple, list)):
79
- return self._get_msk_indices(np.array(msk))
80
- elif msk.dtype in (bool, torch.bool, np.bool_):
81
- assert len(msk) == self.n_imgs
82
- return np.where(msk)[0]
83
- elif np.issubdtype(msk.dtype, np.integer):
84
- return msk
85
- else:
86
- raise ValueError(f'bad {msk=}')
87
-
88
- def _set_focal(self, idx, focal, force=False):
89
- param = self.im_focals[idx]
90
- if param.requires_grad or force: # can only init a parameter not already initialized
91
- param.data[:] = self.focal_brake * np.log(focal)
92
- return param
93
-
94
- def get_focals(self):
95
- log_focals = torch.stack(list(self.im_focals), dim=0)
96
- return (log_focals / self.focal_brake).exp()
97
-
98
- def _set_principal_point(self, idx, pp, force=False):
99
- param = self.im_pp[idx]
100
- H, W = self.imshapes[idx]
101
- if param.requires_grad or force: # can only init a parameter not already initialized
102
- param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10
103
- return param
104
-
105
- def get_principal_points(self):
106
- return torch.stack([pp.new((W/2, H/2))+10*pp for pp, (H, W) in zip(self.im_pp, self.imshapes)])
107
-
108
- def get_intrinsics(self):
109
- K = torch.zeros((self.n_imgs, 3, 3), device=self.device)
110
- focals = self.get_focals().view(self.n_imgs, -1)
111
- K[:, 0, 0] = focals[:, 0]
112
- K[:, 1, 1] = focals[:, -1]
113
- K[:, :2, 2] = self.get_principal_points()
114
- K[:, 2, 2] = 1
115
- return K
116
-
117
- def get_im_poses(self): # cam to world
118
- cam2world = self._get_poses(torch.stack(list(self.im_poses)))
119
- return cam2world
120
-
121
- def _set_depthmap(self, idx, depth, force=False):
122
- param = self.im_depthmaps[idx]
123
- if param.requires_grad or force: # can only init a parameter not already initialized
124
- param.data[:] = depth.log().nan_to_num(neginf=0)
125
- return param
126
-
127
- def get_depthmaps(self):
128
- return [d.exp() for d in self.im_depthmaps]
129
-
130
- def depth_to_pts3d(self):
131
- # Get depths and projection params if not provided
132
- focals = self.get_focals()
133
- pp = self.get_principal_points()
134
- im_poses = self.get_im_poses()
135
- depth = self.get_depthmaps()
136
-
137
- # convert focal to (1,2,H,W) constant field
138
- def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *self.imshapes[i])
139
- # get pointmaps in camera frame
140
- rel_ptmaps = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[i:i+1])[0] for i in range(im_poses.shape[0])]
141
- # project to world frame
142
- return [geotrf(pose, ptmap) for pose, ptmap in zip(im_poses, rel_ptmaps)]
143
-
144
- def get_pts3d(self):
145
- return self.depth_to_pts3d()
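What `ModularPointCloudOptimizer` adds over the stacked `PointCloudOptimizer` below is per-image freezing of poses, focals, and principal points before optimization. A hedged sketch of that workflow; `pairs_output` stands for the pairwise prediction dict returned by `mini_dust3r.inference.inference(...)` as in the api module above, and the pose/intrinsics values are placeholders:

```python
import torch
from mini_dust3r.cloud_opt import global_aligner, GlobalAlignerMode

scene = global_aligner(dust3r_output=pairs_output, device="cuda",
                       mode=GlobalAlignerMode.ModularPointCloudOptimizer)

known_pose = torch.eye(4)                        # cam-to-world, placeholder
known_K = torch.tensor([[500.0,   0.0, 112.0],
                        [  0.0, 500.0, 112.0],
                        [  0.0,   0.0,   1.0]])
scene.preset_pose([known_pose], pose_msk=[0])    # constrain image #0 only
scene.preset_intrinsics([known_K], msk=[0])
scene.compute_global_alignment(init="mst", niter=300, schedule="cosine", lr=0.01)
```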
mini_dust3r/cloud_opt/optimizer.py DELETED
@@ -1,248 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # Main class for the implementation of the global alignment
6
- # --------------------------------------------------------
7
- import numpy as np
8
- import torch
9
- import torch.nn as nn
10
-
11
- from mini_dust3r.cloud_opt.base_opt import BasePCOptimizer
12
- from mini_dust3r.utils.geometry import xy_grid, geotrf
13
- from mini_dust3r.utils.device import to_cpu, to_numpy
14
-
15
-
16
- class PointCloudOptimizer(BasePCOptimizer):
17
- """ Optimize a global scene, given a list of pairwise observations.
18
- Graph node: images
19
- Graph edges: observations = (pred1, pred2)
20
- """
21
-
22
- def __init__(self, *args, optimize_pp=False, focal_break=20, **kwargs):
23
- super().__init__(*args, **kwargs)
24
-
25
- self.has_im_poses = True # by definition of this class
26
- self.focal_break = focal_break
27
-
28
- # adding thing to optimize
29
- self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes) # log(depth)
30
- self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)) # camera poses
31
- self.im_focals = nn.ParameterList(torch.FloatTensor(
32
- [self.focal_break*np.log(max(H, W))]) for H, W in self.imshapes) # camera intrinsics
33
- self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs)) # camera intrinsics
34
- self.im_pp.requires_grad_(optimize_pp)
35
-
36
- self.imshape = self.imshapes[0]
37
- im_areas = [h*w for h, w in self.imshapes]
38
- self.max_area = max(im_areas)
39
-
40
- # adding thing to optimize
41
- self.im_depthmaps = ParameterStack(self.im_depthmaps, is_param=True, fill=self.max_area)
42
- self.im_poses = ParameterStack(self.im_poses, is_param=True)
43
- self.im_focals = ParameterStack(self.im_focals, is_param=True)
44
- self.im_pp = ParameterStack(self.im_pp, is_param=True)
45
- self.register_buffer('_pp', torch.tensor([(w/2, h/2) for h, w in self.imshapes]))
46
- self.register_buffer('_grid', ParameterStack(
47
- [xy_grid(W, H, device=self.device) for H, W in self.imshapes], fill=self.max_area))
48
-
49
- # pre-compute pixel weights
50
- self.register_buffer('_weight_i', ParameterStack(
51
- [self.conf_trf(self.conf_i[i_j]) for i_j in self.str_edges], fill=self.max_area))
52
- self.register_buffer('_weight_j', ParameterStack(
53
- [self.conf_trf(self.conf_j[i_j]) for i_j in self.str_edges], fill=self.max_area))
54
-
55
- # precompute aa
56
- self.register_buffer('_stacked_pred_i', ParameterStack(self.pred_i, self.str_edges, fill=self.max_area))
57
- self.register_buffer('_stacked_pred_j', ParameterStack(self.pred_j, self.str_edges, fill=self.max_area))
58
- self.register_buffer('_ei', torch.tensor([i for i, j in self.edges]))
59
- self.register_buffer('_ej', torch.tensor([j for i, j in self.edges]))
60
- self.total_area_i = sum([im_areas[i] for i, j in self.edges])
61
- self.total_area_j = sum([im_areas[j] for i, j in self.edges])
62
-
63
- def _check_all_imgs_are_selected(self, msk):
64
- assert np.all(self._get_msk_indices(msk) == np.arange(self.n_imgs)), 'incomplete mask!'
65
-
66
- def preset_pose(self, known_poses, pose_msk=None): # cam-to-world
67
- self._check_all_imgs_are_selected(pose_msk)
68
-
69
- if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2:
70
- known_poses = [known_poses]
71
- for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses):
72
- if self.verbose:
73
- print(f' (setting pose #{idx} = {pose[:3,3]})')
74
- self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose)))
75
-
76
- # normalize scale if there's less than 1 known pose
77
- n_known_poses = sum((p.requires_grad is False) for p in self.im_poses)
78
- self.norm_pw_scale = (n_known_poses <= 1)
79
-
80
- self.im_poses.requires_grad_(False)
81
- self.norm_pw_scale = False
82
-
83
- def preset_focal(self, known_focals, msk=None):
84
- self._check_all_imgs_are_selected(msk)
85
-
86
- for idx, focal in zip(self._get_msk_indices(msk), known_focals):
87
- if self.verbose:
88
- print(f' (setting focal #{idx} = {focal})')
89
- self._no_grad(self._set_focal(idx, focal))
90
-
91
- self.im_focals.requires_grad_(False)
92
-
93
- def preset_principal_point(self, known_pp, msk=None):
94
- self._check_all_imgs_are_selected(msk)
95
-
96
- for idx, pp in zip(self._get_msk_indices(msk), known_pp):
97
- if self.verbose:
98
- print(f' (setting principal point #{idx} = {pp})')
99
- self._no_grad(self._set_principal_point(idx, pp))
100
-
101
- self.im_pp.requires_grad_(False)
102
-
103
- def _get_msk_indices(self, msk):
104
- if msk is None:
105
- return range(self.n_imgs)
106
- elif isinstance(msk, int):
107
- return [msk]
108
- elif isinstance(msk, (tuple, list)):
109
- return self._get_msk_indices(np.array(msk))
110
- elif msk.dtype in (bool, torch.bool, np.bool_):
111
- assert len(msk) == self.n_imgs
112
- return np.where(msk)[0]
113
- elif np.issubdtype(msk.dtype, np.integer):
114
- return msk
115
- else:
116
- raise ValueError(f'bad {msk=}')
117
-
118
- def _no_grad(self, tensor):
119
- assert tensor.requires_grad, 'it must be True at this point, otherwise no modification occurs'
120
-
121
- def _set_focal(self, idx, focal, force=False):
122
- param = self.im_focals[idx]
123
- if param.requires_grad or force: # can only init a parameter not already initialized
124
- param.data[:] = self.focal_break * np.log(focal)
125
- return param
126
-
127
- def get_focals(self):
128
- log_focals = torch.stack(list(self.im_focals), dim=0)
129
- return (log_focals / self.focal_break).exp()
130
-
131
- def get_known_focal_mask(self):
132
- return torch.tensor([not (p.requires_grad) for p in self.im_focals])
133
-
134
- def _set_principal_point(self, idx, pp, force=False):
135
- param = self.im_pp[idx]
136
- H, W = self.imshapes[idx]
137
- if param.requires_grad or force: # can only init a parameter not already initialized
138
- param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10
139
- return param
140
-
141
- def get_principal_points(self):
142
- return self._pp + 10 * self.im_pp
143
-
144
- def get_intrinsics(self):
145
- K = torch.zeros((self.n_imgs, 3, 3), device=self.device)
146
- focals = self.get_focals().flatten()
147
- K[:, 0, 0] = K[:, 1, 1] = focals
148
- K[:, :2, 2] = self.get_principal_points()
149
- K[:, 2, 2] = 1
150
- return K
151
-
152
- def get_im_poses(self): # cam to world
153
- cam2world = self._get_poses(self.im_poses)
154
- return cam2world
155
-
156
- def _set_depthmap(self, idx, depth, force=False):
157
- depth = _ravel_hw(depth, self.max_area)
158
-
159
- param = self.im_depthmaps[idx]
160
- if param.requires_grad or force: # can only init a parameter not already initialized
161
- param.data[:] = depth.log().nan_to_num(neginf=0)
162
- return param
163
-
164
- def get_depthmaps(self, raw=False):
165
- res = self.im_depthmaps.exp()
166
- if not raw:
167
- res = [dm[:h*w].view(h, w) for dm, (h, w) in zip(res, self.imshapes)]
168
- return res
169
-
170
- def depth_to_pts3d(self):
171
- # Get depths and projection params if not provided
172
- focals = self.get_focals()
173
- pp = self.get_principal_points()
174
- im_poses = self.get_im_poses()
175
- depth = self.get_depthmaps(raw=True)
176
-
177
- # get pointmaps in camera frame
178
- rel_ptmaps = _fast_depthmap_to_pts3d(depth, self._grid, focals, pp=pp)
179
- # project to world frame
180
- return geotrf(im_poses, rel_ptmaps)
181
-
182
- def get_pts3d(self, raw=False):
183
- res = self.depth_to_pts3d()
184
- if not raw:
185
- res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
186
- return res
187
-
188
- def forward(self):
189
- pw_poses = self.get_pw_poses() # cam-to-world
190
- pw_adapt = self.get_adaptors().unsqueeze(1)
191
- proj_pts3d = self.get_pts3d(raw=True)
192
-
193
- # rotate pairwise prediction according to pw_poses
194
- aligned_pred_i = geotrf(pw_poses, pw_adapt * self._stacked_pred_i)
195
- aligned_pred_j = geotrf(pw_poses, pw_adapt * self._stacked_pred_j)
196
-
197
-         # compute the loss
198
- li = self.dist(proj_pts3d[self._ei], aligned_pred_i, weight=self._weight_i).sum() / self.total_area_i
199
- lj = self.dist(proj_pts3d[self._ej], aligned_pred_j, weight=self._weight_j).sum() / self.total_area_j
200
-
201
- return li + lj
202
-
203
-
204
- def _fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp):
205
- pp = pp.unsqueeze(1)
206
- focal = focal.unsqueeze(1)
207
- assert focal.shape == (len(depth), 1, 1)
208
- assert pp.shape == (len(depth), 1, 2)
209
- assert pixel_grid.shape == depth.shape + (2,)
210
- depth = depth.unsqueeze(-1)
211
- return torch.cat((depth * (pixel_grid - pp) / focal, depth), dim=-1)
212
-
213
-
214
- def ParameterStack(params, keys=None, is_param=None, fill=0):
215
- if keys is not None:
216
- params = [params[k] for k in keys]
217
-
218
- if fill > 0:
219
- params = [_ravel_hw(p, fill) for p in params]
220
-
221
- requires_grad = params[0].requires_grad
222
- assert all(p.requires_grad == requires_grad for p in params)
223
-
224
- params = torch.stack(list(params)).float().detach()
225
- if is_param or requires_grad:
226
- params = nn.Parameter(params)
227
- params.requires_grad_(requires_grad)
228
- return params
229
-
230
-
231
- def _ravel_hw(tensor, fill=0):
232
- # ravel H,W
233
- tensor = tensor.view((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
234
-
235
- if len(tensor) < fill:
236
- tensor = torch.cat((tensor, tensor.new_zeros((fill - len(tensor),)+tensor.shape[1:])))
237
- return tensor
238
-
239
-
240
- def acceptable_focal_range(H, W, minf=0.5, maxf=3.5):
241
- focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) # size / 1.1547005383792515
242
- return minf*focal_base, maxf*focal_base
243
-
244
-
245
- def apply_mask(img, msk):
246
- img = img.copy()
247
- img[msk] = 0
248
- return img
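For context on _fast_depthmap_to_pts3d above: a pixel (u, v) with depth d, principal point (cx, cy) and focal f unprojects to (d*(u-cx)/f, d*(v-cy)/f, d). A small stand-alone check of that formula without the batching and padding logic (shapes and values here are made up):

import torch

H, W = 3, 4
focal = torch.tensor(100.0)
pp = torch.tensor([W / 2, H / 2])                      # principal point (cx, cy)
u, v = torch.meshgrid(torch.arange(W, dtype=torch.float32),
                      torch.arange(H, dtype=torch.float32), indexing="xy")
grid = torch.stack([u, v], dim=-1)                     # (H, W, 2) pixel coordinates
depth = torch.full((H, W), 2.0)

# same expression as the deleted helper, per pixel: (d*(u-cx)/f, d*(v-cy)/f, d)
pts3d = torch.cat([depth[..., None] * (grid - pp) / focal, depth[..., None]], dim=-1)
print(pts3d.shape)   # torch.Size([3, 4, 3])
print(pts3d[0, 0])   # tensor([-0.0400, -0.0300, 2.0000]) for pixel (0, 0)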
 
mini_dust3r/cloud_opt/pair_viewer.py DELETED
@@ -1,127 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # Dummy optimizer for visualizing pairs
6
- # --------------------------------------------------------
7
- import numpy as np
8
- import torch
9
- import torch.nn as nn
10
- import cv2
11
-
12
- from mini_dust3r.cloud_opt.base_opt import BasePCOptimizer
13
- from mini_dust3r.utils.geometry import inv, geotrf, depthmap_to_absolute_camera_coordinates
14
- from mini_dust3r.cloud_opt.commons import edge_str
15
- from mini_dust3r.post_process import estimate_focal_knowing_depth
16
-
17
-
18
- class PairViewer (BasePCOptimizer):
19
- """
20
-     This is a dummy optimizer.
21
- To use only when the goal is to visualize the results for a pair of images (with is_symmetrized)
22
- """
23
-
24
- def __init__(self, *args, **kwargs):
25
- super().__init__(*args, **kwargs)
26
- assert self.is_symmetrized and self.n_edges == 2
27
- self.has_im_poses = True
28
-
29
- # compute all parameters directly from raw input
30
- self.focals = []
31
- self.pp = []
32
- rel_poses = []
33
- confs = []
34
- for i in range(self.n_imgs):
35
- conf = float(self.conf_i[edge_str(i, 1-i)].mean() * self.conf_j[edge_str(i, 1-i)].mean())
36
- if self.verbose:
37
- print(f' - {conf=:.3} for edge {i}-{1-i}')
38
- confs.append(conf)
39
-
40
- H, W = self.imshapes[i]
41
- pts3d = self.pred_i[edge_str(i, 1-i)]
42
- pp = torch.tensor((W/2, H/2))
43
- focal = float(estimate_focal_knowing_depth(pts3d[None], pp, focal_mode='weiszfeld'))
44
- self.focals.append(focal)
45
- self.pp.append(pp)
46
-
47
- # estimate the pose of pts1 in image 2
48
- pixels = np.mgrid[:W, :H].T.astype(np.float32)
49
- pts3d = self.pred_j[edge_str(1-i, i)].numpy()
50
- assert pts3d.shape[:2] == (H, W)
51
- msk = self.get_masks()[i].numpy()
52
- K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])
53
-
54
- try:
55
- res = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None,
56
- iterationsCount=100, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)
57
- success, R, T, inliers = res
58
- assert success
59
-
60
- R = cv2.Rodrigues(R)[0] # world to cam
61
- pose = inv(np.r_[np.c_[R, T], [(0, 0, 0, 1)]]) # cam to world
62
- except:
63
- pose = np.eye(4)
64
- rel_poses.append(torch.from_numpy(pose.astype(np.float32)))
65
-
66
- # let's use the pair with the most confidence
67
- if confs[0] > confs[1]:
68
- # ptcloud is expressed in camera1
69
- self.im_poses = [torch.eye(4), rel_poses[1]] # I, cam2-to-cam1
70
- self.depth = [self.pred_i['0_1'][..., 2], geotrf(inv(rel_poses[1]), self.pred_j['0_1'])[..., 2]]
71
- else:
72
- # ptcloud is expressed in camera2
73
- self.im_poses = [rel_poses[0], torch.eye(4)] # I, cam1-to-cam2
74
- self.depth = [geotrf(inv(rel_poses[0]), self.pred_j['1_0'])[..., 2], self.pred_i['1_0'][..., 2]]
75
-
76
- self.im_poses = nn.Parameter(torch.stack(self.im_poses, dim=0), requires_grad=False)
77
- self.focals = nn.Parameter(torch.tensor(self.focals), requires_grad=False)
78
- self.pp = nn.Parameter(torch.stack(self.pp, dim=0), requires_grad=False)
79
- self.depth = nn.ParameterList(self.depth)
80
- for p in self.parameters():
81
- p.requires_grad = False
82
-
83
- def _set_depthmap(self, idx, depth, force=False):
84
- if self.verbose:
85
- print('_set_depthmap is ignored in PairViewer')
86
- return
87
-
88
- def get_depthmaps(self, raw=False):
89
- depth = [d.to(self.device) for d in self.depth]
90
- return depth
91
-
92
- def _set_focal(self, idx, focal, force=False):
93
- self.focals[idx] = focal
94
-
95
- def get_focals(self):
96
- return self.focals
97
-
98
- def get_known_focal_mask(self):
99
- return torch.tensor([not (p.requires_grad) for p in self.focals])
100
-
101
- def get_principal_points(self):
102
- return self.pp
103
-
104
- def get_intrinsics(self):
105
- focals = self.get_focals()
106
- pps = self.get_principal_points()
107
- K = torch.zeros((len(focals), 3, 3), device=self.device)
108
- for i in range(len(focals)):
109
- K[i, 0, 0] = K[i, 1, 1] = focals[i]
110
- K[i, :2, 2] = pps[i]
111
- K[i, 2, 2] = 1
112
- return K
113
-
114
- def get_im_poses(self):
115
- return self.im_poses
116
-
117
- def depth_to_pts3d(self):
118
- pts3d = []
119
- for d, intrinsics, im_pose in zip(self.depth, self.get_intrinsics(), self.get_im_poses()):
120
- pts, _ = depthmap_to_absolute_camera_coordinates(d.cpu().numpy(),
121
- intrinsics.cpu().numpy(),
122
- im_pose.cpu().numpy())
123
- pts3d.append(torch.from_numpy(pts).to(device=self.device))
124
- return pts3d
125
-
126
- def forward(self):
127
- return float('nan')
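The pose recovery above hinges on cv2.solvePnPRansac: given 3D points expressed in one camera and their pixel locations in the other image, it returns a world-to-camera rotation and translation, which PairViewer then inverts into a cam-to-world pose. A synthetic sanity check of that step (assumes an OpenCV build that provides SOLVEPNP_SQPNP, as the deleted code does; all values are made up):

import cv2
import numpy as np

# Synthetic scene: random 3D points in front of a camera at the origin.
rng = np.random.default_rng(0)
pts3d = rng.uniform([-1, -1, 4], [1, 1, 6], size=(50, 3)).astype(np.float32)
K = np.array([[500, 0, 320], [0, 500, 240], [0, 0, 1]], dtype=np.float32)

# Project with the identity pose: pixel = (K @ X) / z.
proj = (K @ pts3d.T).T
pixels = (proj[:, :2] / proj[:, 2:3]).astype(np.float32)

ok, rvec, tvec, inliers = cv2.solvePnPRansac(
    pts3d, pixels, K, None,
    iterationsCount=100, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP)

R = cv2.Rodrigues(rvec)[0]                          # world-to-cam rotation
world_to_cam = np.r_[np.c_[R, tvec], [(0, 0, 0, 1)]]
cam_to_world = np.linalg.inv(world_to_cam)          # the pose PairViewer would keep
print(ok, np.allclose(cam_to_world, np.eye(4), atol=1e-3))   # True True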
 
mini_dust3r/croco/blocks.py DELETED
@@ -1,241 +0,0 @@
1
- # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
-
4
-
5
- # --------------------------------------------------------
6
- # Main encoder/decoder blocks
7
- # --------------------------------------------------------
8
- # References:
9
- # timm
10
- # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
11
- # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/helpers.py
12
- # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
13
- # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/mlp.py
14
- # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/patch_embed.py
15
-
16
-
17
- import torch
18
- import torch.nn as nn
19
-
20
- from itertools import repeat
21
- import collections.abc
22
-
23
-
24
- def _ntuple(n):
25
- def parse(x):
26
- if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
27
- return x
28
- return tuple(repeat(x, n))
29
- return parse
30
- to_2tuple = _ntuple(2)
31
-
32
- def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True):
33
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
34
- """
35
- if drop_prob == 0. or not training:
36
- return x
37
- keep_prob = 1 - drop_prob
38
- shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
39
- random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
40
- if keep_prob > 0.0 and scale_by_keep:
41
- random_tensor.div_(keep_prob)
42
- return x * random_tensor
43
-
44
- class DropPath(nn.Module):
45
- """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
46
- """
47
- def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True):
48
- super(DropPath, self).__init__()
49
- self.drop_prob = drop_prob
50
- self.scale_by_keep = scale_by_keep
51
-
52
- def forward(self, x):
53
- return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
54
-
55
- def extra_repr(self):
56
- return f'drop_prob={round(self.drop_prob,3):0.3f}'
57
-
58
- class Mlp(nn.Module):
59
- """ MLP as used in Vision Transformer, MLP-Mixer and related networks"""
60
- def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
61
- super().__init__()
62
- out_features = out_features or in_features
63
- hidden_features = hidden_features or in_features
64
- bias = to_2tuple(bias)
65
- drop_probs = to_2tuple(drop)
66
-
67
- self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
68
- self.act = act_layer()
69
- self.drop1 = nn.Dropout(drop_probs[0])
70
- self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
71
- self.drop2 = nn.Dropout(drop_probs[1])
72
-
73
- def forward(self, x):
74
- x = self.fc1(x)
75
- x = self.act(x)
76
- x = self.drop1(x)
77
- x = self.fc2(x)
78
- x = self.drop2(x)
79
- return x
80
-
81
- class Attention(nn.Module):
82
-
83
- def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
84
- super().__init__()
85
- self.num_heads = num_heads
86
- head_dim = dim // num_heads
87
- self.scale = head_dim ** -0.5
88
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
89
- self.attn_drop = nn.Dropout(attn_drop)
90
- self.proj = nn.Linear(dim, dim)
91
- self.proj_drop = nn.Dropout(proj_drop)
92
- self.rope = rope
93
-
94
- def forward(self, x, xpos):
95
- B, N, C = x.shape
96
-
97
- qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1,3)
98
- q, k, v = [qkv[:,:,i] for i in range(3)]
99
- # q,k,v = qkv.unbind(2) # make torchscript happy (cannot use tensor as tuple)
100
-
101
- if self.rope is not None:
102
- q = self.rope(q, xpos)
103
- k = self.rope(k, xpos)
104
-
105
- attn = (q @ k.transpose(-2, -1)) * self.scale
106
- attn = attn.softmax(dim=-1)
107
- attn = self.attn_drop(attn)
108
-
109
- x = (attn @ v).transpose(1, 2).reshape(B, N, C)
110
- x = self.proj(x)
111
- x = self.proj_drop(x)
112
- return x
113
-
114
- class Block(nn.Module):
115
-
116
- def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
117
- drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None):
118
- super().__init__()
119
- self.norm1 = norm_layer(dim)
120
- self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
121
- # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
122
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
123
- self.norm2 = norm_layer(dim)
124
- mlp_hidden_dim = int(dim * mlp_ratio)
125
- self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
126
-
127
- def forward(self, x, xpos):
128
- x = x + self.drop_path(self.attn(self.norm1(x), xpos))
129
- x = x + self.drop_path(self.mlp(self.norm2(x)))
130
- return x
131
-
132
- class CrossAttention(nn.Module):
133
-
134
- def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
135
- super().__init__()
136
- self.num_heads = num_heads
137
- head_dim = dim // num_heads
138
- self.scale = head_dim ** -0.5
139
-
140
- self.projq = nn.Linear(dim, dim, bias=qkv_bias)
141
- self.projk = nn.Linear(dim, dim, bias=qkv_bias)
142
- self.projv = nn.Linear(dim, dim, bias=qkv_bias)
143
- self.attn_drop = nn.Dropout(attn_drop)
144
- self.proj = nn.Linear(dim, dim)
145
- self.proj_drop = nn.Dropout(proj_drop)
146
-
147
- self.rope = rope
148
-
149
- def forward(self, query, key, value, qpos, kpos):
150
- B, Nq, C = query.shape
151
- Nk = key.shape[1]
152
- Nv = value.shape[1]
153
-
154
- q = self.projq(query).reshape(B,Nq,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
155
- k = self.projk(key).reshape(B,Nk,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
156
- v = self.projv(value).reshape(B,Nv,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3)
157
-
158
- if self.rope is not None:
159
- q = self.rope(q, qpos)
160
- k = self.rope(k, kpos)
161
-
162
- attn = (q @ k.transpose(-2, -1)) * self.scale
163
- attn = attn.softmax(dim=-1)
164
- attn = self.attn_drop(attn)
165
-
166
- x = (attn @ v).transpose(1, 2).reshape(B, Nq, C)
167
- x = self.proj(x)
168
- x = self.proj_drop(x)
169
- return x
170
-
171
- class DecoderBlock(nn.Module):
172
-
173
- def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
174
- drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None):
175
- super().__init__()
176
- self.norm1 = norm_layer(dim)
177
- self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
178
- self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
179
- self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
180
- self.norm2 = norm_layer(dim)
181
- self.norm3 = norm_layer(dim)
182
- mlp_hidden_dim = int(dim * mlp_ratio)
183
- self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
184
- self.norm_y = norm_layer(dim) if norm_mem else nn.Identity()
185
-
186
- def forward(self, x, y, xpos, ypos):
187
- x = x + self.drop_path(self.attn(self.norm1(x), xpos))
188
- y_ = self.norm_y(y)
189
- x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos))
190
- x = x + self.drop_path(self.mlp(self.norm3(x)))
191
- return x, y
192
-
193
-
194
- # patch embedding
195
- class PositionGetter(object):
196
- """ return positions of patches """
197
-
198
- def __init__(self):
199
- self.cache_positions = {}
200
-
201
- def __call__(self, b, h, w, device):
202
- if not (h,w) in self.cache_positions:
203
- x = torch.arange(w, device=device)
204
- y = torch.arange(h, device=device)
205
- self.cache_positions[h,w] = torch.cartesian_prod(y, x) # (h, w, 2)
206
- pos = self.cache_positions[h,w].view(1, h*w, 2).expand(b, -1, 2).clone()
207
- return pos
208
-
209
- class PatchEmbed(nn.Module):
210
- """ just adding _init_weights + position getter compared to timm.models.layers.patch_embed.PatchEmbed"""
211
-
212
- def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
213
- super().__init__()
214
- img_size = to_2tuple(img_size)
215
- patch_size = to_2tuple(patch_size)
216
- self.img_size = img_size
217
- self.patch_size = patch_size
218
- self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
219
- self.num_patches = self.grid_size[0] * self.grid_size[1]
220
- self.flatten = flatten
221
-
222
- self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
223
- self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
224
-
225
- self.position_getter = PositionGetter()
226
-
227
- def forward(self, x):
228
- B, C, H, W = x.shape
229
- torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
230
- torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
231
- x = self.proj(x)
232
- pos = self.position_getter(B, x.size(2), x.size(3), x.device)
233
- if self.flatten:
234
- x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
235
- x = self.norm(x)
236
- return x, pos
237
-
238
- def _init_weights(self):
239
- w = self.proj.weight.data
240
- torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
241
-
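A note on drop_path above: it zeroes whole samples with probability drop_prob and rescales the survivors by 1/keep_prob, so the expected activation is unchanged. A quick numerical check of that property, re-implementing just those lines stand-alone (shapes are arbitrary):

import torch

def drop_path(x, drop_prob=0.0, training=False, scale_by_keep=True):
    # same per-sample stochastic-depth logic as the deleted blocks.py helper
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor.div_(keep_prob)
    return x * random_tensor

torch.manual_seed(0)
x = torch.ones(10000, 8)
y = drop_path(x, drop_prob=0.3, training=True)
print(round(y.mean().item(), 2))                       # ~1.0: expectation preserved
print(round((y[:, 0] == 0).float().mean().item(), 2))  # ~0.3: dropped fraction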
 
mini_dust3r/croco/croco.py DELETED
@@ -1,249 +0,0 @@
1
- # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
-
4
-
5
- # --------------------------------------------------------
6
- # CroCo model during pretraining
7
- # --------------------------------------------------------
8
-
9
-
10
-
11
- import torch
12
- import torch.nn as nn
13
- torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
14
- from functools import partial
15
-
16
- from mini_dust3r.croco.blocks import Block, DecoderBlock, PatchEmbed
17
- from mini_dust3r.croco.pos_embed import get_2d_sincos_pos_embed, RoPE2D
18
- from mini_dust3r.croco.masking import RandomMask
19
-
20
-
21
- class CroCoNet(nn.Module):
22
-
23
- def __init__(self,
24
- img_size=224, # input image size
25
- patch_size=16, # patch_size
26
- mask_ratio=0.9, # ratios of masked tokens
27
- enc_embed_dim=768, # encoder feature dimension
28
- enc_depth=12, # encoder depth
29
- enc_num_heads=12, # encoder number of heads in the transformer block
30
- dec_embed_dim=512, # decoder feature dimension
31
- dec_depth=8, # decoder depth
32
- dec_num_heads=16, # decoder number of heads in the transformer block
33
- mlp_ratio=4,
34
- norm_layer=partial(nn.LayerNorm, eps=1e-6),
35
- norm_im2_in_dec=True, # whether to apply normalization of the 'memory' = (second image) in the decoder
36
- pos_embed='cosine', # positional embedding (either cosine or RoPE100)
37
- ):
38
-
39
- super(CroCoNet, self).__init__()
40
-
41
- # patch embeddings (with initialization done as in MAE)
42
- self._set_patch_embed(img_size, patch_size, enc_embed_dim)
43
-
44
- # mask generations
45
- self._set_mask_generator(self.patch_embed.num_patches, mask_ratio)
46
-
47
- self.pos_embed = pos_embed
48
- if pos_embed=='cosine':
49
- # positional embedding of the encoder
50
- enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
51
- self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float())
52
- # positional embedding of the decoder
53
- dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, int(self.patch_embed.num_patches**.5), n_cls_token=0)
54
- self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float())
55
- # pos embedding in each block
56
- self.rope = None # nothing for cosine
57
- elif pos_embed.startswith('RoPE'): # eg RoPE100
58
- self.enc_pos_embed = None # nothing to add in the encoder with RoPE
59
- self.dec_pos_embed = None # nothing to add in the decoder with RoPE
60
- if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
61
- freq = float(pos_embed[len('RoPE'):])
62
- self.rope = RoPE2D(freq=freq)
63
- else:
64
- raise NotImplementedError('Unknown pos_embed '+pos_embed)
65
-
66
- # transformer for the encoder
67
- self.enc_depth = enc_depth
68
- self.enc_embed_dim = enc_embed_dim
69
- self.enc_blocks = nn.ModuleList([
70
- Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope)
71
- for i in range(enc_depth)])
72
- self.enc_norm = norm_layer(enc_embed_dim)
73
-
74
- # masked tokens
75
- self._set_mask_token(dec_embed_dim)
76
-
77
- # decoder
78
- self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec)
79
-
80
- # prediction head
81
- self._set_prediction_head(dec_embed_dim, patch_size)
82
-
83
-         # initialize weights
84
- self.initialize_weights()
85
-
86
- def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
87
- self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim)
88
-
89
- def _set_mask_generator(self, num_patches, mask_ratio):
90
- self.mask_generator = RandomMask(num_patches, mask_ratio)
91
-
92
- def _set_mask_token(self, dec_embed_dim):
93
- self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim))
94
-
95
- def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec):
96
- self.dec_depth = dec_depth
97
- self.dec_embed_dim = dec_embed_dim
98
- # transfer from encoder to decoder
99
- self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)
100
- # transformer for the decoder
101
- self.dec_blocks = nn.ModuleList([
102
- DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope)
103
- for i in range(dec_depth)])
104
- # final norm layer
105
- self.dec_norm = norm_layer(dec_embed_dim)
106
-
107
- def _set_prediction_head(self, dec_embed_dim, patch_size):
108
- self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True)
109
-
110
-
111
- def initialize_weights(self):
112
- # patch embed
113
- self.patch_embed._init_weights()
114
- # mask tokens
115
- if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02)
116
- # linears and layer norms
117
- self.apply(self._init_weights)
118
-
119
- def _init_weights(self, m):
120
- if isinstance(m, nn.Linear):
121
- # we use xavier_uniform following official JAX ViT:
122
- torch.nn.init.xavier_uniform_(m.weight)
123
- if isinstance(m, nn.Linear) and m.bias is not None:
124
- nn.init.constant_(m.bias, 0)
125
- elif isinstance(m, nn.LayerNorm):
126
- nn.init.constant_(m.bias, 0)
127
- nn.init.constant_(m.weight, 1.0)
128
-
129
- def _encode_image(self, image, do_mask=False, return_all_blocks=False):
130
- """
131
- image has B x 3 x img_size x img_size
132
- do_mask: whether to perform masking or not
133
- return_all_blocks: if True, return the features at the end of every block
134
- instead of just the features from the last block (eg for some prediction heads)
135
- """
136
- # embed the image into patches (x has size B x Npatches x C)
137
-         # and get the position of each returned patch (pos has size B x Npatches x 2)
138
- x, pos = self.patch_embed(image)
139
- # add positional embedding without cls token
140
- if self.enc_pos_embed is not None:
141
- x = x + self.enc_pos_embed[None,...]
142
- # apply masking
143
- B,N,C = x.size()
144
- if do_mask:
145
- masks = self.mask_generator(x)
146
- x = x[~masks].view(B, -1, C)
147
- posvis = pos[~masks].view(B, -1, 2)
148
- else:
149
- B,N,C = x.size()
150
- masks = torch.zeros((B,N), dtype=bool)
151
- posvis = pos
152
- # now apply the transformer encoder and normalization
153
- if return_all_blocks:
154
- out = []
155
- for blk in self.enc_blocks:
156
- x = blk(x, posvis)
157
- out.append(x)
158
- out[-1] = self.enc_norm(out[-1])
159
- return out, pos, masks
160
- else:
161
- for blk in self.enc_blocks:
162
- x = blk(x, posvis)
163
- x = self.enc_norm(x)
164
- return x, pos, masks
165
-
166
- def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False):
167
- """
168
- return_all_blocks: if True, return the features at the end of every block
169
- instead of just the features from the last block (eg for some prediction heads)
170
-
171
- masks1 can be None => assume image1 fully visible
172
- """
173
- # encoder to decoder layer
174
- visf1 = self.decoder_embed(feat1)
175
- f2 = self.decoder_embed(feat2)
176
- # append masked tokens to the sequence
177
- B,Nenc,C = visf1.size()
178
- if masks1 is None: # downstreams
179
- f1_ = visf1
180
- else: # pretraining
181
- Ntotal = masks1.size(1)
182
- f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype)
183
- f1_[~masks1] = visf1.view(B * Nenc, C)
184
- # add positional embedding
185
- if self.dec_pos_embed is not None:
186
- f1_ = f1_ + self.dec_pos_embed
187
- f2 = f2 + self.dec_pos_embed
188
- # apply Transformer blocks
189
- out = f1_
190
- out2 = f2
191
- if return_all_blocks:
192
- _out, out = out, []
193
- for blk in self.dec_blocks:
194
- _out, out2 = blk(_out, out2, pos1, pos2)
195
- out.append(_out)
196
- out[-1] = self.dec_norm(out[-1])
197
- else:
198
- for blk in self.dec_blocks:
199
- out, out2 = blk(out, out2, pos1, pos2)
200
- out = self.dec_norm(out)
201
- return out
202
-
203
- def patchify(self, imgs):
204
- """
205
- imgs: (B, 3, H, W)
206
- x: (B, L, patch_size**2 *3)
207
- """
208
- p = self.patch_embed.patch_size[0]
209
- assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0
210
-
211
- h = w = imgs.shape[2] // p
212
- x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p))
213
- x = torch.einsum('nchpwq->nhwpqc', x)
214
- x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3))
215
-
216
- return x
217
-
218
- def unpatchify(self, x, channels=3):
219
- """
220
- x: (N, L, patch_size**2 *channels)
221
- imgs: (N, 3, H, W)
222
- """
223
- patch_size = self.patch_embed.patch_size[0]
224
- h = w = int(x.shape[1]**.5)
225
- assert h * w == x.shape[1]
226
- x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels))
227
- x = torch.einsum('nhwpqc->nchpwq', x)
228
- imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size))
229
- return imgs
230
-
231
- def forward(self, img1, img2):
232
- """
233
- img1: tensor of size B x 3 x img_size x img_size
234
- img2: tensor of size B x 3 x img_size x img_size
235
-
236
- out will be B x N x (3*patch_size*patch_size)
237
- masks are also returned as B x N just in case
238
- """
239
- # encoder of the masked first image
240
- feat1, pos1, mask1 = self._encode_image(img1, do_mask=True)
241
- # encoder of the second image
242
- feat2, pos2, _ = self._encode_image(img2, do_mask=False)
243
- # decoder
244
- decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2)
245
- # prediction head
246
- out = self.prediction_head(decfeat)
247
- # get target
248
- target = self.patchify(img1)
249
- return out, mask1, target
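patchify and unpatchify above are exact inverses: they move between (B, 3, H, W) images and (B, L, p*p*3) token sequences with a reshape and an einsum permutation. A stand-alone round-trip check of those two mappings (patch and image sizes here are arbitrary):

import torch

def patchify(imgs, p):
    # (B, C, H, W) -> (B, L, p*p*C), same reshaping as CroCoNet.patchify
    B, C, H, W = imgs.shape
    h, w = H // p, W // p
    x = imgs.reshape(B, C, h, p, w, p)
    x = torch.einsum('nchpwq->nhwpqc', x)
    return x.reshape(B, h * w, p * p * C)

def unpatchify(x, p, channels=3):
    # inverse mapping, as in CroCoNet.unpatchify (square images assumed)
    B, L, _ = x.shape
    h = w = int(L ** 0.5)
    x = x.reshape(B, h, w, p, p, channels)
    x = torch.einsum('nhwpqc->nchpwq', x)
    return x.reshape(B, channels, h * p, w * p)

imgs = torch.randn(2, 3, 224, 224)
tokens = patchify(imgs, p=16)
print(tokens.shape)                                    # torch.Size([2, 196, 768])
print(torch.allclose(unpatchify(tokens, p=16), imgs))  # True: lossless round trip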
 
mini_dust3r/croco/dpt_block.py DELETED
@@ -1,450 +0,0 @@
1
- # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
-
4
- # --------------------------------------------------------
5
- # DPT head for ViTs
6
- # --------------------------------------------------------
7
- # References:
8
- # https://github.com/isl-org/DPT
9
- # https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py
10
-
11
- import torch
12
- import torch.nn as nn
13
- import torch.nn.functional as F
14
- from einops import rearrange, repeat
15
- from typing import Union, Tuple, Iterable, List, Optional, Dict
16
-
17
- def pair(t):
18
- return t if isinstance(t, tuple) else (t, t)
19
-
20
- def make_scratch(in_shape, out_shape, groups=1, expand=False):
21
- scratch = nn.Module()
22
-
23
- out_shape1 = out_shape
24
- out_shape2 = out_shape
25
- out_shape3 = out_shape
26
- out_shape4 = out_shape
27
- if expand == True:
28
- out_shape1 = out_shape
29
- out_shape2 = out_shape * 2
30
- out_shape3 = out_shape * 4
31
- out_shape4 = out_shape * 8
32
-
33
- scratch.layer1_rn = nn.Conv2d(
34
- in_shape[0],
35
- out_shape1,
36
- kernel_size=3,
37
- stride=1,
38
- padding=1,
39
- bias=False,
40
- groups=groups,
41
- )
42
- scratch.layer2_rn = nn.Conv2d(
43
- in_shape[1],
44
- out_shape2,
45
- kernel_size=3,
46
- stride=1,
47
- padding=1,
48
- bias=False,
49
- groups=groups,
50
- )
51
- scratch.layer3_rn = nn.Conv2d(
52
- in_shape[2],
53
- out_shape3,
54
- kernel_size=3,
55
- stride=1,
56
- padding=1,
57
- bias=False,
58
- groups=groups,
59
- )
60
- scratch.layer4_rn = nn.Conv2d(
61
- in_shape[3],
62
- out_shape4,
63
- kernel_size=3,
64
- stride=1,
65
- padding=1,
66
- bias=False,
67
- groups=groups,
68
- )
69
-
70
- scratch.layer_rn = nn.ModuleList([
71
- scratch.layer1_rn,
72
- scratch.layer2_rn,
73
- scratch.layer3_rn,
74
- scratch.layer4_rn,
75
- ])
76
-
77
- return scratch
78
-
79
- class ResidualConvUnit_custom(nn.Module):
80
- """Residual convolution module."""
81
-
82
- def __init__(self, features, activation, bn):
83
- """Init.
84
- Args:
85
- features (int): number of features
86
- """
87
- super().__init__()
88
-
89
- self.bn = bn
90
-
91
- self.groups = 1
92
-
93
- self.conv1 = nn.Conv2d(
94
- features,
95
- features,
96
- kernel_size=3,
97
- stride=1,
98
- padding=1,
99
- bias=not self.bn,
100
- groups=self.groups,
101
- )
102
-
103
- self.conv2 = nn.Conv2d(
104
- features,
105
- features,
106
- kernel_size=3,
107
- stride=1,
108
- padding=1,
109
- bias=not self.bn,
110
- groups=self.groups,
111
- )
112
-
113
- if self.bn == True:
114
- self.bn1 = nn.BatchNorm2d(features)
115
- self.bn2 = nn.BatchNorm2d(features)
116
-
117
- self.activation = activation
118
-
119
- self.skip_add = nn.quantized.FloatFunctional()
120
-
121
- def forward(self, x):
122
- """Forward pass.
123
- Args:
124
- x (tensor): input
125
- Returns:
126
- tensor: output
127
- """
128
-
129
- out = self.activation(x)
130
- out = self.conv1(out)
131
- if self.bn == True:
132
- out = self.bn1(out)
133
-
134
- out = self.activation(out)
135
- out = self.conv2(out)
136
- if self.bn == True:
137
- out = self.bn2(out)
138
-
139
- if self.groups > 1:
140
- out = self.conv_merge(out)
141
-
142
- return self.skip_add.add(out, x)
143
-
144
- class FeatureFusionBlock_custom(nn.Module):
145
- """Feature fusion block."""
146
-
147
- def __init__(
148
- self,
149
- features,
150
- activation,
151
- deconv=False,
152
- bn=False,
153
- expand=False,
154
- align_corners=True,
155
- width_ratio=1,
156
- ):
157
- """Init.
158
- Args:
159
- features (int): number of features
160
- """
161
- super(FeatureFusionBlock_custom, self).__init__()
162
- self.width_ratio = width_ratio
163
-
164
- self.deconv = deconv
165
- self.align_corners = align_corners
166
-
167
- self.groups = 1
168
-
169
- self.expand = expand
170
- out_features = features
171
- if self.expand == True:
172
- out_features = features // 2
173
-
174
- self.out_conv = nn.Conv2d(
175
- features,
176
- out_features,
177
- kernel_size=1,
178
- stride=1,
179
- padding=0,
180
- bias=True,
181
- groups=1,
182
- )
183
-
184
- self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
185
- self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
186
-
187
- self.skip_add = nn.quantized.FloatFunctional()
188
-
189
- def forward(self, *xs):
190
- """Forward pass.
191
- Returns:
192
- tensor: output
193
- """
194
- output = xs[0]
195
-
196
- if len(xs) == 2:
197
- res = self.resConfUnit1(xs[1])
198
- if self.width_ratio != 1:
199
- res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear')
200
-
201
- output = self.skip_add.add(output, res)
202
- # output += res
203
-
204
- output = self.resConfUnit2(output)
205
-
206
- if self.width_ratio != 1:
207
- # and output.shape[3] < self.width_ratio * output.shape[2]
208
- #size=(image.shape[])
209
- if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio:
210
- shape = 3 * output.shape[3]
211
- else:
212
- shape = int(self.width_ratio * 2 * output.shape[2])
213
- output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear')
214
- else:
215
- output = nn.functional.interpolate(output, scale_factor=2,
216
- mode="bilinear", align_corners=self.align_corners)
217
- output = self.out_conv(output)
218
- return output
219
-
220
- def make_fusion_block(features, use_bn, width_ratio=1):
221
- return FeatureFusionBlock_custom(
222
- features,
223
- nn.ReLU(False),
224
- deconv=False,
225
- bn=use_bn,
226
- expand=False,
227
- align_corners=True,
228
- width_ratio=width_ratio,
229
- )
230
-
231
- class Interpolate(nn.Module):
232
- """Interpolation module."""
233
-
234
- def __init__(self, scale_factor, mode, align_corners=False):
235
- """Init.
236
- Args:
237
- scale_factor (float): scaling
238
- mode (str): interpolation mode
239
- """
240
- super(Interpolate, self).__init__()
241
-
242
- self.interp = nn.functional.interpolate
243
- self.scale_factor = scale_factor
244
- self.mode = mode
245
- self.align_corners = align_corners
246
-
247
- def forward(self, x):
248
- """Forward pass.
249
- Args:
250
- x (tensor): input
251
- Returns:
252
- tensor: interpolated data
253
- """
254
-
255
- x = self.interp(
256
- x,
257
- scale_factor=self.scale_factor,
258
- mode=self.mode,
259
- align_corners=self.align_corners,
260
- )
261
-
262
- return x
263
-
264
- class DPTOutputAdapter(nn.Module):
265
- """DPT output adapter.
266
-
267
-     :param num_channels: Number of output channels
268
-     :param stride_level: Stride level compared to the full-sized image.
269
- E.g. 4 for 1/4th the size of the image.
270
- :param patch_size_full: Int or tuple of the patch size over the full image size.
271
- Patch size for smaller inputs will be computed accordingly.
272
- :param hooks: Index of intermediate layers
273
- :param layer_dims: Dimension of intermediate layers
274
- :param feature_dim: Feature dimension
275
- :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression
276
- :param use_bn: If set to True, activates batch norm
277
- :param dim_tokens_enc: Dimension of tokens coming from encoder
278
- """
279
-
280
- def __init__(self,
281
- num_channels: int = 1,
282
- stride_level: int = 1,
283
- patch_size: Union[int, Tuple[int, int]] = 16,
284
- main_tasks: Iterable[str] = ('rgb',),
285
- hooks: List[int] = [2, 5, 8, 11],
286
- layer_dims: List[int] = [96, 192, 384, 768],
287
- feature_dim: int = 256,
288
- last_dim: int = 32,
289
- use_bn: bool = False,
290
- dim_tokens_enc: Optional[int] = None,
291
- head_type: str = 'regression',
292
- output_width_ratio=1,
293
- **kwargs):
294
- super().__init__()
295
- self.num_channels = num_channels
296
- self.stride_level = stride_level
297
- self.patch_size = pair(patch_size)
298
- self.main_tasks = main_tasks
299
- self.hooks = hooks
300
- self.layer_dims = layer_dims
301
- self.feature_dim = feature_dim
302
- self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None
303
- self.head_type = head_type
304
-
305
- # Actual patch height and width, taking into account stride of input
306
- self.P_H = max(1, self.patch_size[0] // stride_level)
307
- self.P_W = max(1, self.patch_size[1] // stride_level)
308
-
309
- self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False)
310
-
311
- self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
312
- self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
313
- self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
314
- self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio)
315
-
316
- if self.head_type == 'regression':
317
- # The "DPTDepthModel" head
318
- self.head = nn.Sequential(
319
- nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1),
320
- Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
321
- nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1),
322
- nn.ReLU(True),
323
- nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0)
324
- )
325
- elif self.head_type == 'semseg':
326
- # The "DPTSegmentationModel" head
327
- self.head = nn.Sequential(
328
- nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False),
329
- nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(),
330
- nn.ReLU(True),
331
- nn.Dropout(0.1, False),
332
- nn.Conv2d(feature_dim, self.num_channels, kernel_size=1),
333
- Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
334
- )
335
- else:
336
- raise ValueError('DPT head_type must be "regression" or "semseg".')
337
-
338
- if self.dim_tokens_enc is not None:
339
- self.init(dim_tokens_enc=dim_tokens_enc)
340
-
341
- def init(self, dim_tokens_enc=768):
342
- """
343
- Initialize parts of decoder that are dependent on dimension of encoder tokens.
344
- Should be called when setting up MultiMAE.
345
-
346
- :param dim_tokens_enc: Dimension of tokens coming from encoder
347
- """
348
- #print(dim_tokens_enc)
349
-
350
- # Set up activation postprocessing layers
351
- if isinstance(dim_tokens_enc, int):
352
- dim_tokens_enc = 4 * [dim_tokens_enc]
353
-
354
- self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc]
355
-
356
- self.act_1_postprocess = nn.Sequential(
357
- nn.Conv2d(
358
- in_channels=self.dim_tokens_enc[0],
359
- out_channels=self.layer_dims[0],
360
- kernel_size=1, stride=1, padding=0,
361
- ),
362
- nn.ConvTranspose2d(
363
- in_channels=self.layer_dims[0],
364
- out_channels=self.layer_dims[0],
365
- kernel_size=4, stride=4, padding=0,
366
- bias=True, dilation=1, groups=1,
367
- )
368
- )
369
-
370
- self.act_2_postprocess = nn.Sequential(
371
- nn.Conv2d(
372
- in_channels=self.dim_tokens_enc[1],
373
- out_channels=self.layer_dims[1],
374
- kernel_size=1, stride=1, padding=0,
375
- ),
376
- nn.ConvTranspose2d(
377
- in_channels=self.layer_dims[1],
378
- out_channels=self.layer_dims[1],
379
- kernel_size=2, stride=2, padding=0,
380
- bias=True, dilation=1, groups=1,
381
- )
382
- )
383
-
384
- self.act_3_postprocess = nn.Sequential(
385
- nn.Conv2d(
386
- in_channels=self.dim_tokens_enc[2],
387
- out_channels=self.layer_dims[2],
388
- kernel_size=1, stride=1, padding=0,
389
- )
390
- )
391
-
392
- self.act_4_postprocess = nn.Sequential(
393
- nn.Conv2d(
394
- in_channels=self.dim_tokens_enc[3],
395
- out_channels=self.layer_dims[3],
396
- kernel_size=1, stride=1, padding=0,
397
- ),
398
- nn.Conv2d(
399
- in_channels=self.layer_dims[3],
400
- out_channels=self.layer_dims[3],
401
- kernel_size=3, stride=2, padding=1,
402
- )
403
- )
404
-
405
- self.act_postprocess = nn.ModuleList([
406
- self.act_1_postprocess,
407
- self.act_2_postprocess,
408
- self.act_3_postprocess,
409
- self.act_4_postprocess
410
- ])
411
-
412
- def adapt_tokens(self, encoder_tokens):
413
- # Adapt tokens
414
- x = []
415
- x.append(encoder_tokens[:, :])
416
- x = torch.cat(x, dim=-1)
417
- return x
418
-
419
- def forward(self, encoder_tokens: List[torch.Tensor], image_size):
420
- #input_info: Dict):
421
- assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
422
- H, W = image_size
423
-
424
- # Number of patches in height and width
425
- N_H = H // (self.stride_level * self.P_H)
426
- N_W = W // (self.stride_level * self.P_W)
427
-
428
- # Hook decoder onto 4 layers from specified ViT layers
429
- layers = [encoder_tokens[hook] for hook in self.hooks]
430
-
431
- # Extract only task-relevant tokens and ignore global tokens.
432
- layers = [self.adapt_tokens(l) for l in layers]
433
-
434
- # Reshape tokens to spatial representation
435
- layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
436
-
437
- layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
438
- # Project layers to chosen feature dim
439
- layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
440
-
441
- # Fuse layers using refinement stages
442
- path_4 = self.scratch.refinenet4(layers[3])
443
- path_3 = self.scratch.refinenet3(path_4, layers[2])
444
- path_2 = self.scratch.refinenet2(path_3, layers[1])
445
- path_1 = self.scratch.refinenet1(path_2, layers[0])
446
-
447
- # Output head
448
- out = self.head(path_1)
449
-
450
- return out
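In the forward pass above, each hooked ViT layer arrives as a (B, N, C) token sequence and is turned back into a (B, C, N_H, N_W) feature map before the coarse-to-fine refinenet fusion. That token-to-map step is just the rearrange below (sizes are illustrative; einops is assumed available, as it is in the file above):

import torch
from einops import rearrange

B, C = 2, 768
H = W = 224
patch = 16
N_H, N_W = H // patch, W // patch        # 14 x 14 patches

tokens = torch.randn(B, N_H * N_W, C)    # one hooked ViT layer: (B, N, C)
fmap = rearrange(tokens, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W)
print(fmap.shape)                        # torch.Size([2, 768, 14, 14])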
 
mini_dust3r/croco/masking.py DELETED
@@ -1,25 +0,0 @@
1
- # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
-
4
-
5
- # --------------------------------------------------------
6
- # Masking utils
7
- # --------------------------------------------------------
8
-
9
- import torch
10
- import torch.nn as nn
11
-
12
- class RandomMask(nn.Module):
13
- """
14
- random masking
15
- """
16
-
17
- def __init__(self, num_patches, mask_ratio):
18
- super().__init__()
19
- self.num_patches = num_patches
20
- self.num_mask = int(mask_ratio * self.num_patches)
21
-
22
- def __call__(self, x):
23
- noise = torch.rand(x.size(0), self.num_patches, device=x.device)
24
- argsort = torch.argsort(noise, dim=1)
25
- return argsort < self.num_mask
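RandomMask above draws i.i.d. uniform noise per patch; the argsort of that noise is a uniformly random permutation, so comparing it to num_mask selects a random subset with exactly num_mask masked patches in every sample. A stand-alone check of that invariant (same three lines, outside the module):

import torch

num_patches, mask_ratio = 196, 0.9
num_mask = int(mask_ratio * num_patches)         # 176 masked patches per sample

x = torch.randn(4, num_patches, 768)             # dummy token batch
noise = torch.rand(x.size(0), num_patches)
masks = torch.argsort(noise, dim=1) < num_mask
print(masks.shape, masks.sum(dim=1))             # every row sums to exactly 176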
 
mini_dust3r/croco/pos_embed.py DELETED
@@ -1,159 +0,0 @@
1
- # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
-
4
-
5
- # --------------------------------------------------------
6
- # Position embedding utils
7
- # --------------------------------------------------------
8
-
9
-
10
-
11
- import numpy as np
12
-
13
- import torch
14
-
15
- # --------------------------------------------------------
16
- # 2D sine-cosine position embedding
17
- # References:
18
- # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
19
- # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
20
- # MoCo v3: https://github.com/facebookresearch/moco-v3
21
- # --------------------------------------------------------
22
- def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
23
- """
24
- grid_size: int of the grid height and width
25
- return:
26
- pos_embed: [grid_size*grid_size, embed_dim] or [n_cls_token+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
27
- """
28
- grid_h = np.arange(grid_size, dtype=np.float32)
29
- grid_w = np.arange(grid_size, dtype=np.float32)
30
- grid = np.meshgrid(grid_w, grid_h) # here w goes first
31
- grid = np.stack(grid, axis=0)
32
-
33
- grid = grid.reshape([2, 1, grid_size, grid_size])
34
- pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
35
- if n_cls_token>0:
36
- pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0)
37
- return pos_embed
38
-
39
-
40
- def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
41
- assert embed_dim % 2 == 0
42
-
43
- # use half of dimensions to encode grid_h
44
- emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
45
- emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
46
-
47
- emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
48
- return emb
49
-
50
-
51
- def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
52
- """
53
- embed_dim: output dimension for each position
54
- pos: a list of positions to be encoded: size (M,)
55
- out: (M, D)
56
- """
57
- assert embed_dim % 2 == 0
58
- omega = np.arange(embed_dim // 2, dtype=float)
59
- omega /= embed_dim / 2.
60
- omega = 1. / 10000**omega # (D/2,)
61
-
62
- pos = pos.reshape(-1) # (M,)
63
- out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
64
-
65
- emb_sin = np.sin(out) # (M, D/2)
66
- emb_cos = np.cos(out) # (M, D/2)
67
-
68
- emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
69
- return emb
70
-
71
-
72
- # --------------------------------------------------------
73
- # Interpolate position embeddings for high-resolution
74
- # References:
75
- # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
76
- # DeiT: https://github.com/facebookresearch/deit
77
- # --------------------------------------------------------
78
- def interpolate_pos_embed(model, checkpoint_model):
79
- if 'pos_embed' in checkpoint_model:
80
- pos_embed_checkpoint = checkpoint_model['pos_embed']
81
- embedding_size = pos_embed_checkpoint.shape[-1]
82
- num_patches = model.patch_embed.num_patches
83
- num_extra_tokens = model.pos_embed.shape[-2] - num_patches
84
- # height (== width) for the checkpoint position embedding
85
- orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
86
- # height (== width) for the new position embedding
87
- new_size = int(num_patches ** 0.5)
88
- # class_token and dist_token are kept unchanged
89
- if orig_size != new_size:
90
- print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
91
- extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
92
- # only the position tokens are interpolated
93
- pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
94
- pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
95
- pos_tokens = torch.nn.functional.interpolate(
96
- pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
97
- pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
98
- new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
99
- checkpoint_model['pos_embed'] = new_pos_embed
100
-
101
-
102
- #----------------------------------------------------------
103
- # RoPE2D: RoPE implementation in 2D
104
- #----------------------------------------------------------
105
-
106
- try:
107
- from mini_dust3r.croco.curope import cuRoPE2D
108
- RoPE2D = cuRoPE2D
109
- except ImportError:
110
- print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')
111
-
112
- class RoPE2D(torch.nn.Module):
113
-
114
- def __init__(self, freq=100.0, F0=1.0):
115
- super().__init__()
116
- self.base = freq
117
- self.F0 = F0
118
- self.cache = {}
119
-
120
- def get_cos_sin(self, D, seq_len, device, dtype):
121
- if (D,seq_len,device,dtype) not in self.cache:
122
- inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
123
- t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
124
- freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
125
- freqs = torch.cat((freqs, freqs), dim=-1)
126
- cos = freqs.cos() # (Seq, Dim)
127
- sin = freqs.sin()
128
- self.cache[D,seq_len,device,dtype] = (cos,sin)
129
- return self.cache[D,seq_len,device,dtype]
130
-
131
- @staticmethod
132
- def rotate_half(x):
133
- x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
134
- return torch.cat((-x2, x1), dim=-1)
135
-
136
- def apply_rope1d(self, tokens, pos1d, cos, sin):
137
- assert pos1d.ndim==2
138
- cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
139
- sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
140
- return (tokens * cos) + (self.rotate_half(tokens) * sin)
141
-
142
- def forward(self, tokens, positions):
143
- """
144
- input:
145
- * tokens: batch_size x nheads x ntokens x dim
146
- * positions: batch_size x ntokens x 2 (y and x position of each token)
147
- output:
148
-             * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim)
149
- """
150
- assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
151
- D = tokens.size(3) // 2
152
- assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2
153
- cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
154
- # split features into two along the feature dimension, and apply rope1d on each half
155
- y, x = tokens.chunk(2, dim=-1)
156
- y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
157
- x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
158
- tokens = torch.cat((y, x), dim=-1)
159
- return tokens
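get_1d_sincos_pos_embed_from_grid above is the classic Transformer encoding: each position is projected onto a geometric ladder of frequencies (base 10000) and written out as sines then cosines. A compact stand-alone version for a few positions (re-implemented here for illustration, not imported from the deleted module):

import numpy as np

def sincos_1d(embed_dim, pos):
    # embed_dim must be even; pos is a 1-D array of positions
    omega = np.arange(embed_dim // 2, dtype=float) / (embed_dim / 2.0)
    omega = 1.0 / 10000 ** omega                  # (D/2,) frequencies
    out = np.einsum('m,d->md', pos, omega)        # (M, D/2) outer product
    return np.concatenate([np.sin(out), np.cos(out)], axis=1)   # (M, D)

emb = sincos_1d(8, np.arange(4, dtype=float))
print(emb.shape)   # (4, 8)
print(emb[0])      # position 0 -> all sines are 0, all cosines are 1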
 
mini_dust3r/heads/__init__.py DELETED
@@ -1,19 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # head factory
6
- # --------------------------------------------------------
7
- from .linear_head import LinearPts3d
8
- from .dpt_head import create_dpt_head
9
-
10
-
11
- def head_factory(head_type, output_mode, net, has_conf=False):
12
-     """ build a prediction head for the decoder
13
- """
14
- if head_type == 'linear' and output_mode == 'pts3d':
15
- return LinearPts3d(net, has_conf)
16
- elif head_type == 'dpt' and output_mode == 'pts3d':
17
- return create_dpt_head(net, has_conf=has_conf)
18
- else:
19
- raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
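head_factory above is a plain dispatch on the pair (head_type, output_mode): two supported combinations, and NotImplementedError for everything else. The same pattern in isolation, with placeholder builders so it runs outside the repo (the real LinearPts3d / create_dpt_head live in the sibling modules):

def head_factory(head_type, output_mode, net=None, has_conf=False):
    # placeholder builders; the real heads come from linear_head.py / dpt_head.py
    builders = {
        ('linear', 'pts3d'): lambda net, has_conf: ('LinearPts3d', has_conf),
        ('dpt', 'pts3d'): lambda net, has_conf: ('create_dpt_head', has_conf),
    }
    key = (head_type, output_mode)
    if key not in builders:
        raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
    return builders[key](net, has_conf)

print(head_factory('linear', 'pts3d'))                  # ('LinearPts3d', False)
print(head_factory('dpt', 'pts3d', has_conf=True))      # ('create_dpt_head', True)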
 
mini_dust3r/heads/dpt_head.py DELETED
@@ -1,114 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # dpt head implementation for DUST3R
6
- # Downstream heads assume inputs of size B x N x C (where N is the number of tokens) ;
7
- # or if it takes as input the output at every layer, the attribute return_all_layers should be set to True
8
- # the forward function also takes as input a dictionary img_info with key "height" and "width"
9
- # for PixelwiseTask, the output will be of dimension B x num_channels x H x W
10
- # --------------------------------------------------------
11
- from einops import rearrange
12
- from typing import List
13
- import torch
14
- import torch.nn as nn
15
- from mini_dust3r.heads.postprocess import postprocess
16
- from mini_dust3r.croco.dpt_block import DPTOutputAdapter
17
-
18
-
19
- class DPTOutputAdapter_fix(DPTOutputAdapter):
20
- """
21
- Adapt croco's DPTOutputAdapter implementation for dust3r:
22
- remove duplicated weights, and fix forward for dust3r
23
- """
24
-
25
- def init(self, dim_tokens_enc=768):
26
- super().init(dim_tokens_enc)
27
- # these are duplicated weights
28
- del self.act_1_postprocess
29
- del self.act_2_postprocess
30
- del self.act_3_postprocess
31
- del self.act_4_postprocess
32
-
33
- def forward(self, encoder_tokens: List[torch.Tensor], image_size=None):
34
- assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first'
35
- # H, W = input_info['image_size']
36
- image_size = self.image_size if image_size is None else image_size
37
- H, W = image_size
38
- # Number of patches in height and width
39
- N_H = H // (self.stride_level * self.P_H)
40
- N_W = W // (self.stride_level * self.P_W)
41
-
42
- # Hook decoder onto 4 layers from specified ViT layers
43
- layers = [encoder_tokens[hook] for hook in self.hooks]
44
-
45
- # Extract only task-relevant tokens and ignore global tokens.
46
- layers = [self.adapt_tokens(l) for l in layers]
47
-
48
- # Reshape tokens to spatial representation
49
- layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers]
50
-
51
- layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)]
52
- # Project layers to chosen feature dim
53
- layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)]
54
-
55
- # Fuse layers using refinement stages
56
- path_4 = self.scratch.refinenet4(layers[3])[:, :, :layers[2].shape[2], :layers[2].shape[3]]
57
- path_3 = self.scratch.refinenet3(path_4, layers[2])
58
- path_2 = self.scratch.refinenet2(path_3, layers[1])
59
- path_1 = self.scratch.refinenet1(path_2, layers[0])
60
-
61
- # Output head
62
- out = self.head(path_1)
63
-
64
- return out
65
-
66
-
67
- class PixelwiseTaskWithDPT(nn.Module):
68
- """ DPT module for dust3r, can return 3D points + confidence for all pixels"""
69
-
70
- def __init__(self, *, n_cls_token=0, hooks_idx=None, dim_tokens=None,
71
- output_width_ratio=1, num_channels=1, postprocess=None, depth_mode=None, conf_mode=None, **kwargs):
72
- super(PixelwiseTaskWithDPT, self).__init__()
73
- self.return_all_layers = True # backbone needs to return all layers
74
- self.postprocess = postprocess
75
- self.depth_mode = depth_mode
76
- self.conf_mode = conf_mode
77
-
78
- assert n_cls_token == 0, "Not implemented"
79
- dpt_args = dict(output_width_ratio=output_width_ratio,
80
- num_channels=num_channels,
81
- **kwargs)
82
- if hooks_idx is not None:
83
- dpt_args.update(hooks=hooks_idx)
84
- self.dpt = DPTOutputAdapter_fix(**dpt_args)
85
- dpt_init_args = {} if dim_tokens is None else {'dim_tokens_enc': dim_tokens}
86
- self.dpt.init(**dpt_init_args)
87
-
88
- def forward(self, x, img_info):
89
- out = self.dpt(x, image_size=(img_info[0], img_info[1]))
90
- if self.postprocess:
91
- out = self.postprocess(out, self.depth_mode, self.conf_mode)
92
- return out
93
-
94
-
95
- def create_dpt_head(net, has_conf=False):
96
- """
97
- return PixelwiseTaskWithDPT for given net params
98
- """
99
- assert net.dec_depth > 9
100
- l2 = net.dec_depth
101
- feature_dim = 256
102
- last_dim = feature_dim//2
103
- out_nchan = 3
104
- ed = net.enc_embed_dim
105
- dd = net.dec_embed_dim
106
- return PixelwiseTaskWithDPT(num_channels=out_nchan + has_conf,
107
- feature_dim=feature_dim,
108
- last_dim=last_dim,
109
- hooks_idx=[0, l2*2//4, l2*3//4, l2],
110
- dim_tokens=[ed, dd, dd, dd],
111
- postprocess=postprocess,
112
- depth_mode=net.depth_mode,
113
- conf_mode=net.conf_mode,
114
- head_type='regression')
 
mini_dust3r/heads/linear_head.py DELETED
@@ -1,41 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # linear head implementation for DUST3R
6
- # --------------------------------------------------------
7
- import torch.nn as nn
8
- import torch.nn.functional as F
9
- from mini_dust3r.heads.postprocess import postprocess
10
-
11
-
12
- class LinearPts3d (nn.Module):
13
- """
14
- Linear head for dust3r
15
- Each token outputs: - 16x16 3D points (+ confidence)
16
- """
17
-
18
- def __init__(self, net, has_conf=False):
19
- super().__init__()
20
- self.patch_size = net.patch_embed.patch_size[0]
21
- self.depth_mode = net.depth_mode
22
- self.conf_mode = net.conf_mode
23
- self.has_conf = has_conf
24
-
25
- self.proj = nn.Linear(net.dec_embed_dim, (3 + has_conf)*self.patch_size**2)
26
-
27
- def setup(self, croconet):
28
- pass
29
-
30
- def forward(self, decout, img_shape):
31
- H, W = img_shape
32
- tokens = decout[-1]
33
- B, S, D = tokens.shape
34
-
35
- # extract 3D points
36
- feat = self.proj(tokens) # B,S,D
37
- feat = feat.transpose(-1, -2).view(B, -1, H//self.patch_size, W//self.patch_size)
38
- feat = F.pixel_shuffle(feat, self.patch_size) # B,3,H,W
39
-
40
- # permute + norm depth
41
- return postprocess(feat, self.depth_mode, self.conf_mode)
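
A shape-flow sketch (an assumption, with a SimpleNamespace stub standing in for the real network) of what this head does: each decoder token is projected to (3 + conf) * patch_size**2 values and pixel-shuffled back to a dense H x W map:

import types
import torch
from mini_dust3r.heads.linear_head import LinearPts3d   # the module as it existed before this commit

stub = types.SimpleNamespace(
    patch_embed=types.SimpleNamespace(patch_size=(16, 16)),
    depth_mode=("exp", -float("inf"), float("inf")),
    conf_mode=("exp", 1, float("inf")),
    dec_embed_dim=768,
)
head = LinearPts3d(stub, has_conf=True)
H = W = 224
tokens = torch.randn(2, (H // 16) * (W // 16), 768)      # B x S x D decoder tokens
out = head([tokens], (H, W))
print(out["pts3d"].shape, out["conf"].shape)             # (2, 224, 224, 3) and (2, 224, 224)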
 
mini_dust3r/heads/postprocess.py DELETED
@@ -1,58 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # post process function for all heads: extract 3D points/confidence from output
6
- # --------------------------------------------------------
7
- import torch
8
-
9
-
10
- def postprocess(out, depth_mode, conf_mode):
11
- """
12
- extract 3D points/confidence from prediction head output
13
- """
14
- fmap = out.permute(0, 2, 3, 1) # B,H,W,3
15
- res = dict(pts3d=reg_dense_depth(fmap[:, :, :, 0:3], mode=depth_mode))
16
-
17
- if conf_mode is not None:
18
- res['conf'] = reg_dense_conf(fmap[:, :, :, 3], mode=conf_mode)
19
- return res
20
-
21
-
22
- def reg_dense_depth(xyz, mode):
23
- """
24
- extract 3D points from prediction head output
25
- """
26
- mode, vmin, vmax = mode
27
-
28
- no_bounds = (vmin == -float('inf')) and (vmax == float('inf'))
29
- assert no_bounds
30
-
31
- if mode == 'linear':
32
- if no_bounds:
33
- return xyz # [-inf, +inf]
34
- return xyz.clip(min=vmin, max=vmax)
35
-
36
- # distance to origin
37
- d = xyz.norm(dim=-1, keepdim=True)
38
- xyz = xyz / d.clip(min=1e-8)
39
-
40
- if mode == 'square':
41
- return xyz * d.square()
42
-
43
- if mode == 'exp':
44
- return xyz * torch.expm1(d)
45
-
46
- raise ValueError(f'bad {mode=}')
47
-
48
-
49
- def reg_dense_conf(x, mode):
50
- """
51
- extract confidence from prediction head output
52
- """
53
- mode, vmin, vmax = mode
54
- if mode == 'exp':
55
- return vmin + x.exp().clip(max=vmax-vmin)
56
- if mode == 'sigmoid':
57
- return (vmax - vmin) * torch.sigmoid(x) + vmin
58
- raise ValueError(f'bad {mode=}')
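
A tiny numeric sketch (an assumption, not from the repo) of the two confidence activations above; the mode tuples follow the (mode, vmin, vmax) convention used by the model defaults:

import torch
from mini_dust3r.heads.postprocess import reg_dense_conf

x = torch.tensor([0.0, 1.0, 5.0])
print(reg_dense_conf(x, mode=("exp", 1, float("inf"))))   # 1 + exp(x): [2.00, 3.72, 149.41]
print(reg_dense_conf(x, mode=("sigmoid", 1, 10)))         # 9 * sigmoid(x) + 1: [5.50, 7.58, 9.94]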
 
mini_dust3r/image_pairs.py DELETED
@@ -1,85 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # utilities needed to load image pairs
6
- # --------------------------------------------------------
7
- import numpy as np
8
- import torch
9
- from mini_dust3r.utils.image import ImageDict
10
-
11
-
12
- def make_pairs(
13
- imgs: list[ImageDict],
14
- scene_graph: str = "complete",
15
- prefilter=None,
16
- symmetrize=True,
17
- ) -> list[tuple[ImageDict, ImageDict]]:
18
- pairs = []
19
- if scene_graph == "complete": # complete graph
20
- for i in range(len(imgs)):
21
- for j in range(i):
22
- pairs.append((imgs[i], imgs[j]))
23
- elif scene_graph.startswith("swin"):
24
- winsize = int(scene_graph.split("-")[1]) if "-" in scene_graph else 3
25
- pairsid = set()
26
- for i in range(len(imgs)):
27
- for j in range(1, winsize + 1):
28
- idx = (i + j) % len(imgs) # explicit loop closure
29
- pairsid.add((i, idx) if i < idx else (idx, i))
30
- for i, j in pairsid:
31
- pairs.append((imgs[i], imgs[j]))
32
- elif scene_graph.startswith("oneref"):
33
- refid = int(scene_graph.split("-")[1]) if "-" in scene_graph else 0
34
- for j in range(len(imgs)):
35
- if j != refid:
36
- pairs.append((imgs[refid], imgs[j]))
37
- if symmetrize:
38
- pairs += [(img2, img1) for img1, img2 in pairs]
39
-
40
- # now, remove edges
41
- if isinstance(prefilter, str) and prefilter.startswith("seq"):
42
- pairs = filter_pairs_seq(pairs, int(prefilter[3:]))
43
-
44
- if isinstance(prefilter, str) and prefilter.startswith("cyc"):
45
- pairs = filter_pairs_seq(pairs, int(prefilter[3:]), cyclic=True)
46
-
47
- return pairs
48
-
49
-
50
- def sel(x, kept):
51
- if isinstance(x, dict):
52
- return {k: sel(v, kept) for k, v in x.items()}
53
- if isinstance(x, (torch.Tensor, np.ndarray)):
54
- return x[kept]
55
- if isinstance(x, (tuple, list)):
56
- return type(x)([x[k] for k in kept])
57
-
58
-
59
- def _filter_edges_seq(edges, seq_dis_thr, cyclic=False):
60
- # number of images
61
- n = max(max(e) for e in edges) + 1
62
-
63
- kept = []
64
- for e, (i, j) in enumerate(edges):
65
- dis = abs(i - j)
66
- if cyclic:
67
- dis = min(dis, abs(i + n - j), abs(i - n - j))
68
- if dis <= seq_dis_thr:
69
- kept.append(e)
70
- return kept
71
-
72
-
73
- def filter_pairs_seq(pairs, seq_dis_thr, cyclic=False):
74
- edges = [(img1["idx"], img2["idx"]) for img1, img2 in pairs]
75
- kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic)
76
- return [pairs[i] for i in kept]
77
-
78
-
79
- def filter_edges_seq(view1, view2, pred1, pred2, seq_dis_thr, cyclic=False):
80
- edges = [(int(i), int(j)) for i, j in zip(view1["idx"], view2["idx"])]
81
- kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic)
82
- print(
83
- f">> Filtering edges more than {seq_dis_thr} frames apart: kept {len(kept)}/{len(edges)} edges"
84
- )
85
- return sel(view1, kept), sel(view2, kept), sel(pred1, kept), sel(pred2, kept)
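
A small usage sketch (an assumption) of the pairing strategies implemented by make_pairs above; the dict stand-ins only carry the fields the function touches:

from mini_dust3r.image_pairs import make_pairs   # the module as it existed before this commit

imgs = [dict(idx=i, instance=str(i)) for i in range(4)]   # stand-ins for ImageDict entries
print(len(make_pairs(imgs, scene_graph="complete")))      # 12: all 6 unordered pairs, symmetrized
print(len(make_pairs(imgs, scene_graph="swin-1")))        # 8: window of 1 neighbour with loop closure
print(len(make_pairs(imgs, scene_graph="oneref-0")))      # 6: image 0 against the other three, symmetrized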
 
mini_dust3r/inference.py DELETED
@@ -1,204 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # utilities needed for the inference
6
- # --------------------------------------------------------
7
- import tqdm
8
- import torch
9
- from mini_dust3r.utils.device import to_cpu, collate_with_cat
10
- from mini_dust3r.utils.misc import invalid_to_nans
11
- from mini_dust3r.utils.geometry import depthmap_to_pts3d, geotrf
12
- from mini_dust3r.utils.image import ImageDict
13
- from mini_dust3r.model import AsymmetricCroCo3DStereo
14
-
15
- from typing import Literal, TypedDict, Optional
16
- from jaxtyping import Float32
17
-
18
-
19
- class Dust3rPred1(TypedDict):
20
- pts3d: Float32[torch.Tensor, "b h w c"]
21
- conf: Float32[torch.Tensor, "b h w"]
22
-
23
-
24
- class Dust3rPred2(TypedDict):
25
- pts3d_in_other_view: Float32[torch.Tensor, "b h w c"]
26
- conf: Float32[torch.Tensor, "b h w"]
27
-
28
-
29
- class Dust3rResult(TypedDict):
30
- view1: ImageDict
31
- view2: ImageDict
32
- pred1: Dust3rPred1
33
- pred2: Dust3rPred2
34
- loss: Optional[int]
35
-
36
-
37
- def _interleave_imgs(img1, img2):
38
- res = {}
39
- for key, value1 in img1.items():
40
- value2 = img2[key]
41
- if isinstance(value1, torch.Tensor):
42
- value = torch.stack((value1, value2), dim=1).flatten(0, 1)
43
- else:
44
- value = [x for pair in zip(value1, value2) for x in pair]
45
- res[key] = value
46
- return res
47
-
48
-
49
- def make_batch_symmetric(batch):
50
- view1, view2 = batch
51
- view1, view2 = (_interleave_imgs(view1, view2), _interleave_imgs(view2, view1))
52
- return view1, view2
53
-
54
-
55
- def loss_of_one_batch(
56
- batch, model, criterion, device, symmetrize_batch=False, use_amp=False, ret=None
57
- ):
58
- view1, view2 = batch
59
- for view in batch:
60
- for name in (
61
- "img pts3d valid_mask camera_pose camera_intrinsics F_matrix corres".split()
62
- ): # pseudo_focal
63
- if name not in view:
64
- continue
65
- view[name] = view[name].to(device, non_blocking=True)
66
-
67
- if symmetrize_batch:
68
- view1, view2 = make_batch_symmetric(batch)
69
-
70
- with torch.cuda.amp.autocast(enabled=bool(use_amp)):
71
- pred1, pred2 = model(view1, view2)
72
-
73
- # loss is supposed to be symmetric
74
- with torch.cuda.amp.autocast(enabled=False):
75
- loss = (
76
- criterion(view1, view2, pred1, pred2) if criterion is not None else None
77
- )
78
-
79
- result = dict(view1=view1, view2=view2, pred1=pred1, pred2=pred2, loss=loss)
80
- return result[ret] if ret else result
81
-
82
-
83
- @torch.no_grad()
84
- def inference(
85
- pairs: list[tuple[ImageDict, ImageDict]],
86
- model: AsymmetricCroCo3DStereo,
87
- device: Literal["cpu", "cuda", "mps"],
88
- batch_size: int = 8,
89
- verbose: bool = True,
90
- ) -> Dust3rResult:
91
- if verbose:
92
- print(f">> Inference with model on {len(pairs)} image pairs")
93
- result = []
94
-
95
- # first, check if all images have the same size
96
- multiple_shapes = not (check_if_same_size(pairs))
97
- if multiple_shapes: # force bs=1
98
- batch_size = 1
99
-
100
- for i in tqdm.trange(0, len(pairs), batch_size, disable=not verbose):
101
- res: Dust3rResult = loss_of_one_batch(
102
- collate_with_cat(pairs[i : i + batch_size]), model, None, device
103
- )
104
- result.append(to_cpu(res))
105
-
106
- result = collate_with_cat(result, lists=multiple_shapes)
107
-
108
- return result
109
-
110
-
111
- def check_if_same_size(pairs):
112
- shapes1 = [img1["img"].shape[-2:] for img1, img2 in pairs]
113
- shapes2 = [img2["img"].shape[-2:] for img1, img2 in pairs]
114
- return all(shapes1[0] == s for s in shapes1) and all(
115
- shapes2[0] == s for s in shapes2
116
- )
117
-
118
-
119
- def get_pred_pts3d(gt, pred, use_pose=False):
120
- if "depth" in pred and "pseudo_focal" in pred:
121
- try:
122
- pp = gt["camera_intrinsics"][..., :2, 2]
123
- except KeyError:
124
- pp = None
125
- pts3d = depthmap_to_pts3d(**pred, pp=pp)
126
-
127
- elif "pts3d" in pred:
128
- # pts3d from my camera
129
- pts3d = pred["pts3d"]
130
-
131
- elif "pts3d_in_other_view" in pred:
132
- # pts3d from the other camera, already transformed
133
- assert use_pose is True
134
- return pred["pts3d_in_other_view"] # return!
135
-
136
- if use_pose:
137
- camera_pose = pred.get("camera_pose")
138
- assert camera_pose is not None
139
- pts3d = geotrf(camera_pose, pts3d)
140
-
141
- return pts3d
142
-
143
-
144
- def find_opt_scaling(
145
- gt_pts1,
146
- gt_pts2,
147
- pr_pts1,
148
- pr_pts2=None,
149
- fit_mode="weiszfeld_stop_grad",
150
- valid1=None,
151
- valid2=None,
152
- ):
153
- assert gt_pts1.ndim == pr_pts1.ndim == 4
154
- assert gt_pts1.shape == pr_pts1.shape
155
- if gt_pts2 is not None:
156
- assert gt_pts2.ndim == pr_pts2.ndim == 4
157
- assert gt_pts2.shape == pr_pts2.shape
158
-
159
- # concat the pointcloud
160
- nan_gt_pts1 = invalid_to_nans(gt_pts1, valid1).flatten(1, 2)
161
- nan_gt_pts2 = (
162
- invalid_to_nans(gt_pts2, valid2).flatten(1, 2) if gt_pts2 is not None else None
163
- )
164
-
165
- pr_pts1 = invalid_to_nans(pr_pts1, valid1).flatten(1, 2)
166
- pr_pts2 = (
167
- invalid_to_nans(pr_pts2, valid2).flatten(1, 2) if pr_pts2 is not None else None
168
- )
169
-
170
- all_gt = (
171
- torch.cat((nan_gt_pts1, nan_gt_pts2), dim=1)
172
- if gt_pts2 is not None
173
- else nan_gt_pts1
174
- )
175
- all_pr = torch.cat((pr_pts1, pr_pts2), dim=1) if pr_pts2 is not None else pr_pts1
176
-
177
- dot_gt_pr = (all_pr * all_gt).sum(dim=-1)
178
- dot_gt_gt = all_gt.square().sum(dim=-1)
179
-
180
- if fit_mode.startswith("avg"):
181
- # scaling = (all_pr / all_gt).view(B, -1).mean(dim=1)
182
- scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1)
183
- elif fit_mode.startswith("median"):
184
- scaling = (dot_gt_pr / dot_gt_gt).nanmedian(dim=1).values
185
- elif fit_mode.startswith("weiszfeld"):
186
- # init scaling with l2 closed form
187
- scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1)
188
- # iterative re-weighted least-squares
189
- for iter in range(10):
190
- # re-weighting by inverse of distance
191
- dis = (all_pr - scaling.view(-1, 1, 1) * all_gt).norm(dim=-1)
192
- # print(dis.nanmean(-1))
193
- w = dis.clip_(min=1e-8).reciprocal()
194
- # update the scaling with the new weights
195
- scaling = (w * dot_gt_pr).nanmean(dim=1) / (w * dot_gt_gt).nanmean(dim=1)
196
- else:
197
- raise ValueError(f"bad {fit_mode=}")
198
-
199
- if fit_mode.endswith("stop_grad"):
200
- scaling = scaling.detach()
201
-
202
- scaling = scaling.clip(min=1e-3)
203
- # assert scaling.isfinite().all(), bb()
204
- return scaling
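
Putting the pieces together, a hedged end-to-end sketch of how inference() above was typically driven before this commit removed the package; the checkpoint identifier and image folder are placeholders, not values from the repo:

import torch
from mini_dust3r.model import AsymmetricCroCo3DStereo
from mini_dust3r.utils.image import load_images
from mini_dust3r.image_pairs import make_pairs
from mini_dust3r.inference import inference

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AsymmetricCroCo3DStereo.from_pretrained("<checkpoint-id-or-path>").to(device)  # placeholder
imgs = load_images("<folder-with-images>", size=512)                                   # placeholder
pairs = make_pairs(imgs, scene_graph="complete")
out = inference(pairs, model, device, batch_size=1)
pts3d = out["pred1"]["pts3d"]   # B x H x W x 3 point maps, see the Dust3rResult TypedDict above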
 
mini_dust3r/model.py DELETED
@@ -1,259 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # DUSt3R model class
6
- # --------------------------------------------------------
7
- from copy import deepcopy
8
- import torch
9
- import os
10
- from packaging import version
11
- import huggingface_hub
12
-
13
- from .utils.misc import (
14
- fill_default_args,
15
- freeze_all_params,
16
- is_symmetrized,
17
- interleave,
18
- transpose_to_landscape,
19
- )
20
- from .heads import head_factory
21
- from mini_dust3r.patch_embed import get_patch_embed
22
-
23
- from mini_dust3r.croco.croco import CroCoNet
24
-
25
- inf = float("inf")
26
-
27
- hf_version_number = huggingface_hub.__version__
28
- assert version.parse(hf_version_number) >= version.parse(
29
- "0.22.0"
30
- ), "Outdated huggingface_hub version, please reinstall requirements.txt"
31
-
32
-
33
- def load_model(model_path, device, verbose=True):
34
- if verbose:
35
- print("... loading model from", model_path)
36
- ckpt = torch.load(model_path, map_location="cpu")
37
- args = ckpt["args"].model.replace("ManyAR_PatchEmbed", "PatchEmbedDust3R")
38
- if "landscape_only" not in args:
39
- args = args[:-1] + ", landscape_only=False)"
40
- else:
41
- args = args.replace(" ", "").replace(
42
- "landscape_only=True", "landscape_only=False"
43
- )
44
- assert "landscape_only=False" in args
45
- if verbose:
46
- print(f"instantiating : {args}")
47
- net = eval(args)
48
- s = net.load_state_dict(ckpt["model"], strict=False)
49
- if verbose:
50
- print(s)
51
- return net.to(device)
52
-
53
-
54
- class AsymmetricCroCo3DStereo(
55
- CroCoNet,
56
- huggingface_hub.PyTorchModelHubMixin,
57
- library_name="dust3r",
58
- repo_url="https://github.com/naver/dust3r",
59
- tags=["image-to-3d"],
60
- ):
61
- """Two siamese encoders, followed by two decoders.
62
- The goal is to output 3d points directly, both images in view1's frame
63
- (hence the asymmetry).
64
- """
65
-
66
- def __init__(
67
- self,
68
- output_mode="pts3d",
69
- head_type="linear",
70
- depth_mode=("exp", -inf, inf),
71
- conf_mode=("exp", 1, inf),
72
- freeze="none",
73
- landscape_only=True,
74
- patch_embed_cls="PatchEmbedDust3R", # PatchEmbedDust3R or ManyAR_PatchEmbed
75
- **croco_kwargs,
76
- ):
77
- self.patch_embed_cls = patch_embed_cls
78
- self.croco_args = fill_default_args(croco_kwargs, super().__init__)
79
- super().__init__(**croco_kwargs)
80
-
81
- # dust3r specific initialization
82
- self.dec_blocks2 = deepcopy(self.dec_blocks)
83
- self.set_downstream_head(
84
- output_mode,
85
- head_type,
86
- landscape_only,
87
- depth_mode,
88
- conf_mode,
89
- **croco_kwargs,
90
- )
91
- self.set_freeze(freeze)
92
-
93
- @classmethod
94
- def from_pretrained(cls, pretrained_model_name_or_path, **kw):
95
- if os.path.isfile(pretrained_model_name_or_path):
96
- return load_model(pretrained_model_name_or_path, device="cpu")
97
- else:
98
- return super(AsymmetricCroCo3DStereo, cls).from_pretrained(
99
- pretrained_model_name_or_path, **kw
100
- )
101
-
102
- def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768):
103
- self.patch_embed = get_patch_embed(
104
- self.patch_embed_cls, img_size, patch_size, enc_embed_dim
105
- )
106
-
107
- def load_state_dict(self, ckpt, **kw):
108
- # duplicate all weights for the second decoder if not present
109
- new_ckpt = dict(ckpt)
110
- if not any(k.startswith("dec_blocks2") for k in ckpt):
111
- for key, value in ckpt.items():
112
- if key.startswith("dec_blocks"):
113
- new_ckpt[key.replace("dec_blocks", "dec_blocks2")] = value
114
- return super().load_state_dict(new_ckpt, **kw)
115
-
116
- def set_freeze(self, freeze): # this is for use by downstream models
117
- self.freeze = freeze
118
- to_be_frozen = {
119
- "none": [],
120
- "mask": [self.mask_token],
121
- "encoder": [self.mask_token, self.patch_embed, self.enc_blocks],
122
- }
123
- freeze_all_params(to_be_frozen[freeze])
124
-
125
- def _set_prediction_head(self, *args, **kwargs):
126
- """No prediction head"""
127
- return
128
-
129
- def set_downstream_head(
130
- self,
131
- output_mode,
132
- head_type,
133
- landscape_only,
134
- depth_mode,
135
- conf_mode,
136
- patch_size,
137
- img_size,
138
- **kw,
139
- ):
140
- assert (
141
- img_size[0] % patch_size == 0 and img_size[1] % patch_size == 0
142
- ), f"{img_size=} must be multiple of {patch_size=}"
143
- self.output_mode = output_mode
144
- self.head_type = head_type
145
- self.depth_mode = depth_mode
146
- self.conf_mode = conf_mode
147
- # allocate heads
148
- self.downstream_head1 = head_factory(
149
- head_type, output_mode, self, has_conf=bool(conf_mode)
150
- )
151
- self.downstream_head2 = head_factory(
152
- head_type, output_mode, self, has_conf=bool(conf_mode)
153
- )
154
- # magic wrapper
155
- self.head1 = transpose_to_landscape(
156
- self.downstream_head1, activate=landscape_only
157
- )
158
- self.head2 = transpose_to_landscape(
159
- self.downstream_head2, activate=landscape_only
160
- )
161
-
162
- def _encode_image(self, image, true_shape):
163
- # embed the image into patches (x has size B x Npatches x C)
164
- x, pos = self.patch_embed(image, true_shape=true_shape)
165
-
166
- # add positional embedding without cls token
167
- assert self.enc_pos_embed is None
168
-
169
- # now apply the transformer encoder and normalization
170
- for blk in self.enc_blocks:
171
- x = blk(x, pos)
172
-
173
- x = self.enc_norm(x)
174
- return x, pos, None
175
-
176
- def _encode_image_pairs(self, img1, img2, true_shape1, true_shape2):
177
- if img1.shape[-2:] == img2.shape[-2:]:
178
- out, pos, _ = self._encode_image(
179
- torch.cat((img1, img2), dim=0),
180
- torch.cat((true_shape1, true_shape2), dim=0),
181
- )
182
- out, out2 = out.chunk(2, dim=0)
183
- pos, pos2 = pos.chunk(2, dim=0)
184
- else:
185
- out, pos, _ = self._encode_image(img1, true_shape1)
186
- out2, pos2, _ = self._encode_image(img2, true_shape2)
187
- return out, out2, pos, pos2
188
-
189
- def _encode_symmetrized(self, view1, view2):
190
- img1 = view1["img"]
191
- img2 = view2["img"]
192
- B = img1.shape[0]
193
- # Recover true_shape when available, otherwise assume that the img shape is the true one
194
- shape1 = view1.get(
195
- "true_shape", torch.tensor(img1.shape[-2:])[None].repeat(B, 1)
196
- )
197
- shape2 = view2.get(
198
- "true_shape", torch.tensor(img2.shape[-2:])[None].repeat(B, 1)
199
- )
200
- # warning! maybe the images have different portrait/landscape orientations
201
-
202
- if is_symmetrized(view1, view2):
203
- # computing half of forward pass!'
204
- feat1, feat2, pos1, pos2 = self._encode_image_pairs(
205
- img1[::2], img2[::2], shape1[::2], shape2[::2]
206
- )
207
- feat1, feat2 = interleave(feat1, feat2)
208
- pos1, pos2 = interleave(pos1, pos2)
209
- else:
210
- feat1, feat2, pos1, pos2 = self._encode_image_pairs(
211
- img1, img2, shape1, shape2
212
- )
213
-
214
- return (shape1, shape2), (feat1, feat2), (pos1, pos2)
215
-
216
- def _decoder(self, f1, pos1, f2, pos2):
217
- final_output = [(f1, f2)] # before projection
218
-
219
- # project to decoder dim
220
- f1 = self.decoder_embed(f1)
221
- f2 = self.decoder_embed(f2)
222
-
223
- final_output.append((f1, f2))
224
- for blk1, blk2 in zip(self.dec_blocks, self.dec_blocks2):
225
- # img1 side
226
- f1, _ = blk1(*final_output[-1][::+1], pos1, pos2)
227
- # img2 side
228
- f2, _ = blk2(*final_output[-1][::-1], pos2, pos1)
229
- # store the result
230
- final_output.append((f1, f2))
231
-
232
- # normalize last output
233
- del final_output[1] # duplicate with final_output[0]
234
- final_output[-1] = tuple(map(self.dec_norm, final_output[-1]))
235
- return zip(*final_output)
236
-
237
- def _downstream_head(self, head_num, decout, img_shape):
238
- B, S, D = decout[-1].shape
239
- # img_shape = tuple(map(int, img_shape))
240
- head = getattr(self, f"head{head_num}")
241
- return head(decout, img_shape)
242
-
243
- def forward(self, view1, view2):
244
- # encode the two images --> B,S,D
245
- (shape1, shape2), (feat1, feat2), (pos1, pos2) = self._encode_symmetrized(
246
- view1, view2
247
- )
248
-
249
- # combine all ref images into object-centric representation
250
- dec1, dec2 = self._decoder(feat1, pos1, feat2, pos2)
251
-
252
- with torch.cuda.amp.autocast(enabled=False):
253
- res1 = self._downstream_head(1, [tok.float() for tok in dec1], shape1)
254
- res2 = self._downstream_head(2, [tok.float() for tok in dec2], shape2)
255
-
256
- res2["pts3d_in_other_view"] = res2.pop(
257
- "pts3d"
258
- ) # predict view2's pts3d in view1's frame
259
- return res1, res2
 
mini_dust3r/optim_factory.py DELETED
@@ -1,14 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # optimization functions
6
- # --------------------------------------------------------
7
-
8
-
9
- def adjust_learning_rate_by_lr(optimizer, lr):
10
- for param_group in optimizer.param_groups:
11
- if "lr_scale" in param_group:
12
- param_group["lr"] = lr * param_group["lr_scale"]
13
- else:
14
- param_group["lr"] = lr
 
mini_dust3r/patch_embed.py DELETED
@@ -1,69 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # PatchEmbed implementation for DUST3R,
6
- # in particular ManyAR_PatchEmbed that handles images with non-square aspect ratio
7
- # --------------------------------------------------------
8
- import torch
9
- from mini_dust3r.croco.blocks import PatchEmbed
10
-
11
-
12
- def get_patch_embed(patch_embed_cls, img_size, patch_size, enc_embed_dim):
13
- assert patch_embed_cls in ['PatchEmbedDust3R', 'ManyAR_PatchEmbed']
14
- patch_embed = eval(patch_embed_cls)(img_size, patch_size, 3, enc_embed_dim)
15
- return patch_embed
16
-
17
-
18
- class PatchEmbedDust3R(PatchEmbed):
19
- def forward(self, x, **kw):
20
- B, C, H, W = x.shape
21
- assert H % self.patch_size[0] == 0, f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})."
22
- assert W % self.patch_size[1] == 0, f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})."
23
- x = self.proj(x)
24
- pos = self.position_getter(B, x.size(2), x.size(3), x.device)
25
- if self.flatten:
26
- x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
27
- x = self.norm(x)
28
- return x, pos
29
-
30
-
31
- class ManyAR_PatchEmbed (PatchEmbed):
32
- """ Handle images with non-square aspect ratio.
33
- All images in the same batch have the same aspect ratio.
34
- true_shape = [(height, width) ...] indicates the actual shape of each image.
35
- """
36
-
37
- def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True):
38
- self.embed_dim = embed_dim
39
- super().__init__(img_size, patch_size, in_chans, embed_dim, norm_layer, flatten)
40
-
41
- def forward(self, img, true_shape):
42
- B, C, H, W = img.shape
43
- assert W >= H, f'img should be in landscape mode, but got {W=} {H=}'
44
- assert H % self.patch_size[0] == 0, f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})."
45
- assert W % self.patch_size[1] == 0, f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})."
46
- assert true_shape.shape == (B, 2), f"true_shape has the wrong shape={true_shape.shape}"
47
-
48
- # size expressed in tokens
49
- W //= self.patch_size[0]
50
- H //= self.patch_size[1]
51
- n_tokens = H * W
52
-
53
- height, width = true_shape.T
54
- is_landscape = (width >= height)
55
- is_portrait = ~is_landscape
56
-
57
- # allocate result
58
- x = img.new_zeros((B, n_tokens, self.embed_dim))
59
- pos = img.new_zeros((B, n_tokens, 2), dtype=torch.int64)
60
-
61
- # linear projection, transposed if necessary
62
- x[is_landscape] = self.proj(img[is_landscape]).permute(0, 2, 3, 1).flatten(1, 2).float()
63
- x[is_portrait] = self.proj(img[is_portrait].swapaxes(-1, -2)).permute(0, 2, 3, 1).flatten(1, 2).float()
64
-
65
- pos[is_landscape] = self.position_getter(1, H, W, pos.device)
66
- pos[is_portrait] = self.position_getter(1, W, H, pos.device)
67
-
68
- x = self.norm(x)
69
- return x, pos
 
mini_dust3r/post_process.py DELETED
@@ -1,60 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # utilities for interpreting the DUST3R output
6
- # --------------------------------------------------------
7
- import numpy as np
8
- import torch
9
- from mini_dust3r.utils.geometry import xy_grid
10
-
11
-
12
- def estimate_focal_knowing_depth(pts3d, pp, focal_mode='median', min_focal=0., max_focal=np.inf):
13
- """ Reprojection method, for when the absolute depth is known:
14
- 1) estimate the camera focal using a robust estimator
15
- 2) reproject points onto true rays, minimizing a certain error
16
- """
17
- B, H, W, THREE = pts3d.shape
18
- assert THREE == 3
19
-
20
- # centered pixel grid
21
- pixels = xy_grid(W, H, device=pts3d.device).view(1, -1, 2) - pp.view(-1, 1, 2) # B,HW,2
22
- pts3d = pts3d.flatten(1, 2) # (B, HW, 3)
23
-
24
- if focal_mode == 'median':
25
- with torch.no_grad():
26
- # direct estimation of focal
27
- u, v = pixels.unbind(dim=-1)
28
- x, y, z = pts3d.unbind(dim=-1)
29
- fx_votes = (u * z) / x
30
- fy_votes = (v * z) / y
31
-
32
- # assume square pixels, hence same focal for X and Y
33
- f_votes = torch.cat((fx_votes.view(B, -1), fy_votes.view(B, -1)), dim=-1)
34
- focal = torch.nanmedian(f_votes, dim=-1).values
35
-
36
- elif focal_mode == 'weiszfeld':
37
- # init focal with l2 closed form
38
- # we try to find focal = argmin Sum | pixel - focal * (x,y)/z|
39
- xy_over_z = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(posinf=0, neginf=0) # homogeneous (x,y,1)
40
-
41
- dot_xy_px = (xy_over_z * pixels).sum(dim=-1)
42
- dot_xy_xy = xy_over_z.square().sum(dim=-1)
43
-
44
- focal = dot_xy_px.mean(dim=1) / dot_xy_xy.mean(dim=1)
45
-
46
- # iterative re-weighted least-squares
47
- for iter in range(10):
48
- # re-weighting by inverse of distance
49
- dis = (pixels - focal.view(-1, 1, 1) * xy_over_z).norm(dim=-1)
50
- # print(dis.nanmean(-1))
51
- w = dis.clip(min=1e-8).reciprocal()
52
- # update the scaling with the new weights
53
- focal = (w * dot_xy_px).mean(dim=1) / (w * dot_xy_xy).mean(dim=1)
54
- else:
55
- raise ValueError(f'bad {focal_mode=}')
56
-
57
- focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) # size / 1.1547005383792515
58
- focal = focal.clip(min=min_focal*focal_base, max=max_focal*focal_base)
59
- # print(focal)
60
- return focal
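
A worked example (an assumption, reusing xy_grid from the geometry utilities further below) showing that the 'median' mode recovers a known focal length when the point map is built from that focal:

import torch
from mini_dust3r.post_process import estimate_focal_knowing_depth
from mini_dust3r.utils.geometry import xy_grid

H, W, f = 64, 96, 120.0
pp = torch.tensor([[W / 2.0, H / 2.0]])                              # principal point
uv = xy_grid(W, H, device=pp.device).float() - pp.view(1, 1, 2)      # centred pixel grid, H x W x 2
z = torch.full((H, W, 1), 2.0)                                       # constant depth
pts3d = torch.cat((uv * z / f, z), dim=-1)[None]                     # back-project with focal f, add batch dim
print(estimate_focal_knowing_depth(pts3d, pp, focal_mode="median"))  # ~120.0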
 
mini_dust3r/utils/device.py DELETED
@@ -1,76 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # utility functions for DUSt3R
6
- # --------------------------------------------------------
7
- import numpy as np
8
- import torch
9
-
10
-
11
- def todevice(batch, device, callback=None, non_blocking=False):
12
- ''' Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy).
13
-
14
- batch: list, tuple, dict of tensors or other things
15
- device: pytorch device or 'numpy'
16
- callback: function that would be called on every sub-elements.
17
- '''
18
- if callback:
19
- batch = callback(batch)
20
-
21
- if isinstance(batch, dict):
22
- return {k: todevice(v, device) for k, v in batch.items()}
23
-
24
- if isinstance(batch, (tuple, list)):
25
- return type(batch)(todevice(x, device) for x in batch)
26
-
27
- x = batch
28
- if device == 'numpy':
29
- if isinstance(x, torch.Tensor):
30
- x = x.detach().cpu().numpy()
31
- elif x is not None:
32
- if isinstance(x, np.ndarray):
33
- x = torch.from_numpy(x)
34
- if torch.is_tensor(x):
35
- x = x.to(device, non_blocking=non_blocking)
36
- return x
37
-
38
-
39
- to_device = todevice # alias
40
-
41
-
42
- def to_numpy(x): return todevice(x, 'numpy')
43
- def to_cpu(x): return todevice(x, 'cpu')
44
- def to_cuda(x): return todevice(x, 'cuda')
45
-
46
-
47
- def collate_with_cat(whatever, lists=False):
48
- if isinstance(whatever, dict):
49
- return {k: collate_with_cat(vals, lists=lists) for k, vals in whatever.items()}
50
-
51
- elif isinstance(whatever, (tuple, list)):
52
- if len(whatever) == 0:
53
- return whatever
54
- elem = whatever[0]
55
- T = type(whatever)
56
-
57
- if elem is None:
58
- return None
59
- if isinstance(elem, (bool, float, int, str)):
60
- return whatever
61
- if isinstance(elem, tuple):
62
- return T(collate_with_cat(x, lists=lists) for x in zip(*whatever))
63
- if isinstance(elem, dict):
64
- return {k: collate_with_cat([e[k] for e in whatever], lists=lists) for k in elem}
65
-
66
- if isinstance(elem, torch.Tensor):
67
- return listify(whatever) if lists else torch.cat(whatever)
68
- if isinstance(elem, np.ndarray):
69
- return listify(whatever) if lists else torch.cat([torch.from_numpy(x) for x in whatever])
70
-
71
- # otherwise, we just chain lists
72
- return sum(whatever, T())
73
-
74
-
75
- def listify(elems):
76
- return [x for e in elems for x in e]
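
A short sketch (an assumption) of the two helpers above that the inference loop relies on: moving a nested batch to numpy and concatenating a list of batch dicts:

import torch
from mini_dust3r.utils.device import to_numpy, collate_with_cat

batch = {"img": torch.zeros(1, 3, 4, 4), "idx": [0]}
print(type(to_numpy(batch)["img"]))         # <class 'numpy.ndarray'>
merged = collate_with_cat([batch, batch])   # tensors are concatenated along dim 0, lists are chained
print(merged["img"].shape, merged["idx"])   # torch.Size([2, 3, 4, 4]) [0, 0]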
 
mini_dust3r/utils/geometry.py DELETED
@@ -1,361 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # geometry utility functions
6
- # --------------------------------------------------------
7
- import torch
8
- import numpy as np
9
- from scipy.spatial import cKDTree as KDTree
10
-
11
- from mini_dust3r.utils.misc import invalid_to_zeros, invalid_to_nans
12
- from mini_dust3r.utils.device import to_numpy
13
-
14
-
15
- def xy_grid(W, H, device=None, origin=(0, 0), unsqueeze=None, cat_dim=-1, homogeneous=False, **arange_kw):
16
- """ Output a (H,W,2) array of int32
17
- with output[j,i,0] = i + origin[0]
18
- output[j,i,1] = j + origin[1]
19
- """
20
- if device is None:
21
- # numpy
22
- arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones
23
- else:
24
- # torch
25
- arange = lambda *a, **kw: torch.arange(*a, device=device, **kw)
26
- meshgrid, stack = torch.meshgrid, torch.stack
27
- ones = lambda *a: torch.ones(*a, device=device)
28
-
29
- tw, th = [arange(o, o+s, **arange_kw) for s, o in zip((W, H), origin)]
30
- grid = meshgrid(tw, th, indexing='xy')
31
- if homogeneous:
32
- grid = grid + (ones((H, W)),)
33
- if unsqueeze is not None:
34
- grid = (grid[0].unsqueeze(unsqueeze), grid[1].unsqueeze(unsqueeze))
35
- if cat_dim is not None:
36
- grid = stack(grid, cat_dim)
37
- return grid
38
-
39
-
40
- def geotrf(Trf, pts, ncol=None, norm=False):
41
- """ Apply a geometric transformation to a list of 3-D points.
42
-
43
- H: 3x3 or 4x4 projection matrix (typically a Homography)
44
- p: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3)
45
-
46
- ncol: int. number of columns of the result (2 or 3)
47
- norm: float. if != 0, the result is projected on the z=norm plane.
48
-
49
- Returns an array of projected 2d points.
50
- """
51
- assert Trf.ndim >= 2
52
- if isinstance(Trf, np.ndarray):
53
- pts = np.asarray(pts)
54
- elif isinstance(Trf, torch.Tensor):
55
- pts = torch.as_tensor(pts, dtype=Trf.dtype)
56
-
57
- # adapt shape if necessary
58
- output_reshape = pts.shape[:-1]
59
- ncol = ncol or pts.shape[-1]
60
-
61
- # optimized code
62
- if (isinstance(Trf, torch.Tensor) and isinstance(pts, torch.Tensor) and
63
- Trf.ndim == 3 and pts.ndim == 4):
64
- d = pts.shape[3]
65
- if Trf.shape[-1] == d:
66
- pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
67
- elif Trf.shape[-1] == d+1:
68
- pts = torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts) + Trf[:, None, None, :d, d]
69
- else:
70
- raise ValueError(f'bad shape, not ending with 3 or 4, for {pts.shape=}')
71
- else:
72
- if Trf.ndim >= 3:
73
- n = Trf.ndim-2
74
- assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match'
75
- Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])
76
-
77
- if pts.ndim > Trf.ndim:
78
- # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
79
- pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
80
- elif pts.ndim == 2:
81
- # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
82
- pts = pts[:, None, :]
83
-
84
- if pts.shape[-1]+1 == Trf.shape[-1]:
85
- Trf = Trf.swapaxes(-1, -2) # transpose Trf
86
- pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
87
- elif pts.shape[-1] == Trf.shape[-1]:
88
- Trf = Trf.swapaxes(-1, -2) # transpose Trf
89
- pts = pts @ Trf
90
- else:
91
- pts = Trf @ pts.T
92
- if pts.ndim >= 2:
93
- pts = pts.swapaxes(-1, -2)
94
-
95
- if norm:
96
- pts = pts / pts[..., -1:] # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
97
- if norm != 1:
98
- pts *= norm
99
-
100
- res = pts[..., :ncol].reshape(*output_reshape, ncol)
101
- return res
102
-
103
-
104
- def inv(mat):
105
- """ Invert a torch or numpy matrix
106
- """
107
- if isinstance(mat, torch.Tensor):
108
- return torch.linalg.inv(mat)
109
- if isinstance(mat, np.ndarray):
110
- return np.linalg.inv(mat)
111
- raise ValueError(f'bad matrix type = {type(mat)}')
112
-
113
-
114
- def depthmap_to_pts3d(depth, pseudo_focal, pp=None, **_):
115
- """
116
- Args:
117
- - depthmap (BxHxW array):
118
- - pseudo_focal: [B,H,W] ; [B,2,H,W] or [B,1,H,W]
119
- Returns:
120
- pointmap of absolute coordinates (BxHxWx3 array)
121
- """
122
-
123
- if len(depth.shape) == 4:
124
- B, H, W, n = depth.shape
125
- else:
126
- B, H, W = depth.shape
127
- n = None
128
-
129
- if len(pseudo_focal.shape) == 3: # [B,H,W]
130
- pseudo_focalx = pseudo_focaly = pseudo_focal
131
- elif len(pseudo_focal.shape) == 4: # [B,2,H,W] or [B,1,H,W]
132
- pseudo_focalx = pseudo_focal[:, 0]
133
- if pseudo_focal.shape[1] == 2:
134
- pseudo_focaly = pseudo_focal[:, 1]
135
- else:
136
- pseudo_focaly = pseudo_focalx
137
- else:
138
- raise NotImplementedError("Error, unknown input focal shape format.")
139
-
140
- assert pseudo_focalx.shape == depth.shape[:3]
141
- assert pseudo_focaly.shape == depth.shape[:3]
142
- grid_x, grid_y = xy_grid(W, H, cat_dim=0, device=depth.device)[:, None]
143
-
144
- # set principal point
145
- if pp is None:
146
- grid_x = grid_x - (W-1)/2
147
- grid_y = grid_y - (H-1)/2
148
- else:
149
- grid_x = grid_x.expand(B, -1, -1) - pp[:, 0, None, None]
150
- grid_y = grid_y.expand(B, -1, -1) - pp[:, 1, None, None]
151
-
152
- if n is None:
153
- pts3d = torch.empty((B, H, W, 3), device=depth.device)
154
- pts3d[..., 0] = depth * grid_x / pseudo_focalx
155
- pts3d[..., 1] = depth * grid_y / pseudo_focaly
156
- pts3d[..., 2] = depth
157
- else:
158
- pts3d = torch.empty((B, H, W, 3, n), device=depth.device)
159
- pts3d[..., 0, :] = depth * (grid_x / pseudo_focalx)[..., None]
160
- pts3d[..., 1, :] = depth * (grid_y / pseudo_focaly)[..., None]
161
- pts3d[..., 2, :] = depth
162
- return pts3d
163
-
164
-
165
- def depthmap_to_camera_coordinates(depthmap, camera_intrinsics, pseudo_focal=None):
166
- """
167
- Args:
168
- - depthmap (HxW array):
169
- - camera_intrinsics: a 3x3 matrix
170
- Returns:
171
- pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.
172
- """
173
- camera_intrinsics = np.float32(camera_intrinsics)
174
- H, W = depthmap.shape
175
-
176
- # Compute 3D ray associated with each pixel
177
- # Strong assumption: there are no skew terms
178
- assert camera_intrinsics[0, 1] == 0.0
179
- assert camera_intrinsics[1, 0] == 0.0
180
- if pseudo_focal is None:
181
- fu = camera_intrinsics[0, 0]
182
- fv = camera_intrinsics[1, 1]
183
- else:
184
- assert pseudo_focal.shape == (H, W)
185
- fu = fv = pseudo_focal
186
- cu = camera_intrinsics[0, 2]
187
- cv = camera_intrinsics[1, 2]
188
-
189
- u, v = np.meshgrid(np.arange(W), np.arange(H))
190
- z_cam = depthmap
191
- x_cam = (u - cu) * z_cam / fu
192
- y_cam = (v - cv) * z_cam / fv
193
- X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32)
194
-
195
- # Mask for valid coordinates
196
- valid_mask = (depthmap > 0.0)
197
- return X_cam, valid_mask
198
-
199
-
200
- def depthmap_to_absolute_camera_coordinates(depthmap, camera_intrinsics, camera_pose, **kw):
201
- """
202
- Args:
203
- - depthmap (HxW array):
204
- - camera_intrinsics: a 3x3 matrix
205
- - camera_pose: a 4x3 or 4x4 cam2world matrix
206
- Returns:
207
- pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels."""
208
- X_cam, valid_mask = depthmap_to_camera_coordinates(depthmap, camera_intrinsics)
209
-
210
- # R_cam2world = np.float32(camera_params["R_cam2world"])
211
- # t_cam2world = np.float32(camera_params["t_cam2world"]).squeeze()
212
- R_cam2world = camera_pose[:3, :3]
213
- t_cam2world = camera_pose[:3, 3]
214
-
215
- # Express in absolute coordinates (invalid depth values)
216
- X_world = np.einsum("ik, vuk -> vui", R_cam2world, X_cam) + t_cam2world[None, None, :]
217
- return X_world, valid_mask
218
-
219
-
220
- def colmap_to_opencv_intrinsics(K):
221
- """
222
- Modify camera intrinsics to follow a different convention.
223
- Coordinates of the center of the top-left pixels are by default:
224
- - (0.5, 0.5) in Colmap
225
- - (0,0) in OpenCV
226
- """
227
- K = K.copy()
228
- K[0, 2] -= 0.5
229
- K[1, 2] -= 0.5
230
- return K
231
-
232
-
233
- def opencv_to_colmap_intrinsics(K):
234
- """
235
- Modify camera intrinsics to follow a different convention.
236
- Coordinates of the center of the top-left pixels are by default:
237
- - (0.5, 0.5) in Colmap
238
- - (0,0) in OpenCV
239
- """
240
- K = K.copy()
241
- K[0, 2] += 0.5
242
- K[1, 2] += 0.5
243
- return K
244
-
245
-
246
- def normalize_pointcloud(pts1, pts2, norm_mode='avg_dis', valid1=None, valid2=None):
247
- """ renorm pointmaps pts1, pts2 with norm_mode
248
- """
249
- assert pts1.ndim >= 3 and pts1.shape[-1] == 3
250
- assert pts2 is None or (pts2.ndim >= 3 and pts2.shape[-1] == 3)
251
- norm_mode, dis_mode = norm_mode.split('_')
252
-
253
- if norm_mode == 'avg':
254
- # gather all points together (joint normalization)
255
- nan_pts1, nnz1 = invalid_to_zeros(pts1, valid1, ndim=3)
256
- nan_pts2, nnz2 = invalid_to_zeros(pts2, valid2, ndim=3) if pts2 is not None else (None, 0)
257
- all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
258
-
259
- # compute distance to origin
260
- all_dis = all_pts.norm(dim=-1)
261
- if dis_mode == 'dis':
262
- pass # do nothing
263
- elif dis_mode == 'log1p':
264
- all_dis = torch.log1p(all_dis)
265
- elif dis_mode == 'warp-log1p':
266
- # actually warp input points before normalizing them
267
- log_dis = torch.log1p(all_dis)
268
- warp_factor = log_dis / all_dis.clip(min=1e-8)
269
- H1, W1 = pts1.shape[1:-1]
270
- pts1 = pts1 * warp_factor[:, :W1*H1].view(-1, H1, W1, 1)
271
- if pts2 is not None:
272
- H2, W2 = pts2.shape[1:-1]
273
- pts2 = pts2 * warp_factor[:, W1*H1:].view(-1, H2, W2, 1)
274
- all_dis = log_dis # this is their true distance afterwards
275
- else:
276
- raise ValueError(f'bad {dis_mode=}')
277
-
278
- norm_factor = all_dis.sum(dim=1) / (nnz1 + nnz2 + 1e-8)
279
- else:
280
- # gather all points together (joint normalization)
281
- nan_pts1 = invalid_to_nans(pts1, valid1, ndim=3)
282
- nan_pts2 = invalid_to_nans(pts2, valid2, ndim=3) if pts2 is not None else None
283
- all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
284
-
285
- # compute distance to origin
286
- all_dis = all_pts.norm(dim=-1)
287
-
288
- if norm_mode == 'avg':
289
- norm_factor = all_dis.nanmean(dim=1)
290
- elif norm_mode == 'median':
291
- norm_factor = all_dis.nanmedian(dim=1).values.detach()
292
- elif norm_mode == 'sqrt':
293
- norm_factor = all_dis.sqrt().nanmean(dim=1)**2
294
- else:
295
- raise ValueError(f'bad {norm_mode=}')
296
-
297
- norm_factor = norm_factor.clip(min=1e-8)
298
- while norm_factor.ndim < pts1.ndim:
299
- norm_factor.unsqueeze_(-1)
300
-
301
- res = pts1 / norm_factor
302
- if pts2 is not None:
303
- res = (res, pts2 / norm_factor)
304
- return res
305
-
306
-
307
- @torch.no_grad()
308
- def get_joint_pointcloud_depth(z1, z2, valid_mask1, valid_mask2=None, quantile=0.5):
309
- # set invalid points to NaN
310
- _z1 = invalid_to_nans(z1, valid_mask1).reshape(len(z1), -1)
311
- _z2 = invalid_to_nans(z2, valid_mask2).reshape(len(z2), -1) if z2 is not None else None
312
- _z = torch.cat((_z1, _z2), dim=-1) if z2 is not None else _z1
313
-
314
- # compute median depth overall (ignoring nans)
315
- if quantile == 0.5:
316
- shift_z = torch.nanmedian(_z, dim=-1).values
317
- else:
318
- shift_z = torch.nanquantile(_z, quantile, dim=-1)
319
- return shift_z # (B,)
320
-
321
-
322
- @torch.no_grad()
323
- def get_joint_pointcloud_center_scale(pts1, pts2, valid_mask1=None, valid_mask2=None, z_only=False, center=True):
324
- # set invalid points to NaN
325
- _pts1 = invalid_to_nans(pts1, valid_mask1).reshape(len(pts1), -1, 3)
326
- _pts2 = invalid_to_nans(pts2, valid_mask2).reshape(len(pts2), -1, 3) if pts2 is not None else None
327
- _pts = torch.cat((_pts1, _pts2), dim=1) if pts2 is not None else _pts1
328
-
329
- # compute median center
330
- _center = torch.nanmedian(_pts, dim=1, keepdim=True).values # (B,1,3)
331
- if z_only:
332
- _center[..., :2] = 0 # do not center X and Y
333
-
334
- # compute median norm
335
- _norm = ((_pts - _center) if center else _pts).norm(dim=-1)
336
- scale = torch.nanmedian(_norm, dim=1).values
337
- return _center[:, None, :, :], scale[:, None, None, None]
338
-
339
-
340
- def find_reciprocal_matches(P1, P2):
341
- """
342
- returns 3 values:
343
- 1 - reciprocal_in_P2: a boolean array of size P2.shape[0], a "True" value indicates a match
344
- 2 - nn2_in_P1: a int array of size P2.shape[0], it contains the indexes of the closest points in P1
345
- 3 - reciprocal_in_P2.sum(): the number of matches
346
- """
347
- tree1 = KDTree(P1)
348
- tree2 = KDTree(P2)
349
-
350
- _, nn1_in_P2 = tree2.query(P1, workers=8)
351
- _, nn2_in_P1 = tree1.query(P2, workers=8)
352
-
353
- reciprocal_in_P1 = (nn2_in_P1[nn1_in_P2] == np.arange(len(nn1_in_P2)))
354
- reciprocal_in_P2 = (nn1_in_P2[nn2_in_P1] == np.arange(len(nn2_in_P1)))
355
- assert reciprocal_in_P1.sum() == reciprocal_in_P2.sum()
356
- return reciprocal_in_P2, nn2_in_P1, reciprocal_in_P2.sum()
357
-
358
-
359
- def get_med_dist_between_poses(poses):
360
- from scipy.spatial.distance import pdist
361
- return np.median(pdist([to_numpy(p[:3, 3]) for p in poses]))
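
A numeric sketch (an assumption) of the pinhole back-projection implemented by depthmap_to_camera_coordinates above, i.e. x = (u - cx) * z / fx and likewise for y:

import numpy as np
from mini_dust3r.utils.geometry import depthmap_to_camera_coordinates

depth = np.ones((4, 6), dtype=np.float32)        # toy 4 x 6 depth map, 1m everywhere
K = np.array([[100.0, 0.0, 3.0],
              [0.0, 100.0, 2.0],
              [0.0, 0.0, 1.0]])                  # fx = fy = 100, principal point (3, 2)
X_cam, valid = depthmap_to_camera_coordinates(depth, K)
print(X_cam.shape, X_cam[0, 0], valid.all())     # (4, 6, 3) [-0.03 -0.02 1.] True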
 
mini_dust3r/utils/image.py DELETED
@@ -1,141 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # utility functions about images (loading/converting...)
6
- # --------------------------------------------------------
7
- import os
8
- import torch
9
- import numpy as np
10
- import PIL.Image
11
- from PIL.ImageOps import exif_transpose
12
- import torchvision.transforms as tvf
13
-
14
- os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
15
- import cv2 # noqa
16
- from typing import Literal, TypedDict
17
- from jaxtyping import Float32, Int32
18
-
19
- try:
20
- from pillow_heif import register_heif_opener # noqa
21
-
22
- register_heif_opener()
23
- heif_support_enabled = True
24
- except ImportError:
25
- heif_support_enabled = False
26
-
27
- ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
28
-
29
-
30
- class ImageDict(TypedDict):
31
- img: Float32[torch.Tensor, "b c h w"]
32
- true_shape: tuple[int, int] | Int32[torch.Tensor, "b 2"]
33
- idx: int | list[int]
34
- instance: str | list[str]
35
-
36
-
37
- def imread_cv2(path, options=cv2.IMREAD_COLOR):
38
- """Open an image or a depthmap with opencv-python."""
39
- if path.endswith((".exr", "EXR")):
40
- options = cv2.IMREAD_ANYDEPTH
41
- img = cv2.imread(path, options)
42
- if img is None:
43
- raise IOError(f"Could not load image={path} with {options=}")
44
- if img.ndim == 3:
45
- img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
46
- return img
47
-
48
-
49
- def rgb(ftensor, true_shape=None):
50
- if isinstance(ftensor, list):
51
- return [rgb(x, true_shape=true_shape) for x in ftensor]
52
- if isinstance(ftensor, torch.Tensor):
53
- ftensor = ftensor.detach().cpu().numpy() # H,W,3
54
- if ftensor.ndim == 3 and ftensor.shape[0] == 3:
55
- ftensor = ftensor.transpose(1, 2, 0)
56
- elif ftensor.ndim == 4 and ftensor.shape[1] == 3:
57
- ftensor = ftensor.transpose(0, 2, 3, 1)
58
- if true_shape is not None:
59
- H, W = true_shape
60
- ftensor = ftensor[:H, :W]
61
- if ftensor.dtype == np.uint8:
62
- img = np.float32(ftensor) / 255
63
- else:
64
- img = (ftensor * 0.5) + 0.5
65
- return img.clip(min=0, max=1)
66
-
67
-
68
- def _resize_pil_image(img, long_edge_size):
69
- S = max(img.size)
70
- if S > long_edge_size:
71
- interp = PIL.Image.LANCZOS
72
- elif S <= long_edge_size:
73
- interp = PIL.Image.BICUBIC
74
- new_size = tuple(int(round(x * long_edge_size / S)) for x in img.size)
75
- return img.resize(new_size, interp)
76
-
77
-
78
- def load_images(
79
- folder_or_list: str | list,
80
- size: Literal[224, 512],
81
- square_ok: bool = False,
82
- verbose: bool = True,
83
- ) -> list[ImageDict]:
84
- """open and convert all images in a list or folder to proper input format for DUSt3R"""
85
- if isinstance(folder_or_list, str):
86
- if verbose:
87
- print(f">> Loading images from {folder_or_list}")
88
- root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
89
-
90
- elif isinstance(folder_or_list, list):
91
- if verbose:
92
- print(f">> Loading a list of {len(folder_or_list)} images")
93
- root, folder_content = "", folder_or_list
94
-
95
- else:
96
- raise ValueError(f"bad {folder_or_list=} ({type(folder_or_list)})")
97
-
98
- supported_images_extensions = [".jpg", ".jpeg", ".png"]
99
- if heif_support_enabled:
100
- supported_images_extensions += [".heic", ".heif"]
101
- supported_images_extensions = tuple(supported_images_extensions)
102
-
103
- imgs = []
104
- for path in folder_content:
105
- if not path.lower().endswith(supported_images_extensions):
106
- continue
107
- img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert("RGB")
108
- W1, H1 = img.size
109
- if size == 224:
110
- # resize short side to 224 (then crop)
111
- img = _resize_pil_image(img, round(size * max(W1 / H1, H1 / W1)))
112
- else:
113
- # resize long side to 512
114
- img = _resize_pil_image(img, size)
115
- W, H = img.size
116
- cx, cy = W // 2, H // 2
117
- if size == 224:
118
- half = min(cx, cy)
119
- img = img.crop((cx - half, cy - half, cx + half, cy + half))
120
- else:
121
- halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
122
- if not (square_ok) and W == H:
123
- halfh = 3 * halfw / 4
124
- img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))
125
-
126
- W2, H2 = img.size
127
- if verbose:
128
- print(f" - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
129
- imgs.append(
130
- dict(
131
- img=ImgNorm(img)[None],
132
- true_shape=np.int32([img.size[::-1]]),
133
- idx=len(imgs),
134
- instance=str(len(imgs)),
135
- )
136
- )
137
-
138
- assert imgs, "no images found at " + root
139
- if verbose:
140
- print(f" (Found {len(imgs)} images)")
141
- return imgs
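
Finally, a small sketch (an assumption, writing two dummy JPEGs to a temporary folder) of the resizing and cropping behaviour of load_images above; with size=512 the long side is resized to 512 and both sides are cropped to multiples of 16:

import os, tempfile
import numpy as np
import PIL.Image
from mini_dust3r.utils.image import load_images

tmp = tempfile.mkdtemp()
for name in ("a.jpg", "b.jpg"):
    PIL.Image.fromarray(np.zeros((480, 640, 3), np.uint8)).save(os.path.join(tmp, name))
imgs = load_images(tmp, size=512)
print(imgs[0]["img"].shape, imgs[0]["true_shape"])   # torch.Size([1, 3, 384, 512]) [[384 512]]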
 
mini_dust3r/utils/misc.py DELETED
@@ -1,121 +0,0 @@
1
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
- #
4
- # --------------------------------------------------------
5
- # utility functions for DUSt3R
6
- # --------------------------------------------------------
7
- import torch
8
-
9
-
10
- def fill_default_args(kwargs, func):
11
- import inspect # a bit hacky but it works reliably
12
- signature = inspect.signature(func)
13
-
14
- for k, v in signature.parameters.items():
15
- if v.default is inspect.Parameter.empty:
16
- continue
17
- kwargs.setdefault(k, v.default)
18
-
19
- return kwargs
20
-
21
-
22
- def freeze_all_params(modules):
23
- for module in modules:
24
- try:
25
- for n, param in module.named_parameters():
26
- param.requires_grad = False
27
- except AttributeError:
28
- # module is directly a parameter
29
- module.requires_grad = False
30
-
31
-
32
- def is_symmetrized(gt1, gt2):
33
- x = gt1['instance']
34
- y = gt2['instance']
35
- if len(x) == len(y) and len(x) == 1:
36
- return False # special case of batchsize 1
37
- ok = True
38
- for i in range(0, len(x), 2):
39
- ok = ok and (x[i] == y[i+1]) and (x[i+1] == y[i])
40
- return ok
41
-
42
-
43
- def flip(tensor):
44
- """ flip so that tensor[0::2] <=> tensor[1::2] """
45
- return torch.stack((tensor[1::2], tensor[0::2]), dim=1).flatten(0, 1)
46
-
47
-
48
- def interleave(tensor1, tensor2):
49
- res1 = torch.stack((tensor1, tensor2), dim=1).flatten(0, 1)
50
- res2 = torch.stack((tensor2, tensor1), dim=1).flatten(0, 1)
51
- return res1, res2
52
-
53
-
54
- def transpose_to_landscape(head, activate=True):
55
- """ Predict in the correct aspect-ratio,
56
- then transpose the result in landscape
57
- and stack everything back together.
58
- """
59
- def wrapper_no(decout, true_shape):
60
- B = len(true_shape)
61
- assert true_shape[0:1].allclose(true_shape), 'true_shape must be all identical'
62
- H, W = true_shape[0].cpu().tolist()
63
- res = head(decout, (H, W))
64
- return res
65
-
66
- def wrapper_yes(decout, true_shape):
67
- B = len(true_shape)
68
- # by definition, the batch is in landscape mode so W >= H
69
- H, W = int(true_shape.min()), int(true_shape.max())
70
-
71
- height, width = true_shape.T
72
- is_landscape = (width >= height)
73
- is_portrait = ~is_landscape
74
-
75
- # true_shape = true_shape.cpu()
76
- if is_landscape.all():
77
- return head(decout, (H, W))
78
- if is_portrait.all():
79
- return transposed(head(decout, (W, H)))
80
-
81
- # batch is a mix of both portraint & landscape
82
- def selout(ar): return [d[ar] for d in decout]
83
- l_result = head(selout(is_landscape), (H, W))
84
- p_result = transposed(head(selout(is_portrait), (W, H)))
85
-
86
- # allocate full result
87
- result = {}
88
- for k in l_result | p_result:
89
- x = l_result[k].new(B, *l_result[k].shape[1:])
90
- x[is_landscape] = l_result[k]
91
- x[is_portrait] = p_result[k]
92
- result[k] = x
93
-
94
- return result
95
-
96
- return wrapper_yes if activate else wrapper_no
97
-
98
-
99
- def transposed(dic):
100
- return {k: v.swapaxes(1, 2) for k, v in dic.items()}
101
-
102
-
103
- def invalid_to_nans(arr, valid_mask, ndim=999):
104
- if valid_mask is not None:
105
- arr = arr.clone()
106
- arr[~valid_mask] = float('nan')
107
- if arr.ndim > ndim:
108
- arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
109
- return arr
110
-
111
-
112
- def invalid_to_zeros(arr, valid_mask, ndim=999):
113
- if valid_mask is not None:
114
- arr = arr.clone()
115
- arr[~valid_mask] = 0
116
- nnz = valid_mask.view(len(valid_mask), -1).sum(1)
117
- else:
118
- nnz = arr.numel() // len(arr) if len(arr) else 0 # number of point per image
119
- if arr.ndim > ndim:
120
- arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
121
- return arr, nnz
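
Note on the removal above: misc.py bundled small tensor helpers for DUSt3R's symmetrized image pairs. A minimal sketch of the flip/interleave behaviour, re-implemented inline here since the module itself is deleted by this commit; the values are purely illustrative:

    import torch

    a = torch.tensor([0, 1, 2, 3])
    b = torch.tensor([10, 11, 12, 13])

    # interleave(a, b): alternate the two batches element-wise
    res1 = torch.stack((a, b), dim=1).flatten(0, 1)   # [0, 10, 1, 11, 2, 12, 3, 13]
    res2 = torch.stack((b, a), dim=1).flatten(0, 1)   # [10, 0, 11, 1, 12, 2, 13, 3]

    # flip(res1): swap even/odd entries so tensor[0::2] <=> tensor[1::2]
    flipped = torch.stack((res1[1::2], res1[0::2]), dim=1).flatten(0, 1)
    assert torch.equal(flipped, res2)
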
mini_dust3r/viz.py DELETED
@@ -1,320 +0,0 @@
- # Copyright (C) 2024-present Naver Corporation. All rights reserved.
- # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
- #
- # --------------------------------------------------------
- # Visualization utilities using trimesh
- # --------------------------------------------------------
- import PIL.Image
- import numpy as np
- from scipy.spatial.transform import Rotation
- import torch
-
- from mini_dust3r.utils.geometry import geotrf, get_med_dist_between_poses
- from mini_dust3r.utils.device import to_numpy
- from mini_dust3r.utils.image import rgb
-
- try:
-     import trimesh
- except ImportError:
-     print('/!\\ module trimesh is not installed, cannot visualize results /!\\')
-
-
- def cat_3d(vecs):
-     if isinstance(vecs, (np.ndarray, torch.Tensor)):
-         vecs = [vecs]
-     return np.concatenate([p.reshape(-1, 3) for p in to_numpy(vecs)])
-
-
- def show_raw_pointcloud(pts3d, colors, point_size=2):
-     scene = trimesh.Scene()
-
-     pct = trimesh.PointCloud(cat_3d(pts3d), colors=cat_3d(colors))
-     scene.add_geometry(pct)
-
-     scene.show(line_settings={'point_size': point_size})
-
-
- def pts3d_to_trimesh(img, pts3d, valid=None):
-     H, W, THREE = img.shape
-     assert THREE == 3
-     assert img.shape == pts3d.shape
-
-     vertices = pts3d.reshape(-1, 3)
-
-     # make squares: each pixel == 2 triangles
-     idx = np.arange(len(vertices)).reshape(H, W)
-     idx1 = idx[:-1, :-1].ravel()  # top-left corner
-     idx2 = idx[:-1, +1:].ravel()  # right-left corner
-     idx3 = idx[+1:, :-1].ravel()  # bottom-left corner
-     idx4 = idx[+1:, +1:].ravel()  # bottom-right corner
-     faces = np.concatenate((
-         np.c_[idx1, idx2, idx3],
-         np.c_[idx3, idx2, idx1],  # same triangle, but backward (cheap solution to cancel face culling)
-         np.c_[idx2, idx3, idx4],
-         np.c_[idx4, idx3, idx2],  # same triangle, but backward (cheap solution to cancel face culling)
-     ), axis=0)
-
-     # prepare triangle colors
-     face_colors = np.concatenate((
-         img[:-1, :-1].reshape(-1, 3),
-         img[:-1, :-1].reshape(-1, 3),
-         img[+1:, +1:].reshape(-1, 3),
-         img[+1:, +1:].reshape(-1, 3)
-     ), axis=0)
-
-     # remove invalid faces
-     if valid is not None:
-         assert valid.shape == (H, W)
-         valid_idxs = valid.ravel()
-         valid_faces = valid_idxs[faces].all(axis=-1)
-         faces = faces[valid_faces]
-         face_colors = face_colors[valid_faces]
-
-     assert len(faces) == len(face_colors)
-     return dict(vertices=vertices, face_colors=face_colors, faces=faces)
-
-
- def cat_meshes(meshes):
-     vertices, faces, colors = zip(*[(m['vertices'], m['faces'], m['face_colors']) for m in meshes])
-     n_vertices = np.cumsum([0]+[len(v) for v in vertices])
-     for i in range(len(faces)):
-         faces[i][:] += n_vertices[i]
-
-     vertices = np.concatenate(vertices)
-     colors = np.concatenate(colors)
-     faces = np.concatenate(faces)
-     return dict(vertices=vertices, face_colors=colors, faces=faces)
-
-
- def show_duster_pairs(view1, view2, pred1, pred2):
-     import matplotlib.pyplot as pl
-     pl.ion()
-
-     for e in range(len(view1['instance'])):
-         i = view1['idx'][e]
-         j = view2['idx'][e]
-         img1 = rgb(view1['img'][e])
-         img2 = rgb(view2['img'][e])
-         conf1 = pred1['conf'][e].squeeze()
-         conf2 = pred2['conf'][e].squeeze()
-         score = conf1.mean()*conf2.mean()
-         print(f">> Showing pair #{e} {i}-{j} {score=:g}")
-         pl.clf()
-         pl.subplot(221).imshow(img1)
-         pl.subplot(223).imshow(img2)
-         pl.subplot(222).imshow(conf1, vmin=1, vmax=30)
-         pl.subplot(224).imshow(conf2, vmin=1, vmax=30)
-         pts1 = pred1['pts3d'][e]
-         pts2 = pred2['pts3d_in_other_view'][e]
-         pl.subplots_adjust(0, 0, 1, 1, 0, 0)
-         if input('show pointcloud? (y/n) ') == 'y':
-             show_raw_pointcloud(cat(pts1, pts2), cat(img1, img2), point_size=5)
-
-
- def auto_cam_size(im_poses):
-     return 0.1 * get_med_dist_between_poses(im_poses)
-
-
- class SceneViz:
-     def __init__(self):
-         self.scene = trimesh.Scene()
-
-     def add_pointcloud(self, pts3d, color, mask=None):
-         pts3d = to_numpy(pts3d)
-         mask = to_numpy(mask)
-         if mask is None:
-             mask = [slice(None)] * len(pts3d)
-         pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
-         pct = trimesh.PointCloud(pts.reshape(-1, 3))
-
-         if isinstance(color, (list, np.ndarray, torch.Tensor)):
-             color = to_numpy(color)
-             col = np.concatenate([p[m] for p, m in zip(color, mask)])
-             assert col.shape == pts.shape
-             pct.visual.vertex_colors = uint8(col.reshape(-1, 3))
-         else:
-             assert len(color) == 3
-             pct.visual.vertex_colors = np.broadcast_to(uint8(color), pts.shape)
-
-         self.scene.add_geometry(pct)
-         return self
-
-     def add_camera(self, pose_c2w, focal=None, color=(0, 0, 0), image=None, imsize=None, cam_size=0.03):
-         pose_c2w, focal, color, image = to_numpy((pose_c2w, focal, color, image))
-         add_scene_cam(self.scene, pose_c2w, color, image, focal, screen_width=cam_size)
-         return self
-
-     def add_cameras(self, poses, focals=None, images=None, imsizes=None, colors=None, **kw):
-         def get(arr, idx): return None if arr is None else arr[idx]
-         for i, pose_c2w in enumerate(poses):
-             self.add_camera(pose_c2w, get(focals, i), image=get(images, i),
-                             color=get(colors, i), imsize=get(imsizes, i), **kw)
-         return self
-
-     def show(self, point_size=2):
-         self.scene.show(line_settings={'point_size': point_size})
-
-
- def show_raw_pointcloud_with_cams(imgs, pts3d, mask, focals, cams2world,
-                                   point_size=2, cam_size=0.05, cam_color=None):
-     """ Visualization of a pointcloud with cameras
-     imgs = (N, H, W, 3) or N-size list of [(H,W,3), ...]
-     pts3d = (N, H, W, 3) or N-size list of [(H,W,3), ...]
-     focals = (N,) or N-size list of [focal, ...]
-     cams2world = (N,4,4) or N-size list of [(4,4), ...]
-     """
-     assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals)
-     pts3d = to_numpy(pts3d)
-     imgs = to_numpy(imgs)
-     focals = to_numpy(focals)
-     cams2world = to_numpy(cams2world)
-
-     scene = trimesh.Scene()
-
-     # full pointcloud
-     pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
-     col = np.concatenate([p[m] for p, m in zip(imgs, mask)])
-     pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3))
-     scene.add_geometry(pct)
-
-     # add each camera
-     for i, pose_c2w in enumerate(cams2world):
-         if isinstance(cam_color, list):
-             camera_edge_color = cam_color[i]
-         else:
-             camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)]
-         add_scene_cam(scene, pose_c2w, camera_edge_color,
-                       imgs[i] if i < len(imgs) else None, focals[i], screen_width=cam_size)
-
-     scene.show(line_settings={'point_size': point_size})
-
-
- def add_scene_cam(scene, pose_c2w, edge_color, image=None, focal=None, imsize=None, screen_width=0.03):
-
-     if image is not None:
-         H, W, THREE = image.shape
-         assert THREE == 3
-         if image.dtype != np.uint8:
-             image = np.uint8(255*image)
-     elif imsize is not None:
-         W, H = imsize
-     elif focal is not None:
-         H = W = focal / 1.1
-     else:
-         H = W = 1
-
-     if focal is None:
-         focal = min(H, W) * 1.1  # default value
-     elif isinstance(focal, np.ndarray):
-         focal = focal[0]
-
-     # create fake camera
-     height = focal * screen_width / H
-     width = screen_width * 0.5**0.5
-     rot45 = np.eye(4)
-     rot45[:3, :3] = Rotation.from_euler('z', np.deg2rad(45)).as_matrix()
-     rot45[2, 3] = -height  # set the tip of the cone = optical center
-     aspect_ratio = np.eye(4)
-     aspect_ratio[0, 0] = W/H
-     transform = pose_c2w @ OPENGL @ aspect_ratio @ rot45
-     cam = trimesh.creation.cone(width, height, sections=4)  # , transform=transform)
-
-     # this is the image
-     if image is not None:
-         vertices = geotrf(transform, cam.vertices[[4, 5, 1, 3]])
-         faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]])
-         img = trimesh.Trimesh(vertices=vertices, faces=faces)
-         uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]])
-         img.visual = trimesh.visual.TextureVisuals(uv_coords, image=PIL.Image.fromarray(image))
-         scene.add_geometry(img)
-
-     # this is the camera mesh
-     rot2 = np.eye(4)
-     rot2[:3, :3] = Rotation.from_euler('z', np.deg2rad(2)).as_matrix()
-     vertices = np.r_[cam.vertices, 0.95*cam.vertices, geotrf(rot2, cam.vertices)]
-     vertices = geotrf(transform, vertices)
-     faces = []
-     for face in cam.faces:
-         if 0 in face:
-             continue
-         a, b, c = face
-         a2, b2, c2 = face + len(cam.vertices)
-         a3, b3, c3 = face + 2*len(cam.vertices)
-
-         # add 3 pseudo-edges
-         faces.append((a, b, b2))
-         faces.append((a, a2, c))
-         faces.append((c2, b, c))
-
-         faces.append((a, b, b3))
-         faces.append((a, a3, c))
-         faces.append((c3, b, c))
-
-     # no culling
-     faces += [(c, b, a) for a, b, c in faces]
-
-     cam = trimesh.Trimesh(vertices=vertices, faces=faces)
-     cam.visual.face_colors[:, :3] = edge_color
-     scene.add_geometry(cam)
-
-
- def cat(a, b):
-     return np.concatenate((a.reshape(-1, 3), b.reshape(-1, 3)))
-
-
- OPENGL = np.array([[1, 0, 0, 0],
-                    [0, -1, 0, 0],
-                    [0, 0, -1, 0],
-                    [0, 0, 0, 1]])
-
-
- CAM_COLORS = [(255, 0, 0), (0, 0, 255), (0, 255, 0), (255, 0, 255), (255, 204, 0), (0, 204, 204),
-               (128, 255, 255), (255, 128, 255), (255, 255, 128), (0, 0, 0), (128, 128, 128)]
-
-
- def uint8(colors):
-     if not isinstance(colors, np.ndarray):
-         colors = np.array(colors)
-     if np.issubdtype(colors.dtype, np.floating):
-         colors *= 255
-     assert 0 <= colors.min() and colors.max() < 256
-     return np.uint8(colors)
-
-
- def segment_sky(image):
-     import cv2
-     from scipy import ndimage
-
-     # Convert to HSV
-     image = to_numpy(image)
-     if np.issubdtype(image.dtype, np.floating):
-         image = np.uint8(255*image.clip(min=0, max=1))
-     hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
-
-     # Define range for blue color and create mask
-     lower_blue = np.array([0, 0, 100])
-     upper_blue = np.array([30, 255, 255])
-     mask = cv2.inRange(hsv, lower_blue, upper_blue).view(bool)
-
-     # add luminous gray
-     mask |= (hsv[:, :, 1] < 10) & (hsv[:, :, 2] > 150)
-     mask |= (hsv[:, :, 1] < 30) & (hsv[:, :, 2] > 180)
-     mask |= (hsv[:, :, 1] < 50) & (hsv[:, :, 2] > 220)
-
-     # Morphological operations
-     kernel = np.ones((5, 5), np.uint8)
-     mask2 = ndimage.binary_opening(mask, structure=kernel)
-
-     # keep only largest CC
-     _, labels, stats, _ = cv2.connectedComponentsWithStats(mask2.view(np.uint8), connectivity=8)
-     cc_sizes = stats[1:, cv2.CC_STAT_AREA]
-     order = cc_sizes.argsort()[::-1]  # bigger first
-     i = 0
-     selection = []
-     while i < len(order) and cc_sizes[order[i]] > cc_sizes[order[0]] / 2:
-         selection.append(1 + order[i])
-         i += 1
-     mask3 = np.in1d(labels, selection).reshape(labels.shape)
-
-     # Apply mask
-     return torch.from_numpy(mask3)
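
Note on the removal above: viz.py provided trimesh-based inspection of DUSt3R outputs (colored point clouds plus camera frusta). A minimal usage sketch of the SceneViz flow as it existed before this commit, assuming the pre-commit mini_dust3r.viz module; the arrays and the focal value are placeholder data for illustration only:

    # Sketch only: imports a module deleted in this commit.
    import numpy as np
    from mini_dust3r.viz import SceneViz

    H, W = 32, 32
    img = np.random.rand(H, W, 3)          # per-pixel colors in [0, 1]
    pts3d = np.random.rand(H, W, 3)        # per-pixel 3D points
    valid = np.ones((H, W), dtype=bool)    # confidence mask

    viz = SceneViz()
    viz.add_pointcloud([pts3d], [img], mask=[valid])
    viz.add_camera(np.eye(4), focal=300.0, image=np.uint8(255 * img), cam_size=0.05)
    viz.show(point_size=2)  # opens an interactive trimesh window
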