import cv2
import scipy.interpolate  # imported explicitly so scipy.interpolate resolves on all scipy versions
import numpy as np
import torch
import open3d as o3d
from tqdm import tqdm

from .camera_util import create_camera_to_world
###############################################################################
# Camera Trajectory
###############################################################################

def fibonacci_sampling_on_sphere(num_samples=1):
    """Evenly distribute points on the unit sphere using a Fibonacci lattice."""
    points = []
    phi = np.pi * (3.0 - np.sqrt(5.0))  # golden angle in radians
    for i in range(num_samples):
        # y goes from 1 to -1; max() guards against division by zero when num_samples == 1
        y = 1 - (i / float(max(num_samples - 1, 1))) * 2
        radius = np.sqrt(1 - y * y)  # radius of the circle at height y
        theta = phi * i  # golden angle increment
        x = np.cos(theta) * radius
        z = np.sin(theta) * radius
        points.append([x, y, z])
    points = np.array(points)
    return points
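
# A minimal sanity check (illustrative sketch, not part of the original API):
# every sampled point should lie on the unit sphere by construction.
def _demo_fibonacci_sampling():
    pts = fibonacci_sampling_on_sphere(num_samples=64)
    assert np.allclose(np.linalg.norm(pts, axis=-1), 1.0)
    return pts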

def get_fibonacci_cameras(N=20, radius=2.0, device='cuda'):
    """Build N camera-to-world matrices looking at the origin from a Fibonacci sphere."""
    def normalize_vecs(vectors):
        return vectors / torch.norm(vectors, dim=-1, keepdim=True)

    cam_pos = fibonacci_sampling_on_sphere(N)
    cam_pos = torch.from_numpy(cam_pos).float().to(device)
    cam_pos = cam_pos * radius
    # Look-at frame: forward points from the camera towards the origin,
    # with world +z as the up reference.
    forward_vector = normalize_vecs(-cam_pos)
    up_vector = torch.tensor([0, 0, 1], dtype=torch.float,
                             device=device).reshape(-1).expand_as(forward_vector)
    right_vector = normalize_vecs(torch.cross(forward_vector, up_vector, dim=-1))
    up_vector = normalize_vecs(torch.cross(right_vector, forward_vector, dim=-1))
    rotate = torch.stack((right_vector, -up_vector, forward_vector), dim=-1)
    rotation_matrix = torch.eye(4, device=device).unsqueeze(0).repeat(forward_vector.shape[0], 1, 1)
    rotation_matrix[:, :3, :3] = rotate
    translation_matrix = torch.eye(4, device=device).unsqueeze(0).repeat(forward_vector.shape[0], 1, 1)
    translation_matrix[:, :3, 3] = cam_pos
    cam2world = translation_matrix @ rotation_matrix
    return cam2world
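
# Illustrative usage (a sketch; device='cpu' avoids assuming a GPU is present):
def _demo_fibonacci_cameras():
    c2ws = get_fibonacci_cameras(N=20, radius=2.0, device='cpu')
    assert c2ws.shape == (20, 4, 4)
    # Every camera should sit at distance `radius` from the origin.
    dist = torch.linalg.norm(c2ws[:, :3, 3], dim=-1)
    assert torch.allclose(dist, torch.full_like(dist, 2.0), atol=1e-5)
    return c2ws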

def get_circular_cameras(N=120, elevation=0, radius=2.0, normalize=True, device='cuda'):
    """Build N camera-to-world matrices on a circular orbit at a fixed elevation (radians)."""
    camera_positions = []
    for i in range(N):
        azimuth = 2 * np.pi * i / N - np.pi / 2
        x = radius * np.cos(elevation) * np.cos(azimuth)
        y = radius * np.cos(elevation) * np.sin(azimuth)
        z = radius * np.sin(elevation)
        camera_positions.append([x, y, z])
    camera_positions = torch.from_numpy(np.array(camera_positions)).float()
    c2ws = create_camera_to_world(camera_positions, camera_system='opencv')
    if normalize:
        # Express all poses relative to a canonical camera at (0, -2, 0).
        c2ws_first = create_camera_to_world(torch.tensor([0., -2., 0.]), camera_system='opencv').unsqueeze(0)
        c2ws = torch.linalg.inv(c2ws_first) @ c2ws
    return c2ws
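
# Illustrative usage (a sketch): a 120-frame orbit at 20 degrees of elevation.
def _demo_circular_cameras():
    c2ws = get_circular_cameras(N=120, elevation=np.deg2rad(20), radius=2.0, normalize=False)
    assert c2ws.shape == (120, 4, 4)
    return c2ws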

###############################################################################
# TSDF Fusion
###############################################################################

def rgbd_to_mesh(images, depths, c2ws, fov, mesh_path, cam_elev_thr=0):
    """Fuse posed RGB-D frames into a TSDF volume and export a cleaned-up mesh."""
    # Volume spanning roughly [-2, 2]^3 at a 512^3 effective resolution.
    voxel_length = 2 * 2.0 / 512.0
    sdf_trunc = 2 * 0.02
    color_type = o3d.pipelines.integration.TSDFVolumeColorType.RGB8
    volume = o3d.pipelines.integration.ScalableTSDFVolume(
        voxel_length=voxel_length,
        sdf_trunc=sdf_trunc,
        color_type=color_type,
    )
    for i in tqdm(range(c2ws.shape[0])):
        camera_to_world = c2ws[i]
        world_to_camera = np.linalg.inv(camera_to_world)
        camera_position = camera_to_world[:3, 3]
        camera_elevation = np.rad2deg(np.arcsin(camera_position[2] / np.linalg.norm(camera_position)))
        # Skip views below the elevation threshold.
        if camera_elevation < cam_elev_thr:
            continue
        color_image = o3d.geometry.Image(np.ascontiguousarray(images[i]))
        depth_image = o3d.geometry.Image(np.ascontiguousarray(depths[i]))
        rgbd_image = o3d.geometry.RGBDImage.create_from_color_and_depth(
            color_image, depth_image, depth_scale=1.0, depth_trunc=4.0, convert_rgb_to_intensity=False
        )
        h, w = images[i].shape[0], images[i].shape[1]
        # Pinhole intrinsics derived from the horizontal field of view (degrees).
        fx = fy = w / 2. / np.tan(np.deg2rad(fov / 2.0))
        cx, cy = w / 2., h / 2.
        camera_intrinsics = o3d.camera.PinholeCameraIntrinsic()
        camera_intrinsics.set_intrinsics(w, h, fx, fy, cx, cy)
        volume.integrate(
            rgbd_image,
            camera_intrinsics,
            world_to_camera,
        )
    fused_mesh = volume.extract_triangle_mesh()
    # Drop small disconnected components (fewer than 500 triangles).
    triangle_clusters, cluster_n_triangles, cluster_area = (
        fused_mesh.cluster_connected_triangles())
    triangle_clusters = np.asarray(triangle_clusters)
    cluster_n_triangles = np.asarray(cluster_n_triangles)
    triangles_to_remove = cluster_n_triangles[triangle_clusters] < 500
    fused_mesh.remove_triangles_by_mask(triangles_to_remove)
    fused_mesh.remove_unreferenced_vertices()
    fused_mesh = fused_mesh.filter_smooth_simple(number_of_iterations=2)
    fused_mesh.compute_vertex_normals()
    o3d.io.write_triangle_mesh(mesh_path, fused_mesh)
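
# Illustrative usage (a sketch with synthetic inputs): fuse a constant-depth
# plane seen from an orbit. `images` are (H, W, 3) uint8 frames and `depths`
# are (H, W) float32 maps in the same units as the camera translations.
def _demo_rgbd_to_mesh(mesh_path='fused_demo.ply'):
    N, H, W = 8, 64, 64
    c2ws = get_circular_cameras(N=N, elevation=np.deg2rad(30), radius=2.0, normalize=False).numpy()
    images = [np.full((H, W, 3), 128, dtype=np.uint8) for _ in range(N)]
    depths = [np.full((H, W), 2.0, dtype=np.float32) for _ in range(N)]
    rgbd_to_mesh(images, depths, c2ws, fov=50.0, mesh_path=mesh_path, cam_elev_thr=0)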

###############################################################################
# Visualization
###############################################################################

def viewmatrix(lookdir, up, position):
    """Construct lookat view matrix."""
    vec2 = normalize(lookdir)
    vec0 = normalize(np.cross(up, vec2))
    vec1 = normalize(np.cross(vec2, vec0))
    m = np.stack([vec0, vec1, vec2, position], axis=1)
    return m


def normalize(x):
    """Normalization helper function."""
    return x / np.linalg.norm(x)


def generate_interpolated_path(poses, n_interp, spline_degree=5,
                               smoothness=.03, rot_weight=.1):
    """Creates a smooth spline path between input keyframe camera poses.

    Spline is calculated with poses in format (position, lookat-point, up-point).

    Args:
      poses: (n, 3, 4) array of input pose keyframes.
      n_interp: returned path will have n_interp * (n - 1) total poses.
      spline_degree: polynomial degree of B-spline.
      smoothness: parameter for spline smoothing, 0 forces exact interpolation.
      rot_weight: relative weighting of rotation/translation in spline solve.

    Returns:
      Array of new camera poses with shape (n_interp * (n - 1), 3, 4).
    """
    def poses_to_points(poses, dist):
        """Converts from pose matrices to (position, lookat, up) format."""
        pos = poses[:, :3, -1]
        lookat = poses[:, :3, -1] - dist * poses[:, :3, 2]
        up = poses[:, :3, -1] + dist * poses[:, :3, 1]
        return np.stack([pos, lookat, up], 1)

    def points_to_poses(points):
        """Converts from (position, lookat, up) format to pose matrices."""
        return np.array([viewmatrix(p - l, u - p, p) for p, l, u in points])

    def interp(points, n, k, s):
        """Runs multidimensional B-spline interpolation on the input points."""
        sh = points.shape
        pts = np.reshape(points, (sh[0], -1))
        k = min(k, sh[0] - 1)
        tck, _ = scipy.interpolate.splprep(pts.T, k=k, s=s)
        u = np.linspace(0, 1, n, endpoint=False)
        new_points = np.array(scipy.interpolate.splev(u, tck))
        new_points = np.reshape(new_points.T, (n, sh[1], sh[2]))
        return new_points

    points = poses_to_points(poses, dist=rot_weight)
    new_points = interp(points,
                        n_interp * (points.shape[0] - 1),
                        k=spline_degree,
                        s=smoothness)
    return points_to_poses(new_points)
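
# Illustrative usage (a sketch): 30 interpolated poses per keyframe pair, with
# keyframes taken from a coarse circular orbit.
def _demo_interpolated_path():
    keyframes = get_circular_cameras(N=4, elevation=0., radius=2.0, normalize=False).numpy()[:, :3, :]
    path = generate_interpolated_path(keyframes, n_interp=30, spline_degree=3)
    assert path.shape == (30 * (keyframes.shape[0] - 1), 3, 4)
    return path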

###############################################################################
# Camera Estimation
###############################################################################

def xy_grid(W, H, device=None, origin=(0, 0), unsqueeze=None, cat_dim=-1, homogeneous=False, **arange_kw):
    """Output a (H, W, 2) array of int32, with
        output[j, i, 0] = i + origin[0]
        output[j, i, 1] = j + origin[1]
    """
    if device is None:
        # numpy backend
        arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones
    else:
        # torch backend
        arange = lambda *a, **kw: torch.arange(*a, device=device, **kw)
        meshgrid, stack = torch.meshgrid, torch.stack
        ones = lambda *a: torch.ones(*a, device=device)

    tw, th = [arange(o, o + s, **arange_kw) for s, o in zip((W, H), origin)]
    grid = meshgrid(tw, th, indexing='xy')
    if homogeneous:
        # np.meshgrid returns a list, torch.meshgrid a tuple; normalize before appending.
        grid = tuple(grid) + (ones((H, W)),)
    if unsqueeze is not None:
        grid = (grid[0].unsqueeze(unsqueeze), grid[1].unsqueeze(unsqueeze))
    if cat_dim is not None:
        grid = stack(grid, cat_dim)
    return grid
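
# Illustrative usage (a sketch): pixel coordinates for a 4x3 image.
def _demo_xy_grid():
    grid = xy_grid(4, 3)  # numpy backend, shape (3, 4, 2)
    assert grid.shape == (3, 4, 2)
    assert grid[2, 1, 0] == 1 and grid[2, 1, 1] == 2  # output[j, i] == (i, j)
    return grid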

def estimate_focal(pts3d, pp=None, mask=None, min_focal=0., max_focal=np.inf):
    """
    Reprojection method, for when the absolute depth is known:
    1) estimate the camera focal using a robust estimator
    2) reproject points onto true rays, minimizing a certain error
    """
    H, W, THREE = pts3d.shape
    assert THREE == 3
    if pp is None:
        pp = torch.tensor([W / 2, H / 2]).to(pts3d)

    # centered pixel grid
    pixels = xy_grid(W, H, device=pts3d.device).view(-1, 2) - pp.view(1, 2)  # (HW, 2)
    pts3d = pts3d.view(H * W, 3).contiguous()  # (HW, 3)

    # mask points if provided
    if mask is not None:
        mask = mask.to(pts3d.device).ravel().bool()
        assert len(mask) == pts3d.shape[0]
        pts3d = pts3d[mask]
        pixels = pixels[mask]

    # Weiszfeld algorithm: initialize the focal with the l2 closed form,
    # i.e. focal = argmin sum | pixel - focal * (x, y) / z |
    xy_over_z = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(posinf=0, neginf=0)  # perspective division (x/z, y/z)
    dot_xy_px = (xy_over_z * pixels).sum(dim=-1)
    dot_xy_xy = xy_over_z.square().sum(dim=-1)
    focal = dot_xy_px.mean(dim=0) / dot_xy_xy.mean(dim=0)

    # iteratively re-weighted least squares
    for _ in range(10):
        # re-weight by the inverse of the residual distance (robust l1 estimate)
        dis = (pixels - focal.view(-1, 1) * xy_over_z).norm(dim=-1)
        w = dis.clip(min=1e-8).reciprocal()
        # update the scaling with the new weights
        focal = (w * dot_xy_px).mean(dim=0) / (w * dot_xy_xy).mean(dim=0)

    focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2))  # focal corresponding to a 60 degree fov
    focal = focal.clip(min=min_focal * focal_base, max=max_focal * focal_base)
    return focal.ravel()
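
# Illustrative check (a sketch): build a synthetic point map from a known focal
# and verify that the estimator recovers it.
def _demo_estimate_focal():
    H, W, gt_focal = 32, 48, 100.0
    pix = xy_grid(W, H).reshape(-1, 2).astype(np.float32)  # (HW, 2) pixel coordinates
    z = np.full((H * W, 1), 2.0, dtype=np.float32)
    xy = (pix - np.float32([W / 2, H / 2])) / gt_focal * z  # back-project at depth 2
    pts3d = torch.from_numpy(np.concatenate([xy, z], axis=-1)).view(H, W, 3)
    focal = estimate_focal(pts3d)
    assert torch.allclose(focal, torch.tensor([gt_focal]), rtol=1e-3)
    return focal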

def fast_pnp(pts3d, mask, focal=None, pp=None, niter_PnP=10):
    """
    Estimate a camera pose (and optionally the focal) with RANSAC-PnP.

    Inputs:
        pts3d: (H, W, 3) array of 3D points
        mask: (H, W) boolean array of valid points
        focal: known focal length in pixels, or None to search over candidates
        pp: principal point (cx, cy), defaults to the image center
    Returns:
        (focal, cam2world) on success, None otherwise.
    """
    H, W, _ = pts3d.shape
    pixels = np.mgrid[:W, :H].T.astype(float)  # (H, W, 2) with pixels[j, i] = (i, j)

    if focal is None:
        S = max(W, H)
        tentative_focals = np.geomspace(S / 2, S * 3, 21)
    else:
        tentative_focals = [focal]

    if pp is None:
        pp = (W / 2, H / 2)

    best = (0,)
    for focal in tentative_focals:
        K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])
        success, R, T, inliers = cv2.solvePnPRansac(pts3d[mask], pixels[mask], K, None,
                                                    iterationsCount=niter_PnP, reprojectionError=5,
                                                    flags=cv2.SOLVEPNP_SQPNP)
        if not success:
            continue
        score = len(inliers)
        if score > best[0]:
            best = score, R, T, focal

    if not best[0]:
        return None

    _, R, T, best_focal = best
    R = cv2.Rodrigues(R)[0]  # rotation vector to world-to-camera rotation matrix
    world2cam = np.eye(4)
    world2cam[:3, :3] = R
    world2cam[:3, 3] = T.reshape(3)
    cam2world = np.linalg.inv(world2cam)
    return best_focal, cam2world
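
# Illustrative usage (a sketch with synthetic inputs): recover the pose of a
# fronto-parallel plane of points 2 units in front of the camera; the world
# frame coincides with the camera frame, so the recovered camera should sit
# near the origin.
def _demo_fast_pnp():
    H, W, f = 24, 32, 60.0
    pix = xy_grid(W, H).astype(np.float32)            # (H, W, 2) pixel coordinates
    z = np.full((H, W, 1), 2.0, dtype=np.float32)
    xy = (pix - np.float32([W / 2, H / 2])) / f * z   # back-project at depth 2
    pts3d = np.concatenate([xy, z], axis=-1)
    mask = np.ones((H, W), dtype=bool)
    out = fast_pnp(pts3d, mask, focal=f)
    if out is not None:
        best_focal, cam2world = out
        assert np.allclose(cam2world[:3, 3], 0, atol=1e-1)
    return out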