# projects/NeRF-Det/nerfdet/multiview_pipeline.py
# Copyright (c) OpenMMLab. All rights reserved.
import mmcv
import numpy as np
from mmcv.transforms import BaseTransform, Compose
from PIL import Image
from mmdet3d.registry import TRANSFORMS
def get_dtu_raydir(pixelcoords, intrinsic, rot, dir_norm=None):
    """Back-project pixel coordinates into world-frame ray directions.

    Args:
        pixelcoords: H x W x 2 array of (x, y) pixel coordinates.
        intrinsic: 3 x 3 camera intrinsic matrix.
        rot: 3 x 3 camera-to-world (c2w) rotation matrix.
        dir_norm: When truthy, L2-normalize each ray direction.

    Returns:
        Array of ray directions, shape ``pixelcoords.shape[:-1] + (3,)``.
    """
    focal_x, focal_y = intrinsic[0, 0], intrinsic[1, 1]
    center_x, center_y = intrinsic[0, 2], intrinsic[1, 2]
    # Shift to the pixel center (+0.5) before removing the principal point.
    cam_x = (pixelcoords[..., 0] + 0.5 - center_x) / focal_x
    cam_y = (pixelcoords[..., 1] + 0.5 - center_y) / focal_y
    cam_z = np.ones_like(cam_x)
    rays = np.stack((cam_x, cam_y, cam_z), axis=-1)
    # Rotate camera-frame directions into the world frame (rot is c2w):
    # equivalent to summing dirs[..., None, :] * rot over the last axis.
    rays = rays @ rot.T
    if dir_norm:
        # Small epsilon keeps the division finite for near-zero rays.
        norms = np.linalg.norm(rays, axis=-1, keepdims=True)
        rays = rays / (norms + 1e-5)
    return rays
@TRANSFORMS.register_module()
class MultiViewPipeline(BaseTransform):
    """MultiViewPipeline used in nerfdet.

    Loads and preprocesses a set of views per sample (normalize, pad), and
    optionally prepares NeRF supervision (ray directions, ground-truth RGB
    and depth crops) for a set of target views.

    Required Keys:

    - depth_info
    - img_prefix
    - img_info
    - lidar2img
    - c2w
    - cammrotc2w
    - lightpos
    - ray_info

    Modified Keys:

    - lidar2img

    Added Keys:

    - img
    - denorm_images
    - depth
    - c2w
    - camrotc2w
    - lightpos
    - pixels
    - raydirs
    - gt_images
    - gt_depths
    - nerf_sizes
    - depth_range

    Args:
        transforms (list[dict]): The transform pipeline
            used to process the imgs.
        n_images (int): The number of sampled views.
        mean (array): The mean values used in normalization.
        std (array): The variance values used in normalization.
        margin (int): The margin value. Defaults to 10.
        depth_range (array): The range of the depth.
            Defaults to [0.5, 5.5].
        loading (str): The mode of loading. Defaults to 'random'.
        nerf_target_views (int): The number of novel views.
        sample_freq (int): The frequency of sampling.
    """

    def __init__(self,
                 transforms: dict,
                 n_images: int,
                 mean: tuple = [123.675, 116.28, 103.53],
                 std: tuple = [58.395, 57.12, 57.375],
                 margin: int = 10,
                 depth_range: tuple = [0.5, 5.5],
                 loading: str = 'random',
                 nerf_target_views: int = 0,
                 sample_freq: int = 3):
        self.transforms = Compose(transforms)
        # NOTE(review): wraps only the second entry of ``transforms``;
        # presumably that entry is the loading transform reused for depth —
        # confirm against the configs that build this pipeline.
        self.depth_transforms = Compose(transforms[1])
        self.n_images = n_images
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.margin = margin
        self.depth_range = depth_range
        self.loading = loading
        self.sample_freq = sample_freq
        self.nerf_target_views = nerf_target_views

    def transform(self, results: dict) -> dict:
        """Nerfdet transform function.

        Args:
            results (dict): Result dict from loading pipeline

        Returns:
            dict: The result dict containing the processed results.
            Updated key and value are described below.

            - img (list): The loaded origin image.
            - denorm_images (list): The denormalized image.
            - depth (list): The origin depth image.
            - c2w (list): The c2w matrixes.
            - camrotc2w (list): The rotation matrixes.
            - lightpos (list): The transform parameters of the camera.
            - pixels (list): Some pixel information.
            - raydirs (list): The ray-directions.
            - gt_images (list): The groundtruth images.
            - gt_depths (list): The groundtruth depth images.
            - nerf_sizes (array): The size of the groundtruth images.
            - depth_range (array): The range of the depth.

        Here we give a detailed explanation of some keys mentioned above.
        Let P_c be the coordinate of camera, P_w be the coordinate of world.
        There is such a conversion relationship: P_c = R @ P_w + T.
        The 'camrotc2w' mentioned above corresponds to the R matrix here.
        The 'lightpos' corresponds to the T matrix here. And if you put
        R and T together, you can get the camera extrinsics matrix. It
        corresponds to the 'c2w' mentioned above.
        """
        # Per-view accumulators, appended in view order.
        imgs = []
        depths = []
        extrinsics = []
        c2ws = []
        camrotc2ws = []
        lightposes = []
        pixels = []
        raydirs = []
        gt_images = []
        gt_depths = []
        denorm_imgs_list = []
        nerf_sizes = []

        # Select which views to load: either a random subset (training)
        # or a fixed stride over all views (evaluation).
        if self.loading == 'random':
            ids = np.arange(len(results['img_info']))
            # Allow sampling with replacement when fewer views exist than
            # requested.
            replace = True if self.n_images > len(ids) else False
            ids = np.random.choice(ids, self.n_images, replace=replace)
            if self.nerf_target_views != 0:
                # Hold out ``nerf_target_views`` views as NeRF supervision
                # targets; they are removed from the source-view set.
                target_id = np.random.choice(
                    ids, self.nerf_target_views, replace=False)
                ids = np.setdiff1d(ids, target_id)
                ids = ids.tolist()
                target_id = target_id.tolist()
        else:
            ids = np.arange(len(results['img_info']))
            begin_id = 0
            # Deterministic sampling: every ``sample_freq``-th view.
            ids = np.arange(begin_id,
                            begin_id + self.n_images * self.sample_freq,
                            self.sample_freq)
            if self.nerf_target_views != 0:
                # In deterministic mode the targets are the source views
                # themselves.
                target_id = ids

        # ``ratio`` is the ori/processed height ratio of the LAST view and
        # is used below to rescale intrinsics; all views are assumed to
        # share one resolution (TODO confirm).
        ratio = 0
        size = (240, 320)
        for i in ids:
            _results = dict()
            _results['img_path'] = results['img_info'][i]['filename']
            _results = self.transforms(_results)
            imgs.append(_results['img'])
            # normalize
            for key in _results.get('img_fields', ['img']):
                _results[key] = mmcv.imnormalize(_results[key], self.mean,
                                                 self.std, True)
            _results['img_norm_cfg'] = dict(
                mean=self.mean, std=self.std, to_rgb=True)
            # pad
            for key in _results.get('img_fields', ['img']):
                padded_img = mmcv.impad(_results[key], shape=size, pad_val=0)
                _results[key] = padded_img
                _results['pad_shape'] = padded_img.shape
                _results['pad_fixed_size'] = size
            ori_shape = _results['ori_shape']
            aft_shape = _results['img_shape']
            ratio = ori_shape[0] / aft_shape[0]

            # prepare the depth information
            if 'depth_info' in results.keys():
                if '.npy' in results['depth_info'][i]['filename']:
                    # Pre-exported depth arrays are used as-is (no resize).
                    _results['depth'] = np.load(
                        results['depth_info'][i]['filename'])
                else:
                    # Depth PNGs store millimeters; convert to meters and
                    # resize to the processed image shape (w, h order).
                    _results['depth'] = np.asarray((Image.open(
                        results['depth_info'][i]['filename']))) / 1000
                    _results['depth'] = mmcv.imresize(
                        _results['depth'], (aft_shape[1], aft_shape[0]))
                depths.append(_results['depth'])

            # Keep a [0, 1] BGR copy of the un-normalized image for NeRF.
            denorm_img = mmcv.imdenormalize(
                _results['img'], self.mean, self.std, to_bgr=True).astype(
                    np.uint8) / 255.0
            denorm_imgs_list.append(denorm_img)
            height, width = padded_img.shape[:2]
            extrinsics.append(results['lidar2img']['extrinsic'][i])

        # prepare the nerf information
        if 'ray_info' in results.keys():
            # Rescale intrinsics from the original to the processed
            # resolution using the last view's ratio.
            intrinsics_nerf = results['lidar2img']['intrinsic'].copy()
            intrinsics_nerf[:2] = intrinsics_nerf[:2] / ratio
            assert self.nerf_target_views > 0
            # NOTE(review): ``i`` is reused from the loop above; from here
            # it indexes target views instead of source views.
            for i in target_id:
                c2ws.append(results['c2w'][i])
                camrotc2ws.append(results['camrotc2w'][i])
                lightposes.append(results['lightpos'][i])
                # Sample every pixel inside the margin of the padded image.
                px, py = np.meshgrid(
                    np.arange(self.margin,
                              width - self.margin).astype(np.float32),
                    np.arange(self.margin,
                              height - self.margin).astype(np.float32))
                pixelcoords = np.stack((px, py),
                                       axis=-1).astype(np.float32)  # H x W x 2
                pixels.append(pixelcoords)
                raydir = get_dtu_raydir(pixelcoords, intrinsics_nerf,
                                        results['camrotc2w'][i])
                # Flatten to (num_rays, 3) for downstream ray batching.
                raydirs.append(np.reshape(raydir.astype(np.float32), (-1, 3)))
                # read target images
                temp_results = dict()
                temp_results['img_path'] = results['img_info'][i]['filename']
                # NOTE(review): Compose appears to mutate and return the
                # same dict, so ``temp_results_`` and ``temp_results``
                # presumably alias each other — confirm.
                temp_results_ = self.transforms(temp_results)
                # normalize
                for key in temp_results.get('img_fields', ['img']):
                    temp_results[key] = mmcv.imnormalize(
                        temp_results[key], self.mean, self.std, True)
                temp_results['img_norm_cfg'] = dict(
                    mean=self.mean, std=self.std, to_rgb=True)
                # pad
                for key in temp_results.get('img_fields', ['img']):
                    padded_img = mmcv.impad(
                        temp_results[key], shape=size, pad_val=0)
                    temp_results[key] = padded_img
                    temp_results['pad_shape'] = padded_img.shape
                    temp_results['pad_fixed_size'] = size
                # denormalize target_images.
                denorm_imgs = mmcv.imdenormalize(
                    temp_results_['img'], self.mean, self.std,
                    to_bgr=True).astype(np.uint8)
                gt_rgb_shape = denorm_imgs.shape

                # Gather ground-truth RGB at the sampled pixel grid.
                gt_image = denorm_imgs[py.astype(np.int32),
                                       px.astype(np.int32), :]
                nerf_sizes.append(np.array(gt_image.shape))
                gt_image = np.reshape(gt_image, (-1, 3))
                gt_images.append(gt_image / 255.0)
                if 'depth_info' in results.keys():
                    if '.npy' in results['depth_info'][i]['filename']:
                        _results['depth'] = np.load(
                            results['depth_info'][i]['filename'])
                    else:
                        depth_image = Image.open(
                            results['depth_info'][i]['filename'])
                        _results['depth'] = np.asarray(depth_image) / 1000
                        _results['depth'] = mmcv.imresize(
                            _results['depth'],
                            (gt_rgb_shape[1], gt_rgb_shape[0]))
                    # NOTE(review): no-op self-assignment — likely leftover.
                    _results['depth'] = _results['depth']
                    # Gather ground-truth depth at the same pixel grid.
                    gt_depth = _results['depth'][py.astype(np.int32),
                                                 px.astype(np.int32)]
                    gt_depths.append(gt_depth)

        # Copy the last processed view's metadata (pad_shape, img_norm_cfg,
        # ...) into the shared results dict.
        # NOTE(review): relies on ``_results`` from the loop above; raises
        # NameError if ``ids`` is empty — confirm callers guarantee views.
        for key in _results.keys():
            if key not in ['img', 'img_info']:
                results[key] = _results[key]
        results['img'] = imgs
        if 'ray_info' in results.keys():
            results['c2w'] = c2ws
            results['camrotc2w'] = camrotc2ws
            results['lightpos'] = lightposes
            results['pixels'] = pixels
            results['raydirs'] = raydirs
            results['gt_images'] = gt_images
            results['gt_depths'] = gt_depths
            results['nerf_sizes'] = nerf_sizes
            results['denorm_images'] = denorm_imgs_list
            results['depth_range'] = np.array([self.depth_range])
        if len(depths) != 0:
            results['depth'] = depths
        results['lidar2img']['extrinsic'] = extrinsics
        return results
@TRANSFORMS.register_module()
class RandomShiftOrigin(BaseTransform):
    """Randomly jitter the scene origin with Gaussian noise.

    Args:
        std: Standard deviation of the normal noise added to each of the
            three origin coordinates.
    """

    def __init__(self, std):
        self.std = std

    def transform(self, results):
        """Add a random 3-vector offset to ``results['lidar2img']['origin']``
        in place and return the updated results dict."""
        offset = np.random.normal(loc=0.0, scale=self.std, size=3)
        results['lidar2img']['origin'] += offset
        return results