# SPDX-FileCopyrightText: Copyright (c) 2021-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
from threading import local
import torch
import torch.nn as nn
from torch_utils import persistence
from .networks_stylegan2 import Generator as StyleGAN2Backbone
from .networks_stylegan2 import ToRGBLayer, SynthesisNetwork, MappingNetwork
from .volumetric_rendering.renderer import ImportanceRenderer, sample_from_planes, generate_planes  # sample_from_planes / generate_planes are assumed to follow the EG3D renderer helpers (used by TriplaneFlexiCube below)
from .volumetric_rendering.ray_sampler import RaySampler, PatchRaySampler
import dnnlib
from pdb import set_trace as st
import math
import torch.nn.functional as F
import itertools
from ldm.modules.diffusionmodules.model import SimpleDecoder, Decoder
@persistence.persistent_class
class TriPlaneGenerator(torch.nn.Module):
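    """EG3D-style tri-plane generator.

    Maps a latent z and camera conditioning c to tri-plane features via a
    StyleGAN2 backbone, volume-renders them with ImportanceRenderer + OSGDecoder,
    and upsamples the raw rendering with a super-resolution module.
    """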
def __init__(
self,
z_dim, # Input latent (Z) dimensionality.
c_dim, # Conditioning label (C) dimensionality.
w_dim, # Intermediate latent (W) dimensionality.
img_resolution, # Output resolution.
img_channels, # Number of output color channels.
sr_num_fp16_res=0,
mapping_kwargs={}, # Arguments for MappingNetwork.
rendering_kwargs={},
sr_kwargs={},
bcg_synthesis_kwargs={},
# pifu_kwargs={},
# ada_kwargs={}, # not used, place holder
**synthesis_kwargs, # Arguments for SynthesisNetwork.
):
super().__init__()
self.z_dim = z_dim
self.c_dim = c_dim
self.w_dim = w_dim
self.img_resolution = img_resolution
self.img_channels = img_channels
self.renderer = ImportanceRenderer()
        # synthesis() unpacks two return values, so use the plain RaySampler here
        # (PatchRaySampler is only wired up in the Triplane class below).
        self.ray_sampler = RaySampler()
self.backbone = StyleGAN2Backbone(z_dim,
c_dim,
w_dim,
img_resolution=256,
img_channels=32 * 3,
mapping_kwargs=mapping_kwargs,
**synthesis_kwargs)
self.superresolution = dnnlib.util.construct_class_by_name(
class_name=rendering_kwargs['superresolution_module'],
channels=32,
img_resolution=img_resolution,
sr_num_fp16_res=sr_num_fp16_res,
sr_antialias=rendering_kwargs['sr_antialias'],
**sr_kwargs)
# self.bcg_synthesis = None
if rendering_kwargs.get('use_background', False):
self.bcg_synthesis = SynthesisNetwork(
w_dim,
img_resolution=self.superresolution.input_resolution,
img_channels=32,
**bcg_synthesis_kwargs)
            self.bcg_mapping = MappingNetwork(
                z_dim=z_dim,
                c_dim=c_dim,
                w_dim=w_dim,
                # number of ws consumed by the background synthesis network
                # (self.num_ws is not defined on this class)
                num_ws=self.bcg_synthesis.num_ws,
                **mapping_kwargs)
# New mapping network for self-adaptive camera pose, dim = 3
self.decoder = OSGDecoder(
32, {
'decoder_lr_mul': rendering_kwargs.get('decoder_lr_mul', 1),
'decoder_output_dim': 32
})
self.neural_rendering_resolution = 64
self.rendering_kwargs = rendering_kwargs
self._last_planes = None
self.pool_256 = torch.nn.AdaptiveAvgPool2d((256, 256))
def mapping(self,
z,
c,
truncation_psi=1,
truncation_cutoff=None,
update_emas=False):
if self.rendering_kwargs['c_gen_conditioning_zero']:
c = torch.zeros_like(c)
        return self.backbone.mapping(
            z,
            c * self.rendering_kwargs.get('c_scale', 0),
            truncation_psi=truncation_psi,
            truncation_cutoff=truncation_cutoff,
            update_emas=update_emas)
def synthesis(self,
ws,
c,
neural_rendering_resolution=None,
update_emas=False,
cache_backbone=False,
use_cached_backbone=False,
return_meta=False,
return_raw_only=False,
**synthesis_kwargs):
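        """Render a batch of images from intermediate latents `ws` and camera params `c`.

        `c` packs a flattened 4x4 cam2world matrix (first 16 entries) followed by a
        flattened 3x3 intrinsics matrix. Returns the super-resolved image, the raw
        neural rendering, depth, and optional sampling metadata.
        """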
return_sampling_details_flag = self.rendering_kwargs.get(
'return_sampling_details_flag', False)
if return_sampling_details_flag:
return_meta = True
cam2world_matrix = c[:, :16].view(-1, 4, 4)
# cam2world_matrix = torch.eye(4, device=c.device).unsqueeze(0).repeat_interleave(c.shape[0], dim=0)
# c[:, :16] = cam2world_matrix.view(-1, 16)
intrinsics = c[:, 16:25].view(-1, 3, 3)
if neural_rendering_resolution is None:
neural_rendering_resolution = self.neural_rendering_resolution
else:
self.neural_rendering_resolution = neural_rendering_resolution
H = W = self.neural_rendering_resolution
# Create a batch of rays for volume rendering
ray_origins, ray_directions = self.ray_sampler(
cam2world_matrix, intrinsics, neural_rendering_resolution)
# Create triplanes by running StyleGAN backbone
N, M, _ = ray_origins.shape
if use_cached_backbone and self._last_planes is not None:
planes = self._last_planes
else:
planes = self.backbone.synthesis(
ws[:, :self.backbone.num_ws, :], # ws, BS 14 512
update_emas=update_emas,
**synthesis_kwargs)
if cache_backbone:
self._last_planes = planes
# Reshape output into three 32-channel planes
planes = planes.view(len(planes), 3, 32, planes.shape[-2],
planes.shape[-1]) # BS 96 256 256
# Perform volume rendering
# st()
rendering_details = self.renderer(
planes,
self.decoder,
ray_origins,
ray_directions,
self.rendering_kwargs,
# return_meta=True)
return_meta=return_meta)
# calibs = create_calib_matrix(c)
# all_coords = rendering_details['all_coords']
# B, num_rays, S, _ = all_coords.shape
# all_coords_B3N = all_coords.reshape(B, -1, 3).permute(0,2,1)
# homo_coords = torch.cat([all_coords, torch.zeros_like(all_coords[..., :1])], -1)
# homo_coords[..., -1] = 1
# homo_coords = homo_coords.reshape(homo_coords.shape[0], -1, 4)
# homo_coords = homo_coords.permute(0,2,1)
# xyz = calibs @ homo_coords
# xyz = xyz.permute(0,2,1).reshape(B, H, W, S, 4)
# st()
# xyz_proj = perspective(all_coords_B3N, calibs)
# xyz_proj = xyz_proj.permute(0,2,1).reshape(B, H, W, S, 3) # [0,0] - [1,1]
# st()
feature_samples, depth_samples, weights_samples = (
rendering_details[k]
for k in ['feature_samples', 'depth_samples', 'weights_samples'])
if return_sampling_details_flag:
shape_synthesized = rendering_details['shape_synthesized']
else:
shape_synthesized = None
# Reshape into 'raw' neural-rendered image
feature_image = feature_samples.permute(0, 2, 1).reshape(
N, feature_samples.shape[-1], H, W).contiguous() # B 32 H W
depth_image = depth_samples.permute(0, 2, 1).reshape(N, 1, H, W)
# Run superresolution to get final image
rgb_image = feature_image[:, :3] # B 3 H W
if not return_raw_only:
sr_image = self.superresolution(
rgb_image,
feature_image,
ws[:, -1:, :], # only use the last layer
noise_mode=self.rendering_kwargs['superresolution_noise_mode'],
**{
k: synthesis_kwargs[k]
for k in synthesis_kwargs.keys() if k != 'noise_mode'
})
else:
sr_image = rgb_image
ret_dict = {
'image': sr_image,
'image_raw': rgb_image,
'image_depth': depth_image,
'weights_samples': weights_samples,
'shape_synthesized': shape_synthesized
}
if return_meta:
ret_dict.update({
# 'feature_image': feature_image,
'feature_volume':
rendering_details['feature_volume'],
'all_coords':
rendering_details['all_coords'],
'weights':
rendering_details['weights'],
})
return ret_dict
def sample(self,
coordinates,
directions,
z,
c,
truncation_psi=1,
truncation_cutoff=None,
update_emas=False,
**synthesis_kwargs):
# Compute RGB features, density for arbitrary 3D coordinates. Mostly used for extracting shapes.
ws = self.mapping(z,
c,
truncation_psi=truncation_psi,
truncation_cutoff=truncation_cutoff,
update_emas=update_emas)
planes = self.backbone.synthesis(ws,
update_emas=update_emas,
**synthesis_kwargs)
planes = planes.view(len(planes), 3, 32, planes.shape[-2],
planes.shape[-1])
return self.renderer.run_model(planes, self.decoder, coordinates,
directions, self.rendering_kwargs)
def sample_mixed(self,
coordinates,
directions,
ws,
truncation_psi=1,
truncation_cutoff=None,
update_emas=False,
**synthesis_kwargs):
# Same as sample, but expects latent vectors 'ws' instead of Gaussian noise 'z'
planes = self.backbone.synthesis(ws,
update_emas=update_emas,
**synthesis_kwargs)
planes = planes.view(len(planes), 3, 32, planes.shape[-2],
planes.shape[-1])
return self.renderer.run_model(planes, self.decoder, coordinates,
directions, self.rendering_kwargs)
def forward(self,
z,
c,
truncation_psi=1,
truncation_cutoff=None,
neural_rendering_resolution=None,
update_emas=False,
cache_backbone=False,
use_cached_backbone=False,
**synthesis_kwargs):
# Render a batch of generated images.
ws = self.mapping(z,
c,
truncation_psi=truncation_psi,
truncation_cutoff=truncation_cutoff,
update_emas=update_emas)
return self.synthesis(
ws,
c,
update_emas=update_emas,
neural_rendering_resolution=neural_rendering_resolution,
cache_backbone=cache_backbone,
use_cached_backbone=use_cached_backbone,
**synthesis_kwargs)
from .networks_stylegan2 import FullyConnectedLayer
# class OSGDecoder(torch.nn.Module):
# def __init__(self, n_features, options):
# super().__init__()
# self.hidden_dim = 64
# self.output_dim = options['decoder_output_dim']
# self.n_features = n_features
# self.net = torch.nn.Sequential(
# FullyConnectedLayer(n_features,
# self.hidden_dim,
# lr_multiplier=options['decoder_lr_mul']),
# torch.nn.Softplus(),
# FullyConnectedLayer(self.hidden_dim,
# 1 + options['decoder_output_dim'],
# lr_multiplier=options['decoder_lr_mul']))
# def forward(self, sampled_features, ray_directions):
# # Aggregate features
# sampled_features = sampled_features.mean(1)
# x = sampled_features
# N, M, C = x.shape
# x = x.view(N * M, C)
# x = self.net(x)
# x = x.view(N, M, -1)
# rgb = torch.sigmoid(x[..., 1:]) * (
# 1 + 2 * 0.001) - 0.001 # Uses sigmoid clamping from MipNeRF
# sigma = x[..., 0:1]
# return {'rgb': rgb, 'sigma': sigma}
@persistence.persistent_class
class OSGDecoder(torch.nn.Module):
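    """Small MLP that decodes aggregated tri-plane features into color features and density.

    Features from the three planes are mean-pooled, passed through a two-layer
    Softplus MLP, and split into `sigma` (1 channel) and `rgb`
    (decoder_output_dim channels) with either sigmoid (EG3D-style) or lrelu activation.
    """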
def __init__(self, n_features, options):
super().__init__()
self.hidden_dim = 64
self.decoder_output_dim = options['decoder_output_dim']
self.net = torch.nn.Sequential(
FullyConnectedLayer(n_features,
self.hidden_dim,
lr_multiplier=options['decoder_lr_mul']),
torch.nn.Softplus(),
FullyConnectedLayer(self.hidden_dim,
1 + options['decoder_output_dim'],
lr_multiplier=options['decoder_lr_mul']))
self.activation = options.get('decoder_activation', 'sigmoid')
def forward(self, sampled_features, ray_directions):
# Aggregate features
sampled_features = sampled_features.mean(1)
x = sampled_features
N, M, C = x.shape
x = x.view(N * M, C)
x = self.net(x)
x = x.view(N, M, -1)
rgb = x[..., 1:]
sigma = x[..., 0:1]
if self.activation == "sigmoid":
# Original EG3D
rgb = torch.sigmoid(rgb) * (1 + 2 * 0.001) - 0.001
elif self.activation == "lrelu":
# StyleGAN2-style, use with toRGB
rgb = torch.nn.functional.leaky_relu(rgb, 0.2,
inplace=True) * math.sqrt(2)
return {'rgb': rgb, 'sigma': sigma}
class OSGDecoderFlexicube(OSGDecoder):
# https://github.com/TencentARC/InstantMesh/blob/0a64425c6d390afa40128132cec42cd5c6408bbf/src/models/renderer/synthesizer_mesh.py#L15
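    """OSGDecoder extended with FlexiCubes heads (SDF, deformation, cube weights, texture).

    Adapted from the InstantMesh mesh synthesizer referenced above; the parent
    rgb/sigma path is kept for compatibility.
    """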
def __init__(self, n_features, options, hidden_dim: int = 64, num_layers: int = 4, activation: nn.Module = nn.ReLU):
super().__init__(n_features, options)
# self.net_sdf = nn.Sequential(
# nn.Linear(3 * n_features, hidden_dim),
# activation(),
# *itertools.chain(*[[
# nn.Linear(hidden_dim, hidden_dim),
# activation(),
# ] for _ in range(num_layers - 2)]),
# nn.Linear(hidden_dim, 1),
# )
# self.net_rgb = nn.Sequential(
# nn.Linear(3 * n_features, hidden_dim),
# activation(),
# *itertools.chain(*[[
# nn.Linear(hidden_dim, hidden_dim),
# activation(),
# ] for _ in range(num_layers - 2)]),
# nn.Linear(hidden_dim, 3),
# )
# ! sdf and rgb prediction adopts old tradition, softplus here
# TODO, load pre-trained model weights
self.net_sdf = torch.nn.Sequential(
FullyConnectedLayer(n_features,
self.hidden_dim,
lr_multiplier=options['decoder_lr_mul']),
torch.nn.Softplus(),
FullyConnectedLayer(self.hidden_dim,
1,
lr_multiplier=options['decoder_lr_mul']))
self.net_rgb = torch.nn.Sequential(
FullyConnectedLayer(n_features,
self.hidden_dim,
lr_multiplier=options['decoder_lr_mul']),
torch.nn.Softplus(),
FullyConnectedLayer(self.hidden_dim,
options['decoder_output_dim'],
lr_multiplier=options['decoder_lr_mul']))
# ! for following MLP, use new behaviour
self.net_deformation = nn.Sequential(
nn.Linear(3 * n_features, hidden_dim),
activation(),
*itertools.chain(*[[
nn.Linear(hidden_dim, hidden_dim),
activation(),
] for _ in range(num_layers - 2)]),
nn.Linear(hidden_dim, 3),
)
self.net_weight = nn.Sequential(
nn.Linear(8 * 3 * n_features, hidden_dim),
activation(),
*itertools.chain(*[[
nn.Linear(hidden_dim, hidden_dim),
activation(),
] for _ in range(num_layers - 2)]),
nn.Linear(hidden_dim, 21),
)
# init all bias to zero
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.zeros_(m.bias)
# def forward(self, sampled_features, ray_directions):
# # Aggregate features
# sampled_features = sampled_features.mean(1)
# x = sampled_features
# N, M, C = x.shape
# x = x.view(N * M, C)
# x = self.net(x)
# x = x.view(N, M, -1)
# rgb = x[..., 1:]
# sigma = x[..., 0:1]
# if self.activation == "sigmoid":
# # Original EG3D
# rgb = torch.sigmoid(rgb) * (1 + 2 * 0.001) - 0.001
# elif self.activation == "lrelu":
# # StyleGAN2-style, use with toRGB
# rgb = torch.nn.functional.leaky_relu(rgb, 0.2,
# inplace=True) * math.sqrt(2)
# return {'rgb': rgb, 'sigma': sigma}
def get_geometry_prediction(self, sampled_features, flexicubes_indices):
_N, n_planes, _M, _C = sampled_features.shape
sdf = self.net_sdf(sampled_features.mean(1)) # for compat issue
sampled_features = sampled_features.permute(0, 2, 1, 3).reshape(_N, _M, n_planes*_C)
deformation = self.net_deformation(sampled_features)
grid_features = torch.index_select(input=sampled_features, index=flexicubes_indices.reshape(-1), dim=1)
grid_features = grid_features.reshape(
sampled_features.shape[0], flexicubes_indices.shape[0], flexicubes_indices.shape[1] * sampled_features.shape[-1])
weight = self.net_weight(grid_features) * 0.1
return sdf, deformation, weight
def get_texture_prediction(self, sampled_features):
_N, n_planes, _M, _C = sampled_features.shape
# sampled_features = sampled_features.permute(0, 2, 1, 3).reshape(_N, _M, n_planes*_C)
sampled_features = sampled_features.mean(1)
rgb = self.net_rgb(sampled_features) # sigmoid feat by default
rgb = torch.sigmoid(rgb)*(1 + 2*0.001) - 0.001 # Uses sigmoid clamping from MipNeRF
return rgb
class LRMOSGDecoder(nn.Module):
"""
Triplane decoder that gives RGB and sigma values from sampled features.
    Uses ReLU here instead of Softplus in the original implementation.
Reference:
EG3D: https://github.com/NVlabs/eg3d/blob/main/eg3d/training/triplane.py#L112
"""
def __init__(self, n_features: int,
hidden_dim: int = 64, num_layers: int = 4, activation: nn.Module = nn.ReLU):
super().__init__()
self.decoder_output_dim = 3
self.net = nn.Sequential(
nn.Linear(3 * n_features, hidden_dim),
activation(),
*itertools.chain(*[[
nn.Linear(hidden_dim, hidden_dim),
activation(),
] for _ in range(num_layers - 2)]),
nn.Linear(hidden_dim, 1 + self.decoder_output_dim),
)
# init all bias to zero
for m in self.modules():
if isinstance(m, nn.Linear):
nn.init.zeros_(m.bias)
def forward(self, sampled_features, ray_directions):
# Aggregate features by mean
# sampled_features = sampled_features.mean(1)
# Aggregate features by concatenation
_N, n_planes, _M, _C = sampled_features.shape
sampled_features = sampled_features.permute(0, 2, 1, 3).reshape(_N, _M, n_planes*_C)
x = sampled_features
N, M, C = x.shape
x = x.contiguous().view(N*M, C)
x = self.net(x)
x = x.view(N, M, -1)
rgb = torch.sigmoid(x[..., 1:])*(1 + 2*0.001) - 0.001 # Uses sigmoid clamping from MipNeRF
sigma = x[..., 0:1]
return {'rgb': rgb, 'sigma': sigma}
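# Shape sketch for OSGDecoder / LRMOSGDecoder.forward above (illustrative numbers, not
# values used by the training code):
#   sampled_features: (N=2, n_planes=3, M=4096, C=32) tri-plane samples
#   OSGDecoder mean-pools over planes -> (2, 4096, 32); LRMOSGDecoder concatenates -> (2, 4096, 96)
#   both return {'rgb': (2, 4096, decoder_output_dim), 'sigma': (2, 4096, 1)}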
class Triplane(torch.nn.Module):
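    """Decoder-only tri-plane renderer (no StyleGAN backbone).

    Takes externally produced tri-plane features `planes` plus camera params `c`,
    volume-renders them, and optionally applies a super-resolution module.
    """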
def __init__(
self,
c_dim=25, # Conditioning label (C) dimensionality.
img_resolution=128, # Output resolution.
img_channels=3, # Number of output color channels.
out_chans=96,
triplane_size=224,
rendering_kwargs={},
decoder_in_chans=32,
decoder_output_dim=32,
sr_num_fp16_res=0,
sr_kwargs={},
create_triplane=False, # for overfitting single instance study
bcg_synthesis_kwargs={},
lrm_decoder=False,
):
super().__init__()
self.c_dim = c_dim
self.img_resolution = img_resolution # TODO
self.img_channels = img_channels
self.triplane_size = triplane_size
self.decoder_in_chans = decoder_in_chans
self.out_chans = out_chans
self.renderer = ImportanceRenderer()
if 'PatchRaySampler' in rendering_kwargs:
self.ray_sampler = PatchRaySampler()
else:
self.ray_sampler = RaySampler()
if lrm_decoder:
self.decoder = LRMOSGDecoder(
decoder_in_chans,)
else:
self.decoder = OSGDecoder(
decoder_in_chans,
{
'decoder_lr_mul': rendering_kwargs.get('decoder_lr_mul', 1),
# 'decoder_output_dim': 32
'decoder_output_dim': decoder_output_dim
})
self.neural_rendering_resolution = img_resolution # TODO
# self.neural_rendering_resolution = 128 # TODO
self.rendering_kwargs = rendering_kwargs
self.create_triplane = create_triplane
if create_triplane:
self.planes = nn.Parameter(torch.randn(1, out_chans, 256, 256))
if bool(sr_kwargs): # check whether empty
assert decoder_in_chans == decoder_output_dim, 'tradition'
if rendering_kwargs['superresolution_module'] in [
'torch_utils.components.PixelUnshuffleUpsample',
'torch_utils.components.NearestConvSR',
'torch_utils.components.NearestConvSR_Residual'
]:
self.superresolution = dnnlib.util.construct_class_by_name(
class_name=rendering_kwargs['superresolution_module'],
# * for PixelUnshuffleUpsample
sr_ratio=2, # 2x SR, 128 -> 256
output_dim=decoder_output_dim,
num_out_ch=3,
)
else:
self.superresolution = dnnlib.util.construct_class_by_name(
class_name=rendering_kwargs['superresolution_module'],
# * for stylegan upsample
channels=decoder_output_dim,
img_resolution=img_resolution,
sr_num_fp16_res=sr_num_fp16_res,
sr_antialias=rendering_kwargs['sr_antialias'],
**sr_kwargs)
else:
self.superresolution = None
self.bcg_synthesis = None
# * pure reconstruction
def forward(
self,
planes=None,
# img,
c=None,
ws=None,
ray_origins=None,
ray_directions=None,
z_bcg=None,
neural_rendering_resolution=None,
update_emas=False,
cache_backbone=False,
use_cached_backbone=False,
return_meta=False,
return_raw_only=False,
sample_ray_only=False,
fg_bbox=None,
**synthesis_kwargs):
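        """Volume-render externally supplied tri-plane `planes` under cameras `c`.

        With `sample_ray_only=True`, only patch ray origins/directions and bboxes are
        returned (for patch supervision); otherwise the full rendering dict is returned:
        feature image, raw RGB, depth, mask, and optional SR image / metadata.
        """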
cam2world_matrix = c[:, :16].reshape(-1, 4, 4)
# cam2world_matrix = torch.eye(4, device=c.device).unsqueeze(0).repeat_interleave(c.shape[0], dim=0)
# c[:, :16] = cam2world_matrix.view(-1, 16)
intrinsics = c[:, 16:25].reshape(-1, 3, 3)
if neural_rendering_resolution is None:
neural_rendering_resolution = self.neural_rendering_resolution
else:
self.neural_rendering_resolution = neural_rendering_resolution
if ray_directions is None: # when output video
H = W = self.neural_rendering_resolution
# Create a batch of rays for volume rendering
# ray_origins, ray_directions, ray_bboxes = self.ray_sampler(
# cam2world_matrix, intrinsics, neural_rendering_resolution)
if sample_ray_only: # ! for sampling
ray_origins, ray_directions, ray_bboxes = self.ray_sampler(
cam2world_matrix, intrinsics,
self.rendering_kwargs.get( 'patch_rendering_resolution' ),
self.neural_rendering_resolution, fg_bbox)
# for patch supervision
ret_dict = {
'ray_origins': ray_origins,
'ray_directions': ray_directions,
'ray_bboxes': ray_bboxes,
}
return ret_dict
else: # ! for rendering
ray_origins, ray_directions, _ = self.ray_sampler(
cam2world_matrix, intrinsics, self.neural_rendering_resolution,
self.neural_rendering_resolution)
else:
assert ray_origins is not None
            H = W = int(ray_directions.shape[1]**0.5)  # dynamically set patch resolution
# ! match the batch size, if not returned
if planes is None:
assert self.planes is not None
planes = self.planes.repeat_interleave(c.shape[0], dim=0)
return_sampling_details_flag = self.rendering_kwargs.get(
'return_sampling_details_flag', False)
if return_sampling_details_flag:
return_meta = True
# Create triplanes by running StyleGAN backbone
N, M, _ = ray_origins.shape
# Reshape output into three 32-channel planes
if planes.shape[1] == 3 * 2 * self.decoder_in_chans:
# if isinstance(planes, tuple):
# N *= 2
triplane_bg = True
# planes = torch.cat(planes, 0) # inference in parallel
# ray_origins = ray_origins.repeat(2,1,1)
# ray_directions = ray_directions.repeat(2,1,1)
else:
triplane_bg = False
# assert not triplane_bg
# ! hard coded, will fix later
# if planes.shape[1] == 3 * self.decoder_in_chans:
# else:
# planes = planes.view(len(planes), 3, self.decoder_in_chans,
planes = planes.reshape(
len(planes),
3,
-1, # ! support background plane
planes.shape[-2],
planes.shape[-1]) # BS 96 256 256
# Perform volume rendering
rendering_details = self.renderer(planes,
self.decoder,
ray_origins,
ray_directions,
self.rendering_kwargs,
return_meta=return_meta)
feature_samples, depth_samples, weights_samples = (
rendering_details[k]
for k in ['feature_samples', 'depth_samples', 'weights_samples'])
if return_sampling_details_flag:
shape_synthesized = rendering_details['shape_synthesized']
else:
shape_synthesized = None
# Reshape into 'raw' neural-rendered image
feature_image = feature_samples.permute(0, 2, 1).reshape(
N, feature_samples.shape[-1], H,
W).contiguous() # B 32 H W, in [-1,1]
depth_image = depth_samples.permute(0, 2, 1).reshape(N, 1, H, W)
weights_samples = weights_samples.permute(0, 2, 1).reshape(N, 1, H, W)
# Generate Background
# if self.bcg_synthesis:
# # bg composition
# # if self.decoder.activation == "sigmoid":
# # feature_image = feature_image * 2 - 1 # Scale to (-1, 1), taken from ray marcher
# assert isinstance(
# z_bcg, torch.Tensor
# ) # 512 latents after reparmaterization, reuse the name
# # ws_bcg = ws[:,:self.bcg_synthesis.num_ws] if ws_bcg is None else ws_bcg[:,:self.bcg_synthesis.num_ws]
# with torch.autocast(device_type='cuda',
# dtype=torch.float16,
# enabled=False):
# ws_bcg = self.bcg_mapping(z_bcg, c=None) # reuse the name
# if ws_bcg.size(1) < self.bcg_synthesis.num_ws:
# ws_bcg = torch.cat([
# ws_bcg, ws_bcg[:, -1:].repeat(
# 1, self.bcg_synthesis.num_ws - ws_bcg.size(1), 1)
# ], 1)
# bcg_image = self.bcg_synthesis(ws_bcg,
# update_emas=update_emas,
# **synthesis_kwargs)
# bcg_image = torch.nn.functional.interpolate(
# bcg_image,
# size=feature_image.shape[2:],
# mode='bilinear',
# align_corners=False,
# antialias=self.rendering_kwargs['sr_antialias'])
# feature_image = feature_image + (1 - weights_samples) * bcg_image
# # Generate Raw image
# assert self.torgb
# rgb_image = self.torgb(feature_image,
# ws_bcg[:, -1],
# fused_modconv=False)
# rgb_image = rgb_image.to(dtype=torch.float32,
# memory_format=torch.contiguous_format)
# # st()
# else:
mask_image = weights_samples * (1 + 2 * 0.001) - 0.001
if triplane_bg:
# true_bs = N // 2
# weights_samples = weights_samples[:true_bs]
# mask_image = mask_image[:true_bs]
# feature_image = feature_image[:true_bs] * mask_image + feature_image[true_bs:] * (1-mask_image) # the first is foreground
# depth_image = depth_image[:true_bs]
# ! composited colors
# rgb_final = (
# 1 - fg_ret_dict['weights']
# ) * bg_ret_dict['rgb_final'] + fg_ret_dict[
# 'feature_samples'] # https://github.com/SizheAn/PanoHead/blob/17ad915941c7e2703d5aa3eb5ff12eac47c90e53/training/triplane.py#L127C45-L127C64
# ret_dict.update({
# 'feature_samples': rgb_final,
# })
# st()
feature_image = (1 - mask_image) * rendering_details[
'bg_ret_dict']['rgb_final'] + feature_image
rgb_image = feature_image[:, :3]
# # Run superresolution to get final image
if self.superresolution is not None and not return_raw_only:
# assert ws is not None, 'feed in [cls] token here for SR module'
            if ws is not None and ws.ndim == 2:
                ws = ws.unsqueeze(1)[:, -1:, :]  # follow stylegan tradition, B, N, C
            sr_image = self.superresolution(
                rgb=rgb_image,
                x=feature_image,
                base_x=rgb_image,
                ws=ws,  # only use the last layer
                noise_mode=self.rendering_kwargs['superresolution_noise_mode'],  # none
                **{
                    k: synthesis_kwargs[k]
                    for k in synthesis_kwargs.keys() if k != 'noise_mode'
                })
else:
# sr_image = rgb_image
sr_image = None
if shape_synthesized is not None:
shape_synthesized.update({
'image_depth': depth_image,
}) # for 3D loss easy computation, wrap all 3D in a single dict
ret_dict = {
'feature_image': feature_image,
# 'image_raw': feature_image[:, :3],
'image_raw': rgb_image,
'image_depth': depth_image,
'weights_samples': weights_samples,
# 'silhouette': mask_image,
# 'silhouette_normalized_3channel': (mask_image*2-1).repeat_interleave(3,1), # N 3 H W
'shape_synthesized': shape_synthesized,
"image_mask": mask_image,
}
if sr_image is not None:
ret_dict.update({
'image_sr': sr_image,
})
if return_meta:
ret_dict.update({
'feature_volume':
rendering_details['feature_volume'],
'all_coords':
rendering_details['all_coords'],
'weights':
rendering_details['weights'],
})
return ret_dict
class Triplane_fg_bg_plane(Triplane):
# a separate background plane
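    """Triplane variant that composites a separately decoded background plane.

    `bg_plane` is decoded with an LDM-style Decoder and alpha-composited with the
    foreground rendering using the accumulated volume-rendering weights.
    """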
def __init__(self,
c_dim=25,
img_resolution=128,
img_channels=3,
out_chans=96,
triplane_size=224,
rendering_kwargs={},
decoder_in_chans=32,
decoder_output_dim=32,
sr_num_fp16_res=0,
sr_kwargs={},
bcg_synthesis_kwargs={}):
        super().__init__(c_dim, img_resolution, img_channels, out_chans,
                         triplane_size, rendering_kwargs, decoder_in_chans,
                         decoder_output_dim, sr_num_fp16_res, sr_kwargs,
                         bcg_synthesis_kwargs=bcg_synthesis_kwargs)
self.bcg_decoder = Decoder(
ch=64, # half channel size
out_ch=32,
# ch_mult=(1, 2, 4),
ch_mult=(1, 2), # use res=64 for now
num_res_blocks=2,
dropout=0.0,
attn_resolutions=(),
z_channels=4,
resolution=64,
in_channels=3,
)
# * pure reconstruction
def forward(
self,
planes,
bg_plane,
# img,
c,
ws=None,
z_bcg=None,
neural_rendering_resolution=None,
update_emas=False,
cache_backbone=False,
use_cached_backbone=False,
return_meta=False,
return_raw_only=False,
**synthesis_kwargs):
# ! match the batch size
if planes is None:
assert self.planes is not None
planes = self.planes.repeat_interleave(c.shape[0], dim=0)
return_sampling_details_flag = self.rendering_kwargs.get(
'return_sampling_details_flag', False)
if return_sampling_details_flag:
return_meta = True
cam2world_matrix = c[:, :16].reshape(-1, 4, 4)
# cam2world_matrix = torch.eye(4, device=c.device).unsqueeze(0).repeat_interleave(c.shape[0], dim=0)
# c[:, :16] = cam2world_matrix.view(-1, 16)
intrinsics = c[:, 16:25].reshape(-1, 3, 3)
if neural_rendering_resolution is None:
neural_rendering_resolution = self.neural_rendering_resolution
else:
self.neural_rendering_resolution = neural_rendering_resolution
H = W = self.neural_rendering_resolution
# Create a batch of rays for volume rendering
ray_origins, ray_directions, _ = self.ray_sampler(
cam2world_matrix, intrinsics, neural_rendering_resolution)
# Create triplanes by running StyleGAN backbone
N, M, _ = ray_origins.shape
# # Reshape output into three 32-channel planes
# if planes.shape[1] == 3 * 2 * self.decoder_in_chans:
# # if isinstance(planes, tuple):
# # N *= 2
# triplane_bg = True
# # planes = torch.cat(planes, 0) # inference in parallel
# # ray_origins = ray_origins.repeat(2,1,1)
# # ray_directions = ray_directions.repeat(2,1,1)
# else:
# triplane_bg = False
# assert not triplane_bg
planes = planes.view(
len(planes),
3,
-1, # ! support background plane
planes.shape[-2],
planes.shape[-1]) # BS 96 256 256
# Perform volume rendering
rendering_details = self.renderer(planes,
self.decoder,
ray_origins,
ray_directions,
self.rendering_kwargs,
return_meta=return_meta)
feature_samples, depth_samples, weights_samples = (
rendering_details[k]
for k in ['feature_samples', 'depth_samples', 'weights_samples'])
if return_sampling_details_flag:
shape_synthesized = rendering_details['shape_synthesized']
else:
shape_synthesized = None
# Reshape into 'raw' neural-rendered image
feature_image = feature_samples.permute(0, 2, 1).reshape(
N, feature_samples.shape[-1], H,
W).contiguous() # B 32 H W, in [-1,1]
depth_image = depth_samples.permute(0, 2, 1).reshape(N, 1, H, W)
weights_samples = weights_samples.permute(0, 2, 1).reshape(N, 1, H, W)
bcg_image = self.bcg_decoder(bg_plane)
bcg_image = torch.nn.functional.interpolate(
bcg_image,
size=feature_image.shape[2:],
mode='bilinear',
align_corners=False,
antialias=self.rendering_kwargs['sr_antialias'])
mask_image = weights_samples * (1 + 2 * 0.001) - 0.001
# ! fuse fg/bg model output
feature_image = feature_image + (1 - weights_samples) * bcg_image
rgb_image = feature_image[:, :3]
# # Run superresolution to get final image
if self.superresolution is not None and not return_raw_only:
# assert ws is not None, 'feed in [cls] token here for SR module'
            if ws is not None and ws.ndim == 2:
                ws = ws.unsqueeze(1)[:, -1:, :]  # follow stylegan tradition, B, N, C
            sr_image = self.superresolution(
                rgb=rgb_image,
                x=feature_image,
                base_x=rgb_image,
                ws=ws,  # only use the last layer
                noise_mode=self.rendering_kwargs['superresolution_noise_mode'],  # none
                **{
                    k: synthesis_kwargs[k]
                    for k in synthesis_kwargs.keys() if k != 'noise_mode'
                })
else:
# sr_image = rgb_image
sr_image = None
if shape_synthesized is not None:
shape_synthesized.update({
'image_depth': depth_image,
}) # for 3D loss easy computation, wrap all 3D in a single dict
ret_dict = {
'feature_image': feature_image,
# 'image_raw': feature_image[:, :3],
'image_raw': rgb_image,
'image_depth': depth_image,
'weights_samples': weights_samples,
# 'silhouette': mask_image,
# 'silhouette_normalized_3channel': (mask_image*2-1).repeat_interleave(3,1), # N 3 H W
'shape_synthesized': shape_synthesized,
"image_mask": mask_image,
}
if sr_image is not None:
ret_dict.update({
'image_sr': sr_image,
})
if return_meta:
ret_dict.update({
'feature_volume':
rendering_details['feature_volume'],
'all_coords':
rendering_details['all_coords'],
'weights':
rendering_details['weights'],
})
return ret_dict
class TriplaneFlexiCube(Triplane):
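    """Triplane variant exposing FlexiCubes geometry/texture queries.

    Assumes `self.decoder` implements get_geometry_prediction / get_texture_prediction
    (i.e. an OSGDecoderFlexicube-style decoder) and that the plane-sampling helpers
    imported from volumetric_rendering.renderer follow the EG3D convention.
    """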
def __init__(self, c_dim=25, img_resolution=128, img_channels=3, out_chans=96, triplane_size=224, rendering_kwargs={}, decoder_in_chans=32, decoder_output_dim=32, sr_num_fp16_res=0, sr_kwargs={},
create_triplane=False, # for overfitting single instance study
bcg_synthesis_kwargs={},
lrm_decoder=False,
):
        super().__init__(c_dim, img_resolution, img_channels, out_chans,
                         triplane_size, rendering_kwargs, decoder_in_chans,
                         decoder_output_dim, sr_num_fp16_res, sr_kwargs,
                         create_triplane=create_triplane,
                         bcg_synthesis_kwargs=bcg_synthesis_kwargs,
                         lrm_decoder=lrm_decoder)
        # Plane axes for tri-plane sampling; generate_planes() is assumed to follow the
        # EG3D renderer convention (the same axes ImportanceRenderer uses internally).
        self.plane_axes = generate_planes()
# https://github.com/TencentARC/InstantMesh/blob/0a64425c6d390afa40128132cec42cd5c6408bbf/src/models/renderer/synthesizer_mesh.py#L93
def get_geometry_prediction(self, planes, sample_coordinates, flexicubes_indices):
plane_axes = self.plane_axes.to(planes.device)
sampled_features = sample_from_planes(
plane_axes, planes, sample_coordinates, padding_mode='zeros', box_warp=self.rendering_kwargs['box_warp'])
sdf, deformation, weight = self.decoder.get_geometry_prediction(sampled_features, flexicubes_indices)
return sdf, deformation, weight
def get_texture_prediction(self, planes, sample_coordinates):
plane_axes = self.plane_axes.to(planes.device)
sampled_features = sample_from_planes(
plane_axes, planes, sample_coordinates, padding_mode='zeros', box_warp=self.rendering_kwargs['box_warp'])
rgb = self.decoder.get_texture_prediction(sampled_features)
return rgb
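
if __name__ == '__main__':
    # Minimal smoke test for the point decoder (a sketch; the batch/point counts below
    # are illustrative assumptions, not values used by the training code).
    decoder = OSGDecoder(32, {'decoder_lr_mul': 1, 'decoder_output_dim': 32})
    feats = torch.randn(2, 3, 4096, 32)  # (N, n_planes, M, C) sampled tri-plane features
    out = decoder(feats, ray_directions=None)  # ray_directions is unused by this decoder
    print(out['rgb'].shape, out['sigma'].shape)  # -> (2, 4096, 32), (2, 4096, 1)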