import torch
from torch import nn
from nsr.triplane import Triplane_fg_bg_plane
# import timm
# from vit.vit_triplane import ViTTriplane, Triplane, ViTTriplaneDecomposed
from vit.vit_triplane import Triplane, ViTTriplaneDecomposed
import argparse
import inspect
import dnnlib
from guided_diffusion import dist_util
from pdb import set_trace as st
import vit.vision_transformer as vits
from guided_diffusion import logger
from .confnet import ConfNet

from ldm.modules.diffusionmodules.model import (Encoder, Decoder, MVEncoder,
                                                MVEncoderGS,
                                                MVEncoderGSDynamicInp,
                                                MVEncoderGSDynamicInp_CA)
from ldm.modules.diffusionmodules.mv_unet import MVUNet, LGM_MVEncoder
from torch.profiler import profile, record_function, ProfilerActivity
# from nsr.gs import GaussianRenderer
from nsr.gs_surfel import GaussianRenderer2DGS
# from nsr.srt.encoder import ImprovedSRTEncoderVAE, ImprovedSRTEncoderVAE_L5_vitl, ImprovedSRTEncoderVAE_mlp_ratio4, ImprovedSRTEncoderVAE_L6, ImprovedSRTEncoderVAE_mlp_ratio4_f8, ImprovedSRTEncoderVAE_mlp_ratio4_heavyPatchify, ImprovedSRTEncoderVAE_mlp_ratio4_f8_L6, ImprovedSRTEncoderVAE_mlp_ratio4_L6, HybridEncoder, ImprovedSRTEncoderVAE_mlp_ratio4_decomposed, HybridEncoderPCDStructuredLatent
from nsr.srt.encoder import *
# from ldm.modules.diffusionmodules.openaimodel import MultiViewUNetModel_Encoder


# * create pre-trained encoder & triplane / other nsr decoder
class AE(torch.nn.Module):

    def __init__(self,
                 encoder,
                 decoder,
                 img_size,
                 encoder_cls_token,
                 decoder_cls_token,
                 preprocess,
                 use_clip,
                 dino_version='v1',
                 clip_dtype=None,
                 no_dim_up_mlp=False,
                 dim_up_mlp_as_func=False,
                 uvit_skip_encoder=False,
                 confnet=None) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.img_size = img_size
        self.encoder_cls_token = encoder_cls_token
        self.decoder_cls_token = decoder_cls_token
        self.use_clip = use_clip
        self.dino_version = dino_version
        self.confnet = confnet

        # set these before any logging below references them
        self.preprocess = preprocess
        self.dim_up_mlp = None  # CLIP/B-16
        self.dim_up_mlp_as_func = dim_up_mlp_as_func

        if self.dino_version == 'v2':
            self.encoder.mask_token = None
            self.decoder.vit_decoder.mask_token = None

        if 'sd' not in self.dino_version:
            self.uvit_skip_encoder = uvit_skip_encoder
            if uvit_skip_encoder:
                logger.log(
                    f'enables uvit: length of vit_encoder.blocks: {len(self.encoder.blocks)}'
                )
                for blk in self.encoder.blocks[len(self.encoder.blocks) // 2:]:
                    blk.skip_linear = nn.Linear(2 * self.encoder.embed_dim,
                                                self.encoder.embed_dim)

                    # trunc_normal_(blk.skip_linear.weight, std=.02)
                    nn.init.constant_(blk.skip_linear.weight, 0)
                    if isinstance(
                            blk.skip_linear,
                            nn.Linear) and blk.skip_linear.bias is not None:
                        nn.init.constant_(blk.skip_linear.bias, 0)
            else:
                logger.log('uvit disabled')
        else:
            if 'dit' not in self.dino_version:  # dino vit, not dit
                self.decoder.vit_decoder.cls_token = None
                self.decoder.vit_decoder.patch_embed.proj = nn.Identity()
                self.decoder.triplane_decoder.planes = None
                self.decoder.vit_decoder.mask_token = None

        if self.use_clip:
            self.clip_dtype = clip_dtype  # torch.float16
        elif not no_dim_up_mlp and self.encoder.embed_dim != self.decoder.vit_decoder.embed_dim:
            self.dim_up_mlp = nn.Linear(self.encoder.embed_dim,
                                        self.decoder.vit_decoder.embed_dim)
            logger.log(
                f"dim_up_mlp: {self.encoder.embed_dim} -> {self.decoder.vit_decoder.embed_dim}, as_func: {self.dim_up_mlp_as_func}"
            )
        else:
            logger.log('ignore dim_up_mlp: ', no_dim_up_mlp)

        # * remove certain components to make sure no unused parameters during DDP
        # self.decoder.vit_decoder.cls_token = nn.Identity()
        torch.cuda.empty_cache()
        # self.decoder.vit_decoder.patch_embed.proj.weight = nn.Identity()
        # self.decoder.vit_decoder.patch_embed.proj.bias = nn.Identity()
    def encode(self, *args, **kwargs):
        if not self.use_clip:
            if self.dino_version == 'v1':
                latent = self.encode_dinov1(*args, **kwargs)
            elif self.dino_version == 'v2':
                if self.uvit_skip_encoder:
                    latent = self.encode_dinov2_uvit(*args, **kwargs)
                else:
                    latent = self.encode_dinov2(*args, **kwargs)
            else:
                latent = self.encoder(*args, **kwargs)
        else:
            latent = self.encode_clip(*args, **kwargs)

        return latent

    def encode_dinov1(self, x):
        # return self.encoder(img)
        x = self.encoder.prepare_tokens(x)
        for blk in self.encoder.blocks:
            x = blk(x)
        x = self.encoder.norm(x)

        if not self.encoder_cls_token:
            return x[:, 1:]

        return x
    def encode_dinov2(self, x):
        # return self.encoder(img)
        x = self.encoder.prepare_tokens_with_masks(x, masks=None)
        for blk in self.encoder.blocks:
            x = blk(x)
        x_norm = self.encoder.norm(x)

        if not self.encoder_cls_token:
            return x_norm[:, 1:]
        # else:
        #     return x_norm[:, :1]

        # return {
        #     "x_norm_clstoken": x_norm[:, 0],
        #     "x_norm_patchtokens": x_norm[:, 1:],
        # }

        return x_norm

    def encode_dinov2_uvit(self, x):
        # return self.encoder(img)
        x = self.encoder.prepare_tokens_with_masks(x, masks=None)

        # for blk in self.encoder.blocks:
        #     x = blk(x)

        skips = [x]

        # in blks
        for blk in self.encoder.blocks[0:len(self.encoder.blocks) // 2 - 1]:
            x = blk(x)  # B 3 N C
            skips.append(x)

        # mid blks
        for blk in self.encoder.blocks[len(self.encoder.blocks) // 2 -
                                       1:len(self.encoder.blocks) // 2]:
            x = blk(x)  # B 3 N C

        # out blks
        for blk in self.encoder.blocks[len(self.encoder.blocks) // 2:]:
            x = x + blk.skip_linear(torch.cat(
                [x, skips.pop()], dim=-1))  # long skip connections in uvit
            x = blk(x)  # B 3 N C

        x_norm = self.encoder.norm(x)

        if not self.decoder_cls_token:
            return x_norm[:, 1:]

        return x_norm
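    # Note on the U-ViT wiring above: with N encoder blocks, the first
    # N//2 - 1 blocks push their activations onto `skips`, the single middle
    # block runs plain, and each of the last N//2 blocks concatenates the
    # popped skip with its input and fuses them through the zero-initialised
    # `skip_linear` added in __init__ (the long skip connection of U-ViT).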
    def encode_clip(self, x):
        # * replace with the CLIP encoding pipeline
        # return self.encoder(img)
        # x = x.dtype(self.clip_dtype)
        x = self.encoder.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1],
                      -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([
            self.encoder.class_embedding.to(x.dtype) + torch.zeros(
                x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x
        ],
                      dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.encoder.positional_embedding.to(x.dtype)
        x = self.encoder.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.encoder.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        x = self.encoder.ln_post(x[:, 1:, :])  # * return the spatial tokens
        return x

        # x = self.ln_post(x[:, 0, :])  # * return the cls token
        # if self.proj is not None:
        #     x = x @ self.proj
        # return x

    def decode_wo_triplane(self, latent, c=None, img_size=None):
        if img_size is None:
            img_size = self.img_size

        if self.dim_up_mlp is not None:
            if not self.dim_up_mlp_as_func:
                latent = self.dim_up_mlp(latent)
                # return self.decoder.vit_decode(latent, img_size)
            else:
                return self.decoder.vit_decode(
                    latent, img_size,
                    dim_up_mlp=self.dim_up_mlp)  # used in vae-ldm

        return self.decoder.vit_decode(latent, img_size, c=c)
    def decode(self, latent, c, img_size=None, return_raw_only=False):
        # if img_size is None:
        #     img_size = self.img_size

        # if self.dim_up_mlp is not None:
        #     latent = self.dim_up_mlp(latent)

        latent = self.decode_wo_triplane(latent, img_size=img_size, c=c)
        # return self.decoder.triplane_decode(latent, c, return_raw_only=return_raw_only)
        return self.decoder.triplane_decode(latent, c)

    def decode_after_vae_no_render(
        self,
        ret_dict,
        img_size=None,
    ):
        if img_size is None:
            img_size = self.img_size

        assert self.dim_up_mlp is None
        # if not self.dim_up_mlp_as_func:
        #     latent = self.dim_up_mlp(latent)
        #     return self.decoder.vit_decode(latent, img_size)

        latent = self.decoder.vit_decode_backbone(ret_dict, img_size)
        ret_dict = self.decoder.vit_decode_postprocess(latent, ret_dict)
        return ret_dict

    def decode_after_vae_no_render_gs(
        self,
        ret_dict,
        img_size=None,
    ):
        ret_after_decoder = self.decode_after_vae_no_render(ret_dict, img_size)
        return self.decoder.forward_gaussians(ret_after_decoder, c=None)

    def decode_after_vae(
            self,
            # latent,
            ret_dict,  # vae_dict
            c,
            img_size=None,
            return_raw_only=False):
        ret_dict = self.decode_after_vae_no_render(ret_dict, img_size)
        return self.decoder.triplane_decode(ret_dict, c)

    def decode_confmap(self, img):
        assert self.confnet is not None
        # https://github.com/elliottwu/unsup3d/blob/dc961410d61684561f19525c2f7e9ee6f4dacb91/unsup3d/model.py#L152
        # conf_sigma_l1 = self.confnet(img)  # Bx2xHxW
        return self.confnet(img)  # Bx1xHxW

    def encode_decode(self, img, c, return_raw_only=False):
        latent = self.encode(img)
        pred = self.decode(latent, c, return_raw_only=return_raw_only)
        if self.confnet is not None:
            pred.update({
                'conf_sigma': self.decode_confmap(img)  # 224x224
            })

        return pred

    def forward(self,
                img=None,
                c=None,
                latent=None,
                behaviour='enc_dec',
                coordinates=None,
                directions=None,
                return_raw_only=False,
                *args,
                **kwargs):
        """Wrap all operations inside forward() for DDP use."""

        if behaviour == 'enc_dec':
            pred = self.encode_decode(img, c, return_raw_only=return_raw_only)
            return pred

        elif behaviour == 'enc':
            latent = self.encode(img)
            return latent

        elif behaviour == 'dec':
            assert latent is not None
            pred: dict = self.decode(latent,
                                     c,
                                     self.img_size,
                                     return_raw_only=return_raw_only)
            return pred

        elif behaviour == 'dec_wo_triplane':
            assert latent is not None
            pred: dict = self.decode_wo_triplane(latent, self.img_size)
            return pred

        elif behaviour == 'enc_dec_wo_triplane':
            # with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
            #     with record_function("encoding"):
            latent = self.encode(img, c=c, **kwargs)
            # print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

            # with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
            #     with record_function("decoding"):
            pred: dict = self.decode_wo_triplane(latent,
                                                 img_size=self.img_size,
                                                 c=c)
            # print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
            # st()
            return pred

        elif behaviour == 'encoder_vae':
            latent = self.encode(img)
            ret_dict = self.decoder.vae_reparameterization(latent, True)
            return ret_dict

        elif behaviour == 'decode_after_vae_no_render':
            pred: dict = self.decode_after_vae_no_render(latent, self.img_size)
            return pred

        elif behaviour == 'decode_gs_after_vae_no_render':
            pred: dict = self.decode_after_vae_no_render_gs(
                latent, self.img_size)
            return pred

        elif behaviour == 'decode_after_vae':
            pred: dict = self.decode_after_vae(latent, c, self.img_size)
            return pred

        # elif behaviour == 'gaussian_dec':
        #     assert latent is not None
        #     pred: dict = self.decoder.triplane_decode(
        #         latent, c, return_raw_only=return_raw_only, **kwargs)
        #     # pred: dict = self.decoder.triplane_decode(latent, c)

        elif behaviour == 'triplane_dec':
            assert latent is not None
            pred: dict = self.decoder.triplane_decode(
                latent, c, return_raw_only=return_raw_only, **kwargs)
            # pred: dict = self.decoder.triplane_decode(latent, c)

        elif behaviour == 'triplane_decode_grid':
            assert latent is not None
            pred: dict = self.decoder.triplane_decode_grid(latent, **kwargs)
            # pred: dict = self.decoder.triplane_decode(latent, c)

        elif behaviour == 'vit_postprocess_triplane_dec':
            assert latent is not None
            latent = self.decoder.vit_decode_postprocess(
                latent)  # translate spatial token from vit-decoder into 2D
            pred: dict = self.decoder.triplane_decode(
                latent, c)  # render with triplane

        elif behaviour == 'triplane_renderer':
            assert latent is not None
            pred: dict = self.decoder.triplane_renderer(
                latent, coordinates, directions)

        # elif behaviour == 'triplane_SR':
        #     assert latent is not None
        #     pred: dict = self.decoder.triplane_renderer(
        #         latent, coordinates, directions)

        elif behaviour == 'get_rendering_kwargs':
            pred = self.decoder.triplane_decoder.rendering_kwargs

        return pred
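
# Usage sketch for the behaviour-dispatched forward() above (illustrative
# only; assumes an `ae: AE` instance, a (B, 3, H, W) image batch `img` and a
# (B, 25) camera label `c`):
#
#   pred = ae(img=img, c=c, behaviour='enc_dec')             # reconstruction dict
#   latent = ae(img=img, behaviour='enc')                    # encoder tokens only
#   pred = ae(latent=latent, c=c, behaviour='triplane_dec')  # render a latent
#
# Routing every code path through forward() keeps DDP happy, since DDP only
# tracks parameters that are used inside forward().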


class AE_CLIPEncoder(AE):

    def __init__(self, encoder, decoder, img_size, cls_token) -> None:
        # Assumed wiring for this CLIP variant (the base class needs more
        # arguments than the original call supplied): reuse cls_token for
        # both token flags, no preprocessing fn, and use_clip=True.
        super().__init__(encoder,
                         decoder,
                         img_size,
                         encoder_cls_token=cls_token,
                         decoder_cls_token=cls_token,
                         preprocess=None,
                         use_clip=True)


class AE_with_Diffusion(torch.nn.Module):

    def __init__(self, auto_encoder, denoise_model) -> None:
        super().__init__()
        self.auto_encoder = auto_encoder
        self.denoise_model = denoise_model  # simply for easy MPTrainer manipulation

    def forward(self,
                img,
                c,
                behaviour='enc_dec',
                latent=None,
                *args,
                **kwargs):
        # Wrap the auto-encoder and the denoising model inside a single
        # forward() so that DDP (which only hooks forward) and MPTrainer
        # (which expects a single model) are both easy to drive.
        if behaviour == 'enc_dec':
            pred = self.auto_encoder(img, c)
            return pred
        elif behaviour == 'enc':
            latent = self.auto_encoder.encode(img)
            if self.auto_encoder.dim_up_mlp is not None:
                latent = self.auto_encoder.dim_up_mlp(latent)
            return latent
        elif behaviour == 'dec':
            assert latent is not None
            pred: dict = self.auto_encoder.decode(latent, c,
                                                  self.auto_encoder.img_size)
            return pred
        elif behaviour == 'denoise':
            assert latent is not None
            pred: dict = self.denoise_model(*args, **kwargs)
            return pred
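
# Call-pattern sketch (assumed trainer-side code, not part of this module):
#
#   wrapper = AE_with_Diffusion(auto_encoder, denoise_model)
#   latent = wrapper(img, c, behaviour='enc')
#   eps = wrapper(None, None, behaviour='denoise', latent=x_t,
#                 x=x_t, timesteps=t)  # kwargs forwarded to denoise_model
#
# The denoiser argument names (`x`, `timesteps`) are assumptions; **kwargs is
# passed straight through, so the real names come from the denoise model.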


def eg3d_options_default():

    opts = dnnlib.EasyDict(
        dict(
            cbase=32768,
            cmax=512,
            map_depth=2,
            g_class_name='nsr.triplane.TriPlaneGenerator',  # TODO
            g_num_fp16_res=0,
        ))

    return opts


def rendering_options_defaults(opts):

    rendering_options = {
        # 'image_resolution': c.training_set_kwargs.resolution,
        'image_resolution': 256,
        'disparity_space_sampling': False,
        'clamp_mode': 'softplus',
        'c_gen_conditioning_zero': True,  # if true, fill generator pose conditioning label with dummy zero vector
        # 'gpc_reg_prob': opts.gpc_reg_prob if opts.gen_pose_cond else None,
        'c_scale': opts.c_scale,  # multiplier for generator pose conditioning label
        'superresolution_noise_mode': 'none',
        'density_reg': opts.density_reg,  # strength of density regularization
        'density_reg_p_dist': opts.density_reg_p_dist,  # distance at which to sample perturbed points for density regularization
        'reg_type': opts.reg_type,  # for experimenting with variations on density regularization
        'decoder_lr_mul': 1,  # opts.decoder_lr_mul; learning rate multiplier for decoder
        'decoder_activation': 'sigmoid',
        'sr_antialias': True,
        'return_triplane_features': False,  # for DDF supervision
        'return_sampling_details_flag': False,
        # * shape default sr
        # 'superresolution_module': 'nsr.superresolution.SuperresolutionHybrid4X',
        # 'superresolution_module': 'torch_utils.components.PixelUnshuffleUpsample',
        'superresolution_module': 'torch_utils.components.NearestConvSR',
    }
    if opts.cfg == 'ffhq':
        rendering_options.update({
            'superresolution_module': 'nsr.superresolution.SuperresolutionHybrid8XDC',
            'focal': 2985.29 / 700,
            'depth_resolution': 48,  # number of uniform samples to take per ray.
            'depth_resolution_importance': 48,  # number of importance samples to take per ray.
            'bg_depth_resolution': 16,  # 4/14 in stylenerf, https://github.com/facebookresearch/StyleNeRF/blob/7f5610a058f27fcc360c6b972181983d7df794cb/conf/model/stylenerf_ffhq.yaml#L48
            'ray_start': 2.25,  # near point along each ray to start taking samples.
            'ray_end': 3.3,  # far point along each ray to stop taking samples.
            'box_warp': 1,  # the side-length of the bounding box spanned by the tri-planes; box_warp=1 means [-0.5, -0.5, -0.5] -> [0.5, 0.5, 0.5].
            'avg_camera_radius': 2.7,  # used only in the visualizer to specify camera orbit radius.
            'avg_camera_pivot': [0, 0, 0.2],  # used only in the visualizer to control center of camera rotation.
            'superresolution_noise_mode': 'random',
        })
    elif opts.cfg == 'afhq':
        rendering_options.update({
            'superresolution_module': 'nsr.superresolution.SuperresolutionHybrid8X',
            'superresolution_noise_mode': 'random',
            'focal': 4.2647,
            'depth_resolution': 48,
            'depth_resolution_importance': 48,
            'ray_start': 2.25,
            'ray_end': 3.3,
            'box_warp': 1,
            'avg_camera_radius': 2.7,
            'avg_camera_pivot': [0, 0, -0.06],
        })
    elif opts.cfg == 'shapenet':  # TODO, lies in a sphere
        rendering_options.update({
            'depth_resolution': 64,
            'depth_resolution_importance': 64,
            # * radius 1.2 setting, newly rendered images
            'ray_start': 0.2,
            'ray_end': 2.2,
            # 'ray_start': opts.ray_start,
            # 'ray_end': opts.ray_end,
            'box_warp': 2,  # TODO, how to set this value?
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'eg3d_shapenet_aug_resolution':
        rendering_options.update({
            'depth_resolution': 80,
            'depth_resolution_importance': 80,
            'ray_start': 0.1,
            'ray_end': 1.9,  # 2.6/1.7*1.2
            'box_warp': 1.1,
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'eg3d_shapenet_aug_resolution_chair':
        rendering_options.update({
            'depth_resolution': 96,
            'depth_resolution_importance': 96,
            'ray_start': 0.1,
            'ray_end': 1.9,  # 2.6/1.7*1.2
            'box_warp': 1.1,
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'eg3d_shapenet_aug_resolution_chair_128':
        rendering_options.update({
            'depth_resolution': 128,
            'depth_resolution_importance': 128,
            'ray_start': 0.1,
            'ray_end': 1.9,  # 2.6/1.7*1.2
            'box_warp': 1.1,
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'eg3d_shapenet_aug_resolution_chair_64':
        rendering_options.update({
            'depth_resolution': 64,
            'depth_resolution_importance': 64,
            'ray_start': 0.1,
            'ray_end': 1.9,  # 2.6/1.7*1.2
            'box_warp': 1.1,
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'srn_shapenet_aug_resolution_chair_128':
        rendering_options.update({
            'depth_resolution': 128,
            'depth_resolution_importance': 128,
            'ray_start': 1.25,
            'ray_end': 2.75,
            'box_warp': 1.5,
            'white_back': True,
            'avg_camera_radius': 2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'eg3d_shapenet_aug_resolution_chair_128_residualSR':
        rendering_options.update({
            'depth_resolution': 128,
            'depth_resolution_importance': 128,
            'ray_start': 0.1,
            'ray_end': 1.9,  # 2.6/1.7*1.2
            'box_warp': 1.1,
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
            'superresolution_module': 'torch_utils.components.NearestConvSR_Residual',
        })
    elif opts.cfg == 'shapenet_tuneray':  # TODO, lies in a sphere
        rendering_options.update({
            'depth_resolution': 64,
            'depth_resolution_importance': 64,
            # * radius 1.2 setting, newly rendered images
            'ray_start': opts.ray_start,
            'ray_end': opts.ray_end,
            'box_warp': opts.ray_end - opts.ray_start,  # TODO, how to set this value?
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'shapenet_tuneray_aug_resolution':  # to differentiate hwc
        rendering_options.update({
            'depth_resolution': 80,
            'depth_resolution_importance': 80,
            # * radius 1.2 setting, newly rendered images
            'ray_start': opts.ray_start,
            'ray_end': opts.ray_end,
            'box_warp': opts.ray_end - opts.ray_start,  # TODO, how to set this value?
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'shapenet_tuneray_aug_resolution_64':  # to differentiate hwc
        rendering_options.update({
            'depth_resolution': 128,
            'depth_resolution_importance': 128,
            # * radius 1.2 setting, newly rendered images
            'ray_start': opts.ray_start,
            'ray_end': opts.ray_end,
            'box_warp': opts.ray_end - opts.ray_start,  # TODO, how to set this value?
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    elif opts.cfg == 'shapenet_tuneray_aug_resolution_64_96':  # to differentiate hwc
        rendering_options.update({
            'depth_resolution': 96,
            'depth_resolution_importance': 96,
            # * radius 1.2 setting, newly rendered images
            'ray_start': opts.ray_start,
            'ray_end': opts.ray_end,
            'box_warp': opts.ray_end - opts.ray_start,  # TODO, how to set this value?
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
        })
    # ! default version
    elif opts.cfg == 'shapenet_tuneray_aug_resolution_64_96_nearestSR':  # to differentiate hwc
        rendering_options.update({
            'depth_resolution': 96,
            'depth_resolution_importance': 96,
            # * radius 1.2 setting, newly rendered images
            'ray_start': opts.ray_start,
            'ray_end': opts.ray_end,
            'box_warp': opts.ray_end - opts.ray_start,  # TODO, how to set this value?
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
            'superresolution_module': 'torch_utils.components.NearestConvSR',
        })
    # ! 64+64, since ssdnerf adopts this setting
    elif opts.cfg == 'shapenet_tuneray_aug_resolution_64_64_nearestSR':  # to differentiate hwc
        rendering_options.update({
            'depth_resolution': 64,
            'depth_resolution_importance': 64,
            # * radius 1.2 setting, newly rendered images
            'ray_start': opts.ray_start,
            'ray_end': opts.ray_end,
            'box_warp': opts.ray_end - opts.ray_start,  # TODO, how to set this value?
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
            'superresolution_module': 'torch_utils.components.NearestConvSR',
        })
    # ! 64+64+patch, since ssdnerf adopts this setting
    elif opts.cfg == 'shapenet_tuneray_aug_resolution_64_64_nearestSR_patch':  # to differentiate hwc
        rendering_options.update({
            'depth_resolution': 64,
            'depth_resolution_importance': 64,
            # * radius 1.2 setting, newly rendered images
            'ray_start': opts.ray_start,
            'ray_end': opts.ray_end,
            'box_warp': opts.ray_end - opts.ray_start,  # TODO, how to set this value?
            'white_back': True,
            'avg_camera_radius': 1.2,
            'avg_camera_pivot': [0, 0, 0],
            'superresolution_module': 'torch_utils.components.NearestConvSR',
            # patch configs
            'PatchRaySampler': True,
            # 'patch_rendering_resolution': 32,
            # 'patch_rendering_resolution': 48,
            'patch_rendering_resolution': opts.patch_rendering_resolution,
        })
elif opts.cfg == 'objverse_tuneray_aug_resolution_64_64_nearestSR': # to differentiate hwc | |
rendering_options.update({ | |
'depth_resolution': | |
64, | |
'depth_resolution_importance': | |
64, | |
# * radius 1.2 setting, newly rendered images | |
'ray_start': | |
opts.ray_start, | |
# 'auto', | |
'ray_end': | |
opts.ray_end, | |
# 'auto', | |
'box_warp': | |
opts.ray_end - opts.ray_start, # TODO, how to set this value? | |
# 2, | |
'white_back': | |
True, | |
'avg_camera_radius': | |
1.946, # ? | |
'avg_camera_pivot': [0, 0, 0], | |
'superresolution_module': | |
'torch_utils.components.NearestConvSR', | |
# patch configs | |
# 'PatchRaySampler': False, | |
# 'patch_rendering_resolution': 32, | |
# 'patch_rendering_resolution': 48, | |
# 'patch_rendering_resolution': opts.patch_rendering_resolution, | |
}) | |
elif opts.cfg == 'objverse_tuneray_aug_resolution_64_64_auto': # to differentiate hwc | |
rendering_options.update({ | |
'depth_resolution': | |
64, | |
'depth_resolution_importance': | |
64, | |
# * radius 1.2 setting, newly rendered images | |
'ray_start': | |
'auto', | |
'ray_end': | |
'auto', | |
'box_warp': | |
0.9, | |
'white_back': | |
True, | |
'radius_range': [1.5, 2], | |
# 'z_near': 1.5-0.45, # radius in [1.5, 2], https://github.com/modelscope/richdreamer/issues/12#issuecomment-1897734616 | |
# 'z_far': 2.0+0.45, | |
'sampler_bbox_min': | |
-0.45, | |
'sampler_bbox_max': | |
0.45, | |
# 'avg_camera_pivot': [0, 0, 0], # not used | |
'filter_out_of_bbox': | |
True, | |
# 'superresolution_module': | |
# 'torch_utils.components.NearestConvSR', | |
# patch configs | |
'PatchRaySampler': | |
True, | |
# 'patch_rendering_resolution': 32, | |
# 'patch_rendering_resolution': 48, | |
'patch_rendering_resolution': | |
opts.patch_rendering_resolution, | |
}) | |
rendering_options['z_near'] = rendering_options['radius_range'][ | |
0] + rendering_options['sampler_bbox_min'] | |
rendering_options['z_far'] = rendering_options['radius_range'][ | |
1] + rendering_options['sampler_bbox_max'] | |
elif opts.cfg == 'objverse_tuneray_aug_resolution_56_56_auto': # to differentiate hwc | |
rendering_options.update({ | |
'depth_resolution': | |
56, | |
'depth_resolution_importance': | |
56, | |
# * radius 1.2 setting, newly rendered images | |
'ray_start': | |
'auto', | |
'ray_end': | |
'auto', | |
'box_warp': | |
0.9, | |
'white_back': | |
True, | |
'radius_range': [1.5, 2], | |
# 'z_near': 1.5-0.45, # radius in [1.5, 2], https://github.com/modelscope/richdreamer/issues/12#issuecomment-1897734616 | |
# 'z_far': 2.0+0.45, | |
'sampler_bbox_min': | |
-0.45, | |
'sampler_bbox_max': | |
0.45, | |
# 'avg_camera_pivot': [0, 0, 0], # not used | |
'filter_out_of_bbox': | |
True, | |
# 'superresolution_module': | |
# 'torch_utils.components.NearestConvSR', | |
# patch configs | |
'PatchRaySampler': | |
True, | |
# 'patch_rendering_resolution': 32, | |
# 'patch_rendering_resolution': 48, | |
'patch_rendering_resolution': | |
opts.patch_rendering_resolution, | |
}) | |
rendering_options['z_near'] = rendering_options['radius_range'][ | |
0] + rendering_options['sampler_bbox_min'] | |
rendering_options['z_far'] = rendering_options['radius_range'][ | |
1] + rendering_options['sampler_bbox_max'] | |
elif opts.cfg == 'shapenet_tuneray_aug_resolution_64_96_nearestResidualSR': # to differentiate hwc | |
rendering_options.update({ | |
'depth_resolution': | |
96, | |
'depth_resolution_importance': | |
96, | |
# * radius 1.2 setting, newly rendered images | |
'ray_start': | |
opts.ray_start, | |
'ray_end': | |
opts.ray_end, | |
'box_warp': | |
opts.ray_end - opts.ray_start, # TODO, how to set this value? | |
'white_back': | |
True, | |
'avg_camera_radius': | |
1.2, | |
'avg_camera_pivot': [0, 0, 0], | |
'superresolution_module': | |
'torch_utils.components.NearestConvSR_Residual', | |
}) | |
elif opts.cfg == 'shapenet_tuneray_aug_resolution_64_64_nearestResidualSR': # to differentiate hwc | |
rendering_options.update({ | |
'depth_resolution': | |
64, | |
'depth_resolution_importance': | |
64, | |
# * radius 1.2 setting, newly rendered images | |
'ray_start': | |
opts.ray_start, | |
'ray_end': | |
opts.ray_end, | |
'box_warp': | |
opts.ray_end - opts.ray_start, # TODO, how to set this value? | |
'white_back': | |
True, | |
'avg_camera_radius': | |
1.2, | |
'avg_camera_pivot': [0, 0, 0], | |
'superresolution_module': | |
'torch_utils.components.NearestConvSR_Residual', | |
}) | |
elif opts.cfg == 'shapenet_tuneray_aug_resolution_64_104': # to differentiate hwc | |
rendering_options.update({ | |
'depth_resolution': 104, | |
'depth_resolution_importance': 104, | |
# * radius 1.2 setting, newly rendered images | |
'ray_start': opts.ray_start, | |
'ray_end': opts.ray_end, | |
'box_warp': | |
opts.ray_end - opts.ray_start, # TODO, how to set this value? | |
'white_back': True, | |
'avg_camera_radius': 1.2, | |
'avg_camera_pivot': [0, 0, 0], | |
}) | |
rendering_options.update({'return_sampling_details_flag': True}) | |
rendering_options.update({'return_sampling_details_flag': True}) | |
return rendering_options | |
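

# Worked example for the 'auto' ray-bound branches above (a sketch; this
# `_example_*` helper is illustrative and unused by the pipeline): with
# radius_range=[1.5, 2] and a [-0.45, 0.45]^3 sampling bbox,
# z_near = 1.5 - 0.45 = 1.05 and z_far = 2.0 + 0.45 = 2.45, i.e. the depth
# interval that can intersect the bbox from any allowed camera radius.
def _example_auto_ray_bounds(radius_range=(1.5, 2.0), bbox_half_extent=0.45):
    z_near = radius_range[0] - bbox_half_extent  # closest possible hit
    z_far = radius_range[1] + bbox_half_extent  # farthest possible hit
    return z_near, z_far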


def model_encoder_defaults():

    return dict(
        use_clip=False,
        arch_encoder="vits",
        arch_decoder="vits",
        load_pretrain_encoder=False,
        encoder_lr=1e-5,
        encoder_weight_decay=0.001,  # https://github.com/google-research/vision_transformer
        no_dim_up_mlp=False,
        dim_up_mlp_as_func=False,
        decoder_load_pretrained=True,
        uvit_skip_encoder=False,
        # vae ldm
        vae_p=1,
        ldm_z_channels=4,
        ldm_embed_dim=4,
        use_conf_map=False,
        # sd E, lite version by default
        sd_E_ch=64,
        z_channels=3 * 4,
        latent_num=768,
        sd_E_num_res_blocks=1,
        num_frames=4,
        # vit_decoder
        arch_dit_decoder='DiT2-B/2',
        return_all_dit_layers=False,
        # sd D
        # sd_D_ch=32,
        # sd_D_res_blocks=1,
        lrm_decoder=False,
        gs_rendering=False,
        surfel_rendering=False,
        plane_n=3,
        in_plane_attention=True,
        vae_dit_token_size=16,
    )


def triplane_decoder_defaults():
    opts = dict(
        triplane_fg_bg=False,
        flexicube_decoder=False,
        cfg='shapenet',
        density_reg=0.25,
        density_reg_p_dist=0.004,
        reg_type='l1',
        triplane_decoder_lr=0.0025,  # follow eg3d G lr
        super_resolution_lr=0.0025,
        # triplane_decoder_wd=0.1,
        c_scale=1,
        nsr_lr=0.02,
        triplane_size=224,
        decoder_in_chans=32,
        triplane_in_chans=-1,
        decoder_output_dim=3,
        out_chans=96,
        c_dim=25,  # Conditioning label (C) dimensionality.
        # ray_start=0.2,
        # ray_end=2.2,
        ray_start=0.6,  # shapenet default
        ray_end=1.8,
        rendering_kwargs={},
        sr_training=False,
        bcg_synthesis=False,  # from panohead
        bcg_synthesis_kwargs={},  # G_kwargs.copy()
        image_size=128,  # raw 3D rendering output resolution.
        patch_rendering_resolution=45,
    )

    # else:
    #     assert False, "Need to specify config"

    # opts = dict(opts)
    # opts.pop('cfg')

    return opts
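

# Note: the 'shapenet_tuneray*' branches of rendering_options_defaults() set
# box_warp = ray_end - ray_start (1.8 - 0.6 = 1.2 with the defaults above),
# so the tri-plane bounding box exactly spans the sampled depth range.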


def vit_decoder_defaults():
    res = dict(
        vit_decoder_lr=1e-5,  # follow eg3d G lr
        vit_decoder_wd=0.001,
    )
    return res


def nsr_decoder_defaults():
    res = {
        'decomposed': False,
    }  # TODO, add defaults for all nsr
    res.update(triplane_decoder_defaults())  # triplane by default now
    res.update(vit_decoder_defaults())  # type: ignore
    return res


def loss_defaults():
    opt = dict(
        color_criterion='mse',
        l2_lambda=1.0,
        lpips_lambda=0.,
        lpips_delay_iter=0,
        sr_delay_iter=0,
        # kl_anneal=0,
        kl_anneal=False,
        latent_lambda=0.,
        latent_criterion='mse',
        kl_lambda=0.0,
        pt_ft_kl=False,
        ft_kl=False,
        # pt_kl_lambda=1e-8,
        # kl_anneal=False,
        ssim_lambda=0.,
        l1_lambda=0.,
        id_lambda=0.0,
        depth_lambda=0.0,  # TODO
        alpha_lambda=0.0,  # TODO
        fg_mse=False,
        bg_lamdba=0.0,
        density_reg=0.0,  # tv loss in eg3d
        density_reg_p_dist=0.004,  # density regularization strength.
        density_reg_every=4,  # lazy density reg
        # 3D supervision, ffhq/afhq eg3d warm up
        shape_uniform_lambda=0.005,
        shape_importance_lambda=0.01,
        shape_depth_lambda=0.,
        xyz_lambda=0.0,
        emd_lambda=0.0,
        cd_lambda=0.0,
        pruning_ot_lambda=0.0,
        # 2dgs
        lambda_normal=0.0,
        lambda_dist=0.0,
        # gghead reg
        lambda_scale_reg=0.0,
        lambda_opa_reg=0.0,
        # gan loss
        rec_cvD_lambda=0.01,
        nvs_cvD_lambda=0.025,
        patchgan_disc_factor=0.01,
        patchgan_disc_g_weight=0.2,
        r1_gamma=1.0,  # ffhq default value for eg3d
        sds_lamdba=1.0,
        nvs_D_lr_mul=1,  # compared with 1e-4
        cano_D_lr_mul=1,  # compared with 1e-4
        # lsgm loss
        ce_balanced_kl=1.,
        p_eps_lambda=1,
        # symmetric loss
        symmetry_loss=False,
        depth_smoothness_lambda=0.0,
        ce_lambda=1.0,
        negative_entropy_lambda=1.0,
        grad_clip=False,
        online_mask=False,  # in unsup3d
        fps_sampling=False,  # for emd loss
        subset_fps_sampling=False,  # for emd loss
        subset_half_fps_sampling=False,
        # gaussian loss
        commitment_loss_lambda=0.0,
        rand_aug_bg=False,
    )
    return opt


def dataset_defaults():
    res = dict(
        use_lmdb=False,
        dataset_size=-1,
        use_wds=False,
        use_lmdb_compressed=True,
        compile=False,
        interval=1,
        objv_dataset=False,
        decode_encode_img_only=False,
        load_wds_diff=False,
        load_wds_latent=False,
        eval_load_wds_instance=True,
        shards_lst="",
        eval_shards_lst="",
        mv_input=False,
        duplicate_sample=True,
        orthog_duplicate=False,
        split_chunk_input=False,  # split=8 per chunk
        load_real=False,
        load_mv_real=False,
        load_gso=False,
        four_view_for_latent=False,
        single_view_for_i23d=False,
        shuffle_across_cls=False,
        load_extra_36_view=False,
        mv_latent_dir='',
        append_depth=False,
        append_xyz=False,
        read_normal=False,
        plucker_embedding=False,
        perturb_pcd_scale=0.0,
        gs_cam_format=False,
        frame_0_as_canonical=False,  # transform the first pose to a fixed position
        pcd_path=None,
        stage_1_output_dir='',
        load_pcd=False,
        use_chunk=False,  # jpeg chunk
        split_chunk_size=8,
        load_caption_dataset=False,
        load_mv_dataset=False,
        export_mesh=False,
    )
    return res


def encoder_and_nsr_defaults():
    """
    Defaults for image training.
    """
    # ViT configs
    res = dict(
        dino_version='v1',
        encoder_in_channels=3,
        img_size=[224],
        patch_size=16,  # ViT-S/16
        in_chans=384,
        num_classes=0,
        embed_dim=384,  # Check ViT encoder dim
        depth=6,
        num_heads=16,
        mlp_ratio=4.,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.1,
        attn_drop_rate=0.,
        drop_path_rate=0.,
        norm_layer='nn.LayerNorm',
        # img_resolution=128,  # Output resolution.
        cls_token=False,
        # image_size=128,  # rendered output resolution.
        # img_channels=3,  # Number of output color channels.
        encoder_cls_token=False,
        decoder_cls_token=False,
        sr_kwargs={},
        sr_ratio=2,
        # sd configs
    )
    # Triplane configs
    res.update(model_encoder_defaults())
    res.update(nsr_decoder_defaults())
    res.update(
        ae_classname='vit.vit_triplane.ViTTriplaneDecomposed')  # if add SR

    return res


def create_3DAE_model(
        arch_encoder,
        arch_decoder,
        dino_version='v1',
        img_size=[224],
        patch_size=16,
        in_chans=384,
        num_classes=0,
        embed_dim=1024,  # Check ViT encoder dim
        depth=6,
        num_heads=16,
        mlp_ratio=4.,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.1,
        attn_drop_rate=0.,
        drop_path_rate=0.,
        # norm_layer=nn.LayerNorm,
        norm_layer='nn.LayerNorm',
        out_chans=96,
        decoder_in_chans=32,
        triplane_in_chans=-1,
        decoder_output_dim=32,
        encoder_cls_token=False,
        decoder_cls_token=False,
        c_dim=25,  # Conditioning label (C) dimensionality.
        image_size=128,  # Output resolution.
        img_channels=3,  # Number of output color channels.
        rendering_kwargs={},
        load_pretrain_encoder=False,
        decomposed=True,
        triplane_size=224,
        ae_classname='ViTTriplaneDecomposed',
        use_clip=False,
        sr_kwargs={},
        sr_ratio=2,
        no_dim_up_mlp=False,
        dim_up_mlp_as_func=False,
        decoder_load_pretrained=True,
        uvit_skip_encoder=False,
        bcg_synthesis_kwargs={},
        # decoder params
        vae_p=1,
        ldm_z_channels=4,
        vae_dit_token_size=16,
        ldm_embed_dim=4,
        use_conf_map=False,
        triplane_fg_bg=False,
        flexicube_decoder=False,
        encoder_in_channels=3,
        sd_E_ch=64,
        z_channels=3 * 4,
        latent_num=768,
        sd_E_num_res_blocks=1,
        num_frames=4,
        arch_dit_decoder='DiT2-B/2',
        in_plane_attention=True,
        lrm_decoder=False,
        gs_rendering=False,
        surfel_rendering=False,
        return_all_dit_layers=False,
        plane_n=3,
        *args,
        **kwargs):
    # TODO, check pre-trained ViT encoder cfgs
    preprocess = None
    clip_dtype = None

    if load_pretrain_encoder:
        if not use_clip:
            if dino_version == 'v1':
                encoder = torch.hub.load(
                    'facebookresearch/dino:main',
                    'dino_{}{}'.format(arch_encoder, patch_size))
                logger.log(
                    f'loaded pre-trained dino v1 ViT-S{patch_size} encoder ckpt'
                )
            elif dino_version == 'v2':
                encoder = torch.hub.load(
                    'facebookresearch/dinov2',
                    'dinov2_{}{}'.format(arch_encoder, patch_size))
                logger.log(
                    f'loaded pre-trained dino v2 {arch_encoder}{patch_size} encoder ckpt'
                )
            elif 'sd' in dino_version:  # just for compat
                if 'mv' in dino_version:
                    if 'lgm' in dino_version:
                        encoder = MVUNet(
                            input_size=256,
                            up_channels=(1024, 1024, 512, 256,
                                         128),  # one more decoder
                            up_attention=(True, True, True, False, False),
                            splat_size=128,
                            output_size=
                            512,  # render & supervise Gaussians at a higher resolution.
                            batch_size=8,
                            num_views=8,
                            gradient_accumulation_steps=1,
                            # mixed_precision='bf16',
                        )
                    elif 'gs' in dino_version:
                        encoder_cls = MVEncoder
                    else:
                        encoder_cls = MVEncoder
                else:
                    encoder_cls = Encoder

                if 'lgm' not in dino_version:  # the LGM U-Net above is already instantiated
                    encoder = encoder_cls(  # mono input
                        double_z=True,
                        resolution=256,
                        in_channels=encoder_in_channels,
                        # ch=128,
                        ch=64,  # ! fit in the memory
                        # ch_mult=[1,2,4,4],
                        # num_res_blocks=2,
                        ch_mult=[1, 2, 4, 4],
                        num_res_blocks=1,
                        dropout=0.0,
                        attn_resolutions=[],
                        out_ch=3,  # unused
                        z_channels=z_channels,
                    )  # stable diffusion encoder
            else:
                raise NotImplementedError()
        else:
            import clip
            model, preprocess = clip.load("ViT-B/16", device=dist_util.dev())
            model.float()  # convert weight to float32
            clip_dtype = model.dtype
            encoder = getattr(
                model, 'visual')  # only use the CLIP visual encoder here
            encoder.requires_grad_(False)
            logger.log(
                f'loaded pre-trained CLIP ViT-B{patch_size} encoder, fixed.')
    elif 'sd' in dino_version:

        attn_kwargs = {}
        attn_type = "mv-vanilla"
        if 'mv' in dino_version:
            if 'lgm' in dino_version:
                encoder = LGM_MVEncoder(
                    in_channels=9,
                    # input_size=256,
                    up_channels=(1024, 1024, 512, 256,
                                 128),  # one more decoder
                    up_attention=(True, True, True, False, False),
                    # splat_size=128,
                    # output_size=512,  # render & supervise Gaussians at a higher resolution.
                    # batch_size=8,
                    # num_views=8,
                    # gradient_accumulation_steps=1,
                    # mixed_precision='bf16',
                )
            elif 'srt' in dino_version:
                if 'hybrid' in dino_version:
                    encoder_cls = HybridEncoder  # best overall performance
                    # if 'vitl' in dino_version:
                    #     encoder_cls = ImprovedSRTEncoderVAE_L5_vitl
                    # elif 'l6' in dino_version:
                    #     encoder_cls = ImprovedSRTEncoderVAE_L6
                    # elif 'mlp4' in dino_version:
                    #     if 'f8' in dino_version:
                    #         # encoder_cls = ImprovedSRTEncoderVAE_mlp_ratio4_f8
                    #         encoder_cls = ImprovedSRTEncoderVAE_mlp_ratio4_f8_L6
                elif 'l6' in dino_version:
                    encoder_cls = ImprovedSRTEncoderVAE_mlp_ratio4_L6
                elif 'heavy' in dino_version:
                    encoder_cls = ImprovedSRTEncoderVAE_mlp_ratio4_heavyPatchify
                elif 'decomposed' in dino_version:
                    encoder_cls = ImprovedSRTEncoderVAE_mlp_ratio4_decomposed
                elif 'pcd-structured' in dino_version:
                    attn_kwargs = {
                        'n_heads': 8,
                        'd_head': 64,
                    }
                    if 'pc2' in dino_version:
                        encoder_cls = HybridEncoderPCDStructuredLatentSNoPCD_PC2  # pixel-aligned by rasterization projection
                    elif 'nopcd' in dino_version:
                        encoder_cls = HybridEncoderPCDStructuredLatentSNoPCD
                    elif 'uniformfps' in dino_version:
                        encoder_cls = HybridEncoderPCDStructuredLatentUniformFPS
                    elif 'pixelaligned' in dino_version:
                        encoder_cls = HybridEncoderPCDStructuredLatentSNoPCD_PixelAlignedQuery
                    else:
                        encoder_cls = HybridEncoderPCDStructuredLatent
                else:
                    encoder_cls = ImprovedSRTEncoderVAE_mlp_ratio4  # best overall performance
                # else:  # default version
                #     encoder_cls = ImprovedSRTEncoderVAE
            elif 'gs' in dino_version:
                if 'dynaInp' in dino_version:
                    if 'ca' in dino_version:
                        encoder_cls = MVEncoderGSDynamicInp_CA
                    else:
                        encoder_cls = MVEncoderGSDynamicInp
                else:
                    encoder_cls = MVEncoderGS
                attn_kwargs = {
                    'n_heads': 8,
                    'd_head': 64,
                }
            else:
                if 'dynaInp' in dino_version:
                    if 'ca' in dino_version:
                        encoder_cls = MVEncoderGSDynamicInp_CA
                    else:
                        encoder_cls = MVEncoderGSDynamicInp
                else:
                    encoder_cls = MVEncoder
                attn_kwargs = {
                    'n_heads': 8,
                    'd_head': 64,
                }
        else:
            encoder_cls = Encoder

        if 'lgm' not in dino_version:  # TODO, for compat now
            # st()
            encoder = encoder_cls(
                double_z=True,
                resolution=256,
                in_channels=encoder_in_channels,
                # ch=128,
                # ch=64,  # ! fit in the memory
                ch=sd_E_ch,
                # ch_mult=[1,2,4,4],
                # num_res_blocks=2,
                ch_mult=[1, 2, 4, 4],
                # num_res_blocks=1,
                num_res_blocks=sd_E_num_res_blocks,
                num_frames=num_frames,
                dropout=0.0,
                attn_resolutions=[],
                out_ch=3,  # unused
                z_channels=z_channels,  # 4 * 3
                attn_kwargs=attn_kwargs,
                attn_type=attn_type,
                latent_num=latent_num,
            )  # stable diffusion encoder
    else:
        encoder = vits.__dict__[arch_encoder](
            patch_size=patch_size,
            drop_path_rate=drop_path_rate,  # stochastic depth
            img_size=img_size)
    assert decomposed
    if decomposed:
        if not gs_rendering:
            if triplane_in_chans == -1:
                triplane_in_chans = decoder_in_chans

            if triplane_fg_bg:
                triplane_renderer_cls = Triplane_fg_bg_plane
            elif flexicube_decoder:
                triplane_renderer_cls = Triplane  # currently the same renderer as the default branch
            else:
                triplane_renderer_cls = Triplane

            # triplane_decoder = Triplane(
            triplane_decoder = triplane_renderer_cls(
                c_dim,  # Conditioning label (C) dimensionality.
                image_size,  # Output resolution.
                img_channels,  # Number of output color channels.
                rendering_kwargs=rendering_kwargs,
                out_chans=out_chans,
                # create_triplane=True,  # compatibility, remove later
                triplane_size=triplane_size,
                decoder_in_chans=triplane_in_chans,
                decoder_output_dim=decoder_output_dim,
                sr_kwargs=sr_kwargs,
                bcg_synthesis_kwargs=bcg_synthesis_kwargs,
                lrm_decoder=lrm_decoder)
        elif surfel_rendering:
            triplane_decoder = GaussianRenderer2DGS(
                image_size, out_chans, rendering_kwargs=rendering_kwargs)
        # else:
        #     triplane_decoder = GaussianRenderer(
        #         image_size, out_chans, rendering_kwargs=rendering_kwargs)
        if load_pretrain_encoder:
            if dino_version == 'v1':
                vit_decoder = torch.hub.load(
                    'facebookresearch/dino:main',
                    'dino_{}{}'.format(arch_decoder, patch_size))
                logger.log(
                    'loaded pre-trained decoder',
                    "facebookresearch/dino:main', 'dino_{}{}".format(
                        arch_decoder, patch_size))
            else:
                vit_decoder = torch.hub.load(
                    'facebookresearch/dinov2',
                    'dinov2_{}{}'.format(arch_decoder, patch_size),
                    pretrained=decoder_load_pretrained)
                logger.log(
                    'loaded pre-trained decoder',
                    "facebookresearch/dinov2', 'dinov2_{}{}".format(
                        arch_decoder, patch_size),
                    'pretrained=', decoder_load_pretrained)
        elif 'dit' in dino_version:
            from dit.dit_decoder import DiT2_models, DiTBlock

            # st()
            vit_decoder = DiT2_models[arch_dit_decoder](
                input_size=16,
                num_classes=0,
                learn_sigma=False,
                in_channels=embed_dim,
                mixed_prediction=False,
                context_dim=None,  # add CLIP text embedding
                roll_out=True,
                plane_n=4 if ('gs' in dino_version
                              and 'trilatent' not in dino_version) else 3,
                return_all_layers=return_all_dit_layers,
                in_plane_attention=in_plane_attention,
                vit_blk=DiTBlock,
            )
        else:  # has bug on global token, to fix
            vit_decoder = vits.__dict__[arch_decoder](
                patch_size=patch_size,
                drop_path_rate=drop_path_rate,  # stochastic depth
                img_size=img_size)
        # decoder = ViTTriplaneDecomposed(vit_decoder, triplane_decoder)
        # if True:
        decoder_kwargs = dict(
            class_name=ae_classname,
            vit_decoder=vit_decoder,
            triplane_decoder=triplane_decoder,
            # encoder_cls_token=encoder_cls_token,
            cls_token=decoder_cls_token,
            sr_ratio=sr_ratio,
            vae_p=vae_p,
            ldm_z_channels=ldm_z_channels,
            ldm_embed_dim=ldm_embed_dim,
            vae_dit_token_size=vae_dit_token_size,
            plane_n=plane_n,
        )

        decoder = dnnlib.util.construct_class_by_name(**decoder_kwargs)

    else:
        # deprecated; unreachable since `decomposed` is asserted above, and
        # the ViTTriplane import is commented out at the top of this file.
        decoder = ViTTriplane(
            img_size,
            patch_size,
            in_chans,
            num_classes,
            embed_dim,
            depth,
            num_heads,
            mlp_ratio,
            qkv_bias,
            qk_scale,
            drop_rate,
            attn_drop_rate,
            drop_path_rate,
            norm_layer,
            out_chans,
            cls_token,
            c_dim,  # Conditioning label (C) dimensionality.
            image_size,  # Output resolution.
            img_channels,  # Number of output color channels.
            # TODO, replace with c
            rendering_kwargs=rendering_kwargs,
        )
    # if return_encoder_decoder:
    #     return encoder, decoder, img_size[0], cls_token
    # else:

    if use_conf_map:
        confnet = ConfNet(cin=3, cout=1, nf=64, zdim=128)
    else:
        confnet = None

    auto_encoder = AE(
        encoder,
        decoder,
        img_size[0],
        encoder_cls_token,
        decoder_cls_token,
        preprocess,
        use_clip,
        dino_version,
        clip_dtype,
        no_dim_up_mlp=no_dim_up_mlp,
        dim_up_mlp_as_func=dim_up_mlp_as_func,
        uvit_skip_encoder=uvit_skip_encoder,
        confnet=confnet,
    )

    logger.log(auto_encoder)
    torch.cuda.empty_cache()

    return auto_encoder
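

# Minimal end-to-end sketch (illustrative only and unused by the pipeline;
# the real entry points drive this through argparse on top of the default
# dicts above):
def _example_create_default_ae():
    kwargs = encoder_and_nsr_defaults()
    kwargs.update(decomposed=True)  # create_3DAE_model asserts this path
    kwargs['rendering_kwargs'] = rendering_options_defaults(
        dnnlib.EasyDict(kwargs))  # cfg='shapenet' by default
    return create_3DAE_model(**kwargs)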


# def create_3DAE_Diffusion_model(
#         arch_encoder,
#         arch_decoder,
#         img_size=[224],
#         patch_size=16,
#         in_chans=384,
#         num_classes=0,
#         embed_dim=1024,  # Check ViT encoder dim
#         depth=6,
#         num_heads=16,
#         mlp_ratio=4.,
#         qkv_bias=False,
#         qk_scale=None,
#         drop_rate=0.1,
#         attn_drop_rate=0.,
#         drop_path_rate=0.,
#         # norm_layer=nn.LayerNorm,
#         norm_layer='nn.LayerNorm',
#         out_chans=96,
#         decoder_in_chans=32,
#         decoder_output_dim=32,
#         cls_token=False,
#         c_dim=25,  # Conditioning label (C) dimensionality.
#         img_resolution=128,  # Output resolution.
#         img_channels=3,  # Number of output color channels.
#         rendering_kwargs={},
#         load_pretrain_encoder=False,
#         decomposed=True,
#         triplane_size=224,
#         ae_classname='ViTTriplaneDecomposed',
#         # return_encoder_decoder=False,
#         *args,
#         **kwargs):
#     # TODO, check pre-trained ViT encoder cfgs
#     encoder, decoder, img_size, cls_token = create_3DAE_model(
#         arch_encoder,
#         arch_decoder,
#         img_size,
#         patch_size,
#         in_chans,
#         num_classes,
#         embed_dim,  # Check ViT encoder dim
#         depth,
#         num_heads,
#         mlp_ratio,
#         qkv_bias,
#         qk_scale,
#         drop_rate,
#         attn_drop_rate,
#         drop_path_rate,
#         # norm_layer=nn.LayerNorm,
#         norm_layer,
#         out_chans=96,
#         decoder_in_chans=32,
#         decoder_output_dim=32,
#         cls_token=False,
#         c_dim=25,  # Conditioning label (C) dimensionality.
#         img_resolution=128,  # Output resolution.
#         img_channels=3,  # Number of output color channels.
#         rendering_kwargs={},
#         load_pretrain_encoder=False,
#         decomposed=True,
#         triplane_size=224,
#         ae_classname='ViTTriplaneDecomposed',
#         return_encoder_decoder=False,
#         *args,
#         **kwargs)  # type: ignore


def create_Triplane(
        c_dim=25,  # Conditioning label (C) dimensionality.
        img_resolution=128,  # Output resolution.
        img_channels=3,  # Number of output color channels.
        rendering_kwargs={},
        decoder_output_dim=32,
        *args,
        **kwargs):

    decoder = Triplane(
        c_dim,  # Conditioning label (C) dimensionality.
        img_resolution,  # Output resolution.
        img_channels,  # Number of output color channels.
        # TODO, replace with c
        rendering_kwargs=rendering_kwargs,
        create_triplane=True,
        decoder_output_dim=decoder_output_dim)

    return decoder


def DiT_defaults():
    return {
        'dit_model': "DiT-B/16",
        'vae': "ema",
        # dit_model="DiT-XL/2",
        # dit_patch_size=8,
    }