import sys
import os

# Fetch the EVA3D source tree so its modules (options, model, dataset, utils) can be imported,
# and copy its assets next to this script.
os.system("git clone https://github.com/hongfz16/EVA3D.git")
sys.path.append("EVA3D")
os.system("cp -r EVA3D/assets .")

os.system(f"{sys.executable} -m pip install -U fvcore plotly")

# Build the PyTorch3D wheel tag from the running Python, CUDA and PyTorch versions,
# then install the matching prebuilt wheel.
import torch
pyt_version_str = torch.__version__.split("+")[0].replace(".", "")
version_str = "".join([
    f"py3{sys.version_info.minor}_cu",
    torch.version.cuda.replace(".", ""),
    f"_pyt{pyt_version_str}"
])

os.system(f"{sys.executable} -m pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html")

import os
import html
import glob
import uuid
import hashlib
import requests
from tqdm import tqdm
from pdb import set_trace as st

from download_models import download_file

# Metadata (Google Drive URL, expected size and MD5) for the released files; the actual
# download below goes through the Hugging Face Hub instead.
eva3d_deepfashion_model = dict(file_url='https://drive.google.com/uc?id=1SYPjxnHz3XPRhTarx_Lw8SG_iz16QUMU',
                               alt_url='', file_size=160393221, file_md5='d0fae86edf76c52e94223bd3f39b2157',
                               file_path='checkpoint/512x256_deepfashion/volume_renderer/models_0420000.pt',)

smpl_model = dict(file_url='https://drive.google.com/uc?id={}'.format(os.environ.get('smpl_link', '')),
                  alt_url='', file_size=39001280, file_md5='65dc7f162f3ef21a38637663c57e14a7',
                  file_path='smpl_models/smpl/SMPL_NEUTRAL.pkl',)

from huggingface_hub import hf_hub_download


def download_pretrained_models():
    """Fetch the EVA3D DeepFashion checkpoint and the SMPL neutral body model from the
    Hugging Face Hub (using the access token in the hf_token environment variable) and
    copy them to the paths the EVA3D code expects."""
    print('Downloading EVA3D model pretrained on DeepFashion.')
    eva3d_ckpt = hf_hub_download(repo_id="hongfz16/EVA3D", filename="models_0420000.pt", token=os.environ['hf_token'])
    os.system("mkdir -p checkpoint/512x256_deepfashion/volume_renderer")
    os.system("mkdir -p smpl_models/smpl")
    os.system(f"cp {eva3d_ckpt} checkpoint/512x256_deepfashion/volume_renderer/models_0420000.pt")

    print('Downloading SMPL model.')
    smpl_pkl = hf_hub_download(repo_id="hongfz16/EVA3D", filename="SMPL_NEUTRAL.pkl", token=os.environ['hf_token'])
    os.system(f"cp {smpl_pkl} smpl_models/smpl/SMPL_NEUTRAL.pkl")


download_pretrained_models()
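

# Optional sanity check (a sketch, not part of the original download flow): the metadata dicts
# above already record the expected size and MD5 of each file, so the copied checkpoints could
# be verified with the imported hashlib module. verify_download is a hypothetical helper and is
# not called anywhere in this demo.
def verify_download(meta):
    """Return True if the file at meta['file_path'] matches the recorded size and MD5 checksum."""
    path = meta['file_path']
    if os.path.getsize(path) != meta['file_size']:
        return False
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(1 << 20), b''):
            digest.update(block)
    return digest.hexdigest() == meta['file_md5']
# Example usage: assert verify_download(eva3d_deepfashion_model) and verify_download(smpl_model)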


import os
import torch
import trimesh
import imageio
import pickle
import numpy as np
from munch import *
from PIL import Image
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils import data
from torchvision import utils
from torchvision import transforms
from skimage.measure import marching_cubes
from scipy.spatial import Delaunay
from scipy.spatial.transform import Rotation as R

# EVA3D modules, importable because the cloned repo was appended to sys.path above.
from options import BaseOptions
from model import VoxelHumanGenerator as Generator
from dataset import DeepFashionDataset, DemoDataset
from utils import (
    generate_camera_params,
    align_volume,
    extract_mesh_with_marching_cubes,
    xyz2mesh,
    requires_grad,
    create_mesh_renderer,
    create_cameras
)
from pytorch3d.io import load_objs_as_meshes, load_obj
from pytorch3d.structures import Meshes
from pytorch3d.renderer import (
    FoVPerspectiveCameras, look_at_view_transform, look_at_rotation,
    RasterizationSettings, MeshRenderer, MeshRasterizer, BlendParams,
    SoftSilhouetteShader, HardPhongShader, PointLights, TexturesVertex,
)

# Fix the random seeds so repeated runs of the demo produce the same samples.
torch.random.manual_seed(8888)
import random
random.seed(8888)

# Total horizontal camera sweep used for the panning video (60 degrees, i.e. +/- 30 degrees).
panning_angle = np.pi / 3
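# Frame schedule in generate_video below (120 frames at 30 fps, i.e. a 4 s clip): frames 0-29 pan
# from the un-rotated view to +panning_angle/2, frames 30-89 sweep across to -panning_angle/2,
# and frames 90-119 return to the un-rotated view.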


def sample_latent(opt, device):
    # Placeholder; latent codes are sampled directly with torch.randn in setup() and get_mesh().
    return


def generate_rgb(opt, g_ema, device, mean_latent, sample_z, sample_trans, sample_beta, sample_theta, sample_cam_extrinsics, sample_focals):
    requires_grad(g_ema, False)
    g_ema.is_train = False
    g_ema.train_renderer = False
    img_list = []
    # Render the same identity from three viewpoints: root orientation rotated by +pi/8, 0 and -pi/8 about the y axis.
    for k in range(3):
        if k == 0:
            delta = R.from_rotvec(np.pi/8 * np.array([0, 1, 0]))
        elif k == 2:
            delta = R.from_rotvec(-np.pi/8 * np.array([0, 1, 0]))
        else:
            delta = R.from_rotvec(0 * np.array([0, 1, 0]))
        r = R.from_rotvec(sample_theta[0, :3].cpu().numpy())
        new_r = delta * r
        new_sample_theta = sample_theta.clone()
        new_sample_theta[0, :3] = torch.from_numpy(new_r.as_rotvec()).to(device)

        with torch.no_grad():
            j = 0
            chunk = 1
            out = g_ema([sample_z[j:j+chunk]],
                        sample_cam_extrinsics[j:j+chunk],
                        sample_focals[j:j+chunk],
                        sample_beta[j:j+chunk],
                        new_sample_theta[j:j+chunk],
                        sample_trans[j:j+chunk],
                        truncation=opt.truncation_ratio,
                        truncation_latent=mean_latent,
                        return_eikonal=False,
                        return_normal=False,
                        return_mask=False,
                        fix_viewdir=True)

        rgb_images_thumbs = out[1].detach().cpu()[..., :3].permute(0, 3, 1, 2)
        g_ema.zero_grad()
        img_list.append(rgb_images_thumbs)

    # Save the three views side by side as a single figure.
    utils.save_image(torch.cat(img_list, 0),
                     os.path.join(opt.results_dst_dir, 'images_paper_fig', '{}.png'.format(str(0).zfill(7))),
                     nrow=3,
                     normalize=True,
                     range=(-1, 1),
                     padding=0,)


def generate_mesh(opt, g_ema, device, mean_latent, sample_z, sample_trans, sample_beta, sample_theta, sample_cam_extrinsics, sample_focals):
    # Map the latent code through the style network, query the posed SDF on a 350^3 grid,
    # run marching cubes and lightly smooth the result.
    latent = g_ema.styles_and_noise_forward(sample_z[:1], None, opt.truncation_ratio,
                                            mean_latent, False)

    sdf = g_ema.renderer.marching_cube_posed(latent[0], sample_beta, sample_theta, resolution=350, size=1.4).detach()
    marching_cubes_mesh, _, _ = extract_mesh_with_marching_cubes(sdf, level_set=0)
    marching_cubes_mesh = trimesh.smoothing.filter_humphrey(marching_cubes_mesh, beta=0.2, iterations=5)

    return marching_cubes_mesh
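# Note: setup() also creates a 'marching_cubes_meshes_posed' results folder, so the extracted mesh
# could additionally be written to disk for download (a possible extension, not done in this demo),
# e.g. inside generate_mesh:
#   marching_cubes_mesh.export(os.path.join(opt.results_dst_dir, 'marching_cubes_meshes_posed', 'mesh_0000000.obj'))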


def generate_video(opt, g_ema, device, mean_latent, sample_z, sample_trans, sample_beta, sample_theta, sample_cam_extrinsics, sample_focals):
    video_list = []
    for k in tqdm(range(120)):
        # Panning schedule: sweep the root orientation between +/- panning_angle / 2.
        if k < 30:
            angle = (panning_angle / 2) * (k / 30)
        elif k >= 30 and k < 90:
            angle = panning_angle / 2 - panning_angle * ((k - 30) / 60)
        else:
            angle = -panning_angle / 2 * ((120 - k) / 30)
        delta = R.from_rotvec(angle * np.array([0, 1, 0]))
        r = R.from_rotvec(sample_theta[0, :3].cpu().numpy())
        new_r = delta * r
        new_sample_theta = sample_theta.clone()
        new_sample_theta[0, :3] = torch.from_numpy(new_r.as_rotvec()).to(device)
        with torch.no_grad():
            j = 0
            chunk = 1
            out = g_ema([sample_z[j:j+chunk]],
                        sample_cam_extrinsics[j:j+chunk],
                        sample_focals[j:j+chunk],
                        sample_beta[j:j+chunk],
                        new_sample_theta[j:j+chunk],
                        sample_trans[j:j+chunk],
                        truncation=opt.truncation_ratio,
                        truncation_latent=mean_latent,
                        return_eikonal=False,
                        return_normal=False,
                        return_mask=False,
                        fix_viewdir=True)
        rgb_images_thumbs = out[1].detach().cpu()[..., :3]
        g_ema.zero_grad()
        # Convert from [-1, 1] to [0, 255] for the video writer.
        video_list.append((rgb_images_thumbs.numpy() + 1) / 2. * 255. + 0.5)
    all_img = np.concatenate(video_list, 0).astype(np.uint8)
    imageio.mimwrite(os.path.join(opt.results_dst_dir, 'images_paper_video', 'video_{}.mp4'.format(str(0).zfill(7))), all_img, fps=30, quality=8)


def setup():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    opt = BaseOptions().parse()

    # Inference configuration matching the released 512x256 DeepFashion checkpoint.
    opt.training.batch = 1
    opt.training.chunk = 1
    opt.experiment.expname = '512x256_deepfashion'
    opt.dataset.dataset_path = 'demodataset'
    opt.rendering.depth = 5
    opt.rendering.width = 128
    opt.model.style_dim = 128
    opt.model.renderer_spatial_output_dim = [512, 256]
    opt.training.no_sphere_init = True
    opt.rendering.input_ch_views = 3
    opt.rendering.white_bg = True
    opt.model.voxhuman_name = 'eva3d_deepfashion'
    opt.training.deltasdf = True
    opt.rendering.N_samples = 28
    opt.experiment.ckpt = '420000'
    opt.inference.identities = 1
    opt.inference.truncation_ratio = 0.6

    opt.model.is_test = True
    opt.model.freeze_renderer = False
    opt.rendering.no_features_output = True
    opt.rendering.offset_sampling = True
    opt.rendering.static_viewdirs = True
    opt.rendering.force_background = True
    opt.rendering.perturb = 0
    opt.inference.size = opt.model.size
    opt.inference.camera = opt.camera
    opt.inference.renderer_output_size = opt.model.renderer_spatial_output_dim
    opt.inference.style_dim = opt.model.style_dim
    opt.inference.project_noise = opt.model.project_noise
    opt.inference.return_xyz = opt.rendering.return_xyz

    checkpoints_dir = os.path.join('checkpoint', opt.experiment.expname, 'volume_renderer')
    checkpoint_path = os.path.join(checkpoints_dir,
                                   'models_{}.pt'.format(opt.experiment.ckpt.zfill(7)))

    result_model_dir = 'iter_{}'.format(opt.experiment.ckpt.zfill(7))

    # Output layout: <results_dir>/<expname>/iter_0420000/{random,fixed}_angles/...
    results_dir_basename = os.path.join(opt.inference.results_dir, opt.experiment.expname)
    opt.inference.results_dst_dir = os.path.join(results_dir_basename, result_model_dir)
    if opt.inference.fixed_camera_angles:
        opt.inference.results_dst_dir = os.path.join(opt.inference.results_dst_dir, 'fixed_angles')
    else:
        opt.inference.results_dst_dir = os.path.join(opt.inference.results_dst_dir, 'random_angles')
    os.makedirs(opt.inference.results_dst_dir, exist_ok=True)
    os.makedirs(os.path.join(opt.inference.results_dst_dir, 'images_paper_fig'), exist_ok=True)
    os.makedirs(os.path.join(opt.inference.results_dst_dir, 'images_paper_video'), exist_ok=True)
    os.makedirs(os.path.join(opt.inference.results_dst_dir, 'marching_cubes_meshes_posed'), exist_ok=True)
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)

    # Build the generator and load the pretrained weights, skipping any tensor whose shape does not match.
    g_ema = Generator(opt.model, opt.rendering, full_pipeline=False, voxhuman_name=opt.model.voxhuman_name).to(device)
    pretrained_weights_dict = checkpoint["g_ema"]
    model_dict = g_ema.state_dict()
    for k, v in pretrained_weights_dict.items():
        if v.size() == model_dict[k].size():
            model_dict[k] = v
        else:
            print(k)

    g_ema.load_state_dict(model_dict)

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)])

    # These file lists point to the original training cluster; with dataset_path set to
    # 'demodataset' above, the demo falls back to DemoDataset.
    if 'deepfashion' in opt.dataset.dataset_path:
        file_list = '/mnt/lustre/fzhong/smplify-x/deepfashion_train_list/deepfashion_train_list_MAN.txt'
    elif '20w_fashion' in opt.dataset.dataset_path:
        file_list = '/mnt/lustre/fzhong/mmhuman3d/20w_fashion_result/nondress_flist.txt'
    else:
        file_list = None
    if file_list:
        dataset = DeepFashionDataset(opt.dataset.dataset_path, transform, opt.model.size,
                                     opt.model.renderer_spatial_output_dim, file_list)
    else:
        dataset = DemoDataset()

    # Mean latent used for truncation.
    if opt.inference.truncation_ratio < 1:
        with torch.no_grad():
            mean_latent = g_ema.mean_latent(opt.inference.truncation_mean, device)
    else:
        mean_latent = None

    g_ema.renderer.is_train = False
    g_ema.renderer.perturb = 0

    # Sample one set of SMPL parameters and one camera for the demo identity.
    sample_trans, sample_beta, sample_theta = dataset.sample_smpl_param(1, device, val=False)
    sample_cam_extrinsics, sample_focals = dataset.get_camera_extrinsics(1, device, val=False)

    # Discarded draw; only advances the random number generator state.
    torch.randn(1, opt.inference.style_dim, device=device)

    return opt.inference, g_ema, device, mean_latent, torch.randn(1, opt.inference.style_dim, device=device), \
        sample_trans, sample_beta, sample_theta, sample_cam_extrinsics, sample_focals
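# setup() returns, in order: (0) inference options, (1) g_ema, (2) device, (3) mean_latent,
# (4) a latent code z, (5) SMPL translation, (6) SMPL betas, (7) SMPL thetas,
# (8) camera extrinsics, (9) focal lengths. get_mesh() below relies on this order when it
# overwrites index 4 with a freshly sampled latent.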


import gradio as gr
import plotly.graph_objects as go
from PIL import Image

# Lazily-initialised list holding the outputs of setup(); shared by both button callbacks
# so the model is only loaded once.
setup_list = None


def get_video():
    global setup_list
    if setup_list is None:
        setup_list = list(setup())
    generate_video(*setup_list)
    torch.cuda.empty_cache()
    # Hard-coded path where generate_video writes the clip (matches the directories created in setup()).
    path = 'evaluations/512x256_deepfashion/iter_0420000/random_angles/images_paper_video/video_0000000.mp4'
    return path


def get_mesh():
    global setup_list
    if setup_list is None:
        setup_list = list(setup())
    # Resample the latent code (index 4) so every click yields a new identity.
    setup_list[4] = torch.randn(1, setup_list[0].style_dim, device=setup_list[2])
    generate_rgb(*setup_list)
    mesh = generate_mesh(*setup_list)
    torch.cuda.empty_cache()

    # Split vertex coordinates and triangle indices for the plotly Mesh3d trace.
    x = np.asarray(mesh.vertices).T[0]
    y = np.asarray(mesh.vertices).T[1]
    z = np.asarray(mesh.vertices).T[2]

    i = np.asarray(mesh.faces).T[0]
    j = np.asarray(mesh.faces).T[1]
    k = np.asarray(mesh.faces).T[2]
    fig = go.Figure(go.Mesh3d(x=x, y=y, z=z,
                              i=i, j=j, k=k,
                              color="lightpink",
                              lighting=dict(ambient=0.5,
                                            diffuse=1,
                                            fresnel=4,
                                            specular=0.5,
                                            roughness=0.05,
                                            facenormalsepsilon=0,
                                            vertexnormalsepsilon=0),))

    # Multi-view figure written by generate_rgb.
    path = 'evaluations/512x256_deepfashion/iter_0420000/random_angles/images_paper_fig/0000000.png'

    image = Image.open(path)

    return fig, image


markdown = '''
# EVA3D: Compositional 3D Human Generation from 2D Image Collections

Authored by Fangzhou Hong, Zhaoxi Chen, Yushi Lan, Liang Pan, Ziwei Liu

A Hugging Face Space demo for the ICLR 2023 Spotlight paper "EVA3D: Compositional 3D Human Generation from 2D Image Collections".

### Useful links:
- [Official GitHub Repo](https://github.com/hongfz16/EVA3D)
- [Project Page](https://hongfz16.github.io/projects/EVA3D.html)
- [arXiv Link](https://arxiv.org/abs/2210.04888)

Licensed under the S-Lab License.

First, click "Generate RGB & Mesh" to randomly sample a 3D human. Then click "Generate Video" to render a panning video of the sampled human.
'''


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(markdown)
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    image = gr.Image(type="pil", shape=(512, 256 * 3))
            with gr.Row():
                with gr.Column():
                    mesh = gr.Plot()
                with gr.Column():
                    video = gr.Video()

    with gr.Row():
        btn = gr.Button(value="Generate RGB & Mesh")
        btn_2 = gr.Button(value="Generate Video")

    # Both callbacks take no inputs; outputs go to the plot/image and video components.
    btn.click(get_mesh, [], [mesh, image])
    btn_2.click(get_video, [], [video])
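# For a hosted deployment one might enable request queuing so long renders are serialized rather
# than timing out, e.g. demo.queue().launch() in place of the call below (a suggestion, not part
# of the original demo).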

demo.launch()