import spaces
import os
import tyro
import imageio
import numpy as np
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms.functional as TF
from safetensors.torch import load_file
import rembg
import gradio as gr
import kiui
from kiui.op import recenter
from kiui.cam import orbit_camera
from core.utils import get_rays, grid_distortion, orbit_camera_jitter
from core.options import AllConfigs, Options
from core.models import LTRFM_Mesh,LTRFM_NeRF
from core.instant_utils.mesh_util import save_obj, save_obj_with_mtl
from mvdream.pipeline_mvdream import MVDreamPipeline
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
from huggingface_hub import hf_hub_download
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
GRADIO_VIDEO_PATH = 'gradio_output.mp4'
GRADIO_OBJ_PATH = 'gradio_output_rgb.obj'
GRADIO_OBJ_ALBEDO_PATH = 'gradio_output_albedo.obj'
GRADIO_OBJ_SHADING_PATH = 'gradio_output_shading.obj'
#opt = tyro.cli(AllConfigs)
ckpt_path = hf_hub_download(repo_id="rgxie/LDM", filename="LDM_6V_SDF.ckpt")
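# inference options; these are assumed to match the configuration the
# LDM_6V_SDF checkpoint was trained with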
opt = Options(
    input_size=512,
    down_channels=(32, 64, 128, 256, 512),
    down_attention=(False, False, False, False, True),
    up_channels=(512, 256, 128),
    up_attention=(True, False, False, False),
    volume_mode='TRF_NeRF',
    splat_size=64,
    output_size=62,  # crop patch size
    data_mode='s5',
    num_views=8,
    gradient_accumulation_steps=1,
    mixed_precision='bf16',
    resume=ckpt_path,
)
# model
if opt.volume_mode == 'TRF_Mesh':
model = LTRFM_Mesh(opt)
elif opt.volume_mode == 'TRF_NeRF':
model = LTRFM_NeRF(opt)
else:
    # LGM is not imported in this app, so fail loudly on an unknown volume_mode
    raise ValueError(f'unsupported volume_mode: {opt.volume_mode}')
# resume pretrained checkpoint
if opt.resume is not None:
if opt.resume.endswith('safetensors'):
ckpt = load_file(opt.resume, device='cpu')
    else:  # plain torch checkpoint
        ckpt_dict = torch.load(opt.resume, map_location='cpu')
        ckpt = ckpt_dict["model"]
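    # copy matching parameters in place, stripping the 'module.' prefix left by
    # DataParallel/DistributedDataParallel training; shape mismatches are skipped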
state_dict = model.state_dict()
for k, v in ckpt.items():
k=k.replace('module.', '')
if k in state_dict:
if state_dict[k].shape == v.shape:
state_dict[k].copy_(v)
else:
print(f'[WARN] mismatching shape for param {k}: ckpt {v.shape} != model {state_dict[k].shape}, ignored.')
else:
print(f'[WARN] unexpected param {k}: {v.shape}')
    print('[INFO] resume checkpoint loaded successfully.')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.half().to(device)
model.eval()
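# perspective projection from the vertical FoV and near/far clip planes
# (row-vector convention: the w term sits at proj_matrix[2, 3] instead of [3, 2])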
tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
proj_matrix = torch.zeros(4, 4, dtype=torch.float32).to(device)
proj_matrix[0, 0] = 1 / tan_half_fov
proj_matrix[1, 1] = 1 / tan_half_fov
proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
proj_matrix[3, 2] = - (opt.zfar * opt.znear) / (opt.zfar - opt.znear)
proj_matrix[2, 3] = 1
# load the text-conditioned MVDream pipeline
pipe_text = MVDreamPipeline.from_pretrained(
'ashawkey/mvdream-sd2.1-diffusers', # remote weights
torch_dtype=torch.float16,
trust_remote_code=True,
# local_files_only=True,
)
pipe_text = pipe_text.to(device)
# load the image-conditioned ImageDream pipeline
pipe_image = MVDreamPipeline.from_pretrained(
"ashawkey/imagedream-ipmv-diffusers", # remote weights
torch_dtype=torch.float16,
trust_remote_code=True,
# local_files_only=True,
)
pipe_image = pipe_image.to(device)
print('Loading 123plus model ...')
pipe_image_plus = DiffusionPipeline.from_pretrained(
"sudo-ai/zero123plus-v1.2",
custom_pipeline="zero123plus",
torch_dtype=torch.float16,
trust_remote_code=True,
#local_files_only=True,
)
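# Zero123Plus is sampled with Euler-Ancestral and 'trailing' timestep spacing,
# the setup used by the InstantMesh reference code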
pipe_image_plus.scheduler = EulerAncestralDiscreteScheduler.from_config(
pipe_image_plus.scheduler.config, timestep_spacing='trailing'
)
unet_path = './pretrained/diffusion_pytorch_model.bin'
print('Loading custom white-background unet ...')
if os.path.exists(unet_path):
unet_ckpt_path = unet_path
else:
unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model")
state_dict = torch.load(unet_ckpt_path, map_location='cpu')
pipe_image_plus.unet.load_state_dict(state_dict, strict=True)
pipe_image_plus = pipe_image_plus.to(device)
# load rembg
bg_remover = rembg.new_session()
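# a single rembg session is created at startup and reused for every request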
@spaces.GPU
def generate_mv(condition_input_image, prompt, prompt_neg='', input_elevation=0, input_num_steps=30, input_seed=42, mv_model_option=None):
# seed
kiui.seed_everything(input_seed)
os.makedirs(os.path.join(opt.workspace, "gradio"), exist_ok=True)
# text-conditioned
if condition_input_image is None:
        mv_image_float = pipe_text(prompt, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=7.5, elevation=input_elevation)  # 4 views, float in [0, 1]
        mv_image_uint8 = (mv_image_float * 255).astype(np.uint8)
# bg removal
mv_image = []
for i in range(4):
image = rembg.remove(mv_image_uint8[i], session=bg_remover) # [H, W, 4]
# to white bg
image = image.astype(np.float32) / 255
image = recenter(image, image[..., 0] > 0, border_ratio=0.2)
image = image[..., :3] * image[..., -1:] + (1 - image[..., -1:])
mv_image.append(image)
        mv_image_grid = np.concatenate([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=1)
        input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0)
        processed_image = None
    # image-conditioned (a text prompt may also be given, but an empty prompt usually works)
else:
condition_input_image = np.array(condition_input_image) # uint8
# bg removal
carved_image = rembg.remove(condition_input_image, session=bg_remover) # [H, W, 4]
mask = carved_image[..., -1] > 0
image = recenter(carved_image, mask, border_ratio=0.2)
image = image.astype(np.float32) / 255.0
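        # composite the RGBA cutout over a white background using its alpha channel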
processed_image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
        if mv_model_option == 'mvdream':
mv_image = pipe_image(prompt, processed_image, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=5.0, elevation=input_elevation)
            mv_image_grid = np.concatenate([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=1)
            input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0)
else:
            from PIL import Image
            from einops import rearrange
            processed_image = Image.fromarray((processed_image * 255).astype(np.uint8))
mv_image = pipe_image_plus(processed_image, num_inference_steps=input_num_steps).images[0]
mv_image = np.asarray(mv_image, dtype=np.float32) / 255.0
mv_image = torch.from_numpy(mv_image).permute(2, 0, 1).contiguous().float() # (3, 960, 640)
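            # zero123plus returns a 3x2 grid of 320x320 views; the first rearrange
            # below builds a 2x3 preview grid, the second splits out the 6 views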
mv_image_grid = rearrange(mv_image, 'c (n h) (m w) -> (m h) (n w) c', n=3, m=2).numpy()
mv_image = rearrange(mv_image, 'c (n h) (m w) -> (n m) h w c', n=3, m=2).numpy()
input_image = mv_image
return mv_image_grid, processed_image, input_image
@spaces.GPU
def generate_3d(input_image, condition_input_image, mv_model_option=None, input_seed=42):
kiui.seed_everything(input_seed)
    output_obj_rgb_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_PATH)
    output_obj_albedo_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_ALBEDO_PATH)
    output_obj_shading_path = os.path.join(opt.workspace, "gradio", GRADIO_OBJ_SHADING_PATH)
    output_video_path = os.path.join(opt.workspace, "gradio", GRADIO_VIDEO_PATH)
    # build the feed-forward reconstruction inputs
    # input_image: [V, H, W, 3] float32 (V=4 for mvdream, V=6 for zero123plus)
    input_image = torch.from_numpy(input_image).permute(0, 3, 1, 2).float().to(device)  # [V, 3, H, W]
input_image = F.interpolate(input_image, size=(opt.input_size, opt.input_size), mode='bilinear', align_corners=False)
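    # a 224x224 resized copy feeds the model's ViT image encoder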
images_input_vit = F.interpolate(input_image, size=(224, 224), mode='bilinear', align_corners=False)
    data = {}
    input_image = input_image.unsqueeze(0)  # [1, V, 3, H, W]
    images_input_vit = images_input_vit.unsqueeze(0)
    data['input_vit'] = images_input_vit
    elevation = 0
    cam_poses = []
    if mv_model_option == 'mvdream' or condition_input_image is None:
azimuth = np.arange(0, 360, 90, dtype=np.int32)
for azi in tqdm.tqdm(azimuth):
cam_pose = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
cam_poses.append(cam_pose)
else:
        azimuth = np.arange(30, 360, 60, dtype=np.int32)
        cnt = 0
        for azi in tqdm.tqdm(azimuth):
            # alternate between -20 and 30 degree elevations across the six views
            if cnt % 2 == 0:
                elevation = -20
            else:
                elevation = 30
            cam_pose = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)
            cam_poses.append(cam_pose)
            cnt = cnt + 1
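    # normalize cameras: rescale translations to cam_radius, then re-express all
    # poses relative to the first camera, which is mapped to (0, 0, cam_radius)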
    cam_poses = torch.cat(cam_poses, 0)
    radius = torch.norm(cam_poses[0, :3, 3])
    cam_poses[:, :3, 3] *= opt.cam_radius / radius
    transform = torch.tensor([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, opt.cam_radius], [0, 0, 0, 1]], dtype=torch.float32).to(device) @ torch.inverse(cam_poses[0])
    cam_poses = transform.unsqueeze(0) @ cam_poses
    cam_poses = cam_poses.unsqueeze(0)
    data['source_camera'] = cam_poses
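    # the mesh/SDF branch runs under fp32 autocast (presumably for numerical
    # stability); the NeRF branch uses fp16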
    with torch.no_grad():
        if opt.volume_mode == 'TRF_Mesh':
            with torch.autocast(device_type='cuda', dtype=torch.float32):
                svd_volume = model.forward_svd_volume(input_image, data)
        else:
            with torch.autocast(device_type='cuda', dtype=torch.float16):
                svd_volume = model.forward_svd_volume(input_image, data)
    # mesh extraction is time-consuming
    export_texmap = False
    mesh_out = model.extract_mesh(svd_volume, use_texture_map=export_texmap)
if export_texmap:
vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
for i in range(len(tex_map)):
            # output path; the stem is chosen here since the original `name`/`seed` were undefined
            mesh_path = os.path.join(opt.workspace, "gradio", f"texmap_output_{i}_{input_seed}.obj")
save_obj_with_mtl(
vertices.data.cpu().numpy(),
uvs.data.cpu().numpy(),
faces.data.cpu().numpy(),
mesh_tex_idx.data.cpu().numpy(),
tex_map[i].permute(1, 2, 0).data.cpu().numpy(),
mesh_path,
)
else:
vertices, faces, vertex_colors = mesh_out
save_obj(vertices, faces, vertex_colors[0], output_obj_rgb_path)
save_obj(vertices, faces, vertex_colors[1], output_obj_albedo_path)
save_obj(vertices, faces, vertex_colors[2], output_obj_shading_path)
# images=[]
# azimuth = np.arange(0, 360, 6, dtype=np.int32)
# for azi in tqdm.tqdm(azimuth):
# cam_pose = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True))
# if opt.volume_mode == 'TRF_Mesh':
# cam_view = torch.inverse(cam_pose)
# cam_view=cam_view.unsqueeze(0).unsqueeze(0).to(device)
# data['w2c'] = cam_view
# with torch.autocast(device_type='cuda', dtype=torch.float32):
# render_images=model.render_frame(data)
# else:
# rays_o, rays_d = get_rays(cam_pose, opt.infer_render_size, opt.infer_render_size, opt.fovy) # [h, w, 3]
# rays_o=rays_o.unsqueeze(0).unsqueeze(0).to(device)# B,V,H,W,3
# rays_d=rays_d.unsqueeze(0).unsqueeze(0).to(device)
# data['all_rays_o']=rays_o
# data['all_rays_d']=rays_d
# with torch.autocast(device_type='cuda', dtype=torch.float16):
# render_images=model.render_frame(data)
# image=render_images['images_pred']
# images.append((image.squeeze(1).permute(0,2,3,1).contiguous().float().cpu().numpy() * 255).astype(np.uint8))
# images = np.concatenate(images, axis=0)
# imageio.mimwrite(output_video_path, images, fps=30)
return output_obj_rgb_path, output_obj_albedo_path, output_obj_shading_path #, output_video_path
# gradio UI
_TITLE = '''LDM: Large Tensorial SDF Model for Textured Mesh Generation'''
_DESCRIPTION = '''
* Input can be a text prompt or an image.
* Supported multi-view diffusion models: the text-conditioned MVDream, plus the image-conditioned ImageDream (the "mvdream" option) and Zero123plus.
* If you find the output unsatisfying, try using different multi-view diffusion models or seeds!
* The project code is available at [https://github.com/rgxie/LDM](https://github.com/rgxie/LDM).
'''
block = gr.Blocks(title=_TITLE).queue()
with block:
with gr.Row():
with gr.Column(scale=1):
gr.Markdown('# ' + _TITLE)
gr.Markdown(_DESCRIPTION)
with gr.Row(variant='panel'):
with gr.Column(scale=1):
with gr.Tab("Image-to-3D"):
# input image
with gr.Row():
condition_input_image = gr.Image(
label="Input Image",
image_mode="RGBA",
type="pil"
)
processed_image = gr.Image(
label="Processed Image",
image_mode="RGBA",
type="pil",
interactive=False
)
with gr.Row():
                    mv_model_option = gr.Radio(
                        ["zero123plus", "mvdream"],
                        value="zero123plus",
                        label="Multi-view Diffusion")
with gr.Row(variant="panel"):
                    gr.Examples(
                        examples=[
                            os.path.join("example", img_name) for img_name in sorted(os.listdir("example"))
                        ],
                        inputs=[condition_input_image],
                        examples_per_page=20,
                        label='Image-to-3D Examples'
                    )
with gr.Tab("Text-to-3D"):
# input prompt
with gr.Row():
input_text = gr.Textbox(label="prompt")
# negative prompt
with gr.Row():
input_neg_text = gr.Textbox(label="negative prompt", value='ugly, blurry, pixelated obscure, unnatural colors, poor lighting, dull, unclear, cropped, lowres, low quality, artifacts, duplicate')
with gr.Row(variant="panel"):
                    gr.Examples(
                        examples=[
                            "a hamburger",
                            "a furry red fox head",
                            "a teddy bear",
                            "a motorbike",
                        ],
                        inputs=[input_text],
                        label='Text-to-3D Examples'
                    )
# elevation
input_elevation = gr.Slider(label="elevation", minimum=-90, maximum=90, step=1, value=0)
# inference steps
input_num_steps = gr.Slider(label="inference steps", minimum=1, maximum=100, step=1, value=30)
# random seed
input_seed = gr.Slider(label="random seed", minimum=0, maximum=100000, step=1, value=0)
# gen button
button_gen = gr.Button("Generate")
with gr.Column(scale=1):
with gr.Row():
# multi-view results
mv_image_grid = gr.Image(interactive=False, show_label=False)
# with gr.Row():
# output_video_path = gr.Video(label="video")
with gr.Row():
output_obj_rgb_path = gr.Model3D(
label="RGB Model (OBJ Format)",
interactive=False,
)
with gr.Row():
output_obj_albedo_path = gr.Model3D(
label="Albedo Model (OBJ Format)",
interactive=False,
)
with gr.Row():
output_obj_shading_path = gr.Model3D(
label="Shading Model (OBJ Format)",
interactive=False,
)
input_image = gr.State()
    button_gen.click(
        fn=generate_mv,
        inputs=[condition_input_image, input_text, input_neg_text, input_elevation, input_num_steps, input_seed, mv_model_option],
        outputs=[mv_image_grid, processed_image, input_image],
    ).success(
        fn=generate_3d,
        inputs=[input_image, condition_input_image, mv_model_option, input_seed],
        outputs=[output_obj_rgb_path, output_obj_albedo_path, output_obj_shading_path],  # output_video_path
    )
block.launch(server_name="0.0.0.0", share=False)