Spaces:

hyz317
/

StdGEN

Running on L40S

App Files Files Community

StdGEN / infer_api.py

YulianSa

update

216a665 3 days ago

raw

history blame

36.1 kB

	import spaces
	from PIL import Image

	import io
	import argparse
	import os
	import sys
	import random
	import tempfile
	from typing import Dict, Optional, Tuple
	from omegaconf import OmegaConf
	import numpy as np

	import torch
	from pygltflib import GLTF2, Material, PbrMetallicRoughness

	from diffusers import AutoencoderKL, DDIMScheduler
	from diffusers.utils import check_min_version
	from tqdm.auto import tqdm
	from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor, CLIPVisionModelWithProjection
	from torchvision import transforms

	from canonicalize.models.unet_mv2d_condition import UNetMV2DConditionModel
	from canonicalize.models.unet_mv2d_ref import UNetMV2DRefModel
	from canonicalize.pipeline_canonicalize import CanonicalizationPipeline
	from einops import rearrange
	from torchvision.utils import save_image
	import json
	import cv2

	import onnxruntime as rt
	from huggingface_hub.file_download import hf_hub_download
	from huggingface_hub import list_repo_files
	from rm_anime_bg.cli import get_mask, SCALE

	import argparse
	import os
	import cv2
	import numpy as np
	from typing import Dict, Optional, List
	from omegaconf import OmegaConf, DictConfig
	from PIL import Image
	from pathlib import Path
	from dataclasses import dataclass
	from typing import Dict
	import torch
	import torch.nn.functional as F
	import torch.utils.checkpoint
	import torchvision.transforms.functional as TF
	from torch.utils.data import Dataset, DataLoader
	from torchvision import transforms
	from torchvision.utils import make_grid, save_image
	from accelerate.utils import set_seed
	from tqdm.auto import tqdm
	from einops import rearrange, repeat
	from multiview.pipeline_multiclass import StableUnCLIPImg2ImgPipeline

	import os
	import imageio
	import numpy as np
	import torch
	import cv2
	import glob
	import matplotlib.pyplot as plt
	from PIL import Image
	from torchvision.transforms import v2
	from pytorch_lightning import seed_everything
	from omegaconf import OmegaConf
	from tqdm import tqdm

	from slrm.utils.train_util import instantiate_from_config
	from slrm.utils.camera_util import (
	FOV_to_intrinsics,
	get_circular_camera_poses,
	)
	from slrm.utils.mesh_util import save_obj, save_glb, save_obj_with_mtl
	from slrm.utils.infer_util import images_to_video

	import cv2
	import numpy as np
	import os
	import trimesh
	import argparse
	import torch
	import scipy
	from PIL import Image

	from refine.mesh_refine import geo_refine
	from refine.func import make_star_cameras_orthographic
	from refine.render import NormalsRenderer, calc_vertex_normals

	import pytorch3d
	from pytorch3d.structures import Meshes
	from sklearn.neighbors import KDTree

	from segment_anything import SamAutomaticMaskGenerator, sam_model_registry

	# Require diffusers >= 0.24.0 (checked by diffusers.utils.check_min_version at import time).
	check_min_version("0.24.0")
	# Dtype the canonicalization models are cast to, and the autocast dtype used in `inference`.
	weight_dtype = torch.float16
	# Module-wide compute device: CUDA when available, otherwise CPU.
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	# Names of the six views; presumably in the order the multiview pipeline emits them — TODO confirm.
	VIEWS = ['front', 'front_right', 'right', 'back', 'left', 'front_left']


	#### TEST ####
	import nvdiffrast.torch as dr
	import torch
	from typing import Tuple

	@spaces.GPU
	def _warmup(device=None):
	    """Rasterize one small triangle with nvdiffrast as a warm-up call.

	    Args:
	        device: Device on which the dummy vertex/index tensors are created;
	            ``None`` means ``'cuda'``.
	    """
	    glctx = dr.RasterizeCudaContext(device=None)
	    device = 'cuda' if device is None else device

	    # windows workaround for https://github.com/NVlabs/nvdiffrast/issues/59
	    def tensor(*args, **kwargs):
	        # Forward positional and keyword arguments (e.g. dtype=...) to
	        # torch.tensor while pinning the target device.
	        return torch.tensor(*args, device=device, **kwargs)

	    pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32)
	    tri = tensor([[0, 1, 2]], dtype=torch.int32)
	    dr.rasterize(glctx, pos, tri, resolution=[256, 256])


	_warmup(device)

	#### TEST END ####


	# Fetch every file of the StdGEN model repo into ./ckpt, skipping files that
	# are already present there.
	repo_id = "hyz317/StdGEN"
	all_files = list_repo_files(repo_id, revision="main")
	for file in all_files:
	    # hf_hub_download(..., local_dir="./ckpt") stores the file at ./ckpt/<file>,
	    # so that is the path whose existence decides whether to skip.
	    if os.path.exists(os.path.join("./ckpt", file)):
	        continue
	    hf_hub_download(repo_id, file, local_dir="./ckpt")

	@spaces.GPU
	def set_seed2(seed):
	    """Seed Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices) with ``seed``."""
	    seeders = (
	        random.seed,
	        np.random.seed,
	        torch.manual_seed,
	        torch.cuda.manual_seed_all,
	    )
	    for seed_fn in seeders:
	        seed_fn(seed)


	# Download the anime-seg ONNX model file used for background removal.
	session_infer_path = hf_hub_download(
	    repo_id="skytnt/anime-seg", filename="isnetis.onnx",
	)
	# Use the CUDA execution provider when onnxruntime reports it, otherwise the CPU one.
	providers: list[str] = ["CPUExecutionProvider"]
	if "CUDAExecutionProvider" in rt.get_available_providers():
	    providers = ["CUDAExecutionProvider"]

	# Shared ONNX Runtime session consumed by `remove_background`.
	bkg_remover_session_infer = rt.InferenceSession(
	    session_infer_path, providers=providers,
	)

	@spaces.GPU
	def remove_background(
	    img: "np.ndarray | Image.Image",
	    alpha_min: float,
	    alpha_max: float,
	) -> Image.Image:
	    """Mask out the background of ``img`` and return the result with the mask as an extra channel.

	    Args:
	        img: Input picture; anything ``np.array`` can convert (array or PIL image).
	            Presumably 3-channel RGB so that the output is RGBA — TODO confirm.
	        alpha_min: Mask values below this are forced to 0.
	        alpha_max: Mask values above this are forced to 1.

	    Returns:
	        A PIL image built from the masked pixels concatenated with the scaled
	        mask along the channel axis.
	    """
	    img = np.array(img)
	    # assumes get_mask returns a float (H, W, 1) mask in [0, 1] — TODO confirm
	    mask = get_mask(bkg_remover_session_infer, img)
	    mask[mask < alpha_min] = 0.0
	    mask[mask > alpha_max] = 1.0
	    img_after = (mask * img).astype(np.uint8)
	    # SCALE comes from rm_anime_bg.cli; presumably 255 — verify there.
	    mask = (mask * SCALE).astype(np.uint8)
	    img_after = np.concatenate([img_after, mask], axis=2, dtype=np.uint8)
	    return Image.fromarray(img_after)


	def process_image(image, totensor, width, height):
	    """Crop an RGBA image to its non-transparent content, centre it with a margin, and composite on white.

	    Args:
	        image: PIL image in ``"RGBA"`` mode.
	        totensor: Callable applied to the final ``(height, width, 3)`` float32
	            array; its result is returned.
	        width: Output width in pixels.
	        height: Output height in pixels.

	    Returns:
	        ``totensor(rgb)`` where ``rgb`` is the white-composited image in [0, 1].

	    Raises:
	        AssertionError: If ``image`` is not in RGBA mode.
	        ValueError: If no pixel of ``image`` has non-zero alpha.
	    """
	    assert image.mode == "RGBA"

	    # Find non-transparent pixels
	    rows, cols = np.nonzero(np.array(image)[..., 3])
	    if rows.size == 0:
	        raise ValueError("process_image: image is fully transparent, nothing to crop")
	    min_x, max_x = int(cols.min()), int(cols.max())
	    min_y, max_y = int(rows.min()), int(rows.max())
	    # PIL's crop box excludes its right/lower edge, so add 1 to keep the
	    # last non-transparent column and row.
	    image = image.crop((min_x, min_y, max_x + 1, max_y + 1))

	    # paste to center of a transparent canvas that is 1.2x the larger side
	    # tall and has the width/height aspect ratio of the requested output
	    max_dim = max(image.width, image.height)
	    max_height = int(max_dim * 1.2)
	    max_width = int(max_dim / (height/width) * 1.2)
	    new_image = Image.new("RGBA", (max_width, max_height))
	    left = (max_width - image.width) // 2
	    top = (max_height - image.height) // 2
	    new_image.paste(image, (left, top))

	    image = new_image.resize((width, height), resample=Image.BICUBIC)
	    image = np.array(image)
	    image = image.astype(np.float32) / 255.
	    assert image.shape[-1] == 4 # RGBA
	    # Alpha-composite over a white background.
	    alpha = image[..., 3:4]
	    bg_color = np.array([1., 1., 1.], dtype=np.float32)
	    image = image[..., :3] * alpha + bg_color * (1 - alpha)
	    return totensor(image)


	@spaces.GPU
	@torch.no_grad()
	def inference(validation_pipeline, input_image, vae, feature_extractor, image_encoder, unet, ref_unet, tokenizer,
	              text_encoder, pretrained_model_path, validation, val_width, val_height, unet_condition_type,
	              use_noise=True, noise_d=256, crop=False, seed=100, timestep=20):
	    """Run the canonicalization pipeline on one image and return the first output as a PIL image.

	    Only ``validation_pipeline``, ``input_image``, ``tokenizer``, ``validation``,
	    ``val_width``, ``val_height``, ``unet_condition_type``, ``use_noise``,
	    ``seed`` and ``timestep`` are read here; the remaining parameters are
	    accepted but not used by this function.

	    Args:
	        validation_pipeline: Callable pipeline invoked with the prepared image.
	        input_image: PIL image; if its mode is not ``"RGBA"`` the background is
	            removed first.
	        validation: Mapping of extra keyword arguments forwarded to the pipeline.
	        val_width: Width the input is resized to and passed to the pipeline.
	        val_height: Height the input is resized to and passed to the pipeline.
	        seed: Seed for the global RNGs and the pipeline generator.
	        timestep: Passed to the pipeline as ``num_inference_steps``.

	    Returns:
	        PIL image decoded from a PNG encoding of ``out[0]``.
	    """
	    set_seed2(seed)
	    generator = torch.Generator(device=device).manual_seed(seed)

	    prompts = "high quality, best quality"
	    tokenized = tokenizer(
	        prompts,
	        max_length=tokenizer.model_max_length,
	        padding="max_length",
	        truncation=True,
	        return_tensors="pt",
	    )
	    prompt_ids = tokenized.input_ids[0]

	    # Inputs without an alpha channel get their background removed first.
	    if input_image.mode != "RGBA":
	        input_image = remove_background(input_image, 0.1, 0.9)
	    to_tensor = transforms.ToTensor()
	    single_view = process_image(input_image, to_tensor, val_width, val_height)
	    # Add batch and view axes, then fold them together: (B*Nv, 3, H, W)
	    imgs_in = rearrange(single_view.unsqueeze(0).unsqueeze(0), "B Nv C H W -> (B Nv) C H W")

	    autocast_device = 'cuda' if torch.cuda.is_available() else 'cpu'
	    with torch.autocast(autocast_device, dtype=weight_dtype):
	        imgs_in = imgs_in.to(device=device)
	        # B*Nv images
	        out = validation_pipeline(
	            prompt=prompts,
	            image=imgs_in.to(weight_dtype),
	            generator=generator,
	            num_inference_steps=timestep,
	            prompt_ids=prompt_ids,
	            height=val_height,
	            width=val_width,
	            unet_condition_type=unet_condition_type,
	            use_noise=use_noise,
	            **validation,
	        )
	        out = rearrange(out, "B C f H W -> (B f) C H W", f=1)

	    print("OUT!!!!!!")

	    # Round-trip the first output through an in-memory PNG to get a PIL image.
	    png_buffer = io.BytesIO()
	    save_image(out[0], png_buffer, format='PNG')
	    png_buffer.seek(0)
	    result = Image.open(png_buffer)

	    print("OUT2!!!!!!")

	    torch.cuda.empty_cache()
	    return result


	######### Multi View Part #############
	# NOTE: these two assignments repeat the identical definitions made earlier in this module.
	weight_dtype = torch.float16
	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

	def tensor_to_numpy(tensor):
	    """Convert a ``(C, H, W)`` tensor with values in [0, 1] to an ``(H, W, C)`` uint8 NumPy array.

	    Values are scaled by 255, offset by 0.5 and clamped to [0, 255] before
	    the cast. The input tensor itself is not modified.
	    """
	    scaled = tensor.mul(255)  # new tensor, so the in-place ops below leave the input alone
	    scaled.add_(0.5)
	    scaled.clamp_(0, 255)
	    channels_last = scaled.permute(1, 2, 0)
	    return channels_last.to("cpu", torch.uint8).numpy()


	@dataclass
	class TestConfig:
	    # Typed field container for a multiview test configuration.
	    # NOTE(review): `run_multiview_infer` annotates its `cfg` argument with this
	    # class but reads `cfg.height` / `cfg.width`, which are not declared here.
	    pretrained_model_name_or_path: str
	    pretrained_unet_path: Optional[str]
	    revision: Optional[str]
	    validation_dataset: Dict
	    save_dir: str
	    seed: Optional[int]
	    validation_batch_size: int
	    dataloader_num_workers: int
	    save_mode: str
	    local_rank: int

	    pipe_kwargs: Dict
	    pipe_validation_kwargs: Dict
	    unet_from_pretrained_kwargs: Dict
	    validation_grid_nrow: int
	    camera_embedding_lr_mult: float

	    num_views: int
	    camera_embedding_type: str

	    pred_type: str
	    # Declared once; the former second declaration further down was redundant
	    # and did not change the field order.
	    regress_elevation: bool
	    enable_xformers_memory_efficient_attention: bool

	    cond_on_normals: bool
	    cond_on_colors: bool

	    regress_focal_length: bool



	def convert_to_numpy(tensor):
	    """Return ``tensor`` (C, H, W, values in [0, 1]) as an (H, W, C) uint8 NumPy array on the CPU."""
	    # mul() allocates a fresh tensor; the chained in-place ops only touch that copy.
	    hwc = tensor.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0)
	    as_bytes = hwc.to("cpu", torch.uint8)
	    return as_bytes.numpy()

	def save_image2(tensor):
	    """Convert an image tensor to a PIL image.

	    The tensor is first turned into a NumPy array with ``convert_to_numpy``
	    and then handed to ``save_image_numpy``, whose result is returned.
	    """
	    return save_image_numpy(convert_to_numpy(tensor))

	def save_image_numpy(ndarr):
	    """Build a 1024x1024 PIL image from ``ndarr``, padding non-square input to a white square first."""
	    im = Image.fromarray(ndarr)
	    width, height = im.size
	    if width != height:
	        side = max(width, height)
	        # White square canvas with the original centred on it.
	        canvas = Image.new("RGB", (side, side), (255, 255, 255))
	        canvas.paste(im, ((side - width) // 2, (side - height) // 2))
	        im = canvas
	    return im.resize((1024, 1024), Image.LANCZOS)

	@spaces.GPU
	def run_multiview_infer(data, pipeline, cfg: TestConfig, num_levels=3, seed=None):
	    """Run the multiview pipeline and collect per-level normal and colour images.

	    Args:
	        data: Mapping with ``'image_cond_rgb'`` (indexed as B, Nv, C, H, W),
	            ``'normal_prompt_embeddings'`` and ``'color_prompt_embeddings'``
	            (indexed as B, Nv, N, C).
	        pipeline: Callable pipeline exposing ``.unet``.
	        cfg: Object providing ``height`` and ``width`` attributes.
	        num_levels: Number of levels read from the pipeline output.
	        seed: Seed for the generator; ``None`` passes no generator.

	    Returns:
	        Dict mapping a level key to ``{'normals': [...], 'images': [...]}``
	        lists of PIL images. Keys are ``level + 1`` when ``num_levels == 2``
	        and ``level`` otherwise.
	    """
	    pipeline.unet.enable_xformers_memory_efficient_attention()

	    generator = None
	    if seed is not None:
	        generator = torch.Generator(device=pipeline.unet.device).manual_seed(seed)

	    torch.cuda.empty_cache()
	    # The conditioning batch is duplicated: the pipeline output is split
	    # below into a normals half and a colour half.
	    imgs_in = torch.cat([data['image_cond_rgb']] * 2, dim=0).cuda()
	    num_views = imgs_in.shape[1]
	    imgs_in = rearrange(imgs_in, "B Nv C H W -> (B Nv) C H W")  # (B*Nv, 3, H, W)

	    normal_prompt_embeddings = data['normal_prompt_embeddings'].cuda()
	    clr_prompt_embeddings = data['color_prompt_embeddings'].cuda()
	    prompt_embeddings = torch.cat([normal_prompt_embeddings, clr_prompt_embeddings], dim=0)
	    prompt_embeddings = rearrange(prompt_embeddings, "B Nv N C -> (B Nv) N C")

	    # B*Nv images
	    unet_out = pipeline(
	        imgs_in, None, prompt_embeds=prompt_embeddings,
	        generator=generator, guidance_scale=3.0, output_type='pt', num_images_per_prompt=1,
	        height=cfg.height, width=cfg.width,
	        num_inference_steps=40, eta=1.0,
	        num_levels=num_levels,
	    )

	    results = {}
	    for level in range(num_levels):
	        out = unet_out[level].images
	        bsz = out.shape[0] // 2

	        normals_pred = out[:bsz]
	        images_pred = out[bsz:]

	        # With exactly two levels the result keys are shifted to 1 and 2.
	        key = level + 1 if num_levels == 2 else level
	        # Only complete groups of `num_views` images are converted.
	        count = (bsz // num_views) * num_views
	        results[key] = {
	            'normals': [save_image2(normals_pred[idx]) for idx in range(count)],
	            'images': [save_image2(images_pred[idx]) for idx in range(count)],
	        }

	    torch.cuda.empty_cache()
	    return results


	class InferAPI:
	    """Holds the four stage configs and forwards each stage call to the module-level function."""

	    def __init__(self,
	                 canonical_configs,
	                 multiview_configs,
	                 slrm_configs,
	                 refine_configs):
	        """Store the per-stage configs and start with an empty ``results`` dict."""
	        self.results = {}
	        self.refine_configs = refine_configs
	        self.slrm_configs = slrm_configs
	        self.multiview_configs = multiview_configs
	        self.canonical_configs = canonical_configs

	    def genStage1(self, img, seed):
	        """Stage 1: forward to ``infer_canonicalize_gen``."""
	        output = infer_canonicalize_gen(img, seed)
	        return output

	    def genStage2(self, img, seed, num_levels):
	        """Stage 2: forward to ``infer_multiview_gen``."""
	        output = infer_multiview_gen(img, seed, num_levels)
	        return output

	    def genStage3(self, img):
	        """Stage 3: forward to ``infer_slrm_gen``."""
	        output = infer_slrm_gen(img)
	        return output

	    def genStage4(self, meshes, imgs):
	        """Stage 4: forward to ``infer_refine``."""
	        output = infer_refine(meshes, imgs)
	        return output

	    def add_results(self, results):
	        """Copy every key/value of ``results`` into ``self.results``, overwriting existing keys."""
	        for key in results:
	            self.results[key] = results[key]


	############## Refine ##############
	def fix_vert_color_glb(mesh_path):
	    """Rewrite the GLB at ``mesh_path`` in place with an extra material.

	    The first primitive of the first mesh is pointed at material index 0 and
	    a white, non-metallic, fully rough, double-sided material is appended to
	    the file's material list.
	    """
	    from pygltflib import GLTF2, Material, PbrMetallicRoughness

	    pbr = PbrMetallicRoughness(
	        baseColorFactor=[1.0, 1.0, 1.0, 1.0],
	        metallicFactor=0.,
	        roughnessFactor=1.0,
	    )
	    material = Material(
	        pbrMetallicRoughness=pbr,
	        emissiveFactor=[0.0, 0.0, 0.0],
	        doubleSided=True,
	    )

	    gltf = GLTF2().load(mesh_path)
	    gltf.meshes[0].primitives[0].material = 0
	    gltf.materials.append(material)
	    gltf.save(mesh_path)


	def srgb_to_linear(c_srgb):
	    """Apply the piecewise sRGB-to-linear transfer curve element-wise and clip the result to [0, 1]."""
	    linear_segment = c_srgb / 12.92
	    power_segment = ((c_srgb + 0.055) / 1.055) ** 2.4
	    # Values at or below 0.04045 take the linear segment, the rest the power curve.
	    converted = np.where(c_srgb <= 0.04045, linear_segment, power_segment)
	    return converted.clip(0, 1.)


	def save_py3dmesh_with_trimesh_fast(meshes: Meshes, save_glb_path, apply_sRGB_to_LinearRGB=True):
	    """Export a vertex-coloured pytorch3d ``Meshes`` object through trimesh.

	    Args:
	        meshes: Mesh whose packed vertices, faces and per-vertex texture
	            features (3 channels) are exported.
	        save_glb_path: Output path. For paths ending in ``".glb"`` the X and Z
	            coordinates are negated and ``fix_vert_color_glb`` is run on the file.
	        apply_sRGB_to_LinearRGB: Convert colours with ``srgb_to_linear`` first.
	    """
	    is_glb = save_glb_path.endswith(".glb")

	    # convert from pytorch3d meshes to trimesh mesh
	    verts = meshes.verts_packed().cpu().float().numpy()
	    faces = meshes.faces_packed().cpu().long().numpy()
	    colors = meshes.textures.verts_features_packed().cpu().float().numpy()

	    if is_glb:
	        # rotate 180 along +Y
	        verts[:, [0, 2]] = -verts[:, [0, 2]]
	    if apply_sRGB_to_LinearRGB:
	        colors = srgb_to_linear(colors)

	    assert verts.shape[0] == colors.shape[0]
	    assert colors.shape[1] == 3
	    assert 0 <= colors.min() and colors.max() <= 1.001, f"min={colors.min()}, max={colors.max()}"
	    colors = np.clip(colors, 0, 1)

	    mesh = trimesh.Trimesh(vertices=verts, faces=faces, vertex_colors=colors)
	    mesh.remove_unreferenced_vertices()
	    # save mesh
	    mesh.export(save_glb_path)
	    if is_glb:
	        fix_vert_color_glb(save_glb_path)
	    print(f"saving to {save_glb_path}")


	def calc_horizontal_offset(target_img, source_img):
	    """Find the horizontal shift of ``source_img`` that best overlaps ``target_img``.

	    Both images are thresholded (channel sum > 750) into masks. Shifts from
	    -200 to 199 are tried with wrap-around (``np.roll`` along axis 1), and the
	    shift with the largest mask overlap is returned; ties go to the smallest shift.
	    """
	    target_mask = target_img.astype(np.float32).sum(axis=-1) > 750
	    source_mask = source_img.astype(np.float32).sum(axis=-1) > 750

	    def overlap_at(shift):
	        return (target_mask & np.roll(source_mask, shift, axis=1)).sum()

	    # max() returns the first maximal element, so the earliest shift wins ties.
	    return max(range(-200, 200), key=overlap_at)


	def calc_horizontal_offset2(target_mask, source_img):
	    """Like ``calc_horizontal_offset`` but takes the target as a ready-made mask.

	    ``source_img`` is thresholded (channel sum > 750); shifts from -200 to 199
	    are tried with wrap-around and the first shift with the largest overlap
	    with ``target_mask`` is returned.
	    """
	    source_mask = source_img.astype(np.float32).sum(axis=-1) > 750
	    shifts = range(-200, 200)
	    overlaps = [
	        int((target_mask & np.roll(source_mask, shift, axis=1)).sum())
	        for shift in shifts
	    ]
	    # list.index returns the first occurrence, matching a strict ">" scan.
	    return shifts[overlaps.index(max(overlaps))]


	@spaces.GPU
	def get_distract_mask(color_0, color_1, normal_0=None, normal_1=None, thres=0.25, ratio=0.50, outside_thres=0.10, outside_ratio=0.20):
	    """Mark mid-sized connected regions where two renderings disagree.

	    Args:
	        color_0, color_1: Arrays compared by the sum over the last axis of
	            their absolute difference.
	        normal_0, normal_1: Optional arrays compared the same way; a pixel is
	            flagged if either comparison exceeds ``thres``.
	        thres: Difference threshold.
	        ratio, outside_thres, outside_ratio: Accepted but not used here.

	    Returns:
	        ``(distract_mask, distract_bbox)``: boolean arrays marking the kept
	        regions and their bounding boxes. Only labelled regions with more than
	        1000 and fewer than 100000 pixels are kept.
	    """
	    distract_area = np.abs(color_0 - color_1).sum(axis=-1) > thres
	    if normal_0 is not None and normal_1 is not None:
	        distract_area |= np.abs(normal_0 - normal_1).sum(axis=-1) > thres
	    labeled_array, num_features = scipy.ndimage.label(distract_area)

	    results = []
	    # NOTE(review): the range starts at label 0, which scipy.ndimage.label
	    # uses for the background; confirm that including it is intended.
	    for i in range(num_features + 1):
	        area = int(np.sum(labeled_array == i))
	        if 1000 < area < 100000:
	            results.append((i, area))
	            # The original drew one random pixel index per kept region and
	            # never used it; the draw is kept so the global NumPy RNG stream
	            # is consumed exactly as before.
	            np.random.randint(0, area)

	    results = sorted(results, key=lambda x: x[1], reverse=True)
	    distract_mask = np.zeros_like(distract_area)
	    distract_bbox = np.zeros_like(distract_area)
	    for i, _ in results:
	        region = labeled_array == i
	        distract_mask |= region
	        coords = np.argwhere(region)
	        min_row, min_col = coords.min(axis=0)
	        max_row, max_col = coords.max(axis=0)
	        # NOTE(review): slice ends are exclusive, so the last row and column
	        # of each region are left out of the box — confirm this is intended.
	        distract_bbox[min_row:max_row, min_col:max_col] = 1

	    return distract_mask, distract_bbox


	# infer_refine_sam = sam_model_registry["vit_h"](checkpoint="./ckpt/sam_vit_h_4b8939.pth").cuda()
	# infer_refine_generator = SamAutomaticMaskGenerator(
	# model=infer_refine_sam,
	# points_per_side=64,
	# pred_iou_thresh=0.80,
	# stability_score_thresh=0.92,
	# crop_n_layers=1,
	# crop_n_points_downscale_factor=2,
	# min_mask_region_area=100,
	# )
	# Module-level constant; not referenced by any code visible in this file.
	infer_refine_outside_ratio = 0.20

	def _anchor_flow_offsets(verts, fixed_v, fixed_f):
	    """Return a per-vertex displacement (V, 3 NumPy array) for ``verts`` relative to the fixed mesh.

	    Each vertex is matched to its nearest fixed-mesh vertex (the anchor). Its
	    raw displacement runs along the anchor's normal with length
	    ``max((anchor - vertex) . normal, 0) + 0.01``. The result is the weighted
	    average of the raw displacements of the vertex's 25 nearest neighbours.
	    """
	    fixed_v_cpu = fixed_v.cpu().numpy()
	    _, idx_anchor = KDTree(fixed_v_cpu).query(verts, k=1)
	    _, idx_neighbors = KDTree(verts).query(verts, k=25)
	    idx_anchor = idx_anchor.squeeze()

	    verts_t = torch.tensor(verts).cuda()
	    neighbors = verts_t[idx_neighbors]  # V, 25, 3
	    # distances from every vertex to its neighbours: [V, 25, 3] - [V, 1, 3] -> [V, 25]
	    neighbor_dists = torch.norm(neighbors - verts_t[:, None], dim=-1)
	    # Neighbours farther than 0.06 get a huge distance so exp(-d) makes their weight vanish.
	    neighbor_dists[neighbor_dists > 0.06] = 114514.
	    neighbor_weights = torch.exp(-neighbor_dists * 1.)
	    neighbor_weights = neighbor_weights / neighbor_weights.sum(dim=1, keepdim=True)

	    anchors = fixed_v[idx_anchor]  # V, 3
	    anchor_normals = calc_vertex_normals(fixed_v, fixed_f)[idx_anchor]  # V, 3
	    dis_anchor = torch.clamp(((anchors - verts_t) * anchor_normals).sum(-1), min=0) + 0.01
	    vec_anchor = dis_anchor[:, None] * anchor_normals  # V, 3
	    vec_anchor = vec_anchor[idx_neighbors]  # V, 25, 3
	    weighted_vec_anchor = (vec_anchor * neighbor_weights[:, :, None]).sum(1)  # V, 3
	    return weighted_vec_anchor.cpu().numpy()


	def _set_glb_material_factors(glb_path):
	    """Rewrite the PBR factors of every material in the GLB at ``glb_path`` in place."""
	    gltf = GLTF2().load(glb_path)
	    for material in gltf.materials:
	        if material.pbrMetallicRoughness:
	            # NOTE(review): the alpha component is 100.0; the glTF spec range
	            # for baseColorFactor is presumably [0, 1] — confirm this is deliberate.
	            material.pbrMetallicRoughness.baseColorFactor = [1.0, 1.0, 1.0, 100.0]
	            material.pbrMetallicRoughness.metallicFactor = 0.0
	            material.pbrMetallicRoughness.roughnessFactor = 1.0
	    gltf.save(glb_path)


	@spaces.GPU(duration=70)
	def infer_refine(meshes, imgs):
	    """Refine three meshes against their multiview images and export them as GLB files.

	    Meshes are processed in the order ``meshes[2]`` (level 2), ``meshes[0]``
	    (level 1), ``meshes[1]`` (level 0). The simplified geometry returned by
	    ``geo_refine`` for earlier levels is accumulated and passed on as the fixed
	    mesh for later levels.

	    Args:
	        meshes: Indexable collection of mesh file paths ending in ``.obj``
	            (indices 0-2 are read).
	        imgs: Indexable by level; each entry has ``'images'`` and ``'normals'``
	            sequences with at least six images.

	    Returns:
	        List of output paths: one ``*_refined.glb`` per processed mesh, then one
	        ``*_refined_whole.glb`` named after the last processed mesh.
	    """
	    from copy import deepcopy

	    fixed_v, fixed_f = None, None
	    last_colors = None

	    results = []
	    mesh_list = []

	    for name_idx, level in zip([2, 0, 1], [2, 1, 0]):
	        mesh = trimesh.load(meshes[name_idx])
	        # Drop connected components with fewer than 300 vertices.
	        components = mesh.split(only_watertight=False)
	        components = [c for c in components if len(c.vertices) >= 300]
	        mesh = trimesh.Scene(components).dump(concatenate=True)
	        mesh_v, mesh_f = mesh.vertices, mesh.faces

	        if last_colors is None:
	            # First mesh only: render it from the six views to obtain masks
	            # used for the initial horizontal alignment of the images.
	            mv, proj = make_star_cameras_orthographic(8, 1, r=1.2)
	            mv = mv[[4, 3, 2, 0, 6, 5]]
	            renderer = NormalsRenderer(mv, proj, (1024, 1024))
	            images = renderer.render(
	                torch.tensor(mesh_v, device='cuda').float(),
	                torch.ones_like(torch.from_numpy(mesh_v), device='cuda').float(),
	                torch.tensor(mesh_f, device='cuda'),
	            )
	            mask = (images[..., 3] < 0.9).cpu().numpy()

	        colors, normals = [], []
	        for i in range(6):
	            color = np.array(imgs[level]['images'][i])
	            normal = np.array(imgs[level]['normals'][i])

	            # Align to the first level's colours once they exist, otherwise
	            # to the rendered mask.
	            if last_colors is not None:
	                offset = calc_horizontal_offset(np.array(last_colors[i]), color)
	            else:
	                offset = calc_horizontal_offset2(mask[i], color)

	            if offset != 0:
	                color = np.roll(color, offset, axis=1)
	                normal = np.roll(normal, offset, axis=1)

	            colors.append(Image.fromarray(color))
	            normals.append(Image.fromarray(normal))

	        if last_colors is None:
	            last_colors = deepcopy(colors)

	        # my mesh flow weight by nearest vertexs
	        if fixed_v is not None and fixed_f is not None and level == 1:
	            mesh_v += _anchor_flow_offsets(mesh_v, fixed_v, fixed_f)

	        mesh_v = torch.tensor(mesh_v, dtype=torch.float32)
	        mesh_f = torch.tensor(mesh_f)

	        new_mesh, simp_v, simp_f = geo_refine(mesh_v, mesh_f, colors, normals, fixed_v=fixed_v, fixed_f=fixed_f)

	        # Same flow applied to the refined mesh. This stays best-effort, but a
	        # failure is now reported instead of being silently discarded.
	        try:
	            if fixed_v is not None and fixed_f is not None and level != 0:
	                new_mesh_v = new_mesh.vertices.copy()
	                new_mesh_v += _anchor_flow_offsets(new_mesh_v, fixed_v, fixed_f)
	                # replace new_mesh verts with new_mesh_v
	                new_mesh.vertices = new_mesh_v
	        except Exception as e:
	            print(f"infer_refine: vertex flow skipped for level {level}: {e!r}", file=sys.stderr)

	        # Accumulate the simplified geometry as the fixed mesh for later levels.
	        if fixed_v is None:
	            fixed_v, fixed_f = simp_v, simp_f
	        else:
	            fixed_f = torch.cat([fixed_f, simp_f + fixed_v.shape[0]], dim=0)
	            fixed_v = torch.cat([fixed_v, simp_v], dim=0)

	        mesh_list.append(new_mesh)

	        # For level 2 the per-mesh export uses the simplified geometry, while
	        # the refined mesh appended above still goes into the combined scene.
	        if level == 2:
	            new_mesh = trimesh.Trimesh(simp_v.cpu().numpy(), simp_f.cpu().numpy(), process=False)

	        refined_path = meshes[name_idx].replace('.obj', '_refined.glb')
	        new_mesh.export(refined_path)
	        results.append(refined_path)
	        _set_glb_material_factors(refined_path)

	    # save whole mesh
	    whole_path = meshes[name_idx].replace('.obj', '_refined_whole.glb')
	    trimesh.Scene(mesh_list).export(whole_path)
	    results.append(whole_path)
	    _set_glb_material_factors(whole_path)

	    return results

	# Stage 3 (SLRM) model setup; runs once when this module is imported.
	config_slrm = {
	    'config_path': './configs/mesh-slrm-infer.yaml'
	}
	infer_slrm_config_path = config_slrm['config_path']
	infer_slrm_config = OmegaConf.load(infer_slrm_config_path)
	# Config file name without its '.yaml' extension.
	infer_slrm_config_name = os.path.basename(infer_slrm_config_path).replace('.yaml', '')
	infer_slrm_model_config = infer_slrm_config.model_config
	infer_slrm_infer_config = infer_slrm_config.infer_config
	infer_slrm_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	infer_slrm_model = instantiate_from_config(infer_slrm_model_config)
	# Weights are loaded onto the CPU first and applied with strict=False.
	# NOTE(review): torch.load unpickles the file — only point model_path at trusted checkpoints.
	state_dict = torch.load(infer_slrm_infer_config.model_path, map_location='cpu')
	infer_slrm_model.load_state_dict(state_dict, strict=False)
	infer_slrm_model = infer_slrm_model.to(infer_slrm_device)
	infer_slrm_model.init_flexicubes_geometry(infer_slrm_device, fovy=30.0, is_ortho=infer_slrm_model.is_ortho)
	infer_slrm_model = infer_slrm_model.eval()

	@spaces.GPU
	def infer_slrm_gen(imgs):
	    """Load the view images from disk and reconstruct meshes with ``infer_slrm_make3d``.

	    Args:
	        imgs: Iterable whose items carry an image file path at index 0.

	    Returns:
	        The mesh paths from ``infer_slrm_make3d`` reordered as items 1-3
	        followed by item 0.
	    """
	    frames = []
	    for entry in imgs:
	        loaded = cv2.imread(entry[0])
	        frames.append(loaded[:, :, ::-1])  # reverse the channel order
	    batch = np.stack(frames, axis=0).astype(np.float32) / 255.0
	    # channels-last -> channels-first, e.g. (6, 3, 1024, 1024)
	    batch = torch.from_numpy(batch).permute(0, 3, 1, 2).contiguous().float()
	    paths = infer_slrm_make3d(batch)
	    return paths[1:4] + paths[0:1]

	@spaces.GPU
	def infer_slrm_make3d(images):
	    """Compute triplanes for ``images`` and extract four meshes from them.

	    Args:
	        images: Image tensor; a batch axis is prepended and the images are
	            resized to 320x320 and clamped to [0, 1].

	    Returns:
	        List of four ``.obj`` paths ``<tmp>_0.obj`` .. ``<tmp>_3.obj`` extracted
	        at levels 0, 3, 4 and 2 respectively.
	    """
	    input_cameras = torch.tensor(np.load('slrm/cameras.npy')).to(device)

	    images = images.unsqueeze(0).to(device)
	    # interpolation=3 is presumably the bicubic mode code — TODO confirm.
	    images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)

	    # Only the unique name is needed; close the handle immediately instead of
	    # leaving it open for the life of the process.
	    with tempfile.NamedTemporaryFile(suffix=".obj", delete=False) as tmp:
	        mesh_fpath = tmp.name
	    print(mesh_fpath)
	    # Split off the extension so the index is inserted before it, without
	    # touching any other ".obj" that might appear in the path.
	    stem, ext = os.path.splitext(mesh_fpath)

	    with torch.no_grad():
	        # get triplane
	        planes = infer_slrm_model.forward_planes(images, input_cameras.float())

	        # get mesh
	        mesh_glb_fpaths = []
	        for j, level in enumerate([0, 3, 4, 2]):
	            mesh_glb_fpath = infer_slrm_make_mesh(f'{stem}_{j}{ext}', planes, level=level)
	            mesh_glb_fpaths.append(mesh_glb_fpath)

	    return mesh_glb_fpaths

	@spaces.GPU
	def infer_slrm_make_mesh(mesh_fpath, planes, level=None, use_texture_map=False):
	    """Extract one mesh from ``planes`` at ``level`` and write it to ``mesh_fpath``.

	    Args:
	        mesh_fpath: Output path handed to ``save_obj`` / ``save_obj_with_mtl``.
	        planes: Triplane features passed to ``extract_mesh``.
	        level: Level passed to ``extract_mesh``; for level 2 the colours
	            (or texture map) are replaced by the constant 127.
	        use_texture_map: Write a textured OBJ instead of a vertex-coloured one.

	    Returns:
	        ``mesh_fpath``.
	    """
	    with torch.no_grad():
	        # get mesh
	        mesh_out = infer_slrm_model.extract_mesh(
	            planes,
	            use_texture_map=use_texture_map,
	            levels=torch.tensor([level]).to(device),
	            **infer_slrm_infer_config,
	        )

	        flatten_colors = level == 2
	        if not use_texture_map:
	            vertices, faces, vertex_colors = mesh_out
	            vertices = vertices[:, [1, 2, 0]]
	            if flatten_colors:
	                # fill all vertex_colors with 127
	                vertex_colors = np.ones_like(vertex_colors) * 127
	            save_obj(vertices, faces, vertex_colors, mesh_fpath)
	        else:
	            vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
	            vertices = vertices[:, [1, 2, 0]]
	            tex_map = tex_map.permute(1, 2, 0).data.cpu().numpy()
	            if flatten_colors:
	                # fill the whole texture map with 127
	                tex_map = np.ones_like(tex_map) * 127
	            save_obj_with_mtl(
	                vertices.data.cpu().numpy(),
	                uvs.data.cpu().numpy(),
	                faces.data.cpu().numpy(),
	                mesh_tex_idx.data.cpu().numpy(),
	                tex_map,
	                mesh_fpath
	            )

	    return mesh_fpath


	# Stage 2 (multiview) setup; runs once when this module is imported.
	parser = argparse.ArgumentParser()
	parser.add_argument("--seed", type=int, default=42)
	parser.add_argument("--num_views", type=int, default=6)
	parser.add_argument("--num_levels", type=int, default=3)
	parser.add_argument("--pretrained_path", type=str, default='./ckpt/StdGEN-multiview-1024')
	parser.add_argument("--height", type=int, default=1024)
	parser.add_argument("--width", type=int, default=576)
	# This runs at import time inside whatever process imports the module, so
	# options belonging to the host application are ignored rather than making
	# argparse exit the process.
	infer_multiview_cfg = parser.parse_known_args()[0]
	infer_multiview_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	infer_multiview_pipeline = StableUnCLIPImg2ImgPipeline.from_pretrained(
	    infer_multiview_cfg.pretrained_path,
	    torch_dtype=torch.float16,)
	if torch.cuda.is_available():
	    infer_multiview_pipeline.to(infer_multiview_device)
	print(f"Era3D Using device!!!!!!!!!!!!: {infer_multiview_device}", file=sys.stderr)

	infer_multiview_results = {}

	# Preprocessing for the conditioning image: resize, centre-crop to
	# (height, width), convert to a tensor and rescale from [0, 1] to [-1, 1].
	infer_multiview_image_transforms = [transforms.Resize(int(max(infer_multiview_cfg.height, infer_multiview_cfg.width))),
	    transforms.CenterCrop((infer_multiview_cfg.height, infer_multiview_cfg.width)),
	    transforms.ToTensor(),
	    transforms.Lambda(lambda x: x * 2. - 1),
	]
	infer_multiview_image_transforms = transforms.Compose(infer_multiview_image_transforms)

	# Precomputed prompt embeddings for the normal and colour branches.
	# NOTE(review): torch.load unpickles these files — keep them trusted.
	prompt_embeds_path = './multiview/fixed_prompt_embeds_6view'
	infer_multiview_normal_text_embeds = torch.load(f'{prompt_embeds_path}/normal_embeds.pt')
	infer_multiview_color_text_embeds = torch.load(f'{prompt_embeds_path}/clr_embeds.pt')
	infer_multiview_total_views = infer_multiview_cfg.num_views


	@spaces.GPU
	def process_im(im):
	    """Apply the module-level ``infer_multiview_image_transforms`` to ``im`` and return the result."""
	    return infer_multiview_image_transforms(im)

	@spaces.GPU(duration=150)
	def infer_multiview_gen(img, seed, num_levels):
	    """Generate multiview results for one conditioning image.

	    Args:
	        img: Image accepted by ``process_im``.
	        seed: Passed to ``set_seed`` and forwarded to ``run_multiview_infer``.
	        num_levels: Forwarded to ``run_multiview_infer``.

	    Returns:
	        The dict returned by ``run_multiview_infer``.
	    """
	    set_seed(seed)

	    view = process_im(img)
	    # The same conditioning image is repeated once per view.
	    cond_views = torch.stack([view] * infer_multiview_total_views, dim=0)
	    # Every entry gets a leading batch axis of size 1.
	    data = {
	        "image_cond_rgb": cond_views[None, ...],
	        "normal_prompt_embeddings": infer_multiview_normal_text_embeds[None, ...],
	        "color_prompt_embeddings": infer_multiview_color_text_embeds[None, ...],
	    }

	    return run_multiview_infer(data, infer_multiview_pipeline, infer_multiview_cfg, num_levels=num_levels, seed=seed)

	# Stage 1 (canonicalization) configuration; loaded once at import time.
	infer_canonicalize_config = {
	    'config_path': './configs/canonicalization-infer.yaml',
	}
	infer_canonicalize_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
	# Report the selected device on stderr.
	import sys
	print(f"Using device!!!!!!!!!!!!: {infer_canonicalize_device}", file=sys.stderr)

	infer_canonicalize_config_path = infer_canonicalize_config['config_path']
	infer_canonicalize_loaded_config = OmegaConf.load(infer_canonicalize_config_path)


	def infer_canonicalize_setup(
	    validation: Dict,
	    pretrained_model_path: str,
	    local_crossattn: bool = True,
	    unet_from_pretrained_kwargs=None,
	    unet_condition_type=None,
	    use_noise=True,
	    noise_d=256,
	    timestep: int = 40,
	    width_input: int = 640,
	    height_input: int = 1024,
	):
	    """Return the canonicalization settings as a fixed-order 10-tuple.

	    The arguments are passed through unchanged. The tuple order is:
	    ``(width_input, height_input, timestep, use_noise, noise_d, validation,
	    unet_condition_type, pretrained_model_path, local_crossattn,
	    unet_from_pretrained_kwargs)``.
	    """
	    return (
	        width_input,
	        height_input,
	        timestep,
	        use_noise,
	        noise_d,
	        validation,
	        unet_condition_type,
	        pretrained_model_path,
	        local_crossattn,
	        unet_from_pretrained_kwargs,
	    )

	# Unpack the loaded config into module-level settings via `infer_canonicalize_setup`.
	infer_canonicalize_width_input, infer_canonicalize_height_input, infer_canonicalize_timestep, infer_canonicalize_use_noise, infer_canonicalize_noise_d, infer_canonicalize_validation, infer_canonicalize_unet_condition_type, infer_canonicalize_pretrained_model_path, infer_canonicalize_local_crossattn, infer_canonicalize_unet_from_pretrained_kwargs = infer_canonicalize_setup(**infer_canonicalize_loaded_config)

	# Load every pipeline component from its subfolder of the pretrained model path.
	# NOTE(review): `unet_from_pretrained_kwargs` defaults to None in
	# `infer_canonicalize_setup`; the `**` unpacking below needs the config to supply a mapping.
	infer_canonicalize_tokenizer = CLIPTokenizer.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="tokenizer")
	infer_canonicalize_text_encoder = CLIPTextModel.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="text_encoder")
	infer_canonicalize_image_encoder = CLIPVisionModelWithProjection.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="image_encoder")
	infer_canonicalize_feature_extractor = CLIPImageProcessor()
	infer_canonicalize_vae = AutoencoderKL.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="vae")
	infer_canonicalize_unet = UNetMV2DConditionModel.from_pretrained_2d(infer_canonicalize_pretrained_model_path, subfolder="unet", local_crossattn=infer_canonicalize_local_crossattn, **infer_canonicalize_unet_from_pretrained_kwargs)
	infer_canonicalize_ref_unet = UNetMV2DRefModel.from_pretrained_2d(infer_canonicalize_pretrained_model_path, subfolder="ref_unet", local_crossattn=infer_canonicalize_local_crossattn, **infer_canonicalize_unet_from_pretrained_kwargs)

	# Move the models to the module-wide device and cast them to `weight_dtype`.
	infer_canonicalize_text_encoder.to(device, dtype=weight_dtype)
	infer_canonicalize_image_encoder.to(device, dtype=weight_dtype)
	infer_canonicalize_vae.to(device, dtype=weight_dtype)
	infer_canonicalize_ref_unet.to(device, dtype=weight_dtype)
	infer_canonicalize_unet.to(device, dtype=weight_dtype)

	# Disable gradient tracking on the VAE and both UNets.
	infer_canonicalize_vae.requires_grad_(False)
	infer_canonicalize_ref_unet.requires_grad_(False)
	infer_canonicalize_unet.requires_grad_(False)

	# Assemble the pipeline, with its scheduler loaded from the "scheduler-zerosnr" subfolder.
	infer_canonicalize_noise_scheduler = DDIMScheduler.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="scheduler-zerosnr")
	infer_canonicalize_validation_pipeline = CanonicalizationPipeline(
	    vae=infer_canonicalize_vae, text_encoder=infer_canonicalize_text_encoder, tokenizer=infer_canonicalize_tokenizer, unet=infer_canonicalize_unet, ref_unet=infer_canonicalize_ref_unet,feature_extractor=infer_canonicalize_feature_extractor,image_encoder=infer_canonicalize_image_encoder,
	    scheduler=infer_canonicalize_noise_scheduler
	)
	infer_canonicalize_validation_pipeline.set_progress_bar_config(disable=True)


	@spaces.GPU
	def infer_canonicalize_gen(img_input, seed=0):
	    """Canonicalize one input image and return the result centred on a square RGBA canvas.

	    Args:
	        img_input: PIL image. If it has four channels and its alpha is 255
	            everywhere it is converted to RGB before inference.
	        seed: Seed forwarded to ``inference``.

	    Returns:
	        Square RGBA PIL image whose side is the larger dimension of the
	        ``inference`` output, with that output pasted in the centre.
	    """
	    pixels = np.array(img_input)
	    fully_opaque_rgba = pixels.shape[-1] == 4 and pixels[..., 3].min() == 255
	    if fully_opaque_rgba:
	        # convert to RGB
	        img_input = img_input.convert("RGB")

	    img_output = inference(
	        infer_canonicalize_validation_pipeline,
	        img_input,
	        infer_canonicalize_vae,
	        infer_canonicalize_feature_extractor,
	        infer_canonicalize_image_encoder,
	        infer_canonicalize_unet,
	        infer_canonicalize_ref_unet,
	        infer_canonicalize_tokenizer,
	        infer_canonicalize_text_encoder,
	        infer_canonicalize_pretrained_model_path,
	        infer_canonicalize_validation,
	        infer_canonicalize_width_input,
	        infer_canonicalize_height_input,
	        infer_canonicalize_unet_condition_type,
	        use_noise=infer_canonicalize_use_noise,
	        noise_d=infer_canonicalize_noise_d,
	        crop=True,
	        seed=seed,
	        timestep=infer_canonicalize_timestep,
	    )

	    side = max(img_output.width, img_output.height)
	    canvas = Image.new("RGBA", (side, side))
	    offset = ((side - img_output.width) // 2, (side - img_output.height) // 2)
	    canvas.paste(img_output, offset)

	    return canvas