import spaces

# Standard library
import argparse
import glob
import io
import json
import os
import random
import tempfile
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Third-party
import cv2
import imageio
import matplotlib.pyplot as plt
import numpy as np
import onnxruntime as rt
import scipy
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import torchvision.transforms.functional as TF
import trimesh
import pytorch3d
from diffusers import AutoencoderKL, DDIMScheduler
from diffusers.utils import check_min_version
from einops import rearrange, repeat
from huggingface_hub import hf_hub_download, list_repo_files
from omegaconf import DictConfig, OmegaConf
from PIL import Image
from pytorch3d.structures import Meshes
from pytorch_lightning import seed_everything
from rm_anime_bg.cli import get_mask, SCALE
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
from sklearn.neighbors import KDTree
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.transforms import v2
from torchvision.utils import make_grid, save_image
from tqdm.auto import tqdm
from transformers import (
    CLIPImageProcessor,
    CLIPTextModel,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
)

# Project modules
from canonicalize.models.unet_mv2d_condition import UNetMV2DConditionModel
from canonicalize.models.unet_mv2d_ref import UNetMV2DRefModel
from canonicalize.pipeline_canonicalize import CanonicalizationPipeline
from multiview.pipeline_multiclass import StableUnCLIPImg2ImgPipeline
from refine.func import make_star_cameras_orthographic
from refine.mesh_refine import geo_refine
from refine.render import NormalsRenderer, calc_vertex_normals
from slrm.utils.camera_util import FOV_to_intrinsics, get_circular_camera_poses
from slrm.utils.infer_util import images_to_video
from slrm.utils.mesh_util import save_glb, save_obj
from slrm.utils.train_util import instantiate_from_config

# Note: accelerate.utils.set_seed is intentionally not imported; the local
# set_seed defined below is the one used throughout this module.
check_min_version("0.24.0")

weight_dtype = torch.float16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
VIEWS = ['front', 'front_right', 'right', 'back', 'left', 'front_left']


def set_seed(seed):
    """Seed python, numpy and torch RNGs for reproducible sampling."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# Anime background-removal model (ONNX); prefer the CUDA provider when available.
session_infer_path = hf_hub_download(
    repo_id="skytnt/anime-seg", filename="isnetis.onnx",
)
providers: list[str] = ["CPUExecutionProvider"]
if "CUDAExecutionProvider" in rt.get_available_providers():
    providers = ["CUDAExecutionProvider"]
bkg_remover_session_infer = rt.InferenceSession(
    session_infer_path, providers=providers,
)

def remove_background(
    img,
    alpha_min: float,
    alpha_max: float,
) -> Image.Image:
    """Matte the subject with the anime-seg model; alpha below/above the thresholds is clamped to 0/1."""
    img = np.array(img)
    mask = get_mask(bkg_remover_session_infer, img)
    mask[mask < alpha_min] = 0.0
    mask[mask > alpha_max] = 1.0
    img_after = (mask * img).astype(np.uint8)
    mask = (mask * SCALE).astype(np.uint8)
    img_after = np.concatenate([img_after, mask], axis=2, dtype=np.uint8)
    return Image.fromarray(img_after)
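
# Usage sketch (hypothetical path; the thresholds mirror the call in `inference` below):
#   raw = Image.open("character.png").convert("RGB")
#   rgba = remove_background(raw, alpha_min=0.1, alpha_max=0.9)  # RGBA PIL image
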
def process_image(image, totensor, width, height):
    assert image.mode == "RGBA"

    # Crop to the bounding box of non-transparent pixels
    # (+1 because PIL crop boxes are right/bottom-exclusive).
    non_transparent = np.nonzero(np.array(image)[..., 3])
    min_x, max_x = non_transparent[1].min(), non_transparent[1].max()
    min_y, max_y = non_transparent[0].min(), non_transparent[0].max()
    image = image.crop((min_x, min_y, max_x + 1, max_y + 1))

    # Paste centered onto a canvas with a 1.2x margin at the target aspect ratio.
    max_dim = max(image.width, image.height)
    max_height = int(max_dim * 1.2)
    max_width = int(max_dim / (height / width) * 1.2)
    new_image = Image.new("RGBA", (max_width, max_height))
    left = (max_width - image.width) // 2
    top = (max_height - image.height) // 2
    new_image.paste(image, (left, top))
    image = new_image.resize((width, height), resample=Image.BICUBIC)

    # Composite over a white background.
    image = np.array(image).astype(np.float32) / 255.
    assert image.shape[-1] == 4  # RGBA
    alpha = image[..., 3:4]
    bg_color = np.array([1., 1., 1.], dtype=np.float32)
    image = image[..., :3] * alpha + bg_color * (1 - alpha)
    return totensor(image)
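
# Shape check (sketch): with the Stage-1 defaults width=640, height=1024 the canvas is
# 1.2*max_dim tall and 0.75*max_dim wide, i.e. the same 1024:640 aspect ratio as the
# output, so the final resize introduces no distortion.
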
def inference(validation_pipeline, input_image, vae, feature_extractor, image_encoder, unet, ref_unet, tokenizer,
              text_encoder, pretrained_model_path, validation, val_width, val_height, unet_condition_type,
              use_noise=True, noise_d=256, crop=False, seed=100, timestep=20):
    set_seed(seed)
    generator = torch.Generator(device=device).manual_seed(seed)

    totensor = transforms.ToTensor()

    prompts = "high quality, best quality"
    prompt_ids = tokenizer(
        prompts, max_length=tokenizer.model_max_length, padding="max_length", truncation=True,
        return_tensors="pt"
    ).input_ids[0]

    # (B*Nv, 3, H, W)
    B = 1
    if input_image.mode != "RGBA":
        # Remove the background first so process_image receives an RGBA image.
        input_image = remove_background(input_image, 0.1, 0.9)
    imgs_in = process_image(input_image, totensor, val_width, val_height)
    imgs_in = rearrange(imgs_in.unsqueeze(0).unsqueeze(0), "B Nv C H W -> (B Nv) C H W")

    with torch.autocast('cuda' if torch.cuda.is_available() else 'cpu', dtype=weight_dtype):
        imgs_in = imgs_in.to(device=device)
        # B*Nv images
        out = validation_pipeline(prompt=prompts, image=imgs_in.to(weight_dtype), generator=generator,
                                  num_inference_steps=timestep, prompt_ids=prompt_ids,
                                  height=val_height, width=val_width, unet_condition_type=unet_condition_type,
                                  use_noise=use_noise, **validation,)
        out = rearrange(out, "B C f H W -> (B f) C H W", f=1)

    # Round-trip through an in-memory PNG to hand back a plain PIL image.
    img_buf = io.BytesIO()
    save_image(out[0], img_buf, format='PNG')
    img_buf.seek(0)
    img = Image.open(img_buf)

    torch.cuda.empty_cache()
    return img
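
# Usage sketch: the components are the ones InferCanonicalAPI.setup wires up below,
# and 640x1024 matches its width_input/height_input defaults (the path is hypothetical):
#   img = Image.open("input.png")
#   canonical = inference(pipe, img, vae, feat_ext, img_enc, unet, ref_unet, tok, txt_enc,
#                         model_path, validation_cfg, 640, 1024, unet_condition_type,
#                         seed=0, timestep=40)
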
######### Multi View Part #############

def tensor_to_numpy(tensor):
    """CHW float tensor in [0, 1] -> HWC uint8 numpy array."""
    return tensor.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()

@dataclass
class TestConfig:
    pretrained_model_name_or_path: str
    pretrained_unet_path: Optional[str]
    revision: Optional[str]
    validation_dataset: Dict
    save_dir: str
    seed: Optional[int]
    validation_batch_size: int
    dataloader_num_workers: int
    save_mode: str
    local_rank: int

    pipe_kwargs: Dict
    pipe_validation_kwargs: Dict
    unet_from_pretrained_kwargs: Dict
    validation_grid_nrow: int
    camera_embedding_lr_mult: float

    num_views: int
    camera_embedding_type: str
    pred_type: str

    regress_elevation: bool
    regress_focal_length: bool
    enable_xformers_memory_efficient_attention: bool

    cond_on_normals: bool
    cond_on_colors: bool

    # Referenced by run_multiview_infer / load_multiview_pipeline:
    pretrained_path: str
    height: int
    width: int

# Alias kept for call sites that use the multiview naming.
convert_to_numpy = tensor_to_numpy


def save_image_pil(tensor):
    # Renamed from save_image so it no longer shadows torchvision.utils.save_image,
    # which `inference` above relies on.
    ndarr = convert_to_numpy(tensor)
    return save_image_numpy(ndarr)


def save_image_numpy(ndarr):
    im = Image.fromarray(ndarr)
    # Pad to a white square ...
    if im.size[0] != im.size[1]:
        size = max(im.size)
        new_im = Image.new("RGB", (size, size))
        new_im.paste((255, 255, 255), (0, 0, size, size))
        new_im.paste(im, ((size - im.size[0]) // 2, (size - im.size[1]) // 2))
        im = new_im
    # ... then resize to 1024x1024.
    im = im.resize((1024, 1024), Image.LANCZOS)
    return im
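
# Quick check (sketch): any CHW tensor round-trips to a square 1024x1024 PIL image.
#   save_image_pil(torch.rand(3, 576, 1024)).size  # -> (1024, 1024)
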
def run_multiview_infer(data, pipeline, cfg: TestConfig, num_levels=3):
    if cfg.seed is None:
        generator = None
    else:
        generator = torch.Generator(device=pipeline.unet.device).manual_seed(cfg.seed)

    images_cond = []
    results = {}

    torch.cuda.empty_cache()
    images_cond.append(data['image_cond_rgb'][:, 0].cuda())
    imgs_in = torch.cat([data['image_cond_rgb']] * 2, dim=0).cuda()
    num_views = imgs_in.shape[1]
    imgs_in = rearrange(imgs_in, "B Nv C H W -> (B Nv) C H W")  # (B*Nv, 3, H, W)

    target_h, target_w = imgs_in.shape[-2], imgs_in.shape[-1]

    normal_prompt_embeddings, clr_prompt_embeddings = data['normal_prompt_embeddings'].cuda(), data['color_prompt_embeddings'].cuda()
    prompt_embeddings = torch.cat([normal_prompt_embeddings, clr_prompt_embeddings], dim=0)
    prompt_embeddings = rearrange(prompt_embeddings, "B Nv N C -> (B Nv) N C")

    # B*Nv images
    unet_out = pipeline(
        imgs_in, None, prompt_embeds=prompt_embeddings,
        generator=generator, guidance_scale=3.0, output_type='pt', num_images_per_prompt=1,
        height=cfg.height, width=cfg.width,
        num_inference_steps=40, eta=1.0,
        num_levels=num_levels,
    )

    for level in range(num_levels):
        out = unet_out[level].images
        bsz = out.shape[0] // 2

        # The first half of the batch are normals, the second half colors.
        normals_pred = out[:bsz]
        images_pred = out[bsz:]

        # When only two levels are requested, store them under keys 1 and 2.
        if num_levels == 2:
            results[level + 1] = {'normals': [], 'images': []}
        else:
            results[level] = {'normals': [], 'images': []}

        for i in range(bsz // num_views):
            img_in_ = images_cond[-1][i].to(out.device)
            for j in range(num_views):
                view = VIEWS[j]
                idx = i * num_views + j
                normal = normals_pred[idx]
                color = images_pred[idx]

                # Save color and normal as padded 1024x1024 PIL images.
                new_normal = save_image_pil(normal)
                new_color = save_image_pil(color)

                if num_levels == 2:
                    results[level + 1]['normals'].append(new_normal)
                    results[level + 1]['images'].append(new_color)
                else:
                    results[level]['normals'].append(new_normal)
                    results[level]['images'].append(new_color)

    torch.cuda.empty_cache()
    return results
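
# The result maps level -> {'normals': [...], 'images': [...]}, each list holding
# `num_views` PIL images in VIEWS order, e.g.:
#   results = run_multiview_infer(data, pipeline, cfg, num_levels=3)
#   front_rgb = results[0]['images'][VIEWS.index('front')]
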
def load_multiview_pipeline(cfg):
    pipeline = StableUnCLIPImg2ImgPipeline.from_pretrained(
        cfg.pretrained_path,
        torch_dtype=torch.float16,
    )
    pipeline.unet.enable_xformers_memory_efficient_attention()
    if torch.cuda.is_available():
        pipeline.to(device)
    return pipeline

class InferAPI:
    def __init__(self,
                 canonical_configs,
                 multiview_configs,
                 slrm_configs,
                 refine_configs):
        self.canonical_configs = canonical_configs
        self.multiview_configs = multiview_configs
        self.slrm_configs = slrm_configs
        self.refine_configs = refine_configs

        # Fetch all checkpoints into ./ckpt, skipping files already downloaded there.
        repo_id = "hyz317/StdGEN"
        all_files = list_repo_files(repo_id, revision="main")
        for file in all_files:
            if os.path.exists(os.path.join("./ckpt", file)):
                continue
            hf_hub_download(repo_id, file, local_dir="./ckpt")

        self.canonical_infer = InferCanonicalAPI(self.canonical_configs)
        # self.multiview_infer = InferMultiviewAPI(self.multiview_configs)
        # self.slrm_infer = InferSlrmAPI(self.slrm_configs)
        # self.refine_infer = InferRefineAPI(self.refine_configs)

    def genStage1(self, img, seed):
        return self.canonical_infer.gen(img, seed)

    def genStage2(self, img, seed, num_levels):
        return self.multiview_infer.gen(img, seed, num_levels)

    def genStage3(self, img):
        return self.slrm_infer.gen(img)

    def genStage4(self, meshes, imgs):
        return self.refine_infer.refine(meshes, imgs)
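
# Staged-pipeline sketch (config dicts are placeholders; only Stage 1 is instantiated
# above, the later stages are commented out in __init__ and would need enabling):
#   api = InferAPI({"config_path": "configs/canonical.yaml"}, {}, {}, {})
#   canonical = api.genStage1(Image.open("input.png"), seed=0)
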

############## Refine ##############

def fix_vert_color_glb(mesh_path):
    # Give the first primitive a neutral, double-sided PBR material so the baked
    # vertex colors display as-is.
    from pygltflib import GLTF2, Material, PbrMetallicRoughness
    obj1 = GLTF2().load(mesh_path)
    obj1.meshes[0].primitives[0].material = 0
    obj1.materials.append(Material(
        pbrMetallicRoughness=PbrMetallicRoughness(
            baseColorFactor=[1.0, 1.0, 1.0, 1.0],
            metallicFactor=0.,
            roughnessFactor=1.0,
        ),
        emissiveFactor=[0.0, 0.0, 0.0],
        doubleSided=True,
    ))
    obj1.save(mesh_path)


def srgb_to_linear(c_srgb):
    c_linear = np.where(c_srgb <= 0.04045, c_srgb / 12.92, ((c_srgb + 0.055) / 1.055) ** 2.4)
    return c_linear.clip(0, 1.)
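
# Example: mid-gray sRGB 0.5 maps to ~0.214 in linear space.
#   srgb_to_linear(np.array([0.5]))  # -> array([~0.214])
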
def save_py3dmesh_with_trimesh_fast(meshes: Meshes, save_glb_path, apply_sRGB_to_LinearRGB=True):
    # Convert from a pytorch3d Meshes object to a trimesh mesh.
    vertices = meshes.verts_packed().cpu().float().numpy()
    triangles = meshes.faces_packed().cpu().long().numpy()
    np_color = meshes.textures.verts_features_packed().cpu().float().numpy()
    if save_glb_path.endswith(".glb"):
        # Rotate 180 degrees around +Y (negate X and Z).
        vertices[:, [0, 2]] = -vertices[:, [0, 2]]

    if apply_sRGB_to_LinearRGB:
        np_color = srgb_to_linear(np_color)
    assert vertices.shape[0] == np_color.shape[0]
    assert np_color.shape[1] == 3
    assert 0 <= np_color.min() and np_color.max() <= 1.001, f"min={np_color.min()}, max={np_color.max()}"
    np_color = np.clip(np_color, 0, 1)

    mesh = trimesh.Trimesh(vertices=vertices, faces=triangles, vertex_colors=np_color)
    mesh.remove_unreferenced_vertices()
    # Save the mesh, patching the material when exporting GLB.
    mesh.export(save_glb_path)
    if save_glb_path.endswith(".glb"):
        fix_vert_color_glb(save_glb_path)
    print(f"saved to {save_glb_path}")

def calc_horizontal_offset(target_img, source_img):
    target_mask = target_img.astype(np.float32).sum(axis=-1) > 750
    source_mask = source_img.astype(np.float32).sum(axis=-1) > 750
    best_overlap = -1
    best_offset_value = 0
    for offset in range(-200, 200):
        offset_mask = np.roll(source_mask, offset, axis=1)
        overlap = (target_mask & offset_mask).sum()
        if overlap > best_overlap:
            best_overlap = overlap
            best_offset_value = offset
    return best_offset_value


def calc_horizontal_offset2(target_mask, source_img):
    source_mask = source_img.astype(np.float32).sum(axis=-1) > 750
    best_overlap = -1
    best_offset_value = 0
    for offset in range(-200, 200):
        offset_mask = np.roll(source_mask, offset, axis=1)
        overlap = (target_mask & offset_mask).sum()
        if overlap > best_overlap:
            best_overlap = overlap
            best_offset_value = offset
    return best_offset_value
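
# Both helpers scan integer shifts in [-200, 200) and return the horizontal shift whose
# rolled near-white mask (channel sum > 750) best overlaps the target, so that
# np.roll(color, offset, axis=1) re-centers a view against the previous level's render.
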

def get_distract_mask(generator, color_0, color_1, normal_0=None, normal_1=None, thres=0.25, ratio=0.50, outside_thres=0.10, outside_ratio=0.20):
    # Pixels where the two color (and optionally normal) images disagree.
    distract_area = np.abs(color_0 - color_1).sum(axis=-1) > thres
    if normal_0 is not None and normal_1 is not None:
        distract_area |= np.abs(normal_0 - normal_1).sum(axis=-1) > thres

    # Keep mid-sized connected components; label 0 (background) falls outside the
    # size bounds in practice.
    labeled_array, num_features = scipy.ndimage.label(distract_area)
    results = []
    random_sampled_points = []
    for i in range(num_features + 1):
        if np.sum(labeled_array == i) > 1000 and np.sum(labeled_array == i) < 100000:
            results.append((i, np.sum(labeled_array == i)))
            # Randomly sample a point inside the component.
            points = np.argwhere(labeled_array == i)
            random_sampled_points.append(points[np.random.randint(0, points.shape[0])])

    results = sorted(results, key=lambda x: x[1], reverse=True)
    distract_mask = np.zeros_like(distract_area)
    distract_bbox = np.zeros_like(distract_area)
    for i, _ in results:
        distract_mask |= labeled_array == i
        bbox = np.argwhere(labeled_array == i)
        min_x, min_y = bbox.min(axis=0)
        max_x, max_y = bbox.max(axis=0)
        distract_bbox[min_x:max_x, min_y:max_y] = 1

    # Validate the difference mask against SAM segments: keep segments that mostly
    # lie inside the difference area and mostly avoid the unchanged area.
    masks = generator.generate((color_1 * 255).astype(np.uint8))
    outside_area = np.abs(color_0 - color_1).sum(axis=-1) < outside_thres

    final_mask = np.zeros_like(distract_mask)
    for mask in masks:
        mask['segmentation'] = cv2.resize(mask['segmentation'].astype(np.float32), (1024, 1024)) > 0.5
        intersection = np.logical_and(mask['segmentation'], distract_mask).sum()
        total = mask['segmentation'].sum()
        iou = intersection / total
        outside_intersection = np.logical_and(mask['segmentation'], outside_area).sum()
        outside_iou = outside_intersection / total
        if iou > ratio and outside_iou < outside_ratio:
            final_mask |= mask['segmentation']

    # Fall back to the raw difference mask if the SAM segments cover too little of it.
    intersection = np.logical_and(final_mask, distract_mask).sum()
    total = distract_mask.sum()
    coverage = intersection / total if total > 0 else 0.
    if coverage < 0.8:
        final_mask = (distract_mask.copy() * 255).astype(np.uint8)
        # Close small holes: dilate, fill tiny components, then erode back.
        final_mask = cv2.dilate(final_mask, np.ones((3, 3), np.uint8), iterations=3)
        labeled_array_dilate, num_features_dilate = scipy.ndimage.label(final_mask)
        for i in range(num_features_dilate + 1):
            if np.sum(labeled_array_dilate == i) < 200:
                final_mask[labeled_array_dilate == i] = 255
        final_mask = cv2.erode(final_mask, np.ones((3, 3), np.uint8), iterations=3)
        final_mask = final_mask > 127

    return distract_mask, distract_bbox, random_sampled_points, final_mask
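
# Usage sketch (arrays are float RGB in [0, 1] at 1024x1024; `generator` is the SAM
# automatic mask generator built in InferRefineAPI below). `final_mask` marks regions
# where consecutive levels disagree and is forwarded to geo_refine as distract_mask:
#   _, bbox, _, changed = get_distract_mask(generator, prev_front, cur_front)
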

class InferRefineAPI:
    def __init__(self, config):
        self.sam = sam_model_registry["vit_h"](checkpoint="./ckpt/sam_vit_h_4b8939.pth").cuda()
        self.generator = SamAutomaticMaskGenerator(
            model=self.sam,
            points_per_side=64,
            pred_iou_thresh=0.80,
            stability_score_thresh=0.92,
            crop_n_layers=1,
            crop_n_points_downscale_factor=2,
            min_mask_region_area=100,
        )
        self.outside_ratio = 0.20
    def refine(self, meshes, imgs):
        fixed_v, fixed_f, fixed_t = None, None, None
        flow_vert, flow_vector = None, None
        last_colors, last_normals = None, None
        last_front_color, last_front_normal = None, None
        distract_mask = None

        mv, proj = make_star_cameras_orthographic(8, 1, r=1.2)
        mv = mv[[4, 3, 2, 0, 6, 5]]
        renderer = NormalsRenderer(mv, proj, (1024, 1024))

        results = []

        for name_idx, level in zip([2, 0, 1], [2, 1, 0]):
            mesh = trimesh.load(meshes[name_idx])
            new_mesh = mesh.split(only_watertight=False)
            new_mesh = [j for j in new_mesh if len(j.vertices) >= 300]
            mesh = trimesh.Scene(new_mesh).dump(concatenate=True)
            mesh_v, mesh_f = mesh.vertices, mesh.faces

            if last_colors is None:
                images = renderer.render(
                    torch.tensor(mesh_v, device='cuda').float(),
                    torch.ones_like(torch.from_numpy(mesh_v), device='cuda').float(),
                    torch.tensor(mesh_f, device='cuda'),
                )
                mask = (images[..., 3] < 0.9).cpu().numpy()

            colors, normals = [], []
            for i in range(6):
                color = np.array(imgs[level]['images'][i])
                normal = np.array(imgs[level]['normals'][i])

                # Horizontally re-center each view against the previous level
                # (or against the initial render mask on the first pass).
                if last_colors is not None:
                    offset = calc_horizontal_offset(np.array(last_colors[i]), color)
                else:
                    offset = calc_horizontal_offset2(mask[i], color)

                if offset != 0:
                    color = np.roll(color, offset, axis=1)
                    normal = np.roll(normal, offset, axis=1)

                color = Image.fromarray(color)
                normal = Image.fromarray(normal)
                colors.append(color)
                normals.append(normal)

            if last_front_color is not None and level == 0:
                original_mask, distract_bbox, _, distract_mask = get_distract_mask(self.generator, last_front_color, np.array(colors[0]).astype(np.float32) / 255.0, outside_ratio=self.outside_ratio)
            else:
                distract_mask = None
                distract_bbox = None

            last_front_color = np.array(colors[0]).astype(np.float32) / 255.0
            last_front_normal = np.array(normals[0]).astype(np.float32) / 255.0

            if last_colors is None:
                last_colors, last_normals = deepcopy(colors), deepcopy(normals)

            # Mesh flow: pull vertices toward the fixed anchor mesh along its normals,
            # weighted over the 25 nearest neighbours.
            if fixed_v is not None and fixed_f is not None and level == 1:
                fixed_v_cpu = fixed_v.cpu().numpy()
                kdtree_anchor = KDTree(fixed_v_cpu)
                kdtree_mesh_v = KDTree(mesh_v)
                _, idx_anchor = kdtree_anchor.query(mesh_v, k=1)
                _, idx_mesh_v = kdtree_mesh_v.query(mesh_v, k=25)
                idx_anchor = idx_anchor.squeeze()
                neighbors = torch.tensor(mesh_v).cuda()[idx_mesh_v]  # V, 25, 3
                # Distances to neighbours: [V, 25, 3] vs [V, 3] -> [V, 25].
                neighbor_dists = torch.norm(neighbors - torch.tensor(mesh_v).cuda()[:, None], dim=-1)
                neighbor_dists[neighbor_dists > 0.06] = 114514.  # effectively zero weight
                neighbor_weights = torch.exp(-neighbor_dists * 1.)
                neighbor_weights = neighbor_weights / neighbor_weights.sum(dim=1, keepdim=True)
                anchors = fixed_v[idx_anchor]  # V, 3
                anchor_normals = calc_vertex_normals(fixed_v, fixed_f)[idx_anchor]  # V, 3
                dis_anchor = torch.clamp(((anchors - torch.tensor(mesh_v).cuda()) * anchor_normals).sum(-1), min=0) + 0.01
                vec_anchor = dis_anchor[:, None] * anchor_normals  # V, 3
                vec_anchor = vec_anchor[idx_mesh_v]  # V, 25, 3
                weighted_vec_anchor = (vec_anchor * neighbor_weights[:, :, None]).sum(1)  # V, 3
                mesh_v += weighted_vec_anchor.cpu().numpy()

            mesh_v = torch.tensor(mesh_v, device='cuda', dtype=torch.float32)
            mesh_f = torch.tensor(mesh_f, device='cuda')

            new_mesh, simp_v, simp_f = geo_refine(mesh_v, mesh_f, colors, normals, fixed_v=fixed_v, fixed_f=fixed_f, distract_mask=distract_mask, distract_bbox=distract_bbox)

            # Same nearest-vertex mesh flow, applied to the refined mesh; failures
            # here are non-fatal, so the refined mesh is kept unflowed.
            try:
                if fixed_v is not None and fixed_f is not None and level != 0:
                    new_mesh_v = new_mesh.verts_packed().cpu().numpy()
                    fixed_v_cpu = fixed_v.cpu().numpy()
                    kdtree_anchor = KDTree(fixed_v_cpu)
                    kdtree_mesh_v = KDTree(new_mesh_v)
                    _, idx_anchor = kdtree_anchor.query(new_mesh_v, k=1)
                    _, idx_mesh_v = kdtree_mesh_v.query(new_mesh_v, k=25)
                    idx_anchor = idx_anchor.squeeze()
                    neighbors = torch.tensor(new_mesh_v).cuda()[idx_mesh_v]  # V, 25, 3
                    neighbor_dists = torch.norm(neighbors - torch.tensor(new_mesh_v).cuda()[:, None], dim=-1)
                    neighbor_dists[neighbor_dists > 0.06] = 114514.
                    neighbor_weights = torch.exp(-neighbor_dists * 1.)
                    neighbor_weights = neighbor_weights / neighbor_weights.sum(dim=1, keepdim=True)
                    anchors = fixed_v[idx_anchor]  # V, 3
                    anchor_normals = calc_vertex_normals(fixed_v, fixed_f)[idx_anchor]  # V, 3
                    dis_anchor = torch.clamp(((anchors - torch.tensor(new_mesh_v).cuda()) * anchor_normals).sum(-1), min=0) + 0.01
                    vec_anchor = dis_anchor[:, None] * anchor_normals  # V, 3
                    vec_anchor = vec_anchor[idx_mesh_v]  # V, 25, 3
                    weighted_vec_anchor = (vec_anchor * neighbor_weights[:, :, None]).sum(1)  # V, 3
                    new_mesh_v += weighted_vec_anchor.cpu().numpy()
                    # Replace new_mesh verts with new_mesh_v.
                    new_mesh = Meshes(verts=[torch.tensor(new_mesh_v, device='cuda')], faces=new_mesh.faces_list(), textures=new_mesh.textures)
            except Exception:
                pass

            notsimp_v, notsimp_f, notsimp_t = new_mesh.verts_packed(), new_mesh.faces_packed(), new_mesh.textures.verts_features_packed()

            if fixed_v is None:
                fixed_v, fixed_f = simp_v, simp_f
                complete_v, complete_f, complete_t = notsimp_v, notsimp_f, notsimp_t
            else:
                fixed_f = torch.cat([fixed_f, simp_f + fixed_v.shape[0]], dim=0)
                fixed_v = torch.cat([fixed_v, simp_v], dim=0)
                complete_f = torch.cat([complete_f, notsimp_f + complete_v.shape[0]], dim=0)
                complete_v = torch.cat([complete_v, notsimp_v], dim=0)
                complete_t = torch.cat([complete_t, notsimp_t], dim=0)

            if level == 2:
                # Export the base level with uniform mid-gray vertex colors.
                new_mesh = Meshes(verts=[new_mesh.verts_packed()], faces=[new_mesh.faces_packed()], textures=pytorch3d.renderer.mesh.textures.TexturesVertex(verts_features=[torch.ones_like(new_mesh.textures.verts_features_packed(), device=new_mesh.verts_packed().device) * 0.5]))

            save_py3dmesh_with_trimesh_fast(new_mesh, meshes[name_idx].replace('.obj', '_refined.obj'), apply_sRGB_to_LinearRGB=False)
            results.append(meshes[name_idx].replace('.obj', '_refined.obj'))

            # Save the whole mesh accumulated so far.
            save_py3dmesh_with_trimesh_fast(Meshes(verts=[complete_v], faces=[complete_f], textures=pytorch3d.renderer.mesh.textures.TexturesVertex(verts_features=[complete_t])), meshes[name_idx].replace('.obj', '_refined_whole.obj'), apply_sRGB_to_LinearRGB=False)
            results.append(meshes[name_idx].replace('.obj', '_refined_whole.obj'))

        return results
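
# `refine` consumes Stage-2 images plus the Stage-3 mesh paths, walking levels
# coarse-to-fine; each level writes a *_refined.obj plus a *_refined_whole.obj that
# concatenates everything refined so far, and the list of output paths is returned.
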

class InferSlrmAPI:
    def __init__(self, config):
        self.config_path = config['config_path']
        self.config = OmegaConf.load(self.config_path)
        self.config_name = os.path.basename(self.config_path).replace('.yaml', '')
        self.model_config = self.config.model_config
        self.infer_config = self.config.infer_config
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = instantiate_from_config(self.model_config)
        state_dict = torch.load(self.infer_config.model_path, map_location='cpu')
        self.model.load_state_dict(state_dict, strict=False)
        self.model = self.model.to(self.device)
        self.model.init_flexicubes_geometry(self.device, fovy=30.0, is_ortho=self.model.is_ortho)
        self.model = self.model.eval()

    def gen(self, imgs):
        # Load the six views as RGB float arrays in [0, 1].
        imgs = [cv2.imread(img[0])[:, :, ::-1] for img in imgs]
        imgs = np.stack(imgs, axis=0).astype(np.float32) / 255.0
        imgs = torch.from_numpy(imgs).permute(0, 3, 1, 2).contiguous().float()  # (6, 3, 1024, 1024)
        mesh_glb_fpaths = self.make3d(imgs)
        return mesh_glb_fpaths[1:4] + mesh_glb_fpaths[0:1]
    def make3d(self, images):
        input_cameras = torch.tensor(np.load('slrm/cameras.npy')).to(device)
        images = images.unsqueeze(0).to(device)
        images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)

        mesh_fpath = tempfile.NamedTemporaryFile(suffix=".obj", delete=False).name
        print(mesh_fpath)

        with torch.no_grad():
            # Get the triplane features once, then extract meshes at four level sets.
            planes = self.model.forward_planes(images, input_cameras.float())
            mesh_glb_fpaths = []
            for j in range(4):
                mesh_glb_fpath = self.make_mesh(mesh_fpath.replace(mesh_fpath[-4:], f'_{j}{mesh_fpath[-4:]}'), planes, level=[0, 3, 4, 2][j])
                mesh_glb_fpaths.append(mesh_glb_fpath)

        return mesh_glb_fpaths
    def make_mesh(self, mesh_fpath, planes, level=None):
        with torch.no_grad():
            # Extract the mesh at the requested level.
            mesh_out = self.model.extract_mesh(
                planes,
                use_texture_map=False,
                levels=torch.tensor([level]).to(device),
                **self.infer_config,
            )
            vertices, faces, vertex_colors = mesh_out
            vertices = vertices[:, [1, 2, 0]]

            if level == 2:
                # Fill all vertex colors with mid-gray (127).
                vertex_colors = np.ones_like(vertex_colors) * 127

            save_obj(vertices, faces, vertex_colors, mesh_fpath)

        return mesh_fpath
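
# Note: vertices are remapped to [y, z, x] axis order before export, and level-2 meshes
# get uniform gray (127) vertex colors, matching the constant 0.5-gray applied to
# level-2 exports in InferRefineAPI.refine.
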

class InferMultiviewAPI:
    def __init__(self, config):
        parser = argparse.ArgumentParser()
        parser.add_argument("--seed", type=int, default=42)
        parser.add_argument("--num_views", type=int, default=6)
        parser.add_argument("--num_levels", type=int, default=3)
        parser.add_argument("--pretrained_path", type=str, default='./ckpt/StdGEN-multiview-1024')
        parser.add_argument("--height", type=int, default=1024)
        parser.add_argument("--width", type=int, default=576)
        # Parse an empty argv so the host process's own CLI flags are ignored;
        # the defaults above act as the effective config.
        self.cfg = parser.parse_args([])

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.pipeline = load_multiview_pipeline(self.cfg)
        self.results = {}
        if torch.cuda.is_available():
            self.pipeline.to(device)

        self.image_transforms = [
            transforms.Resize(int(max(self.cfg.height, self.cfg.width))),
            transforms.CenterCrop((self.cfg.height, self.cfg.width)),
            transforms.ToTensor(),
            transforms.Lambda(lambda x: x * 2. - 1),  # scale to [-1, 1]
        ]
        self.image_transforms = transforms.Compose(self.image_transforms)

        prompt_embeds_path = './multiview/fixed_prompt_embeds_6view'
        self.normal_text_embeds = torch.load(f'{prompt_embeds_path}/normal_embeds.pt')
        self.color_text_embeds = torch.load(f'{prompt_embeds_path}/clr_embeds.pt')
        self.total_views = self.cfg.num_views

    def process_im(self, im):
        return self.image_transforms(im)

    def gen(self, img, seed, num_levels):
        set_seed(seed)
        data = {}

        cond_im_rgb = self.process_im(img)
        cond_im_rgb = torch.stack([cond_im_rgb] * self.total_views, dim=0)
        data["image_cond_rgb"] = cond_im_rgb[None, ...]
        data["normal_prompt_embeddings"] = self.normal_text_embeds[None, ...]
        data["color_prompt_embeddings"] = self.color_text_embeds[None, ...]

        results = run_multiview_infer(data, self.pipeline, self.cfg, num_levels=num_levels)
        for k in results:
            self.results[k] = results[k]
        return results
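
# Usage sketch (the config argument is currently unused by __init__; the input should
# be the square RGBA canonical image produced by Stage 1):
#   mv_api = InferMultiviewAPI({})
#   views = mv_api.gen(stage1_image, seed=42, num_levels=3)
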

class InferCanonicalAPI:
    def __init__(self, config):
        self.config = config
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.config_path = config['config_path']
        self.loaded_config = OmegaConf.load(self.config_path)

        self.setup(**self.loaded_config)

    def setup(self,
              validation: Dict,
              pretrained_model_path: str,
              local_crossattn: bool = True,
              unet_from_pretrained_kwargs=None,
              unet_condition_type=None,
              use_noise=True,
              noise_d=256,
              timestep: int = 40,
              width_input: int = 640,
              height_input: int = 1024,
              ):
        self.width_input = width_input
        self.height_input = height_input
        self.timestep = timestep
        self.use_noise = use_noise
        self.noise_d = noise_d
        self.validation = validation
        self.unet_condition_type = unet_condition_type
        self.pretrained_model_path = pretrained_model_path

        self.tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
        self.text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(pretrained_model_path, subfolder="image_encoder")
        self.feature_extractor = CLIPImageProcessor()
        self.vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
        self.unet = UNetMV2DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder="unet", local_crossattn=local_crossattn, **unet_from_pretrained_kwargs)
        self.ref_unet = UNetMV2DRefModel.from_pretrained_2d(pretrained_model_path, subfolder="ref_unet", local_crossattn=local_crossattn, **unet_from_pretrained_kwargs)

        self.text_encoder.to(device, dtype=weight_dtype)
        self.image_encoder.to(device, dtype=weight_dtype)
        self.vae.to(device, dtype=weight_dtype)
        self.ref_unet.to(device, dtype=weight_dtype)
        self.unet.to(device, dtype=weight_dtype)

        # Inference only: freeze everything.
        self.vae.requires_grad_(False)
        self.ref_unet.requires_grad_(False)
        self.unet.requires_grad_(False)

        self.noise_scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler-zerosnr")
        self.validation_pipeline = CanonicalizationPipeline(
            vae=self.vae, text_encoder=self.text_encoder, tokenizer=self.tokenizer, unet=self.unet,
            ref_unet=self.ref_unet, feature_extractor=self.feature_extractor, image_encoder=self.image_encoder,
            scheduler=self.noise_scheduler,
        )
        self.validation_pipeline.set_progress_bar_config(disable=True)

    def canonicalize(self, image, seed):
        return inference(
            self.validation_pipeline, image, self.vae, self.feature_extractor, self.image_encoder, self.unet, self.ref_unet, self.tokenizer, self.text_encoder,
            self.pretrained_model_path, self.validation, self.width_input, self.height_input, self.unet_condition_type,
            use_noise=self.use_noise, noise_d=self.noise_d, crop=True, seed=seed, timestep=self.timestep,
        )
    def gen(self, img_input, seed=0):
        if np.array(img_input).shape[-1] == 4 and np.array(img_input)[..., 3].min() == 255:
            # A fully opaque alpha channel carries no matte: convert to RGB so
            # `inference` re-runs background removal.
            img_input = img_input.convert("RGB")
        img_output = self.canonicalize(img_input, seed)

        # Pad the result to a square canvas.
        max_dim = max(img_output.width, img_output.height)
        new_image = Image.new("RGBA", (max_dim, max_dim))
        left = (max_dim - img_output.width) // 2
        top = (max_dim - img_output.height) // 2
        new_image.paste(img_output, (left, top))
        return new_image
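
# Minimal demo, guarded so importing this module (e.g. as a Space app) never runs it.
# All paths here are hypothetical placeholders.
if __name__ == "__main__":
    api = InferAPI(
        canonical_configs={"config_path": "configs/canonical.yaml"},  # hypothetical
        multiview_configs={}, slrm_configs={}, refine_configs={},
    )
    demo_img = Image.open("examples/input.png")  # hypothetical sample image
    api.genStage1(demo_img, seed=0).save("stage1_canonical.png")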