YulianSa committed
Commit def0065 · 1 Parent(s): 8d53de2
Files changed (2)
  1. infer_api.py +81 -75
  2. infer_api_bk.py +889 -0
infer_api.py CHANGED
@@ -367,13 +367,13 @@ class InferAPI:
367
  continue
368
  hf_hub_download(repo_id, file, local_dir="./ckpt")
369
 
370
- self.canonical_infer = InferCanonicalAPI(self.canonical_configs)
371
  # self.multiview_infer = InferMultiviewAPI(self.multiview_configs)
372
  # self.slrm_infer = InferSlrmAPI(self.slrm_configs)
373
  # self.refine_infer = InferRefineAPI(self.refine_configs)
374
 
375
  def genStage1(self, img, seed):
376
- return self.canonical_infer.gen(img, seed)
377
 
378
  def genStage2(self, img, seed, num_levels):
379
  return self.multiview_infer.gen(img, seed, num_levels)
@@ -811,79 +811,85 @@ class InferMultiviewAPI:
811
  return results
812
 
813
 
814
- class InferCanonicalAPI:
815
- def __init__(self, config):
816
- self.config = config
817
- self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
818
-
819
- self.config_path = config['config_path']
820
- self.loaded_config = OmegaConf.load(self.config_path)
821
-
822
- self.setup(**self.loaded_config)
823
-
824
- def setup(self,
825
- validation: Dict,
826
- pretrained_model_path: str,
827
- local_crossattn: bool = True,
828
- unet_from_pretrained_kwargs=None,
829
- unet_condition_type=None,
830
- use_noise=True,
831
- noise_d=256,
832
- timestep: int = 40,
833
- width_input: int = 640,
834
- height_input: int = 1024,
835
  ):
836
- self.width_input = width_input
837
- self.height_input = height_input
838
- self.timestep = timestep
839
- self.use_noise = use_noise
840
- self.noise_d = noise_d
841
- self.validation = validation
842
- self.unet_condition_type = unet_condition_type
843
- self.pretrained_model_path = pretrained_model_path
844
-
845
- self.tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
846
- self.text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
847
- self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(pretrained_model_path, subfolder="image_encoder")
848
- self.feature_extractor = CLIPImageProcessor()
849
- self.vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
850
- self.unet = UNetMV2DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder="unet", local_crossattn=local_crossattn, **unet_from_pretrained_kwargs)
851
- self.ref_unet = UNetMV2DRefModel.from_pretrained_2d(pretrained_model_path, subfolder="ref_unet", local_crossattn=local_crossattn, **unet_from_pretrained_kwargs)
852
-
853
- self.text_encoder.to(device, dtype=weight_dtype)
854
- self.image_encoder.to(device, dtype=weight_dtype)
855
- self.vae.to(device, dtype=weight_dtype)
856
- self.ref_unet.to(device, dtype=weight_dtype)
857
- self.unet.to(device, dtype=weight_dtype)
858
-
859
- self.vae.requires_grad_(False)
860
- self.ref_unet.requires_grad_(False)
861
- self.unet.requires_grad_(False)
862
-
863
- self.noise_scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler-zerosnr")
864
- self.validation_pipeline = CanonicalizationPipeline(
865
- vae=self.vae, text_encoder=self.text_encoder, tokenizer=self.tokenizer, unet=self.unet, ref_unet=self.ref_unet,feature_extractor=self.feature_extractor,image_encoder=self.image_encoder,
866
- scheduler=self.noise_scheduler
867
- )
868
- self.validation_pipeline.set_progress_bar_config(disable=True)
869
 
870
- def canonicalize(self, image, seed):
871
- return inference(
872
- self.validation_pipeline, image, self.vae, self.feature_extractor, self.image_encoder, self.unet, self.ref_unet, self.tokenizer, self.text_encoder,
873
- self.pretrained_model_path, self.validation, self.width_input, self.height_input, self.unet_condition_type,
874
- use_noise=self.use_noise, noise_d=self.noise_d, crop=True, seed=seed, timestep=self.timestep
875
- )
876
 
877
- def gen(self, img_input, seed=0):
878
- if np.array(img_input).shape[-1] == 4 and np.array(img_input)[..., 3].min() == 255:
879
- # convert to RGB
880
- img_input = img_input.convert("RGB")
881
- img_output = self.canonicalize(img_input, seed)
882
-
883
- max_dim = max(img_output.width, img_output.height)
884
- new_image = Image.new("RGBA", (max_dim, max_dim))
885
- left = (max_dim - img_output.width) // 2
886
- top = (max_dim - img_output.height) // 2
887
- new_image.paste(img_output, (left, top))
888
-
889
- return new_image
 
367
  continue
368
  hf_hub_download(repo_id, file, local_dir="./ckpt")
369
 
370
+ # self.canonical_infer = InferCanonicalAPI(self.canonical_configs)
371
  # self.multiview_infer = InferMultiviewAPI(self.multiview_configs)
372
  # self.slrm_infer = InferSlrmAPI(self.slrm_configs)
373
  # self.refine_infer = InferRefineAPI(self.refine_configs)
374
 
375
  def genStage1(self, img, seed):
376
+ return infer_canonicalize_gen(img, seed)
377
 
378
  def genStage2(self, img, seed, num_levels):
379
  return self.multiview_infer.gen(img, seed, num_levels)
 
811
  return results
812
 
813
 
814
+ infer_canonicalize_config = {
815
+ 'config_path': './configs/canonicalization-infer.yaml',
816
+ }
817
+ infer_canonicalize_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
818
+ # print device stderr
819
+ import sys
820
+ print(f"Using device!!!!!!!!!!!!: {infer_canonicalize_device}", file=sys.stderr)
821
+
822
+ infer_canonicalize_config_path = infer_canonicalize_config['config_path']
823
+ infer_canonicalize_loaded_config = OmegaConf.load(infer_canonicalize_config_path)
824
+
825
+ # infer_canonicalize_setup(**infer_canonicalize_loaded_config)
826
+
827
+ def infer_canonicalize_setup(
828
+ validation: Dict,
829
+ pretrained_model_path: str,
830
+ local_crossattn: bool = True,
831
+ unet_from_pretrained_kwargs=None,
832
+ unet_condition_type=None,
833
+ use_noise=True,
834
+ noise_d=256,
835
+ timestep: int = 40,
836
+ width_input: int = 640,
837
+ height_input: int = 1024,
838
  ):
839
+ infer_canonicalize_width_input = width_input
840
+ infer_canonicalize_height_input = height_input
841
+ infer_canonicalize_timestep = timestep
842
+ infer_canonicalize_use_noise = use_noise
843
+ infer_canonicalize_noise_d = noise_d
844
+ infer_canonicalize_validation = validation
845
+ infer_canonicalize_unet_condition_type = unet_condition_type
846
+ infer_canonicalize_pretrained_model_path = pretrained_model_path
847
+ infer_canonicalize_local_crossattn = local_crossattn
848
+ infer_canonicalize_unet_from_pretrained_kwargs = unet_from_pretrained_kwargs
849
+ return infer_canonicalize_width_input, infer_canonicalize_height_input, infer_canonicalize_timestep, infer_canonicalize_use_noise, infer_canonicalize_noise_d, infer_canonicalize_validation, infer_canonicalize_unet_condition_type, infer_canonicalize_pretrained_model_path, infer_canonicalize_local_crossattn, infer_canonicalize_unet_from_pretrained_kwargs
850
+
851
+ infer_canonicalize_width_input, infer_canonicalize_height_input, infer_canonicalize_timestep, infer_canonicalize_use_noise, infer_canonicalize_noise_d, infer_canonicalize_validation, infer_canonicalize_unet_condition_type, infer_canonicalize_pretrained_model_path, infer_canonicalize_local_crossattn, infer_canonicalize_unet_from_pretrained_kwargs = infer_canonicalize_setup(**infer_canonicalize_loaded_config)
852
+
853
+ infer_canonicalize_tokenizer = CLIPTokenizer.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="tokenizer")
854
+ infer_canonicalize_text_encoder = CLIPTextModel.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="text_encoder")
855
+ infer_canonicalize_image_encoder = CLIPVisionModelWithProjection.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="image_encoder")
856
+ infer_canonicalize_feature_extractor = CLIPImageProcessor()
857
+ infer_canonicalize_vae = AutoencoderKL.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="vae")
858
+ infer_canonicalize_unet = UNetMV2DConditionModel.from_pretrained_2d(infer_canonicalize_pretrained_model_path, subfolder="unet", local_crossattn=infer_canonicalize_local_crossattn, **infer_canonicalize_unet_from_pretrained_kwargs)
859
+ infer_canonicalize_ref_unet = UNetMV2DRefModel.from_pretrained_2d(infer_canonicalize_pretrained_model_path, subfolder="ref_unet", local_crossattn=infer_canonicalize_local_crossattn, **infer_canonicalize_unet_from_pretrained_kwargs)
860
+
861
+ infer_canonicalize_text_encoder.to(device, dtype=weight_dtype)
862
+ infer_canonicalize_image_encoder.to(device, dtype=weight_dtype)
863
+ infer_canonicalize_vae.to(device, dtype=weight_dtype)
864
+ infer_canonicalize_ref_unet.to(device, dtype=weight_dtype)
865
+ infer_canonicalize_unet.to(device, dtype=weight_dtype)
866
+
867
+ infer_canonicalize_vae.requires_grad_(False)
868
+ infer_canonicalize_ref_unet.requires_grad_(False)
869
+ infer_canonicalize_unet.requires_grad_(False)
870
+
871
+ infer_canonicalize_noise_scheduler = DDIMScheduler.from_pretrained(infer_canonicalize_pretrained_model_path, subfolder="scheduler-zerosnr")
872
+ infer_canonicalize_validation_pipeline = CanonicalizationPipeline(
873
+ vae=infer_canonicalize_vae, text_encoder=infer_canonicalize_text_encoder, tokenizer=infer_canonicalize_tokenizer, unet=infer_canonicalize_unet, ref_unet=infer_canonicalize_ref_unet,feature_extractor=infer_canonicalize_feature_extractor,image_encoder=infer_canonicalize_image_encoder,
874
+ scheduler=infer_canonicalize_noise_scheduler
875
+ )
876
+ infer_canonicalize_validation_pipeline.set_progress_bar_config(disable=True)
877
 
878
 
879
+ def infer_canonicalize_gen(img_input, seed=0):
880
+ if np.array(img_input).shape[-1] == 4 and np.array(img_input)[..., 3].min() == 255:
881
+ # convert to RGB
882
+ img_input = img_input.convert("RGB")
883
+ img_output = inference(
884
+ infer_canonicalize_validation_pipeline, img_input, infer_canonicalize_vae, infer_canonicalize_feature_extractor, infer_canonicalize_image_encoder, infer_canonicalize_unet, infer_canonicalize_ref_unet, infer_canonicalize_tokenizer, infer_canonicalize_text_encoder,
885
+ infer_canonicalize_pretrained_model_path, infer_canonicalize_validation, infer_canonicalize_width_input, infer_canonicalize_height_input, infer_canonicalize_unet_condition_type,
886
+ use_noise=infer_canonicalize_use_noise, noise_d=infer_canonicalize_noise_d, crop=True, seed=seed, timestep=infer_canonicalize_timestep
887
+ )
888
+
889
+ max_dim = max(img_output.width, img_output.height)
890
+ new_image = Image.new("RGBA", (max_dim, max_dim))
891
+ left = (max_dim - img_output.width) // 2
892
+ top = (max_dim - img_output.height) // 2
893
+ new_image.paste(img_output, (left, top))
894
+
895
+ return new_image
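
After this change the canonicalization weights are loaded once at module import (the infer_canonicalize_* globals above) instead of inside an InferCanonicalAPI instance, and InferAPI.genStage1 simply forwards to the free function infer_canonicalize_gen. A minimal sketch of the new call path, assuming the checkpoints under ./ckpt are in place; the config dicts passed to InferAPI are placeholders, since the canonicalization config path is now hard-coded at module level:

# Minimal usage sketch (placeholder configs). Importing infer_api now builds the
# canonicalization pipeline at import time; InferAPI.__init__ still downloads the
# StdGEN checkpoints from the Hub if they are missing.
from PIL import Image
from infer_api import InferAPI

api = InferAPI(canonical_configs={'config_path': './configs/canonicalization-infer.yaml'},
               multiview_configs={}, slrm_configs={}, refine_configs={})
img = Image.open("input.png")
canonical = api.genStage1(img, seed=42)   # forwards to infer_canonicalize_gen(img, 42)
canonical.save("canonical.png")

Note that in this revision only stage 1 is wired up: the multiview, SLRM and refine objects are commented out in __init__, so genStage2 through genStage4 would raise AttributeError until those lines are restored.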
infer_api_bk.py ADDED
@@ -0,0 +1,889 @@
1
+ import spaces
2
+ from PIL import Image
3
+
4
+ import io
5
+ import argparse
6
+ import os
7
+ import random
8
+ import tempfile
9
+ from typing import Dict, Optional, Tuple
10
+ from omegaconf import OmegaConf
11
+ import numpy as np
12
+
13
+ import torch
14
+
15
+ from diffusers import AutoencoderKL, DDIMScheduler
16
+ from diffusers.utils import check_min_version
17
+ from tqdm.auto import tqdm
18
+ from transformers import CLIPTextModel, CLIPTokenizer, CLIPImageProcessor, CLIPVisionModelWithProjection
19
+ from torchvision import transforms
20
+
21
+ from canonicalize.models.unet_mv2d_condition import UNetMV2DConditionModel
22
+ from canonicalize.models.unet_mv2d_ref import UNetMV2DRefModel
23
+ from canonicalize.pipeline_canonicalize import CanonicalizationPipeline
24
+ from einops import rearrange
25
+ from torchvision.utils import save_image
26
+ import json
27
+ import cv2
28
+
29
+ import onnxruntime as rt
30
+ from huggingface_hub.file_download import hf_hub_download
31
+ from huggingface_hub import list_repo_files
32
+ from rm_anime_bg.cli import get_mask, SCALE
33
+
34
+ import argparse
35
+ import os
36
+ import cv2
37
+ import glob
38
+ import numpy as np
39
+ import matplotlib.pyplot as plt
40
+ from typing import Dict, Optional, List
41
+ from omegaconf import OmegaConf, DictConfig
42
+ from PIL import Image
43
+ from pathlib import Path
44
+ from dataclasses import dataclass
45
+ from typing import Dict
46
+ import torch
47
+ import torch.nn.functional as F
48
+ import torch.utils.checkpoint
49
+ import torchvision.transforms.functional as TF
50
+ from torch.utils.data import Dataset, DataLoader
51
+ from torchvision import transforms
52
+ from torchvision.utils import make_grid, save_image
53
+ from accelerate.utils import set_seed
54
+ from tqdm.auto import tqdm
55
+ from einops import rearrange, repeat
56
+ from multiview.pipeline_multiclass import StableUnCLIPImg2ImgPipeline
57
+
58
+ import os
59
+ import imageio
60
+ import numpy as np
61
+ import torch
62
+ import cv2
63
+ import glob
64
+ import matplotlib.pyplot as plt
65
+ from PIL import Image
66
+ from torchvision.transforms import v2
67
+ from pytorch_lightning import seed_everything
68
+ from omegaconf import OmegaConf
69
+ from tqdm import tqdm
70
+
71
+ from slrm.utils.train_util import instantiate_from_config
72
+ from slrm.utils.camera_util import (
73
+ FOV_to_intrinsics,
74
+ get_circular_camera_poses,
75
+ )
76
+ from slrm.utils.mesh_util import save_obj, save_glb
77
+ from slrm.utils.infer_util import images_to_video
78
+
79
+ import cv2
80
+ import numpy as np
81
+ import os
82
+ import trimesh
83
+ import argparse
84
+ import torch
85
+ import scipy
86
+ from PIL import Image
87
+
88
+ from refine.mesh_refine import geo_refine
89
+ from refine.func import make_star_cameras_orthographic
90
+ from refine.render import NormalsRenderer, calc_vertex_normals
91
+
92
+ import pytorch3d
93
+ from pytorch3d.structures import Meshes
94
+ from sklearn.neighbors import KDTree
95
+
96
+ from segment_anything import SamAutomaticMaskGenerator, sam_model_registry
97
+
98
+ check_min_version("0.24.0")
99
+ weight_dtype = torch.float16
100
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
101
+ VIEWS = ['front', 'front_right', 'right', 'back', 'left', 'front_left']
102
+
103
+
104
+ @spaces.GPU
105
+ def set_seed(seed):
106
+ random.seed(seed)
107
+ np.random.seed(seed)
108
+ torch.manual_seed(seed)
109
+ torch.cuda.manual_seed_all(seed)
110
+
111
+
112
+ session_infer_path = hf_hub_download(
113
+ repo_id="skytnt/anime-seg", filename="isnetis.onnx",
114
+ )
115
+ providers: list[str] = ["CPUExecutionProvider"]
116
+ if "CUDAExecutionProvider" in rt.get_available_providers():
117
+ providers = ["CUDAExecutionProvider"]
118
+
119
+ bkg_remover_session_infer = rt.InferenceSession(
120
+ session_infer_path, providers=providers,
121
+ )
122
+
123
+ @spaces.GPU
124
+ def remove_background(
125
+ img: np.ndarray,
126
+ alpha_min: float,
127
+ alpha_max: float,
128
+ ) -> list:
129
+ img = np.array(img)
130
+ mask = get_mask(bkg_remover_session_infer, img)
131
+ mask[mask < alpha_min] = 0.0
132
+ mask[mask > alpha_max] = 1.0
133
+ img_after = (mask * img).astype(np.uint8)
134
+ mask = (mask * SCALE).astype(np.uint8)
135
+ img_after = np.concatenate([img_after, mask], axis=2, dtype=np.uint8)
136
+ return Image.fromarray(img_after)
137
+
138
+
139
+ def process_image(image, totensor, width, height):
140
+ assert image.mode == "RGBA"
141
+
142
+ # Find non-transparent pixels
143
+ non_transparent = np.nonzero(np.array(image)[..., 3])
144
+ min_x, max_x = non_transparent[1].min(), non_transparent[1].max()
145
+ min_y, max_y = non_transparent[0].min(), non_transparent[0].max()
146
+ image = image.crop((min_x, min_y, max_x, max_y))
147
+
148
+ # paste to center
149
+ max_dim = max(image.width, image.height)
150
+ max_height = int(max_dim * 1.2)
151
+ max_width = int(max_dim / (height/width) * 1.2)
152
+ new_image = Image.new("RGBA", (max_width, max_height))
153
+ left = (max_width - image.width) // 2
154
+ top = (max_height - image.height) // 2
155
+ new_image.paste(image, (left, top))
156
+
157
+ image = new_image.resize((width, height), resample=Image.BICUBIC)
158
+ image = np.array(image)
159
+ image = image.astype(np.float32) / 255.
160
+ assert image.shape[-1] == 4 # RGBA
161
+ alpha = image[..., 3:4]
162
+ bg_color = np.array([1., 1., 1.], dtype=np.float32)
163
+ image = image[..., :3] * alpha + bg_color * (1 - alpha)
164
+ return totensor(image)
165
+
166
+
167
+ @spaces.GPU
168
+ @torch.no_grad()
169
+ def inference(validation_pipeline, input_image, vae, feature_extractor, image_encoder, unet, ref_unet, tokenizer,
170
+ text_encoder, pretrained_model_path, validation, val_width, val_height, unet_condition_type,
171
+ use_noise=True, noise_d=256, crop=False, seed=100, timestep=20):
172
+ set_seed(seed)
173
+ generator = torch.Generator(device=device).manual_seed(seed)
174
+
175
+ totensor = transforms.ToTensor()
176
+
177
+ prompts = "high quality, best quality"
178
+ prompt_ids = tokenizer(
179
+ prompts, max_length=tokenizer.model_max_length, padding="max_length", truncation=True,
180
+ return_tensors="pt"
181
+ ).input_ids[0]
182
+
183
+ # (B*Nv, 3, H, W)
184
+ B = 1
185
+ if input_image.mode != "RGBA":
186
+ # remove background
187
+ input_image = remove_background(input_image, 0.1, 0.9)
188
+ imgs_in = process_image(input_image, totensor, val_width, val_height)
189
+ imgs_in = rearrange(imgs_in.unsqueeze(0).unsqueeze(0), "B Nv C H W -> (B Nv) C H W")
190
+
191
+ with torch.autocast('cuda' if torch.cuda.is_available() else 'cpu', dtype=weight_dtype):
192
+ imgs_in = imgs_in.to(device=device)
193
+ # B*Nv images
194
+ out = validation_pipeline(prompt=prompts, image=imgs_in.to(weight_dtype), generator=generator,
195
+ num_inference_steps=timestep, prompt_ids=prompt_ids,
196
+ height=val_height, width=val_width, unet_condition_type=unet_condition_type,
197
+ use_noise=use_noise, **validation,)
198
+ out = rearrange(out, "B C f H W -> (B f) C H W", f=1)
199
+
200
+ print("OUT!!!!!!")
201
+
202
+ img_buf = io.BytesIO()
203
+ save_image(out[0], img_buf, format='PNG')
204
+ img_buf.seek(0)
205
+ img = Image.open(img_buf)
206
+
207
+ print("OUT2!!!!!!")
208
+
209
+ torch.cuda.empty_cache()
210
+ return img
211
+
212
+
213
+ ######### Multi View Part #############
214
+ weight_dtype = torch.float16
215
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
216
+
217
+ def tensor_to_numpy(tensor):
218
+ return tensor.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
219
+
220
+
221
+ @dataclass
222
+ class TestConfig:
223
+ pretrained_model_name_or_path: str
224
+ pretrained_unet_path:Optional[str]
225
+ revision: Optional[str]
226
+ validation_dataset: Dict
227
+ save_dir: str
228
+ seed: Optional[int]
229
+ validation_batch_size: int
230
+ dataloader_num_workers: int
231
+ save_mode: str
232
+ local_rank: int
233
+
234
+ pipe_kwargs: Dict
235
+ pipe_validation_kwargs: Dict
236
+ unet_from_pretrained_kwargs: Dict
237
+ validation_grid_nrow: int
238
+ camera_embedding_lr_mult: float
239
+
240
+ num_views: int
241
+ camera_embedding_type: str
242
+
243
+ pred_type: str
244
+ regress_elevation: bool
245
+ enable_xformers_memory_efficient_attention: bool
246
+
247
+ cond_on_normals: bool
248
+ cond_on_colors: bool
249
+
250
+ regress_elevation: bool
251
+ regress_focal_length: bool
252
+
253
+
254
+
255
+ def convert_to_numpy(tensor):
256
+ return tensor.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
257
+
258
+ def save_image(tensor):
259
+ ndarr = convert_to_numpy(tensor)
260
+ return save_image_numpy(ndarr)
261
+
262
+ def save_image_numpy(ndarr):
263
+ im = Image.fromarray(ndarr)
264
+ # pad to square
265
+ if im.size[0] != im.size[1]:
266
+ size = max(im.size)
267
+ new_im = Image.new("RGB", (size, size))
268
+ # set to white
269
+ new_im.paste((255, 255, 255), (0, 0, size, size))
270
+ new_im.paste(im, ((size - im.size[0]) // 2, (size - im.size[1]) // 2))
271
+ im = new_im
272
+ # resize to 1024x1024
273
+ im = im.resize((1024, 1024), Image.LANCZOS)
274
+ return im
275
+
276
+ @spaces.GPU
277
+ def run_multiview_infer(data, pipeline, cfg: TestConfig, num_levels=3):
278
+ if cfg.seed is None:
279
+ generator = None
280
+ else:
281
+ generator = torch.Generator(device=pipeline.unet.device).manual_seed(cfg.seed)
282
+
283
+ images_cond = []
284
+ results = {}
285
+
286
+ torch.cuda.empty_cache()
287
+ images_cond.append(data['image_cond_rgb'][:, 0].cuda())
288
+ imgs_in = torch.cat([data['image_cond_rgb']]*2, dim=0).cuda()
289
+ num_views = imgs_in.shape[1]
290
+ imgs_in = rearrange(imgs_in, "B Nv C H W -> (B Nv) C H W")# (B*Nv, 3, H, W)
291
+
292
+ target_h, target_w = imgs_in.shape[-2], imgs_in.shape[-1]
293
+
294
+ normal_prompt_embeddings, clr_prompt_embeddings = data['normal_prompt_embeddings'].cuda(), data['color_prompt_embeddings'].cuda()
295
+ prompt_embeddings = torch.cat([normal_prompt_embeddings, clr_prompt_embeddings], dim=0)
296
+ prompt_embeddings = rearrange(prompt_embeddings, "B Nv N C -> (B Nv) N C")
297
+
298
+ # B*Nv images
299
+ unet_out = pipeline(
300
+ imgs_in, None, prompt_embeds=prompt_embeddings,
301
+ generator=generator, guidance_scale=3.0, output_type='pt', num_images_per_prompt=1,
302
+ height=cfg.height, width=cfg.width,
303
+ num_inference_steps=40, eta=1.0,
304
+ num_levels=num_levels,
305
+ )
306
+
307
+ for level in range(num_levels):
308
+ out = unet_out[level].images
309
+ bsz = out.shape[0] // 2
310
+
311
+ normals_pred = out[:bsz]
312
+ images_pred = out[bsz:]
313
+
314
+ if num_levels == 2:
315
+ results[level+1] = {'normals': [], 'images': []}
316
+ else:
317
+ results[level] = {'normals': [], 'images': []}
318
+
319
+ for i in range(bsz//num_views):
320
+ img_in_ = images_cond[-1][i].to(out.device)
321
+ for j in range(num_views):
322
+ view = VIEWS[j]
323
+ idx = i*num_views + j
324
+ normal = normals_pred[idx]
325
+ color = images_pred[idx]
326
+
327
+ ## save color and normal---------------------
328
+ new_normal = save_image(normal)
329
+ new_color = save_image(color)
330
+
331
+ if num_levels == 2:
332
+ results[level+1]['normals'].append(new_normal)
333
+ results[level+1]['images'].append(new_color)
334
+ else:
335
+ results[level]['normals'].append(new_normal)
336
+ results[level]['images'].append(new_color)
337
+
338
+ torch.cuda.empty_cache()
339
+ return results
340
+
341
+ @spaces.GPU
342
+ def load_multiview_pipeline(cfg):
343
+ pipeline = StableUnCLIPImg2ImgPipeline.from_pretrained(
344
+ cfg.pretrained_path,
345
+ torch_dtype=torch.float16,)
346
+ pipeline.unet.enable_xformers_memory_efficient_attention()
347
+ if torch.cuda.is_available():
348
+ pipeline.to(device)
349
+ return pipeline
350
+
351
+
352
+ class InferAPI:
353
+ def __init__(self,
354
+ canonical_configs,
355
+ multiview_configs,
356
+ slrm_configs,
357
+ refine_configs):
358
+ self.canonical_configs = canonical_configs
359
+ self.multiview_configs = multiview_configs
360
+ self.slrm_configs = slrm_configs
361
+ self.refine_configs = refine_configs
362
+
363
+ repo_id = "hyz317/StdGEN"
364
+ all_files = list_repo_files(repo_id, revision="main")
365
+ for file in all_files:
366
+ if os.path.exists(file):
367
+ continue
368
+ hf_hub_download(repo_id, file, local_dir="./ckpt")
369
+
370
+ self.canonical_infer = InferCanonicalAPI(self.canonical_configs)
371
+ # self.multiview_infer = InferMultiviewAPI(self.multiview_configs)
372
+ # self.slrm_infer = InferSlrmAPI(self.slrm_configs)
373
+ # self.refine_infer = InferRefineAPI(self.refine_configs)
374
+
375
+ def genStage1(self, img, seed):
376
+ return self.canonical_infer.gen(img, seed)
377
+
378
+ def genStage2(self, img, seed, num_levels):
379
+ return self.multiview_infer.gen(img, seed, num_levels)
380
+
381
+ def genStage3(self, img):
382
+ return self.slrm_infer.gen(img)
383
+
384
+ def genStage4(self, meshes, imgs):
385
+ return self.refine_infer.refine(meshes, imgs)
386
+
387
+
388
+ ############## Refine ##############
389
+ def fix_vert_color_glb(mesh_path):
390
+ from pygltflib import GLTF2, Material, PbrMetallicRoughness
391
+ obj1 = GLTF2().load(mesh_path)
392
+ obj1.meshes[0].primitives[0].material = 0
393
+ obj1.materials.append(Material(
394
+ pbrMetallicRoughness = PbrMetallicRoughness(
395
+ baseColorFactor = [1.0, 1.0, 1.0, 1.0],
396
+ metallicFactor = 0.,
397
+ roughnessFactor = 1.0,
398
+ ),
399
+ emissiveFactor = [0.0, 0.0, 0.0],
400
+ doubleSided = True,
401
+ ))
402
+ obj1.save(mesh_path)
403
+
404
+
405
+ def srgb_to_linear(c_srgb):
406
+ c_linear = np.where(c_srgb <= 0.04045, c_srgb / 12.92, ((c_srgb + 0.055) / 1.055) ** 2.4)
407
+ return c_linear.clip(0, 1.)
408
+
409
+
410
+ def save_py3dmesh_with_trimesh_fast(meshes: Meshes, save_glb_path, apply_sRGB_to_LinearRGB=True):
411
+ # convert from pytorch3d meshes to trimesh mesh
412
+ vertices = meshes.verts_packed().cpu().float().numpy()
413
+ triangles = meshes.faces_packed().cpu().long().numpy()
414
+ np_color = meshes.textures.verts_features_packed().cpu().float().numpy()
415
+ if save_glb_path.endswith(".glb"):
416
+ # rotate 180 along +Y
417
+ vertices[:, [0, 2]] = -vertices[:, [0, 2]]
418
+
419
+ if apply_sRGB_to_LinearRGB:
420
+ np_color = srgb_to_linear(np_color)
421
+ assert vertices.shape[0] == np_color.shape[0]
422
+ assert np_color.shape[1] == 3
423
+ assert 0 <= np_color.min() and np_color.max() <= 1.001, f"min={np_color.min()}, max={np_color.max()}"
424
+ np_color = np.clip(np_color, 0, 1)
425
+ mesh = trimesh.Trimesh(vertices=vertices, faces=triangles, vertex_colors=np_color)
426
+ mesh.remove_unreferenced_vertices()
427
+ # save mesh
428
+ mesh.export(save_glb_path)
429
+ if save_glb_path.endswith(".glb"):
430
+ fix_vert_color_glb(save_glb_path)
431
+ print(f"saving to {save_glb_path}")
432
+
433
+
434
+ def calc_horizontal_offset(target_img, source_img):
435
+ target_mask = target_img.astype(np.float32).sum(axis=-1) > 750
436
+ source_mask = source_img.astype(np.float32).sum(axis=-1) > 750
437
+ best_offset = -114514
438
+ for offset in range(-200, 200):
439
+ offset_mask = np.roll(source_mask, offset, axis=1)
440
+ overlap = (target_mask & offset_mask).sum()
441
+ if overlap > best_offset:
442
+ best_offset = overlap
443
+ best_offset_value = offset
444
+ return best_offset_value
445
+
446
+
447
+ def calc_horizontal_offset2(target_mask, source_img):
448
+ source_mask = source_img.astype(np.float32).sum(axis=-1) > 750
449
+ best_offset = -114514
450
+ for offset in range(-200, 200):
451
+ offset_mask = np.roll(source_mask, offset, axis=1)
452
+ overlap = (target_mask & offset_mask).sum()
453
+ if overlap > best_offset:
454
+ best_offset = overlap
455
+ best_offset_value = offset
456
+ return best_offset_value
457
+
458
+
459
+ @spaces.GPU
460
+ def get_distract_mask(generator, color_0, color_1, normal_0=None, normal_1=None, thres=0.25, ratio=0.50, outside_thres=0.10, outside_ratio=0.20):
461
+ distract_area = np.abs(color_0 - color_1).sum(axis=-1) > thres
462
+ if normal_0 is not None and normal_1 is not None:
463
+ distract_area |= np.abs(normal_0 - normal_1).sum(axis=-1) > thres
464
+ labeled_array, num_features = scipy.ndimage.label(distract_area)
465
+ results = []
466
+
467
+ random_sampled_points = []
468
+
469
+ for i in range(num_features + 1):
470
+ if np.sum(labeled_array == i) > 1000 and np.sum(labeled_array == i) < 100000:
471
+ results.append((i, np.sum(labeled_array == i)))
472
+ # random sample a point in the area
473
+ points = np.argwhere(labeled_array == i)
474
+ random_sampled_points.append(points[np.random.randint(0, points.shape[0])])
475
+
476
+ results = sorted(results, key=lambda x: x[1], reverse=True) # [1:]
477
+ distract_mask = np.zeros_like(distract_area)
478
+ distract_bbox = np.zeros_like(distract_area)
479
+ for i, _ in results:
480
+ distract_mask |= labeled_array == i
481
+ bbox = np.argwhere(labeled_array == i)
482
+ min_x, min_y = bbox.min(axis=0)
483
+ max_x, max_y = bbox.max(axis=0)
484
+ distract_bbox[min_x:max_x, min_y:max_y] = 1
485
+
486
+ points = np.array(random_sampled_points)[:, ::-1]
487
+ labels = np.ones(len(points), dtype=np.int32)
488
+
489
+ masks = generator.generate((color_1 * 255).astype(np.uint8))
490
+
491
+ outside_area = np.abs(color_0 - color_1).sum(axis=-1) < outside_thres
492
+
493
+ final_mask = np.zeros_like(distract_mask)
494
+ for iii, mask in enumerate(masks):
495
+ mask['segmentation'] = cv2.resize(mask['segmentation'].astype(np.float32), (1024, 1024)) > 0.5
496
+ intersection = np.logical_and(mask['segmentation'], distract_mask).sum()
497
+ total = mask['segmentation'].sum()
498
+ iou = intersection / total
499
+ outside_intersection = np.logical_and(mask['segmentation'], outside_area).sum()
500
+ outside_total = mask['segmentation'].sum()
501
+ outside_iou = outside_intersection / outside_total
502
+ if iou > ratio and outside_iou < outside_ratio:
503
+ final_mask |= mask['segmentation']
504
+
505
+ # calculate coverage
506
+ intersection = np.logical_and(final_mask, distract_mask).sum()
507
+ total = distract_mask.sum()
508
+ coverage = intersection / total
509
+
510
+ if coverage < 0.8:
511
+ # use original distract mask
512
+ final_mask = (distract_mask.copy() * 255).astype(np.uint8)
513
+ final_mask = cv2.dilate(final_mask, np.ones((3, 3), np.uint8), iterations=3)
514
+ labeled_array_dilate, num_features_dilate = scipy.ndimage.label(final_mask)
515
+ for i in range(num_features_dilate + 1):
516
+ if np.sum(labeled_array_dilate == i) < 200:
517
+ final_mask[labeled_array_dilate == i] = 255
518
+
519
+ final_mask = cv2.erode(final_mask, np.ones((3, 3), np.uint8), iterations=3)
520
+ final_mask = final_mask > 127
521
+
522
+ return distract_mask, distract_bbox, random_sampled_points, final_mask
523
+
524
+
525
+ class InferRefineAPI:
526
+ @spaces.GPU
527
+ def __init__(self, config):
528
+ self.sam = sam_model_registry["vit_h"](checkpoint="./ckpt/sam_vit_h_4b8939.pth").cuda()
529
+ self.generator = SamAutomaticMaskGenerator(
530
+ model=self.sam,
531
+ points_per_side=64,
532
+ pred_iou_thresh=0.80,
533
+ stability_score_thresh=0.92,
534
+ crop_n_layers=1,
535
+ crop_n_points_downscale_factor=2,
536
+ min_mask_region_area=100,
537
+ )
538
+ self.outside_ratio = 0.20
539
+
540
+ @spaces.GPU
541
+ def refine(self, meshes, imgs):
542
+ fixed_v, fixed_f, fixed_t = None, None, None
543
+ flow_vert, flow_vector = None, None
544
+ last_colors, last_normals = None, None
545
+ last_front_color, last_front_normal = None, None
546
+ distract_mask = None
547
+
548
+ mv, proj = make_star_cameras_orthographic(8, 1, r=1.2)
549
+ mv = mv[[4, 3, 2, 0, 6, 5]]
550
+ renderer = NormalsRenderer(mv,proj,(1024,1024))
551
+
552
+ results = []
553
+
554
+ for name_idx, level in zip([2, 0, 1], [2, 1, 0]):
555
+ mesh = trimesh.load(meshes[name_idx])
556
+ new_mesh = mesh.split(only_watertight=False)
557
+ new_mesh = [ j for j in new_mesh if len(j.vertices) >= 300 ]
558
+ mesh = trimesh.Scene(new_mesh).dump(concatenate=True)
559
+ mesh_v, mesh_f = mesh.vertices, mesh.faces
560
+
561
+ if last_colors is None:
562
+ images = renderer.render(
563
+ torch.tensor(mesh_v, device='cuda').float(),
564
+ torch.ones_like(torch.from_numpy(mesh_v), device='cuda').float(),
565
+ torch.tensor(mesh_f, device='cuda'),
566
+ )
567
+ mask = (images[..., 3] < 0.9).cpu().numpy()
568
+
569
+ colors, normals = [], []
570
+ for i in range(6):
571
+ color = np.array(imgs[level]['images'][i])
572
+ normal = np.array(imgs[level]['normals'][i])
573
+
574
+ if last_colors is not None:
575
+ offset = calc_horizontal_offset(np.array(last_colors[i]), color)
576
+ # print('offset', i, offset)
577
+ else:
578
+ offset = calc_horizontal_offset2(mask[i], color)
579
+ # print('init offset', i, offset)
580
+
581
+ if offset != 0:
582
+ color = np.roll(color, offset, axis=1)
583
+ normal = np.roll(normal, offset, axis=1)
584
+
585
+ color = Image.fromarray(color)
586
+ normal = Image.fromarray(normal)
587
+ colors.append(color)
588
+ normals.append(normal)
589
+
590
+ if last_front_color is not None and level == 0:
591
+ original_mask, distract_bbox, _, distract_mask = get_distract_mask(self.generator, last_front_color, np.array(colors[0]).astype(np.float32) / 255.0, outside_ratio=self.outside_ratio)
592
+ else:
593
+ distract_mask = None
594
+ distract_bbox = None
595
+
596
+ last_front_color = np.array(colors[0]).astype(np.float32) / 255.0
597
+ last_front_normal = np.array(normals[0]).astype(np.float32) / 255.0
598
+
599
+ if last_colors is None:
600
+ from copy import deepcopy
601
+ last_colors, last_normals = deepcopy(colors), deepcopy(normals)
602
+
603
+ # my mesh flow weight by nearest vertexs
604
+ if fixed_v is not None and fixed_f is not None and level == 1:
605
+ t = trimesh.Trimesh(vertices=mesh_v, faces=mesh_f)
606
+
607
+ fixed_v_cpu = fixed_v.cpu().numpy()
608
+ kdtree_anchor = KDTree(fixed_v_cpu)
609
+ kdtree_mesh_v = KDTree(mesh_v)
610
+ _, idx_anchor = kdtree_anchor.query(mesh_v, k=1)
611
+ _, idx_mesh_v = kdtree_mesh_v.query(mesh_v, k=25)
612
+ idx_anchor = idx_anchor.squeeze()
613
+ neighbors = torch.tensor(mesh_v).cuda()[idx_mesh_v] # V, 25, 3
614
+ # calculate the distances neighbors [V, 25, 3]; mesh_v [V, 3] -> [V, 25]
615
+ neighbor_dists = torch.norm(neighbors - torch.tensor(mesh_v).cuda()[:, None], dim=-1)
616
+ neighbor_dists[neighbor_dists > 0.06] = 114514.
617
+ neighbor_weights = torch.exp(-neighbor_dists * 1.)
618
+ neighbor_weights = neighbor_weights / neighbor_weights.sum(dim=1, keepdim=True)
619
+ anchors = fixed_v[idx_anchor] # V, 3
620
+ anchor_normals = calc_vertex_normals(fixed_v, fixed_f)[idx_anchor] # V, 3
621
+ dis_anchor = torch.clamp(((anchors - torch.tensor(mesh_v).cuda()) * anchor_normals).sum(-1), min=0) + 0.01
622
+ vec_anchor = dis_anchor[:, None] * anchor_normals # V, 3
623
+ vec_anchor = vec_anchor[idx_mesh_v] # V, 25, 3
624
+ weighted_vec_anchor = (vec_anchor * neighbor_weights[:, :, None]).sum(1) # V, 3
625
+ mesh_v += weighted_vec_anchor.cpu().numpy()
626
+
627
+ t = trimesh.Trimesh(vertices=mesh_v, faces=mesh_f)
628
+
629
+ mesh_v = torch.tensor(mesh_v, device='cuda', dtype=torch.float32)
630
+ mesh_f = torch.tensor(mesh_f, device='cuda')
631
+
632
+ new_mesh, simp_v, simp_f = geo_refine(mesh_v, mesh_f, colors, normals, fixed_v=fixed_v, fixed_f=fixed_f, distract_mask=distract_mask, distract_bbox=distract_bbox)
633
+
634
+ # my mesh flow weight by nearest vertexs
635
+ try:
636
+ if fixed_v is not None and fixed_f is not None and level != 0:
637
+ new_mesh_v = new_mesh.verts_packed().cpu().numpy()
638
+
639
+ fixed_v_cpu = fixed_v.cpu().numpy()
640
+ kdtree_anchor = KDTree(fixed_v_cpu)
641
+ kdtree_mesh_v = KDTree(new_mesh_v)
642
+ _, idx_anchor = kdtree_anchor.query(new_mesh_v, k=1)
643
+ _, idx_mesh_v = kdtree_mesh_v.query(new_mesh_v, k=25)
644
+ idx_anchor = idx_anchor.squeeze()
645
+ neighbors = torch.tensor(new_mesh_v).cuda()[idx_mesh_v] # V, 25, 3
646
+ # calculate the distances neighbors [V, 25, 3]; new_mesh_v [V, 3] -> [V, 25]
647
+ neighbor_dists = torch.norm(neighbors - torch.tensor(new_mesh_v).cuda()[:, None], dim=-1)
648
+ neighbor_dists[neighbor_dists > 0.06] = 114514.
649
+ neighbor_weights = torch.exp(-neighbor_dists * 1.)
650
+ neighbor_weights = neighbor_weights / neighbor_weights.sum(dim=1, keepdim=True)
651
+ anchors = fixed_v[idx_anchor] # V, 3
652
+ anchor_normals = calc_vertex_normals(fixed_v, fixed_f)[idx_anchor] # V, 3
653
+ dis_anchor = torch.clamp(((anchors - torch.tensor(new_mesh_v).cuda()) * anchor_normals).sum(-1), min=0) + 0.01
654
+ vec_anchor = dis_anchor[:, None] * anchor_normals # V, 3
655
+ vec_anchor = vec_anchor[idx_mesh_v] # V, 25, 3
656
+ weighted_vec_anchor = (vec_anchor * neighbor_weights[:, :, None]).sum(1) # V, 3
657
+ new_mesh_v += weighted_vec_anchor.cpu().numpy()
658
+
659
+ # replace new_mesh verts with new_mesh_v
660
+ new_mesh = Meshes(verts=[torch.tensor(new_mesh_v, device='cuda')], faces=new_mesh.faces_list(), textures=new_mesh.textures)
661
+
662
+ except Exception as e:
663
+ pass
664
+
665
+ notsimp_v, notsimp_f, notsimp_t = new_mesh.verts_packed(), new_mesh.faces_packed(), new_mesh.textures.verts_features_packed()
666
+
667
+ if fixed_v is None:
668
+ fixed_v, fixed_f = simp_v, simp_f
669
+ complete_v, complete_f, complete_t = notsimp_v, notsimp_f, notsimp_t
670
+ else:
671
+ fixed_f = torch.cat([fixed_f, simp_f + fixed_v.shape[0]], dim=0)
672
+ fixed_v = torch.cat([fixed_v, simp_v], dim=0)
673
+
674
+ complete_f = torch.cat([complete_f, notsimp_f + complete_v.shape[0]], dim=0)
675
+ complete_v = torch.cat([complete_v, notsimp_v], dim=0)
676
+ complete_t = torch.cat([complete_t, notsimp_t], dim=0)
677
+
678
+ if level == 2:
679
+ new_mesh = Meshes(verts=[new_mesh.verts_packed()], faces=[new_mesh.faces_packed()], textures=pytorch3d.renderer.mesh.textures.TexturesVertex(verts_features=[torch.ones_like(new_mesh.textures.verts_features_packed(), device=new_mesh.verts_packed().device)*0.5]))
680
+
681
+ save_py3dmesh_with_trimesh_fast(new_mesh, meshes[name_idx].replace('.obj', '_refined.obj'), apply_sRGB_to_LinearRGB=False)
682
+ results.append(meshes[name_idx].replace('.obj', '_refined.obj'))
683
+
684
+ # save whole mesh
685
+ save_py3dmesh_with_trimesh_fast(Meshes(verts=[complete_v], faces=[complete_f], textures=pytorch3d.renderer.mesh.textures.TexturesVertex(verts_features=[complete_t])), meshes[name_idx].replace('.obj', '_refined_whole.obj'), apply_sRGB_to_LinearRGB=False)
686
+ results.append(meshes[name_idx].replace('.obj', '_refined_whole.obj'))
687
+
688
+ return results
689
+
690
+
691
+ class InferSlrmAPI:
692
+ @spaces.GPU
693
+ def __init__(self, config):
694
+ self.config_path = config['config_path']
695
+ self.config = OmegaConf.load(self.config_path)
696
+ self.config_name = os.path.basename(self.config_path).replace('.yaml', '')
697
+ self.model_config = self.config.model_config
698
+ self.infer_config = self.config.infer_config
699
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
700
+ self.model = instantiate_from_config(self.model_config)
701
+ state_dict = torch.load(self.infer_config.model_path, map_location='cpu')
702
+ self.model.load_state_dict(state_dict, strict=False)
703
+ self.model = self.model.to(self.device)
704
+ self.model.init_flexicubes_geometry(self.device, fovy=30.0, is_ortho=self.model.is_ortho)
705
+ self.model = self.model.eval()
706
+
707
+ @spaces.GPU
708
+ def gen(self, imgs):
709
+ imgs = [ cv2.imread(img[0])[:, :, ::-1] for img in imgs ]
710
+ imgs = np.stack(imgs, axis=0).astype(np.float32) / 255.0
711
+ imgs = torch.from_numpy(np.array(imgs)).permute(0, 3, 1, 2).contiguous().float() # (6, 3, 1024, 1024)
712
+ mesh_glb_fpaths = self.make3d(imgs)
713
+ return mesh_glb_fpaths[1:4] + mesh_glb_fpaths[0:1]
714
+
715
+ @spaces.GPU
716
+ def make3d(self, images):
717
+ input_cameras = torch.tensor(np.load('slrm/cameras.npy')).to(device)
718
+
719
+ images = images.unsqueeze(0).to(device)
720
+ images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)
721
+
722
+ mesh_fpath = tempfile.NamedTemporaryFile(suffix=f".obj", delete=False).name
723
+ print(mesh_fpath)
724
+ mesh_basename = os.path.basename(mesh_fpath).split('.')[0]
725
+ mesh_dirname = os.path.dirname(mesh_fpath)
726
+
727
+ with torch.no_grad():
728
+ # get triplane
729
+ planes = self.model.forward_planes(images, input_cameras.float())
730
+
731
+ # get mesh
732
+ mesh_glb_fpaths = []
733
+ for j in range(4):
734
+ mesh_glb_fpath = self.make_mesh(mesh_fpath.replace(mesh_fpath[-4:], f'_{j}{mesh_fpath[-4:]}'), planes, level=[0, 3, 4, 2][j])
735
+ mesh_glb_fpaths.append(mesh_glb_fpath)
736
+
737
+ return mesh_glb_fpaths
738
+
739
+ @spaces.GPU
740
+ def make_mesh(self, mesh_fpath, planes, level=None):
741
+ mesh_basename = os.path.basename(mesh_fpath).split('.')[0]
742
+ mesh_dirname = os.path.dirname(mesh_fpath)
743
+ mesh_glb_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.glb")
744
+
745
+ with torch.no_grad():
746
+ # get mesh
747
+ mesh_out = self.model.extract_mesh(
748
+ planes,
749
+ use_texture_map=False,
750
+ levels=torch.tensor([level]).to(device),
751
+ **self.infer_config,
752
+ )
753
+
754
+ vertices, faces, vertex_colors = mesh_out
755
+ vertices = vertices[:, [1, 2, 0]]
756
+
757
+ if level == 2:
758
+ # fill all vertex_colors with 127
759
+ vertex_colors = np.ones_like(vertex_colors) * 127
760
+
761
+ save_obj(vertices, faces, vertex_colors, mesh_fpath)
762
+
763
+ return mesh_fpath
764
+
765
+ class InferMultiviewAPI:
766
+ def __init__(self, config):
767
+ parser = argparse.ArgumentParser()
768
+ parser.add_argument("--seed", type=int, default=42)
769
+ parser.add_argument("--num_views", type=int, default=6)
770
+ parser.add_argument("--num_levels", type=int, default=3)
771
+ parser.add_argument("--pretrained_path", type=str, default='./ckpt/StdGEN-multiview-1024')
772
+ parser.add_argument("--height", type=int, default=1024)
773
+ parser.add_argument("--width", type=int, default=576)
774
+ self.cfg = parser.parse_args()
775
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
776
+ self.pipeline = load_multiview_pipeline(self.cfg)
777
+ self.results = {}
778
+ if torch.cuda.is_available():
779
+ self.pipeline.to(device)
780
+
781
+ self.image_transforms = [transforms.Resize(int(max(self.cfg.height, self.cfg.width))),
782
+ transforms.CenterCrop((self.cfg.height, self.cfg.width)),
783
+ transforms.ToTensor(),
784
+ transforms.Lambda(lambda x: x * 2. - 1),
785
+ ]
786
+ self.image_transforms = transforms.Compose(self.image_transforms)
787
+
788
+ prompt_embeds_path = './multiview/fixed_prompt_embeds_6view'
789
+ self.normal_text_embeds = torch.load(f'{prompt_embeds_path}/normal_embeds.pt')
790
+ self.color_text_embeds = torch.load(f'{prompt_embeds_path}/clr_embeds.pt')
791
+ self.total_views = self.cfg.num_views
792
+
793
+
794
+ def process_im(self, im):
795
+ im = self.image_transforms(im)
796
+ return im
797
+
798
+ def gen(self, img, seed, num_levels):
799
+ set_seed(seed)
800
+ data = {}
801
+
802
+ cond_im_rgb = self.process_im(img)
803
+ cond_im_rgb = torch.stack([cond_im_rgb] * self.total_views, dim=0)
804
+ data["image_cond_rgb"] = cond_im_rgb[None, ...]
805
+ data["normal_prompt_embeddings"] = self.normal_text_embeds[None, ...]
806
+ data["color_prompt_embeddings"] = self.color_text_embeds[None, ...]
807
+
808
+ results = run_multiview_infer(data, self.pipeline, self.cfg, num_levels=num_levels)
809
+ for k in results:
810
+ self.results[k] = results[k]
811
+ return results
812
+
813
+
814
+ class InferCanonicalAPI:
815
+ def __init__(self, config):
816
+ self.config = config
817
+ self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
818
+
819
+ self.config_path = config['config_path']
820
+ self.loaded_config = OmegaConf.load(self.config_path)
821
+
822
+ self.setup(**self.loaded_config)
823
+
824
+ def setup(self,
825
+ validation: Dict,
826
+ pretrained_model_path: str,
827
+ local_crossattn: bool = True,
828
+ unet_from_pretrained_kwargs=None,
829
+ unet_condition_type=None,
830
+ use_noise=True,
831
+ noise_d=256,
832
+ timestep: int = 40,
833
+ width_input: int = 640,
834
+ height_input: int = 1024,
835
+ ):
836
+ self.width_input = width_input
837
+ self.height_input = height_input
838
+ self.timestep = timestep
839
+ self.use_noise = use_noise
840
+ self.noise_d = noise_d
841
+ self.validation = validation
842
+ self.unet_condition_type = unet_condition_type
843
+ self.pretrained_model_path = pretrained_model_path
844
+
845
+ self.tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
846
+ self.text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
847
+ self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(pretrained_model_path, subfolder="image_encoder")
848
+ self.feature_extractor = CLIPImageProcessor()
849
+ self.vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
850
+ self.unet = UNetMV2DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder="unet", local_crossattn=local_crossattn, **unet_from_pretrained_kwargs)
851
+ self.ref_unet = UNetMV2DRefModel.from_pretrained_2d(pretrained_model_path, subfolder="ref_unet", local_crossattn=local_crossattn, **unet_from_pretrained_kwargs)
852
+
853
+ self.text_encoder.to(device, dtype=weight_dtype)
854
+ self.image_encoder.to(device, dtype=weight_dtype)
855
+ self.vae.to(device, dtype=weight_dtype)
856
+ self.ref_unet.to(device, dtype=weight_dtype)
857
+ self.unet.to(device, dtype=weight_dtype)
858
+
859
+ self.vae.requires_grad_(False)
860
+ self.ref_unet.requires_grad_(False)
861
+ self.unet.requires_grad_(False)
862
+
863
+ self.noise_scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler-zerosnr")
864
+ self.validation_pipeline = CanonicalizationPipeline(
865
+ vae=self.vae, text_encoder=self.text_encoder, tokenizer=self.tokenizer, unet=self.unet, ref_unet=self.ref_unet,feature_extractor=self.feature_extractor,image_encoder=self.image_encoder,
866
+ scheduler=self.noise_scheduler
867
+ )
868
+ self.validation_pipeline.set_progress_bar_config(disable=True)
869
+
870
+ def canonicalize(self, image, seed):
871
+ return inference(
872
+ self.validation_pipeline, image, self.vae, self.feature_extractor, self.image_encoder, self.unet, self.ref_unet, self.tokenizer, self.text_encoder,
873
+ self.pretrained_model_path, self.validation, self.width_input, self.height_input, self.unet_condition_type,
874
+ use_noise=self.use_noise, noise_d=self.noise_d, crop=True, seed=seed, timestep=self.timestep
875
+ )
876
+
877
+ def gen(self, img_input, seed=0):
878
+ if np.array(img_input).shape[-1] == 4 and np.array(img_input)[..., 3].min() == 255:
879
+ # convert to RGB
880
+ img_input = img_input.convert("RGB")
881
+ img_output = self.canonicalize(img_input, seed)
882
+
883
+ max_dim = max(img_output.width, img_output.height)
884
+ new_image = Image.new("RGBA", (max_dim, max_dim))
885
+ left = (max_dim - img_output.width) // 2
886
+ top = (max_dim - img_output.height) // 2
887
+ new_image.paste(img_output, (left, top))
888
+
889
+ return new_image
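
For reference, the backup keeps the previous class-based version of the same stage: InferCanonicalAPI.setup loads the tokenizer, text/image encoders, VAE and the two UNets and builds the CanonicalizationPipeline, and gen pads the canonicalized output onto a square RGBA canvas, which is what the new module-level infer_canonicalize_* code in infer_api.py reproduces. A rough stage-1-only sketch against the backed-up API (a sketch only; the config path matches the one now hard-coded in infer_api.py, and model loading happens in the constructor):

# Stage 1 via the backed-up class-based API.
from PIL import Image
from infer_api_bk import InferCanonicalAPI

canon = InferCanonicalAPI({'config_path': './configs/canonicalization-infer.yaml'})
out = canon.gen(Image.open("input.png"), seed=0)   # returns a square RGBA PIL image
out.save("canonical.png")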