JiantaoLin commited on
Commit
02a9751
·
1 Parent(s): e33401c
app.py CHANGED
@@ -1,9 +1,35 @@
1
- import gradio as gr
2
  import os
 
3
  import subprocess
4
- import shlex
5
  import spaces
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  access_token = os.getenv("HUGGINGFACE_TOKEN")
8
  subprocess.run(
9
  shlex.split(
@@ -22,6 +48,8 @@ subprocess.run(
22
  "pip install ./extension/renderutils_plugin-0.1.0-cp310-cp310-linux_x86_64.whl --force-reinstall --no-deps"
23
  )
24
  )
 
 
25
  def install_cuda_toolkit():
26
  # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
27
  # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
@@ -41,6 +69,7 @@ def install_cuda_toolkit():
41
  os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
42
  print("==> finfish install")
43
  install_cuda_toolkit()
 
44
  @spaces.GPU
45
  def check_gpu():
46
  os.environ['CUDA_HOME'] = '/usr/local/cuda-12.1'
@@ -48,338 +77,417 @@ def check_gpu():
48
  # os.environ['LD_LIBRARY_PATH'] += ':/usr/local/cuda-12.1/lib64'
49
  os.environ['LD_LIBRARY_PATH'] = "/usr/local/cuda-12.1/lib64:" + os.environ.get('LD_LIBRARY_PATH', '')
50
  subprocess.run(['nvidia-smi']) # 测试 CUDA 是否可用
 
 
 
 
 
 
 
51
  print(f"torch.cuda.is_available:{torch.cuda.is_available()}")
52
  check_gpu()
53
 
54
- from PIL import Image
55
- from einops import rearrange
56
- from diffusers import FluxPipeline
57
- from models.lrm.utils.camera_util import get_flux_input_cameras
58
- from models.lrm.utils.infer_util import save_video
59
- from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
60
- from models.lrm.utils.render_utils import rotate_x, rotate_y
61
- from models.lrm.utils.train_util import instantiate_from_config
62
- from models.ISOMER.reconstruction_func import reconstruction
63
- from models.ISOMER.projection_func import projection
64
- import os
65
- from einops import rearrange
66
- from omegaconf import OmegaConf
67
- import torch
68
- import numpy as np
69
- import trimesh
70
- import torchvision
71
- import torch.nn.functional as F
72
- from PIL import Image
73
- from torchvision import transforms
74
- from torchvision.transforms import v2
75
- from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler, AutoencoderTiny, AutoencoderKL
76
- from transformers import CLIPTextModel, CLIPTokenizer,T5EncoderModel, T5TokenizerFast
77
- from live_preview_helpers import calculate_shift, retrieve_timesteps, flux_pipe_call_that_returns_an_iterable_of_images
78
- from diffusers import FluxPipeline
79
- from pytorch_lightning import seed_everything
80
- import os
81
- from huggingface_hub import hf_hub_download
82
-
83
-
84
- from utils.tool import NormalTransfer, get_background, get_render_cameras_video, load_mipmap, render_frames
85
-
86
- device_0 = "cuda"
87
- device_1 = "cuda"
88
- resolution = 512
89
- save_dir = "./outputs"
90
- normal_transfer = NormalTransfer()
91
- isomer_azimuths = torch.from_numpy(np.array([0, 90, 180, 270])).float().to(device_1)
92
- isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).float().to(device_1)
93
- isomer_radius = 4.5
94
- isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device_1)
95
- isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device_1)
96
-
97
- # model initialization and loading
98
- # flux
99
- # # taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=torch.bfloat16).to(device_0)
100
- # # good_vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=torch.bfloat16, token=access_token).to(device_0)
101
- # flux_pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, token=access_token).to(device=device_0, dtype=torch.bfloat16)
102
- # # flux_pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, vae=taef1, token=access_token).to(device_0)
103
- # flux_lora_ckpt_path = hf_hub_download(repo_id="LTT/xxx-ckpt", filename="rgb_normal_large.safetensors", repo_type="model", token=access_token)
104
- # flux_pipe.load_lora_weights(flux_lora_ckpt_path)
105
- # flux_pipe.to(device=device_0, dtype=torch.bfloat16)
106
- # torch.cuda.empty_cache()
107
- # flux_pipe.flux_pipe_call_that_returns_an_iterable_of_images = flux_pipe_call_that_returns_an_iterable_of_images.__get__(flux_pipe)
108
-
109
-
110
- # lrm
111
- config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
112
- model_config = config.model_config
113
- infer_config = config.infer_config
114
- model = instantiate_from_config(model_config)
115
- model_ckpt_path = hf_hub_download(repo_id="LTT/PRM", filename="final_ckpt.ckpt", repo_type="model")
116
- state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
117
- state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
118
- model.load_state_dict(state_dict, strict=True)
119
- model = model.to(device_1)
120
- torch.cuda.empty_cache()
121
- @spaces.GPU
122
- def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False):
123
- images = image.unsqueeze(0).to(device_1)
124
- images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
125
- # breakpoint()
126
- with torch.no_grad():
127
- # get triplane
128
- planes = model.forward_planes(images, input_cameras)
129
-
130
- mesh_path_idx = os.path.join(save_path, f'{name}.obj')
131
-
132
- mesh_out = model.extract_mesh(
133
- planes,
134
- use_texture_map=export_texmap,
135
- **infer_config,
136
- )
137
- if export_texmap:
138
- vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
139
- save_obj_with_mtl(
140
- vertices.data.cpu().numpy(),
141
- uvs.data.cpu().numpy(),
142
- faces.data.cpu().numpy(),
143
- mesh_tex_idx.data.cpu().numpy(),
144
- tex_map.permute(1, 2, 0).data.cpu().numpy(),
145
- mesh_path_idx,
146
- )
147
- else:
148
- vertices, faces, vertex_colors = mesh_out
149
- save_obj(vertices, faces, vertex_colors, mesh_path_idx)
150
- print(f"Mesh saved to {mesh_path_idx}")
151
-
152
- render_size = 512
153
- if if_save_video:
154
- video_path_idx = os.path.join(save_path, f'{name}.mp4')
155
- render_size = infer_config.render_resolution
156
- ENV = load_mipmap("models/lrm/env_mipmap/6")
157
- materials = (0.0,0.9)
158
-
159
- all_mv, all_mvp, all_campos = get_render_cameras_video(
160
- batch_size=1,
161
- M=24,
162
- radius=4.5,
163
- elevation=(90, 60.0),
164
- is_flexicubes=True,
165
- fov=30
166
- )
167
-
168
- frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
169
- model,
170
- planes,
171
- render_cameras=all_mvp,
172
- camera_pos=all_campos,
173
- env=ENV,
174
- materials=materials,
175
- render_size=render_size,
176
- chunk_size=20,
177
- is_flexicubes=True,
178
- )
179
- normals = (torch.nn.functional.normalize(normals) + 1) / 2
180
- normals = normals * alphas + (1-alphas)
181
- all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)
182
-
183
- save_video(
184
- all_frames,
185
- video_path_idx,
186
- fps=30,
187
- )
188
- print(f"Video saved to {video_path_idx}")
189
 
190
- return vertices, faces
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
- def local_normal_global_transform(local_normal_images, azimuths_deg, elevations_deg):
194
- if local_normal_images.min() >= 0:
195
- local_normal = local_normal_images.float() * 2 - 1
196
- else:
197
- local_normal = local_normal_images.float()
198
- global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
199
- global_normal[...,0] *= -1
200
- global_normal = (global_normal + 1) / 2
201
- global_normal = global_normal.permute(0, 3, 1, 2)
202
- return global_normal
203
-
204
- # 生成多视图图像
205
- @spaces.GPU(duration=120)
206
- def generate_multi_view_images(prompt, seed):
207
- # torch.cuda.empty_cache()
208
- # generator = torch.manual_seed(seed)
209
- generator = torch.Generator().manual_seed(seed)
210
- with torch.no_grad():
211
- img = flux_pipe(
212
- prompt=prompt,
213
- num_inference_steps=5,
214
- guidance_scale=3.5,
215
- num_images_per_prompt=1,
216
- width=resolution * 2,
217
- height=resolution * 1,
218
- output_type='np',
219
- generator=generator,
220
- ).images
221
- # for img in flux_pipe.flux_pipe_call_that_returns_an_iterable_of_images(
222
- # prompt=prompt,
223
- # guidance_scale=3.5,
224
- # num_inference_steps=4,
225
- # width=resolution * 4,
226
- # height=resolution * 2,
227
- # generator=generator,
228
- # output_type="np",
229
- # good_vae=good_vae,
230
- # ):
231
- # pass
232
- # 返回最终的图像和种子(通过外部调用处理)
233
- return img
234
-
235
- # 重建 3D 模型
236
  @spaces.GPU
237
- def reconstruct_3d_model(images, prompt):
238
- global model
239
- model.init_flexicubes_geometry(device_1, fovy=50.0)
240
- model = model.eval()
241
- rgb_normal_grid = images
242
- save_dir_path = os.path.join(save_dir, prompt.replace(" ", "_"))
243
- os.makedirs(save_dir_path, exist_ok=True)
244
-
245
- images = torch.from_numpy(rgb_normal_grid).squeeze(0).permute(2, 0, 1).contiguous().float() # (3, 1024, 2048)
246
- images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=2, m=4) # (8, 3, 512, 512)
247
- rgb_multi_view = images[:4, :3, :, :]
248
- normal_multi_view = images[4:, :3, :, :]
249
- multi_view_mask = get_background(normal_multi_view)
250
- rgb_multi_view = rgb_multi_view * rgb_multi_view + (1-multi_view_mask)
251
- input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device_1)
252
- vertices, faces = lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm', export_texmap=False, if_save_video=True)
253
- # local normal to global normal
254
-
255
- global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1), isomer_azimuths, isomer_elevations)
256
- global_normal = global_normal * multi_view_mask + (1-multi_view_mask)
257
-
258
- global_normal = global_normal.permute(0,2,3,1)
259
- rgb_multi_view = rgb_multi_view.permute(0,2,3,1)
260
- multi_view_mask = multi_view_mask.permute(0,2,3,1).squeeze(-1)
261
- vertices = torch.from_numpy(vertices).to(device_1)
262
- faces = torch.from_numpy(faces).to(device_1)
263
- vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
264
- vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]
265
-
266
- # global_normal: B,H,W,3
267
- # multi_view_mask: B,H,W
268
- # rgb_multi_view: B,H,W,3
269
 
270
- meshes = reconstruction(
271
- normal_pils=global_normal,
272
- masks=multi_view_mask,
273
- weights=isomer_geo_weights,
274
- fov=30,
275
- radius=isomer_radius,
276
- camera_angles_azi=isomer_azimuths,
277
- camera_angles_ele=isomer_elevations,
278
- expansion_weight_stage1=0.1,
279
- init_type="file",
280
- init_verts=vertices,
281
- init_faces=faces,
282
- stage1_steps=0,
283
- stage2_steps=50,
284
- start_edge_len_stage1=0.1,
285
- end_edge_len_stage1=0.02,
286
- start_edge_len_stage2=0.02,
287
- end_edge_len_stage2=0.005,
288
- )
289
 
 
 
290
 
291
- save_glb_addr = projection(
292
- meshes,
293
- masks=multi_view_mask,
294
- images=rgb_multi_view,
295
- azimuths=isomer_azimuths,
296
- elevations=isomer_elevations,
297
- weights=isomer_color_weights,
298
- fov=30,
299
- radius=isomer_radius,
300
- save_dir=f"{save_dir_path}/ISOMER/",
301
- )
302
 
303
- return save_glb_addr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
- # Gradio 接口函数
306
  @spaces.GPU
307
- def gradio_pipeline(prompt, seed):
308
- import ctypes
309
- # 显式加载 libnvrtc.so.12
310
- cuda_lib_path = "/usr/local/cuda-12.1/lib64/libnvrtc.so.12"
311
- try:
312
- ctypes.CDLL(cuda_lib_path, mode=ctypes.RTLD_GLOBAL)
313
- print(f"Successfully preloaded {cuda_lib_path}")
314
- except OSError as e:
315
- print(f"Failed to preload {cuda_lib_path}: {e}")
316
- # 生成多视图图像
317
- # rgb_normal_grid = generate_multi_view_images(prompt, seed)
318
- rgb_normal_grid = np.load("rgb_normal_grid.npy")
319
- image_preview = Image.fromarray((rgb_normal_grid[0] * 255).astype(np.uint8))
320
-
321
- # 3d reconstruction
322
-
323
-
324
- # 重建 3D 模型并返回 glb 路径
325
- save_glb_addr = reconstruct_3d_model(rgb_normal_grid, prompt)
326
- # save_glb_addr = None
327
- return image_preview, save_glb_addr
328
-
329
- # Gradio Blocks 应用
330
- with gr.Blocks() as demo:
331
- with gr.Row(variant="panel"):
332
- # 左侧输入区域
333
- with gr.Column():
334
- with gr.Row():
335
- prompt_input = gr.Textbox(
336
- label="Enter Prompt",
337
- placeholder="Describe your 3D model...",
338
- lines=2,
339
- elem_id="prompt_input"
340
- )
341
-
342
- with gr.Row():
343
- sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
344
-
345
- with gr.Row():
346
- submit = gr.Button("Generate", elem_id="generate", variant="primary")
347
-
348
- with gr.Row(variant="panel"):
349
- gr.Markdown("Examples:")
350
- gr.Examples(
351
- examples=[
352
- ["a castle on a hill"],
353
- ["an owl wearing a hat"],
354
- ["a futuristic car"]
355
- ],
356
- inputs=[prompt_input],
357
- label="Prompt Examples"
358
- )
359
-
360
- # 右侧输出区域
361
- with gr.Column():
362
- with gr.Row():
363
- rgb_normal_grid_image = gr.Image(
364
- label="RGB Normal Grid",
365
- type="pil",
366
- interactive=False
367
- )
368
-
369
- with gr.Row():
370
- with gr.Tab("GLB"):
371
- output_glb_model = gr.Model3D(
372
- label="Generated 3D Model (GLB Format)",
373
- interactive=False
374
- )
375
- gr.Markdown("Download the model for proper visualization.")
376
-
377
- # 处理逻辑
378
- submit.click(
379
- fn=gradio_pipeline, inputs=[prompt_input, sample_seed],
380
- outputs=[rgb_normal_grid_image, output_glb_model]
381
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
- # 启动应用
384
- # demo.queue(max_size=10)
385
- demo.launch()
 
 
1
  import os
2
+ import gradio as gr
3
  import subprocess
 
4
  import spaces
5
+ import ctypes
6
+ import shlex
7
+ import base64
8
+ import re
9
+ import sys
10
+
11
+ from models.ISOMER.scripts.utils import fix_vert_color_glb
12
+
13
+ sys.path.append(os.path.abspath(os.path.join(__file__, '../')))
14
+ if 'OMP_NUM_THREADS' not in os.environ:
15
+ os.environ['OMP_NUM_THREADS'] = '32'
16
+
17
+ import shutil
18
  import torch
19
+ import json
20
+ import requests
21
+ import shutil
22
+ import threading
23
+ from PIL import Image
24
+ import time
25
+ torch.backends.cuda.matmul.allow_tf32 = True
26
+ import trimesh
27
+
28
+ import random
29
+ import time
30
+ import numpy as np
31
+ from video_render import render_video_from_obj
32
+
33
  access_token = os.getenv("HUGGINGFACE_TOKEN")
34
  subprocess.run(
35
  shlex.split(
 
48
  "pip install ./extension/renderutils_plugin-0.1.0-cp310-cp310-linux_x86_64.whl --force-reinstall --no-deps"
49
  )
50
  )
51
+
52
+ # download cudatoolkit
53
  def install_cuda_toolkit():
54
  # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
55
  # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
 
69
  os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
70
  print("==> finfish install")
71
  install_cuda_toolkit()
72
+
73
  @spaces.GPU
74
  def check_gpu():
75
  os.environ['CUDA_HOME'] = '/usr/local/cuda-12.1'
 
77
  # os.environ['LD_LIBRARY_PATH'] += ':/usr/local/cuda-12.1/lib64'
78
  os.environ['LD_LIBRARY_PATH'] = "/usr/local/cuda-12.1/lib64:" + os.environ.get('LD_LIBRARY_PATH', '')
79
  subprocess.run(['nvidia-smi']) # 测试 CUDA 是否可用
80
+ # 显式加载 libnvrtc.so.12
81
+ cuda_lib_path = "/usr/local/cuda-12.1/lib64/libnvrtc.so.12"
82
+ try:
83
+ ctypes.CDLL(cuda_lib_path, mode=ctypes.RTLD_GLOBAL)
84
+ print(f"Successfully preloaded {cuda_lib_path}")
85
+ except OSError as e:
86
+ print(f"Failed to preload {cuda_lib_path}: {e}")
87
  print(f"torch.cuda.is_available:{torch.cuda.is_available()}")
88
  check_gpu()
89
 
90
+ from pipeline.kiss3d_wrapper import init_wrapper_from_config, run_text_to_3d, run_image_to_3d, image2mesh_preprocess, image2mesh_main
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ is_running = False
93
 
94
+ TEXT_URL = "http://127.0.0.1:9239/prompt"
95
+ IMG_URL = ""
96
+
97
+
98
+ KISS_3D_TEXT_FOLDER = "./outputs/text2"
99
+ KISS_3D_IMG_FOLDER = "./outputs/image2"
100
+
101
+ # Add logo file path and hyperlinks
102
+ LOGO_PATH = "app_assets/logo_temp_.png" # Update this to the actual path of your logo
103
+ ARXIV_LINK = "https://arxiv.org/abs/example"
104
+ GITHUB_LINK = "https://github.com/example"
105
+
106
+ k3d_wrapper = init_wrapper_from_config('./pipeline/pipeline_config/default.yaml')
107
+
108
+
109
+ TEMP_MESH_ADDRESS=''
110
+
111
+ mesh_cache = None
112
+ preprocessed_input_image = None
113
+
114
+ def save_cached_mesh():
115
+ global mesh_cache
116
+ return mesh_cache
117
+ # if mesh_cache is None:
118
+ # return None
119
+ # return save_py3dmesh_with_trimesh_fast(mesh_cache)
120
+
121
+ def save_py3dmesh_with_trimesh_fast(meshes, save_glb_path=TEMP_MESH_ADDRESS, apply_sRGB_to_LinearRGB=True):
122
+ from pytorch3d.structures import Meshes
123
+ import trimesh
124
+
125
+ # convert from pytorch3d meshes to trimesh mesh
126
+ vertices = meshes.verts_packed().cpu().float().numpy()
127
+ triangles = meshes.faces_packed().cpu().long().numpy()
128
+ np_color = meshes.textures.verts_features_packed().cpu().float().numpy()
129
+ if save_glb_path.endswith(".glb"):
130
+ # rotate 180 along +Y
131
+ vertices[:, [0, 2]] = -vertices[:, [0, 2]]
132
+
133
+ def srgb_to_linear(c_srgb):
134
+ c_linear = np.where(c_srgb <= 0.04045, c_srgb / 12.92, ((c_srgb + 0.055) / 1.055) ** 2.4)
135
+ return c_linear.clip(0, 1.)
136
+ if apply_sRGB_to_LinearRGB:
137
+ np_color = srgb_to_linear(np_color)
138
+ assert vertices.shape[0] == np_color.shape[0]
139
+ assert np_color.shape[1] == 3
140
+ assert 0 <= np_color.min() and np_color.max() <= 1, f"min={np_color.min()}, max={np_color.max()}"
141
+ mesh = trimesh.Trimesh(vertices=vertices, faces=triangles, vertex_colors=np_color)
142
+ mesh.remove_unreferenced_vertices()
143
+ # save mesh
144
+ mesh.export(save_glb_path)
145
+ if save_glb_path.endswith(".glb"):
146
+ fix_vert_color_glb(save_glb_path)
147
+ print(f"saving to {save_glb_path}")
148
+ #
149
+ #
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  @spaces.GPU
152
+ def text_to_detailed(prompt, seed=None):
153
+ print(f"Before text_to_detailed: {torch.cuda.memory_allocated() / 1024**3} GB")
154
+ return k3d_wrapper.get_detailed_prompt(prompt, seed)
155
+
156
+ @spaces.GPU
157
+ def text_to_image(prompt, seed=None, strength=1.0,lora_scale=1.0, num_inference_steps=30, redux_hparam=None, init_image=None, **kwargs):
158
+ print(f"Before text_to_image: {torch.cuda.memory_allocated() / 1024**3} GB")
159
+ k3d_wrapper.renew_uuid()
160
+ init_image = None
161
+ if init_image_path is not None:
162
+ init_image = Image.open(init_image_path)
163
+ result = k3d_wrapper.generate_3d_bundle_image_text(
164
+ prompt,
165
+ image=init_image,
166
+ strength=strength,
167
+ lora_scale=lora_scale,
168
+ num_inference_steps=num_inference_steps,
169
+ seed=int(seed) if seed is not None else None,
170
+ redux_hparam=redux_hparam,
171
+ save_intermediate_results=True,
172
+ **kwargs)
173
+ return result[-1]
174
+
175
+ def image2mesh_preprocess_(input_image_, seed, use_mv_rgb=True):
176
+ global preprocessed_input_image
177
+
178
+ seed = int(seed) if seed is not None else None
179
+
180
+ # TODO: delete this later
181
+ k3d_wrapper.del_llm_model()
 
 
182
 
183
+ input_image_save_path, reference_save_path, caption = image2mesh_preprocess(k3d_wrapper, input_image_, seed, use_mv_rgb)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
+ preprocessed_input_image = Image.open(input_image_save_path)
186
+ return reference_save_path, caption
187
 
188
+ @spaces.GPU
189
+ def image2mesh_main_(reference_3d_bundle_image, caption, seed, strength1=0.5, strength2=0.95, enable_redux=True, use_controlnet=True, if_video=True):
190
+ global mesh_cache
191
+ seed = int(seed) if seed is not None else None
192
+
193
+
194
+ # TODO: delete this later
195
+ k3d_wrapper.del_llm_model()
196
+
197
+ input_image = preprocessed_input_image
 
198
 
199
+ reference_3d_bundle_image = torch.tensor(reference_3d_bundle_image).permute(2,0,1)/255
200
+
201
+ gen_save_path, recon_mesh_path = image2mesh_main(k3d_wrapper, input_image, reference_3d_bundle_image, caption=caption, seed=seed, strength1=strength1, strength2=strength2, enable_redux=enable_redux, use_controlnet=use_controlnet)
202
+ mesh_cache = recon_mesh_path
203
+
204
+
205
+ # gen_save_ = Image.open(gen_save_path)
206
+
207
+ if if_video:
208
+ video_path = recon_mesh_path.replace('.obj','.mp4').replace('.glb','.mp4')
209
+ render_video_from_obj(recon_mesh_path, video_path)
210
+ print(f"After bundle_image_to_mesh: {torch.cuda.memory_allocated() / 1024**3} GB")
211
+ return gen_save_path, video_path
212
+ else:
213
+ return gen_save_path, recon_mesh_path
214
+ # return gen_save_path, recon_mesh_path
215
 
 
216
  @spaces.GPU
217
+ def bundle_image_to_mesh(
218
+ gen_3d_bundle_image,
219
+ lrm_radius = 4.15,
220
+ isomer_radius = 4.5,
221
+ reconstruction_stage1_steps = 10,
222
+ reconstruction_stage2_steps = 50,
223
+ save_intermediate_results=True,
224
+ if_video=True
225
+ ):
226
+ global mesh_cache
227
+ print(f"Before bundle_image_to_mesh: {torch.cuda.memory_allocated() / 1024**3} GB")
228
+
229
+ # TODO: delete this later
230
+ k3d_wrapper.del_llm_model()
231
+
232
+ print(f"Before bundle_image_to_mesh after deleting llm model: {torch.cuda.memory_allocated() / 1024**3} GB")
233
+
234
+ gen_3d_bundle_image = torch.tensor(gen_3d_bundle_image).permute(2,0,1)/255
235
+ # recon from 3D Bundle image
236
+ recon_mesh_path = k3d_wrapper.reconstruct_3d_bundle_image(gen_3d_bundle_image, lrm_render_radius=lrm_radius, isomer_radius=isomer_radius, save_intermediate_results=save_intermediate_results, reconstruction_stage1_steps=int(reconstruction_stage1_steps), reconstruction_stage2_steps=int(reconstruction_stage2_steps))
237
+ mesh_cache = recon_mesh_path
238
+
239
+ if if_video:
240
+ video_path = recon_mesh_path.replace('.obj','.mp4').replace('.glb','.mp4')
241
+ # # 检查这个video_path文件大小是是否超过50KB,不超过的话就认为是空文件,需要重新渲染
242
+ # if os.path.exists(video_path):
243
+ # print(f"file size:{os.path.getsize(video_path)}")
244
+ # if os.path.getsize(video_path) > 50*1024:
245
+ # print(f"video path:{video_path}")
246
+ # return video_path
247
+ render_video_from_obj(recon_mesh_path, video_path)
248
+ print(f"After bundle_image_to_mesh: {torch.cuda.memory_allocated() / 1024**3} GB")
249
+ return video_path
250
+ else:
251
+ return recon_mesh_path
252
+
253
+ _HEADER_=f"""
254
+ <img src="{LOGO_PATH}">
255
+ <h2><b>Official 🤗 Gradio Demo</b></h2><h2>
256
+ <b>Kiss3DGen: Repurposing Image Diffusion Models for 3D Asset Generation</b></a></h2>
257
+
258
+ <p>**Kiss3DGen** is xxxxxxxxx</p>
259
+
260
+ [![arXiv](https://img.shields.io/badge/arXiv-Link-red)]({ARXIV_LINK}) [![GitHub](https://img.shields.io/badge/GitHub-Repo-blue)]({GITHUB_LINK})
261
+ """
262
+
263
+ _CITE_ = r"""
264
+ <h2>If Kiss3DGen is helpful, please help to ⭐ the <a href='{""" + GITHUB_LINK + r"""}' target='_blank'>Github Repo</a>. Thanks!</h2>
265
+
266
+ 📝 **Citation**
267
+
268
+ If you find our work useful for your research or applications, please cite using this bibtex:
269
+ ```bibtex
270
+ @article{xxxx,
271
+ title={xxxx},
272
+ author={xxxx},
273
+ journal={xxxx},
274
+ year={xxxx}
275
+ }
276
+ ```
277
+
278
+ 📋 **License**
279
+
280
+ Apache-2.0 LICENSE. Please refer to the [LICENSE file](https://huggingface.co/spaces/TencentARC/InstantMesh/blob/main/LICENSE) for details.
281
+
282
+ 📧 **Contact**
283
+
284
+ If you have any questions, feel free to open a discussion or contact us at <b>xxx@xxxx</b>.
285
+ """
286
+
287
+ def image_to_base64(image_path):
288
+ """Converts an image file to a base64-encoded string."""
289
+ with open(image_path, "rb") as img_file:
290
+ return base64.b64encode(img_file.read()).decode('utf-8')
291
+
292
+ def main():
293
+
294
+ torch.set_grad_enabled(False)
295
+
296
+ # Convert the logo image to base64
297
+ logo_base64 = image_to_base64(LOGO_PATH)
298
+ # with gr.Blocks() as demo:
299
+ with gr.Blocks(css="""
300
+ body {
301
+ display: flex;
302
+ justify-content: center;
303
+ align-items: center;
304
+ min-height: 100vh;
305
+ margin: 0;
306
+ padding: 0;
307
+ }
308
+ #col-container { margin: 0px auto; max-width: 200px; }
309
+
310
+
311
+ .gradio-container {
312
+ max-width: 1000px;
313
+ margin: auto;
314
+ width: 100%;
315
+ }
316
+ #center-align-column {
317
+ display: flex;
318
+ justify-content: center;
319
+ align-items: center;
320
+ }
321
+ #right-align-column {
322
+ display: flex;
323
+ justify-content: flex-end;
324
+ align-items: center;
325
+ }
326
+ h1 {text-align: center;}
327
+ h2 {text-align: center;}
328
+ h3 {text-align: center;}
329
+ p {text-align: center;}
330
+ img {text-align: right;}
331
+ .right {
332
+ display: block;
333
+ margin-left: auto;
334
+ }
335
+ .center {
336
+ display: block;
337
+ margin-left: auto;
338
+ margin-right: auto;
339
+ width: 50%;
340
+
341
+ #content-container {
342
+ max-width: 1200px;
343
+ margin: 0 auto;
344
+ }
345
+ #example-container {
346
+ max-width: 300px;
347
+ margin: 0 auto;
348
+ }
349
+ """,elem_id="col-container") as demo:
350
+ # Header Section
351
+ # gr.Image(value=LOGO_PATH, width=64, height=64)
352
+ # gr.Markdown(_HEADER_)
353
+ with gr.Row(elem_id="content-container"):
354
+ # with gr.Column(scale=1):
355
+ # pass
356
+ # with gr.Column(scale=1, elem_id="right-align-column"):
357
+ # # gr.Image(value=LOGO_PATH, interactive=False, show_label=False, width=64, height=64, elem_id="logo-image")
358
+ # # gr.Markdown(f"<img src='{LOGO_PATH}' alt='Logo' style='width:64px;height:64px;border:0;'>")
359
+ # # gr.HTML(f"<img src='data:image/png;base64,{logo_base64}' alt='Logo' class='right' style='width:64px;height:64px;border:0;text-align:right;'>")
360
+ # pass
361
+ with gr.Column(scale=7, elem_id="center-align-column"):
362
+ gr.Markdown(f"""
363
+ ## Official 🤗 Gradio Demo
364
+ # Kiss3DGen: Repurposing Image Diffusion Models for 3D Asset Generation""")
365
+ gr.HTML(f"<img src='data:image/png;base64,{logo_base64}' alt='Logo' class='center' style='width:64px;height:64px;border:0;text-align:center;'>")
366
+
367
+ gr.HTML(f"""
368
+ <div style="display: flex; justify-content: center; align-items: center; gap: 10px;">
369
+ <a href="{ARXIV_LINK}" target="_blank">
370
+ <img src="https://img.shields.io/badge/arXiv-Link-red" alt="arXiv">
371
+ </a>
372
+ <a href="{GITHUB_LINK}" target="_blank">
373
+ <img src="https://img.shields.io/badge/GitHub-Repo-blue" alt="GitHub">
374
+ </a>
375
+ </div>
376
+
377
+ """)
378
+
379
+
380
+ # gr.HTML(f"""
381
+ # <div style="display: flex; gap: 10px; align-items: center;"><a href="{ARXIV_LINK}" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/arXiv-Link-red" alt="arXiv"></a> <a href="{GITHUB_LINK}" target="_blank" rel="noopener noreferrer"><img src="https://img.shields.io/badge/GitHub-Repo-blue" alt="GitHub"></a></div>
382
+ # """)
383
+
384
+ # gr.Markdown(f"""
385
+ # [![arXiv](https://img.shields.io/badge/arXiv-Link-red)]({ARXIV_LINK}) [![GitHub](https://img.shields.io/badge/GitHub-Repo-blue)]({GITHUB_LINK})
386
+ # """, elem_id="title")
387
+ # with gr.Column(scale=1):
388
+ # pass
389
+ # with gr.Row():
390
+ # gr.Markdown(f"[![arXiv](https://img.shields.io/badge/arXiv-Link-red)]({ARXIV_LINK})")
391
+ # gr.Markdown(f"[![GitHub](https://img.shields.io/badge/GitHub-Repo-blue)]({GITHUB_LINK})")
392
+
393
+ # Tabs Section
394
+ with gr.Tabs(selected='tab_text_to_3d', elem_id="content-container") as main_tabs:
395
+ with gr.TabItem('Text-to-3D', id='tab_text_to_3d'):
396
+ with gr.Row():
397
+ with gr.Column(scale=1):
398
+ prompt = gr.Textbox(value="", label="Input Prompt", lines=4)
399
+ seed1 = gr.Number(value=10, label="Seed")
400
+
401
+ with gr.Row(elem_id="example-container"):
402
+ gr.Examples(
403
+ examples=[
404
+ # ["A tree with red leaves"],
405
+ # ["A dragon with black texture"],
406
+ ["A girl with pink hair"],
407
+ ["A boy playing guitar"],
408
+
409
+
410
+ ["A dog wearing a hat"],
411
+ ["A boy playing basketball"],
412
+ # [""],
413
+ # [""],
414
+ # [""],
415
+
416
+ ],
417
+ inputs=[prompt], # 将选中的示例填入 prompt 文本框
418
+ label="Example Prompts"
419
+ )
420
+ btn_text2detailed = gr.Button("Refine to detailed prompt")
421
+ detailed_prompt = gr.Textbox(value="", label="Detailed Prompt", placeholder="detailed prompt will be generated here base on your input prompt. You can also edit this prompt", lines=4, interactive=True)
422
+ btn_text2img = gr.Button("Generate Images")
423
+
424
+ with gr.Column(scale=1):
425
+ output_image1 = gr.Image(label="Generated image", interactive=False)
426
+
427
+
428
+ # lrm_radius = gr.Number(value=4.15, label="lrm_radius")
429
+ # isomer_radius = gr.Number(value=4.5, label="isomer_radius")
430
+ # reconstruction_stage1_steps = gr.Number(value=10, label="reconstruction_stage1_steps")
431
+ # reconstruction_stage2_steps = gr.Number(value=50, label="reconstruction_stage2_steps")
432
+
433
+ btn_gen_mesh = gr.Button("Generate Mesh")
434
+ output_video1 = gr.Video(label="Generated Video", interactive=False, loop=True, autoplay=True)
435
+ btn_download1 = gr.Button("Download Mesh")
436
+
437
+ file_output1 = gr.File()
438
+
439
+ with gr.TabItem('Image-to-3D', id='tab_image_to_3d'):
440
+ with gr.Row():
441
+ with gr.Column(scale=1):
442
+ image = gr.Image(label="Input Image", type="pil")
443
+
444
+ seed2 = gr.Number(value=10, label="Seed (0 for random)")
445
+
446
+ btn_img2mesh_preprocess = gr.Button("Preprocess Image")
447
+
448
+ image_caption = gr.Textbox(value="", label="Image Caption", placeholder="caption will be generated here base on your input image. You can also edit this caption", lines=4, interactive=True)
449
+
450
+ output_image2 = gr.Image(label="Generated image", interactive=False)
451
+ strength1 = gr.Slider(minimum=0, maximum=1.0, step=0.01, value=0.5, label="strength1")
452
+ strength2 = gr.Slider(minimum=0, maximum=1.0, step=0.01, value=0.95, label="strength2")
453
+ enable_redux = gr.Checkbox(label="enable redux", value=True)
454
+ use_controlnet = gr.Checkbox(label="use controlnet", value=True)
455
+
456
+ btn_img2mesh_main = gr.Button("Generate Mesh")
457
+
458
+ with gr.Column(scale=1):
459
+
460
+ # output_mesh2 = gr.Model3D(label="Generated Mesh", interactive=False)
461
+ output_image3 = gr.Image(label="gen save image", interactive=False)
462
+ output_video2 = gr.Video(label="Generated Video", interactive=False, loop=True, autoplay=True)
463
+ btn_download2 = gr.Button("Download Mesh")
464
+ file_output2 = gr.File()
465
+
466
+ # Image2
467
+ btn_img2mesh_preprocess.click(fn=image2mesh_preprocess_, inputs=[image, seed2], outputs=[output_image2, image_caption])
468
+
469
+ btn_img2mesh_main.click(fn=image2mesh_main_, inputs=[output_image2, image_caption, seed2, strength1, strength2, enable_redux, use_controlnet], outputs=[output_image3, output_video2])
470
+
471
+
472
+ btn_download2.click(fn=save_cached_mesh, inputs=[], outputs=file_output2)
473
+
474
+
475
+ # Button Click Events
476
+ # Text2
477
+ btn_text2detailed.click(fn=text_to_detailed, inputs=[prompt, seed1], outputs=detailed_prompt)
478
+ btn_text2img.click(fn=text_to_image, inputs=[detailed_prompt, seed1], outputs=output_image1)
479
+ btn_gen_mesh.click(fn=bundle_image_to_mesh, inputs=[output_image1,], outputs=output_video1)
480
+ # btn_gen_mesh.click(fn=bundle_image_to_mesh, inputs=[output_image1, lrm_radius, isomer_radius, reconstruction_stage1_steps, reconstruction_stage2_steps], outputs=output_video1)
481
+
482
+ with gr.Row():
483
+ pass
484
+ with gr.Row():
485
+ gr.Markdown(_CITE_)
486
+
487
+ # demo.queue(default_concurrency_limit=1)
488
+ # demo.launch(server_name="0.0.0.0", server_port=9239)
489
+ demo.launch()
490
+
491
 
492
+ if __name__ == "__main__":
493
+ main()
 
demo.py → app_demo.py RENAMED
@@ -4,11 +4,10 @@ import subprocess
4
  import shlex
5
  import spaces
6
  import torch
7
- import numpy as numpy
8
  access_token = os.getenv("HUGGINGFACE_TOKEN")
9
  subprocess.run(
10
  shlex.split(
11
- "pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt210/download.html"
12
  )
13
  )
14
 
@@ -20,7 +19,7 @@ subprocess.run(
20
 
21
  subprocess.run(
22
  shlex.split(
23
- "pip install ./extension/renderutils_plugin-1.0-cp310-cp310-linux_x86_64.whl --force-reinstall --no-deps"
24
  )
25
  )
26
  def install_cuda_toolkit():
@@ -41,7 +40,7 @@ def install_cuda_toolkit():
41
  # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
42
  os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
43
  print("==> finfish install")
44
- # install_cuda_toolkit()
45
  @spaces.GPU
46
  def check_gpu():
47
  os.environ['CUDA_HOME'] = '/usr/local/cuda-12.1'
@@ -84,8 +83,8 @@ from huggingface_hub import hf_hub_download
84
 
85
  from utils.tool import NormalTransfer, get_background, get_render_cameras_video, load_mipmap, render_frames
86
 
87
- device_0 = "cuda:0"
88
- device_1 = "cuda:1"
89
  resolution = 512
90
  save_dir = "./outputs"
91
  normal_transfer = NormalTransfer()
@@ -97,15 +96,15 @@ isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(d
97
 
98
  # model initialization and loading
99
  # flux
100
- taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=torch.bfloat16).to(device_0)
101
- good_vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=torch.bfloat16, token=access_token).to(device_0)
102
  # flux_pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, token=access_token).to(device=device_0, dtype=torch.bfloat16)
103
- flux_pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, vae=taef1, token=access_token).to(device_0)
104
- flux_lora_ckpt_path = hf_hub_download(repo_id="LTT/xxx-ckpt", filename="rgb_normal_large.safetensors", repo_type="model")
105
- flux_pipe.load_lora_weights(flux_lora_ckpt_path)
106
  # flux_pipe.to(device=device_0, dtype=torch.bfloat16)
107
- torch.cuda.empty_cache()
108
- flux_pipe.flux_pipe_call_that_returns_an_iterable_of_images = flux_pipe_call_that_returns_an_iterable_of_images.__get__(flux_pipe)
109
 
110
 
111
  # lrm
@@ -159,7 +158,7 @@ def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", expor
159
 
160
  all_mv, all_mvp, all_campos = get_render_cameras_video(
161
  batch_size=1,
162
- M=240,
163
  radius=4.5,
164
  elevation=(90, 60.0),
165
  is_flexicubes=True,
@@ -209,28 +208,27 @@ def generate_multi_view_images(prompt, seed):
209
  # generator = torch.manual_seed(seed)
210
  generator = torch.Generator().manual_seed(seed)
211
  with torch.no_grad():
212
- # images = flux_pipe(
 
 
 
 
 
 
 
 
 
 
213
  # prompt=prompt,
214
- # num_inference_steps=10,
215
  # guidance_scale=3.5,
216
- # num_images_per_prompt=1,
217
  # width=resolution * 4,
218
  # height=resolution * 2,
219
- # output_type='np',
220
  # generator=generator,
 
221
  # good_vae=good_vae,
222
- # ).images
223
- for img in flux_pipe.flux_pipe_call_that_returns_an_iterable_of_images(
224
- prompt=prompt,
225
- guidance_scale=3.5,
226
- num_inference_steps=10,
227
- width=resolution * 4,
228
- height=resolution * 2,
229
- generator=generator,
230
- output_type="np",
231
- good_vae=good_vae,
232
- ):
233
- pass
234
  # 返回最终的图像和种子(通过外部调用处理)
235
  return img
236
 
@@ -251,7 +249,7 @@ def reconstruct_3d_model(images, prompt):
251
  multi_view_mask = get_background(normal_multi_view)
252
  rgb_multi_view = rgb_multi_view * rgb_multi_view + (1-multi_view_mask)
253
  input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device_1)
254
- vertices, faces = lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm', export_texmap=False, if_save_video=False)
255
  # local normal to global normal
256
 
257
  global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1), isomer_azimuths, isomer_elevations)
@@ -307,19 +305,81 @@ def reconstruct_3d_model(images, prompt):
307
  # Gradio 接口函数
308
  @spaces.GPU
309
  def gradio_pipeline(prompt, seed):
 
 
 
 
 
 
 
 
310
  # 生成多视图图像
311
- rgb_normal_grid = generate_multi_view_images(prompt, seed)
312
- image_preview = Image.fromarray((rgb_normal_grid * 255).astype(np.uint8))
 
313
 
314
  # 3d reconstruction
315
 
316
 
317
  # 重建 3D 模型并返回 glb 路径
318
  save_glb_addr = reconstruct_3d_model(rgb_normal_grid, prompt)
319
-
320
  return image_preview, save_glb_addr
321
 
322
- if __name__ == "__main__":
323
- prompt_input = "a owm"
324
- sample_seed = 42
325
- gradio_pipeline(prompt_input, sample_seed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  import shlex
5
  import spaces
6
  import torch
 
7
  access_token = os.getenv("HUGGINGFACE_TOKEN")
8
  subprocess.run(
9
  shlex.split(
10
+ "pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt240/download.html"
11
  )
12
  )
13
 
 
19
 
20
  subprocess.run(
21
  shlex.split(
22
+ "pip install ./extension/renderutils_plugin-0.1.0-cp310-cp310-linux_x86_64.whl --force-reinstall --no-deps"
23
  )
24
  )
25
  def install_cuda_toolkit():
 
40
  # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
41
  os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"
42
  print("==> finfish install")
43
+ install_cuda_toolkit()
44
  @spaces.GPU
45
  def check_gpu():
46
  os.environ['CUDA_HOME'] = '/usr/local/cuda-12.1'
 
83
 
84
  from utils.tool import NormalTransfer, get_background, get_render_cameras_video, load_mipmap, render_frames
85
 
86
+ device_0 = "cuda"
87
+ device_1 = "cuda"
88
  resolution = 512
89
  save_dir = "./outputs"
90
  normal_transfer = NormalTransfer()
 
96
 
97
  # model initialization and loading
98
  # flux
99
+ # # taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=torch.bfloat16).to(device_0)
100
+ # # good_vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=torch.bfloat16, token=access_token).to(device_0)
101
  # flux_pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, token=access_token).to(device=device_0, dtype=torch.bfloat16)
102
+ # # flux_pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16, vae=taef1, token=access_token).to(device_0)
103
+ # flux_lora_ckpt_path = hf_hub_download(repo_id="LTT/xxx-ckpt", filename="rgb_normal_large.safetensors", repo_type="model", token=access_token)
104
+ # flux_pipe.load_lora_weights(flux_lora_ckpt_path)
105
  # flux_pipe.to(device=device_0, dtype=torch.bfloat16)
106
+ # torch.cuda.empty_cache()
107
+ # flux_pipe.flux_pipe_call_that_returns_an_iterable_of_images = flux_pipe_call_that_returns_an_iterable_of_images.__get__(flux_pipe)
108
 
109
 
110
  # lrm
 
158
 
159
  all_mv, all_mvp, all_campos = get_render_cameras_video(
160
  batch_size=1,
161
+ M=24,
162
  radius=4.5,
163
  elevation=(90, 60.0),
164
  is_flexicubes=True,
 
208
  # generator = torch.manual_seed(seed)
209
  generator = torch.Generator().manual_seed(seed)
210
  with torch.no_grad():
211
+ img = flux_pipe(
212
+ prompt=prompt,
213
+ num_inference_steps=5,
214
+ guidance_scale=3.5,
215
+ num_images_per_prompt=1,
216
+ width=resolution * 2,
217
+ height=resolution * 1,
218
+ output_type='np',
219
+ generator=generator,
220
+ ).images
221
+ # for img in flux_pipe.flux_pipe_call_that_returns_an_iterable_of_images(
222
  # prompt=prompt,
 
223
  # guidance_scale=3.5,
224
+ # num_inference_steps=4,
225
  # width=resolution * 4,
226
  # height=resolution * 2,
 
227
  # generator=generator,
228
+ # output_type="np",
229
  # good_vae=good_vae,
230
+ # ):
231
+ # pass
 
 
 
 
 
 
 
 
 
 
232
  # 返回最终的图像和种子(通过外部调用处理)
233
  return img
234
 
 
249
  multi_view_mask = get_background(normal_multi_view)
250
  rgb_multi_view = rgb_multi_view * rgb_multi_view + (1-multi_view_mask)
251
  input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device_1)
252
+ vertices, faces = lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm', export_texmap=False, if_save_video=True)
253
  # local normal to global normal
254
 
255
  global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1), isomer_azimuths, isomer_elevations)
 
305
  # Gradio 接口函数
306
  @spaces.GPU
307
  def gradio_pipeline(prompt, seed):
308
+ import ctypes
309
+ # 显式加载 libnvrtc.so.12
310
+ cuda_lib_path = "/usr/local/cuda-12.1/lib64/libnvrtc.so.12"
311
+ try:
312
+ ctypes.CDLL(cuda_lib_path, mode=ctypes.RTLD_GLOBAL)
313
+ print(f"Successfully preloaded {cuda_lib_path}")
314
+ except OSError as e:
315
+ print(f"Failed to preload {cuda_lib_path}: {e}")
316
  # 生成多视图图像
317
+ # rgb_normal_grid = generate_multi_view_images(prompt, seed)
318
+ rgb_normal_grid = np.load("rgb_normal_grid.npy")
319
+ image_preview = Image.fromarray((rgb_normal_grid[0] * 255).astype(np.uint8))
320
 
321
  # 3d reconstruction
322
 
323
 
324
  # 重建 3D 模型并返回 glb 路径
325
  save_glb_addr = reconstruct_3d_model(rgb_normal_grid, prompt)
326
+ # save_glb_addr = None
327
  return image_preview, save_glb_addr
328
 
329
+ # Gradio Blocks 应用
330
+ with gr.Blocks() as demo:
331
+ with gr.Row(variant="panel"):
332
+ # 左侧输入区域
333
+ with gr.Column():
334
+ with gr.Row():
335
+ prompt_input = gr.Textbox(
336
+ label="Enter Prompt",
337
+ placeholder="Describe your 3D model...",
338
+ lines=2,
339
+ elem_id="prompt_input"
340
+ )
341
+
342
+ with gr.Row():
343
+ sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
344
+
345
+ with gr.Row():
346
+ submit = gr.Button("Generate", elem_id="generate", variant="primary")
347
+
348
+ with gr.Row(variant="panel"):
349
+ gr.Markdown("Examples:")
350
+ gr.Examples(
351
+ examples=[
352
+ ["a castle on a hill"],
353
+ ["an owl wearing a hat"],
354
+ ["a futuristic car"]
355
+ ],
356
+ inputs=[prompt_input],
357
+ label="Prompt Examples"
358
+ )
359
+
360
+ # 右侧输出区域
361
+ with gr.Column():
362
+ with gr.Row():
363
+ rgb_normal_grid_image = gr.Image(
364
+ label="RGB Normal Grid",
365
+ type="pil",
366
+ interactive=False
367
+ )
368
+
369
+ with gr.Row():
370
+ with gr.Tab("GLB"):
371
+ output_glb_model = gr.Model3D(
372
+ label="Generated 3D Model (GLB Format)",
373
+ interactive=False
374
+ )
375
+ gr.Markdown("Download the model for proper visualization.")
376
+
377
+ # 处理逻辑
378
+ submit.click(
379
+ fn=gradio_pipeline, inputs=[prompt_input, sample_seed],
380
+ outputs=[rgb_normal_grid_image, output_glb_model]
381
+ )
382
+
383
+ # 启动应用
384
+ # demo.queue(max_size=10)
385
+ demo.launch()
app_flux.py DELETED
@@ -1,141 +0,0 @@
1
- import gradio as gr
2
- import numpy as np
3
- import os
4
- import random
5
- import spaces
6
- import torch
7
- from diffusers import DiffusionPipeline, FlowMatchEulerDiscreteScheduler, AutoencoderTiny, AutoencoderKL
8
- from transformers import CLIPTextModel, CLIPTokenizer,T5EncoderModel, T5TokenizerFast
9
- from live_preview_helpers import calculate_shift, retrieve_timesteps, flux_pipe_call_that_returns_an_iterable_of_images
10
-
11
- dtype = torch.bfloat16
12
- device = "cuda" if torch.cuda.is_available() else "cpu"
13
- access_token = os.getenv("HUGGINGFACE_TOKEN")
14
- taef1 = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=dtype).to(device)
15
- good_vae = AutoencoderKL.from_pretrained("black-forest-labs/FLUX.1-dev", subfolder="vae", torch_dtype=dtype, token=access_token).to(device)
16
- pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=dtype, vae=taef1, token=access_token).to(device)
17
- torch.cuda.empty_cache()
18
-
19
- MAX_SEED = np.iinfo(np.int32).max
20
- MAX_IMAGE_SIZE = 2048
21
-
22
- pipe.flux_pipe_call_that_returns_an_iterable_of_images = flux_pipe_call_that_returns_an_iterable_of_images.__get__(pipe)
23
-
24
- @spaces.GPU(duration=75)
25
- def infer(prompt, seed=42, randomize_seed=False, width=2048, height=1024, guidance_scale=3.5, num_inference_steps=28, progress=gr.Progress(track_tqdm=True)):
26
- if randomize_seed:
27
- seed = random.randint(0, MAX_SEED)
28
- generator = torch.Generator().manual_seed(seed)
29
-
30
- for img in pipe.flux_pipe_call_that_returns_an_iterable_of_images(
31
- prompt=prompt,
32
- guidance_scale=guidance_scale,
33
- num_inference_steps=num_inference_steps,
34
- width=width,
35
- height=height,
36
- generator=generator,
37
- output_type="pil",
38
- good_vae=good_vae,
39
- ):
40
- # yield img, seed
41
- pass
42
- return img, seed
43
- examples = [
44
- "a tiny astronaut hatching from an egg on the moon",
45
- "a cat holding a sign that says hello world",
46
- "an anime illustration of a wiener schnitzel",
47
- ]
48
-
49
- css="""
50
- #col-container {
51
- margin: 0 auto;
52
- max-width: 520px;
53
- }
54
- """
55
-
56
- with gr.Blocks(css=css) as demo:
57
-
58
- with gr.Column(elem_id="col-container"):
59
- gr.Markdown(f"""# FLUX.1 [dev]
60
- 12B param rectified flow transformer guidance-distilled from [FLUX.1 [pro]](https://blackforestlabs.ai/)
61
- [[non-commercial license](https://huggingface.co/black-forest-labs/FLUX.1-dev/blob/main/LICENSE.md)] [[blog](https://blackforestlabs.ai/announcing-black-forest-labs/)] [[model](https://huggingface.co/black-forest-labs/FLUX.1-dev)]
62
- """)
63
-
64
- with gr.Row():
65
-
66
- prompt = gr.Text(
67
- label="Prompt",
68
- show_label=False,
69
- max_lines=1,
70
- placeholder="Enter your prompt",
71
- container=False,
72
- )
73
-
74
- run_button = gr.Button("Run", scale=0)
75
-
76
- result = gr.Image(label="Result", show_label=False)
77
-
78
- with gr.Accordion("Advanced Settings", open=False):
79
-
80
- seed = gr.Slider(
81
- label="Seed",
82
- minimum=0,
83
- maximum=MAX_SEED,
84
- step=1,
85
- value=0,
86
- )
87
-
88
- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
89
-
90
- with gr.Row():
91
-
92
- width = gr.Slider(
93
- label="Width",
94
- minimum=256,
95
- maximum=MAX_IMAGE_SIZE,
96
- step=32,
97
- value=1024,
98
- )
99
-
100
- height = gr.Slider(
101
- label="Height",
102
- minimum=256,
103
- maximum=MAX_IMAGE_SIZE,
104
- step=32,
105
- value=1024,
106
- )
107
-
108
- with gr.Row():
109
-
110
- guidance_scale = gr.Slider(
111
- label="Guidance Scale",
112
- minimum=1,
113
- maximum=15,
114
- step=0.1,
115
- value=3.5,
116
- )
117
-
118
- num_inference_steps = gr.Slider(
119
- label="Number of inference steps",
120
- minimum=1,
121
- maximum=50,
122
- step=1,
123
- value=28,
124
- )
125
-
126
- gr.Examples(
127
- examples = examples,
128
- fn = infer,
129
- inputs = [prompt],
130
- outputs = [result, seed],
131
- cache_examples="lazy"
132
- )
133
-
134
- gr.on(
135
- triggers=[run_button.click, prompt.submit],
136
- fn = infer,
137
- inputs = [prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
138
- outputs = [result, seed]
139
- )
140
-
141
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
image_to_mesh.py DELETED
@@ -1,437 +0,0 @@
1
- import os
2
- from einops import rearrange
3
- from omegaconf import OmegaConf
4
- import torch
5
- import numpy as np
6
- import trimesh
7
- import torchvision
8
- import torch.nn.functional as F
9
- from PIL import Image
10
- from torchvision import transforms
11
- from torchvision.transforms import v2
12
- from transformers import AutoProcessor, AutoModelForCausalLM
13
- import rembg
14
- from diffusers import FluxPipeline, FluxControlNetImg2ImgPipeline
15
- from diffusers.models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
16
- from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler, HeunDiscreteScheduler
17
- from pytorch_lightning import seed_everything
18
- import os
19
-
20
- from models.ISOMER.reconstruction_func import reconstruction
21
- from models.ISOMER.projection_func import projection
22
- from models.lrm.utils.infer_util import remove_background, resize_foreground, save_video
23
- from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
24
- from models.lrm.utils.render_utils import rotate_x, rotate_y
25
- from models.lrm.utils.train_util import instantiate_from_config
26
- from models.lrm.utils.camera_util import get_zero123plus_input_cameras, get_custom_zero123plus_input_cameras, get_flux_input_cameras
27
- from utils.tool import NormalTransfer, get_render_cameras_frames, load_mipmap
28
- from utils.tool import get_background, get_render_cameras_video, render_frames
29
- import time
30
-
31
- device = "cuda"
32
- resolution = 512
33
- save_dir = "./outputs"
34
- zero123plus_diffusion_steps = 75
35
- normal_transfer = NormalTransfer()
36
- rembg_session = rembg.new_session()
37
- isomer_azimuths = torch.from_numpy(np.array([270, 0, 90, 180])).to(device)
38
- isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).to(device)
39
- isomer_radius = 4.1
40
- isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device)
41
- isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device)
42
- # seed_everything(42)
43
-
44
- # model initialization and loading
45
- # flux
46
- print('==> Loading Flux model ...')
47
- flux_base_model_pth = "/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev"
48
- flux_controlnet = FluxControlNetModel.from_pretrained("/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/flux_controlnets/FLUX.1-dev-ControlNet-Union-Pro")
49
- flux_pipe = FluxControlNetImg2ImgPipeline.from_pretrained(flux_base_model_pth, controlnet=[flux_controlnet], torch_dtype=torch.bfloat16).to(device=device, dtype=torch.bfloat16)
50
-
51
- flux_pipe.load_lora_weights('./checkpoint/flux_lora/rgb_normal_large.safetensors')
52
-
53
-
54
- flux_pipe.to(device=device, dtype=torch.bfloat16)
55
- generator = torch.Generator(device=device).manual_seed(0)
56
-
57
- # lrm
58
- print('==> Loading LRM model ...')
59
- config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
60
- model_config = config.model_config
61
- infer_config = config.infer_config
62
- model = instantiate_from_config(model_config)
63
- model_ckpt_path = "./checkpoint/lrm/final_ckpt.ckpt"
64
- state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
65
- state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
66
- model.load_state_dict(state_dict, strict=True)
67
-
68
- model = model.to(device)
69
- model.init_flexicubes_geometry(device, fovy=50.0)
70
- model = model.eval()
71
-
72
- # zero123++
73
- print('==> Loading diffusion model ...')
74
- zero123plus_pipeline = DiffusionPipeline.from_pretrained(
75
- "sudo-ai/zero123plus-v1.2",
76
- custom_pipeline="./models/zero123plus",
77
- torch_dtype=torch.float16,
78
- )
79
- zero123plus_pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
80
- zero123plus_pipeline.scheduler.config, timestep_spacing='trailing'
81
- )
82
- unet_ckpt_path = "./checkpoint/zero123++/flexgen_19w.ckpt"
83
- state_dict = torch.load(unet_ckpt_path, map_location='cpu')['state_dict']
84
- state_dict = {k[10:]: v for k, v in state_dict.items() if k.startswith('unet.unet.')}
85
- zero123plus_pipeline.unet.load_state_dict(state_dict, strict=True)
86
- zero123plus_pipeline = zero123plus_pipeline.to(device)
87
-
88
- # unet_ckpt_path = "checkpoint/zero123++/diffusion_pytorch_model.bin"
89
- # state_dict = torch.load(unet_ckpt_path, map_location='cpu')
90
- # zero123plus_pipeline.unet.load_state_dict(state_dict, strict=True)
91
- # zero123plus_pipeline = zero123plus_pipeline.to(device)
92
-
93
- # florence
94
- caption_model = AutoModelForCausalLM.from_pretrained(
95
- "/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2", torch_dtype=torch.bfloat16, trust_remote_code=True,
96
- ).to(device)
97
- caption_processor = AutoProcessor.from_pretrained("/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2", trust_remote_code=True)
98
-
99
- # Flux multi-view generation
100
- def multi_view_rgb_normal_generation_with_controlnet(prompt, image, strength=1.0,
101
- control_image=[],
102
- control_mode=[],
103
- control_guidance_start=None,
104
- control_guidance_end=None,
105
- controlnet_conditioning_scale=None,
106
- lora_scale=1.0
107
- ):
108
- control_mode_dict = {
109
- 'canny': 0,
110
- 'tile': 1,
111
- 'depth': 2,
112
- 'blur': 3,
113
- 'pose': 4,
114
- 'gray': 5,
115
- 'lq': 6,
116
- } # for https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union only
117
-
118
- hparam_dict = {
119
- 'prompt': prompt,
120
- 'image': image,
121
- 'strength': strength,
122
- 'num_inference_steps': 30,
123
- 'guidance_scale': 3.5,
124
- 'num_images_per_prompt': 1,
125
- 'width': resolution*4,
126
- 'height': resolution*2,
127
- 'output_type': 'np',
128
- 'generator': generator,
129
- 'joint_attention_kwargs': {"scale": lora_scale}
130
- }
131
-
132
- # append controlnet hparams
133
- if len(control_image) > 0:
134
- assert len(control_mode) == len(control_image) # the count of image should be the same as control mode
135
-
136
- ctrl_hparams = {
137
- 'control_mode': [control_mode_dict[mode_] for mode_ in control_mode],
138
- 'control_image': control_image,
139
- 'control_guidance_start': control_guidance_start or [0.0 for i in range(len(control_image))],
140
- 'control_guidance_end': control_guidance_end or [1.0 for i in range(len(control_image))],
141
- 'controlnet_conditioning_scale': controlnet_conditioning_scale or [1.0 for i in range(len(control_image))],
142
- }
143
-
144
- hparam_dict.update(ctrl_hparams)
145
-
146
- # generate multi-view images
147
- with torch.no_grad():
148
- image = flux_pipe(
149
- **hparam_dict
150
- ).images
151
- return image
152
-
153
- # captioning
154
- def run_captioning(image):
155
- device = "cuda" if torch.cuda.is_available() else "cpu"
156
- torch_dtype = torch.bfloat16
157
-
158
- if isinstance(image, str): # If image is a file path
159
- image = Image.open(image).convert("RGB")
160
-
161
- prompt = "<MORE_DETAILED_CAPTION>"
162
- inputs = caption_processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
163
- # print(f"inputs {inputs}")
164
-
165
- generated_ids = caption_model.generate(
166
- input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3
167
- )
168
-
169
- generated_text = caption_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
170
- parsed_answer = caption_processor.post_process_generation(
171
- generated_text, task=prompt, image_size=(image.width, image.height)
172
- )
173
- # print(f"parsed_answer = {parsed_answer}")
174
- caption_text = parsed_answer["<MORE_DETAILED_CAPTION>"].replace("The image is ", "")
175
- return caption_text
176
-
177
-
178
- # zero123++ multi-view generation
179
- def multi_view_rgb_generation(cond_img):
180
- # generate multi-view images
181
- with torch.no_grad():
182
- output_image = zero123plus_pipeline(
183
- cond_img,
184
- num_inference_steps=zero123plus_diffusion_steps,
185
- width=resolution*2,
186
- height=resolution*2,
187
- ).images[0]
188
- return output_image
189
-
190
- # lrm reconstructions
191
- def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False, render_azimuths=None, render_elevations=None, render_radius=None, render_fov=30):
192
- images = image.unsqueeze(0).to(device)
193
- images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
194
- # breakpoint()
195
- with torch.no_grad():
196
- # get triplane
197
- planes = model.forward_planes(images, input_cameras)
198
-
199
- mesh_path_idx = os.path.join(save_path, f'{name}.obj')
200
-
201
- mesh_out = model.extract_mesh(
202
- planes,
203
- use_texture_map=export_texmap,
204
- **infer_config,
205
- )
206
- if export_texmap:
207
- vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
208
- save_obj_with_mtl(
209
- vertices.data.cpu().numpy(),
210
- uvs.data.cpu().numpy(),
211
- faces.data.cpu().numpy(),
212
- mesh_tex_idx.data.cpu().numpy(),
213
- tex_map.permute(1, 2, 0).data.cpu().numpy(),
214
- mesh_path_idx,
215
- )
216
- else:
217
- vertices, faces, vertex_colors = mesh_out
218
- save_obj(vertices, faces, vertex_colors, mesh_path_idx)
219
- print(f"Mesh saved to {mesh_path_idx}")
220
-
221
- render_size = 512
222
- if if_save_video:
223
- video_path_idx = os.path.join(save_path, f'{name}.mp4')
224
- render_size = infer_config.render_resolution
225
- ENV = load_mipmap("models/lrm/env_mipmap/6")
226
- materials = (0.0,0.9)
227
-
228
- all_mv, all_mvp, all_campos = get_render_cameras_video(
229
- batch_size=1,
230
- M=240,
231
- radius=4.5,
232
- elevation=(90, 60.0),
233
- is_flexicubes=True,
234
- fov=30
235
- )
236
-
237
- frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
238
- model,
239
- planes,
240
- render_cameras=all_mvp,
241
- camera_pos=all_campos,
242
- env=ENV,
243
- materials=materials,
244
- render_size=render_size,
245
- chunk_size=20,
246
- is_flexicubes=True,
247
- )
248
- normals = (torch.nn.functional.normalize(normals) + 1) / 2
249
- normals = normals * alphas + (1-alphas)
250
- all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)
251
-
252
- # breakpoint()
253
- save_video(
254
- all_frames,
255
- video_path_idx,
256
- fps=30,
257
- )
258
- print(f"Video saved to {video_path_idx}")
259
-
260
- if render_azimuths is not None and render_elevations is not None and render_radius is not None:
261
- render_size = infer_config.render_resolution
262
- ENV = load_mipmap("models/lrm/env_mipmap/6")
263
- materials = (0.0,0.9)
264
- all_mv, all_mvp, all_campos, identity_mv = get_render_cameras_frames(
265
- batch_size=1,
266
- radius=render_radius,
267
- azimuths=render_azimuths,
268
- elevations=render_elevations,
269
- fov=30
270
- )
271
- frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
272
- model,
273
- planes,
274
- render_cameras=all_mvp,
275
- camera_pos=all_campos,
276
- env=ENV,
277
- materials=materials,
278
- render_size=render_size,
279
- render_mv = all_mv,
280
- local_normal=True,
281
- identity_mv=identity_mv,
282
- )
283
- else:
284
- normals = None
285
- frames = None
286
- albedos = None
287
-
288
- return vertices, faces, normals, frames, albedos
289
-
290
-
291
- def transform_normal(input_normal, azimuths_deg, elevations_deg, radius=4.5, is_global_to_local=False):
292
- """
293
- input_normal: in range [-1, 1], shape (b c h w)
294
- """
295
-
296
- input_normal = input_normal.permute(0, 2, 3, 1).cpu()
297
-
298
- azimuths_deg = np.array(azimuths_deg)
299
- elevations_deg = np.array(elevations_deg)
300
-
301
- if is_global_to_local:
302
- local_normal = normal_transfer.trans_global_2_local(input_normal, azimuths_deg, elevations_deg)
303
- return local_normal.permute(0, 3, 1, 2)
304
- else:
305
- global_normal = normal_transfer.trans_local_2_global(input_normal, azimuths_deg, elevations_deg, radius=radius, for_lotus=False)
306
- global_normal[..., 0] *= -1
307
- return global_normal.permute(0, 3, 1, 2)
308
-
309
- def local_normal_global_transform(local_normal_images,azimuths_deg,elevations_deg):
310
- if local_normal_images.min() >= 0:
311
- local_normal = local_normal_images.float() * 2 - 1
312
- else:
313
- local_normal = local_normal_images.float()
314
- global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
315
- global_normal[...,0] *= -1
316
- global_normal = (global_normal + 1) / 2
317
- global_normal = global_normal.permute(0, 3, 1, 2)
318
- return global_normal
319
-
320
- def main():
321
- image_pth = "examples/蓝色小怪物.webp"
322
- save_dir_path = os.path.join(save_dir, image_pth.split("/")[-1].split(".")[0])
323
- os.makedirs(save_dir_path, exist_ok=True)
324
- input_image = Image.open(image_pth)
325
- # if not args.no_rembg:
326
- input_image = remove_background(input_image, rembg_session)
327
- input_image = resize_foreground(input_image, 0.85)
328
-
329
- # generate caption
330
- image_caption = run_captioning(image_pth)
331
-
332
- # generate multi-view images
333
- output_image = multi_view_rgb_generation(input_image)
334
-
335
- # lrm reconstructions
336
- rgb_multi_view = np.asarray(output_image, dtype=np.float32) / 255.0
337
- rgb_multi_view = torch.from_numpy(rgb_multi_view).squeeze(0).permute(2, 0, 1).contiguous().float() # (3, 1024, 2048)
338
- rgb_multi_view = rearrange(rgb_multi_view, 'c (n h) (m w) -> (n m) c h w', n=2, m=2) # (8, 3, 512, 512)
339
-
340
- input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)
341
-
342
- vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = \
343
- lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm',
344
- export_texmap=False, if_save_video=False, render_azimuths=isomer_azimuths,
345
- render_elevations=isomer_elevations, render_radius=isomer_radius, render_fov=30)
346
-
347
- vertices = torch.from_numpy(vertices).to(device)
348
- faces = torch.from_numpy(faces).to(device)
349
- vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
350
- vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]
351
-
352
-
353
- # lrm_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0) # range [0, 1]
354
- lrm_3D_bundle_image = torchvision.utils.make_grid(torch.cat([rgb_multi_view[[3,0,1,2]].cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0) # range [0, 1]
355
- # rgb_multi_view[[3,0,1,2]] : (B,3,H,W)
356
- # lrm_multi_view_normals : (B,3,H,W)
357
- # combined_images = 0.5 * rgb_multi_view[[3,0,1,2]].cpu() + 0.5 * (lrm_multi_view_normals.cpu() + 1) / 2
358
- # torchvision.utils.save_image(combined_images, os.path.join("debug_output", 'combined.png'))
359
- # breakpoint()
360
- # Use the 'tile' control mode by default; feel free to try the other Union modes
361
- control_image = [lrm_3D_bundle_image * 2 - 1]
362
- control_mode = ['tile']
363
- control_guidance_start = [0.0]
364
- control_guidance_end = [0.3]
365
- controlnet_conditioning_scale = [0.8]
366
-
367
- flux_pipe.controlnet = FluxMultiControlNetModel([flux_controlnet for _ in control_mode])
368
- # breakpoint()
369
- rgb_normal_grid = multi_view_rgb_normal_generation_with_controlnet(
370
- prompt= ' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', image_caption]),
371
- image=lrm_3D_bundle_image,
372
- strength=0.6,
373
- control_image=control_image,
374
- control_mode=control_mode,
375
- control_guidance_start=control_guidance_start,
376
- control_guidance_end=control_guidance_end,
377
- controlnet_conditioning_scale=controlnet_conditioning_scale,
378
- lora_scale=1.0
379
- ) # note that rgb_normal_grid is a (b, h, w, c) numpy array
380
-
381
- rgb_normal_grid = torch.from_numpy(rgb_normal_grid).contiguous().float()
382
- rgb_normal_grid = rearrange(rgb_normal_grid.squeeze(0), '(n h) (m w) c-> (n m) c h w', n=2, m=4) # (8, 3, 512, 512)
383
- rgb_multi_view = rgb_normal_grid[:4, :3, :, :].cuda()
384
- normal_multi_view = rgb_normal_grid[4:, :3, :, :].cuda()
385
- multi_view_mask = get_background(normal_multi_view).cuda()
386
- rgb_multi_view = rgb_multi_view * multi_view_mask + (1-multi_view_mask)
387
-
388
- # local normal to global normal
389
- global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1).cpu(), isomer_azimuths, isomer_elevations).cuda()
390
-
391
- global_normal = global_normal * multi_view_mask + (1-multi_view_mask)
392
-
393
- global_normal = global_normal.permute(0,2,3,1)
394
- multi_view_mask = multi_view_mask.squeeze(1)
395
- rgb_multi_view = rgb_multi_view.permute(0,2,3,1)
396
- # global_normal: B,H,W,3
397
- # multi_view_mask: B,H,W
398
- # rgb_multi_view: B,H,W,3
399
-
400
-
401
- meshes = reconstruction(
402
- normal_pils=global_normal,
403
- masks=multi_view_mask,
404
- weights=isomer_geo_weights,
405
- fov=30,
406
- radius=isomer_radius,
407
- camera_angles_azi=isomer_azimuths,
408
- camera_angles_ele=isomer_elevations,
409
- expansion_weight_stage1=0.1,
410
- init_type="file",
411
- init_verts=vertices,
412
- init_faces=faces,
413
- stage1_steps=0,
414
- stage2_steps=50,
415
- start_edge_len_stage1=0.1,
416
- end_edge_len_stage1=0.02,
417
- start_edge_len_stage2=0.02,
418
- end_edge_len_stage2=0.005,
419
- )
420
-
421
- save_glb_addr = projection(
422
- meshes=meshes,
423
- masks=multi_view_mask,
424
- images=rgb_multi_view,
425
- azimuths=isomer_azimuths,
426
- elevations=isomer_elevations,
427
- weights=isomer_color_weights,
428
- fov=30,
429
- radius=isomer_radius,
430
- save_dir=f"{save_dir_path}/ISOMER/",
431
- )
432
- print(f'saved to {save_glb_addr}')
433
-
434
-
435
-
436
- if __name__ == '__main__':
437
- main()
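The removed app.py block above (and the deleted script below) splits the zero123++ output grid into a batch of per-view images with einops before feeding the LRM. A minimal sketch of that rearrange pattern on a dummy tensor (shapes here are illustrative, not taken from the commit):

```py
# Illustrative only: how 'c (n h) (m w) -> (n m) c h w' turns a tiled multi-view
# grid into a batch of views that the LRM reconstruction consumes.
import torch
from einops import rearrange

grid = torch.rand(3, 1024, 1024)                       # a 2x2 grid of 512x512 RGB views
views = rearrange(grid, 'c (n h) (m w) -> (n m) c h w', n=2, m=2)
print(views.shape)                                     # torch.Size([4, 3, 512, 512])
```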
 
image_to_mesh_new.py DELETED
@@ -1,436 +0,0 @@
1
- import os
2
- from einops import rearrange
3
- from omegaconf import OmegaConf
4
- import torch
5
- import numpy as np
6
- import trimesh
7
- import torchvision
8
- import torch.nn.functional as F
9
- from PIL import Image
10
- from torchvision import transforms
11
- from torchvision.transforms import v2
12
- from transformers import AutoProcessor, AutoModelForCausalLM
13
- import rembg
14
- from diffusers import FluxPipeline, FluxControlNetImg2ImgPipeline
15
- from diffusers.models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
16
- from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler, HeunDiscreteScheduler
17
- from pytorch_lightning import seed_everything
18
- import os
19
-
20
- from models.ISOMER.reconstruction_func import reconstruction
21
- from models.ISOMER.projection_func import projection
22
- from models.lrm.utils.infer_util import remove_background, resize_foreground, save_video
23
- from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
24
- from models.lrm.utils.render_utils import rotate_x, rotate_y
25
- from models.lrm.utils.train_util import instantiate_from_config
26
- from models.lrm.utils.camera_util import get_zero123plus_input_cameras, get_custom_zero123plus_input_cameras, get_flux_input_cameras
27
- from utils.tool import NormalTransfer, get_render_cameras_frames, load_mipmap
28
- from utils.tool import get_background, get_render_cameras_video, render_frames, mask_fix
29
-
30
- device = "cuda"
31
- resolution = 512
32
- save_dir = "./outputs"
33
- zero123plus_diffusion_steps = 75
34
- normal_transfer = NormalTransfer()
35
- rembg_session = rembg.new_session()
36
- isomer_azimuths = torch.from_numpy(np.array([270, 0, 90, 180])).to(device)
37
- isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).to(device)
38
- isomer_radius = 4.1
39
- isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device)
40
- isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device)
41
- # seed_everything(42)
42
-
43
- # model initialization and loading
44
- # flux
45
- print('==> Loading Flux model ...')
46
- flux_base_model_pth = "/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev"
47
- flux_controlnet = FluxControlNetModel.from_pretrained("/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/flux_controlnets/FLUX.1-dev-ControlNet-Union-Pro")
48
- flux_pipe = FluxControlNetImg2ImgPipeline.from_pretrained(flux_base_model_pth, controlnet=[flux_controlnet], torch_dtype=torch.bfloat16).to(device=device, dtype=torch.bfloat16)
49
-
50
- flux_pipe.load_lora_weights('./checkpoint/flux_lora/rgb_normal_large.safetensors')
51
-
52
-
53
- flux_pipe.to(device=device, dtype=torch.bfloat16)
54
- generator = torch.Generator(device=device).manual_seed(0)
55
-
56
- # lrm
57
- print('==> Loading LRM model ...')
58
- config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
59
- model_config = config.model_config
60
- infer_config = config.infer_config
61
- model = instantiate_from_config(model_config)
62
- model_ckpt_path = "./checkpoint/lrm/final_ckpt.ckpt"
63
- state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
64
- state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
65
- model.load_state_dict(state_dict, strict=True)
66
-
67
- model = model.to(device)
68
- model.init_flexicubes_geometry(device, fovy=50.0)
69
- model = model.eval()
70
-
71
- # zero123++
72
- print('==> Loading diffusion model ...')
73
- zero123plus_pipeline = DiffusionPipeline.from_pretrained(
74
- "sudo-ai/zero123plus-v1.2",
75
- custom_pipeline="./models/zero123plus",
76
- torch_dtype=torch.float16,
77
- )
78
- zero123plus_pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
79
- zero123plus_pipeline.scheduler.config, timestep_spacing='trailing'
80
- )
81
- unet_ckpt_path = "./checkpoint/zero123++/flexgen_19w.ckpt"
82
- state_dict = torch.load(unet_ckpt_path, map_location='cpu')['state_dict']
83
- state_dict = {k[10:]: v for k, v in state_dict.items() if k.startswith('unet.unet.')}
84
- zero123plus_pipeline.unet.load_state_dict(state_dict, strict=True)
85
- zero123plus_pipeline = zero123plus_pipeline.to(device)
86
-
87
- # unet_ckpt_path = "checkpoint/zero123++/diffusion_pytorch_model.bin"
88
- # state_dict = torch.load(unet_ckpt_path, map_location='cpu')
89
- # zero123plus_pipeline.unet.load_state_dict(state_dict, strict=True)
90
- # zero123plus_pipeline = zero123plus_pipeline.to(device)
91
-
92
- # florence
93
- caption_model = AutoModelForCausalLM.from_pretrained(
94
- "/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2", torch_dtype=torch.bfloat16, trust_remote_code=True,
95
- ).to(device)
96
- caption_processor = AutoProcessor.from_pretrained("/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2", trust_remote_code=True)
97
-
98
- # Flux multi-view generation
99
- def multi_view_rgb_normal_generation_with_controlnet(prompt, image, strength=1.0,
100
- control_image=[],
101
- control_mode=[],
102
- control_guidance_start=None,
103
- control_guidance_end=None,
104
- controlnet_conditioning_scale=None,
105
- lora_scale=1.0
106
- ):
107
- control_mode_dict = {
108
- 'canny': 0,
109
- 'tile': 1,
110
- 'depth': 2,
111
- 'blur': 3,
112
- 'pose': 4,
113
- 'gray': 5,
114
- 'lq': 6,
115
- } # for https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union only
116
-
117
- hparam_dict = {
118
- 'prompt': prompt,
119
- 'image': image,
120
- 'strength': strength,
121
- 'num_inference_steps': 30,
122
- 'guidance_scale': 3.5,
123
- 'num_images_per_prompt': 1,
124
- 'width': resolution*4,
125
- 'height': resolution*2,
126
- 'output_type': 'np',
127
- 'generator': generator,
128
- 'joint_attention_kwargs': {"scale": lora_scale}
129
- }
130
-
131
- # append controlnet hparams
132
- if len(control_image) > 0:
133
- assert len(control_mode) == len(control_image) # the number of control images must match the number of control modes
134
-
135
- ctrl_hparams = {
136
- 'control_mode': [control_mode_dict[mode_] for mode_ in control_mode],
137
- 'control_image': control_image,
138
- 'control_guidance_start': control_guidance_start or [0.0 for i in range(len(control_image))],
139
- 'control_guidance_end': control_guidance_end or [1.0 for i in range(len(control_image))],
140
- 'controlnet_conditioning_scale': controlnet_conditioning_scale or [1.0 for i in range(len(control_image))],
141
- }
142
-
143
- hparam_dict.update(ctrl_hparams)
144
-
145
- # generate multi-view images
146
- with torch.no_grad():
147
- image = flux_pipe(
148
- **hparam_dict
149
- ).images
150
- return image
151
-
152
- # captioning
153
- def run_captioning(image):
154
- device = "cuda" if torch.cuda.is_available() else "cpu"
155
- torch_dtype = torch.bfloat16
156
-
157
- if isinstance(image, str): # If image is a file path
158
- image = Image.open(image).convert("RGB")
159
-
160
- prompt = "<MORE_DETAILED_CAPTION>"
161
- inputs = caption_processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
162
- # print(f"inputs {inputs}")
163
-
164
- generated_ids = caption_model.generate(
165
- input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3
166
- )
167
-
168
- generated_text = caption_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
169
- parsed_answer = caption_processor.post_process_generation(
170
- generated_text, task=prompt, image_size=(image.width, image.height)
171
- )
172
- # print(f"parsed_answer = {parsed_answer}")
173
- caption_text = parsed_answer["<MORE_DETAILED_CAPTION>"].replace("The image is ", "")
174
- return caption_text
175
-
176
-
177
- # zero123++ multi-view generation
178
- def multi_view_rgb_generation(cond_img):
179
- # generate multi-view images
180
- with torch.no_grad():
181
- output_image = zero123plus_pipeline(
182
- cond_img,
183
- num_inference_steps=zero123plus_diffusion_steps,
184
- width=resolution*2,
185
- height=resolution*2,
186
- ).images[0]
187
- return output_image
188
-
189
- # lrm reconstructions
190
- def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False, render_azimuths=None, render_elevations=None, render_radius=None, render_fov=30):
191
- images = image.unsqueeze(0).to(device)
192
- images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
193
- # breakpoint()
194
- with torch.no_grad():
195
- # get triplane
196
- planes = model.forward_planes(images, input_cameras)
197
-
198
- mesh_path_idx = os.path.join(save_path, f'{name}.obj')
199
-
200
- mesh_out = model.extract_mesh(
201
- planes,
202
- use_texture_map=export_texmap,
203
- **infer_config,
204
- )
205
- if export_texmap:
206
- vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
207
- save_obj_with_mtl(
208
- vertices.data.cpu().numpy(),
209
- uvs.data.cpu().numpy(),
210
- faces.data.cpu().numpy(),
211
- mesh_tex_idx.data.cpu().numpy(),
212
- tex_map.permute(1, 2, 0).data.cpu().numpy(),
213
- mesh_path_idx,
214
- )
215
- else:
216
- vertices, faces, vertex_colors = mesh_out
217
- save_obj(vertices, faces, vertex_colors, mesh_path_idx)
218
- print(f"Mesh saved to {mesh_path_idx}")
219
-
220
- render_size = 512
221
- if if_save_video:
222
- video_path_idx = os.path.join(save_path, f'{name}.mp4')
223
- render_size = infer_config.render_resolution
224
- ENV = load_mipmap("models/lrm/env_mipmap/6")
225
- materials = (0.0,0.9)
226
-
227
- all_mv, all_mvp, all_campos = get_render_cameras_video(
228
- batch_size=1,
229
- M=240,
230
- radius=4.5,
231
- elevation=(90, 60.0),
232
- is_flexicubes=True,
233
- fov=30
234
- )
235
-
236
- frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
237
- model,
238
- planes,
239
- render_cameras=all_mvp,
240
- camera_pos=all_campos,
241
- env=ENV,
242
- materials=materials,
243
- render_size=render_size,
244
- chunk_size=20,
245
- is_flexicubes=True,
246
- )
247
- normals = (torch.nn.functional.normalize(normals) + 1) / 2
248
- normals = normals * alphas + (1-alphas)
249
- all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)
250
-
251
- # breakpoint()
252
- save_video(
253
- all_frames,
254
- video_path_idx,
255
- fps=30,
256
- )
257
- print(f"Video saved to {video_path_idx}")
258
-
259
- if render_azimuths is not None and render_elevations is not None and render_radius is not None:
260
- render_size = infer_config.render_resolution
261
- ENV = load_mipmap("models/lrm/env_mipmap/6")
262
- materials = (0.0,0.9)
263
- all_mv, all_mvp, all_campos, identity_mv = get_render_cameras_frames(
264
- batch_size=1,
265
- radius=render_radius,
266
- azimuths=render_azimuths,
267
- elevations=render_elevations,
268
- fov=30
269
- )
270
- frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
271
- model,
272
- planes,
273
- render_cameras=all_mvp,
274
- camera_pos=all_campos,
275
- env=ENV,
276
- materials=materials,
277
- render_size=render_size,
278
- render_mv = all_mv,
279
- local_normal=True,
280
- identity_mv=identity_mv,
281
- )
282
- else:
283
- normals = None
284
- frames = None
285
- albedos = None
286
-
287
- return vertices, faces, normals, frames, albedos
288
-
289
-
290
- def transform_normal(input_normal, azimuths_deg, elevations_deg, radius=4.5, is_global_to_local=False):
291
- """
292
- input_normal: in range [-1, 1], shape (b c h w)
293
- """
294
-
295
- input_normal = input_normal.permute(0, 2, 3, 1).cpu()
296
-
297
- azimuths_deg = np.array(azimuths_deg)
298
- elevations_deg = np.array(elevations_deg)
299
-
300
- if is_global_to_local:
301
- local_normal = normal_transfer.trans_global_2_local(input_normal, azimuths_deg, elevations_deg)
302
- return local_normal.permute(0, 3, 1, 2)
303
- else:
304
- global_normal = normal_transfer.trans_local_2_global(input_normal, azimuths_deg, elevations_deg, radius=radius, for_lotus=False)
305
- global_normal[..., 0] *= -1
306
- return global_normal.permute(0, 3, 1, 2)
307
-
308
- def local_normal_global_transform(local_normal_images,azimuths_deg,elevations_deg):
309
- if local_normal_images.min() >= 0:
310
- local_normal = local_normal_images.float() * 2 - 1
311
- else:
312
- local_normal = local_normal_images.float()
313
- global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
314
- global_normal[...,0] *= -1
315
- global_normal = (global_normal + 1) / 2
316
- global_normal = global_normal.permute(0, 3, 1, 2)
317
- return global_normal
318
-
319
- def main():
320
- image_pth = "examples/蓝色小怪物.webp"
321
- save_dir_path = os.path.join(save_dir, image_pth.split("/")[-1].split(".")[0])
322
- os.makedirs(save_dir_path, exist_ok=True)
323
- input_image = Image.open(image_pth)
324
- # if not args.no_rembg:
325
- input_image = remove_background(input_image, rembg_session)
326
- input_image = resize_foreground(input_image, 0.85)
327
-
328
- # generate caption
329
- image_caption = run_captioning(image_pth)
330
-
331
- # generate multi-view images
332
- output_image = multi_view_rgb_generation(input_image)
333
-
334
- # lrm reconstructions
335
- rgb_multi_view = np.asarray(output_image, dtype=np.float32) / 255.0
336
- rgb_multi_view = torch.from_numpy(rgb_multi_view).squeeze(0).permute(2, 0, 1).contiguous().float() # (3, 1024, 2048)
337
- rgb_multi_view = rearrange(rgb_multi_view, 'c (n h) (m w) -> (n m) c h w', n=2, m=2) # (8, 3, 512, 512)
338
-
339
- input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)
340
-
341
- vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = \
342
- lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm',
343
- export_texmap=False, if_save_video=False, render_azimuths=isomer_azimuths,
344
- render_elevations=isomer_elevations, render_radius=isomer_radius, render_fov=30)
345
-
346
- vertices = torch.from_numpy(vertices).to(device)
347
- faces = torch.from_numpy(faces).to(device)
348
- vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
349
- vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]
350
-
351
-
352
- # lrm_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0) # range [0, 1]
353
- lrm_3D_bundle_image = torchvision.utils.make_grid(torch.cat([rgb_multi_view[[3,0,1,2]].cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0) # range [0, 1]
354
- # rgb_multi_view[[3,0,1,2]] : (B,3,H,W)
355
- # lrm_multi_view_normals : (B,3,H,W)
356
- # combined_images = 0.5 * rgb_multi_view[[3,0,1,2]].cpu() + 0.5 * (lrm_multi_view_normals.cpu() + 1) / 2
357
- # torchvision.utils.save_image(combined_images, os.path.join("debug_output", 'combined.png'))
358
- # breakpoint()
359
- # Use the 'tile' control mode by default; feel free to try the other Union modes
360
- control_image = [lrm_3D_bundle_image * 2 - 1]
361
- control_mode = ['tile']
362
- control_guidance_start = [0.0]
363
- control_guidance_end = [0.3]
364
- controlnet_conditioning_scale = [0.8]
365
-
366
- flux_pipe.controlnet = FluxMultiControlNetModel([flux_controlnet for _ in control_mode])
367
- # breakpoint()
368
- rgb_normal_grid = multi_view_rgb_normal_generation_with_controlnet(
369
- prompt= ' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', image_caption]),
370
- image=lrm_3D_bundle_image,
371
- strength=0.6,
372
- control_image=control_image,
373
- control_mode=control_mode,
374
- control_guidance_start=control_guidance_start,
375
- control_guidance_end=control_guidance_end,
376
- controlnet_conditioning_scale=controlnet_conditioning_scale,
377
- lora_scale=1.0
378
- ) # note that rgb_normal_grid is a (b, h, w, c) numpy array
379
-
380
- rgb_normal_grid = torch.from_numpy(rgb_normal_grid).contiguous().float()
381
- rgb_normal_grid = rearrange(rgb_normal_grid.squeeze(0), '(n h) (m w) c-> (n m) c h w', n=2, m=4) # (8, 3, 512, 512)
382
- rgb_multi_view = rgb_normal_grid[:4, :3, :, :].cuda()
383
- normal_multi_view = rgb_normal_grid[4:, :3, :, :].cuda()
384
- multi_view_mask = get_background(normal_multi_view).cuda()
385
- rgb_multi_view = rgb_multi_view * multi_view_mask + (1-multi_view_mask)
386
-
387
- # local normal to global normal
388
- global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1).cpu(), isomer_azimuths, isomer_elevations).cuda()
389
-
390
- global_normal = global_normal * multi_view_mask + (1-multi_view_mask)
391
-
392
- global_normal = global_normal.permute(0,2,3,1)
393
- multi_view_mask = multi_view_mask.squeeze(1)
394
- rgb_multi_view = rgb_multi_view.permute(0,2,3,1)
395
- # global_normal: B,H,W,3
396
- # multi_view_mask: B,H,W
397
- # rgb_multi_view: B,H,W,3
398
-
399
-
400
- meshes = reconstruction(
401
- normal_pils=global_normal,
402
- masks=multi_view_mask,
403
- weights=isomer_geo_weights,
404
- fov=30,
405
- radius=isomer_radius,
406
- camera_angles_azi=isomer_azimuths,
407
- camera_angles_ele=isomer_elevations,
408
- expansion_weight_stage1=0.1,
409
- init_type="file",
410
- init_verts=vertices,
411
- init_faces=faces,
412
- stage1_steps=0,
413
- stage2_steps=50,
414
- start_edge_len_stage1=0.1,
415
- end_edge_len_stage1=0.02,
416
- start_edge_len_stage2=0.02,
417
- end_edge_len_stage2=0.005,
418
- )
419
-
420
- save_glb_addr = projection(
421
- meshes=meshes,
422
- masks=multi_view_mask,
423
- images=rgb_multi_view,
424
- azimuths=isomer_azimuths,
425
- elevations=isomer_elevations,
426
- weights=isomer_color_weights,
427
- fov=30,
428
- radius=isomer_radius,
429
- save_dir=f"{save_dir_path}/ISOMER/",
430
- )
431
- print(f'saved to {save_glb_addr}')
432
-
433
-
434
-
435
- if __name__ == '__main__':
436
- main()
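The deleted script assembles the FLUX ControlNet-Union keyword arguments with `or`-based defaults inside multi_view_rgb_normal_generation_with_controlnet. A minimal sketch of just that assembly step (the helper name and defaults below are illustrative, not part of the commit):

```py
# Illustrative sketch of the kwarg assembly done in
# multi_view_rgb_normal_generation_with_controlnet; not part of the commit.
CONTROL_MODE_DICT = {'canny': 0, 'tile': 1, 'depth': 2, 'blur': 3, 'pose': 4, 'gray': 5, 'lq': 6}

def build_controlnet_kwargs(control_image, control_mode,
                            control_guidance_start=None, control_guidance_end=None,
                            controlnet_conditioning_scale=None):
    # the number of control images must match the number of control modes
    assert len(control_mode) == len(control_image)
    n = len(control_image)
    return {
        'control_mode': [CONTROL_MODE_DICT[m] for m in control_mode],
        'control_image': control_image,
        'control_guidance_start': control_guidance_start or [0.0] * n,
        'control_guidance_end': control_guidance_end or [1.0] * n,
        'controlnet_conditioning_scale': controlnet_conditioning_scale or [1.0] * n,
    }

# e.g. build_controlnet_kwargs([bundle_image], ['tile'], control_guidance_end=[0.3])
```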
 
live_preview_helpers.py DELETED
@@ -1,167 +0,0 @@
1
- import torch
2
- import numpy as np
3
- from diffusers import FluxPipeline, AutoencoderTiny, FlowMatchEulerDiscreteScheduler
4
- from typing import Any, Dict, List, Optional, Union
5
-
6
- # Helper functions
7
- def calculate_shift(
8
- image_seq_len,
9
- base_seq_len: int = 256,
10
- max_seq_len: int = 4096,
11
- base_shift: float = 0.5,
12
- max_shift: float = 1.16,
13
- ):
14
- m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
15
- b = base_shift - m * base_seq_len
16
- mu = image_seq_len * m + b
17
- return mu
18
-
19
- def retrieve_timesteps(
20
- scheduler,
21
- num_inference_steps: Optional[int] = None,
22
- device: Optional[Union[str, torch.device]] = None,
23
- timesteps: Optional[List[int]] = None,
24
- sigmas: Optional[List[float]] = None,
25
- **kwargs,
26
- ):
27
- if timesteps is not None and sigmas is not None:
28
- raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
29
- if timesteps is not None:
30
- scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
31
- timesteps = scheduler.timesteps
32
- num_inference_steps = len(timesteps)
33
- elif sigmas is not None:
34
- scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
35
- timesteps = scheduler.timesteps
36
- num_inference_steps = len(timesteps)
37
- else:
38
- scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
39
- timesteps = scheduler.timesteps
40
- return timesteps, num_inference_steps
41
-
42
- # FLUX pipeline function
43
- @torch.inference_mode()
44
- def flux_pipe_call_that_returns_an_iterable_of_images(
45
- self,
46
- prompt: Union[str, List[str]] = None,
47
- prompt_2: Optional[Union[str, List[str]]] = None,
48
- height: Optional[int] = None,
49
- width: Optional[int] = None,
50
- num_inference_steps: int = 28,
51
- timesteps: List[int] = None,
52
- guidance_scale: float = 3.5,
53
- num_images_per_prompt: Optional[int] = 1,
54
- generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
55
- latents: Optional[torch.FloatTensor] = None,
56
- prompt_embeds: Optional[torch.FloatTensor] = None,
57
- pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
58
- output_type: Optional[str] = "pil",
59
- return_dict: bool = True,
60
- joint_attention_kwargs: Optional[Dict[str, Any]] = None,
61
- max_sequence_length: int = 512,
62
- good_vae: Optional[Any] = None,
63
- ):
64
- height = height or self.default_sample_size * self.vae_scale_factor
65
- width = width or self.default_sample_size * self.vae_scale_factor
66
-
67
- # 1. Check inputs
68
- self.check_inputs(
69
- prompt,
70
- prompt_2,
71
- height,
72
- width,
73
- prompt_embeds=prompt_embeds,
74
- pooled_prompt_embeds=pooled_prompt_embeds,
75
- max_sequence_length=max_sequence_length,
76
- )
77
-
78
- self._guidance_scale = guidance_scale
79
- self._joint_attention_kwargs = joint_attention_kwargs
80
- self._interrupt = False
81
-
82
- # 2. Define call parameters
83
- batch_size = 1 if isinstance(prompt, str) else len(prompt)
84
- device = self._execution_device
85
-
86
- # 3. Encode prompt
87
- lora_scale = joint_attention_kwargs.get("scale", None) if joint_attention_kwargs is not None else None
88
- prompt_embeds, pooled_prompt_embeds, text_ids = self.encode_prompt(
89
- prompt=prompt,
90
- prompt_2=prompt_2,
91
- prompt_embeds=prompt_embeds,
92
- pooled_prompt_embeds=pooled_prompt_embeds,
93
- device=device,
94
- num_images_per_prompt=num_images_per_prompt,
95
- max_sequence_length=max_sequence_length,
96
- lora_scale=lora_scale,
97
- )
98
- # 4. Prepare latent variables
99
- num_channels_latents = self.transformer.config.in_channels // 4
100
- latents, latent_image_ids = self.prepare_latents(
101
- batch_size * num_images_per_prompt,
102
- num_channels_latents,
103
- height,
104
- width,
105
- prompt_embeds.dtype,
106
- device,
107
- generator,
108
- latents,
109
- )
110
- # 5. Prepare timesteps
111
- sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
112
- image_seq_len = latents.shape[1]
113
- mu = calculate_shift(
114
- image_seq_len,
115
- self.scheduler.config.base_image_seq_len,
116
- self.scheduler.config.max_image_seq_len,
117
- self.scheduler.config.base_shift,
118
- self.scheduler.config.max_shift,
119
- )
120
- timesteps, num_inference_steps = retrieve_timesteps(
121
- self.scheduler,
122
- num_inference_steps,
123
- device,
124
- timesteps,
125
- sigmas,
126
- mu=mu,
127
- )
128
- self._num_timesteps = len(timesteps)
129
-
130
- # Handle guidance
131
- guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32).expand(latents.shape[0]) if self.transformer.config.guidance_embeds else None
132
-
133
- # 6. Denoising loop
134
- for i, t in enumerate(timesteps):
135
- print(f"Inference step {i+1}/{num_inference_steps}")
136
- if self.interrupt:
137
- continue
138
-
139
- timestep = t.expand(latents.shape[0]).to(latents.dtype)
140
-
141
- noise_pred = self.transformer(
142
- hidden_states=latents,
143
- timestep=timestep / 1000,
144
- guidance=guidance,
145
- pooled_projections=pooled_prompt_embeds,
146
- encoder_hidden_states=prompt_embeds,
147
- txt_ids=text_ids,
148
- img_ids=latent_image_ids,
149
- joint_attention_kwargs=self.joint_attention_kwargs,
150
- return_dict=False,
151
- )[0]
152
- # Yield intermediate result
153
- latents_for_image = self._unpack_latents(latents, height, width, self.vae_scale_factor)
154
- latents_for_image = (latents_for_image / self.vae.config.scaling_factor) + self.vae.config.shift_factor
155
- image = self.vae.decode(latents_for_image, return_dict=False)[0]
156
- yield self.image_processor.postprocess(image, output_type=output_type)[0]
157
-
158
- latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
159
- torch.cuda.empty_cache()
160
-
161
- # Final image using good_vae
162
- latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
163
- latents = (latents / good_vae.config.scaling_factor) + good_vae.config.shift_factor
164
- image = good_vae.decode(latents, return_dict=False)[0]
165
- self.maybe_free_model_hooks()
166
- torch.cuda.empty_cache()
167
- yield self.image_processor.postprocess(image, output_type=output_type)[0]
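live_preview_helpers.py is removed in this commit; its generator-style call was imported by the old app.py. A sketch of how such a helper is typically bound onto a FluxPipeline for streamed previews (the hub model IDs and step count below are assumptions, not taken from the commit):

```py
# Assumed usage of the deleted helper (illustrative): bind the generator onto a
# FluxPipeline so each denoising step can be decoded and shown as a live preview.
import types
import torch
from diffusers import FluxPipeline, AutoencoderTiny
from live_preview_helpers import flux_pipe_call_that_returns_an_iterable_of_images

pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
good_vae = pipe.vae                                    # keep the full VAE for the final frame
pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=torch.bfloat16).to("cuda")
pipe.flux_pipe_call_that_returns_an_iterable_of_images = types.MethodType(
    flux_pipe_call_that_returns_an_iterable_of_images, pipe
)

for i, preview in enumerate(pipe.flux_pipe_call_that_returns_an_iterable_of_images(
    prompt="a cute 3D mushroom character, white background",
    num_inference_steps=8,
    good_vae=good_vae,
)):
    preview.save(f"preview_{i}.png")                   # every yield is a postprocessed PIL image
```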
 
models/llm/__pycache__/llm.cpython-310.pyc ADDED
Binary file (5.41 kB).
 
models/llm/llm.py ADDED
@@ -0,0 +1,97 @@
1
+
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import torch
4
+ # device = "cuda" # the device to load the model onto
5
+ model_name_or_dir="Qwen/Qwen2-7B-Instruct"
6
+
7
+
8
+ DEFAULT_SYSTEM_PROMPT = """*Given the user's input describing an object, concept, or vague idea, generate a concise and vivid prompt for the diffusion model that portrays a 3D object based solely on that input. Do not include scenes or backgrounds. The prompt should include specific descriptions for each of the four views—front, left side, rear, and right side—that will be displayed in a 2x4 grid (RGB images on the top row and normal maps on the bottom row). Put all descriptions in one single line. Focus on enhancing the cuteness and 3D qualities of the object without including any background or scene elements. Use descriptive adjectives and, if appropriate, stylistic elements to amplify the object's appeal.*
9
+
10
+ ---
11
+
12
+ **Examples: (Please follow the OUTPUT Format of the following examples.)**
13
+
14
+ - **User's Input:** "我喜欢蘑菇."
15
+ A charming 3D mushroom character with a cheerful expression and blushing cheeks, styled in a whimsical, cartoonish manner. Front view displays a wide, happy smile, round eyes, and a polka-dotted cap with a small ladybug perched on top; left side view reveals a miniature satchel with a tiny acorn charm hanging from its stem; rear view shows a cute, tiny backpack decorated with mushroom patterns and a small patch of grass at the base; right side view features a petite, colorful umbrella tucked under its cap, with a ladybug sitting on the handle. No background. Arrange in a 2x4 grid with RGB images on top and normal maps below.
16
+
17
+ - **User's Input:** "画点关于太空的东西吧."
18
+ A delightful 3D astronaut plush toy with oversized, twinkling eyes and a tiny, shiny helmet, styled in an endearing, kawaii fashion. Front view showcases a joyful smile, a sparkly visor, and a round emblem with a star on the chest; left side view highlights a small flag patch on the arm, with a tiny rocket embroidery; rear view reveals a heart-shaped mini oxygen tank with a playful bow attached; right side view displays a waving hand adorned with tiny, glittering stars and a wristband with planets. No background. Display in a 2x4 grid, top row RGB images, bottom row normal maps.
19
+
20
+ - **User's Input:** "老哥,画条龙?"
21
+ A tiny, chubby 3D dragon with a joyful expression and dainty wings, styled in a cute, fantasy-inspired manner. Front view presents large, sparkling eyes, small curved horns, and a toothy grin; left side view features a little pouch hanging from its neck with a golden coin peeking out; rear view reveals a heart-shaped tail adorned with small, shimmering scales; right side view displays a miniature shield with a dragon emblem, and a wing folded in a playful manner. No background. Presented in a 2x4 grid with RGB images above and normal maps below.
22
+
23
+ - **User's Input:** "Maybe a robot?"
24
+ A lovable 3D robot with a round, friendly body and an inviting smile, styled in a sleek, minimalist design. Front view shows glowing, expressive eyes, a cheerful mouth, and a touch-screen panel with a smiley face; left side view highlights a side antenna with a blinking light and a small digital clock display; rear view reveals a charming power pack with colorful circuits and a sticker of a smiling sun; right side view features a mechanical arm holding a tiny flower with a ladybug perched on a petal. No scene elements. Organize in a 2x4 grid, RGB images on the top row, normal maps on the bottom row.
25
+
26
+ ---
27
+
28
+ **Tips:**
29
+
30
+ - **Use Stylized Descriptions:** Mention styles that enhance cuteness (e.g., chibi, kawaii, cartoonish).
31
+
32
+ - **Incorporate Expressive Features:** Emphasize features like big eyes, smiles, or playful accessories.
33
+
34
+ - **Tailor View-Specific Details:** Ensure each view adds unique details to enrich the object's visual appeal.
35
+
36
+ - **Avoid Ambiguity:** Make sure the prompt is specific enough for the model to interpret accurately but doesn't include unnecessary information.
37
+
38
+ OUTPUT THE PROMPT ONLY!
39
+ OUTPUT ENGLISH ONLY! NOT ANY OTHER LANGUAGE, E.G., CHINESE!"""
40
+
41
+ def load_llm_model(model_name_or_dir, torch_dtype='auto', device_map='cpu'):
42
+ model = AutoModelForCausalLM.from_pretrained(
43
+ model_name_or_dir,
44
+ torch_dtype=torch_dtype,
45
+ # torch_dtype=torch.float8_e5m2,
46
+ # torch_dtype=torch.float16,
47
+ device_map=device_map
48
+ )
49
+ print(f'set llm model to {model_name_or_dir}')
50
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_dir)
51
+ print(f'set llm tokenizer to {model_name_or_dir}')
52
+ return model, tokenizer
53
+
54
+
55
+ # print(f"Before load llm model: {torch.cuda.memory_allocated() / 1024**3} GB")
56
+ # load_model()
57
+ # print(f"After load llm model: {torch.cuda.memory_allocated() / 1024**3} GB")
58
+
59
+ def get_llm_response(model, tokenizer, user_prompt, seed=None, system_prompt=DEFAULT_SYSTEM_PROMPT):
60
+ # global model
61
+ # global tokenizer
62
+ # load_model()
63
+
64
+ messages = [
65
+ {"role": "system", "content": system_prompt},
66
+ {"role": "user", "content": user_prompt}
67
+ ]
68
+ text = tokenizer.apply_chat_template(
69
+ messages,
70
+ tokenize=False,
71
+ add_generation_prompt=True
72
+ )
73
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
74
+
75
+ if seed is not None:
76
+ torch.manual_seed(seed)
77
+
78
+ # breakpoint()
79
+ generated_ids = model.generate(
80
+ model_inputs.input_ids,
81
+ max_new_tokens=512,
82
+ temperature=0.7,
83
+ )
84
+ generated_ids = [
85
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
86
+ ]
87
+
88
+ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
89
+
90
+ return response
91
+
92
+ # if __name__ == "__main__":
93
+
94
+ # user_prompt="哈利波特"
95
+ # rsp = get_response(user_prompt, seed=0)
96
+ # print(rsp)
97
+ # breakpoint()
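A minimal usage sketch for the helper added above (the device map and seed are assumptions; the Chinese input is one of the examples from the system prompt):

```py
# Illustrative usage of models/llm/llm.py; not part of the commit.
from models.llm.llm import load_llm_model, get_llm_response

model, tokenizer = load_llm_model("Qwen/Qwen2-7B-Instruct", torch_dtype="auto", device_map="cuda")
flux_prompt = get_llm_response(model, tokenizer, "老哥,画条龙?", seed=0)
print(flux_prompt)   # expected: a single-line English prompt describing the four views
```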
pipeline/custom_pipelines/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .pipeline_flux_controlnet_image_to_image import FluxControlNetImg2ImgPipeline
2
+ from .pipeline_flux_img2img import FluxImg2ImgPipeline
3
+ from .pipeline_flux_prior_redux import FluxPriorReduxPipeline
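These re-exports presumably let the app import the patched pipelines from a single place, e.g.:

```py
# Assumed import style enabled by this package __init__ (illustrative).
from pipeline.custom_pipelines import FluxControlNetImg2ImgPipeline, FluxPriorReduxPipeline
```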
pipeline/custom_pipelines/pipeline_flux_controlnet_image_to_image.py ADDED
@@ -0,0 +1,1004 @@
1
+ # copied from diffusers/src/diffusers/pipelines/flux/pipeline_flux_controlnet_image_to_image.py
2
+
3
+ import inspect
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from transformers import (
9
+ CLIPTextModel,
10
+ CLIPTokenizer,
11
+ T5EncoderModel,
12
+ T5TokenizerFast,
13
+ )
14
+
15
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
16
+ from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
17
+ from diffusers.models.autoencoders import AutoencoderKL
18
+ from diffusers.models.controlnets.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
19
+ from diffusers.models.transformers import FluxTransformer2DModel
20
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
21
+ from diffusers.utils import (
22
+ USE_PEFT_BACKEND,
23
+ is_torch_xla_available,
24
+ logging,
25
+ replace_example_docstring,
26
+ scale_lora_layers,
27
+ unscale_lora_layers,
28
+ )
29
+ from diffusers.utils.torch_utils import randn_tensor
30
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
31
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
32
+
33
+
34
+ if is_torch_xla_available():
35
+ import torch_xla.core.xla_model as xm
36
+
37
+ XLA_AVAILABLE = True
38
+ else:
39
+ XLA_AVAILABLE = False
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+ EXAMPLE_DOC_STRING = """
44
+ Examples:
45
+ ```py
46
+ >>> import torch
47
+ >>> from diffusers import FluxControlNetImg2ImgPipeline, FluxControlNetModel
48
+ >>> from diffusers.utils import load_image
49
+
50
+ >>> device = "cuda" if torch.cuda.is_available() else "cpu"
51
+
52
+ >>> controlnet = FluxControlNetModel.from_pretrained(
53
+ ... "InstantX/FLUX.1-dev-Controlnet-Canny-alpha", torch_dtype=torch.bfloat16
54
+ ... )
55
+
56
+ >>> pipe = FluxControlNetImg2ImgPipeline.from_pretrained(
57
+ ... "black-forest-labs/FLUX.1-schnell", controlnet=controlnet, torch_dtype=torch.float16
58
+ ... )
59
+
60
+ >>> pipe.text_encoder.to(torch.float16)
61
+ >>> pipe.controlnet.to(torch.float16)
62
+ >>> pipe.to("cuda")
63
+
64
+ >>> control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Canny/resolve/main/canny.jpg")
65
+ >>> init_image = load_image(
66
+ ... "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
67
+ ... )
68
+
69
+ >>> prompt = "A girl in city, 25 years old, cool, futuristic"
70
+ >>> image = pipe(
71
+ ... prompt,
72
+ ... image=init_image,
73
+ ... control_image=control_image,
74
+ ... control_guidance_start=0.2,
75
+ ... control_guidance_end=0.8,
76
+ ... controlnet_conditioning_scale=1.0,
77
+ ... strength=0.7,
78
+ ... num_inference_steps=2,
79
+ ... guidance_scale=3.5,
80
+ ... ).images[0]
81
+ >>> image.save("flux_controlnet_img2img.png")
82
+ ```
83
+ """
84
+
85
+
86
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
87
+ def calculate_shift(
88
+ image_seq_len,
89
+ base_seq_len: int = 256,
90
+ max_seq_len: int = 4096,
91
+ base_shift: float = 0.5,
92
+ max_shift: float = 1.16,
93
+ ):
94
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
95
+ b = base_shift - m * base_seq_len
96
+ mu = image_seq_len * m + b
97
+ return mu
98
+
99
+
100
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
101
+ def retrieve_latents(
102
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
103
+ ):
104
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
105
+ return encoder_output.latent_dist.sample(generator)
106
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
107
+ return encoder_output.latent_dist.mode()
108
+ elif hasattr(encoder_output, "latents"):
109
+ return encoder_output.latents
110
+ else:
111
+ raise AttributeError("Could not access latents of provided encoder_output")
112
+
113
+
114
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
115
+ def retrieve_timesteps(
116
+ scheduler,
117
+ num_inference_steps: Optional[int] = None,
118
+ device: Optional[Union[str, torch.device]] = None,
119
+ timesteps: Optional[List[int]] = None,
120
+ sigmas: Optional[List[float]] = None,
121
+ **kwargs,
122
+ ):
123
+ r"""
124
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
125
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
126
+
127
+ Args:
128
+ scheduler (`SchedulerMixin`):
129
+ The scheduler to get timesteps from.
130
+ num_inference_steps (`int`):
131
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
132
+ must be `None`.
133
+ device (`str` or `torch.device`, *optional*):
134
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
135
+ timesteps (`List[int]`, *optional*):
136
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
137
+ `num_inference_steps` and `sigmas` must be `None`.
138
+ sigmas (`List[float]`, *optional*):
139
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
140
+ `num_inference_steps` and `timesteps` must be `None`.
141
+
142
+ Returns:
143
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
144
+ second element is the number of inference steps.
145
+ """
146
+ if timesteps is not None and sigmas is not None:
147
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
148
+ if timesteps is not None:
149
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
150
+ if not accepts_timesteps:
151
+ raise ValueError(
152
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
153
+ f" timestep schedules. Please check whether you are using the correct scheduler."
154
+ )
155
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
156
+ timesteps = scheduler.timesteps
157
+ num_inference_steps = len(timesteps)
158
+ else:
159
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
160
+ if accept_sigmas:
161
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
162
+ # if not accept_sigmas:
163
+ # raise ValueError(
164
+ # f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
165
+ # f" sigmas schedules. Please check whether you are using the correct scheduler."
166
+ # )
167
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
168
+ timesteps = scheduler.timesteps
169
+ num_inference_steps = len(timesteps)
170
+ else:
171
+ scheduler.set_timesteps(num_inference_steps, device=device)#, **kwargs)
172
+ timesteps = scheduler.timesteps
173
+
174
+ return timesteps, num_inference_steps
175
+
176
+
177
+ class FluxControlNetImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
178
+ r"""
179
+ The Flux controlnet pipeline for image-to-image generation.
180
+
181
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
182
+
183
+ Args:
184
+ transformer ([`FluxTransformer2DModel`]):
185
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
186
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
187
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
188
+ vae ([`AutoencoderKL`]):
189
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
190
+ text_encoder ([`CLIPTextModel`]):
191
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
192
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
193
+ text_encoder_2 ([`T5EncoderModel`]):
194
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
195
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
196
+ tokenizer (`CLIPTokenizer`):
197
+ Tokenizer of class
198
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
199
+ tokenizer_2 (`T5TokenizerFast`):
200
+ Second Tokenizer of class
201
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
202
+ """
203
+
204
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
205
+ _optional_components = []
206
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
207
+
208
+ def __init__(
209
+ self,
210
+ scheduler: FlowMatchEulerDiscreteScheduler,
211
+ vae: AutoencoderKL,
212
+ text_encoder: CLIPTextModel,
213
+ tokenizer: CLIPTokenizer,
214
+ text_encoder_2: T5EncoderModel,
215
+ tokenizer_2: T5TokenizerFast,
216
+ transformer: FluxTransformer2DModel,
217
+ controlnet: Union[
218
+ FluxControlNetModel, List[FluxControlNetModel], Tuple[FluxControlNetModel], FluxMultiControlNetModel
219
+ ],
220
+ ):
221
+ super().__init__()
222
+ if isinstance(controlnet, (list, tuple)):
223
+ controlnet = FluxMultiControlNetModel(controlnet)
224
+
225
+ self.register_modules(
226
+ vae=vae,
227
+ text_encoder=text_encoder,
228
+ text_encoder_2=text_encoder_2,
229
+ tokenizer=tokenizer,
230
+ tokenizer_2=tokenizer_2,
231
+ transformer=transformer,
232
+ scheduler=scheduler,
233
+ controlnet=controlnet,
234
+ )
235
+ self.vae_scale_factor = (
236
+ 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
237
+ )
238
+ # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
239
+ # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
240
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
241
+ self.tokenizer_max_length = (
242
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
243
+ )
244
+ self.default_sample_size = 128
245
+
246
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
247
+ def _get_t5_prompt_embeds(
248
+ self,
249
+ prompt: Union[str, List[str]] = None,
250
+ num_images_per_prompt: int = 1,
251
+ max_sequence_length: int = 512,
252
+ device: Optional[torch.device] = None,
253
+ dtype: Optional[torch.dtype] = None,
254
+ ):
255
+ device = device or self._execution_device
256
+ dtype = dtype or self.text_encoder.dtype
257
+
258
+ prompt = [prompt] if isinstance(prompt, str) else prompt
259
+ batch_size = len(prompt)
260
+
261
+ if isinstance(self, TextualInversionLoaderMixin):
262
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
263
+
264
+ text_inputs = self.tokenizer_2(
265
+ prompt,
266
+ padding="max_length",
267
+ max_length=max_sequence_length,
268
+ truncation=True,
269
+ return_length=False,
270
+ return_overflowing_tokens=False,
271
+ return_tensors="pt",
272
+ )
273
+ text_input_ids = text_inputs.input_ids
274
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
275
+
276
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
277
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
278
+ logger.warning(
279
+ "The following part of your input was truncated because `max_sequence_length` is set to "
280
+ f" {max_sequence_length} tokens: {removed_text}"
281
+ )
282
+
283
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
284
+
285
+ dtype = self.text_encoder_2.dtype
286
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
287
+
288
+ _, seq_len, _ = prompt_embeds.shape
289
+
290
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
291
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
292
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
293
+
294
+ return prompt_embeds
295
+
296
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
297
+ def _get_clip_prompt_embeds(
298
+ self,
299
+ prompt: Union[str, List[str]],
300
+ num_images_per_prompt: int = 1,
301
+ device: Optional[torch.device] = None,
302
+ ):
303
+ device = device or self._execution_device
304
+
305
+ prompt = [prompt] if isinstance(prompt, str) else prompt
306
+ batch_size = len(prompt)
307
+
308
+ if isinstance(self, TextualInversionLoaderMixin):
309
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
310
+
311
+ text_inputs = self.tokenizer(
312
+ prompt,
313
+ padding="max_length",
314
+ max_length=self.tokenizer_max_length,
315
+ truncation=True,
316
+ return_overflowing_tokens=False,
317
+ return_length=False,
318
+ return_tensors="pt",
319
+ )
320
+
321
+ text_input_ids = text_inputs.input_ids
322
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
323
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
324
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
325
+ logger.warning(
326
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
327
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
328
+ )
329
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
330
+
331
+ # Use pooled output of CLIPTextModel
332
+ prompt_embeds = prompt_embeds.pooler_output
333
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
334
+
335
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
336
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
337
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
338
+
339
+ return prompt_embeds
340
+
341
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
342
+ def encode_prompt(
343
+ self,
344
+ prompt: Union[str, List[str]],
345
+ prompt_2: Union[str, List[str]],
346
+ device: Optional[torch.device] = None,
347
+ num_images_per_prompt: int = 1,
348
+ prompt_embeds: Optional[torch.FloatTensor] = None,
349
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
350
+ max_sequence_length: int = 512,
351
+ lora_scale: Optional[float] = None,
352
+ ):
353
+ r"""
354
+
355
+ Args:
356
+ prompt (`str` or `List[str]`, *optional*):
357
+ prompt to be encoded
358
+ prompt_2 (`str` or `List[str]`, *optional*):
359
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
360
+ used in all text-encoders
361
+ device: (`torch.device`):
362
+ torch device
363
+ num_images_per_prompt (`int`):
364
+ number of images that should be generated per prompt
365
+ prompt_embeds (`torch.FloatTensor`, *optional*):
366
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
367
+ provided, text embeddings will be generated from `prompt` input argument.
368
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
369
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
370
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
371
+ lora_scale (`float`, *optional*):
372
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
373
+ """
374
+ device = device or self._execution_device
375
+
376
+ # set lora scale so that monkey patched LoRA
377
+ # function of text encoder can correctly access it
378
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
379
+ self._lora_scale = lora_scale
380
+
381
+ # dynamically adjust the LoRA scale
382
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
383
+ scale_lora_layers(self.text_encoder, lora_scale)
384
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
385
+ scale_lora_layers(self.text_encoder_2, lora_scale)
386
+
387
+ prompt = [prompt] if isinstance(prompt, str) else prompt
388
+
389
+ if prompt_embeds is None:
390
+ prompt_2 = prompt_2 or prompt
391
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
392
+
393
+ # We only use the pooled prompt output from the CLIPTextModel
394
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
395
+ prompt=prompt,
396
+ device=device,
397
+ num_images_per_prompt=num_images_per_prompt,
398
+ )
399
+ prompt_embeds = self._get_t5_prompt_embeds(
400
+ prompt=prompt_2,
401
+ num_images_per_prompt=num_images_per_prompt,
402
+ max_sequence_length=max_sequence_length,
403
+ device=device,
404
+ )
405
+
406
+ if self.text_encoder is not None:
407
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
408
+ # Retrieve the original scale by scaling back the LoRA layers
409
+ unscale_lora_layers(self.text_encoder, lora_scale)
410
+
411
+ if self.text_encoder_2 is not None:
412
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
413
+ # Retrieve the original scale by scaling back the LoRA layers
414
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
415
+
416
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
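+ # Flux uses 3-component rotary position ids; text tokens are all assigned zero ids (shape [seq_len, 3]), while image ids are built in _prepare_latent_image_ids.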
417
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
418
+
419
+ return prompt_embeds, pooled_prompt_embeds, text_ids
420
+
421
+ # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
422
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
423
+ if isinstance(generator, list):
424
+ image_latents = [
425
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
426
+ for i in range(image.shape[0])
427
+ ]
428
+ image_latents = torch.cat(image_latents, dim=0)
429
+ else:
430
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
431
+
432
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
433
+
434
+ return image_latents
435
+
436
+ # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
437
+ def get_timesteps(self, num_inference_steps, strength, device):
438
+ # get the original timestep using init_timestep
439
+ init_timestep = min(num_inference_steps * strength, num_inference_steps)
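+ # e.g. strength=0.6 with 28 steps keeps the final 17 steps of the schedule (t_start = 11); strength=1.0 keeps all of them.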
440
+
441
+ t_start = int(max(num_inference_steps - init_timestep, 0))
442
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
443
+ if hasattr(self.scheduler, "set_begin_index"):
444
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
445
+
446
+ return timesteps, num_inference_steps - t_start
447
+
448
+ def check_inputs(
449
+ self,
450
+ prompt,
451
+ prompt_2,
452
+ strength,
453
+ height,
454
+ width,
455
+ callback_on_step_end_tensor_inputs,
456
+ prompt_embeds=None,
457
+ pooled_prompt_embeds=None,
458
+ max_sequence_length=None,
459
+ ):
460
+ if strength < 0 or strength > 1:
461
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
462
+
463
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
464
+ logger.warning(
465
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
466
+ )
467
+
468
+ if callback_on_step_end_tensor_inputs is not None and not all(
469
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
470
+ ):
471
+ raise ValueError(
472
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
473
+ )
474
+
475
+ if prompt is not None and prompt_embeds is not None:
476
+ raise ValueError(
477
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
478
+ " only forward one of the two."
479
+ )
480
+ elif prompt_2 is not None and prompt_embeds is not None:
481
+ raise ValueError(
482
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
483
+ " only forward one of the two."
484
+ )
485
+ elif prompt is None and prompt_embeds is None:
486
+ raise ValueError(
487
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
488
+ )
489
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
490
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
491
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
492
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
493
+
494
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
495
+ raise ValueError(
496
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
497
+ )
498
+
499
+ if max_sequence_length is not None and max_sequence_length > 512:
500
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
501
+
502
+ @staticmethod
503
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
504
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
505
+ latent_image_ids = torch.zeros(height, width, 3)
506
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
507
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
508
+
509
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
510
+
511
+ latent_image_ids = latent_image_ids.reshape(
512
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
513
+ )
514
+
515
+ return latent_image_ids.to(device=device, dtype=dtype)
516
+
517
+ @staticmethod
518
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
519
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
520
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
521
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
522
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
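+ # (B, C, h, w) -> (B, (h//2)*(w//2), 4*C): every 2x2 spatial patch of the latent becomes one packed token.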
523
+
524
+ return latents
525
+
526
+ @staticmethod
527
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
528
+ def _unpack_latents(latents, height, width, vae_scale_factor):
529
+ batch_size, num_patches, channels = latents.shape
530
+
531
+ # VAE applies 8x compression on images but we must also account for packing which requires
532
+ # latent height and width to be divisible by 2.
533
+ height = 2 * (int(height) // (vae_scale_factor * 2))
534
+ width = 2 * (int(width) // (vae_scale_factor * 2))
535
+
536
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
537
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
538
+
539
+ latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
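+ # Inverse of _pack_latents: each packed token is unfolded back into a 2x2 spatial patch of the (B, C, h, w) latent grid.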
540
+
541
+ return latents
542
+
543
+ # Copied from diffusers.pipelines.flux.pipeline_flux_img2img.FluxImg2ImgPipeline.prepare_latents
544
+ def prepare_latents(
545
+ self,
546
+ image,
547
+ timestep,
548
+ batch_size,
549
+ num_channels_latents,
550
+ height,
551
+ width,
552
+ dtype,
553
+ device,
554
+ generator,
555
+ latents=None,
556
+ ):
557
+ if isinstance(generator, list) and len(generator) != batch_size:
558
+ raise ValueError(
559
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
560
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
561
+ )
562
+
563
+ # VAE applies 8x compression on images but we must also account for packing which requires
564
+ # latent height and width to be divisible by 2.
565
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
566
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
567
+ shape = (batch_size, num_channels_latents, height, width)
568
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
569
+
570
+ if latents is not None:
571
+ return latents.to(device=device, dtype=dtype), latent_image_ids
572
+
573
+ image = image.to(device=device, dtype=dtype)
574
+ image_latents = self._encode_vae_image(image=image, generator=generator)
575
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
576
+ # expand init_latents for batch_size
577
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
578
+ image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
579
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
580
+ raise ValueError(
581
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
582
+ )
583
+ else:
584
+ image_latents = torch.cat([image_latents], dim=0)
585
+
586
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
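+ # Flow-matching image-to-image: scale_noise blends the clean image latents with Gaussian noise at the chosen start timestep (the flow-matching analogue of add_noise).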
587
+ latents = self.scheduler.scale_noise(image_latents, timestep, noise)
588
+ latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
589
+ return latents, latent_image_ids
590
+
591
+ # Copied from diffusers.pipelines.controlnet_sd3.pipeline_stable_diffusion_3_controlnet.StableDiffusion3ControlNetPipeline.prepare_image
592
+ def prepare_image(
593
+ self,
594
+ image,
595
+ width,
596
+ height,
597
+ batch_size,
598
+ num_images_per_prompt,
599
+ device,
600
+ dtype,
601
+ do_classifier_free_guidance=False,
602
+ guess_mode=False,
603
+ ):
604
+ if isinstance(image, torch.Tensor):
605
+ pass
606
+ else:
607
+ image = self.image_processor.preprocess(image, height=height, width=width)
608
+
609
+ image_batch_size = image.shape[0]
610
+
611
+ if image_batch_size == 1:
612
+ repeat_by = batch_size
613
+ else:
614
+ # image batch size is the same as prompt batch size
615
+ repeat_by = num_images_per_prompt
616
+
617
+ image = image.repeat_interleave(repeat_by, dim=0)
618
+
619
+ image = image.to(device=device, dtype=dtype)
620
+
621
+ if do_classifier_free_guidance and not guess_mode:
622
+ image = torch.cat([image] * 2)
623
+
624
+ return image
625
+
626
+ @property
627
+ def guidance_scale(self):
628
+ return self._guidance_scale
629
+
630
+ @property
631
+ def joint_attention_kwargs(self):
632
+ return self._joint_attention_kwargs
633
+
634
+ @property
635
+ def num_timesteps(self):
636
+ return self._num_timesteps
637
+
638
+ @property
639
+ def interrupt(self):
640
+ return self._interrupt
641
+
642
+ @torch.no_grad()
643
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
644
+ def __call__(
645
+ self,
646
+ prompt: Union[str, List[str]] = None,
647
+ prompt_2: Optional[Union[str, List[str]]] = None,
648
+ image: PipelineImageInput = None,
649
+ control_image: PipelineImageInput = None,
650
+ height: Optional[int] = None,
651
+ width: Optional[int] = None,
652
+ strength: float = 0.6,
653
+ num_inference_steps: int = 28,
654
+ sigmas: Optional[List[float]] = None,
655
+ guidance_scale: float = 7.0,
656
+ control_guidance_start: Union[float, List[float]] = 0.0,
657
+ control_guidance_end: Union[float, List[float]] = 1.0,
658
+ control_mode: Optional[Union[int, List[int]]] = None,
659
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
660
+ num_images_per_prompt: Optional[int] = 1,
661
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
662
+ latents: Optional[torch.FloatTensor] = None,
663
+ prompt_embeds: Optional[torch.FloatTensor] = None,
664
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
665
+ output_type: Optional[str] = "pil",
666
+ return_dict: bool = True,
667
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
668
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
669
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
670
+ max_sequence_length: int = 512,
671
+ ):
672
+ """
673
+ Function invoked when calling the pipeline for generation.
674
+
675
+ Args:
676
+ prompt (`str` or `List[str]`, *optional*):
677
+ The prompt or prompts to guide the image generation.
678
+ prompt_2 (`str` or `List[str]`, *optional*):
679
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`.
680
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
681
+ The image(s) to modify with the pipeline.
682
+ control_image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
683
+ The ControlNet input condition. Image to control the generation.
684
+ height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
685
+ The height in pixels of the generated image.
686
+ width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
687
+ The width in pixels of the generated image.
688
+ strength (`float`, *optional*, defaults to 0.6):
689
+ Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
690
+ num_inference_steps (`int`, *optional*, defaults to 28):
691
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
692
+ expense of slower inference.
693
+ sigmas (`List[float]`, *optional*):
694
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
695
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
696
+ will be used.
697
+ guidance_scale (`float`, *optional*, defaults to 7.0):
698
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
699
+ control_mode (`int` or `List[int]`, *optional*):
700
+ The mode for the ControlNet. If multiple ControlNets are used, this should be a list.
701
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
702
+ The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
703
+ to the residual in the original transformer.
704
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
705
+ The number of images to generate per prompt.
706
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
707
+ One or more [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to
708
+ make generation deterministic.
709
+ latents (`torch.FloatTensor`, *optional*):
710
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
711
+ generation. Can be used to tweak the same generation with different prompts.
712
+ prompt_embeds (`torch.FloatTensor`, *optional*):
713
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
714
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
715
+ Pre-generated pooled text embeddings.
716
+ output_type (`str`, *optional*, defaults to `"pil"`):
717
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
718
+ return_dict (`bool`, *optional*, defaults to `True`):
719
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
720
+ joint_attention_kwargs (`dict`, *optional*):
721
+ Additional keyword arguments to be passed to the joint attention mechanism.
722
+ callback_on_step_end (`Callable`, *optional*):
723
+ A function that is called at the end of each denoising step during inference.
724
+ callback_on_step_end_tensor_inputs (`List[str]`, *optional*):
725
+ The list of tensor inputs for the `callback_on_step_end` function.
726
+ max_sequence_length (`int`, *optional*, defaults to 512):
727
+ Maximum sequence length to use with the `prompt` (input to the T5 text encoder).
728
+
729
+ Examples:
730
+
731
+ Returns:
732
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
733
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
734
+ images.
735
+ """
736
+ height = height or self.default_sample_size * self.vae_scale_factor
737
+ width = width or self.default_sample_size * self.vae_scale_factor
738
+
739
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
740
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
741
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
742
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
743
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
744
+ mult = len(self.controlnet.nets) if isinstance(self.controlnet, FluxMultiControlNetModel) else 1
745
+ control_guidance_start, control_guidance_end = (
746
+ mult * [control_guidance_start],
747
+ mult * [control_guidance_end],
748
+ )
749
+
750
+ self.check_inputs(
751
+ prompt,
752
+ prompt_2,
753
+ strength,
754
+ height,
755
+ width,
756
+ callback_on_step_end_tensor_inputs,
757
+ prompt_embeds=prompt_embeds,
758
+ pooled_prompt_embeds=pooled_prompt_embeds,
759
+ max_sequence_length=max_sequence_length,
760
+ )
761
+
762
+ self._guidance_scale = guidance_scale
763
+ self._joint_attention_kwargs = joint_attention_kwargs
764
+ self._interrupt = False
765
+
766
+ if prompt is not None and isinstance(prompt, str):
767
+ batch_size = 1
768
+ elif prompt is not None and isinstance(prompt, list):
769
+ batch_size = len(prompt)
770
+ else:
771
+ batch_size = prompt_embeds.shape[0]
772
+
773
+ device = self._execution_device
774
+ dtype = self.transformer.dtype
775
+
776
+ lora_scale = (
777
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
778
+ )
779
+ (
780
+ prompt_embeds,
781
+ pooled_prompt_embeds,
782
+ text_ids,
783
+ ) = self.encode_prompt(
784
+ prompt=prompt,
785
+ prompt_2=prompt_2,
786
+ prompt_embeds=prompt_embeds,
787
+ pooled_prompt_embeds=pooled_prompt_embeds,
788
+ device=device,
789
+ num_images_per_prompt=num_images_per_prompt,
790
+ max_sequence_length=max_sequence_length,
791
+ lora_scale=lora_scale,
792
+ )
793
+
794
+ init_image = self.image_processor.preprocess(image, height=height, width=width)
795
+ init_image = init_image.to(dtype=torch.float32)
796
+
797
+ num_channels_latents = self.transformer.config.in_channels // 4
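+ # The transformer consumes packed tokens carrying 4x the VAE latent channels, so dividing in_channels by 4 recovers the per-pixel latent channel count.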
798
+
799
+ if isinstance(self.controlnet, FluxControlNetModel):
800
+ control_image = self.prepare_image(
801
+ image=control_image,
802
+ width=width,
803
+ height=height,
804
+ batch_size=batch_size * num_images_per_prompt,
805
+ num_images_per_prompt=num_images_per_prompt,
806
+ device=device,
807
+ dtype=self.vae.dtype,
808
+ )
809
+ height, width = control_image.shape[-2:]
810
+
811
+ control_image = retrieve_latents(self.vae.encode(control_image), generator=generator)
812
+ control_image = (control_image - self.vae.config.shift_factor) * self.vae.config.scaling_factor
813
+
814
+ height_control_image, width_control_image = control_image.shape[2:]
815
+ control_image = self._pack_latents(
816
+ control_image,
817
+ batch_size * num_images_per_prompt,
818
+ num_channels_latents,
819
+ height_control_image,
820
+ width_control_image,
821
+ )
822
+
823
+ if control_mode is not None:
824
+ control_mode = torch.tensor(control_mode).to(device, dtype=torch.long)
825
+ control_mode = control_mode.reshape([-1, 1])
826
+
827
+ elif isinstance(self.controlnet, FluxMultiControlNetModel):
828
+ control_images = []
829
+
830
+ for control_image_ in control_image:
831
+ control_image_ = self.prepare_image(
832
+ image=control_image_,
833
+ width=width,
834
+ height=height,
835
+ batch_size=batch_size * num_images_per_prompt,
836
+ num_images_per_prompt=num_images_per_prompt,
837
+ device=device,
838
+ dtype=self.vae.dtype,
839
+ )
840
+ height, width = control_image_.shape[-2:]
841
+
842
+ control_image_ = retrieve_latents(self.vae.encode(control_image_), generator=generator)
843
+ control_image_ = (control_image_ - self.vae.config.shift_factor) * self.vae.config.scaling_factor
844
+
845
+ height_control_image, width_control_image = control_image_.shape[2:]
846
+ control_image_ = self._pack_latents(
847
+ control_image_,
848
+ batch_size * num_images_per_prompt,
849
+ num_channels_latents,
850
+ height_control_image,
851
+ width_control_image,
852
+ )
853
+
854
+ control_images.append(control_image_)
855
+
856
+ control_image = control_images
857
+
858
+ control_mode_ = []
859
+ if isinstance(control_mode, list):
860
+ for cmode in control_mode:
861
+ if cmode is None:
862
+ control_mode_.append(-1)
863
+ else:
864
+ control_mode_.append(cmode)
865
+ control_mode = torch.tensor(control_mode_).to(device, dtype=torch.long)
866
+ control_mode = control_mode.reshape([-1, 1])
867
+
868
+ image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
869
+ mu = calculate_shift(
870
+ image_seq_len,
871
+ self.scheduler.config.base_image_seq_len,
872
+ self.scheduler.config.max_image_seq_len,
873
+ self.scheduler.config.base_shift,
874
+ self.scheduler.config.max_shift,
875
+ )
876
+
877
+ # sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
878
+ timesteps, num_inference_steps = retrieve_timesteps(
879
+ self.scheduler,
880
+ num_inference_steps,
881
+ device,
882
+ sigmas=sigmas,
883
+ mu=mu,
884
+ )
885
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
886
+
887
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
888
+ latents, latent_image_ids = self.prepare_latents(
889
+ init_image,
890
+ latent_timestep,
891
+ batch_size * num_images_per_prompt,
892
+ num_channels_latents,
893
+ height,
894
+ width,
895
+ prompt_embeds.dtype,
896
+ device,
897
+ generator,
898
+ latents,
899
+ )
900
+
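+ # Per-step keep mask: ControlNet residuals are applied (1.0) only inside the [control_guidance_start, control_guidance_end] window and zeroed (0.0) outside it.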
901
+ controlnet_keep = []
902
+ for i in range(len(timesteps)):
903
+ keeps = [
904
+ 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e)
905
+ for s, e in zip(control_guidance_start, control_guidance_end)
906
+ ]
907
+ controlnet_keep.append(keeps[0] if isinstance(self.controlnet, FluxControlNetModel) else keeps)
908
+
909
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
910
+ self._num_timesteps = len(timesteps)
911
+
912
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
913
+ for i, t in enumerate(timesteps):
914
+ if self.interrupt:
915
+ continue
916
+
917
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
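+ # Scheduler timesteps live on a [0, 1000] scale; the ControlNet and transformer below receive timestep / 1000, i.e. values in [0, 1].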
918
+
919
+ if isinstance(self.controlnet, FluxMultiControlNetModel):
920
+ use_guidance = self.controlnet.nets[0].config.guidance_embeds
921
+ else:
922
+ use_guidance = self.controlnet.config.guidance_embeds
923
+
924
+ guidance = torch.tensor([guidance_scale], device=device) if use_guidance else None
925
+ guidance = guidance.expand(latents.shape[0]) if guidance is not None else None
926
+
927
+ if isinstance(controlnet_keep[i], list):
928
+ cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])]
929
+ else:
930
+ controlnet_cond_scale = controlnet_conditioning_scale
931
+ if isinstance(controlnet_cond_scale, list):
932
+ controlnet_cond_scale = controlnet_cond_scale[0]
933
+ cond_scale = controlnet_cond_scale * controlnet_keep[i]
934
+
935
+ controlnet_block_samples, controlnet_single_block_samples = self.controlnet(
936
+ hidden_states=latents,
937
+ controlnet_cond=control_image,
938
+ controlnet_mode=control_mode,
939
+ conditioning_scale=cond_scale,
940
+ timestep=timestep / 1000,
941
+ guidance=guidance,
942
+ pooled_projections=pooled_prompt_embeds,
943
+ encoder_hidden_states=prompt_embeds,
944
+ txt_ids=text_ids,
945
+ img_ids=latent_image_ids,
946
+ joint_attention_kwargs=self.joint_attention_kwargs,
947
+ return_dict=False,
948
+ )
949
+
950
+ guidance = (
951
+ torch.tensor([guidance_scale], device=device) if self.transformer.config.guidance_embeds else None
952
+ )
953
+ guidance = guidance.expand(latents.shape[0]) if guidance is not None else None
954
+
955
+ noise_pred = self.transformer(
956
+ hidden_states=latents,
957
+ timestep=timestep / 1000,
958
+ guidance=guidance,
959
+ pooled_projections=pooled_prompt_embeds,
960
+ encoder_hidden_states=prompt_embeds,
961
+ controlnet_block_samples=controlnet_block_samples,
962
+ controlnet_single_block_samples=controlnet_single_block_samples,
963
+ txt_ids=text_ids,
964
+ img_ids=latent_image_ids,
965
+ joint_attention_kwargs=self.joint_attention_kwargs,
966
+ return_dict=False,
967
+ )[0]
968
+
969
+ latents_dtype = latents.dtype
970
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
971
+
972
+ if latents.dtype != latents_dtype:
973
+ if torch.backends.mps.is_available():
974
+ latents = latents.to(latents_dtype)
975
+
976
+ if callback_on_step_end is not None:
977
+ callback_kwargs = {}
978
+ for k in callback_on_step_end_tensor_inputs:
979
+ callback_kwargs[k] = locals()[k]
980
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
981
+
982
+ latents = callback_outputs.pop("latents", latents)
983
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
984
+
985
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
986
+ progress_bar.update()
987
+
988
+ if XLA_AVAILABLE:
989
+ xm.mark_step()
990
+
991
+ if output_type == "latent":
992
+ image = latents
993
+ else:
994
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
995
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
996
+ image = self.vae.decode(latents, return_dict=False)[0]
997
+ image = self.image_processor.postprocess(image, output_type=output_type)
998
+
999
+ self.maybe_free_model_hooks()
1000
+
1001
+ if not return_dict:
1002
+ return (image,)
1003
+
1004
+ return FluxPipelineOutput(images=image)
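A minimal usage sketch for the custom ControlNet image-to-image pipeline above (the class name, import path and checkpoints are illustrative assumptions, not part of this commit):
```py
import torch
from diffusers import FluxControlNetModel
from diffusers.utils import load_image

# Assumed import path / class name for the pipeline defined in this file.
from pipeline.custom_pipelines.pipeline_flux_controlnet_image_to_image import FluxControlNetImg2ImgPipeline

# Any Flux-compatible ControlNet checkpoint can be used; this union ControlNet is only an example.
controlnet = FluxControlNetModel.from_pretrained(
    "InstantX/FLUX.1-dev-Controlnet-Union", torch_dtype=torch.bfloat16
)
pipe = FluxControlNetImg2ImgPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
).to("cuda")

init_image = load_image(
    "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
).resize((1024, 1024))
control_image = init_image  # in practice, a canny/depth/pose map matching the chosen control_mode

image = pipe(
    prompt="a fantasy landscape, detailed, 8k",
    image=init_image,
    control_image=control_image,
    control_mode=0,  # only meaningful for union-style ControlNets
    strength=0.7,
    num_inference_steps=28,
    guidance_scale=3.5,
).images[0]
image.save("flux_controlnet_img2img.png")
```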
pipeline/custom_pipelines/pipeline_flux_img2img.py ADDED
@@ -0,0 +1,862 @@
1
+
2
+ # copied from diffusers/src/diffusers/pipelines/flux/pipeline_flux_img2img.py
3
+
4
+ # Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import inspect
19
+ from typing import Any, Callable, Dict, List, Optional, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
24
+
25
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
26
+ from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
27
+ from diffusers.models.autoencoders import AutoencoderKL
28
+ from diffusers.models.transformers import FluxTransformer2DModel
29
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
30
+ from diffusers.utils import (
31
+ USE_PEFT_BACKEND,
32
+ is_torch_xla_available,
33
+ logging,
34
+ replace_example_docstring,
35
+ scale_lora_layers,
36
+ unscale_lora_layers,
37
+ )
38
+ from diffusers.utils.torch_utils import randn_tensor
39
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
40
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
41
+
42
+
43
+ if is_torch_xla_available():
44
+ import torch_xla.core.xla_model as xm
45
+
46
+ XLA_AVAILABLE = True
47
+ else:
48
+ XLA_AVAILABLE = False
49
+
50
+
51
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
52
+
53
+ EXAMPLE_DOC_STRING = """
54
+ Examples:
55
+ ```py
56
+ >>> import torch
57
+
58
+ >>> from diffusers import FluxImg2ImgPipeline
59
+ >>> from diffusers.utils import load_image
60
+
61
+ >>> device = "cuda"
62
+ >>> pipe = FluxImg2ImgPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
63
+ >>> pipe = pipe.to(device)
64
+
65
+ >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
66
+ >>> init_image = load_image(url).resize((1024, 1024))
67
+
68
+ >>> prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
69
+
70
+ >>> images = pipe(
71
+ ... prompt=prompt, image=init_image, num_inference_steps=4, strength=0.95, guidance_scale=0.0
72
+ ... ).images[0]
73
+ ```
74
+ """
75
+
76
+
77
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
78
+ def calculate_shift(
79
+ image_seq_len,
80
+ base_seq_len: int = 256,
81
+ max_seq_len: int = 4096,
82
+ base_shift: float = 0.5,
83
+ max_shift: float = 1.16,
84
+ ):
85
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
86
+ b = base_shift - m * base_seq_len
87
+ mu = image_seq_len * m + b
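+ # Linear interpolation of the timestep shift: with the defaults, mu goes from base_shift=0.5 at 256 image tokens up to max_shift=1.16 at 4096 tokens (roughly a 1024x1024 image).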
88
+ return mu
89
+
90
+
91
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
92
+ def retrieve_latents(
93
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
94
+ ):
95
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
96
+ return encoder_output.latent_dist.sample(generator)
97
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
98
+ return encoder_output.latent_dist.mode()
99
+ elif hasattr(encoder_output, "latents"):
100
+ return encoder_output.latents
101
+ else:
102
+ raise AttributeError("Could not access latents of provided encoder_output")
103
+
104
+
105
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
106
+ def retrieve_timesteps(
107
+ scheduler,
108
+ num_inference_steps: Optional[int] = None,
109
+ device: Optional[Union[str, torch.device]] = None,
110
+ timesteps: Optional[List[int]] = None,
111
+ sigmas: Optional[List[float]] = None,
112
+ **kwargs,
113
+ ):
114
+ r"""
115
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
116
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
117
+
118
+ Args:
119
+ scheduler (`SchedulerMixin`):
120
+ The scheduler to get timesteps from.
121
+ num_inference_steps (`int`):
122
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
123
+ must be `None`.
124
+ device (`str` or `torch.device`, *optional*):
125
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
126
+ timesteps (`List[int]`, *optional*):
127
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
128
+ `num_inference_steps` and `sigmas` must be `None`.
129
+ sigmas (`List[float]`, *optional*):
130
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
131
+ `num_inference_steps` and `timesteps` must be `None`.
132
+
133
+ Returns:
134
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
135
+ second element is the number of inference steps.
136
+ """
137
+ if timesteps is not None and sigmas is not None:
138
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
139
+ if timesteps is not None:
140
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
141
+ if not accepts_timesteps:
142
+ raise ValueError(
143
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
144
+ f" timestep schedules. Please check whether you are using the correct scheduler."
145
+ )
146
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
147
+ timesteps = scheduler.timesteps
148
+ num_inference_steps = len(timesteps)
149
+ else:
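+ # Deviation from the upstream diffusers helper: schedulers that do not accept sigmas fall back to their default spacing below instead of raising.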
150
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
151
+ if accept_sigmas:
152
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
153
+ # if not accept_sigmas:
154
+ # raise ValueError(
155
+ # f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
156
+ # f" sigmas schedules. Please check whether you are using the correct scheduler."
157
+ # )
158
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
159
+ timesteps = scheduler.timesteps
160
+ num_inference_steps = len(timesteps)
161
+ else:
162
+ scheduler.set_timesteps(num_inference_steps, device=device)  # (**kwargs not forwarded in this branch)
163
+ timesteps = scheduler.timesteps
164
+
165
+ return timesteps, num_inference_steps
166
+
167
+
168
+ class FluxImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
169
+ r"""
170
+ The Flux pipeline for image-to-image generation.
171
+
172
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
173
+
174
+ Args:
175
+ transformer ([`FluxTransformer2DModel`]):
176
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
177
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
178
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
179
+ vae ([`AutoencoderKL`]):
180
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
181
+ text_encoder ([`CLIPTextModel`]):
182
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
183
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
184
+ text_encoder_2 ([`T5EncoderModel`]):
185
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
186
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
187
+ tokenizer (`CLIPTokenizer`):
188
+ Tokenizer of class
189
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
190
+ tokenizer_2 (`T5TokenizerFast`):
191
+ Second Tokenizer of class
192
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
193
+ """
194
+
195
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
196
+ _optional_components = []
197
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
198
+
199
+ def __init__(
200
+ self,
201
+ scheduler: FlowMatchEulerDiscreteScheduler,
202
+ vae: AutoencoderKL,
203
+ text_encoder: CLIPTextModel,
204
+ tokenizer: CLIPTokenizer,
205
+ text_encoder_2: T5EncoderModel,
206
+ tokenizer_2: T5TokenizerFast,
207
+ transformer: FluxTransformer2DModel,
208
+ ):
209
+ super().__init__()
210
+
211
+ self.register_modules(
212
+ vae=vae,
213
+ text_encoder=text_encoder,
214
+ text_encoder_2=text_encoder_2,
215
+ tokenizer=tokenizer,
216
+ tokenizer_2=tokenizer_2,
217
+ transformer=transformer,
218
+ scheduler=scheduler,
219
+ )
220
+ self.vae_scale_factor = (
221
+ 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
222
+ )
223
+ # Flux latents are turned into 2x2 patches and packed. This means the latent width and height have to be divisible
224
+ # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
225
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
226
+ self.tokenizer_max_length = (
227
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
228
+ )
229
+ self.default_sample_size = 128
230
+
231
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
232
+ def _get_t5_prompt_embeds(
233
+ self,
234
+ prompt: Union[str, List[str]] = None,
235
+ num_images_per_prompt: int = 1,
236
+ max_sequence_length: int = 512,
237
+ device: Optional[torch.device] = None,
238
+ dtype: Optional[torch.dtype] = None,
239
+ ):
240
+ device = device or self._execution_device
241
+ dtype = dtype or self.text_encoder.dtype
242
+
243
+ prompt = [prompt] if isinstance(prompt, str) else prompt
244
+ batch_size = len(prompt)
245
+
246
+ if isinstance(self, TextualInversionLoaderMixin):
247
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
248
+
249
+ text_inputs = self.tokenizer_2(
250
+ prompt,
251
+ padding="max_length",
252
+ max_length=max_sequence_length,
253
+ truncation=True,
254
+ return_length=False,
255
+ return_overflowing_tokens=False,
256
+ return_tensors="pt",
257
+ )
258
+ text_input_ids = text_inputs.input_ids
259
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
260
+
261
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
262
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
263
+ logger.warning(
264
+ "The following part of your input was truncated because `max_sequence_length` is set to "
265
+ f" {max_sequence_length} tokens: {removed_text}"
266
+ )
267
+
268
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
269
+
270
+ dtype = self.text_encoder_2.dtype
271
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
272
+
273
+ _, seq_len, _ = prompt_embeds.shape
274
+
275
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
276
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
277
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
278
+
279
+ return prompt_embeds
280
+
281
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
282
+ def _get_clip_prompt_embeds(
283
+ self,
284
+ prompt: Union[str, List[str]],
285
+ num_images_per_prompt: int = 1,
286
+ device: Optional[torch.device] = None,
287
+ ):
288
+ device = device or self._execution_device
289
+
290
+ prompt = [prompt] if isinstance(prompt, str) else prompt
291
+ batch_size = len(prompt)
292
+
293
+ if isinstance(self, TextualInversionLoaderMixin):
294
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
295
+
296
+ text_inputs = self.tokenizer(
297
+ prompt,
298
+ padding="max_length",
299
+ max_length=self.tokenizer_max_length,
300
+ truncation=True,
301
+ return_overflowing_tokens=False,
302
+ return_length=False,
303
+ return_tensors="pt",
304
+ )
305
+
306
+ text_input_ids = text_inputs.input_ids
307
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
308
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
309
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
310
+ logger.warning(
311
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
312
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
313
+ )
314
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
315
+
316
+ # Use pooled output of CLIPTextModel
317
+ prompt_embeds = prompt_embeds.pooler_output
318
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
319
+
320
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
321
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
322
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
323
+
324
+ return prompt_embeds
325
+
326
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
327
+ def encode_prompt(
328
+ self,
329
+ prompt: Union[str, List[str]],
330
+ prompt_2: Union[str, List[str]],
331
+ device: Optional[torch.device] = None,
332
+ num_images_per_prompt: int = 1,
333
+ prompt_embeds: Optional[torch.FloatTensor] = None,
334
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
335
+ max_sequence_length: int = 512,
336
+ lora_scale: Optional[float] = None,
337
+ ):
338
+ r"""
339
+
340
+ Args:
341
+ prompt (`str` or `List[str]`, *optional*):
342
+ prompt to be encoded
343
+ prompt_2 (`str` or `List[str]`, *optional*):
344
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
345
+ used in all text-encoders
346
+ device: (`torch.device`):
347
+ torch device
348
+ num_images_per_prompt (`int`):
349
+ number of images that should be generated per prompt
350
+ prompt_embeds (`torch.FloatTensor`, *optional*):
351
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
352
+ provided, text embeddings will be generated from `prompt` input argument.
353
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
354
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
355
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
356
+ lora_scale (`float`, *optional*):
357
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
358
+ """
359
+ device = device or self._execution_device
360
+
361
+ # set lora scale so that monkey patched LoRA
362
+ # function of text encoder can correctly access it
363
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
364
+ self._lora_scale = lora_scale
365
+
366
+ # dynamically adjust the LoRA scale
367
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
368
+ scale_lora_layers(self.text_encoder, lora_scale)
369
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
370
+ scale_lora_layers(self.text_encoder_2, lora_scale)
371
+
372
+ prompt = [prompt] if isinstance(prompt, str) else prompt
373
+
374
+ if prompt_embeds is None:
375
+ prompt_2 = prompt_2 or prompt
376
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
377
+
378
+ # We only use the pooled prompt output from the CLIPTextModel
379
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
380
+ prompt=prompt,
381
+ device=device,
382
+ num_images_per_prompt=num_images_per_prompt,
383
+ )
384
+ prompt_embeds = self._get_t5_prompt_embeds(
385
+ prompt=prompt_2,
386
+ num_images_per_prompt=num_images_per_prompt,
387
+ max_sequence_length=max_sequence_length,
388
+ device=device,
389
+ )
390
+
391
+ if self.text_encoder is not None:
392
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
393
+ # Retrieve the original scale by scaling back the LoRA layers
394
+ unscale_lora_layers(self.text_encoder, lora_scale)
395
+
396
+ if self.text_encoder_2 is not None:
397
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
398
+ # Retrieve the original scale by scaling back the LoRA layers
399
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
400
+
401
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
402
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
403
+
404
+ return prompt_embeds, pooled_prompt_embeds, text_ids
405
+
406
+ # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
407
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
408
+ if isinstance(generator, list):
409
+ image_latents = [
410
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
411
+ for i in range(image.shape[0])
412
+ ]
413
+ image_latents = torch.cat(image_latents, dim=0)
414
+ else:
415
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
416
+
417
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
418
+
419
+ return image_latents
420
+
421
+ # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
422
+ def get_timesteps(self, num_inference_steps, strength, device):
423
+ # get the original timestep using init_timestep
424
+ init_timestep = min(num_inference_steps * strength, num_inference_steps)
425
+
426
+ t_start = int(max(num_inference_steps - init_timestep, 0))
427
+ timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
428
+ if hasattr(self.scheduler, "set_begin_index"):
429
+ self.scheduler.set_begin_index(t_start * self.scheduler.order)
430
+
431
+ return timesteps, num_inference_steps - t_start
432
+
433
+ def check_inputs(
434
+ self,
435
+ prompt,
436
+ prompt_2,
437
+ strength,
438
+ height,
439
+ width,
440
+ prompt_embeds=None,
441
+ pooled_prompt_embeds=None,
442
+ callback_on_step_end_tensor_inputs=None,
443
+ max_sequence_length=None,
444
+ ):
445
+ if strength < 0 or strength > 1:
446
+ raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")
447
+
448
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
449
+ logger.warning(
450
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
451
+ )
452
+
453
+ if callback_on_step_end_tensor_inputs is not None and not all(
454
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
455
+ ):
456
+ raise ValueError(
457
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
458
+ )
459
+
460
+ if prompt is not None and prompt_embeds is not None:
461
+ raise ValueError(
462
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
463
+ " only forward one of the two."
464
+ )
465
+ elif prompt_2 is not None and prompt_embeds is not None:
466
+ raise ValueError(
467
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
468
+ " only forward one of the two."
469
+ )
470
+ elif prompt is None and prompt_embeds is None:
471
+ raise ValueError(
472
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
473
+ )
474
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
475
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
476
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
477
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
478
+
479
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
480
+ raise ValueError(
481
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
482
+ )
483
+
484
+ if max_sequence_length is not None and max_sequence_length > 512:
485
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
486
+
487
+ @staticmethod
488
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
489
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
490
+ latent_image_ids = torch.zeros(height, width, 3)
491
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
492
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
493
+
494
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
495
+
496
+ latent_image_ids = latent_image_ids.reshape(
497
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
498
+ )
499
+
500
+ return latent_image_ids.to(device=device, dtype=dtype)
501
+
502
+ @staticmethod
503
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
504
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
505
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
506
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
507
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
508
+
509
+ return latents
510
+
511
+ @staticmethod
512
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
513
+ def _unpack_latents(latents, height, width, vae_scale_factor):
514
+ batch_size, num_patches, channels = latents.shape
515
+
516
+ # VAE applies 8x compression on images but we must also account for packing which requires
517
+ # latent height and width to be divisible by 2.
518
+ height = 2 * (int(height) // (vae_scale_factor * 2))
519
+ width = 2 * (int(width) // (vae_scale_factor * 2))
520
+
521
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
522
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
523
+
524
+ latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
525
+
526
+ return latents
527
+
528
+ def prepare_latents(
529
+ self,
530
+ image,
531
+ timestep,
532
+ batch_size,
533
+ num_channels_latents,
534
+ height,
535
+ width,
536
+ dtype,
537
+ device,
538
+ generator,
539
+ latents=None,
540
+ ):
541
+ if isinstance(generator, list) and len(generator) != batch_size:
542
+ raise ValueError(
543
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
544
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
545
+ )
546
+
547
+ # VAE applies 8x compression on images but we must also account for packing which requires
548
+ # latent height and width to be divisible by 2.
549
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
550
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
551
+ shape = (batch_size, num_channels_latents, height, width)
552
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
553
+
554
+ if latents is not None:
555
+ return latents.to(device=device, dtype=dtype), latent_image_ids
556
+
557
+ image = image.to(device=device, dtype=dtype)
558
+ image_latents = self._encode_vae_image(image=image, generator=generator)
559
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
560
+ # expand init_latents for batch_size
561
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
562
+ image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
563
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
564
+ raise ValueError(
565
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
566
+ )
567
+ else:
568
+ image_latents = torch.cat([image_latents], dim=0)
569
+
570
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
571
+ latents = self.scheduler.scale_noise(image_latents, timestep, noise)
572
+ latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
573
+ return latents, latent_image_ids
574
+
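The `scale_noise` call is what realizes `strength`: the encoded image is blended with fresh noise at the starting sigma. A hedged sketch of that interpolation, assuming the scheduler uses the standard flow-matching linear blend (the helper name and signature below are illustrative, not taken from this file):

```py
import torch

def scale_noise_sketch(sample: torch.Tensor, sigma: float, noise: torch.Tensor) -> torch.Tensor:
    # sigma == 1.0: pure noise, so strength 1 effectively ignores the init image;
    # sigma == 0.0: the encoded image passes through unchanged.
    return sigma * noise + (1.0 - sigma) * sample

image_latents = torch.randn(1, 16, 128, 128)
noise = torch.randn_like(image_latents)
noisy = scale_noise_sketch(image_latents, sigma=0.6, noise=noise)
print(noisy.shape)  # torch.Size([1, 16, 128, 128])
```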
575
+ @property
576
+ def guidance_scale(self):
577
+ return self._guidance_scale
578
+
579
+ @property
580
+ def joint_attention_kwargs(self):
581
+ return self._joint_attention_kwargs
582
+
583
+ @property
584
+ def num_timesteps(self):
585
+ return self._num_timesteps
586
+
587
+ @property
588
+ def interrupt(self):
589
+ return self._interrupt
590
+
591
+ @torch.no_grad()
592
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
593
+ def __call__(
594
+ self,
595
+ prompt: Union[str, List[str]] = None,
596
+ prompt_2: Optional[Union[str, List[str]]] = None,
597
+ image: PipelineImageInput = None,
598
+ height: Optional[int] = None,
599
+ width: Optional[int] = None,
600
+ strength: float = 0.6,
601
+ num_inference_steps: int = 28,
602
+ sigmas: Optional[List[float]] = None,
603
+ guidance_scale: float = 7.0,
604
+ num_images_per_prompt: Optional[int] = 1,
605
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
606
+ latents: Optional[torch.FloatTensor] = None,
607
+ prompt_embeds: Optional[torch.FloatTensor] = None,
608
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
609
+ output_type: Optional[str] = "pil",
610
+ return_dict: bool = True,
611
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
612
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
613
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
614
+ max_sequence_length: int = 512,
615
+ ):
616
+ r"""
617
+ Function invoked when calling the pipeline for generation.
618
+
619
+ Args:
620
+ prompt (`str` or `List[str]`, *optional*):
621
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
622
+ instead.
623
+ prompt_2 (`str` or `List[str]`, *optional*):
624
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
625
+ will be used instead.
626
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
627
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
628
+ numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list
629
+ of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
630
+ list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image
631
+ latents as `image`, but if passing latents directly it is not encoded again.
632
+ height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
633
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
634
+ width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
635
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
636
+ strength (`float`, *optional*, defaults to 0.6):
637
+ Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
638
+ starting point and more noise is added the higher the `strength`. The number of denoising steps depends
639
+ on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
640
+ process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
641
+ essentially ignores `image`.
642
+ num_inference_steps (`int`, *optional*, defaults to 28):
643
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
644
+ expense of slower inference.
645
+ sigmas (`List[float]`, *optional*):
646
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
647
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
648
+ will be used.
649
+ guidance_scale (`float`, *optional*, defaults to 7.0):
650
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
651
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
652
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
653
+ 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
654
+ usually at the expense of lower image quality.
655
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
656
+ The number of images to generate per prompt.
657
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
658
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
659
+ to make generation deterministic.
660
+ latents (`torch.FloatTensor`, *optional*):
661
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
662
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
663
+ tensor will be generated by sampling using the supplied random `generator`.
664
+ prompt_embeds (`torch.FloatTensor`, *optional*):
665
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
666
+ provided, text embeddings will be generated from `prompt` input argument.
667
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
668
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
669
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
670
+ output_type (`str`, *optional*, defaults to `"pil"`):
671
+ The output format of the generated image. Choose between
672
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
673
+ return_dict (`bool`, *optional*, defaults to `True`):
674
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
675
+ joint_attention_kwargs (`dict`, *optional*):
676
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
677
+ `self.processor` in
678
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
679
+ callback_on_step_end (`Callable`, *optional*):
680
+ A function that is called at the end of each denoising step during inference. The function is called
681
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
682
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
683
+ `callback_on_step_end_tensor_inputs`.
684
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
685
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
686
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
687
+ `._callback_tensor_inputs` attribute of your pipeline class.
688
+ max_sequence_length (`int`, *optional*, defaults to 512): Maximum sequence length to use with the `prompt`.
689
+
690
+ Examples:
691
+
692
+ Returns:
693
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
694
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
695
+ images.
696
+ """
697
+
698
+ height = height or self.default_sample_size * self.vae_scale_factor
699
+ width = width or self.default_sample_size * self.vae_scale_factor
700
+
701
+ # 1. Check inputs. Raise error if not correct
702
+ self.check_inputs(
703
+ prompt,
704
+ prompt_2,
705
+ strength,
706
+ height,
707
+ width,
708
+ prompt_embeds=prompt_embeds,
709
+ pooled_prompt_embeds=pooled_prompt_embeds,
710
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
711
+ max_sequence_length=max_sequence_length,
712
+ )
713
+
714
+ self._guidance_scale = guidance_scale
715
+ self._joint_attention_kwargs = joint_attention_kwargs
716
+ self._interrupt = False
717
+
718
+ # 2. Preprocess image
719
+ init_image = self.image_processor.preprocess(image, height=height, width=width)
720
+ init_image = init_image.to(dtype=torch.float32)
721
+
722
+ # 3. Define call parameters
723
+ if prompt is not None and isinstance(prompt, str):
724
+ batch_size = 1
725
+ elif prompt is not None and isinstance(prompt, list):
726
+ batch_size = len(prompt)
727
+ else:
728
+ batch_size = prompt_embeds.shape[0]
729
+
730
+ device = self._execution_device
731
+
732
+ lora_scale = (
733
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
734
+ )
735
+ (
736
+ prompt_embeds,
737
+ pooled_prompt_embeds,
738
+ text_ids,
739
+ ) = self.encode_prompt(
740
+ prompt=prompt,
741
+ prompt_2=prompt_2,
742
+ prompt_embeds=prompt_embeds,
743
+ pooled_prompt_embeds=pooled_prompt_embeds,
744
+ device=device,
745
+ num_images_per_prompt=num_images_per_prompt,
746
+ max_sequence_length=max_sequence_length,
747
+ lora_scale=lora_scale,
748
+ )
749
+
750
+ # 4.Prepare timesteps
751
+ image_seq_len = (int(height) // self.vae_scale_factor // 2) * (int(width) // self.vae_scale_factor // 2)
752
+ mu = calculate_shift(
753
+ image_seq_len,
754
+ self.scheduler.config.base_image_seq_len,
755
+ self.scheduler.config.max_image_seq_len,
756
+ self.scheduler.config.base_shift,
757
+ self.scheduler.config.max_shift,
758
+ )
759
+ # sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
760
+ timesteps, num_inference_steps = retrieve_timesteps(
761
+ self.scheduler,
762
+ num_inference_steps,
763
+ device,
764
+ sigmas=sigmas,
765
+ mu=mu,
766
+ )
767
+ timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
768
+
769
+ if num_inference_steps < 1:
770
+ raise ValueError(
771
+ f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline"
772
+ f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline."
773
+ )
774
+ latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
775
+
776
+ # 5. Prepare latent variables
777
+ num_channels_latents = self.transformer.config.in_channels // 4
778
+
779
+ latents, latent_image_ids = self.prepare_latents(
780
+ init_image,
781
+ latent_timestep,
782
+ batch_size * num_images_per_prompt,
783
+ num_channels_latents,
784
+ height,
785
+ width,
786
+ prompt_embeds.dtype,
787
+ device,
788
+ generator,
789
+ latents,
790
+ )
791
+
792
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
793
+ self._num_timesteps = len(timesteps)
794
+
795
+ # handle guidance
796
+ if self.transformer.config.guidance_embeds:
797
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
798
+ guidance = guidance.expand(latents.shape[0])
799
+ else:
800
+ guidance = None
801
+
802
+ # 6. Denoising loop
803
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
804
+ for i, t in enumerate(timesteps):
805
+ if self.interrupt:
806
+ continue
807
+
808
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
809
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
810
+ noise_pred = self.transformer(
811
+ hidden_states=latents,
812
+ timestep=timestep / 1000,
813
+ guidance=guidance,
814
+ pooled_projections=pooled_prompt_embeds,
815
+ encoder_hidden_states=prompt_embeds,
816
+ txt_ids=text_ids,
817
+ img_ids=latent_image_ids,
818
+ joint_attention_kwargs=self.joint_attention_kwargs,
819
+ return_dict=False,
820
+ )[0]
821
+
822
+ # compute the previous noisy sample x_t -> x_t-1
823
+ latents_dtype = latents.dtype
824
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
825
+
826
+ if latents.dtype != latents_dtype:
827
+ if torch.backends.mps.is_available():
828
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
829
+ latents = latents.to(latents_dtype)
830
+
831
+ if callback_on_step_end is not None:
832
+ callback_kwargs = {}
833
+ for k in callback_on_step_end_tensor_inputs:
834
+ callback_kwargs[k] = locals()[k]
835
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
836
+
837
+ latents = callback_outputs.pop("latents", latents)
838
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
839
+
840
+ # call the callback, if provided
841
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
842
+ progress_bar.update()
843
+
844
+ if XLA_AVAILABLE:
845
+ xm.mark_step()
846
+
847
+ if output_type == "latent":
848
+ image = latents
849
+
850
+ else:
851
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
852
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
853
+ image = self.vae.decode(latents, return_dict=False)[0]
854
+ image = self.image_processor.postprocess(image, output_type=output_type)
855
+
856
+ # Offload all models
857
+ self.maybe_free_model_hooks()
858
+
859
+ if not return_dict:
860
+ return (image,)
861
+
862
+ return FluxPipelineOutput(images=image)
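A minimal usage sketch for this customized img2img pipeline (the module path `pipeline.custom_pipelines` matches the import used later in `kiss3d_wrapper.py`; the checkpoint id, image URL, and parameter values are illustrative assumptions):

```py
import torch
from diffusers.utils import load_image
from pipeline.custom_pipelines import FluxImg2ImgPipeline

pipe = FluxImg2ImgPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

init_image = load_image(
    "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/style_ziggy/img5.png"
)

result = pipe(
    prompt="A grid of 2x4 multi-view image, elevation 5. White background.",
    image=init_image,
    strength=0.6,              # how far to re-noise the init image
    num_inference_steps=28,
    guidance_scale=3.5,
).images[0]
result.save("flux_img2img.png")
```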
pipeline/custom_pipelines/pipeline_flux_prior_redux.py ADDED
@@ -0,0 +1,500 @@
1
+ # copied from diffusers/src/diffusers/pipelines/flux/pipeline_flux_prior_redux.py
2
+
3
+ # Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+
18
+ from typing import List, Optional, Union
19
+
20
+ import torch
21
+ from PIL import Image
22
+ from transformers import (
23
+ CLIPTextModel,
24
+ CLIPTokenizer,
25
+ SiglipImageProcessor,
26
+ SiglipVisionModel,
27
+ T5EncoderModel,
28
+ T5TokenizerFast,
29
+ )
30
+
31
+ from diffusers.image_processor import PipelineImageInput
32
+ from diffusers.loaders import FluxLoraLoaderMixin, TextualInversionLoaderMixin
33
+ from diffusers.utils import (
34
+ USE_PEFT_BACKEND,
35
+ is_torch_xla_available,
36
+ logging,
37
+ replace_example_docstring,
38
+ scale_lora_layers,
39
+ unscale_lora_layers,
40
+ )
41
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
42
+ from diffusers.pipelines.flux.modeling_flux import ReduxImageEncoder
43
+ from diffusers.pipelines.flux.pipeline_output import FluxPriorReduxPipelineOutput
44
+
45
+
46
+ if is_torch_xla_available():
47
+ XLA_AVAILABLE = True
48
+ else:
49
+ XLA_AVAILABLE = False
50
+
51
+
52
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
53
+
54
+ EXAMPLE_DOC_STRING = """
55
+ Examples:
56
+ ```py
57
+ >>> import torch
58
+ >>> from diffusers import FluxPriorReduxPipeline, FluxPipeline
59
+ >>> from diffusers.utils import load_image
60
+
61
+ >>> device = "cuda"
62
+ >>> dtype = torch.bfloat16
63
+
64
+ >>> repo_redux = "black-forest-labs/FLUX.1-Redux-dev"
65
+ >>> repo_base = "black-forest-labs/FLUX.1-dev"
66
+ >>> pipe_prior_redux = FluxPriorReduxPipeline.from_pretrained(repo_redux, torch_dtype=dtype).to(device)
67
+ >>> pipe = FluxPipeline.from_pretrained(
68
+ ... repo_base, text_encoder=None, text_encoder_2=None, torch_dtype=torch.bfloat16
69
+ ... ).to(device)
70
+
71
+ >>> image = load_image(
72
+ ... "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/style_ziggy/img5.png"
73
+ ... )
74
+ >>> pipe_prior_output = pipe_prior_redux(image)
75
+ >>> images = pipe(
76
+ ... guidance_scale=2.5,
77
+ ... num_inference_steps=50,
78
+ ... generator=torch.Generator("cpu").manual_seed(0),
79
+ ... **pipe_prior_output,
80
+ ... ).images
81
+ >>> images[0].save("flux-redux.png")
82
+ ```
83
+ """
84
+
85
+
86
+ class FluxPriorReduxPipeline(DiffusionPipeline):
87
+ r"""
88
+ The Flux Redux pipeline for image-to-image generation.
89
+
90
+ Reference: https://blackforestlabs.ai/flux-1-tools/
91
+
92
+ Args:
93
+ image_encoder ([`SiglipVisionModel`]):
94
+ SIGLIP vision model to encode the input image.
95
+ feature_extractor ([`SiglipImageProcessor`]):
96
+ Image processor for preprocessing images for the SIGLIP model.
97
+ image_embedder ([`ReduxImageEncoder`]):
98
+ Redux image encoder to process the SIGLIP embeddings.
99
+ text_encoder ([`CLIPTextModel`], *optional*):
100
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
101
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
102
+ text_encoder_2 ([`T5EncoderModel`], *optional*):
103
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
104
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
105
+ tokenizer (`CLIPTokenizer`, *optional*):
106
+ Tokenizer of class
107
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
108
+ tokenizer_2 (`T5TokenizerFast`, *optional*):
109
+ Second Tokenizer of class
110
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
111
+ """
112
+
113
+ model_cpu_offload_seq = "image_encoder->image_embedder"
114
+ _optional_components = [
115
+ "text_encoder",
116
+ "tokenizer",
117
+ "text_encoder_2",
118
+ "tokenizer_2",
119
+ ]
120
+ _callback_tensor_inputs = []
121
+
122
+ def __init__(
123
+ self,
124
+ image_encoder: SiglipVisionModel,
125
+ feature_extractor: SiglipImageProcessor,
126
+ image_embedder: ReduxImageEncoder,
127
+ text_encoder: CLIPTextModel = None,
128
+ tokenizer: CLIPTokenizer = None,
129
+ text_encoder_2: T5EncoderModel = None,
130
+ tokenizer_2: T5TokenizerFast = None,
131
+ ):
132
+ super().__init__()
133
+
134
+ self.register_modules(
135
+ image_encoder=image_encoder,
136
+ feature_extractor=feature_extractor,
137
+ image_embedder=image_embedder,
138
+ text_encoder=text_encoder,
139
+ tokenizer=tokenizer,
140
+ text_encoder_2=text_encoder_2,
141
+ tokenizer_2=tokenizer_2,
142
+ )
143
+ self.tokenizer_max_length = (
144
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
145
+ )
146
+
147
+ def check_inputs(
148
+ self,
149
+ image,
150
+ prompt,
151
+ prompt_2,
152
+ prompt_embeds=None,
153
+ pooled_prompt_embeds=None,
154
+ prompt_embeds_scale=1.0,
155
+ pooled_prompt_embeds_scale=1.0,
156
+ ):
157
+ if prompt is not None and prompt_embeds is not None:
158
+ raise ValueError(
159
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
160
+ " only forward one of the two."
161
+ )
162
+ elif prompt_2 is not None and prompt_embeds is not None:
163
+ raise ValueError(
164
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
165
+ " only forward one of the two."
166
+ )
167
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
168
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
169
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
170
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
171
+ if prompt is not None and (isinstance(prompt, list) and isinstance(image, list) and len(prompt) != len(image)):
172
+ raise ValueError(
173
+ f"number of prompts must be equal to number of images, but {len(prompt)} prompts were provided and {len(image)} images"
174
+ )
175
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
176
+ raise ValueError(
177
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
178
+ )
179
+ if isinstance(prompt_embeds_scale, list) and (
180
+ isinstance(image, list) and len(prompt_embeds_scale) != len(image)
181
+ ):
182
+ raise ValueError(
183
+ f"number of weights must be equal to number of images, but {len(prompt_embeds_scale)} weights were provided and {len(image)} images"
184
+ )
185
+
186
+ def encode_image(self, image, device, num_images_per_prompt):
187
+ dtype = next(self.image_encoder.parameters()).dtype
188
+ image = self.feature_extractor.preprocess(
189
+ images=image, do_resize=True, return_tensors="pt", do_convert_rgb=True
190
+ )
191
+ image = image.to(device=device, dtype=dtype)
192
+
193
+ image_enc_hidden_states = self.image_encoder(**image).last_hidden_state
194
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
195
+
196
+ return image_enc_hidden_states
197
+
198
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
199
+ def _get_t5_prompt_embeds(
200
+ self,
201
+ prompt: Union[str, List[str]] = None,
202
+ num_images_per_prompt: int = 1,
203
+ max_sequence_length: int = 512,
204
+ device: Optional[torch.device] = None,
205
+ dtype: Optional[torch.dtype] = None,
206
+ ):
207
+ device = device or self._execution_device
208
+ dtype = dtype or self.text_encoder.dtype
209
+
210
+ prompt = [prompt] if isinstance(prompt, str) else prompt
211
+ batch_size = len(prompt)
212
+
213
+ if isinstance(self, TextualInversionLoaderMixin):
214
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
215
+
216
+ text_inputs = self.tokenizer_2(
217
+ prompt,
218
+ padding="max_length",
219
+ max_length=max_sequence_length,
220
+ truncation=True,
221
+ return_length=False,
222
+ return_overflowing_tokens=False,
223
+ return_tensors="pt",
224
+ )
225
+ text_input_ids = text_inputs.input_ids
226
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
227
+
228
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
229
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
230
+ logger.warning(
231
+ "The following part of your input was truncated because `max_sequence_length` is set to "
232
+ f" {max_sequence_length} tokens: {removed_text}"
233
+ )
234
+
235
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
236
+
237
+ dtype = self.text_encoder_2.dtype
238
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
239
+
240
+ _, seq_len, _ = prompt_embeds.shape
241
+
242
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
243
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
244
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
245
+
246
+ return prompt_embeds
247
+
248
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
249
+ def _get_clip_prompt_embeds(
250
+ self,
251
+ prompt: Union[str, List[str]],
252
+ num_images_per_prompt: int = 1,
253
+ device: Optional[torch.device] = None,
254
+ ):
255
+ device = device or self._execution_device
256
+
257
+ prompt = [prompt] if isinstance(prompt, str) else prompt
258
+ batch_size = len(prompt)
259
+
260
+ if isinstance(self, TextualInversionLoaderMixin):
261
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
262
+
263
+ text_inputs = self.tokenizer(
264
+ prompt,
265
+ padding="max_length",
266
+ max_length=self.tokenizer_max_length,
267
+ truncation=True,
268
+ return_overflowing_tokens=False,
269
+ return_length=False,
270
+ return_tensors="pt",
271
+ )
272
+
273
+ text_input_ids = text_inputs.input_ids
274
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
275
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
276
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
277
+ logger.warning(
278
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
279
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
280
+ )
281
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
282
+
283
+ # Use pooled output of CLIPTextModel
284
+ prompt_embeds = prompt_embeds.pooler_output
285
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
286
+
287
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
288
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
289
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
290
+
291
+ return prompt_embeds
292
+
293
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
294
+ def encode_prompt(
295
+ self,
296
+ prompt: Union[str, List[str]],
297
+ prompt_2: Union[str, List[str]],
298
+ device: Optional[torch.device] = None,
299
+ num_images_per_prompt: int = 1,
300
+ prompt_embeds: Optional[torch.FloatTensor] = None,
301
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
302
+ max_sequence_length: int = 512,
303
+ lora_scale: Optional[float] = None,
304
+ ):
305
+ r"""
306
+
307
+ Args:
308
+ prompt (`str` or `List[str]`, *optional*):
309
+ prompt to be encoded
310
+ prompt_2 (`str` or `List[str]`, *optional*):
311
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
312
+ used in all text-encoders
313
+ device: (`torch.device`):
314
+ torch device
315
+ num_images_per_prompt (`int`):
316
+ number of images that should be generated per prompt
317
+ prompt_embeds (`torch.FloatTensor`, *optional*):
318
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
319
+ provided, text embeddings will be generated from `prompt` input argument.
320
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
321
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
322
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
323
+ lora_scale (`float`, *optional*):
324
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
325
+ """
326
+ device = device or self._execution_device
327
+
328
+ # set lora scale so that monkey patched LoRA
329
+ # function of text encoder can correctly access it
330
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
331
+ self._lora_scale = lora_scale
332
+
333
+ # dynamically adjust the LoRA scale
334
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
335
+ scale_lora_layers(self.text_encoder, lora_scale)
336
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
337
+ scale_lora_layers(self.text_encoder_2, lora_scale)
338
+
339
+ prompt = [prompt] if isinstance(prompt, str) else prompt
340
+
341
+ if prompt_embeds is None:
342
+ prompt_2 = prompt_2 or prompt
343
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
344
+
345
+ # We only use the pooled prompt output from the CLIPTextModel
346
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
347
+ prompt=prompt,
348
+ device=device,
349
+ num_images_per_prompt=num_images_per_prompt,
350
+ )
351
+ prompt_embeds = self._get_t5_prompt_embeds(
352
+ prompt=prompt_2,
353
+ num_images_per_prompt=num_images_per_prompt,
354
+ max_sequence_length=max_sequence_length,
355
+ device=device,
356
+ )
357
+
358
+ if self.text_encoder is not None:
359
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
360
+ # Retrieve the original scale by scaling back the LoRA layers
361
+ unscale_lora_layers(self.text_encoder, lora_scale)
362
+
363
+ if self.text_encoder_2 is not None:
364
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
365
+ # Retrieve the original scale by scaling back the LoRA layers
366
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
367
+
368
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
369
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
370
+
371
+ return prompt_embeds, pooled_prompt_embeds, text_ids
372
+
373
+ @torch.no_grad()
374
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
375
+ def __call__(
376
+ self,
377
+ image: PipelineImageInput,
378
+ prompt: Union[str, List[str]] = None,
379
+ prompt_2: Optional[Union[str, List[str]]] = None,
380
+ prompt_embeds: Optional[torch.FloatTensor] = None,
381
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
382
+ prompt_embeds_scale: Optional[Union[float, List[float]]] = 1.0,
383
+ pooled_prompt_embeds_scale: Optional[Union[float, List[float]]] = 1.0,
384
+ strength: Optional[Union[float, List[float]]] = 1.0,
385
+ return_dict: bool = True,
386
+ ):
387
+ r"""
388
+ Function invoked when calling the pipeline for generation.
389
+
390
+ Args:
391
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
392
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
393
+ numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list
394
+ or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
395
+ list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`
396
+ prompt (`str` or `List[str]`, *optional*):
397
+ The prompt or prompts to guide the image generation. **experimental feature**: to use this feature,
398
+ make sure to explicitly load text encoders to the pipeline. Prompts will be ignored if text encoders
399
+ are not loaded.
400
+ prompt_2 (`str` or `List[str]`, *optional*):
401
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`.
402
+ prompt_embeds (`torch.FloatTensor`, *optional*):
403
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
404
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
405
+ Pre-generated pooled text embeddings.
406
+ return_dict (`bool`, *optional*, defaults to `True`):
407
+ Whether or not to return a [`~pipelines.flux.FluxPriorReduxPipelineOutput`] instead of a plain tuple.
408
+
409
+ Examples:
410
+
411
+ Returns:
412
+ [`~pipelines.flux.FluxPriorReduxPipelineOutput`] or `tuple`:
413
+ [`~pipelines.flux.FluxPriorReduxPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
414
+ returning a tuple, the first element is a list with the generated images.
415
+ """
416
+
417
+ # 1. Check inputs. Raise error if not correct
418
+ self.check_inputs(
419
+ image,
420
+ prompt,
421
+ prompt_2,
422
+ prompt_embeds=prompt_embeds,
423
+ pooled_prompt_embeds=pooled_prompt_embeds,
424
+ prompt_embeds_scale=prompt_embeds_scale,
425
+ pooled_prompt_embeds_scale=pooled_prompt_embeds_scale,
426
+ )
427
+
428
+ # 2. Define call parameters
429
+ if image is not None and isinstance(image, Image.Image):
430
+ batch_size = 1
431
+ elif image is not None and isinstance(image, list):
432
+ batch_size = len(image)
433
+ else:
434
+ batch_size = image.shape[0]
435
+ if prompt is not None and isinstance(prompt, str):
436
+ prompt = batch_size * [prompt]
437
+ if isinstance(prompt_embeds_scale, float):
438
+ prompt_embeds_scale = batch_size * [prompt_embeds_scale]
439
+ if isinstance(pooled_prompt_embeds_scale, float):
440
+ pooled_prompt_embeds_scale = batch_size * [pooled_prompt_embeds_scale]
441
+ if isinstance(strength, float):
442
+ strength = batch_size * [strength]
443
+
444
+ device = self._execution_device
445
+
446
+ # 3. Prepare image embeddings
447
+ image_latents = self.encode_image(image, device, 1)
448
+
449
+ image_embeds = self.image_embedder(image_latents).image_embeds
450
+ image_embeds = image_embeds.to(device=device)
451
+
452
+ # 4. Prepare (dummy) text embeddings
453
+ if hasattr(self, "text_encoder") and self.text_encoder is not None:
454
+ (
455
+ prompt_embeds,
456
+ pooled_prompt_embeds,
457
+ _,
458
+ ) = self.encode_prompt(
459
+ prompt=prompt,
460
+ prompt_2=prompt_2,
461
+ prompt_embeds=prompt_embeds,
462
+ pooled_prompt_embeds=pooled_prompt_embeds,
463
+ device=device,
464
+ num_images_per_prompt=1,
465
+ max_sequence_length=512,
466
+ lora_scale=None,
467
+ )
468
+ else:
469
+ if prompt is not None:
470
+ logger.warning(
471
+ "prompt input is ignored when text encoders are not loaded to the pipeline. "
472
+ "Make sure to explicitly load the text encoders to enable prompt input. "
473
+ )
474
+ # max_sequence_length is 512, t5 encoder hidden size is 4096
475
+ prompt_embeds = torch.zeros((batch_size, 512, 4096), device=device, dtype=image_embeds.dtype)
476
+ # pooled_prompt_embeds is 768, clip text encoder hidden size
477
+ pooled_prompt_embeds = torch.zeros((batch_size, 768), device=device, dtype=image_embeds.dtype)
478
+
479
+ # apply strength to image_embeds
480
+ image_embeds *= torch.tensor(strength, device=device, dtype=image_embeds.dtype)[:, None, None]
481
+
482
+ # scale & concatenate image and text embeddings
483
+ prompt_embeds = torch.cat([prompt_embeds, image_embeds], dim=1)
484
+
485
+ prompt_embeds *= torch.tensor(prompt_embeds_scale, device=device, dtype=image_embeds.dtype)[:, None, None]
486
+ pooled_prompt_embeds *= torch.tensor(pooled_prompt_embeds_scale, device=device, dtype=image_embeds.dtype)[
487
+ :, None
488
+ ]
489
+
490
+ # weighted sum
491
+ prompt_embeds = torch.sum(prompt_embeds, dim=0, keepdim=True)
492
+ pooled_prompt_embeds = torch.sum(pooled_prompt_embeds, dim=0, keepdim=True)
493
+
494
+ # Offload all models
495
+ self.maybe_free_model_hooks()
496
+
497
+ if not return_dict:
498
+ return (prompt_embeds, pooled_prompt_embeds)
499
+
500
+ return FluxPriorReduxPipelineOutput(prompt_embeds=prompt_embeds, pooled_prompt_embeds=pooled_prompt_embeds)
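A hedged sketch of how this Redux prior is meant to be combined with the img2img pipeline above: the prior turns a reference image into `prompt_embeds`/`pooled_prompt_embeds`, down-weighted by the `strength` argument added here, and the output is unpacked into the generation call (model ids match the example docstring; file names and parameter values are placeholders):

```py
import torch
from diffusers.utils import load_image
from pipeline.custom_pipelines import FluxPriorReduxPipeline, FluxImg2ImgPipeline

redux = FluxPriorReduxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-Redux-dev", torch_dtype=torch.bfloat16
).to("cuda")
pipe = FluxImg2ImgPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")

reference = load_image("reference.png")   # image whose appearance should be reused
init_image = load_image("init.png")       # starting point for img2img

# strength < 1.0 scales down the reference image embeddings before they are
# concatenated with the (dummy) text embeddings.
prior_out = redux(reference, strength=0.8)

result = pipe(
    image=init_image,
    strength=0.6,
    num_inference_steps=28,
    guidance_scale=3.5,
    **prior_out,
).images[0]
result.save("redux_img2img.png")
```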
pipeline/example_text_to_3d.py ADDED
@@ -0,0 +1,7 @@
1
+ from pipeline.kiss3d_wrapper import init_wrapper_from_config, run_text_to_3d, run_image_to_3d
2
+
3
+ if __name__ == "__main__":
4
+ k3d_wrapper = init_wrapper_from_config('/hpc2hdd/home/jlin695/code/github/Kiss3DGen/pipeline/pipeline_config/default.yaml')
5
+
6
+ run_text_to_3d(k3d_wrapper, prompt='A doll of a girl in Harry Potter')
7
+
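For symmetry with the text-to-3D example above, a companion sketch for the image-to-3D entry point (hedged: this assumes `run_image_to_3d` still takes the wrapper plus an input image path, as in the earlier version of `kiss3d_wrapper.py`; the image path is a placeholder):

```py
from pipeline.kiss3d_wrapper import init_wrapper_from_config, run_image_to_3d

if __name__ == "__main__":
    k3d_wrapper = init_wrapper_from_config('/hpc2hdd/home/jlin695/code/github/Kiss3DGen/pipeline/pipeline_config/default.yaml')

    # Placeholder input image; replace with a real file on disk.
    run_image_to_3d(k3d_wrapper, 'path/to/input_image.png')
```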
pipeline/kiss3d_wrapper.py CHANGED
@@ -1,7 +1,9 @@
1
  # The kiss3d pipeline wrapper for inference
2
 
3
  import os
 
4
  import numpy as np
 
5
  import torch
6
  import yaml
7
  import uuid
@@ -10,49 +12,93 @@ from einops import rearrange
10
  from PIL import Image
11
 
12
  from pipeline.utils import logger, TMP_DIR, OUT_DIR
13
- from pipeline.utils import lrm_reconstruct, isomer_reconstruct
14
 
15
  import torch
16
  import torchvision
 
17
 
18
  # for reconstruction model
19
  from omegaconf import OmegaConf
20
  from models.lrm.utils.train_util import instantiate_from_config
21
  from models.lrm.utils.render_utils import rotate_x, rotate_y
 
22
  from utils.tool import get_background
23
-
24
  # for florence2
25
- from transformers import AutoProcessor, AutoModelForCausalLM
26
-
27
- from diffusers import FluxPipeline, FluxControlNetImg2ImgPipeline, FluxImg2ImgPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
28
- from diffusers.models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
29
-
30
 
 
31
  def init_wrapper_from_config(config_path):
32
  with open(config_path, 'r') as config_file:
33
  config_ = yaml.load(config_file, yaml.FullLoader)
34
 
35
  # init flux_pipeline
36
  logger.info('==> Loading Flux model ...')
37
  flux_device = config_['flux'].get('device', 'cpu')
38
  flux_base_model_pth = config_['flux'].get('base_model', None)
 
39
  flux_controlnet_pth = config_['flux'].get('controlnet', None)
40
- flux_lora_pth = config_['flux'].get('lora', None)
41
-
42
- # load flux model and controlnet
43
- if flux_controlnet_pth is not None:
44
- flux_controlnet = FluxControlNetModel.from_pretrained(flux_controlnet_pth)
45
- flux_pipe = FluxControlNetImg2ImgPipeline.from_pretrained(flux_base_model_pth, controlnet=[flux_controlnet], \
46
- torch_dtype=torch.bfloat16)
47
  else:
48
- flux_pipe = FluxImg2ImgPipeline(flux_base_model_pth, torch_dtype=torch.bfloat16)
49
 
 
 
 
 
 
 
 
50
  # load lora weights
51
  flux_pipe.load_lora_weights(flux_lora_pth)
52
- flux_pipe.to(device=flux_device, dtype=torch.bfloat16)
53
 
54
- # TODO: load redux model
55
- # FluxPriorReduxPipeline.from_pretrained()
56
 
57
  # TODO: load pulid model
58
 
@@ -68,13 +114,15 @@ def init_wrapper_from_config(config_path):
68
  multiview_pipeline.scheduler.config, timestep_spacing='trailing'
69
  )
70
 
71
- unet_ckpt_path = config_['multiview'].get('unet', None)
 
72
  if unet_ckpt_path is not None:
73
  state_dict = torch.load(unet_ckpt_path, map_location='cpu')['state_dict']
74
  state_dict = {k[10:]: v for k, v in state_dict.items() if k.startswith('unet.unet.')}
75
  multiview_pipeline.unet.load_state_dict(state_dict, strict=True)
76
 
77
  multiview_pipeline.to(multiview_device)
 
78
 
79
  # load caption model
80
  logger.info('==> Loading caption model ...')
@@ -82,6 +130,7 @@ def init_wrapper_from_config(config_path):
82
  caption_model = AutoModelForCausalLM.from_pretrained(config_['caption']['base_model'], \
83
  torch_dtype=torch.bfloat16, trust_remote_code=True).to(caption_device)
84
  caption_processor = AutoProcessor.from_pretrained(config_['caption']['base_model'], trust_remote_code=True)
 
85
 
86
  # load reconstruction model
87
  logger.info('==> Loading reconstruction model ...')
@@ -89,40 +138,79 @@ def init_wrapper_from_config(config_path):
89
  recon_model_config = OmegaConf.load(config_['reconstruction']['model_config'])
90
  recon_model = instantiate_from_config(recon_model_config.model_config)
91
  # load recon model checkpoint
92
- state_dict = torch.load(config_['reconstruction']['base_model'], map_location='cpu')['state_dict']
 
93
  state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
94
  recon_model.load_state_dict(state_dict, strict=True)
95
  recon_model.to(recon_device)
96
  recon_model.init_flexicubes_geometry(recon_device, fovy=50.0)
97
  recon_model.eval()
98
 
99
  return kiss3d_wrapper(
100
  config = config_,
101
  flux_pipeline = flux_pipe,
 
102
  multiview_pipeline = multiview_pipeline,
103
  caption_processor = caption_processor,
104
  caption_model = caption_model,
105
  reconstruction_model_config = recon_model_config,
106
  reconstruction_model = recon_model,
 
 
107
  )
108
109
  class kiss3d_wrapper(object):
110
  def __init__(self,
111
  config: Dict,
112
  flux_pipeline: Union[FluxPipeline, FluxControlNetImg2ImgPipeline],
 
113
  multiview_pipeline: DiffusionPipeline,
114
  caption_processor: AutoProcessor,
115
  caption_model: AutoModelForCausalLM,
116
  reconstruction_model_config: Any,
117
  reconstruction_model: Any,
 
 
118
  ):
119
  self.config = config
120
  self.flux_pipeline = flux_pipeline
 
121
  self.multiview_pipeline = multiview_pipeline
122
  self.caption_model = caption_model
123
  self.caption_processor = caption_processor
124
  self.recon_model_config = reconstruction_model_config
125
- self.recon_model = reconstruction_model
 
 
 
 
 
 
 
126
 
127
  self.renew_uuid()
128
 
@@ -144,12 +232,10 @@ class kiss3d_wrapper(object):
144
  caption_device = self.config['caption'].get('device', 'cpu')
145
 
146
  if isinstance(image, str): # If image is a file path
147
- image = Image.open(image).convert("RGB")
148
- elif isinstance(image, Image):
149
- image = image.convert("RGB")
150
- else:
151
  raise NotImplementedError('unexpected image type')
152
-
153
  prompt = "<MORE_DETAILED_CAPTION>"
154
  inputs = self.caption_processor(text=prompt, images=image, return_tensors="pt").to(caption_device, torch_dtype)
155
 
@@ -161,17 +247,45 @@ class kiss3d_wrapper(object):
161
  parsed_answer = self.caption_processor.post_process_generation(
162
  generated_text, task=prompt, image_size=(image.width, image.height)
163
  )
164
- caption_text = parsed_answer["<MORE_DETAILED_CAPTION>"].replace("The image is ", "")
 
 
 
 
 
165
  return caption_text
166
 
167
- def generate_multiview(self, image):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  with self.context():
169
  mv_image = self.multiview_pipeline(image,
170
- num_inference_steps=self.config['multiview']['num_inference_steps'],
171
- width=512*2, height=512*2).images[0]
 
 
172
  return mv_image
173
 
174
- def reconstruct_from_multiview(self, mv_image):
175
  """
176
  mv_image: PIL.Image
177
  """
@@ -184,23 +298,31 @@ class kiss3d_wrapper(object):
184
  with self.context():
185
  vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = \
186
  lrm_reconstruct(self.recon_model, self.recon_model_config.infer_config,
187
- rgb_multi_view, name=self.uuid)
188
 
189
- return vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo
190
 
191
- def generate_reference_3D_bundle_image_zero123(self, image, save_intermediate_results=True):
192
  """
193
  input: image, PIL.Image
194
- return: ref_3D_bundle_image, Tensor of shape (1, 3, 1024, 2048)
195
  """
196
  mv_image = self.generate_multiview(image)
197
 
198
  if save_intermediate_results:
199
  mv_image.save(os.path.join(TMP_DIR, f'{self.uuid}_mv_image.png'))
200
 
201
- vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = self.reconstruct_from_multiview(mv_image)
 
 
 
202
 
203
- ref_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0) # range [0, 1]
 
 
 
 
 
204
 
205
  if save_intermediate_results:
206
  save_path = os.path.join(TMP_DIR, f'{self.uuid}_ref_3d_bundle_image.png')
@@ -222,6 +344,9 @@ class kiss3d_wrapper(object):
222
  control_guidance_end=None,
223
  controlnet_conditioning_scale=None,
224
  lora_scale=1.0,
 
 
 
225
  save_intermediate_results=True,
226
  **kwargs):
227
  control_mode_dict = {
@@ -235,15 +360,20 @@ class kiss3d_wrapper(object):
235
  } # for https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union only
236
 
237
  flux_device = self.config['flux'].get('device', 'cpu')
238
- seed = self.config['flux'].get('seed', 0)
 
239
 
240
  generator = torch.Generator(device=flux_device).manual_seed(seed)
241
 
 
 
 
242
  hparam_dict = {
243
- 'prompt': ' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', prompt]),
244
- 'image': image or torch.zeros((1, 3, 1024, 2048), dtype=torch.float32, device=flux_device),
 
245
  'strength': strength,
246
- 'num_inference_steps': 30,
247
  'guidance_scale': 3.5,
248
  'num_images_per_prompt': 1,
249
  'width': 2048,
@@ -253,14 +383,29 @@ class kiss3d_wrapper(object):
253
  'joint_attention_kwargs': {"scale": lora_scale}
254
  }
255
  hparam_dict.update(kwargs)
256
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  # append controlnet hparams
258
  if len(control_image) > 0:
259
  assert isinstance(self.flux_pipeline, FluxControlNetImg2ImgPipeline)
260
  assert len(control_mode) == len(control_image) # the count of image should be the same as control mode
261
 
262
  flux_ctrl_net = self.flux_pipeline.controlnet.nets[0]
263
- self.flux_pipeline.controlnet = FluxMultiControlNetModel([flux_ctrl_net for i in range(len(control_image))])
264
 
265
  ctrl_hparams = {
266
  'control_mode': [control_mode_dict[mode_] for mode_ in control_mode],
@@ -285,13 +430,45 @@ class kiss3d_wrapper(object):
285
 
286
  return gen_3d_bundle_image_
287
 
289
  def generate_3d_bundle_image_text(self,
290
  prompt,
291
  image=None,
292
  strength=1.0,
293
  lora_scale=1.0,
294
- num_inference_steps=30,
 
 
295
  save_intermediate_results=True,
296
  **kwargs):
297
 
@@ -299,27 +476,25 @@ class kiss3d_wrapper(object):
299
  return: gen_3d_bundle_image, torch.Tensor of shape (3, 1024, 2048), range [0., 1.]
300
  """
301
 
302
- if isinstance(self.flux_pipeline, FluxControlNetImg2ImgPipeline):
303
- flux_pipeline = FluxImg2ImgPipeline(
304
- scheduler = self.flux_pipeline.scheduler,
305
- vae = self.flux_pipeline.vae,
306
- text_encoder = self.flux_pipeline.text_encoder,
307
- tokenizer = self.flux_pipeline.tokenizer,
308
- text_encoder_2 = self.flux_pipeline.text_encoder_2,
309
- tokenizer_2 = self.flux_pipeline.tokenizer_2,
310
- transformer = self.flux_pipeline.transformer
311
- )
312
- else:
313
  flux_pipeline = self.flux_pipeline
 
 
314
 
315
  flux_device = self.config['flux'].get('device', 'cpu')
316
- seed = self.config['flux'].get('seed', 0)
 
 
 
 
317
 
318
  generator = torch.Generator(device=flux_device).manual_seed(seed)
319
 
 
320
  hparam_dict = {
321
- 'prompt': ' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', prompt]),
322
- 'image': image or torch.zeros((1, 3, 1024, 2048), dtype=torch.float32, device=flux_device),
 
323
  'strength': strength,
324
  'num_inference_steps': num_inference_steps,
325
  'guidance_scale': 3.5,
@@ -332,6 +507,22 @@ class kiss3d_wrapper(object):
332
  }
333
  hparam_dict.update(kwargs)
334
335
  with self.context():
336
  gen_3d_bundle_image = flux_pipeline(**hparam_dict).images
337
 
@@ -345,7 +536,13 @@ class kiss3d_wrapper(object):
345
 
346
  return gen_3d_bundle_image_
347
 
348
- def reconstruct_3d_bundle_image(self, image, save_intermediate_results=True):
349
  """
350
  image: torch.Tensor, range [0., 1.], (3, 1024, 2048)
351
  """
@@ -355,6 +552,8 @@ class kiss3d_wrapper(object):
355
  images = rearrange(image, 'c (n h) (m w) -> (n m) c h w', n=2, m=4) # (3, 1024, 2048) -> (8, 3, 512, 512)
356
  rgb_multi_view, normal_multi_view = images.chunk(2, dim=0)
357
  multi_view_mask = get_background(normal_multi_view).to(recon_device)
 
 
358
  rgb_multi_view = rgb_multi_view.to(recon_device) * multi_view_mask + (1 - multi_view_mask)
359
 
360
  with self.context():
@@ -362,11 +561,12 @@ class kiss3d_wrapper(object):
362
  lrm_reconstruct(self.recon_model, self.recon_model_config.infer_config,
363
  rgb_multi_view.unsqueeze(0).to(recon_device), name=self.uuid,
364
  input_camera_type='kiss3d', render_3d_bundle_image=save_intermediate_results,
365
- render_azimuths=[0, 90, 180, 270])
 
366
 
367
  if save_intermediate_results:
368
  recon_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0) # range [0, 1]
369
- torchvision.utils.save_image(recon_3D_bundle_image, os.path.join(TMP_DIR, f'{k3d_wrapper.uuid})_lrm_recon_3d_bundle_image.png'))
370
 
371
  recon_mesh_path = os.path.join(TMP_DIR, f"{self.uuid}_isomer_recon_mesh.obj")
372
 
@@ -375,7 +575,11 @@ class kiss3d_wrapper(object):
375
  multi_view_mask=multi_view_mask,
376
  vertices=vertices,
377
  faces=faces,
378
- save_path=recon_mesh_path)
 
 
 
 
379
 
380
 
381
  def run_text_to_3d(k3d_wrapper,
@@ -391,39 +595,176 @@ def run_text_to_3d(k3d_wrapper,
391
  if init_image_path is not None:
392
  init_image = Image.open(init_image_path)
393
 
 
 
 
 
 
394
  gen_3d_bundle_image, gen_save_path = k3d_wrapper.generate_3d_bundle_image_text(prompt,
395
- image=init_image,
396
- strength=1.0,
397
- save_intermediate_results=True)
398
 
399
  # recon from 3D Bundle image
400
  recon_mesh_path = k3d_wrapper.reconstruct_3d_bundle_image(gen_3d_bundle_image, save_intermediate_results=False)
401
 
402
  return gen_save_path, recon_mesh_path
403
 
404
- def run_image_to_3d(k3d_wrapper, init_image_path):
 
405
  # ======================================= Example of image to 3D generation ======================================
406
 
407
  # Renew The uuid
408
  k3d_wrapper.renew_uuid()
409
 
410
  # FOR IMAGE TO 3D: generate reference 3D bundle image from a single input image
411
- input_image = Image.open(init_image_path)
412
- reference_3d_bundle_image, reference_save_path = k3d_wrapper.generate_reference_3D_bundle_image_zero123(input_image)
 
 
413
  caption = k3d_wrapper.get_image_caption(input_image)
414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
- import pdb
417
- pdb.set_trace()
418
 
419
 
420
  if __name__ == "__main__":
421
- k3d_wrapper = init_wrapper_from_config('/hpc2hdd/home/jlin695/code/Kiss3DGen/pipeline/pipeline_config/default.yaml')
 
 
 
422
 
423
- # Example of loading existing 3D bundle Image
424
- # demo_image = Image.open('/hpc2hdd/home/jlin695/code/github/Kiss3DGen/outputs/tmp/ea25bc9b-d775-46bb-9827-660a9a6540c8_gen_3d_bundle_image.png')
425
- # gen_3d_bundle_image = torchvision.transforms.functional.to_tensor(demo_image)
426
 
427
- run_image_to_3d(k3d_wrapper, '/hpc2hdd/home/jlin695/code/Kiss3DGen/examples/蓝色小怪物.webp')
  # run_text_to_3d(k3d_wrapper, prompt='A doll of a girl in Harry Potter')
429
 
 
 
 
 
 
1
  # The kiss3d pipeline wrapper for inference
2
 
3
  import os
4
+ import spaces
5
  import numpy as np
6
+ import random
7
  import torch
8
  import yaml
9
  import uuid
 
12
  from PIL import Image
13
 
14
  from pipeline.utils import logger, TMP_DIR, OUT_DIR
15
+ from pipeline.utils import lrm_reconstruct, isomer_reconstruct, preprocess_input_image
16
 
17
  import torch
18
  import torchvision
19
+ from torch.nn import functional as F
20
 
21
  # for reconstruction model
22
  from omegaconf import OmegaConf
23
  from models.lrm.utils.train_util import instantiate_from_config
24
  from models.lrm.utils.render_utils import rotate_x, rotate_y
25
+ #
26
  from utils.tool import get_background
 
27
  # for florence2
28
+ from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer
29
+ from models.llm.llm import load_llm_model, get_llm_response
30
+
31
+ from pipeline.custom_pipelines import FluxPriorReduxPipeline, FluxControlNetImg2ImgPipeline, FluxImg2ImgPipeline
32
+ from diffusers import FluxPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler, FluxTransformer2DModel
33
+ from diffusers.models.controlnets.controlnet_flux import FluxMultiControlNetModel, FluxControlNetModel
34
+ from diffusers.schedulers import FlowMatchHeunDiscreteScheduler
35
+ from huggingface_hub import hf_hub_download
36
+ access_token = os.getenv("HUGGINGFACE_TOKEN")
37
+
38
+
39
+ def convert_flux_pipeline(exist_flux_pipe, target_pipe, **kwargs):
40
+ new_pipe = target_pipe(
41
+ scheduler = exist_flux_pipe.scheduler,
42
+ vae = exist_flux_pipe.vae,
43
+ text_encoder = exist_flux_pipe.text_encoder,
44
+ tokenizer = exist_flux_pipe.tokenizer,
45
+ text_encoder_2 = exist_flux_pipe.text_encoder_2,
46
+ tokenizer_2 = exist_flux_pipe.tokenizer_2,
47
+ transformer = exist_flux_pipe.transformer,
48
+ **kwargs
49
+ )
50
+ return new_pipe
51
 
52
+ @spaces.GPU
53
  def init_wrapper_from_config(config_path):
54
  with open(config_path, 'r') as config_file:
55
  config_ = yaml.load(config_file, yaml.FullLoader)
56
+
57
+ dtype_ = {
58
+ 'fp8': torch.float8_e4m3fn,
59
+ 'bf16': torch.bfloat16,
60
+ 'fp16': torch.float16,
61
+ 'fp32': torch.float32
62
+ }
63
 
64
  # init flux_pipeline
65
  logger.info('==> Loading Flux model ...')
66
  flux_device = config_['flux'].get('device', 'cpu')
67
  flux_base_model_pth = config_['flux'].get('base_model', None)
68
+ flux_dtype = config_['flux'].get('dtype', 'bf16')
69
  flux_controlnet_pth = config_['flux'].get('controlnet', None)
70
+ # flux_lora_pth = config_['flux'].get('lora', None)
71
+ flux_lora_pth = hf_hub_download(repo_id="LTT/xxx-ckpt", filename="rgb_normal_large.safetensors", repo_type="model", token=access_token)
72
+ flux_redux_pth = config_['flux'].get('redux', None)
73
+
74
+ if flux_base_model_pth.endswith('safetensors'):
75
+ flux_pipe = FluxImg2ImgPipeline.from_single_file(flux_base_model_pth, torch_dtype=dtype_[flux_dtype], token=access_token)
 
76
  else:
77
+ flux_pipe = FluxImg2ImgPipeline.from_pretrained(flux_base_model_pth, torch_dtype=dtype_[flux_dtype], token=access_token)
78
 
79
+ # load flux model and controlnet
80
+ if flux_controlnet_pth is not None:
81
+ flux_controlnet = FluxControlNetModel.from_pretrained(flux_controlnet_pth, torch_dtype=torch.bfloat16)
82
+ flux_pipe = convert_flux_pipeline(flux_pipe, FluxControlNetImg2ImgPipeline, controlnet=[flux_controlnet])
83
+
84
+ flux_pipe.scheduler = FlowMatchHeunDiscreteScheduler.from_config(flux_pipe.scheduler.config)
85
+
86
  # load lora weights
87
  flux_pipe.load_lora_weights(flux_lora_pth)
88
+ flux_pipe.to(device=flux_device)
89
 
90
+ # load redux model
91
+ flux_redux_pipe = None
92
+ if flux_redux_pth is not None:
93
+ flux_redux_pipe = FluxPriorReduxPipeline.from_pretrained(flux_redux_pth, torch_dtype=torch.bfloat16)
94
+ flux_redux_pipe.text_encoder = flux_pipe.text_encoder
95
+ flux_redux_pipe.text_encoder_2 = flux_pipe.text_encoder_2
96
+ flux_redux_pipe.tokenizer = flux_pipe.tokenizer
97
+ flux_redux_pipe.tokenizer_2 = flux_pipe.tokenizer_2
98
+
99
+ flux_redux_pipe.to(device=flux_device)
100
+
101
+ logger.warning(f"GPU memory allocated after load flux model on {flux_device}: {torch.cuda.memory_allocated(device=flux_device) / 1024**3} GB")
102
 
103
  # TODO: load pulid model
104
 
 
114
  multiview_pipeline.scheduler.config, timestep_spacing='trailing'
115
  )
116
 
117
+ # unet_ckpt_path = config_['multiview'].get('unet', None)
118
+ unet_ckpt_path = hf_hub_download(repo_id="LTT/PRM", filename="flexgen_19w.ckpt", repo_type="model")
119
  if unet_ckpt_path is not None:
120
  state_dict = torch.load(unet_ckpt_path, map_location='cpu')['state_dict']
121
  state_dict = {k[10:]: v for k, v in state_dict.items() if k.startswith('unet.unet.')}
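  # k[10:] strips the 'unet.unet.' prefix (10 characters) so the keys match the bare UNet module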
122
  multiview_pipeline.unet.load_state_dict(state_dict, strict=True)
123
 
124
  multiview_pipeline.to(multiview_device)
125
+ logger.warning(f"GPU memory allocated after load multiview model on {multiview_device}: {torch.cuda.memory_allocated(device=multiview_device) / 1024**3} GB")
126
 
127
  # load caption model
128
  logger.info('==> Loading caption model ...')
 
130
  caption_model = AutoModelForCausalLM.from_pretrained(config_['caption']['base_model'], \
131
  torch_dtype=torch.bfloat16, trust_remote_code=True).to(caption_device)
132
  caption_processor = AutoProcessor.from_pretrained(config_['caption']['base_model'], trust_remote_code=True)
133
+ logger.warning(f"GPU memory allocated after load caption model on {caption_device}: {torch.cuda.memory_allocated(device=caption_device) / 1024**3} GB")
134
 
135
  # load reconstruction model
136
  logger.info('==> Loading reconstruction model ...')
 
138
  recon_model_config = OmegaConf.load(config_['reconstruction']['model_config'])
139
  recon_model = instantiate_from_config(recon_model_config.model_config)
140
  # load recon model checkpoint
141
+ model_ckpt_path = hf_hub_download(repo_id="LTT/PRM", filename="final_ckpt.ckpt", repo_type="model")
142
+ state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
143
  state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
144
  recon_model.load_state_dict(state_dict, strict=True)
145
  recon_model.to(recon_device)
146
  recon_model.init_flexicubes_geometry(recon_device, fovy=50.0)
147
  recon_model.eval()
148
+ logger.warning(f"GPU memory allocated after load reconstruction model on {recon_device}: {torch.cuda.memory_allocated(device=recon_device) / 1024**3} GB")
149
+
150
+ # load llm
151
+ llm_configs = config_.get('llm', None)
152
+ if llm_configs is not None:
153
+ logger.info('==> Loading LLM ...')
154
+ llm_device = llm_configs.get('device', 'cpu')
155
+ llm, llm_tokenizer = load_llm_model(llm_configs['base_model'])
156
+ llm.to(llm_device)
157
+ logger.warning(f"GPU memory allocated after load llm model on {llm_device}: {torch.cuda.memory_allocated(device=llm_device) / 1024**3} GB")
158
+ else:
159
+ llm, llm_tokenizer = None, None
160
 
161
  return kiss3d_wrapper(
162
  config = config_,
163
  flux_pipeline = flux_pipe,
164
+ flux_redux_pipeline=flux_redux_pipe,
165
  multiview_pipeline = multiview_pipeline,
166
  caption_processor = caption_processor,
167
  caption_model = caption_model,
168
  reconstruction_model_config = recon_model_config,
169
  reconstruction_model = recon_model,
170
+ llm_model = llm,
171
+ llm_tokenizer = llm_tokenizer
172
  )
173
 
174
+ def seed_everything(seed):
175
+
176
+ random.seed(seed)
177
+ np.random.seed(seed)
178
+ torch.manual_seed(seed)
179
+ torch.cuda.manual_seed(seed)
180
+ torch.cuda.manual_seed_all(seed)
181
+ torch.backends.cudnn.deterministic = True
182
+ torch.backends.cudnn.benchmark = False
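+ # deterministic cuDNN kernels trade some speed for run-to-run reproducibility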
183
+
184
+ print(f"Random seed set to {seed}")
185
+
186
  class kiss3d_wrapper(object):
187
  def __init__(self,
188
  config: Dict,
189
  flux_pipeline: Union[FluxPipeline, FluxControlNetImg2ImgPipeline],
190
+ flux_redux_pipeline: FluxPriorReduxPipeline,
191
  multiview_pipeline: DiffusionPipeline,
192
  caption_processor: AutoProcessor,
193
  caption_model: AutoModelForCausalLM,
194
  reconstruction_model_config: Any,
195
  reconstruction_model: Any,
196
+ llm_model: AutoModelForCausalLM = None,
197
+ llm_tokenizer: AutoTokenizer = None
198
  ):
199
  self.config = config
200
  self.flux_pipeline = flux_pipeline
201
+ self.flux_redux_pipeline = flux_redux_pipeline
202
  self.multiview_pipeline = multiview_pipeline
203
  self.caption_model = caption_model
204
  self.caption_processor = caption_processor
205
  self.recon_model_config = reconstruction_model_config
206
+ self.recon_model = reconstruction_model
207
+ self.llm_model = llm_model
208
+ self.llm_tokenizer = llm_tokenizer
209
+
210
+ self.to_512_tensor = torchvision.transforms.Compose([
211
+ torchvision.transforms.ToTensor(),
212
+ torchvision.transforms.Resize((512, 512), interpolation=2),
213
+ ])
214
 
215
  self.renew_uuid()
216
 
 
232
  caption_device = self.config['caption'].get('device', 'cpu')
233
 
234
  if isinstance(image, str): # If image is a file path
235
+ image = preprocess_input_image(Image.open(image))
236
+ elif not isinstance(image, Image.Image):
237
  raise NotImplementedError('unexpected image type')
238
+
239
  prompt = "<MORE_DETAILED_CAPTION>"
240
  inputs = self.caption_processor(text=prompt, images=image, return_tensors="pt").to(caption_device, torch_dtype)
241
 
 
247
  parsed_answer = self.caption_processor.post_process_generation(
248
  generated_text, task=prompt, image_size=(image.width, image.height)
249
  )
250
+ caption_text = parsed_answer["<MORE_DETAILED_CAPTION>"] # .replace("The image is ", "")
251
+
252
+ logger.info(f"Auto caption result: \"{caption_text}\"")
253
+
254
+ caption_text = self.get_detailed_prompt(caption_text)
255
+
256
  return caption_text
257
 
258
+ def get_detailed_prompt(self, prompt, seed=None):
259
+ if self.llm_model is not None:
260
+ detailed_prompt = get_llm_response(self.llm_model, self.llm_tokenizer, prompt, seed=seed)
261
+
262
+ logger.info(f"LLM refined prompt result: \"{detailed_prompt}\"")
263
+ return detailed_prompt
264
+ return prompt
265
+
266
+ def del_llm_model(self):
267
+ logger.warning('This function is now deprecated and has no effect')
268
+
269
+ # raise NotImplementedError()
270
+ # del llm.model
271
+ # del llm.tokenizer
272
+ # llm.model = None
273
+ # llm.tokenizer = None
274
+
275
+ def generate_multiview(self, image, seed=None, num_inference_steps=None):
276
+ seed = seed or self.config['multiview'].get('seed', 0)
277
+ mv_device = self.config['multiview'].get('device', 'cpu')
278
+
279
+ generator = torch.Generator(device=mv_device).manual_seed(seed)
280
  with self.context():
281
  mv_image = self.multiview_pipeline(image,
282
+ num_inference_steps=num_inference_steps or self.config['multiview']['num_inference_steps'],
283
+ width=512*2,
284
+ height=512*2,
285
+ generator=generator).images[0]
286
  return mv_image
287
 
288
+ def reconstruct_from_multiview(self, mv_image, lrm_render_radius=4.15):
289
  """
290
  mv_image: PIL.Image
291
  """
 
298
  with self.context():
299
  vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = \
300
  lrm_reconstruct(self.recon_model, self.recon_model_config.infer_config,
301
+ rgb_multi_view, name=self.uuid, render_radius=lrm_render_radius)
302
 
303
+ return rgb_multi_view, vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo
304
 
305
+ def generate_reference_3D_bundle_image_zero123(self, image, use_mv_rgb=False, save_intermediate_results=True):
306
  """
307
  input: image, PIL.Image
308
+ return: ref_3D_bundle_image, Tensor of shape (3, 1024, 2048)
309
  """
310
  mv_image = self.generate_multiview(image)
311
 
312
  if save_intermediate_results:
313
  mv_image.save(os.path.join(TMP_DIR, f'{self.uuid}_mv_image.png'))
314
 
315
+ rgb_multi_view, vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = self.reconstruct_from_multiview(mv_image)
316
+
317
+ if use_mv_rgb:
318
+ # ref_3D_bundle_image = torchvision.utils.make_grid(torch.cat([rgb_multi_view[0, [3, 0, 1, 2], ...].cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0) # range [0, 1]
319
 
320
+ rgb_ = torch.cat([rgb_multi_view[0, [3, 0, 1, 2], ...].cpu(), lrm_multi_view_rgb.cpu()], dim=0)
321
+ ref_3D_bundle_image = torchvision.utils.make_grid(torch.cat([rgb_[[0, 5, 2, 7], ...], (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0) # range [0, 1]
322
+ else:
323
+ ref_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0) # range [0, 1]
324
+
325
+ ref_3D_bundle_image = ref_3D_bundle_image.clip(0., 1.)
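+ # make_grid with nrow=4 tiles the 8 views into the 2x4 bundle layout: top row RGB, bottom row normal maps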
326
 
327
  if save_intermediate_results:
328
  save_path = os.path.join(TMP_DIR, f'{self.uuid}_ref_3d_bundle_image.png')
 
344
  control_guidance_end=None,
345
  controlnet_conditioning_scale=None,
346
  lora_scale=1.0,
347
+ num_inference_steps=None,
348
+ seed=None,
349
+ redux_hparam=None,
350
  save_intermediate_results=True,
351
  **kwargs):
352
  control_mode_dict = {
 
360
  } # for https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union only
361
 
362
  flux_device = self.config['flux'].get('device', 'cpu')
363
+ seed = seed or self.config['flux'].get('seed', 0)
364
+ num_inference_steps = num_inference_steps or self.config['flux'].get('num_inference_steps', 20)
365
 
366
  generator = torch.Generator(device=flux_device).manual_seed(seed)
367
 
368
+ if image is None:
369
+ image = torch.zeros((1, 3, 1024, 2048), dtype=torch.float32, device=flux_device)
370
+
371
  hparam_dict = {
372
+ 'prompt': 'A grid of 2x4 multi-view image, elevation 5. White background.',
373
+ 'prompt_2': ' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', prompt]),
374
+ 'image': image,
375
  'strength': strength,
376
+ 'num_inference_steps': num_inference_steps,
377
  'guidance_scale': 3.5,
378
  'num_images_per_prompt': 1,
379
  'width': 2048,
 
383
  'joint_attention_kwargs': {"scale": lora_scale}
384
  }
385
  hparam_dict.update(kwargs)
386
+
387
+ # do redux
388
+ if redux_hparam is not None:
389
+ assert self.flux_redux_pipeline is not None
390
+ assert 'image' in redux_hparam.keys()
391
+ redux_hparam_ = {
392
+ 'prompt': hparam_dict.pop('prompt'),
393
+ 'prompt_2': hparam_dict.pop('prompt_2'),
394
+ }
395
+ redux_hparam_.update(redux_hparam)
396
+
397
+ with self.context():
398
+ redux_output = self.flux_redux_pipeline(**redux_hparam_)
399
+
400
+ hparam_dict.update(redux_output)
401
+
402
  # append controlnet hparams
403
  if len(control_image) > 0:
404
  assert isinstance(self.flux_pipeline, FluxControlNetImg2ImgPipeline)
405
  assert len(control_mode) == len(control_image) # the number of control images must match the number of control modes
406
 
407
  flux_ctrl_net = self.flux_pipeline.controlnet.nets[0]
408
+ self.flux_pipeline.controlnet = FluxMultiControlNetModel([flux_ctrl_net for _ in control_mode])
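+ # the single loaded ControlNet is reused for every requested control mode via FluxMultiControlNetModel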
409
 
410
  ctrl_hparams = {
411
  'control_mode': [control_mode_dict[mode_] for mode_ in control_mode],
 
430
 
431
  return gen_3d_bundle_image_
432
 
433
+ def preprocess_controlnet_cond_image(self, image, control_mode, save_intermediate_results=True, **kwargs):
434
+ """
435
+ image: Tensor of shape (c, h, w), range [0., 1.]
436
+ """
437
+ if control_mode in ['tile', 'lq']:
438
+ _, h, w = image.shape
439
+ down_scale = kwargs.get('down_scale', 4)
440
+ down_up = torchvision.transforms.Compose([
441
+ torchvision.transforms.Resize((h // down_scale, w // down_scale), interpolation=2), # 1 for lanczos and 2 for bilinear
442
+ torchvision.transforms.Resize((h, w), interpolation=2),
443
+ torchvision.transforms.ToPILImage()
444
+ ])
445
+ preprocessed = down_up(image)
446
+ elif control_mode == 'blur':
447
+ kernel_size = kwargs.get('kernel_size', 51)
448
+ sigma = kwargs.get('sigma', 2.0)
449
+ blur = torchvision.transforms.Compose([
450
+ torchvision.transforms.ToPILImage(),
451
+ torchvision.transforms.GaussianBlur(kernel_size, sigma),
452
+ ])
453
+ preprocessed = blur(image)
454
+ else:
455
+ raise NotImplementedError(f'Unexpected control mode {control_mode}')
456
+
457
+ if save_intermediate_results:
458
+ save_path = os.path.join(TMP_DIR, f'{self.uuid}_{control_mode}_controlnet_cond.png')
459
+ preprocessed.save(save_path)
460
+ logger.info(f'Save image to {save_path}')
461
+
462
+ return preprocessed
463
 
464
  def generate_3d_bundle_image_text(self,
465
  prompt,
466
  image=None,
467
  strength=1.0,
468
  lora_scale=1.0,
469
+ num_inference_steps=None,
470
+ seed=None,
471
+ redux_hparam=None,
472
  save_intermediate_results=True,
473
  **kwargs):
474
 
 
476
  return: gen_3d_bundle_image, torch.Tensor of shape (3, 1024, 2048), range [0., 1.]
477
  """
478
 
479
+ if isinstance(self.flux_pipeline, FluxImg2ImgPipeline):
480
  flux_pipeline = self.flux_pipeline
481
+ else:
482
+ flux_pipeline = convert_flux_pipeline(self.flux_pipeline, FluxImg2ImgPipeline)
483
 
484
  flux_device = self.config['flux'].get('device', 'cpu')
485
+ seed = seed or self.config['flux'].get('seed', 0)
486
+ num_inference_steps = num_inference_steps or self.config['flux'].get('num_inference_steps', 20)
487
+
488
+ if image is None:
489
+ image = torch.zeros((1, 3, 1024, 2048), dtype=torch.float32, device=flux_device)
490
 
491
  generator = torch.Generator(device=flux_device).manual_seed(seed)
492
 
493
+
494
  hparam_dict = {
495
+ 'prompt': 'A grid of 2x4 multi-view image, elevation 5. White background.',
496
+ 'prompt_2': ' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', prompt]),
497
+ 'image': image,
498
  'strength': strength,
499
  'num_inference_steps': num_inference_steps,
500
  'guidance_scale': 3.5,
 
507
  }
508
  hparam_dict.update(kwargs)
509
 
510
+ # do redux
511
+ if redux_hparam is not None:
512
+ assert self.flux_redux_pipeline is not None
513
+ assert 'image' in redux_hparam.keys()
514
+ redux_hparam_ = {
515
+ 'prompt': hparam_dict.pop('prompt'),
516
+ 'prompt_2': hparam_dict.pop('prompt_2'),
517
+ }
518
+ redux_hparam_.update(redux_hparam)
519
+
520
+ with self.context():
521
+ redux_output = self.flux_redux_pipeline(**redux_hparam_)
522
+
523
+ hparam_dict.update(redux_output)
524
+
525
+
526
  with self.context():
527
  gen_3d_bundle_image = flux_pipeline(**hparam_dict).images
528
 
 
536
 
537
  return gen_3d_bundle_image_
538
 
539
+ def reconstruct_3d_bundle_image(self,
540
+ image,
541
+ lrm_render_radius=4.15,
542
+ isomer_radius=4.5,
543
+ reconstruction_stage1_steps=0,
544
+ reconstruction_stage2_steps=20,
545
+ save_intermediate_results=True):
546
  """
547
  image: torch.Tensor, range [0., 1.], (3, 1024, 2048)
548
  """
 
552
  images = rearrange(image, 'c (n h) (m w) -> (n m) c h w', n=2, m=4) # (3, 1024, 2048) -> (8, 3, 512, 512)
553
  rgb_multi_view, normal_multi_view = images.chunk(2, dim=0)
554
  multi_view_mask = get_background(normal_multi_view).to(recon_device)
555
+ print(f'shape images: {images.shape}')
556
+ # breakpoint()
557
  rgb_multi_view = rgb_multi_view.to(recon_device) * multi_view_mask + (1 - multi_view_mask)
558
 
559
  with self.context():
 
561
  lrm_reconstruct(self.recon_model, self.recon_model_config.infer_config,
562
  rgb_multi_view.unsqueeze(0).to(recon_device), name=self.uuid,
563
  input_camera_type='kiss3d', render_3d_bundle_image=save_intermediate_results,
564
+ render_azimuths=[0, 90, 180, 270],
565
+ render_radius=lrm_render_radius)
566
 
567
  if save_intermediate_results:
568
  recon_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0) # range [0, 1]
569
+ torchvision.utils.save_image(recon_3D_bundle_image, os.path.join(TMP_DIR, f'{self.uuid}_lrm_recon_3d_bundle_image.png'))
570
 
571
  recon_mesh_path = os.path.join(TMP_DIR, f"{self.uuid}_isomer_recon_mesh.obj")
572
 
 
575
  multi_view_mask=multi_view_mask,
576
  vertices=vertices,
577
  faces=faces,
578
+ save_path=recon_mesh_path,
579
+ radius=isomer_radius,
580
+ reconstruction_stage1_steps=int(reconstruction_stage1_steps),
581
+ reconstruction_stage2_steps=int(reconstruction_stage2_steps)
582
+ )
583
 
584
 
585
  def run_text_to_3d(k3d_wrapper,
 
595
  if init_image_path is not None:
596
  init_image = Image.open(init_image_path)
597
 
598
+ # refine prompt
599
+ logger.info(f"Input prompt: \"{prompt}\"")
600
+
601
+ prompt = k3d_wrapper.get_detailed_prompt(prompt)
602
+
603
  gen_3d_bundle_image, gen_save_path = k3d_wrapper.generate_3d_bundle_image_text(prompt,
604
+ image=init_image,
605
+ strength=1.0,
606
+ save_intermediate_results=True)
607
+
608
+ # recon from 3D Bundle image
609
+ recon_mesh_path = k3d_wrapper.reconstruct_3d_bundle_image(gen_3d_bundle_image, save_intermediate_results=False)
610
+
611
+ return gen_save_path, recon_mesh_path
612
+
613
+ def image2mesh_preprocess(k3d_wrapper, input_image_, seed, use_mv_rgb=True):
614
+ seed_everything(seed)
615
+
616
+ # Renew The uuid
617
+ k3d_wrapper.renew_uuid()
618
+
619
+ # FOR IMAGE TO 3D: generate reference 3D bundle image from a single input image
620
+ input_image__ = Image.open(input_image_) if isinstance(input_image_, str) else input_image_
621
+
622
+ input_image = preprocess_input_image(input_image__)
623
+ input_image_save_path = os.path.join(TMP_DIR, f'{k3d_wrapper.uuid}_input_image.png')
624
+ input_image.save(input_image_save_path)
625
+
626
+ reference_3d_bundle_image, reference_save_path = k3d_wrapper.generate_reference_3D_bundle_image_zero123(input_image, use_mv_rgb=use_mv_rgb)
627
+ caption = k3d_wrapper.get_image_caption(input_image)
628
+
629
+ return input_image_save_path, reference_save_path, caption
630
+
631
+ def image2mesh_main(k3d_wrapper, input_image, reference_3d_bundle_image, caption, seed, strength1=0.5, strength2=0.95, enable_redux=True, use_controlnet=True):
632
+ seed_everything(seed)
633
+
634
+ if enable_redux:
635
+ redux_hparam = {
636
+ 'image': k3d_wrapper.to_512_tensor(input_image).unsqueeze(0).clip(0., 1.),
637
+ 'prompt_embeds_scale': 1.0,
638
+ 'pooled_prompt_embeds_scale': 1.0,
639
+ 'strength': strength1
640
+ }
641
+ else:
642
+ redux_hparam = None
643
+
644
+ if use_controlnet:
645
+ # prepare controlnet condition
646
+ control_mode = ['tile']
647
+ control_image = [k3d_wrapper.preprocess_controlnet_cond_image(reference_3d_bundle_image, mode_, down_scale=1, kernel_size=51, sigma=2.0) for mode_ in control_mode]
648
+ control_guidance_start = [0.0]
649
+ control_guidance_end = [0.3]
650
+ controlnet_conditioning_scale = [0.3]
651
+
652
+
653
+ gen_3d_bundle_image, gen_save_path = k3d_wrapper.generate_3d_bundle_image_controlnet(
654
+ prompt=caption,
655
+ image=reference_3d_bundle_image.unsqueeze(0),
656
+ strength=strength2,
657
+ control_image=control_image,
658
+ control_mode=control_mode,
659
+ control_guidance_start=control_guidance_start,
660
+ control_guidance_end=control_guidance_end,
661
+ controlnet_conditioning_scale=controlnet_conditioning_scale,
662
+ lora_scale=1.0,
663
+ redux_hparam=redux_hparam
664
+ )
665
+ else:
666
+ gen_3d_bundle_image, gen_save_path = k3d_wrapper.generate_3d_bundle_image_text(
667
+ prompt=caption,
668
+ image=reference_3d_bundle_image.unsqueeze(0),
669
+ strength=strength2,
670
+ lora_scale=1.0,
671
+ redux_hparam=redux_hparam
672
+ )
673
 
674
  # recon from 3D Bundle image
675
  recon_mesh_path = k3d_wrapper.reconstruct_3d_bundle_image(gen_3d_bundle_image, save_intermediate_results=False)
676
 
677
  return gen_save_path, recon_mesh_path
678
 
679
+
680
+ def run_image_to_3d(k3d_wrapper, input_image_path, enable_redux=True, use_mv_rgb=True, use_controlnet=True):
681
  # ======================================= Example of image to 3D generation ======================================
682
 
683
  # Renew The uuid
684
  k3d_wrapper.renew_uuid()
685
 
686
  # FOR IMAGE TO 3D: generate reference 3D bundle image from a single input image
687
+ input_image = preprocess_input_image(Image.open(input_image_path))
688
+ input_image.save(os.path.join(TMP_DIR, f'{k3d_wrapper.uuid}_input_image.png'))
689
+
690
+ reference_3d_bundle_image, reference_save_path = k3d_wrapper.generate_reference_3D_bundle_image_zero123(input_image, use_mv_rgb=use_mv_rgb)
691
  caption = k3d_wrapper.get_image_caption(input_image)
692
 
693
+ if enable_redux:
694
+ redux_hparam = {
695
+ 'image': k3d_wrapper.to_512_tensor(input_image).unsqueeze(0).clip(0., 1.),
696
+ 'prompt_embeds_scale': 1.0,
697
+ 'pooled_prompt_embeds_scale': 1.0,
698
+ 'strength': 0.5
699
+ }
700
+ else:
701
+ redux_hparam = None
702
+
703
+ if use_controlnet:
704
+ # prepare controlnet condition
705
+ control_mode = ['tile']
706
+ control_image = [k3d_wrapper.preprocess_controlnet_cond_image(reference_3d_bundle_image, mode_, down_scale=1, kernel_size=51, sigma=2.0) for mode_ in control_mode]
707
+ control_guidance_start = [0.0]
708
+ control_guidance_end = [0.3]
709
+ controlnet_conditioning_scale = [0.3]
710
+
711
+
712
+ gen_3d_bundle_image, gen_save_path = k3d_wrapper.generate_3d_bundle_image_controlnet(
713
+ prompt=caption,
714
+ image=reference_3d_bundle_image.unsqueeze(0),
715
+ strength=.95,
716
+ control_image=control_image,
717
+ control_mode=control_mode,
718
+ control_guidance_start=control_guidance_start,
719
+ control_guidance_end=control_guidance_end,
720
+ controlnet_conditioning_scale=controlnet_conditioning_scale,
721
+ lora_scale=1.0,
722
+ redux_hparam=redux_hparam
723
+ )
724
+ else:
725
+ gen_3d_bundle_image, gen_save_path = k3d_wrapper.generate_3d_bundle_image_text(
726
+ prompt=caption,
727
+ image=reference_3d_bundle_image.unsqueeze(0),
728
+ strength=.95,
729
+ lora_scale=1.0,
730
+ redux_hparam=redux_hparam
731
+ )
732
+
733
+ # recon from 3D Bundle image
734
+ recon_mesh_path = k3d_wrapper.reconstruct_3d_bundle_image(gen_3d_bundle_image, save_intermediate_results=False)
735
 
736
+ return gen_save_path, recon_mesh_path
 
737
 
738
 
739
  if __name__ == "__main__":
740
+ k3d_wrapper = init_wrapper_from_config('/hpc2hdd/home/jlin695/code/github/Kiss3DGen/pipeline/pipeline_config/default.yaml')
741
+
742
+ os.system(f'rm -rf {TMP_DIR}/*')
743
+ # os.system(f'rm -rf {OUT_DIR}/3d_bundle/*')
744
 
745
+ enable_redux = True
746
+ use_mv_rgb = True
747
+ use_controlnet = True
748
 
749
+ img_folder = '/hpc2hdd/home/jlin695/code/Kiss3DGen/examples'
750
+ for img_ in os.listdir(img_folder):
751
+ name, _ = os.path.splitext(img_)
752
+ print("Now processing:", name)
753
+
754
+ gen_save_path, recon_mesh_path = run_image_to_3d(k3d_wrapper, os.path.join(img_folder, img_), enable_redux, use_mv_rgb, use_controlnet)
755
+
756
+ os.system(f'cp -f {gen_save_path} {OUT_DIR}/3d_bundle/{name}_3d_bundle.png')
757
+ os.system(f'cp -f {recon_mesh_path} {OUT_DIR}/3d_bundle/{name}.obj')
758
+
759
+ # TODO exams:
760
+ # 1. redux True, mv_rgb False, Tile, down_scale = 1
761
+ # 2. redux False, mv_rgb True, Tile, down_scale = 8
762
+ # 3. redux False, mv_rgb False, Tile, blur = 10
763
+
764
+
765
  # run_text_to_3d(k3d_wrapper, prompt='A doll of a girl in Harry Potter')
766
 
767
+
768
+ # Example of loading existing 3D bundle Image as Tensor from path
769
+ # pseudo_image = Image.open('/hpc2hdd/home/jlin695/code/github/Kiss3DGen/outputs/tmp/fbf6edad-2d7f-49e5-8ac2-a05af5fe695b_ref_3d_bundle_image.png')
770
+ # gen_3d_bundle_image = torchvision.transforms.functional.to_tensor(pseudo_image)
pipeline/pipeline_config/default.yaml CHANGED
@@ -1,15 +1,19 @@
1
  flux:
2
- base_model: "/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev"
3
- lora: "./checkpoint/flux_lora/rgb_normal_doll_object.safetensors"
4
- controlnet: "/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/flux_controlnets/FLUX.1-dev-ControlNet-Union-Pro"
5
- seed: 0
 
 
 
6
  device: 'cuda:0'
7
 
8
  multiview:
9
  base_model: "sudo-ai/zero123plus-v1.2"
10
  custom_pipeline: "./models/zero123plus"
11
  unet: "./checkpoint/zero123++/flexgen_19w.ckpt"
12
- num_inference_steps: 75
 
13
  device: 'cuda:0'
14
 
15
  reconstruction:
@@ -18,8 +22,12 @@ reconstruction:
18
  device: 'cuda:0'
19
 
20
  caption:
21
- base_model: "/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2"
22
- device: 'cuda:0'
 
 
 
 
23
 
24
  use_zero_gpu: false # for huggingface demo only
25
- 3d_bundle_templates: '/hpc2hdd/home/jlin695/code/github/Kiss3DGen/init_3d_Bundle'
 
1
  flux:
2
+ base_model: "https://huggingface.co/Comfy-Org/flux1-dev/blob/main/flux1-dev-fp8.safetensors"
3
+ flux_dtype: 'fp8'
4
+ lora: "./checkpoint/flux_lora/rgb_normal_large.safetensors"
5
+ controlnet: "InstantX/FLUX.1-dev-Controlnet-Union"
6
+ redux: "black-forest-labs/FLUX.1-Redux-dev"
7
+ num_inference_steps: 20
8
+ seed: 42
9
  device: 'cuda:0'
10
 
11
  multiview:
12
  base_model: "sudo-ai/zero123plus-v1.2"
13
  custom_pipeline: "./models/zero123plus"
14
  unet: "./checkpoint/zero123++/flexgen_19w.ckpt"
15
+ num_inference_steps: 50
16
+ seed: 42
17
  device: 'cuda:0'
18
 
19
  reconstruction:
 
22
  device: 'cuda:0'
23
 
24
  caption:
25
+ base_model: "multimodalart/Florence-2-large-no-flash-attn"
26
+ device: 'cuda:1'
27
+
28
+ llm:
29
+ base_model: "Qwen/Qwen2-7B-Instruct"
30
+ device: 'cuda:1'
31
 
32
  use_zero_gpu: false # for huggingface demo only
33
+ 3d_bundle_templates: './init_3d_Bundle'
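
For reference, init_wrapper_from_config reads this file with plain PyYAML and falls back to defaults for missing keys; a minimal sketch of the consumption pattern (path illustrative):

    import yaml

    with open('pipeline/pipeline_config/default.yaml', 'r') as f:
        cfg = yaml.load(f, yaml.FullLoader)

    flux_device = cfg['flux'].get('device', 'cpu')  # 'cuda:0'
    flux_dtype = cfg['flux'].get('dtype', 'bf16')   # the file sets 'flux_dtype', so this lookup falls back to 'bf16'
    llm_cfg = cfg.get('llm', None)                  # optional block; None disables LLM prompt refinement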
pipeline/run_hpc.sh CHANGED
@@ -5,6 +5,6 @@ export CC=$(which gcc)
5
  export CPLUS_INCLUDE_PATH=/hpc2ssd/softwares/cuda/cuda-12.1/targets/x86_64-linux/include:$CPLUS_INCLUDE_PATH
6
  export CUDA_LAUNCH_BLOCKING=1
7
  export NCCL_TIMEOUT=3600
8
- export CUDA_VISIBLE_DEVICES="0"
9
 
10
  python ./pipeline/kiss3d_wrapper.py
 
5
  export CPLUS_INCLUDE_PATH=/hpc2ssd/softwares/cuda/cuda-12.1/targets/x86_64-linux/include:$CPLUS_INCLUDE_PATH
6
  export CUDA_LAUNCH_BLOCKING=1
7
  export NCCL_TIMEOUT=3600
8
+ export CUDA_VISIBLE_DEVICES="0,1"
9
 
10
  python ./pipeline/kiss3d_wrapper.py
run_hpc.sh → pipeline/run_hpc_text_to_3d.sh RENAMED
@@ -5,7 +5,6 @@ export CC=$(which gcc)
5
  export CPLUS_INCLUDE_PATH=/hpc2ssd/softwares/cuda/cuda-12.1/targets/x86_64-linux/include:$CPLUS_INCLUDE_PATH
6
  export CUDA_LAUNCH_BLOCKING=1
7
  export NCCL_TIMEOUT=3600
8
- export CUDA_VISIBLE_DEVICES="0"
9
- # python app.py
10
- python text_to_mesh.py
11
- # python image_to_mesh.py
 
5
  export CPLUS_INCLUDE_PATH=/hpc2ssd/softwares/cuda/cuda-12.1/targets/x86_64-linux/include:$CPLUS_INCLUDE_PATH
6
  export CUDA_LAUNCH_BLOCKING=1
7
  export NCCL_TIMEOUT=3600
8
+ export CUDA_VISIBLE_DEVICES="0,1"
9
+
10
+ python ./pipeline/example_text_to_3d.py
 
pipeline/utils.py CHANGED
@@ -10,18 +10,20 @@ print(__workdir__)
10
  import numpy as np
11
  import torch
12
  from torchvision.transforms import v2
 
 
13
 
14
  from models.lrm.online_render.render_single import load_mipmap
15
  from models.lrm.utils.camera_util import get_zero123plus_input_cameras, get_custom_zero123plus_input_cameras, get_flux_input_cameras
16
  from models.lrm.utils.render_utils import rotate_x, rotate_y
17
  from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
 
18
 
19
  from models.ISOMER.reconstruction_func import reconstruction
20
  from models.ISOMER.projection_func import projection
21
 
22
  from utils.tool import NormalTransfer, get_render_cameras_frames, get_background, get_render_cameras_video, render_frames, mask_fix
23
 
24
-
25
  logging.basicConfig(
26
  level = logging.INFO
27
  )
@@ -38,7 +40,7 @@ def lrm_reconstruct(model, infer_config, images,
38
  render_3d_bundle_image=True,
39
  render_azimuths=[270, 0, 90, 180],
40
  render_elevations=[5, 5, 5, 5],
41
- render_radius=4.5):
42
  """
43
  image: Tensor, shape (1, c, h, w)
44
  """
@@ -49,7 +51,7 @@ def lrm_reconstruct(model, infer_config, images,
49
  if input_camera_type == 'zero123':
50
  input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)
51
  elif input_camera_type == 'kiss3d':
52
- input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device)
53
  else:
54
  raise NotImplementedError(f'Unexpected input camera type: {input_camera_type}')
55
 
@@ -142,9 +144,9 @@ def isomer_reconstruct(
142
  elevations=[5, 5, 5, 5],
143
  geo_weights=[1, 0.9, 1, 0.9],
144
  color_weights=[1, 0.5, 1, 0.5],
145
- reconstruction_stage1_steps=50,
146
  reconstruction_stage2_steps=50,
147
- radius=4.1):
148
 
149
  device = rgb_multi_view.device
150
  to_tensor_ = lambda x: torch.Tensor(x).float().to(device)
@@ -180,6 +182,7 @@ def isomer_reconstruct(
180
 
181
  multi_view_mask_proj = mask_fix(multi_view_mask, erode_dilate=-10, blur=5)
182
 
 
183
  logger.info(f"==> Running ISOMER projection ...")
184
  save_glb_addr = projection(
185
  meshes,
@@ -195,4 +198,29 @@ def isomer_reconstruct(
195
  )
196
 
197
  logger.info(f"==> Save mesh to {save_glb_addr} ...")
198
- return save_glb_addr
 
10
  import numpy as np
11
  import torch
12
  from torchvision.transforms import v2
13
+ from PIL import Image
14
+ import rembg
15
 
16
  from models.lrm.online_render.render_single import load_mipmap
17
  from models.lrm.utils.camera_util import get_zero123plus_input_cameras, get_custom_zero123plus_input_cameras, get_flux_input_cameras
18
  from models.lrm.utils.render_utils import rotate_x, rotate_y
19
  from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
20
+ from models.lrm.utils.infer_util import remove_background, resize_foreground
21
 
22
  from models.ISOMER.reconstruction_func import reconstruction
23
  from models.ISOMER.projection_func import projection
24
 
25
  from utils.tool import NormalTransfer, get_render_cameras_frames, get_background, get_render_cameras_video, render_frames, mask_fix
26
 
 
27
  logging.basicConfig(
28
  level = logging.INFO
29
  )
 
40
  render_3d_bundle_image=True,
41
  render_azimuths=[270, 0, 90, 180],
42
  render_elevations=[5, 5, 5, 5],
43
+ render_radius=4.15):
44
  """
45
  image: Tensor, shape (1, c, h, w)
46
  """
 
51
  if input_camera_type == 'zero123':
52
  input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)
53
  elif input_camera_type == 'kiss3d':
54
+ input_cameras = get_flux_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)
55
  else:
56
  raise NotImplementedError(f'Unexpected input camera type: {input_camera_type}')
57
 
 
144
  elevations=[5, 5, 5, 5],
145
  geo_weights=[1, 0.9, 1, 0.9],
146
  color_weights=[1, 0.5, 1, 0.5],
147
+ reconstruction_stage1_steps=10,
148
  reconstruction_stage2_steps=50,
149
+ radius=4.5):
150
 
151
  device = rgb_multi_view.device
152
  to_tensor_ = lambda x: torch.Tensor(x).float().to(device)
 
182
 
183
  multi_view_mask_proj = mask_fix(multi_view_mask, erode_dilate=-10, blur=5)
184
 
185
+
186
  logger.info(f"==> Running ISOMER projection ...")
187
  save_glb_addr = projection(
188
  meshes,
 
198
  )
199
 
200
  logger.info(f"==> Save mesh to {save_glb_addr} ...")
201
+ return save_glb_addr
202
+
203
+
204
+ def to_rgb_image(maybe_rgba):
205
+ assert isinstance(maybe_rgba, Image.Image)
206
+ if maybe_rgba.mode == 'RGB':
207
+ return maybe_rgba, None
208
+ elif maybe_rgba.mode == 'RGBA':
209
+ rgba = maybe_rgba
210
+ img = np.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=np.uint8)
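+ # randint(127, 128) always returns 127, i.e. a uniform mid-gray backdrop behind the RGBA foreground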
211
+ img = Image.fromarray(img, 'RGB')
212
+ img.paste(rgba, mask=rgba.getchannel('A'))
213
+ return img, rgba.getchannel('A')
214
+ else:
215
+ raise ValueError("Unsupported image type.", maybe_rgba.mode)
216
+
217
+ rembg_session = rembg.new_session("u2net")
218
+ def preprocess_input_image(input_image):
219
+ """
220
+ input_image: PIL.Image
221
+ output_image: PIL.Image, (3, 512, 512), mode = RGB, background = white
222
+ """
223
+ image = remove_background(to_rgb_image(input_image)[0], rembg_session, bgcolor=(255, 255, 255, 255))
224
+ image = resize_foreground(image, ratio=0.85, pad_value=255)
225
+ return to_rgb_image(image)[0]
226
+
run.sh DELETED
@@ -1,2 +0,0 @@
1
- export CUDA_VISIBLE_DEVICES="0"
2
- python text_to_mesh.py
 
 
 
text_to_mesh.py DELETED
@@ -1,232 +0,0 @@
1
- import os
2
- from einops import rearrange
3
- from omegaconf import OmegaConf
4
- import torch
5
- import numpy as np
6
- import trimesh
7
- import torchvision
8
- import torch.nn.functional as F
9
- from PIL import Image
10
- from torchvision import transforms
11
- from torchvision.transforms import v2
12
- from diffusers import HeunDiscreteScheduler
13
- from diffusers import FluxPipeline
14
- from pytorch_lightning import seed_everything
15
- import os
16
- import time
17
- from models.lrm.utils.infer_util import save_video
18
- from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
19
- from models.lrm.utils.render_utils import rotate_x, rotate_y
20
- from models.lrm.utils.train_util import instantiate_from_config
21
- from models.lrm.utils.camera_util import get_flux_input_cameras
22
- from models.ISOMER.reconstruction_func import reconstruction
23
- from models.ISOMER.projection_func import projection
24
- from utils.tool import NormalTransfer, load_mipmap
25
- from utils.tool import get_background, get_render_cameras_video, render_frames
26
-
27
- device = "cuda"
28
- resolution = 512
29
- save_dir = "./outputs"
30
- normal_transfer = NormalTransfer()
31
- isomer_azimuths = torch.from_numpy(np.array([0, 90, 180, 270])).float().to(device)
32
- isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).float().to(device)
33
- isomer_radius = 4.5
34
- isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device)
35
- isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device)
36
-
37
- # model initialization and loading
38
- # flux
39
- flux_pipe = FluxPipeline.from_pretrained("/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev", torch_dtype=torch.bfloat16).to(device=device, dtype=torch.bfloat16)
40
- flux_pipe.load_lora_weights('./checkpoint/flux_lora/rgb_normal_large.safetensors')
41
-
42
- flux_pipe.to(device=device, dtype=torch.bfloat16)
43
- generator = torch.Generator(device=device).manual_seed(10)
44
-
45
- # lrm
46
- config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
47
- model_config = config.model_config
48
- infer_config = config.infer_config
49
- model = instantiate_from_config(model_config)
50
- model_ckpt_path = "./checkpoint/lrm/final_ckpt.ckpt"
51
- state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
52
- state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
53
- model.load_state_dict(state_dict, strict=True)
54
-
55
- model = model.to(device)
56
- model.init_flexicubes_geometry(device, fovy=50.0)
57
- model = model.eval()
58
-
59
- # Flux multi-view generation
60
- def multi_view_rgb_normal_generation(prompt, save_path=None):
61
- # generate multi-view images
62
- with torch.no_grad():
63
- image = flux_pipe(
64
- prompt=prompt,
65
- num_inference_steps=30,
66
- guidance_scale=3.5,
67
- num_images_per_prompt=1,
68
- width=resolution*4,
69
- height=resolution*2,
70
- output_type='np',
71
- generator=generator
72
- ).images
73
- return image
74
-
75
- # lrm reconstructions
76
- def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False):
77
- images = image.unsqueeze(0).to(device)
78
- images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
79
- # breakpoint()
80
- with torch.no_grad():
81
- # get triplane
82
- planes = model.forward_planes(images, input_cameras)
83
-
84
- mesh_path_idx = os.path.join(save_path, f'{name}.obj')
85
-
86
- mesh_out = model.extract_mesh(
87
- planes,
88
- use_texture_map=export_texmap,
89
- **infer_config,
90
- )
91
- if export_texmap:
92
- vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
93
- save_obj_with_mtl(
94
- vertices.data.cpu().numpy(),
95
- uvs.data.cpu().numpy(),
96
- faces.data.cpu().numpy(),
97
- mesh_tex_idx.data.cpu().numpy(),
98
- tex_map.permute(1, 2, 0).data.cpu().numpy(),
99
- mesh_path_idx,
100
- )
101
- else:
102
- vertices, faces, vertex_colors = mesh_out
103
- save_obj(vertices, faces, vertex_colors, mesh_path_idx)
104
- print(f"Mesh saved to {mesh_path_idx}")
105
-
106
- render_size = 512
107
- if if_save_video:
108
- video_path_idx = os.path.join(save_path, f'{name}.mp4')
109
- render_size = infer_config.render_resolution
110
- ENV = load_mipmap("models/lrm/env_mipmap/6")
111
- materials = (0.0,0.9)
112
-
113
- all_mv, all_mvp, all_campos = get_render_cameras_video(
114
- batch_size=1,
115
- M=240,
116
- radius=4.5,
117
- elevation=(90, 60.0),
118
- is_flexicubes=True,
119
- fov=30
120
- )
121
-
122
- frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
123
- model,
124
- planes,
125
- render_cameras=all_mvp,
126
- camera_pos=all_campos,
127
- env=ENV,
128
- materials=materials,
129
- render_size=render_size,
130
- chunk_size=20,
131
- is_flexicubes=True,
132
- )
133
- normals = (torch.nn.functional.normalize(normals) + 1) / 2
134
- normals = normals * alphas + (1-alphas)
135
- all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)
136
-
137
- save_video(
138
- all_frames,
139
- video_path_idx,
140
- fps=30,
141
- )
142
- print(f"Video saved to {video_path_idx}")
143
-
144
- return vertices, faces
145
-
146
-
147
- def local_normal_global_transform(local_normal_images, azimuths_deg, elevations_deg):
148
- if local_normal_images.min() >= 0:
149
- local_normal = local_normal_images.float() * 2 - 1
150
- else:
151
- local_normal = local_normal_images.float()
152
- global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
153
- global_normal[...,0] *= -1
154
- global_normal = (global_normal + 1) / 2
155
- global_normal = global_normal.permute(0, 3, 1, 2)
156
- return global_normal
157
-
158
- def main():
159
- end = time.time()
160
- fix_prompt = 'a grid of 2x4 multi-view image. elevation 5. white background.'
161
- # user prompt
162
- prompt = "a owl wearing a hat."
163
- save_dir_path = os.path.join(save_dir, prompt.split(".")[0].replace(" ", "_"))
164
- os.makedirs(save_dir_path, exist_ok=True)
165
- prompt = fix_prompt+" "+prompt
166
- # generate multi-view images
167
- rgb_normal_grid = multi_view_rgb_normal_generation(prompt)
168
- # lrm reconstructions
169
- images = torch.from_numpy(rgb_normal_grid).squeeze(0).permute(2, 0, 1).contiguous().float() # (3, 1024, 2048)
170
- images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=2, m=4) # (8, 3, 512, 512)
171
- rgb_multi_view = images[:4, :3, :, :]
172
- normal_multi_view = images[4:, :3, :, :]
173
- multi_view_mask = get_background(normal_multi_view)
174
- rgb_multi_view = rgb_multi_view * rgb_multi_view + (1-multi_view_mask)
175
- input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device)
176
- vertices, faces = lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm', export_texmap=False, if_save_video=False)
177
- # local normal to global normal
178
-
179
- global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1), isomer_azimuths, isomer_elevations)
180
- global_normal = global_normal * multi_view_mask + (1-multi_view_mask)
181
-
182
- global_normal = global_normal.permute(0,2,3,1)
183
- rgb_multi_view = rgb_multi_view.permute(0,2,3,1)
184
- multi_view_mask = multi_view_mask.permute(0,2,3,1).squeeze(-1)
185
- vertices = torch.from_numpy(vertices).to(device)
186
- faces = torch.from_numpy(faces).to(device)
187
- vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
188
- vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]
189
-
190
- # global_normal: B,H,W,3
191
- # multi_view_mask: B,H,W
192
- # rgb_multi_view: B,H,W,3
193
-
194
- meshes = reconstruction(
195
- normal_pils=global_normal,
196
- masks=multi_view_mask,
197
- weights=isomer_geo_weights,
198
- fov=30,
199
- radius=isomer_radius,
200
- camera_angles_azi=isomer_azimuths,
201
- camera_angles_ele=isomer_elevations,
202
- expansion_weight_stage1=0.1,
203
- init_type="file",
204
- init_verts=vertices,
205
- init_faces=faces,
206
- stage1_steps=0,
207
- stage2_steps=50,
208
- start_edge_len_stage1=0.1,
209
- end_edge_len_stage1=0.02,
210
- start_edge_len_stage2=0.02,
211
- end_edge_len_stage2=0.005,
212
- )
213
-
214
-
215
- save_glb_addr = projection(
216
- meshes,
217
- masks=multi_view_mask,
218
- images=rgb_multi_view,
219
- azimuths=isomer_azimuths,
220
- elevations=isomer_elevations,
221
- weights=isomer_color_weights,
222
- fov=30,
223
- radius=isomer_radius,
224
- save_dir=f"{save_dir_path}/ISOMER/",
225
- )
226
- print(f'saved to {save_glb_addr}')
227
- print(f"Time elapsed: {time.time() - end:.2f}s")
228
-
229
-
230
-
231
- if __name__ == '__main__':
232
- main()
 
text_to_mesh_new.py DELETED
@@ -1,244 +0,0 @@
1
- import os
2
- from einops import rearrange
3
- from omegaconf import OmegaConf
4
- import torch
5
- import numpy as np
6
- import trimesh
7
- import torchvision
8
- import torch.nn.functional as F
9
- from PIL import Image
10
- from torchvision import transforms
11
- from torchvision.transforms import v2
12
- from diffusers import HeunDiscreteScheduler
13
- from diffusers import FluxPipeline
14
- from pytorch_lightning import seed_everything
15
- import os
16
-
17
- import time
18
-
19
- from models.lrm.utils.infer_util import save_video
20
- from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
21
- from models.lrm.utils.render_utils import rotate_x, rotate_y
22
- from models.lrm.utils.train_util import instantiate_from_config
23
- from models.lrm.utils.camera_util import get_flux_input_cameras
24
- from models.ISOMER.reconstruction_func import reconstruction
25
- from models.ISOMER.projection_func import projection
26
- from utils.tool import NormalTransfer, load_mipmap
27
- from utils.tool import get_background, get_render_cameras_video, render_frames, mask_fix
28
-
29
- device = "cuda"
30
- resolution = 512
31
- save_dir = "./outputs/text2"
32
- normal_transfer = NormalTransfer()
33
- isomer_azimuths = torch.from_numpy(np.array([0, 90, 180, 270])).float().to(device)
34
- isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).float().to(device)
35
- isomer_radius = 4.5
36
- isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device)
37
- isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device)
38
-
39
- # model initialization and loading
40
- # flux
41
- flux_pipe = FluxPipeline.from_pretrained("/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev", torch_dtype=torch.bfloat16).to(device=device, dtype=torch.bfloat16)
42
- flux_pipe.load_lora_weights('./checkpoint/flux_lora/rgb_normal_large.safetensors')
43
-
44
- flux_pipe.to(device=device, dtype=torch.bfloat16)
45
- generator = torch.Generator(device=device).manual_seed(10)
46
-
47
- # lrm
48
- config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
49
- model_config = config.model_config
50
- infer_config = config.infer_config
51
- model = instantiate_from_config(model_config)
52
- model_ckpt_path = "./checkpoint/lrm/final_ckpt.ckpt"
53
- state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
54
- state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
55
- model.load_state_dict(state_dict, strict=True)
56
-
57
- model = model.to(device)
58
- model.init_flexicubes_geometry(device, fovy=50.0)
59
- model = model.eval()
60
-
61
- # Flux multi-view generation
62
- def multi_view_rgb_normal_generation(prompt, save_path=None):
63
- # generate multi-view images
64
- with torch.no_grad():
65
- image = flux_pipe(
66
- prompt=prompt,
67
- num_inference_steps=30,
68
- guidance_scale=3.5,
69
- num_images_per_prompt=1,
70
- width=resolution*4,
71
- height=resolution*2,
72
- output_type='np',
73
- generator=generator
74
- ).images
75
- return image
76
-
77
- # lrm reconstructions
78
- def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False):
79
- images = image.unsqueeze(0).to(device)
80
- images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
81
- # breakpoint()
82
- with torch.no_grad():
83
- # get triplane
84
- planes = model.forward_planes(images, input_cameras)
85
-
86
- mesh_path_idx = os.path.join(save_path, f'{name}.obj')
87
-
88
- mesh_out = model.extract_mesh(
89
- planes,
90
- use_texture_map=export_texmap,
91
- **infer_config,
92
- )
93
- if export_texmap:
94
- vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
95
- save_obj_with_mtl(
96
- vertices.data.cpu().numpy(),
97
- uvs.data.cpu().numpy(),
98
- faces.data.cpu().numpy(),
99
- mesh_tex_idx.data.cpu().numpy(),
100
- tex_map.permute(1, 2, 0).data.cpu().numpy(),
101
- mesh_path_idx,
102
- )
103
- else:
104
- vertices, faces, vertex_colors = mesh_out
105
- save_obj(vertices, faces, vertex_colors, mesh_path_idx)
106
- print(f"Mesh saved to {mesh_path_idx}")
107
-
108
- render_size = 512
109
- if if_save_video:
110
- video_path_idx = os.path.join(save_path, f'{name}.mp4')
111
- render_size = infer_config.render_resolution
112
- ENV = load_mipmap("models/lrm/env_mipmap/6")
113
- materials = (0.0,0.9)
114
-
115
- all_mv, all_mvp, all_campos = get_render_cameras_video(
116
- batch_size=1,
117
- M=240,
118
- radius=4.5,
119
- elevation=(90, 60.0),
120
- is_flexicubes=True,
121
- fov=30
122
- )
123
-
124
- frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
125
- model,
126
- planes,
127
- render_cameras=all_mvp,
128
- camera_pos=all_campos,
129
- env=ENV,
130
- materials=materials,
131
- render_size=render_size,
132
- chunk_size=20,
133
- is_flexicubes=True,
134
- )
135
- normals = (torch.nn.functional.normalize(normals) + 1) / 2
136
- normals = normals * alphas + (1-alphas)
137
- all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)
138
-
139
- save_video(
140
- all_frames,
141
- video_path_idx,
142
- fps=30,
143
- )
144
- print(f"Video saved to {video_path_idx}")
145
-
146
- return vertices, faces
147
-
148
-
149
- def local_normal_global_transform(local_normal_images, azimuths_deg, elevations_deg):
150
- if local_normal_images.min() >= 0:
151
- local_normal = local_normal_images.float() * 2 - 1
152
- else:
153
- local_normal = local_normal_images.float()
154
- global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
155
- global_normal[...,0] *= -1
156
- global_normal = (global_normal + 1) / 2
157
- global_normal = global_normal.permute(0, 3, 1, 2)
158
- return global_normal
159
-
160
- def main(prompt = "a owl wearing a hat."):
161
- fix_prompt = 'a grid of 2x4 multi-view image. elevation 5. white background.'
162
- # user prompt
163
-
164
- save_dir_path = os.path.join(save_dir, prompt.split(".")[0].replace(" ", "_"))
165
- os.makedirs(save_dir_path, exist_ok=True)
166
- prompt = fix_prompt+" "+prompt
167
- # generate multi-view images
168
- rgb_normal_grid = multi_view_rgb_normal_generation(prompt)
169
- # lrm reconstructions
170
- images = torch.from_numpy(rgb_normal_grid).squeeze(0).permute(2, 0, 1).contiguous().float() # (3, 1024, 2048)
171
- images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=2, m=4) # (8, 3, 512, 512)
172
- rgb_multi_view = images[:4, :3, :, :]
173
- normal_multi_view = images[4:, :3, :, :]
174
- multi_view_mask = get_background(normal_multi_view)
175
- rgb_multi_view = rgb_multi_view * rgb_multi_view + (1-multi_view_mask)
176
- input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device)
177
- vertices, faces = lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm', export_texmap=False, if_save_video=False)
178
- # local normal to global normal
179
-
180
- global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1), isomer_azimuths, isomer_elevations)
181
- global_normal = global_normal * multi_view_mask + (1-multi_view_mask)
182
-
183
- global_normal = global_normal.permute(0,2,3,1)
184
- rgb_multi_view = rgb_multi_view.permute(0,2,3,1)
185
- multi_view_mask = multi_view_mask.permute(0,2,3,1).squeeze(-1)
186
- vertices = torch.from_numpy(vertices).to(device)
187
- faces = torch.from_numpy(faces).to(device)
188
- vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
189
- vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]
190
-
191
- # global_normal: B,H,W,3
192
- # multi_view_mask: B,H,W
193
- # rgb_multi_view: B,H,W,3
194
-
195
- multi_view_mask_proj = mask_fix(multi_view_mask, erode_dilate=-6, blur=5)
196
-
197
- meshes = reconstruction(
198
- normal_pils=global_normal,
199
- masks=multi_view_mask,
200
- weights=isomer_geo_weights,
201
- fov=30,
202
- radius=isomer_radius,
203
- camera_angles_azi=isomer_azimuths,
204
- camera_angles_ele=isomer_elevations,
205
- expansion_weight_stage1=0.1,
206
- init_type="file",
207
- init_verts=vertices,
208
- init_faces=faces,
209
- stage1_steps=0,
210
- stage2_steps=50,
211
- start_edge_len_stage1=0.1,
212
- end_edge_len_stage1=0.02,
213
- start_edge_len_stage2=0.02,
214
- end_edge_len_stage2=0.005,
215
- )
216
-
217
-
218
- multi_view_mask_proj = mask_fix(multi_view_mask, erode_dilate=-10, blur=5)
219
-
220
- save_glb_addr = projection(
221
- meshes,
222
- masks=multi_view_mask_proj,
223
- images=rgb_multi_view,
224
- azimuths=isomer_azimuths,
225
- elevations=isomer_elevations,
226
- weights=isomer_color_weights,
227
- fov=30,
228
- radius=isomer_radius,
229
- save_dir=f"{save_dir_path}/ISOMER/",
230
- )
231
- print(f'saved to {save_glb_addr}')
232
-
233
-
234
-
235
- if __name__ == '__main__':
236
- import time
237
- start_time = time.time()
238
- prompts = ["A red dragon soaring", "A running Chihuahua", "A dancing rabbit", "A girl with blue hair and white dress", "A teacher", "A tiger playing guitar", "A red rose", "A red peony", "A rose in a vase", "A golden retriever sitting", "A golden retriever running"]
239
- for prompt in prompts:
240
- main(prompt)
241
- end_time = time.time()
242
- print(f"Time taken: {end_time - start_time:.2f} seconds for {len(prompts)} prompts")
243
-
244
- breakpoint()
 
upload_huggingface.py DELETED
@@ -1,57 +0,0 @@
1
- from huggingface_hub import HfApi, HfFolder, Repository, create_repo, upload_file
2
- import os
3
-
4
- # Log in to Hugging Face
5
- from huggingface_hub import login
6
- login()
7
-
8
- # Create or point to an existing repository
9
- repo_name = "xxx-ckpt"
10
- username = "LTT"
11
- repo_id = f"{username}/{repo_name}"
12
-
13
- # Create the repo if it does not already exist
14
- create_repo(repo_id, exist_ok=True)
15
-
16
- # Folders
17
- # Upload an entire folder
18
- def upload_folder(folder_path, repo_id):
19
- """
20
- Recursively upload a folder and its contents to the Hugging Face repo.
21
- """
22
- for root, _, files in os.walk(folder_path):
23
- for file in files:
24
- # full path of the file
25
- full_file_path = os.path.join(root, file)
26
- # path relative to the folder (preserves the folder structure)
27
- relative_path = os.path.relpath(full_file_path, folder_path)
28
-
29
- # upload the file to the repo
30
- print(f"Uploading {relative_path}...")
31
- upload_file(
32
- path_or_fileobj=full_file_path,
33
- path_in_repo=relative_path,
34
- repo_id=repo_id
35
- )
36
- print(f"Uploaded {relative_path} successfully.")
37
-
38
-
39
- # Upload the model checkpoint
40
- model_path = "checkpoint/zero123++/flexgen_19w.ckpt"
41
- upload_file(path_or_fileobj=model_path, path_in_repo="flexgen_19w.ckpt", repo_id=repo_id)
42
-
43
- # # Upload data files
44
- # data_path = "/hpc2hdd/home/jlin695/data/env_map/data/env_mipmap_large.tar.gz"
45
- # upload_file(path_or_fileobj=data_path, path_in_repo="env_mipmap_large.tar.gz", repo_id=repo_id)
46
-
47
- # # Upload data files
48
- # data_path = "/hpc2hdd/home/jlin695/data/env_map/data/env_map_light_large.tar.gz"
49
- # upload_file(path_or_fileobj=data_path, path_in_repo="env_map_light_large.tar.gz", repo_id=repo_id)
50
-
51
- # # Folder path to upload
52
- # folder_path = "checkpoint/flux_lora"
53
-
54
- # # Call the folder-upload helper
55
- # upload_folder(folder_path, repo_id)
56
-
57
- # print("Model and data files have been uploaded to Hugging Face.")
 
video_render.py ADDED
@@ -0,0 +1,123 @@
1
+ import os
2
+ import math
3
+ import numpy as np
4
+ import imageio
5
+ import trimesh
6
+ import pyrender
7
+ from tqdm import tqdm
8
+ # os.environ["CUDA_VISIBLE_DEVICES"] = "7"
9
+ os.environ['PYOPENGL_PLATFORM'] = 'egl' # use EGL for headless (offscreen) rendering
10
+
11
+ def render_video_from_obj(input_obj_path, output_video_path, fps=15, frame_count=60, resolution=(512, 512)):
12
+ """
13
+ Render a rotating 3D model (OBJ file) to a video with RGB and normal map side-by-side.
14
+
15
+ Args:
16
+ input_obj_path (str): Path to the input OBJ file.
17
+ output_video_path (str): Path to save the output video.
18
+ fps (int): Frames per second for the video.
19
+ frame_count (int): Number of frames in the video.
20
+ resolution (tuple): Resolution of the rendered video (width, height).
21
+
22
+ Returns:
23
+ str: Path to the output video.
24
+ """
25
+ # check that the input file exists
26
+ if not os.path.exists(input_obj_path):
27
+ raise FileNotFoundError(f"Input OBJ file not found: {input_obj_path}")
28
+
29
+ # load the 3D model
30
+ scene_data = trimesh.load(input_obj_path)
31
+
32
+ # extract or merge the meshes in the scene
33
+ if isinstance(scene_data, trimesh.Scene):
34
+ mesh_data = trimesh.util.concatenate([geom for geom in scene_data.geometry.values()])
35
+ else:
36
+ mesh_data = scene_data
37
+
38
+ # make sure vertex normals are available
39
+ if not hasattr(mesh_data, 'vertex_normals') or mesh_data.vertex_normals is None:
40
+ mesh_data.compute_vertex_normals()
41
+
42
+ # create the Pyrender scene and set a white background
43
+ render_scene = pyrender.Scene(bg_color=[1.0, 1.0, 1.0])
44
+ mesh = pyrender.Mesh.from_trimesh(mesh_data, smooth=True)
45
+ mesh_node = render_scene.add(mesh)
46
+
47
+ # set up the camera
48
+ camera = pyrender.PerspectiveCamera(yfov=np.deg2rad(30), znear=0.0001, zfar=100000.0)
49
+ camera_pose = np.eye(4)
50
+ camera_pose[2, 3] = 4.0 # place the camera 4 units away from the model
51
+ render_scene.add(camera, pose=camera_pose)
52
+
53
+ # add global ambient light
54
+ ambient_light = np.array([1.0, 1.0, 1.0]) * 2.0
55
+ render_scene.ambient_light = ambient_light
56
+
57
+ # prepare the data for normal-map rendering
58
+ normals = mesh_data.vertex_normals.copy()
59
+
60
+ # map normals into the [0, 255] color range
61
+ normal_colors = ((normals + 1) / 2 * 255)
62
+
63
+ # create a separate mesh for normal rendering
64
+ normal_mesh_data = mesh_data.copy()
65
+ normal_mesh_data.visual.vertex_colors = np.hstack(
66
+ [normal_colors, np.full((normals.shape[0], 1), 255, dtype=np.uint8)] # append an alpha channel
67
+ )
68
+
69
+ # build the normal rendering scene
70
+ normal_scene = pyrender.Scene(bg_color=[1.0, 1.0, 1.0, 1.0])
71
+ normal_mesh = pyrender.Mesh.from_trimesh(normal_mesh_data, smooth=True)
72
+ normal_mesh_node = normal_scene.add(normal_mesh)
73
+ normal_scene.add(camera, pose=camera_pose)
74
+ normal_scene.ambient_light = ambient_light
75
+
76
+ # initialize the offscreen renderer
77
+ r = pyrender.OffscreenRenderer(*resolution)
78
+
79
+ # create the video writer
80
+ writer = imageio.get_writer(output_video_path, fps=fps)
81
+
82
+ # render each frame
83
+ try:
84
+ for frame_idx in tqdm(range(frame_count)):
85
+ # compute the rotation angle for this frame
86
+ angle = 2 * np.pi * frame_idx / frame_count
87
+ rotation_matrix = np.array([
88
+ [math.cos(angle), 0, math.sin(angle), 0],
89
+ [0, 1, 0, 0],
90
+ [-math.sin(angle), 0, math.cos(angle), 0],
91
+ [0, 0, 0, 1]
92
+ ])
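+ # a pure rotation about the world Y axis: the mesh spins turntable-style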
93
+
94
+ # update the model pose
95
+ render_scene.set_pose(mesh_node, rotation_matrix)
96
+
97
+ # render the RGB image
98
+ color, _ = r.render(render_scene)
99
+
100
+ # update the pose in the normal scene
101
+ normal_scene.set_pose(normal_mesh_node, rotation_matrix)
102
+
103
+ # render the normal-map image
104
+ normal, _ = r.render(normal_scene, flags=pyrender.RenderFlags.FLAT)
105
+
106
+ # concatenate the RGB and normal images side by side
107
+ combined_frame = np.concatenate((color, normal), axis=1)
108
+
109
+ # write the video frame
110
+ writer.append_data(combined_frame)
111
+ finally:
112
+ # release resources
113
+ writer.close()
114
+ r.delete()
115
+
116
+ print(f"Rendered video saved to {output_video_path}")
117
+ return output_video_path
118
+
119
+ if __name__ == '__main__':
120
+ # example invocation
121
+ input_obj_path = "output/gradio_cache/text_3D/_超级赛亚人_10/rgb_projected.obj"
122
+ output_video_path = "output.mp4"
123
+ render_video_from_obj(input_obj_path, output_video_path)