File size: 9,596 Bytes
b572032
 
 
 
 
 
 
 
 
 
ba0a533
f0396f5
96e3b91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b572032
 
 
 
 
 
 
 
 
 
 
e14295e
ba0a533
8464784
b572032
 
 
 
 
 
 
581c8b5
b572032
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581c8b5
b572032
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f0396f5
 
 
e14295e
 
b572032
 
 
f0396f5
b572032
 
 
 
 
8fe4ab0
b572032
806a45c
 
 
 
 
b572032
806a45c
 
b572032
806a45c
 
 
f0396f5
b572032
 
 
5d85282
2c1f22d
b572032
 
 
 
ba0a533
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import spaces
import os
import tempfile
from typing import Any
import torch
import numpy as np
from PIL import Image
import gradio as gr
import trimesh
from transparent_background import Remover
from pathlib import Path
import uuid
import subprocess

def install_cuda_toolkit():
    """Download and run the CUDA 12.2 toolkit installer, then export CUDA env vars.

    The installer is fetched with ``wget`` into ``/tmp``, made executable and run
    with ``--silent --toolkit``. Exit codes of these commands are not inspected.
    Afterwards ``CUDA_HOME``, ``PATH``, ``LD_LIBRARY_PATH`` and
    ``TORCH_CUDA_ARCH_LIST`` are set in ``os.environ`` for this process.
    """
    # Previously used installer, kept for reference:
    # https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
    toolkit_url = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
    installer_path = f"/tmp/{os.path.basename(toolkit_url)}"

    # Download, mark executable, then run the installer, in that order.
    install_steps = (
        ["wget", "-q", toolkit_url, "-O", installer_path],
        ["chmod", "+x", installer_path],
        [installer_path, "--silent", "--toolkit"],
    )
    for step in install_steps:
        subprocess.call(step)

    cuda_home = "/usr/local/cuda"
    os.environ["CUDA_HOME"] = cuda_home
    os.environ["PATH"] = f"{cuda_home}/bin:{os.environ['PATH']}"
    # An unset LD_LIBRARY_PATH contributes an empty trailing entry.
    os.environ["LD_LIBRARY_PATH"] = f"{cuda_home}/lib:{os.environ.get('LD_LIBRARY_PATH', '')}"
    # Pinning the arch list works around:
    #   arch_list[-1] += '+PTX'; IndexError: list index out of range
    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"

# Runs at import time. NOTE(review): presumably this must happen before the pip
# build below so the CUDA extensions can be compiled — confirm.
install_cuda_toolkit()

# Import and setup SPAR3D
# Builds and installs the local texture_baker and uv_unwrapper packages with
# USE_CUDA=1 set for the pip process. The exit status of os.system is ignored.
os.system("USE_CUDA=1 pip install -vv --no-build-isolation ./texture_baker ./uv_unwrapper")
# NOTE(review): these imports sit below the install steps instead of at the top
# of the file — presumably spar3d depends on the packages built above; confirm.
import spar3d.utils as spar3d_utils
from spar3d.system import SPAR3D

# Constants
# Size of the conditioning image; create_batch resizes its input to this size.
COND_WIDTH = 512
COND_HEIGHT = 512
# Passed to spar3d_utils.default_cond_c2w below to build the conditioning camera.
COND_DISTANCE = 2.2
# Field of view, in radians judging by create_intrinsic_from_fov_rad below.
COND_FOVY = 0.591627
# Color that create_batch blends behind the image wherever the mask is 0.
BACKGROUND_COLOR = [0.5, 0.5, 0.5]
# Directory that receives the exported .glb files; created at import time.
OUTPUT_DIR = "./output"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# NOTE(review): HF_TOKEN is read here but not referenced in the visible code.
HF_TOKEN = os.getenv("HF_TOKEN", None)

# Initialize models
# Everything below runs once at import time.
device = spar3d_utils.get_device()
# Background remover used by generate_and_process_3d.
bg_remover = Remover()
# Load the pretrained SPAR3D system, switch it to eval mode and move it to `device`.
spar3d_model = SPAR3D.from_pretrained(
    "stabilityai/stable-point-aware-3d",
    config_name="config.yaml",
    weight_name="model.safetensors",
).eval().to(device)

# Initialize camera parameters
# Shared conditioning camera; create_batch adds a leading batch dimension to each.
c2w_cond = spar3d_utils.default_cond_c2w(COND_DISTANCE)
intrinsic, intrinsic_normed_cond = spar3d_utils.create_intrinsic_from_fov_rad(
    COND_FOVY, COND_HEIGHT, COND_WIDTH
)

def create_rgba_image(rgb_image: Image.Image, mask: np.ndarray | None = None) -> Image.Image:
    """Return an RGBA copy of ``rgb_image``, optionally using ``mask`` as alpha.

    Args:
        rgb_image: Source image. It is converted to RGBA; the input object is
            not modified.
        mask: Optional opacity mask, expected to hold values in ``[0, 1]``
            (0 = transparent, 1 = opaque). Arrays with more than two
            dimensions are squeezed before use. Scaled values outside
            ``[0, 255]`` are clamped.

    Returns:
        The RGBA image. Its alpha channel is replaced by the scaled mask when
        ``mask`` is given, otherwise it is whatever ``convert('RGBA')`` yields.
    """
    rgba_image = rgb_image.convert('RGBA')
    if mask is None:
        return rgba_image

    # Ensure mask is 2D before converting to alpha
    if mask.ndim > 2:
        mask = mask.squeeze()

    # Clamp before the uint8 cast: out-of-range values (e.g. a slightly
    # negative float) would otherwise wrap around into arbitrary alpha levels.
    alpha_values = np.clip(mask * 255, 0, 255).astype(np.uint8)
    rgba_image.putalpha(Image.fromarray(alpha_values))
    return rgba_image

def create_batch(input_image: Image.Image) -> dict[str, Any]:
    """Prepare the single-image conditioning batch for the model.

    The image is resized to ``COND_WIDTH`` x ``COND_HEIGHT``, scaled to
    ``[0, 1]`` and blended over ``BACKGROUND_COLOR`` using its alpha channel
    (a fully opaque mask is used when the image has no alpha).

    Args:
        input_image: Image to condition on. Modes other than RGB/RGBA are
            converted first (to RGBA when the image carries transparency,
            otherwise to RGB).

    Returns:
        A dict of tensors, each with a leading batch dimension of 1:
        ``rgb_cond``, ``mask_cond``, ``c2w_cond``, ``intrinsic_cond`` and
        ``intrinsic_normed_cond``.
    """
    # Normalise the mode up front. Without this a CMYK image (4 channels) would
    # be mistaken for RGBA, and 2-D modes such as L or P would yield arrays
    # without a channel axis and break the blend below.
    if input_image.mode not in ("RGB", "RGBA"):
        has_alpha = (
            "A" in input_image.getbands()
            or "transparency" in input_image.info
        )
        input_image = input_image.convert("RGBA" if has_alpha else "RGB")

    # Resize and convert input image to a float array in [0, 1]
    resized_image = input_image.resize((COND_WIDTH, COND_HEIGHT))
    img_array = np.array(resized_image).astype(np.float32) / 255.0

    # Extract RGB and alpha channels
    if img_array.shape[-1] == 4:  # RGBA
        rgb = img_array[..., :3]
        mask = img_array[..., 3:4]
    else:  # RGB: treat every pixel as foreground
        rgb = img_array
        mask = np.ones((*img_array.shape[:2], 1), dtype=np.float32)

    # Convert to tensors while keeping channel-last format
    rgb = torch.from_numpy(rgb)  # [H, W, 3]
    mask = torch.from_numpy(mask)  # [H, W, 1]

    # Blend RGB with the background using the mask (all channel-last)
    bg_tensor = torch.tensor(BACKGROUND_COLOR).view(1, 1, 3)  # [1, 1, 3]
    rgb_cond = torch.lerp(bg_tensor, rgb, mask)  # [H, W, 3]

    # Important: For SPAR3D image tokenizer, we need [B, H, W, C] format,
    # so only a batch dimension is added; channels stay last.
    batch = {
        "rgb_cond": rgb_cond.unsqueeze(0),  # [1, H, W, 3]
        "mask_cond": mask.unsqueeze(0),  # [1, H, W, 1]
        "c2w_cond": c2w_cond.unsqueeze(0),  # [1, 4, 4]
        "intrinsic_cond": intrinsic.unsqueeze(0),  # [1, 3, 3]
        "intrinsic_normed_cond": intrinsic_normed_cond.unsqueeze(0),  # [1, 3, 3]
    }

    for k, v in batch.items():
        print(f"[debug] {k} final shape:", v.shape)

    return batch

def forward_model(batch, system, guidance_scale=3.0, seed=0, device="cuda"):
    """Run the point-diffusion stage and return a conditioning point cloud.

    Args:
        batch: Dict of conditioning tensors; ``batch["rgb_cond"]`` must have a
            leading batch dimension of exactly 1.
        system: Model object providing ``forward_pdiff_cond`` and
            ``sampler.sample_batch_progressive``.
        guidance_scale: Guidance scale forwarded to the sampler.
        seed: Seed applied to the torch RNG so that sampling and the final
            point subsampling are reproducible for a given seed.
        device: Device forwarded to the sampler.

    Returns:
        The final sampled point cloud, permuted with ``permute(0, 2, 1)``,
        normalized with ``normalize_pc_bbox`` and randomly subsampled to at
        most 512 points along dimension 1.

    Raises:
        ValueError: If the batch size is not 1.
        RuntimeError: If the sampler yields no samples.
    """
    batch_size = batch["rgb_cond"].shape[0]
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if batch_size != 1:
        raise ValueError(f"Expected batch size 1, got {batch_size}")

    # `seed` used to be accepted but never applied; seed torch so the sampler
    # and the randperm below are deterministic for a given seed.
    torch.manual_seed(int(seed))

    # Generate point cloud tokens
    try:
        cond_tokens = system.forward_pdiff_cond(batch)
    except Exception as e:
        # Dump the input tensor properties to help diagnose the failure,
        # then let the original exception propagate.
        print("\n[ERROR] Failed in forward_pdiff_cond:")
        print(e)
        print("\nInput tensor properties:")
        print("rgb_cond dtype:", batch["rgb_cond"].dtype)
        print("rgb_cond device:", batch["rgb_cond"].device)
        print("rgb_cond requires_grad:", batch["rgb_cond"].requires_grad)
        raise

    # Sample points
    sample_iter = system.sampler.sample_batch_progressive(
        batch_size,
        cond_tokens,
        guidance_scale=guidance_scale,
        device=device
    )

    # The sampler is progressive: only the last yielded "xstart" is kept.
    samples = None
    for x in sample_iter:
        samples = x["xstart"]
    if samples is None:
        raise RuntimeError("Sampler produced no samples")

    pc_cond = samples.permute(0, 2, 1).float()

    # Normalize point cloud
    pc_cond = spar3d_utils.normalize_pc_bbox(pc_cond)

    # Subsample to 512 points
    pc_cond = pc_cond[:, torch.randperm(pc_cond.shape[1])[:512]]

    return pc_cond

@spaces.GPU
@torch.inference_mode()
def generate_and_process_3d(image: Image.Image) -> tuple[str | None, str | None]:
    """Convert an input image into a textured GLB mesh.

    Steps: remove the background, crop around the foreground, build the
    conditioning batch, sample a point cloud, generate the mesh and export it
    as a ``.glb`` file under ``OUTPUT_DIR``.

    Args:
        image: The uploaded input image.

    Returns:
        ``(output_path, public_url)`` for the exported GLB file, or
        ``(None, None)`` if any step fails (the error and traceback are
        printed instead of being raised).
    """
    # Generate random seed
    seed = np.random.randint(0, np.iinfo(np.int32).max)

    try:
        rgb_image = image.convert('RGB')

        # bg_remover returns a PIL Image already, no need to convert
        no_bg_image = bg_remover.process(rgb_image)
        print(f"[debug] no_bg_image type: {type(no_bg_image)}, mode: {no_bg_image.mode}")

        # Convert to RGBA if not already
        rgba_image = no_bg_image.convert('RGBA')
        print(f"[debug] rgba_image mode: {rgba_image.mode}")

        processed_image = spar3d_utils.foreground_crop(
            rgba_image,
            crop_ratio=1.3,
            newsize=(COND_WIDTH, COND_HEIGHT),
            no_crop=False
        )

        # Show the processed image alpha channel for debugging
        alpha = np.array(processed_image)[:, :, 3]
        print(f"[debug] Alpha channel stats - min: {alpha.min()}, max: {alpha.max()}, unique: {np.unique(alpha)}")

        # Prepare batch for processing
        batch = create_batch(processed_image)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Generate point cloud
        pc_cond = forward_model(
            batch,
            spar3d_model,
            guidance_scale=3.0,
            seed=seed,
            device=device
        )
        batch["pc_cond"] = pc_cond

        # Generate mesh under bfloat16 autocast
        autocast_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        with torch.no_grad(), torch.autocast(device_type=autocast_device, dtype=torch.bfloat16):
            trimesh_mesh, _ = spar3d_model.generate_mesh(
                batch,
                1024,  # texture_resolution
                remesh="none",
                vertex_count=-1,
                estimate_illumination=True
            )
            # generate_mesh returns one mesh per batch item; the batch size is 1.
            trimesh_mesh = trimesh_mesh[0]

        # Export to GLB
        unique_id = str(uuid.uuid4())
        filename = f'model_{unique_id}.glb'
        output_path = os.path.join(OUTPUT_DIR, filename)
        trimesh_mesh.export(output_path, file_type="glb", include_normals=True)

        # Build the public URL only once the file exists. The host comes from
        # the SPACE_HOST variable that Hugging Face Spaces sets, so duplicated
        # Spaces link to themselves; the original host is the fallback.
        space_host = os.getenv("SPACE_HOST", "john6666-image-to-3d-test.hf.space")
        public_url = f"https://{space_host}/gradio_api/file={Path(output_path).resolve()}"
        print(f"public_url: {public_url}")

        return output_path, public_url

    except Exception as e:
        # Top-level UI boundary: report the failure and return empty outputs
        # rather than raising.
        print(f"Error during generation: {e}")
        import traceback
        traceback.print_exc()
        return None, None

# Custom CSS passed to gr.Blocks: `.image` centers the input image and keeps its
# aspect ratio, `.info` centers the header text.
# `!important` must sit inside the declaration it modifies; a `;` before it
# turns it into a separate, invalid declaration that browsers discard.
css = """

.image { margin: 0px auto; object-fit: contain !important; }

.info { align-items: center; text-align: center; }

"""

# Build the Gradio Blocks UI: an info banner, an image upload, a 3D viewer for
# the generated model and a read-only textbox holding its public URL.
with gr.Blocks(
    theme=gr.themes.Soft(),
    fill_width=True,
    fill_height=True,
    elem_id="container",
    css=css,
) as demo:
    gr.Markdown(
        "This space is based on [Stable Point-Aware 3D](https://huggingface.co/spaces/stabilityai/stable-point-aware-3d) by Stability AI, [Text to 3D](https://huggingface.co/spaces/jbilcke-hf/text-to-3d) by jbilcke-hf.",
        elem_classes="info",
    )

    input_img = gr.Image(
        type="pil",
        label="Input Image",
        sources="upload",
        image_mode="RGBA",
        width=128,
        elem_classes="image",
    )
    model_output = gr.Model3D(
        label="Generated .GLB model",
        clear_color=[0.0, 0.0, 0.0, 0.0],
    )
    output_url = gr.Textbox(
        label="Output URL",
        value="",
        lines=1,
        interactive=False,
        visible=True,
        show_copy_button=True,
    )

    # Uploading an image triggers generation; also exposed as the "generate" API.
    input_img.upload(
        fn=generate_and_process_3d,
        inputs=[input_img],
        outputs=[model_output, output_url],
        api_name="generate",
    )

if __name__ == "__main__":
    # The output directory must be allow-listed so exported files can be served.
    served_dir = Path(OUTPUT_DIR).resolve()
    demo.queue().launch(ssr_mode=False, allowed_paths=[served_dir])