Commit 2529861 · Parent(s): febf487 · update

Files changed:
- app.py (+129, -88)
- demo_hf.py (+15, -12)
- gradio_util.py (+127, -129)
app.py
CHANGED
@@ -3,7 +3,6 @@ import cv2
 import torch
 import numpy as np
 import gradio as gr
-import spaces
 import sys
 import os
 import socket
@@ -11,42 +10,64 @@ import webbrowser
 sys.path.append('vggt/')
 import shutil
 from datetime import datetime
-from demo_hf import demo_fn
+from demo_hf import demo_fn #, initialize_model
 from omegaconf import DictConfig, OmegaConf
 import glob
 import gc
 import time
 from viser_fn import viser_wrapper
+from gradio_util import demo_predictions_to_glb
+from hydra.utils import instantiate
+import spaces
 
 
-def get_free_port():
-    """Get a free port using socket."""
-    # return 80
-    # return 8080
-    # return 10088 # for debugging
-    # return 7860
-    # return 7888
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(('', 0))
-        port = s.getsockname()[1]
-        return port
+# def get_free_port():
+#     """Get a free port using socket."""
+#     # return 80
+#     # return 8080
+#     # return 10088 # for debugging
+#     # return 7860
+#     # return 7888
+#     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+#         s.bind(('', 0))
+#         port = s.getsockname()[1]
+#         return port
 
 
+cfg_file = "config/base.yaml"
+cfg = OmegaConf.load(cfg_file)
+vggt_model = instantiate(cfg, _recursive_=False)
+_VGGT_URL = "https://huggingface.co/facebook/vggt_alpha/resolve/main/vggt_alpha_v0.pt"
+# Reload vggt_model
+pretrain_model = torch.hub.load_state_dict_from_url(_VGGT_URL)
+
+if "vggt_model" in pretrain_model:
+    model_dict = pretrain_model["vggt_model"]
+    vggt_model.load_state_dict(model_dict, strict=False)
+else:
+    vggt_model.load_state_dict(pretrain_model, strict=True)
 
 
+# @torch.inference_mode()
 @spaces.GPU(duration=240)
 def vggt_demo(
     input_video,
     input_image,
+    conf_thres=3.0,
+    frame_filter="all",
+    mask_black_bg=False,
 ):
     start_time = time.time()
     gc.collect()
     torch.cuda.empty_cache()
 
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
     target_dir = f"input_images_{timestamp}"
     if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
@@ -65,9 +86,6 @@ def vggt_demo(
 
     if input_image is not None:
         input_image = sorted(input_image)
-        # recon_num = len(input_image)
-
-        # Copy files to the new directory
         for file_name in input_image:
             shutil.copy(file_name, target_dir_images)
     elif input_video is not None:
@@ -90,26 +108,37 @@ def vggt_demo(
 
             if count % frame_interval == 0:
                 cv2.imwrite(target_dir_images+"/"+f"{video_frame_num:06}.png", frame)
-                video_frame_num+=1
-
-        # recon_num = video_frame_num
-        # if recon_num<3:
-        #     return None, "Please input at least three frames"
+                video_frame_num+=1
     else:
-        return None, "Uploading not finished or Incorrect input format"
-
+        return None, "Uploading not finished or Incorrect input format", None, None
+
+    all_files = sorted(os.listdir(target_dir_images))
+    all_files = [f"{i}: {filename}" for i, filename in enumerate(all_files)]
+
+    # Update frame_filter choices
+    frame_filter_choices = ["All"] + all_files
+
     print(f"Files have been copied to {target_dir_images}")
     cfg.SCENE_DIR = target_dir
 
+    print("Running demo_fn")
+    with torch.no_grad():
+        predictions = demo_fn(cfg, vggt_model)
+    predictions["pred_extrinsic_list"] = None
+    print("Saving predictions")
+
+    prediction_save_path = f"{target_dir}/predictions.npz"
+    np.savez(prediction_save_path, **predictions)
+
+    glbfile = target_dir + f"/glbscene_{conf_thres}_{frame_filter.replace('.', '_')}_mask{mask_black_bg}.glb"
+
+    glbscene = demo_predictions_to_glb(predictions, conf_thres=conf_thres, filter_by_frames=frame_filter, mask_black_bg=mask_black_bg)
+    glbscene.export(file_obj=glbfile)
 
     del predictions
     gc.collect()
     torch.cuda.empty_cache()
@@ -120,10 +149,31 @@ def vggt_demo(
     execution_time = end_time - start_time
     print(f"Execution time: {execution_time} seconds")
 
-    # Return None for the 3D
+    # Return None for the 3D vggt_model (since we're using viser) and the viser URL
     # viser_url = f"Viser visualization is ready at: http://localhost:{viser_port}"
     # print(viser_url) # Debug print
+    log = "Success. Waiting for visualization."
+    return glbfile, log, target_dir, gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True)
+
+
+def update_visualization(target_dir, conf_thres, frame_filter, mask_black_bg):
+    loaded = np.load(f"{target_dir}/predictions.npz", allow_pickle=True)
+    # predictions = np.load(f"{target_dir}/predictions.npz", allow_pickle=True)
+    # predictions["arr_0"]
+    # for key in predictions.files: print(key)
+    predictions = {key: loaded[key] for key in loaded.keys()}
+
+    glbfile = target_dir + f"/glbscene_{conf_thres}_{frame_filter.replace('.', '_')}_mask{mask_black_bg}.glb"
+
+    if not os.path.exists(glbfile):
+        glbscene = demo_predictions_to_glb(predictions, conf_thres=conf_thres, filter_by_frames=frame_filter, mask_black_bg=mask_black_bg)
+        glbscene.export(file_obj=glbfile)
+    return glbfile, "Updating Visualization", target_dir
 
 
@@ -177,8 +227,17 @@ with gr.Blocks() as demo:
     gr.Markdown("""
     # 🏛️ VGGT: Visual Geometry Grounded Transformer
 
-    <div style="font-size: 16px; line-height: 1.
-    Alpha version (
+    <div style="font-size: 16px; line-height: 1.5;">
+    <p><strong>Alpha version</strong> (under active development)</p>
+
+    <p>Upload a video or images to create a 3D reconstruction. Once your media appears in the left panel, click the "Reconstruct" button to begin processing.</p>
+
+    <h3>Usage Tips:</h3>
+    <ol>
+    <li>After reconstruction, you can fine-tune the visualization by adjusting the confidence threshold or selecting specific frames to display, then click "Update Visualization".</li>
+    <li>Performance note: While the model itself processes quickly (~0.2 seconds), initial setup and visualization may take longer. First-time use requires downloading model weights, and rendering dense point clouds can be resource-intensive.</li>
+    <li>Known limitation: The model currently exhibits inconsistent behavior with videos centered around human subjects. This issue is being addressed in upcoming updates.</li>
+    </ol>
     </div>
     """)
 
@@ -186,87 +245,69 @@ with gr.Blocks() as demo:
     with gr.Column(scale=1):
         input_video = gr.Video(label="Upload Video", interactive=True)
         input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
-
-
+
     with gr.Column(scale=3):
+        with gr.Column():
+            gr.Markdown("**3D Reconstruction (Point Cloud and Camera Poses; Zoom in to see details)**")
+            reconstruction_output = gr.Model3D(height=520, zoom_speed=0.5, pan_speed=0.5)
+            # reconstruction_output = gr.Model3D(label="3D Reconstruction (Point Cloud and Camera Poses; Zoom in to see details)", height=520, zoom_speed=0.5, pan_speed=0.5)
+
+        # Move these controls to a new row above the log output
+        with gr.Row():
+            conf_thres = gr.Slider(minimum=0.1, maximum=20.0, value=3.0, step=0.1, label="Conf Thres")
+            frame_filter = gr.Dropdown(choices=["All"], value="All", label="Show Points from Frame")
+            mask_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
+
         log_output = gr.Textbox(label="Log")
+        # Add a hidden textbox for target_dir
+        target_dir_output = gr.Textbox(label="Target Dir", visible=False)
 
     with gr.Row():
         submit_btn = gr.Button("Reconstruct", scale=1)
-
+        revisual_btn = gr.Button("Update Visualization", scale=1)
+        clear_btn = gr.ClearButton([input_video, input_images, reconstruction_output, log_output, target_dir_output], scale=1) # Modified reconstruction_output
 
 
 
     examples = [
-        [
-        [
+        [counter_video, counter_images, 1.5, "All", False],
+        [flower_video, flower_images, 1.5, "All", False],
+        [kitchen_video, kitchen_images, 3, "All", False],
+        [fern_video, fern_images, 1.5, "All", False],
        # [person_video, person_images],
        # [statue_video, statue_images],
        # [drums_video, drums_images],
-        [
-        [fern_video, fern_images],
-        [horns_video, horns_images],
+        # [horns_video, horns_images, 1.5, "All", False],
        # [apple_video, apple_images],
        # [bonsai_video, bonsai_images],
     ]
 
-    def process_example(video, images):
-        """Wrapper function to ensure outputs are properly captured"""
-        model_output, log = vggt_demo(video, images)
-
-        # viser_wrapper(predictions, port=log)
-        # Get the hostname - use the actual hostname or IP where the server is running
-        # hostname = socket.gethostname()
-
-        # Extract port from log
-        port = log
-
-        # Create the viser URL using the hostname
-        # viser_url = f"http://{hostname}:{port}"
-
-        viser_url = f"http://localhost:{log}"
-        print(f"Viser URL: {viser_url}")
-
-        # Create the iframe HTML code. Set width and height appropriately.
-        iframe_code = f'<iframe src="{viser_url}" width="100%" height="520px"></iframe>'
-
-        # Return the iframe code to update the gr.HTML component
-        return iframe_code, f"Visualization running at {viser_url}"
-
-
-    # TODO: move the selection of port outside of the demo function
-    # so that we can cache examples
-
     gr.Examples(examples=examples,
-                inputs=[input_video, input_images],
-                outputs=[
-                fn=
+                inputs=[input_video, input_images, conf_thres, frame_filter, mask_black_bg],
+                outputs=[reconstruction_output, log_output, target_dir_output, frame_filter], # Added frame_filter
+                fn=vggt_demo, # Use our wrapper function
                 cache_examples=False,
                 examples_per_page=50,
    )
 
    submit_btn.click(
-
-        [input_video, input_images],
-        [
+        vggt_demo, # Use the same wrapper function
+        [input_video, input_images, conf_thres, frame_filter, mask_black_bg],
+        [reconstruction_output, log_output, target_dir_output, frame_filter], # Added frame_filter to outputs
        # concurrency_limit=1
    )
+
+   revisual_btn.click(
+       update_visualization,
+       [target_dir_output, conf_thres, frame_filter, mask_black_bg],
+       [reconstruction_output, log_output, target_dir_output],
+   )
 
 # demo.launch(debug=True, share=True)
 # demo.launch(server_name="0.0.0.0", server_port=8082, debug=True, share=False)
 # demo.queue(max_size=20).launch(show_error=True, share=True)
 demo.queue(max_size=20).launch(show_error=True) #, share=True, server_port=7888, server_name="0.0.0.0")
+# share=True
 # demo.queue(max_size=20, concurrency_count=1).launch(debug=True, share=True)
 ########################################################################################################################
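For reference, a minimal sketch (not part of the commit) of the two-step flow app.py now implements: "Reconstruct" runs the model once and caches predictions.npz under target_dir, while "Update Visualization" only re-filters the cached predictions into a new GLB. The directory name below is a placeholder; the helper simply mirrors update_visualization above.

# Hedged sketch of the new caching flow, usable outside the Gradio UI.
import os
import numpy as np
from gradio_util import demo_predictions_to_glb

def rebuild_glb(target_dir, conf_thres=3.0, frame_filter="All", mask_black_bg=False):
    # Load the predictions cached by vggt_demo() after the "Reconstruct" step.
    loaded = np.load(f"{target_dir}/predictions.npz", allow_pickle=True)
    predictions = {key: loaded[key] for key in loaded.keys()}

    # Same file-naming scheme as app.py, so repeated settings reuse an existing GLB.
    glbfile = f"{target_dir}/glbscene_{conf_thres}_{frame_filter.replace('.', '_')}_mask{mask_black_bg}.glb"
    if not os.path.exists(glbfile):
        scene = demo_predictions_to_glb(
            predictions,
            conf_thres=conf_thres,
            filter_by_frames=frame_filter,
            mask_black_bg=mask_black_bg,
        )
        scene.export(file_obj=glbfile)
    return glbfile

# Example: tighten the confidence threshold without rerunning the model.
# glb_path = rebuild_glb("input_images_20250101_120000_000000", conf_thres=5.0)  # placeholder dir

Note that frame_filter is wired as both an input and an output of submit_btn.click, so the dropdown is repopulated with the per-frame choices ("i: filename") returned by vggt_demo.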
demo_hf.py
CHANGED
@@ -11,26 +11,29 @@ from viser_fn import viser_wrapper
 
 
 # @hydra.main(config_path="config", config_name="base")
-def demo_fn(cfg: DictConfig) -> None:
-    print(cfg)
-    model = instantiate(cfg, _recursive_=False)
+def demo_fn(cfg: DictConfig, model) -> None:
+    print(cfg.SCENE_DIR)
 
     if not torch.cuda.is_available():
         raise ValueError("CUDA is not available. Check your environment.")
 
-
+    if torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+
     model = model.to(device)
 
-    _VGGT_URL = "https://huggingface.co/facebook/vggt_alpha/resolve/main/vggt_alpha_v0.pt"
+    # _VGGT_URL = "https://huggingface.co/facebook/vggt_alpha/resolve/main/vggt_alpha_v0.pt"
 
-    # Reload model
-    pretrain_model = torch.hub.load_state_dict_from_url(_VGGT_URL)
+    # # Reload model
+    # pretrain_model = torch.hub.load_state_dict_from_url(_VGGT_URL)
 
-    if "model" in pretrain_model:
-        model_dict = pretrain_model["model"]
-        model.load_state_dict(model_dict, strict=False)
-    else:
-        model.load_state_dict(pretrain_model, strict=True)
+    # if "model" in pretrain_model:
+    #     model_dict = pretrain_model["model"]
+    #     model.load_state_dict(model_dict, strict=False)
+    # else:
+    #     model.load_state_dict(pretrain_model, strict=True)
 
 
     # batch = torch.load("/fsx-repligen/jianyuan/cvpr2025_ckpts/batch.pth")
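A hedged sketch of the calling convention after this change, assuming the config and checkpoint locations shown in app.py above: demo_fn no longer instantiates or loads the model itself; the caller builds it once at startup and passes it in, so repeated Gradio requests reuse the same weights.

# Sketch only; mirrors the model-loading block added to app.py in this commit.
import torch
from omegaconf import OmegaConf
from hydra.utils import instantiate
from demo_hf import demo_fn

cfg = OmegaConf.load("config/base.yaml")
vggt_model = instantiate(cfg, _recursive_=False)

_VGGT_URL = "https://huggingface.co/facebook/vggt_alpha/resolve/main/vggt_alpha_v0.pt"
pretrain_model = torch.hub.load_state_dict_from_url(_VGGT_URL)
if "vggt_model" in pretrain_model:
    vggt_model.load_state_dict(pretrain_model["vggt_model"], strict=False)
else:
    vggt_model.load_state_dict(pretrain_model, strict=True)

cfg.SCENE_DIR = "input_images_example"  # placeholder directory of extracted frames
with torch.no_grad():
    predictions = demo_fn(cfg, vggt_model)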
gradio_util.py
CHANGED
@@ -1,56 +1,22 @@
-
-import os
-
-import trimesh
-import open3d as o3d
-
-import gradio as gr
-import numpy as np
-import matplotlib
-from scipy.spatial.transform import Rotation
-
-    print("Successfully imported the packages for Gradio visualization")
-except:
-    print(
-        f"Failed to import packages for Gradio visualization. Please disable gradio visualization"
-    )
-
-
-def visualize_by_gradio(glbfile):
-    """
-    Set up and launch a Gradio interface to visualize a GLB file.
-
-    """
-
-    def load_glb_file(glb_path):
-        # Check if the file exists and return the path or error message
-        if os.path.exists(glb_path):
-            return glb_path, "3D Model Loaded Successfully"
-        else:
-            return None, "File not found"
-
-        # 3D Model viewer component
-        model_viewer = gr.Model3D(
-            label="3D Model Viewer", height=600, value=initial_model
-        )
-
-        # Textbox for log output
-        log_output = gr.Textbox(label="Log", lines=2, value=log_message)
-
-        # Launch the Gradio interface
-        demo.launch(share=True)
+import os
+
+
+import trimesh
+# import open3d as o3d
+
+import gradio as gr
+import numpy as np
+import matplotlib
+from scipy.spatial.transform import Rotation
+
+# except:
+#     print(
+#         f"Failed to import packages for Gradio visualization. Please disable gradio visualization"
+#     )
 
 
-def
+def demo_predictions_to_glb(predictions, conf_thres=3.0, filter_by_frames="all", mask_black_bg=False) -> trimesh.Scene:
     """
     Converts VGG SFM predictions to a 3D scene represented as a GLB.
@@ -61,27 +27,51 @@ def vggsfm_predictions_to_glb(predictions) -> trimesh.Scene:
     trimesh.Scene: A 3D scene object.
     """
     # Convert predictions to numpy arrays
-
-    colors_rgb = (predictions["points3D_rgb"].cpu().numpy() * 255).astype(
-        np.uint8
-    )
-
-    if True:
-        pcd = o3d.geometry.PointCloud()
-        pcd.points = o3d.utility.Vector3dVector(vertices_3d)
-        pcd.colors = o3d.utility.Vector3dVector(colors_rgb)
-
-        cl, ind = pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=1.0)
-        filtered_pcd = pcd.select_by_index(ind)
-
-        print(f"Filter out {len(vertices_3d) - len(filtered_pcd.points)} 3D points")
-        vertices_3d = np.asarray(filtered_pcd.points)
-        colors_rgb = np.asarray(filtered_pcd.colors).astype(np.uint8)
-
+    # pred_extrinsic_list', 'pred_world_points', 'pred_world_points_conf', 'images', 'last_pred_extrinsic
+
+    print("Building GLB scene")
+    selected_frame_idx = None
+    if filter_by_frames != "all":
+        try:
+            # Extract the index part before the colon
+            selected_frame_idx = int(filter_by_frames.split(":")[0])
+        except (ValueError, IndexError):
+            # If parsing fails, default to using all frames
+            pass
+
+    pred_world_points = predictions["pred_world_points"][0] # remove batch dimension
+    pred_world_points_conf = predictions["pred_world_points_conf"][0]
+    images = predictions["images"][0]
+    last_pred_extrinsic = predictions["last_pred_extrinsic"][0]
+
+    if selected_frame_idx is not None:
+        pred_world_points = pred_world_points[selected_frame_idx][None]
+        pred_world_points_conf = pred_world_points_conf[selected_frame_idx][None]
+        images = images[selected_frame_idx][None]
+        last_pred_extrinsic = last_pred_extrinsic[selected_frame_idx][None]
+
+    vertices_3d = pred_world_points.reshape(-1, 3)
+    colors_rgb = np.transpose(images, (0, 2, 3, 1)) #images.permute(0, 3, 1, 2)
+    colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)
+    camera_matrices = last_pred_extrinsic
+
+    conf = pred_world_points_conf.reshape(-1)
+    conf_mask = conf > conf_thres
+
+    if mask_black_bg:
+        black_bg_mask = colors_rgb.sum(axis=1) >= 16
+        conf_mask = conf_mask & black_bg_mask
+
+    vertices_3d = vertices_3d[conf_mask]
+    colors_rgb = colors_rgb[conf_mask]
+
+    # vertices_3d = predictions["points3D"].cpu().numpy()
+    # colors_rgb = (predictions["points3D_rgb"].cpu().numpy() * 255).astype(
+    #     np.uint8
+    # )
+    # camera_matrices = predictions["extrinsics_opencv"].cpu().numpy()
 
     # Calculate the 5th and 95th percentiles along each axis
     lower_percentile = np.percentile(vertices_3d, 5, axis=0)
@@ -122,39 +112,10 @@ def vggsfm_predictions_to_glb(predictions) -> trimesh.Scene:
     # Align scene to the observation of the first camera
     scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)
 
+    print("GLB Scene built")
     return scene_3d
 
 
-def apply_scene_alignment(
-    scene_3d: trimesh.Scene, extrinsics_matrices: np.ndarray
-) -> trimesh.Scene:
-    """
-    Aligns the 3D scene based on the extrinsics of the first camera.
-
-    Args:
-        scene_3d (trimesh.Scene): The 3D scene to be aligned.
-        extrinsics_matrices (np.ndarray): Camera extrinsic matrices.
-
-    Returns:
-        trimesh.Scene: Aligned 3D scene.
-    """
-    # Set transformations for scene alignment
-    opengl_conversion_matrix = get_opengl_conversion_matrix()
-
-    # Rotation matrix for alignment (180 degrees around the y-axis)
-    align_rotation = np.eye(4)
-    align_rotation[:3, :3] = Rotation.from_euler(
-        "y", 180, degrees=True
-    ).as_matrix()
-
-    # Apply transformation
-    initial_transformation = (
-        np.linalg.inv(extrinsics_matrices[0])
-        @ opengl_conversion_matrix
-        @ align_rotation
-    )
-    scene_3d.apply_transform(initial_transformation)
-    return scene_3d
 
 
 def integrate_camera_into_scene(
@@ -215,40 +176,57 @@ def integrate_camera_into_scene(
     scene.add_geometry(camera_mesh)
 
 
-    """
-
-    Args:
-
-    Returns:
-    """
-    #
-    num_vertices_cone = len(cone_shape.vertices)
-
-    for
-
-        v1_offset_2, v2_offset_2, v3_offset_2 = face + 2 * num_vertices_cone
-
-    faces_list += [(v3, v2, v1) for v1, v2, v3 in faces_list]
-    return np.array(faces_list)
+def apply_scene_alignment(
+    scene_3d: trimesh.Scene, extrinsics_matrices: np.ndarray
+) -> trimesh.Scene:
+    """
+    Aligns the 3D scene based on the extrinsics of the first camera.
+
+    Args:
+        scene_3d (trimesh.Scene): The 3D scene to be aligned.
+        extrinsics_matrices (np.ndarray): Camera extrinsic matrices.
+
+    Returns:
+        trimesh.Scene: Aligned 3D scene.
+    """
+    # Set transformations for scene alignment
+    opengl_conversion_matrix = get_opengl_conversion_matrix()
+
+    # Rotation matrix for alignment (180 degrees around the y-axis)
+    align_rotation = np.eye(4)
+    align_rotation[:3, :3] = Rotation.from_euler(
+        "y", 180, degrees=True
+    ).as_matrix()
+
+    # Apply transformation
+    initial_transformation = (
+        np.linalg.inv(extrinsics_matrices[0])
+        @ opengl_conversion_matrix
+        @ align_rotation
+    )
+    scene_3d.apply_transform(initial_transformation)
+    return scene_3d
+
+
+def get_opengl_conversion_matrix() -> np.ndarray:
+    """
+    Constructs and returns the OpenGL conversion matrix.
+
+    Returns:
+        numpy.ndarray: A 4x4 OpenGL conversion matrix.
+    """
+    # Create an identity matrix
+    matrix = np.identity(4)
+
+    # Flip the y and z axes
+    matrix[1, 1] = -1
+    matrix[2, 2] = -1
+
+    return matrix
 
 
 def transform_points(
@@ -280,18 +258,38 @@ def transform_points(
     return result
 
 
-    """
-
-    Returns:
-    """
-    # Create
+def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
+    """
+    Computes the faces for the camera mesh.
+
+    Args:
+        cone_shape (trimesh.Trimesh): The shape of the camera cone.
+
+    Returns:
+        np.ndarray: Array of faces for the camera mesh.
+    """
+    # Create pseudo cameras
+    faces_list = []
+    num_vertices_cone = len(cone_shape.vertices)
+
+    for face in cone_shape.faces:
+        if 0 in face:
+            continue
+        v1, v2, v3 = face
+        v1_offset, v2_offset, v3_offset = face + num_vertices_cone
+        v1_offset_2, v2_offset_2, v3_offset_2 = face + 2 * num_vertices_cone
+
+        faces_list.extend(
+            [
+                (v1, v2, v2_offset),
+                (v1, v1_offset, v3),
+                (v3_offset, v2, v3),
+                (v1, v2, v2_offset_2),
+                (v1, v1_offset_2, v3),
+                (v3_offset_2, v2, v3),
+            ]
+        )
+
+    faces_list += [(v3, v2, v1) for v1, v2, v3 in faces_list]
+    return np.array(faces_list)