Spaces:

facebook
/

vggt

Running on Zero

App Files Files Community

vggt / gradio_util.py

JianyuanWang

update

d35072c 3 days ago

raw

history blame

12.6 kB

	# import os

	import trimesh
	# import open3d as o3d

	import gradio as gr
	import numpy as np
	import matplotlib
	from scipy.spatial.transform import Rotation
	import copy
	import cv2
	import os


	def demo_predictions_to_glb(predictions, conf_thres=3.0, filter_by_frames="all", mask_black_bg=False, show_cam=True, mask_sky=False, target_dir=None) -> trimesh.Scene:
	    """
	    Converts VGG SFM predictions to a 3D scene represented as a GLB.

	    Args:
	        predictions (dict): Model predictions. The keys read here are
	            "pred_world_points", "pred_world_points_conf", "images" and
	            "last_pred_extrinsic"; element 0 of each is taken to drop the
	            batch dimension.
	        conf_thres (float, optional): Only points whose confidence is
	            strictly greater than this value are kept. None is treated as 0.0.
	        filter_by_frames (str): "all" to keep every frame, otherwise a string
	            whose part before the first ":" is the integer index of the single
	            frame to keep. Unparseable or out-of-range values fall back to
	            showing all frames.
	        mask_black_bg (bool): If True, drop points whose 8-bit RGB channel sum
	            is below 16.
	        show_cam (bool): If True, add one camera mesh per frame.
	        mask_sky (bool): If True (and target_dir is given), multiply the
	            confidences by a binary sky mask before thresholding.
	        target_dir (str, optional): Directory holding an "images" subfolder;
	            sky masks are cached in its "sky_masks" subfolder.

	    Returns:
	        trimesh.Scene: A 3D scene object.
	    """
	    if conf_thres is None:
	        conf_thres = 0.0

	    print("Building GLB scene")
	    selected_frame_idx = None
	    if filter_by_frames != "all":
	        try:
	            # Extract the index part before the colon
	            selected_frame_idx = int(filter_by_frames.split(":")[0])
	        except (ValueError, IndexError):
	            # Best effort: keep every frame, but say so instead of failing silently.
	            print(f"Could not parse a frame index from {filter_by_frames!r}; showing all frames")

	    pred_world_points = predictions["pred_world_points"][0]  # remove batch dimension
	    pred_world_points_conf = predictions["pred_world_points_conf"][0]
	    images = predictions["images"][0]
	    last_pred_extrinsic = predictions["last_pred_extrinsic"][0]

	    if mask_sky and target_dir is not None:
	        num_frames, height, width = pred_world_points_conf.shape
	        sky_mask_array = _load_sky_masks(target_dir, num_frames, height, width)
	        if sky_mask_array is not None:
	            # Apply sky mask to confidence scores: zero-valued mask pixels get
	            # confidence 0 and are removed by the threshold below.
	            sky_mask_binary = (sky_mask_array > 0.01).astype(np.float32)
	            pred_world_points_conf = pred_world_points_conf * sky_mask_binary

	    if selected_frame_idx is not None:
	        frame_count = len(pred_world_points)
	        if not -frame_count <= selected_frame_idx < frame_count:
	            print(f"Frame index {selected_frame_idx} is out of range for {frame_count} frames; showing all frames")
	            selected_frame_idx = None

	    if selected_frame_idx is not None:
	        # Index then re-add the leading axis so downstream code still sees a stack of frames.
	        pred_world_points = pred_world_points[selected_frame_idx][None]
	        pred_world_points_conf = pred_world_points_conf[selected_frame_idx][None]
	        images = images[selected_frame_idx][None]
	        last_pred_extrinsic = last_pred_extrinsic[selected_frame_idx][None]

	    vertices_3d = pred_world_points.reshape(-1, 3)
	    # Move axis 1 last so each pixel's 3 values are contiguous
	    # (assumes images are channel-first with values in [0, 1] -- TODO confirm).
	    colors_rgb = np.transpose(images, (0, 2, 3, 1))
	    colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)
	    camera_matrices = last_pred_extrinsic

	    conf = pred_world_points_conf.reshape(-1)
	    conf_mask = conf > conf_thres

	    if mask_black_bg:
	        black_bg_mask = colors_rgb.sum(axis=1) >= 16
	        conf_mask = conf_mask & black_bg_mask

	    vertices_3d = vertices_3d[conf_mask]
	    colors_rgb = colors_rgb[conf_mask]

	    if vertices_3d is None or np.asarray(vertices_3d).size == 0:
	        # Nothing survived filtering: emit one placeholder point so the scene is still valid.
	        vertices_3d = np.array([[1, 0, 0]])
	        colors_rgb = np.array([[255, 255, 255]])
	        scene_scale = 1
	    else:
	        # Calculate the 5th and 95th percentiles along each axis
	        lower_percentile = np.percentile(vertices_3d, 5, axis=0)
	        upper_percentile = np.percentile(vertices_3d, 95, axis=0)

	        # Calculate the diagonal length of the percentile bounding box
	        scene_scale = np.linalg.norm(upper_percentile - lower_percentile)

	    colormap = matplotlib.colormaps.get_cmap("gist_rainbow")

	    # Initialize a 3D scene
	    scene_3d = trimesh.Scene()

	    # Add point cloud data to the scene
	    point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
	    scene_3d.add_geometry(point_cloud_data)

	    # Prepare 4x4 matrices for camera extrinsics (3x4 input plus a [0, 0, 0, 1] row)
	    num_cameras = len(camera_matrices)
	    extrinsics_matrices = np.zeros((num_cameras, 4, 4))
	    extrinsics_matrices[:, :3, :4] = camera_matrices
	    extrinsics_matrices[:, 3, 3] = 1

	    if show_cam:
	        # Add camera models to the scene, one colour per camera
	        for i in range(num_cameras):
	            world_to_camera = extrinsics_matrices[i]
	            camera_to_world = np.linalg.inv(world_to_camera)
	            rgba_color = colormap(i / num_cameras)
	            current_color = tuple(int(255 * x) for x in rgba_color[:3])

	            integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)

	    # Align scene to the observation of the first camera
	    scene_3d = apply_scene_alignment(scene_3d, extrinsics_matrices)

	    print("GLB Scene built")
	    return scene_3d


	def _load_sky_masks(target_dir, num_frames, height, width):
	    """
	    Loads (or generates and caches) one sky mask per predicted frame.

	    Masks are read from "<target_dir>/sky_masks/<image name>" when present and
	    readable; otherwise they are produced by segment_sky from
	    "<target_dir>/images/<image name>".

	    Args:
	        target_dir (str): Directory containing the "images" subfolder.
	        num_frames (int): Number of frames in the prediction.
	        height (int): Target mask height.
	        width (int): Target mask width.

	    Returns:
	        np.ndarray or None: Stack of num_frames masks of size height x width,
	        or None when the folder holds fewer images than frames.
	    """
	    images_dir = os.path.join(target_dir, "images")
	    image_names = sorted(os.listdir(images_dir))

	    if len(image_names) < num_frames:
	        print(f"Found {len(image_names)} images for {num_frames} frames in {images_dir}; skipping sky masking")
	        return None

	    # NOTE(review): the original author left notes here reading "model was never
	    # trained on single image or oil painting", "no overlap", "single view, youhua",
	    # "single view, catoon" -- presumably known hard cases; confirm.

	    skyseg_session = None
	    sky_masks = []
	    # Only the first num_frames files are used so the mask stack matches the
	    # confidence array (assumes sorted file order equals frame order -- TODO confirm).
	    for image_name in image_names[:num_frames]:
	        image_filepath = os.path.join(images_dir, image_name)
	        mask_filepath = os.path.join(target_dir, "sky_masks", image_name)

	        sky_mask = None
	        if os.path.exists(mask_filepath):
	            # Load existing mask; imread returns None if the file cannot be decoded.
	            sky_mask = cv2.imread(mask_filepath, cv2.IMREAD_GRAYSCALE)

	        if sky_mask is None:
	            # Generate new mask; the model is loaded lazily so fully cached runs
	            # need neither onnxruntime nor the model file.
	            if skyseg_session is None:
	                import onnxruntime

	                skyseg_session = onnxruntime.InferenceSession("skyseg.onnx")
	            sky_mask = segment_sky(image_filepath, skyseg_session, mask_filepath)

	        # Resize mask to match height x width if needed (cv2 takes (width, height))
	        if sky_mask.shape[0] != height or sky_mask.shape[1] != width:
	            sky_mask = cv2.resize(sky_mask, (width, height))

	        sky_masks.append(sky_mask)

	    return np.array(sky_masks)




	def integrate_camera_into_scene(
	    scene: trimesh.Scene,
	    transform: np.ndarray,
	    face_colors: tuple,
	    scene_scale: float,
	):
	    """
	    Adds a stand-in camera mesh (a 4-section cone) to the scene.

	    Args:
	        scene (trimesh.Scene): Scene that receives the camera mesh.
	        transform (np.ndarray): 4x4 matrix placing the camera in the scene.
	        face_colors (tuple): RGB values written to every face of the mesh.
	        scene_scale (float): Scene size; the cone is sized relative to it.
	    """
	    cone_radius = 0.05 * scene_scale
	    cone_height = 0.1 * scene_scale

	    # Local pose of the cone: 45 degrees about z, shifted by -height along z.
	    cone_pose = np.eye(4)
	    cone_pose[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix()
	    cone_pose[2, 3] = -cone_height

	    # Full chain applied to the cone vertices.
	    cone_to_scene = transform @ get_opengl_conversion_matrix() @ cone_pose
	    cone = trimesh.creation.cone(cone_radius, cone_height, sections=4)

	    # A 2 degree twist about z produces the third vertex copy.
	    twist = np.eye(4)
	    twist[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix()

	    # Three stacked copies of the cone vertices: original, shrunk to 95 %, twisted.
	    # compute_camera_faces indexes into this stack using multiples of the vertex count.
	    cone_vertices = cone.vertices
	    stacked_vertices = np.concatenate(
	        [
	            cone_vertices,
	            0.95 * cone_vertices,
	            transform_points(twist, cone_vertices),
	        ]
	    )
	    placed_vertices = transform_points(cone_to_scene, stacked_vertices)

	    # Build the mesh, colour it, and hand it to the scene.
	    frustum = trimesh.Trimesh(
	        vertices=placed_vertices, faces=compute_camera_faces(cone)
	    )
	    frustum.visual.face_colors[:, :3] = face_colors
	    scene.add_geometry(frustum)



	def apply_scene_alignment(
	    scene_3d: trimesh.Scene, extrinsics_matrices: np.ndarray
	) -> trimesh.Scene:
	    """
	    Re-orients the whole scene using the first camera's extrinsic matrix.

	    Args:
	        scene_3d (trimesh.Scene): Scene to transform (modified in place).
	        extrinsics_matrices (np.ndarray): Stack of 4x4 camera extrinsics; only
	            the first entry is used.

	    Returns:
	        trimesh.Scene: The same scene object after the transform was applied.
	    """
	    # Half turn about the y-axis.
	    half_turn_y = np.eye(4)
	    half_turn_y[:3, :3] = Rotation.from_euler("y", 180, degrees=True).as_matrix()

	    # inverse(first extrinsic) @ OpenGL axis flip @ half turn
	    first_extrinsic_inverse = np.linalg.inv(extrinsics_matrices[0])
	    alignment = first_extrinsic_inverse @ get_opengl_conversion_matrix() @ half_turn_y

	    scene_3d.apply_transform(alignment)
	    return scene_3d




	def get_opengl_conversion_matrix() -> np.ndarray:
	    """
	    Builds the 4x4 matrix used for the OpenGL axis conversion.

	    Returns:
	        numpy.ndarray: A new 4x4 diagonal matrix equal to the identity with
	        the y and z diagonal entries negated.
	    """
	    # Diagonal (1, -1, -1, 1): x kept, y and z flipped, homogeneous entry kept.
	    return np.diag([1.0, -1.0, -1.0, 1.0])



	def transform_points(
	    transformation: np.ndarray, points: np.ndarray, dim: int = None
	) -> np.ndarray:
	    """
	    Transforms points with a homogeneous matrix (e.g. 4x4 for 3D points).

	    Args:
	        transformation (np.ndarray): Transformation matrix; its last column is
	            added as the translation.
	        points (np.ndarray): Points, with coordinates along the last axis.
	        dim (int, optional): Number of leading output coordinates to keep.
	            Defaults to the size of the last axis of points.

	    Returns:
	        np.ndarray: Transformed points with the leading shape of the input and
	        dim coordinates per point.
	    """
	    pts = np.asarray(points)
	    leading_shape = pts.shape[:-1]
	    out_dim = dim or pts.shape[-1]

	    # Work with the transposed matrix so points can stay as row vectors:
	    # all rows but the last form the linear part, the last row is the offset.
	    transposed = transformation.swapaxes(-1, -2)
	    linear_rows = transposed[..., :-1, :]
	    offset_row = transposed[..., -1:, :]
	    moved = pts @ linear_rows + offset_row

	    # Drop the extra homogeneous coordinate(s) and restore the input layout.
	    return moved[..., :out_dim].reshape(*leading_shape, out_dim)



	def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
	    """
	    Builds the triangle index list for the camera mesh.

	    The indices address three stacked copies of the cone's vertices (offsets
	    0, N and 2N, where N is the cone's vertex count).

	    Args:
	        cone_shape (trimesh.Trimesh): Cone whose faces and vertex count are used.

	    Returns:
	        np.ndarray: Face index triples; every generated face is followed, in
	        the second half of the array, by its reversed-winding twin.
	    """
	    vertex_count = len(cone_shape.vertices)
	    front_faces = []

	    for triangle in cone_shape.faces:
	        # Faces touching vertex 0 are left out.
	        if 0 in triangle:
	            continue
	        a, b, c = triangle
	        # Same three bridging triangles towards each of the two extra vertex copies.
	        for shift in (vertex_count, 2 * vertex_count):
	            a_far, b_far, c_far = triangle + shift
	            front_faces += [
	                (a, b, b_far),
	                (a, a_far, c),
	                (c_far, b, c),
	            ]

	    # Append every face again with reversed vertex order.
	    back_faces = [(k, j, i) for i, j, k in front_faces]
	    return np.array(front_faces + back_faces)




	def segment_sky(image_path, onnx_session, mask_filename=None):
	    """
	    Runs sky segmentation on one image, saves the binary mask and returns it.

	    Args:
	        image_path (str): Path of the image, read with cv2.imread.
	        onnx_session: Inference session forwarded to run_skyseg.
	        mask_filename (str): Path the mask is written to. Required even though
	            the signature gives it a default of None.

	    Returns:
	        np.ndarray: uint8 mask with the image's height and width; 255 where the
	        resized model output is below 1, 0 elsewhere.

	    Raises:
	        ValueError: If mask_filename is None or the image cannot be read.
	    """
	    # Explicit check instead of assert, which is stripped under "python -O".
	    if mask_filename is None:
	        raise ValueError("mask_filename must be provided")

	    image = cv2.imread(image_path)
	    if image is None:
	        # cv2.imread signals a missing or undecodable file by returning None.
	        raise ValueError(f"Could not read image: {image_path}")

	    result_map = run_skyseg(onnx_session, [320, 320], image)
	    # resize the result_map to the original image size (cv2 takes (width, height))
	    result_map_original = cv2.resize(result_map, (image.shape[1], image.shape[0]))

	    # NOTE(review): only pixels whose rescaled score is exactly 0 are kept (255);
	    # confirm that this strict "< 1" threshold is intended.
	    output_mask = np.zeros_like(result_map_original)
	    output_mask[result_map_original < 1] = 1
	    output_mask = output_mask.astype(np.uint8) * 255

	    # os.makedirs("") raises, so only create a directory when the path has one.
	    mask_dir = os.path.dirname(mask_filename)
	    if mask_dir:
	        os.makedirs(mask_dir, exist_ok=True)
	    cv2.imwrite(mask_filename, output_mask)
	    return output_mask





	def run_skyseg(onnx_session, input_size, image):
	    """
	    Runs the sky segmentation ONNX model on a single image.

	    Args:
	        onnx_session: Session object providing get_inputs(), get_outputs() and
	            run() (e.g. an onnxruntime.InferenceSession).
	        input_size: Two-element sequence handed to cv2.resize as dsize, i.e.
	            (width, height) of the network input.
	        image (np.ndarray): Image in BGR channel order; it is not modified.

	    Returns:
	        np.ndarray: uint8 array holding the squeezed model output rescaled
	        (min-max) to 0..255. A constant model output yields all zeros.
	    """
	    width, height = input_size[0], input_size[1]

	    # Pre process: Resize, BGR->RGB, PyTorch standardization, Transpose, float32 cast.
	    # cv2.resize returns a new array, so no defensive copy of the input is needed.
	    resized = cv2.resize(image, dsize=(width, height))
	    x = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
	    x = np.array(x, dtype=np.float32)
	    mean = [0.485, 0.456, 0.406]
	    std = [0.229, 0.224, 0.225]
	    x = (x / 255 - mean) / std
	    # HWC -> 1xCxHxW. Adding the batch axis (instead of reshaping to
	    # (-1, 3, input_size[0], input_size[1])) keeps pixels in place when
	    # width != height; for square inputs the result is identical.
	    x = np.ascontiguousarray(x.transpose(2, 0, 1)[np.newaxis], dtype=np.float32)

	    # Inference
	    input_name = onnx_session.get_inputs()[0].name
	    output_name = onnx_session.get_outputs()[0].name
	    onnx_result = onnx_session.run([output_name], {input_name: x})

	    # Post process: min-max rescale to 0..255
	    onnx_result = np.array(onnx_result).squeeze()
	    min_value = np.min(onnx_result)
	    max_value = np.max(onnx_result)
	    value_range = max_value - min_value
	    if value_range > 0:
	        scaled = (onnx_result - min_value) / value_range * 255
	    else:
	        # Constant output: avoid dividing by zero and return an all-zero map.
	        scaled = np.zeros_like(onnx_result)

	    return scaled.astype('uint8')