import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
import supervision as sv
from transformers import (
    RTDetrForObjectDetection,
    RTDetrImageProcessor,
    VitPoseConfig,
    VitPoseForPoseEstimation,
    VitPoseImageProcessor,
)
# COCO-style 17-keypoint labels, indexed by position in the VitPose output.
KEYPOINT_LABEL_MAP = {
    0: "Nose",
    1: "L_Eye",
    2: "R_Eye",
    3: "L_Ear",
    4: "R_Ear",
    5: "L_Shoulder",
    6: "R_Shoulder",
    7: "L_Elbow",
    8: "R_Elbow",
    9: "L_Wrist",
    10: "R_Wrist",
    11: "L_Hip",
    12: "R_Hip",
    13: "L_Knee",
    14: "R_Knee",
    15: "L_Ankle",
    16: "R_Ankle",
}
class KeypointDetector:
    def __init__(self):
        self.person_detector = None
        self.person_processor = None
        self.pose_model = None
        self.pose_processor = None
        self.load_models()

    def load_models(self):
        """Load all required models."""
        # Object detection model (person detector)
        self.person_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
        self.person_detector = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")

        # Pose estimation model
        self.pose_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple")
        self.pose_model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple")
    def pascal_voc_to_coco(self, bboxes: np.ndarray) -> np.ndarray:
        """Convert boxes from Pascal VOC (x1, y1, x2, y2) to COCO (x, y, w, h) format."""
        bboxes = bboxes.copy()  # Create a copy to avoid modifying the input
        bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
        bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
        return bboxes

    def coco_to_xyxy(self, bboxes: np.ndarray) -> np.ndarray:
        """Convert boxes from COCO (x, y, w, h) to xyxy (x1, y1, x2, y2) format."""
        bboxes = bboxes.copy()
        bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
        bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
        return bboxes
    def detect_persons(self, image: Image.Image):
        """Detect persons in the image."""
        inputs = self.person_processor(images=image, return_tensors="pt")
        with torch.no_grad():
            outputs = self.person_detector(**inputs)
        results = self.person_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([(image.height, image.width)]),
            threshold=0.3
        )
        dets = sv.Detections.from_transformers(results[0]).with_nms(0.5)

        # Keep boxes and scores for the person class (label 0 in this checkpoint's label map)
        boxes = dets.xyxy[dets.class_id == 0]
        scores = dets.confidence[dets.class_id == 0]
        return boxes, scores
    def detect_keypoints(self, image: Image.Image):
        """Detect keypoints in the image."""
        # Detect persons first
        boxes, scores = self.detect_persons(image)
        boxes_coco = [self.pascal_voc_to_coco(boxes)]

        # Detect pose keypoints
        pixel_values = self.pose_processor(image, boxes=boxes_coco, return_tensors="pt").pixel_values
        with torch.no_grad():
            outputs = self.pose_model(pixel_values)
        pose_results = self.pose_processor.post_process_pose_estimation(outputs, boxes=boxes_coco)[0]
        return pose_results, boxes, scores
    def visualize_detections(self, image: Image.Image, pose_results, boxes, scores):
        """Visualize both bounding boxes and keypoints on the image."""
        # Convert image to numpy array if needed
        image_array = np.array(image)

        # Setup detections for bounding boxes
        detections = sv.Detections(
            xyxy=boxes,
            confidence=scores,
            class_id=np.array([0] * len(scores))
        )

        # Create box annotator
        box_annotator = sv.BoxAnnotator(
            color=sv.ColorPalette.DEFAULT,
            thickness=2
        )

        # Create edge annotator for keypoints
        edge_annotator = sv.EdgeAnnotator(
            color=sv.Color.GREEN,
            thickness=3
        )

        # Convert keypoints to supervision format
        key_points = sv.KeyPoints(
            xy=torch.cat([pose_result["keypoints"].unsqueeze(0) for pose_result in pose_results]).cpu().numpy()
        )

        # Annotate image with boxes first
        annotated_frame = box_annotator.annotate(
            scene=image_array.copy(),
            detections=detections
        )

        # Then add keypoints
        annotated_frame = edge_annotator.annotate(
            scene=annotated_frame,
            key_points=key_points
        )
        return Image.fromarray(annotated_frame)
    def process_image(self, input_image):
        """Process an image and return the visualization plus a text summary."""
        if input_image is None:
            return None, ""

        # Convert to PIL Image if necessary
        if isinstance(input_image, np.ndarray):
            image = Image.fromarray(input_image)
        else:
            image = input_image

        # Detect keypoints and boxes
        pose_results, boxes, scores = self.detect_keypoints(image)

        # Visualize results
        result_image = self.visualize_detections(image, pose_results, boxes, scores)

        # Create detection information text
        info_text = []
        for i, (box, score) in enumerate(zip(boxes, scores)):
            # Box information
            info_text.append(f"\nPerson {i + 1} (confidence: {score:.2f})")
            info_text.append(f"Bounding Box: x1={box[0]:.1f}, y1={box[1]:.1f}, x2={box[2]:.1f}, y2={box[3]:.1f}")

            # Add keypoint information for this person; the post-processed results keep
            # coordinates and confidences in separate "keypoints" and "scores" arrays
            pose_result = pose_results[i]
            for j, (keypoint, confidence) in enumerate(zip(pose_result["keypoints"], pose_result["scores"])):
                x, y = keypoint
                info_text.append(f"Keypoint {KEYPOINT_LABEL_MAP[j]}: x={x:.1f}, y={y:.1f}, confidence={confidence:.2f}")

        return result_image, "\n".join(info_text)
def create_gradio_interface():
    """Create the Gradio interface."""
    detector = KeypointDetector()

    with gr.Blocks() as interface:
        gr.Markdown("# Human Detection and Keypoint Estimation using VitPose")
        gr.Markdown("Upload an image to detect people and their keypoints. The model will:")
        gr.Markdown("1. Detect people in the image (shown as bounding boxes)")
        gr.Markdown("2. Identify keypoints for each detected person (shown as connected green lines)")
        gr.Markdown("Huge shoutout to @NielsRogge and @SangbumChoi for this work!")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input Image")
                process_button = gr.Button("Detect People & Keypoints")
            with gr.Column():
                output_image = gr.Image(label="Detection Results")
                detection_info = gr.Textbox(
                    label="Detection Information",
                    lines=10,
                    placeholder="Detection details will appear here..."
                )

        process_button.click(
            fn=detector.process_image,
            inputs=input_image,
            outputs=[output_image, detection_info]
        )

        gr.Examples(
            examples=[
                "http://images.cocodataset.org/val2017/000000000139.jpg"
            ],
            inputs=input_image
        )

    return interface
if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()