import gradio as gr
import torch
import numpy as np
import supervision as sv
from PIL import Image
from transformers import (
    RTDetrForObjectDetection,
    RTDetrImageProcessor,
    VitPoseForPoseEstimation,
    VitPoseImageProcessor,
)

KEYPOINT_LABEL_MAP = {
    0: "Nose",
    1: "L_Eye",
    2: "R_Eye",
    3: "L_Ear",
    4: "R_Ear",
    5: "L_Shoulder",
    6: "R_Shoulder",
    7: "L_Elbow",
    8: "R_Elbow",
    9: "L_Wrist",
    10: "R_Wrist",
    11: "L_Hip",
    12: "R_Hip",
    13: "L_Knee",
    14: "R_Knee",
    15: "L_Ankle",
    16: "R_Ankle",
}


class KeypointDetector:
    def __init__(self):
        self.person_detector = None
        self.person_processor = None
        self.pose_model = None
        self.pose_processor = None
        self.load_models()

    def load_models(self):
        """Load all required models"""
        # Object detection model
        self.person_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
        self.person_detector = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")

        # Pose estimation model
        self.pose_processor = VitPoseImageProcessor.from_pretrained("nielsr/vitpose-base-simple")
        self.pose_model = VitPoseForPoseEstimation.from_pretrained("nielsr/vitpose-base-simple")

    @staticmethod
    def pascal_voc_to_coco(bboxes: np.ndarray) -> np.ndarray:
        """Convert Pascal VOC format (x1,y1,x2,y2) to COCO format (x,y,w,h)"""
        bboxes = bboxes.copy()  # Create a copy to avoid modifying the input
        bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]
        bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]
        return bboxes

    @staticmethod
    def coco_to_xyxy(bboxes: np.ndarray) -> np.ndarray:
        """Convert COCO format (x,y,w,h) to xyxy format (x1,y1,x2,y2)"""
        bboxes = bboxes.copy()
        bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2]
        bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3]
        return bboxes

    def detect_persons(self, image: Image.Image):
        """Detect persons in the image"""
        inputs = self.person_processor(images=image, return_tensors="pt")

        with torch.no_grad():
            outputs = self.person_detector(**inputs)

        results = self.person_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([(image.height, image.width)]),
            threshold=0.3,
        )
        dets = sv.Detections.from_transformers(results[0]).with_nms(0.5)

        # Get boxes and scores for human class (index 0 in COCO dataset)
        boxes = dets.xyxy[dets.class_id == 0]
        scores = dets.confidence[dets.class_id == 0]
        return boxes, scores

    def detect_keypoints(self, image: Image.Image):
        """Detect keypoints in the image"""
        # Detect persons first
        boxes, scores = self.detect_persons(image)
        boxes_coco = [self.pascal_voc_to_coco(boxes)]

        # Detect pose keypoints
        pixel_values = self.pose_processor(image, boxes=boxes_coco, return_tensors="pt").pixel_values
        with torch.no_grad():
            outputs = self.pose_model(pixel_values)

        pose_results = self.pose_processor.post_process_pose_estimation(outputs, boxes=boxes_coco)[0]
        return pose_results, boxes, scores

    def visualize_detections(self, image: Image.Image, pose_results, boxes, scores):
        """Visualize both bounding boxes and keypoints on the image"""
        # Convert image to numpy array if needed
        image_array = np.array(image)

        # Setup detections for bounding boxes
        detections = sv.Detections(
            xyxy=boxes,
            confidence=scores,
            class_id=np.array([0] * len(scores)),
        )

        # Create box annotator
        box_annotator = sv.BoxAnnotator(
            color=sv.ColorPalette.DEFAULT,
            thickness=2,
        )

        # Create edge annotator for keypoints
        edge_annotator = sv.EdgeAnnotator(
            color=sv.Color.GREEN,
            thickness=3,
        )

        # Convert keypoints to supervision format
        key_points = sv.KeyPoints(
            xy=torch.cat(
                [pose_result["keypoints"].unsqueeze(0) for pose_result in pose_results]
            ).cpu().numpy()
        )

        # Annotate image with boxes first
        annotated_frame = box_annotator.annotate(
            scene=image_array.copy(),
            detections=detections,
        )

        # Then add keypoints
        annotated_frame = edge_annotator.annotate(
            scene=annotated_frame,
            key_points=key_points,
        )

        return Image.fromarray(annotated_frame)

    def process_image(self, input_image):
        """Process image and return visualization"""
        if input_image is None:
            return None, ""

        # Convert to PIL Image if necessary
        if isinstance(input_image, np.ndarray):
            image = Image.fromarray(input_image)
        else:
            image = input_image

        # Detect keypoints and boxes
        pose_results, boxes, scores = self.detect_keypoints(image)

        # Visualize results
        result_image = self.visualize_detections(image, pose_results, boxes, scores)

        # Create detection information text
        info_text = []

        # Box information
        for i, (box, score) in enumerate(zip(boxes, scores)):
            info_text.append(f"\nPerson {i + 1} (confidence: {score:.2f})")
            info_text.append(
                f"Bounding Box: x1={box[0]:.1f}, y1={box[1]:.1f}, x2={box[2]:.1f}, y2={box[3]:.1f}"
            )

            # Add keypoint information for this person
            # (the processor returns keypoint coordinates and scores as separate arrays)
            pose_result = pose_results[i]
            for j, (keypoint, confidence) in enumerate(
                zip(pose_result["keypoints"], pose_result["scores"])
            ):
                x, y = keypoint
                info_text.append(
                    f"Keypoint {KEYPOINT_LABEL_MAP[j]}: x={x:.1f}, y={y:.1f}, confidence={confidence:.2f}"
                )

        return result_image, "\n".join(info_text)


def create_gradio_interface():
    """Create Gradio interface"""
    detector = KeypointDetector()

    with gr.Blocks() as interface:
        gr.Markdown("# Human Detection and Keypoint Estimation using VitPose")
        gr.Markdown("Upload an image to detect people and their keypoints. The model will:")
        gr.Markdown("1. Detect people in the image (shown as bounding boxes)")
        gr.Markdown("2. Identify keypoints for each detected person (shown as connected green lines)")
        gr.Markdown("Huge shoutout to @NielsRogge and @SangbumChoi for this work!")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(label="Input Image")
                process_button = gr.Button("Detect People & Keypoints")
            with gr.Column():
                output_image = gr.Image(label="Detection Results")
                detection_info = gr.Textbox(
                    label="Detection Information",
                    lines=10,
                    placeholder="Detection details will appear here...",
                )

        process_button.click(
            fn=detector.process_image,
            inputs=input_image,
            outputs=[output_image, detection_info],
        )

        gr.Examples(
            examples=[
                "http://images.cocodataset.org/val2017/000000000139.jpg"
            ],
            inputs=input_image,
        )

    return interface


if __name__ == "__main__":
    interface = create_gradio_interface()
    interface.launch()