""" Gradio app for pollen-vision This script creates a Gradio app for pollen-vision. The app allows users to perform object detection and object segmentation using the OWL-ViT and MobileSAM models. """ from datasets import load_dataset import gradio as gr import numpy as np import numpy.typing as npt from typing import Any, Dict, List from pollen_vision.vision_models.object_detection import OwlVitWrapper from pollen_vision.vision_models.object_segmentation import MobileSamWrapper from pollen_vision.vision_models.utils import Annotator, get_bboxes owl_vit = OwlVitWrapper() mobile_sam = MobileSamWrapper() annotator = Annotator() def object_detection( img: npt.NDArray[np.uint8], text_queries: List[str], score_threshold: float ) -> List[Dict[str, Any]]: predictions: List[Dict[str, Any]] = owl_vit.infer( im=img, candidate_labels=text_queries, detection_threshold=score_threshold ) return predictions def object_segmentation( img: npt.NDArray[np.uint8], object_detection_predictions: List[Dict[str, Any]] ) -> List[npt.NDArray[np.uint8]]: bboxes = get_bboxes(predictions=object_detection_predictions) masks: List[npt.NDArray[np.uint8]] = mobile_sam.infer(im=img, bboxes=bboxes) return masks def query( task: str, img: npt.NDArray[np.uint8], text_queries: List[str], score_threshold: float, ) -> npt.NDArray[np.uint8]: object_detection_predictions = object_detection( img=img, text_queries=text_queries, score_threshold=score_threshold ) if task == "Object detection + segmentation (OWL-ViT + MobileSAM)": masks = object_segmentation( img=img, object_detection_predictions=object_detection_predictions ) img = annotator.annotate( im=img, detection_predictions=object_detection_predictions, masks=masks ) return img img = annotator.annotate(im=img, detection_predictions=object_detection_predictions) return img description = """ Welcome to the demo of pollen-vision, a simple and unified Python library to zero-shot computer vision models curated for robotics use cases. **Pollen-vision** is designed for ease of installation and use, composed of independent modules that can be combined to create a 3D object detection pipeline, getting the position of the objects in 3D space (x, y, z). \n\nIn this demo, you have the option to choose between two tasks: object detection and object detection + segmentation. The models available are: - **OWL-VIT** (Open World Localization - Vision Transformer, By Google Research): this model performs text-conditionned zero-shot 2D object localization in RGB images. - **Mobile SAM**: A lightweight version of the Segment Anything Model (SAM) by Meta AI. SAM is a zero shot image segmentation model. It can be prompted with bounding boxes or points. (https://github.com/ChaoningZhang/MobileSAM) \n\nYou can input images in this demo in three ways: either by trying out the provided examples, by uploading an image of your choice, or by capturing an image from your computer's webcam. Additionally, you should provide text queries representing a list of objects to detect. Separate each object with a comma. The last input parameter is the detection threshold (ranging from 0 to 1), which defaults to 0.1. \n\nCheck out our blog post introducing pollen-vision or its Github repository for more info! """ demo_inputs = [ gr.Dropdown( [ "Object detection (OWL-ViT)", "Object detection + segmentation (OWL-ViT + MobileSAM)", ], label="Choose a task", value="Object detection (OWL-ViT)", ), gr.Image(), "text", gr.Slider(0, 1, value=0.1), ] rdt_dataset = load_dataset("pollen-robotics/reachy-doing-things", split="train") img_kitchen_detection = rdt_dataset[11]["image"] img_kitchen_segmentation = rdt_dataset[12]["image"] demo_examples = [ [ "Object detection (OWL-ViT)", img_kitchen_detection, ["kettle", "black mug", "sink", "blue mug", "sponge", "bag of chips"], 0.15, ], [ "Object detection + segmentation (OWL-ViT + MobileSAM)", img_kitchen_segmentation, ["blue mug", "paper cup", "kettle", "sponge"], 0.12, ], ] demo = gr.Interface( fn=query, inputs=demo_inputs, outputs="image", title="Use zero-shot computer vision models with pollen-vision", description=description, examples=demo_examples, ) demo.launch()