import os
from typing import List

import cv2
import torch
import gradio as gr
import numpy as np
import supervision as sv
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator

from utils import postprocess_masks, Visualizer

HOME = os.getenv("HOME")
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
MINIMUM_AREA_THRESHOLD = 0.01

SAM_CHECKPOINT = os.path.join(HOME, "app/weights/sam_vit_h_4b8939.pth")
# SAM_CHECKPOINT = "weights/sam_vit_h_4b8939.pth"
SAM_MODEL_TYPE = "vit_h"

MARKDOWN = """

# Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V

## 🚀 How To

- Upload an image.
- Click the `Run` button to generate the image with marks.
- Pass your OpenAI API 🔑. You can get one [here](https://platform.openai.com/api-keys).
- Ask GPT-4V questions about the image in the chatbot.

## 🚧 Roadmap

- [ ] Support for alphabetic labels
- [ ] Support for Semantic-SAM (multi-level)
- [ ] Support for interactive mode
- [ ] Support for result highlighting
"""

SAM = sam_model_registry[SAM_MODEL_TYPE](checkpoint=SAM_CHECKPOINT).to(device=DEVICE)


def inference(
    image: np.ndarray,
    annotation_mode: List[str],
    mask_alpha: float
) -> np.ndarray:
    visualizer = Visualizer(mask_opacity=mask_alpha)
    # Segment the whole image with SAM and convert the masks to detections.
    mask_generator = SamAutomaticMaskGenerator(SAM)
    result = mask_generator.generate(image=image)
    detections = sv.Detections.from_sam(result)
    # Drop masks that are too small to serve as useful marks.
    detections = postprocess_masks(
        detections=detections,
        area_threshold=MINIMUM_AREA_THRESHOLD)
    # The visualizer works in BGR; Gradio passes and expects RGB.
    bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    annotated_image = visualizer.visualize(
        image=bgr_image,
        detections=detections,
        with_box="Box" in annotation_mode,
        with_mask="Mask" in annotation_mode,
        with_polygon="Polygon" in annotation_mode,
        with_label="Mark" in annotation_mode)
    return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)


def prompt(message, history):
    # Placeholder chat handler; GPT-4V integration is not wired in yet.
    return "response"


image_input = gr.Image(
    label="Input",
    type="numpy",
    height=512)
checkbox_annotation_mode = gr.CheckboxGroup(
    choices=["Mark", "Polygon", "Mask", "Box"],
    value=['Mark'],
    label="Annotation Mode")
slider_mask_alpha = gr.Slider(
    minimum=0,
    maximum=1,
    value=0.05,
    label="Mask Alpha")
image_output = gr.Image(
    label="SoM Visual Prompt",
    type="numpy",
    height=512)
textbox_api_key = gr.Textbox(
    label="OpenAI API KEY",
    type="password")
chatbot = gr.Chatbot(
    label="GPT-4V + SoM",
    height=256)
run_button = gr.Button("Run")

with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            image_input.render()
            with gr.Accordion(
                    label="Detailed prompt settings (e.g., mark type)",
                    open=False):
                with gr.Row():
                    checkbox_annotation_mode.render()
                with gr.Row():
                    slider_mask_alpha.render()
        with gr.Column():
            image_output.render()
            run_button.render()
            textbox_api_key.render()
    with gr.Row():
        gr.ChatInterface(chatbot=chatbot, fn=prompt)

    run_button.click(
        fn=inference,
        inputs=[image_input, checkbox_annotation_mode, slider_mask_alpha],
        outputs=image_output)

demo.queue().launch(debug=False, show_error=True)
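

# ----------------------------------------------------------------------------
# NOTE: `prompt` above is only a stub that always returns "response". Below is
# a minimal, commented-out sketch of how the chatbot could forward a question
# plus the marked image to GPT-4V. It is an assumption, not part of this app:
# the helper name `prompt_gpt4v`, the `openai>=1.0` client, and the
# `gpt-4-vision-preview` model name are illustrative only, and the sketch is
# not wired into the UI above.
#
#   import base64
#   from openai import OpenAI
#
#   def prompt_gpt4v(api_key: str, annotated_rgb: np.ndarray, message: str) -> str:
#       # Encode the annotated image as a base64 JPEG data URL.
#       bgr = cv2.cvtColor(annotated_rgb, cv2.COLOR_RGB2BGR)
#       _, buffer = cv2.imencode(".jpg", bgr)
#       data_url = "data:image/jpeg;base64," + base64.b64encode(buffer.tobytes()).decode("utf-8")
#       # Send the question together with the marked image to GPT-4V.
#       client = OpenAI(api_key=api_key)
#       response = client.chat.completions.create(
#           model="gpt-4-vision-preview",
#           messages=[{
#               "role": "user",
#               "content": [
#                   {"type": "text", "text": message},
#                   {"type": "image_url", "image_url": {"url": data_url}},
#               ],
#           }],
#           max_tokens=256)
#       return response.choices[0].message.content
# ----------------------------------------------------------------------------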