File size: 3,546 Bytes
0c52132
242f627
03b9405
f6e3ce8
03b9405
 
 
f6e3ce8
0c52132
03b9405
1a1d05a
0c52132
 
03b9405
1a1d05a
f6e3ce8
e0f6bc4
 
03b9405
f6e3ce8
03b9405
 
 
 
 
 
 
 
203e0e8
e0f6bc4
 
 
 
 
 
 
203e0e8
 
 
 
 
e0f6bc4
03b9405
f6e3ce8
1a1d05a
f6e3ce8
 
203e0e8
 
 
 
 
 
1a1d05a
 
 
 
 
 
 
203e0e8
1a1d05a
 
 
 
 
 
 
f6e3ce8
 
e0f6bc4
 
 
 
0c52132
 
1a1d05a
 
0c52132
1a1d05a
0c52132
 
203e0e8
 
 
 
 
0c52132
 
 
 
e0f6bc4
 
 
 
 
 
03b9405
f6e3ce8
03b9405
 
f6e3ce8
 
03b9405
0c52132
203e0e8
 
 
 
f6e3ce8
03b9405
 
e0f6bc4
 
 
f6e3ce8
0c52132
 
203e0e8
0c52132
f6e3ce8
03b9405
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import cv2
import torch

import gradio as gr
import numpy as np
import supervision as sv

from typing import List
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
from utils import postprocess_masks, Visualizer

# Home directory used to locate the model weights. expanduser("~") yields the
# value of $HOME when it is set and falls back to a platform lookup otherwise,
# so the path join below never receives None.
HOME = os.path.expanduser("~")
# Run on the first CUDA device when one is available, otherwise on the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Area threshold handed to postprocess_masks when filtering detections.
MINIMUM_AREA_THRESHOLD = 0.01

# SAM checkpoint location under the home directory.
# NOTE: for a checkout that keeps the weights next to the code, the relative
# path "weights/sam_vit_h_4b8939.pth" can be used instead.
SAM_CHECKPOINT = os.path.join(HOME, "app/weights/sam_vit_h_4b8939.pth")
# Key used to pick the model builder out of sam_model_registry.
SAM_MODEL_TYPE = "vit_h"

# Page header shown at the top of the demo through gr.Markdown: project
# title with logo, the usage steps, and the roadmap checklist.
MARKDOWN = """
<h1 style='text-align: center'>
    <img 
        src='https://som-gpt4v.github.io/website/img/som_logo.png' 
        style='height:50px; display:inline-block'
    />  
    Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V
</h1>

## 🚀 How To

- Upload an image.
- Click the `Run` button to generate the image with marks.
- Pass OpenAI API 🔑. You can get one [here](https://platform.openai.com/api-keys).
- Ask GPT-4V questions about the image in the chatbot.

## 🚧 Roadmap

- [ ] Support for alphabetic labels
- [ ] Support for Semantic-SAM (multi-level)
- [ ] Support for interactive mode
- [ ] Support for result highlighting
"""

# Build the SAM variant selected by SAM_MODEL_TYPE from the checkpoint file
# and move it to DEVICE. This runs once at import time; inference() reuses
# this single module-level instance on every call.
SAM = (
    sam_model_registry[SAM_MODEL_TYPE](checkpoint=SAM_CHECKPOINT)
    .to(device=DEVICE)
)


def inference(
    image: np.ndarray,
    annotation_mode: List[str],
    mask_alpha: float
) -> np.ndarray:
    """Segment *image* with SAM and draw the selected Set-of-Mark overlays.

    The image is passed to ``SamAutomaticMaskGenerator``, the resulting masks
    are converted to ``sv.Detections`` and filtered by ``postprocess_masks``
    using ``MINIMUM_AREA_THRESHOLD``, and the surviving detections are drawn
    by ``Visualizer``.

    Args:
        image: Input picture. It is converted RGB -> BGR before drawing, so
            RGB channel order is assumed.
        annotation_mode: Names of the overlays to draw. The entries checked
            are ``"Box"``, ``"Mask"``, ``"Polygon"`` and ``"Mark"``; any other
            entry has no effect. ``None`` is treated as an empty selection.
        mask_alpha: Opacity forwarded to the visualizer as ``mask_opacity``.

    Returns:
        The annotated image, converted back from BGR to RGB.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    # image can be None (e.g. Run clicked before an upload); fail early with a
    # readable message instead of an opaque error from inside the model.
    if image is None:
        raise ValueError("Please upload an image before clicking Run.")

    # Normalise an unset selection so the membership tests below cannot fail.
    selected_modes = annotation_mode or []

    visualizer = Visualizer(mask_opacity=mask_alpha)
    mask_generator = SamAutomaticMaskGenerator(SAM)
    result = mask_generator.generate(image=image)
    detections = sv.Detections.from_sam(result)
    detections = postprocess_masks(
        detections=detections,
        area_threshold=MINIMUM_AREA_THRESHOLD)

    # The visualizer is fed a BGR image; convert in and back out again so the
    # caller receives the same channel order it supplied.
    bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    annotated_image = visualizer.visualize(
        image=bgr_image,
        detections=detections,
        with_box="Box" in selected_modes,
        with_mask="Mask" in selected_modes,
        with_polygon="Polygon" in selected_modes,
        with_label="Mark" in selected_modes)
    return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)


def prompt(message, history):
    """Chat callback for the ChatInterface; currently a fixed placeholder.

    Both arguments are accepted but not used: the same canned reply is
    returned for every message.
    """
    reply = "response"
    return reply


# UI components are created here, outside any layout, and are placed on the
# page later by calling .render() inside the gr.Blocks context.

# Picture supplied by the user; handed to callbacks as a NumPy array.
image_input = gr.Image(
    label="Input",
    type="numpy",
    height=512,
)

# Which overlays inference() should draw; only "Mark" is enabled by default.
checkbox_annotation_mode = gr.CheckboxGroup(
    choices=["Mark", "Polygon", "Mask", "Box"],
    value=["Mark"],
    label="Annotation Mode",
)

# Opacity for the mask overlay, selectable between 0 and 1.
slider_mask_alpha = gr.Slider(
    minimum=0,
    maximum=1,
    value=0.05,
    label="Mask Alpha",
)

# Annotated result produced by inference().
image_output = gr.Image(
    label="SoM Visual Prompt",
    type="numpy",
    height=512,
)

# Masked text field for the user's OpenAI API key.
textbox_api_key = gr.Textbox(
    label="OpenAI API KEY",
    type="password",
)

# Conversation widget used by the chat interface.
chatbot = gr.Chatbot(
    label="GPT-4V + SoM",
    height=256,
)

# Triggers inference() when clicked.
run_button = gr.Button("Run")

# Page layout. Statement order and nesting define where each pre-built
# component appears, so the structure below is order-sensitive.
with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        # Left column: input image plus the collapsed annotation settings.
        with gr.Column():
            image_input.render()
            with gr.Accordion(label="Detailed prompt settings (e.g., mark type)", open=False):
                with gr.Row():
                    checkbox_annotation_mode.render()
                with gr.Row():
                    slider_mask_alpha.render()
        # Right column: annotated output and the button that produces it.
        with gr.Column():
            image_output.render()
            run_button.render()
    # NOTE(review): the API key textbox is rendered but not passed to any
    # callback in this file.
    textbox_api_key.render()
    with gr.Row():
        # Chat area; messages are answered by prompt().
        gr.ChatInterface(chatbot=chatbot, fn=prompt)

    # Clicking Run feeds the image and settings to inference() and shows the
    # returned array in image_output.
    run_button.click(
        fn=inference,
        inputs=[image_input, checkbox_annotation_mode, slider_mask_alpha],
        outputs=image_output)

# Start the web server only when this file is executed as a script, so that
# importing the module (for tests or tooling) does not block on a running
# server. Queuing is enabled and errors are shown in the UI.
if __name__ == "__main__":
    demo.queue().launch(debug=False, show_error=True)