Add IoU filter function and update Dockerfile & app.py
Added compute_iou and filter_highly_overlapping_masks to utils.py to compute Intersection over Union (IoU) between masks and to filter out highly overlapping ones. This improves mask quality by removing redundant masks. The postprocess_masks parameters were updated accordingly: the default area values changed and a new iou_threshold parameter (default 0.9) was added.
Further, the Dockerfile now installs two additional dependencies, pillow and requests, and copies a new file, gpt4v.py, which handles communication with the OpenAI API and encodes image data to base64 for the request payload (a minimal usage sketch follows its listing below).
Additionally, app.py gains OpenAI API key support and chat interface elements next to the Run button for interaction with GPT-4V; the chat handler is currently a stub, and a possible wiring is sketched after the app.py diff below. The GPT-4V module is added to improve interactive-mode support for image-based queries.
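To make the effect of the new overlap filter concrete, here is a small sketch, not part of the commit, that calls compute_iou and filter_highly_overlapping_masks exactly as they are added to utils.py below; the toy masks and the 0.5 threshold are illustrative only.

import numpy as np

from utils import compute_iou, filter_highly_overlapping_masks

# Two 3x3 masks that share two of their three foreground pixels.
mask_a = np.array([[1, 1, 1],
                   [0, 0, 0],
                   [0, 0, 0]], dtype=bool)
mask_b = np.array([[1, 1, 0],
                   [0, 0, 0],
                   [0, 0, 0]], dtype=bool)

# intersection = 2 pixels, union = 3 pixels, so IoU = 2 / 3 ≈ 0.67.
print(compute_iou(mask_a, mask_b))

# At iou_threshold=0.5 the later, near-duplicate mask is dropped; at the
# default of 0.9 used by postprocess_masks, both masks would survive.
masks = np.stack([mask_a, mask_b])
print(filter_highly_overlapping_masks(masks, iou_threshold=0.5).shape)  # (1, 3, 3)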
Dockerfile
CHANGED
@@ -31,7 +31,8 @@ WORKDIR $HOME/app
 RUN pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 -f https://download.pytorch.org/whl/torch_stable.html

 # Install dependencies
-RUN pip install --no-cache-dir gradio==3.50.2 opencv-python supervision==0.17.0rc3
+RUN pip install --no-cache-dir gradio==3.50.2 opencv-python supervision==0.17.0rc3 \
+    pillow requests

 # Install SAM and Detectron2
 RUN pip install 'git+https://github.com/facebookresearch/segment-anything.git'
@@ -43,6 +44,7 @@ RUN wget -c -O $HOME/app/weights/sam_vit_h_4b8939.pth https://dl.fbaipublicfiles

 COPY app.py .
 COPY utils.py .
+COPY gpt4v.py .

 RUN find $HOME/app

app.py
CHANGED
@@ -14,8 +14,8 @@ HOME = os.getenv("HOME")
 DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 MINIMUM_AREA_THRESHOLD = 0.01

-
-SAM_CHECKPOINT = "weights/sam_vit_h_4b8939.pth"
+SAM_CHECKPOINT = os.path.join(HOME, "app/weights/sam_vit_h_4b8939.pth")
+# SAM_CHECKPOINT = "weights/sam_vit_h_4b8939.pth"
 SAM_MODEL_TYPE = "vit_h"

 MARKDOWN = """
@@ -27,11 +27,19 @@ MARKDOWN = """
 Set-of-Mark (SoM) Prompting Unleashes Extraordinary Visual Grounding in GPT-4V
 </h1>

+## 🚀 How To
+
+- Upload an image.
+- Click the `Run` button to generate the image with marks.
+- Pass OpenAI API 🔑. You can get one [here](https://platform.openai.com/api-keys).
+- Ask GPT-4V questions about the image in the chatbot.
+
 ## 🚧 Roadmap

 - [ ] Support for alphabetic labels
 - [ ] Support for Semantic-SAM (multi-level)
 - [ ] Support for interactive mode
+- [ ] Support for result highlighting
 """

 SAM = sam_model_registry[SAM_MODEL_TYPE](checkpoint=SAM_CHECKPOINT).to(device=DEVICE)
@@ -60,6 +68,10 @@ def inference(
     return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)


+def prompt(message, history):
+    return "response"
+
+
 image_input = gr.Image(
     label="Input",
     type="numpy",
@@ -77,6 +89,12 @@ image_output = gr.Image(
     label="SoM Visual Prompt",
     type="numpy",
     height=512)
+textbox_api_key = gr.Textbox(
+    label="OpenAI API KEY",
+    type="password")
+chatbot = gr.Chatbot(
+    label="GPT-4V + SoM",
+    height=256)
 run_button = gr.Button("Run")

 with gr.Blocks() as demo:
@@ -92,6 +110,9 @@ with gr.Blocks() as demo:
         with gr.Column():
             image_output.render()
             run_button.render()
+            textbox_api_key.render()
+    with gr.Row():
+        gr.ChatInterface(chatbot=chatbot, fn=prompt)

     run_button.click(
         fn=inference,
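The prompt handler above is a stub that always returns "response". One way it could later be connected to GPT-4V, purely an assumption for illustration rather than anything this commit does, is to hold the API key and the marked image in module-level state and forward chat messages to prompt_image from the new gpt4v.py:

# Hypothetical follow-up wiring for app.py; the state holders are illustrative only.
from gpt4v import prompt_image

api_key_holder = {"key": None}    # would be filled from textbox_api_key
image_holder = {"image": None}    # would be filled when run_button produces marks

def prompt(message, history):
    # gr.ChatInterface calls fn(message, history); history is unused here.
    if api_key_holder["key"] is None or image_holder["image"] is None:
        return "Run SoM on an image and provide an OpenAI API key first."
    return prompt_image(
        api_key=api_key_holder["key"],
        image=image_holder["image"],
        prompt=message)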
gpt4v.py
ADDED
@@ -0,0 +1,78 @@
+import cv2
+import base64
+import requests
+
+import numpy as np
+
+
+META_PROMPT = '''
+- For any marks mentioned in your answer, please highlight them with [].
+'''
+API_URL = "https://api.openai.com/v1/chat/completions"
+
+
+def encode_image_to_base64(image: np.ndarray) -> str:
+    """
+    Encodes an image into a base64-encoded string in JPEG format.
+
+    Parameters:
+        image (np.ndarray): The image to be encoded. This should be a numpy array as
+            typically used in OpenCV.
+
+    Returns:
+        str: A base64-encoded string representing the image in JPEG format.
+    """
+    success, buffer = cv2.imencode('.jpg', image)
+    if not success:
+        raise ValueError("Could not encode image to JPEG format.")
+
+    encoded_image = base64.b64encode(buffer).decode('utf-8')
+    return encoded_image
+
+
+def compose_headers(api_key: str) -> dict:
+    return {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+
+
+def compose_payload(image: np.ndarray, prompt: str) -> dict:
+    base64_image = encode_image_to_base64(image)
+    return {
+        "model": "gpt-4-vision-preview",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "role": "system",
+                        "content": [
+                            META_PROMPT
+                        ]
+                    },
+                    {
+                        "type": "text",
+                        "text": prompt
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{base64_image}"
+                        }
+                    }
+                ]
+            }
+        ],
+        "max_tokens": 800
+    }
+
+
+def prompt_image(api_key: str, image: np.ndarray, prompt: str) -> str:
+    headers = compose_headers(api_key=api_key)
+    payload = compose_payload(image=image, prompt=prompt)
+    response = requests.post(url=API_URL, headers=headers, json=payload).json()
+
+    if 'error' in response:
+        raise ValueError(response['error']['message'])
+    return response['choices'][0]['message']['content']
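A minimal standalone usage sketch for the new module; the image path and the environment variable holding the API key are assumptions for illustration. prompt_image sends a single request to the chat/completions endpoint and raises ValueError with the API's error message if the response contains an error.

import os

import cv2

from gpt4v import encode_image_to_base64, prompt_image

# Any local image read with OpenCV (the path is illustrative).
image = cv2.imread("marked_image.jpg")

# Base64 JPEG string that ends up in the data URL of the payload.
encoded = encode_image_to_base64(image)
print(len(encoded))

# One round trip to the OpenAI API; returns the text reply.
answer = prompt_image(
    api_key=os.environ["OPENAI_API_KEY"],  # assumed to be set by the caller
    image=image,
    prompt="Which numbered marks contain text?")
print(answer)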
utils.py
CHANGED
@@ -113,11 +113,58 @@ def filter_masks_by_relative_area(
     return masks[min_area_filter & max_area_filter]


+def compute_iou(mask1: np.ndarray, mask2: np.ndarray) -> float:
+    """
+    Computes the Intersection over Union (IoU) of two masks.
+
+    Parameters:
+        mask1, mask2 (np.ndarray): Two mask arrays.
+
+    Returns:
+        float: The IoU of the two masks.
+    """
+    intersection = np.logical_and(mask1, mask2).sum()
+    union = np.logical_or(mask1, mask2).sum()
+    return intersection / union if union != 0 else 0
+
+
+def filter_highly_overlapping_masks(
+    masks: np.ndarray,
+    iou_threshold: float
+) -> np.ndarray:
+    """
+    Removes masks with high overlap from a set of masks.
+
+    Parameters:
+        masks (np.ndarray): A 3D numpy array with shape (N, H, W), where N is the
+            number of masks, and H and W are the height and width of the masks.
+        iou_threshold (float): The IoU threshold above which masks will be considered as
+            overlapping.
+
+    Returns:
+        np.ndarray: A 3D numpy array of masks with highly overlapping masks removed.
+    """
+    num_masks = masks.shape[0]
+    keep_mask = np.ones(num_masks, dtype=bool)
+
+    for i in range(num_masks):
+        for j in range(i + 1, num_masks):
+            if not keep_mask[i] or not keep_mask[j]:
+                continue
+
+            iou = compute_iou(masks[i, :, :], masks[j, :, :])
+            if iou > iou_threshold:
+                keep_mask[j] = False
+
+    return masks[keep_mask]
+
+
 def postprocess_masks(
     detections: sv.Detections,
-    area_threshold: float = 0.,
-    min_relative_area: float = 0.,
-    max_relative_area: float = 1.0
+    area_threshold: float = 0.01,
+    min_relative_area: float = 0.01,
+    max_relative_area: float = 1.0,
+    iou_threshold: float = 0.9
 ) -> sv.Detections:
     """
     Post-processes the masks of detection objects by removing small islands and filling
@@ -128,6 +175,8 @@ def postprocess_masks(
         area_threshold (float): Threshold for relative area to remove or fill features.
         min_relative_area (float): Minimum relative area threshold for detections.
         max_relative_area (float): Maximum relative area threshold for detections.
+        iou_threshold (float): The IoU threshold above which masks will be considered as
+            overlapping.

     Returns:
         np.ndarray: Post-processed masks.
@@ -148,6 +197,9 @@ def postprocess_masks(
         masks=masks,
         min_relative_area=min_relative_area,
         max_relative_area=max_relative_area)
+    masks = filter_highly_overlapping_masks(
+        masks=masks,
+        iou_threshold=iou_threshold)

     return sv.Detections(
         xyxy=sv.mask_to_xyxy(masks),
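For callers, the overlap filter is applied by default through the updated postprocess_masks signature. Below is a hedged sketch of the call pattern on small synthetic detections; the toy masks and the 0.7 threshold are illustrative, and real inputs would come from SAM as in app.py.

import numpy as np
import supervision as sv

from utils import postprocess_masks

# Three 4x4 masks: the first two overlap heavily (IoU = 6/8 = 0.75), the third is disjoint.
masks = np.zeros((3, 4, 4), dtype=bool)
masks[0, :2, :] = True      # top half
masks[1, :2, :3] = True     # top half minus the last column
masks[2, 2:, :] = True      # bottom half

detections = sv.Detections(xyxy=sv.mask_to_xyxy(masks), mask=masks)

# iou_threshold is new in this commit (default 0.9); with 0.7 the second,
# near-duplicate mask should be removed by filter_highly_overlapping_masks,
# while the relative-area defaults keep all three masks in play.
filtered = postprocess_masks(detections=detections, iou_threshold=0.7)
print(len(filtered))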