jiuface committed on
Commit
917a5a6
·
1 Parent(s): 4c32826
Files changed (1) hide show
  1. app.py +28 -19
app.py CHANGED
@@ -9,6 +9,7 @@ from io import BytesIO
9
  import PIL.Image
10
  import requests
11
  import cv2
 
12
 
13
  from utils.florence import load_florence_model, run_florence_inference, \
14
  FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
@@ -44,7 +45,7 @@ def process_image(image_input, image_url, task_prompt, text_prompt=None, dilate=
44
  response.raise_for_status()
45
  image_input = PIL.Image.open(BytesIO(response.content))
46
  print("fetch image success")
47
-
48
  _, result = run_florence_inference(
49
  model=FLORENCE_MODEL,
50
  processor=FLORENCE_PROCESSOR,
@@ -53,67 +54,75 @@ def process_image(image_input, image_url, task_prompt, text_prompt=None, dilate=
53
  task=task_prompt,
54
  text=text_prompt
55
  )
 
56
  detections = sv.Detections.from_lmm(
57
  lmm=sv.LMM.FLORENCE_2,
58
  result=result,
59
  resolution_wh=image_input.size
60
  )
 
61
  images = []
62
  if return_rectangles:
63
- # 创建黑色背景的图片
64
- mask_image = np.zeros((image_input.size.height, image_input.size.width), dtype=np.uint8)
65
- bboxes = detections.get('bboxes', [])
 
66
  for bbox in bboxes:
67
  x1, y1, x2, y2 = map(int, bbox)
68
- # mask_image 上绘制白色的矩形
69
- cv2.rectangle(mask_image, (x1, y1), (x2, y2), 255, thickness=cv2.FILLED)
70
- images = [mask_image]
 
 
 
71
  else:
72
- # sam
73
  detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
74
  if len(detections) == 0:
75
  gr.Info("No objects detected.")
76
  return None
77
- kernel_size = dilate
78
  print("mask generated:", len(detections.mask))
 
79
  kernel = np.ones((kernel_size, kernel_size), np.uint8)
 
80
  for i in range(len(detections.mask)):
81
  mask = detections.mask[i].astype(np.uint8) * 255
82
  if dilate > 0:
83
  mask = cv2.dilate(mask, kernel, iterations=1)
84
  images.append(mask)
 
85
  if merge_masks:
86
-
87
  merged_mask = np.zeros_like(images[0], dtype=np.uint8)
88
  for mask in images:
89
  merged_mask = cv2.bitwise_or(merged_mask, mask)
90
- images = [merged_mask] + images
91
 
92
- return images
93
 
94
 
95
  with gr.Blocks() as demo:
96
  with gr.Row():
97
  with gr.Column():
98
  image = gr.Image(type='pil', label='Upload image')
99
- image_url = gr.Textbox( label='Image url', placeholder='Enter text prompts (Optional)')
100
  task_prompt = gr.Dropdown(
101
  ['<OD>', '<CAPTION_TO_PHRASE_GROUNDING>', '<DENSE_REGION_CAPTION>', '<REGION_PROPOSAL>', '<OCR_WITH_REGION>', '<REFERRING_EXPRESSION_SEGMENTATION>', '<REGION_TO_SEGMENTATION>', '<OPEN_VOCABULARY_DETECTION>', '<REGION_TO_CATEGORY>', '<REGION_TO_DESCRIPTION>'], value="<CAPTION_TO_PHRASE_GROUNDING>", label="Task Prompt", info="task prompts"
102
  )
103
  dilate = gr.Slider(label="dilate mask", minimum=0, maximum=50, value=10, step=1)
104
  merge_masks = gr.Checkbox(label="Merge masks", value=False)
105
- return_rectangles = gr.Checkbox(label="Return rectangle masks", value=False)
106
  text_prompt = gr.Textbox(label='Text prompt', placeholder='Enter text prompts')
107
  submit_button = gr.Button(value='Submit', variant='primary')
108
  with gr.Column():
109
  image_gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery", columns=[3], rows=[1], object_fit="contain", height="auto")
 
 
110
  print(image, image_url, task_prompt, text_prompt, image_gallery)
111
  submit_button.click(
112
- fn = process_image,
113
- inputs = [image, image_url, task_prompt, text_prompt, dilate, merge_masks, return_rectangles],
114
- outputs = [image_gallery,],
115
  show_api=False
116
  )
117
 
118
-
119
- demo.launch(debug=True, show_error=True)
 
9
  import PIL.Image
10
  import requests
11
  import cv2
12
+ import json
13
 
14
  from utils.florence import load_florence_model, run_florence_inference, \
15
  FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
 
45
  response.raise_for_status()
46
  image_input = PIL.Image.open(BytesIO(response.content))
47
  print("fetch image success")
48
+ # start to parse prompt
49
  _, result = run_florence_inference(
50
  model=FLORENCE_MODEL,
51
  processor=FLORENCE_PROCESSOR,
 
54
  task=task_prompt,
55
  text=text_prompt
56
  )
57
+ # start to detect
58
  detections = sv.Detections.from_lmm(
59
  lmm=sv.LMM.FLORENCE_2,
60
  result=result,
61
  resolution_wh=image_input.size
62
  )
63
+ json_result = json.dumps({"bbox": detections.xyxy, "data": detections.data})
64
  images = []
65
  if return_rectangles:
66
+ # create rectangular masks from the detected bounding boxes
67
+ (image_width, image_height) = image_input.size
68
+ bboxes = detections.xyxy
69
+ merge_mask_image = np.zeros((image_height, image_width), dtype=np.uint8)
70
  for bbox in bboxes:
71
  x1, y1, x2, y2 = map(int, bbox)
72
+ cv2.rectangle(merge_mask_image, (x1, y1), (x2, y2), 255, thickness=cv2.FILLED)
73
+ clip_mask = np.zeros((image_height, image_width), dtype=np.uint8)
74
+ cv2.rectangle(clip_mask, (x1, y1), (x2, y2), 255, thickness=cv2.FILLED)
75
+ images.append(clip_mask)
76
+ if merge_masks:
77
+ images = [merge_mask_image] + images
78
  else:
79
+ # use SAM to generate segmentation masks
80
  detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
81
  if len(detections) == 0:
82
  gr.Info("No objects detected.")
83
  return None
 
84
  print("mask generated:", len(detections.mask))
85
+ kernel_size = dilate
86
  kernel = np.ones((kernel_size, kernel_size), np.uint8)
87
+
88
  for i in range(len(detections.mask)):
89
  mask = detections.mask[i].astype(np.uint8) * 255
90
  if dilate > 0:
91
  mask = cv2.dilate(mask, kernel, iterations=1)
92
  images.append(mask)
93
+
94
  if merge_masks:
 
95
  merged_mask = np.zeros_like(images[0], dtype=np.uint8)
96
  for mask in images:
97
  merged_mask = cv2.bitwise_or(merged_mask, mask)
98
+ images = [merged_mask]
99
 
100
+ return [images, json_result]
101
 
102
 
103
# Gradio UI: inputs (image / URL / prompts / mask options) on the left,
# generated mask gallery plus the detection JSON on the right.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            image = gr.Image(type='pil', label='Upload image')
            # Optional alternative to uploading: fetch the image from a URL.
            # (Placeholder fixed — it previously said "Enter text prompts",
            # copy-pasted from the text-prompt field.)
            image_url = gr.Textbox(label='Image url', placeholder='Enter image URL (Optional)')
            # Florence-2 task token; drives which kind of result the model returns.
            task_prompt = gr.Dropdown(
                ['<OD>', '<CAPTION_TO_PHRASE_GROUNDING>', '<DENSE_REGION_CAPTION>', '<REGION_PROPOSAL>', '<OCR_WITH_REGION>', '<REFERRING_EXPRESSION_SEGMENTATION>', '<REGION_TO_SEGMENTATION>', '<OPEN_VOCABULARY_DETECTION>', '<REGION_TO_CATEGORY>', '<REGION_TO_DESCRIPTION>'],
                value="<CAPTION_TO_PHRASE_GROUNDING>", label="Task Prompt", info="task prompts"
            )
            # Kernel size for the cv2.dilate pass applied to each mask (0 = no dilation).
            dilate = gr.Slider(label="dilate mask", minimum=0, maximum=50, value=10, step=1)
            # OR all masks together into one extra image when checked.
            merge_masks = gr.Checkbox(label="Merge masks", value=False)
            # When checked, return filled bounding-box rectangles instead of SAM masks.
            return_rectangles = gr.Checkbox(label="Return Rectangles", value=False)
            text_prompt = gr.Textbox(label='Text prompt', placeholder='Enter text prompts')
            submit_button = gr.Button(value='Submit', variant='primary')
        with gr.Column():
            image_gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery", columns=[3], rows=[1], object_fit="contain", height="auto")
            json_result = gr.Code(label="JSON Result", language="json")

    # NOTE(review): removed a leftover debug print() that dumped the Gradio
    # component objects to stdout at startup.
    submit_button.click(
        fn=process_image,
        inputs=[image, image_url, task_prompt, text_prompt, dilate, merge_masks, return_rectangles],
        outputs=[image_gallery, json_result],
        show_api=False
    )

demo.launch(debug=True, show_error=True)