ngthanhtinqn committed on
Commit 127eb07 • 1 Parent(s): 0c76662
Files changed (3)
  1. .gitattributes +1 -0
  2. app.py +2 -48
  3. demo.py +178 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sam_vit_h_4b8939.pth filter=lfs diff=lfs merge=lfs -text
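
The new rule tracks the SAM ViT-H checkpoint (sam_vit_h_4b8939.pth) with Git LFS so the large weight file can live in the Space alongside the code. For local testing, the checkpoint can be fetched with a short helper like the sketch below; the download URL is the commonly used public release location and is an assumption, not part of this commit.

import os
import urllib.request

# Hypothetical helper, not part of the commit: download the SAM ViT-H weights
# referenced by .gitattributes and demo.py if they are not already present.
SAM_CHECKPOINT = "sam_vit_h_4b8939.pth"
SAM_URL = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"  # assumed public URL

if not os.path.exists(SAM_CHECKPOINT):
    urllib.request.urlretrieve(SAM_URL, SAM_CHECKPOINT)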
app.py CHANGED
@@ -1,53 +1,7 @@
- import torch
- import cv2
- import gradio as gr
- import numpy as np
- from transformers import OwlViTProcessor, OwlViTForObjectDetection
-
-
- # Use GPU if available
- if torch.cuda.is_available():
-     device = torch.device("cuda:4")
- else:
-     device = torch.device("cpu")
-
- model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
- model.eval()
- processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
-
-
- def query_image(img, text_queries, score_threshold):
-     text_queries = text_queries
-     text_queries = text_queries.split(",")
 
-     target_sizes = torch.Tensor([img.shape[:2]])
-     inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
-
-     with torch.no_grad():
-         outputs = model(**inputs)
-
-     outputs.logits = outputs.logits.cpu()
-     outputs.pred_boxes = outputs.pred_boxes.cpu()
-     results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
-     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
-
-     font = cv2.FONT_HERSHEY_SIMPLEX
-
-     for box, score, label in zip(boxes, scores, labels):
-         box = [int(i) for i in box.tolist()]
-
-         if score >= score_threshold:
-             img = cv2.rectangle(img, box[:2], box[2:], (255,0,0), 5)
-             if box[3] + 25 > 768:
-                 y = box[3] - 10
-             else:
-                 y = box[3] + 25
-
-             img = cv2.putText(
-                 img, text_queries[label], (box[0], y), font, 1, (255,0,0), 2, cv2.LINE_AA
-             )
-     return img
+ import gradio as gr
 
+ from demo import query_image
 
  description = """
  Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">OWL-ViT</a>,
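
With this change app.py keeps only the Gradio wiring and imports query_image from the new demo.py. The rest of app.py sits outside this hunk; a minimal sketch of how query_image is presumably hooked into a Gradio interface is shown below, with component choices and defaults as assumptions rather than the committed code.

import gradio as gr
from demo import query_image

# Hypothetical wiring, assuming query_image(img, text_prompt, box_threshold):
demo = gr.Interface(
    fn=query_image,
    inputs=[
        gr.Image(),                                          # input image as a numpy array
        gr.Textbox(label="Text queries (comma-separated)"),
        gr.Slider(0.0, 1.0, value=0.1, label="Box threshold"),
    ],
    outputs=gr.Image(label="OWL-ViT + SAM masks"),
)
demo.launch()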
demo.py ADDED
@@ -0,0 +1,178 @@
+ import argparse
+ import os
+ import copy
+
+ import numpy as np
+ import torch
+ from PIL import Image, ImageDraw, ImageFont
+ import PIL
+ # OwlViT Detection
+ from transformers import OwlViTProcessor, OwlViTForObjectDetection
+
+ # segment anything
+ from segment_anything import build_sam, SamPredictor
+ import cv2
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ import gc
+
+ def show_mask(mask, ax, random_color=False):
+     if random_color:
+         color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+     else:
+         color = np.array([30/255, 144/255, 255/255, 0.6])
+     h, w = mask.shape[-2:]
+     mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+     ax.imshow(mask_image)
+
+
+ def show_box(box, ax):
+     x0, y0 = box[0], box[1]
+     w, h = box[2] - box[0], box[3] - box[1]
+     ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))
+
+ def plot_boxes_to_image(image_pil, tgt):
+     H, W = tgt["size"]
+     boxes = tgt["boxes"]
+     labels = tgt["labels"]
+     assert len(boxes) == len(labels), "boxes and labels must have same length"
+
+     draw = ImageDraw.Draw(image_pil)
+     mask = Image.new("L", image_pil.size, 0)
+     mask_draw = ImageDraw.Draw(mask)
+
+     # draw boxes and masks
+     for box, label in zip(boxes, labels):
+         # random color
+         color = tuple(np.random.randint(0, 255, size=3).tolist())
+         # draw
+         x0, y0, x1, y1 = box
+         x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
+
+         draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
+         draw.text((x0, y0), str(label), fill=color)
+
+         font = ImageFont.load_default()
+         if hasattr(font, "getbbox"):
+             bbox = draw.textbbox((x0, y0), str(label), font)
+         else:
+             w, h = draw.textsize(str(label), font)
+             bbox = (x0, y0, w + x0, y0 + h)
+         # bbox = draw.textbbox((x0, y0), str(label))
+         draw.rectangle(bbox, fill=color)
+         draw.text((x0, y0), str(label), fill="white")
+
+         mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
+
+     return image_pil, mask
+
+ # Use GPU if available
+ if torch.cuda.is_available():
+     device = torch.device("cuda:4")
+ else:
+     device = torch.device("cpu")
+
+ # load OWL-ViT model
+ owlvit_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
+ owlvit_model.eval()
+ owlvit_processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+
+ # run segment anything (SAM)
+ sam_predictor = SamPredictor(build_sam(checkpoint="./sam_vit_h_4b8939.pth"))
+
+ def query_image(img, text_prompt, box_threshold):
+     # load image
+     if not isinstance(img, PIL.Image.Image):
+         pil_img = Image.fromarray(np.uint8(img)).convert('RGB')
+
+     text_prompt = text_prompt
+     texts = text_prompt.split(",")
+
+     box_threshold = box_threshold
+
+     # run object detection model
+     with torch.no_grad():
+         inputs = owlvit_processor(text=texts, images=pil_img, return_tensors="pt").to(device)
+         outputs = owlvit_model(**inputs)
+
+     # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
+     target_sizes = torch.Tensor([pil_img.size[::-1]])
+     # Convert outputs (bounding boxes and class logits) to COCO API
+     results = owlvit_processor.post_process_object_detection(outputs=outputs, threshold=box_threshold, target_sizes=target_sizes.to(device))
+     scores = torch.sigmoid(outputs.logits)
+     topk_scores, topk_idxs = torch.topk(scores, k=1, dim=1)
+
+     i = 0  # Retrieve predictions for the first image for the corresponding text queries
+     text = texts[i]
+
+     topk_idxs = topk_idxs.squeeze(1).tolist()
+     topk_boxes = results[i]['boxes'][topk_idxs]
+     topk_scores = topk_scores.view(len(text), -1)
+     topk_labels = results[i]["labels"][topk_idxs]
+     boxes, scores, labels = topk_boxes, topk_scores, topk_labels
+
+     # boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
+
+
+     # Print detected objects and rescaled box coordinates
+     # for box, score, label in zip(boxes, scores, labels):
+     #     box = [round(i, 2) for i in box.tolist()]
+     #     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
+
+     boxes = boxes.cpu().detach().numpy()
+     normalized_boxes = copy.deepcopy(boxes)
+
+     # # visualize pred
+     size = pil_img.size
+     pred_dict = {
+         "boxes": normalized_boxes,
+         "size": [size[1], size[0]],  # H, W
+         "labels": [text[idx] for idx in labels]
+     }
+
+     # release the OWL-ViT
+     # owlvit_model.cpu()
+     # del owlvit_model
+     gc.collect()
+     torch.cuda.empty_cache()
+
+     # run segment anything (SAM)
+     open_cv_image = np.array(pil_img)
+     image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2RGB)
+     sam_predictor.set_image(image)
+
+     H, W = size[1], size[0]
+
+     for i in range(boxes.shape[0]):
+         boxes[i] = torch.Tensor(boxes[i])
+
+     boxes = torch.tensor(boxes, device=sam_predictor.device)
+
+     transformed_boxes = sam_predictor.transform.apply_boxes_torch(boxes, image.shape[:2])
+
+     masks, _, _ = sam_predictor.predict_torch(
+         point_coords = None,
+         point_labels = None,
+         boxes = transformed_boxes,
+         multimask_output = False,
+     )
+     plt.figure(figsize=(10, 10))
+     plt.imshow(image)
+     for mask in masks:
+         show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
+     for box in boxes:
+         show_box(box.numpy(), plt.gca())
+     plt.axis('off')
+
+     import io
+     buf = io.BytesIO()
+     plt.savefig(buf)
+     buf.seek(0)
+     owlvit_segment_image = Image.open(buf).convert('RGB')
+
+     # grounded results
+     image_with_box = plot_boxes_to_image(pil_img, pred_dict)[0]
+
+     # return owlvit_segment_image, image_with_box
+     return owlvit_segment_image
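
For a quick check outside Gradio, query_image can be called directly once the SAM checkpoint and the segment_anything / transformers dependencies are in place. The sketch below uses a placeholder image path and prompt (assumptions); it passes a numpy array, which is the input type the Gradio image component provides and the branch query_image handles.

import numpy as np
from PIL import Image

from demo import query_image

# Hypothetical local test: "example.jpg" and the prompt are placeholders.
img = np.array(Image.open("example.jpg").convert("RGB"))
overlay = query_image(img, "cat,remote control", box_threshold=0.1)  # PIL.Image with SAM masks drawn
overlay.save("owlvit_sam_overlay.png")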