taskswithcode committed on
Commit
b0e19ad
1 Parent(s): def93e8
app.py ADDED
@@ -0,0 +1,134 @@
+ import torch
+ import cv2
+ import gradio as gr
+ import numpy as np
+ from transformers import OwlViTProcessor, OwlViTForObjectDetection
+ from collections import OrderedDict
+
+
+ # Use GPU if available
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+ else:
+     device = torch.device("cpu")
+
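+ # Load the OWL-ViT detector and its paired processor once at startup;
+ # eval() disables training-only behavior for inference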
+ model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
+ model.eval()
+ processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+
+
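+ # Detect the comma-separated text queries in img, draw boxes around the top
+ # max_results detections, and return the annotated image plus a score summary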
+ def query_image(img, text_queries, max_results):
+     text_queries = text_queries.split(",")
+
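+     # target_sizes carries the image's (height, width) so post-processing can
+     # rescale normalized boxes back to pixel coordinates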
+     target_sizes = torch.Tensor([img.shape[:2]])
+     inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
+
+     with torch.no_grad():
+         outputs = model(**inputs)
+
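+     # Bring predictions back to the CPU, then convert them to pixel-space
+     # boxes, scores, and label indices for this image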
+     outputs.logits = outputs.logits.cpu()
+     outputs.pred_boxes = outputs.pred_boxes.cpu()
+     results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
+     boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
+     results_dict = {}
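+     # Key each detection by its (0-dim tensor) score so detections can be
+     # sorted highest-confidence-first below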
+     for box, score, label in zip(boxes, scores, labels):
+         results_dict[score] = {"box": box, "label": label}
+     sorted_results_dict = OrderedDict(sorted(results_dict.items(), reverse=True))
+
+
+     font = cv2.FONT_HERSHEY_SIMPLEX
+
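+     # Record up to the top 10 confidence scores to report next to the image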
+     score_dist = []
+     count = 0
+     for score in sorted_results_dict:
+         score_dist.append(round(score.tolist(), 2))
+         count += 1
+         if count == 10:
+             break
+
+
+     result_count = 0
+     for score in sorted_results_dict:
+         box = sorted_results_dict[score]["box"]
+         label = sorted_results_dict[score]["label"]
+         box = [int(i) for i in box.tolist()]
+
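+         # Draw the box, then place the caption below it, or just inside the
+         # bottom edge when it would overflow a 768px-tall image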
+         img = cv2.rectangle(img, box[:2], box[2:], (255, 0, 0), 1)
+         if box[3] + 25 > 768:
+             y = box[3] - 10
+         else:
+             y = box[3] + 25
+
+         rounded_score = round(score.tolist(), 2)
+         img = cv2.putText(
+             img, f"({rounded_score}):{text_queries[label]}", (box[0], y), font, 0.5, (255, 0, 0), 1, cv2.LINE_AA
+         )
+         result_count += 1
+         if result_count >= max_results:
+             break
+     return (img, f"Top {count} score confidences: {str(score_dist)}")
+
+
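+ # A minimal sketch of exercising query_image directly (outside Gradio), assuming
+ # the test image "assets/calf.png" from this repo; cv2 loads BGR, so convert to RGB:
+ #
+ #     test_img = cv2.cvtColor(cv2.imread("assets/calf.png"), cv2.COLOR_BGR2RGB)
+ #     annotated, summary = query_image(test_img, "snout,tail", 2)
+ #     print(summary)
+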
+ description = """
+ <div style=\"font-size:18px; color: #2f2f2f; text-align: center\">
+ <i>This app is a tweaked variation of <a href="https://huggingface.co/spaces/adirik/OWL-ViT">Alara Dirik's OWL-ViT demo</a></i>
+ </div>
+ <div style=\"font-size:18px; color: #2f2f2f; text-align: left\">
+ <b>Use cases of this model</b>
+ <br/>1) Given an image with an object, detect it. <i>(e.g. a Where is Waldo? app)</i>
+ <br/>2) Given an image with multiple instances of an object, detect them all. <i>(e.g. labeling-tool assistance for bounding-box annotation)</i>
+ <br/>3) Find an object within an image using either text or an image as input. <i>(e.g. an image-search app - this would require pruning candidates with a threshold over the score distribution in the output. Searching with an input image can help when looking for things that are hard to describe in text, like a machine part)</i>
+ <br/><div style=\"font-size:16px; color: #3f3f3f; text-align: left\">
+ <br/>Links to apps/notebooks of other SOTA models for open-vocabulary or zero-shot object detection:
+ <br/>a) <a href="https://huggingface.co/spaces/CVPR/regionclip-demo">RegionCLIP</a>
+ <br/>b) <a href="https://colab.research.google.com/drive/19LBqQg0cS36rTLL_TaXZ7Ka9KJGkxiSe?usp=sharing">Colab notebook for Object-Centric-OVD</a>
+ </div>
+ <br/><div style=\"font-size:16px; color: #4f4f4f; text-align: left\">Note: While most examples showcase the model's capabilities, some illustrate its limitations, such as finding the globe, bird cage, or teapot in the picture. The model also appears to have text detection and recognition capabilities, even if its text recognition is quite limited.</div>
+ <div style=\"font-size:14px; color: #6f6f6f; text-align: left\"><i>Images below are from&nbsp;<a href="https://en.wikipedia.org/wiki/Hidden_object_game">Wikipedia</a>,&nbsp;<a href="http://images.cocodataset.org/val2017/000000133819.jpg">COCO</a> and <a href="http://host.robots.ox.ac.uk/pascal/VOC/voc2012/">PASCAL VOC 2012</a>&nbsp;datasets</i></div>
+ """
+ demo = gr.Interface(
+     query_image,
+     inputs=[gr.Image(), "text", gr.Slider(1, 10, value=1)],
+     outputs=["image", "text"],
+     title="Where is Waldo? <i>(implemented with OWL-ViT)</i>",
+     description=description,
+     examples=[
+         ["assets/Hidden_object_game_scaled.png", "bicycle", 1],
+         ["assets/Hidden_object_game_scaled.png", "laptop", 1],
+         ["assets/Hidden_object_game_scaled.png", "abacus", 1],
+         ["assets/Hidden_object_game_scaled.png", "frog", 1],
+         ["assets/Hidden_object_game_scaled.png", "bird cage", 2],
+         ["assets/Hidden_object_game_scaled.png", "globe", 2],
+         ["assets/Hidden_object_game_scaled.png", "teapot", 3],
+         ["assets/bus_ovd.jpg", "license plate", 1],
+         ["assets/bus_ovd.jpg", "sign saying ARRIVA", 1],
+         ["assets/bus_ovd.jpg", "sign saying ARRIVAL", 1],
+         ["assets/bus_ovd.jpg", "crossing push button", 1],
+         ["assets/bus_ovd.jpg", "building on mountain", 2],
+         ["assets/bus_ovd.jpg", "road marking", 3],
+         ["assets/bus_ovd.jpg", "mirror", 1],
+         ["assets/bus_ovd.jpg", "traffic camera", 1],
+         ["assets/bus_ovd.jpg", "red bus,blue bus", 2],
+         ["assets/calf.png", "snout,tail", 1],
+         ["assets/calf.png", "hoof", 4],
+         ["assets/calf.png", "ear", 2],
+         ["assets/calf.png", "tag", 1],
+         ["assets/calf.png", "hay", 1],
+         ["assets/calf.png", "barbed wire", 1],
+         ["assets/calf.png", "grass", 1],
+         ["assets/calf.png", "can", 2],
+         ["assets/road_signs.png", "STOP", 1],
+         ["assets/road_signs.png", "STOP sign", 1],
+         ["assets/road_signs.png", "arrow", 1],
+         ["assets/road_signs.png", "ROAD", 1],
+         ["assets/road_signs.png", "triangle", 1],
+     ],
+ )
+ demo.launch(share=True, server_name="0.0.0.0", server_port=80)
assets/Hidden_object_game_scaled.png ADDED
assets/bus_ovd.jpg ADDED
assets/calf.png ADDED
assets/road_signs.png ADDED
long_form_logo_with_icon.png ADDED
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ # pip install -r requirements.txt
+
+ gradio
+ numpy>=1.18.5
+ torch>=1.7.0
+ torchvision>=0.8.1
+ git+https://github.com/huggingface/transformers.git
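+ # transformers is installed from source above; OWL-ViT support was brand new
+ # when this was committed, so a pinned release (likely transformers>=4.21)
+ # should also work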
+ opencv-python
run.sh ADDED
@@ -0,0 +1 @@
+ python app.py