MSaadTariq committed on
Commit
389e8f6
·
verified ·
1 Parent(s): cc70891

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -0
app.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from transformers import Owlv2Processor, Owlv2ForObjectDetection
4
+ import spaces
5
+
6
# Run on CUDA when torch reports it as available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the OWLv2 detector weights and move them to the selected device.
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
model = model.to(device)

# Matching processor: prepares text/image inputs and post-processes outputs.
processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
14
+
15
def query_image(Upload_Image, Text, score_threshold):
    """Detect the objects described by ``Text`` in ``Upload_Image``.

    Args:
        Upload_Image: Image array from the Gradio image input; its height and
            width are read from ``Upload_Image.shape[:2]``.
        Text: Comma-separated text descriptions of the objects to look for.
        score_threshold: Minimum score a detection must reach to be returned.

    Returns:
        A ``(Upload_Image, annotations)`` tuple for the ``annotatedimage``
        output, where ``annotations`` is a list of ``(box, label)`` pairs.
        ``box`` is the list of integer box coordinates produced by the
        processor's post-processing and ``label`` is the matching query text.

    Raises:
        gr.Error: If no image was provided or ``Text`` contains no non-empty
            query.
    """
    if Upload_Image is None:
        raise gr.Error("Please upload an image.")

    # Strip whitespace around each query ("cat, dog" -> ["cat", "dog"]) and
    # drop empty entries left by stray or trailing commas.
    queries = [query.strip() for query in (Text or "").split(",") if query.strip()]
    if not queries:
        raise gr.Error("Please enter at least one text description.")

    # Boxes are rescaled to a square whose side is the longer image edge.
    # NOTE(review): this presumes the OWLv2 processor pads the image to a
    # square before resizing -- confirm against the transformers docs.
    size = max(Upload_Image.shape[:2])
    target_sizes = torch.Tensor([[size, size]])
    inputs = processor(text=queries, images=Upload_Image, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Post-processing runs on the CPU alongside ``target_sizes``.
    outputs.logits = outputs.logits.cpu()
    outputs.pred_boxes = outputs.pred_boxes.cpu()

    # Pass the user's threshold through: without it the post-processor applies
    # its own default cut-off, so lower slider values would have no effect.
    results = processor.post_process_object_detection(
        outputs=outputs,
        threshold=score_threshold,
        target_sizes=target_sizes,
    )
    detections = results[0]
    boxes, scores, labels = detections["boxes"], detections["scores"], detections["labels"]

    result_labels = [
        ([int(coord) for coord in box.tolist()], queries[label.item()])
        for box, score, label in zip(boxes, scores, labels)
        if score >= score_threshold
    ]
    return Upload_Image, result_labels
38
+
39
+
40
# Explanatory text passed to the interface as its description.
description = """
You can use AnyVision to query images with text descriptions of any object.
To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
can also use the score threshold slider to set a threshold to filter out low probability predictions.

You can get better predictions by querying the image with text templates used in training the original model: e.g. *"photo of a star-spangled banner"*,
*"image of a shoe"*.
"""

# Inputs map positionally onto query_image's parameters:
# image, comma-separated queries, score threshold.
demo = gr.Interface(
    fn=query_image,
    inputs=[
        gr.Image(),
        "text",
        gr.Slider(minimum=0, maximum=1, value=0.1),
    ],
    outputs="annotatedimage",
    title="AnyVision - Zero-Shot Object Detector with Owl2",
    description=description,
)
demo.launch()
56
+