h4d35 committed on
Commit
67a02dd
·
1 Parent(s): a95f717

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ from PIL import Image
4
+ from transformers import CLIPProcessor, CLIPModel, DetrFeatureExtractor, DetrForObjectDetection, AutoFeatureExtractor, AutoModelForObjectDetection
5
+ import torch
6
+
7
# DETR object detector and its matching feature extractor; both come from the
# same checkpoint so preprocessing and model stay in sync.
DETR_CHECKPOINT = "nielsr/detr-resnet-50"
feature_extractor = AutoFeatureExtractor.from_pretrained(DETR_CHECKPOINT)
dmodel = AutoModelForObjectDetection.from_pretrained(DETR_CHECKPOINT)

# CLIP model and processor used to compare image crops with a text prompt.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
12
+
13
# Gradio input components: the picture, the text prompt, and the detection
# confidence threshold.
i1 = gr.inputs.Image(label="Input image", type="pil")
i2 = gr.inputs.Textbox(label="Input text")
i3 = gr.inputs.Number(label="Threshold percentage score", default=0.96)

# Gradio output components: the selected crop and its similarity score.
o1 = gr.outputs.Image(label="Cropped part", type="pil")
o2 = gr.outputs.Textbox(label="Similarity score")
18
+
19
def _crop_detections(image, boxes):
    """Return one PIL crop per box in *boxes*, skipping boxes with no area."""
    pixels = np.array(image)  # converted once; every crop is a slice of it
    height, width = pixels.shape[:2]

    crops = []
    for box in boxes:
        # Boxes are not guaranteed to lie inside the image. Clamp them so a
        # negative coordinate is not treated as a from-the-end NumPy index.
        xmin = min(max(int(box[0]), 0), width)
        ymin = min(max(int(box[1]), 0), height)
        xmax = min(max(int(box[2]), 0), width)
        ymax = min(max(int(box[3]), 0), height)
        if xmax <= xmin or ymax <= ymin:
            continue  # an empty slice cannot be converted to an image
        crops.append(Image.fromarray(pixels[ymin:ymax, xmin:xmax]))
    return crops


def extract_image(image, text, prob, num=1):
    """Return the detected region of *image* that best matches *text*.

    The object detector proposes bounding boxes, boxes whose best class
    probability exceeds *prob* are cropped out of the image, and CLIP scores
    every crop against the text prompt.

    Args:
        image: PIL image to search in.
        text: Text prompt describing the region to extract.
        prob: Minimum detection confidence (0-1) for a box to be considered.
        num: Kept for backward compatibility. Only the single best match is
            returned, so this value does not change the result.

    Returns:
        A ``(crop, score)`` tuple: the best-matching crop as a PIL image and
        its softmax similarity score across all candidate crops.

    Raises:
        ValueError: If no usable detection exceeds the threshold.
    """
    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():  # inference only; no autograd graph needed
        outputs = dmodel(**inputs)

    # The last class is the detector's "no object" class; drop it before
    # thresholding on the best remaining class probability.
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > prob

    # post_process expects (height, width); PIL's size is (width, height).
    target_sizes = torch.tensor(image.size[::-1]).unsqueeze(0)
    outs = feature_extractor.post_process(outputs, target_sizes)
    bboxes_scaled = outs[0]['boxes'][keep].detach().numpy()

    images_list = _crop_detections(image, bboxes_scaled)
    if not images_list:
        raise ValueError(
            "No objects were detected above the given threshold; "
            "try a lower threshold."
        )

    inpu = processor(text=[text], images=images_list, return_tensors="pt", padding=True)
    with torch.no_grad():
        output = model(**inpu)

    # logits_per_text has one row per prompt and one column per crop, so the
    # softmax over the last axis ranks the crops for the (single) prompt.
    probs = output.logits_per_text.softmax(-1)[-1].detach().numpy()
    best = int(np.argmax(probs))
    return images_list[best], probs[best]
66
+
67
# Static text shown on the demo page.
title = "ClipnCrop"
description = (
    "Extract sections of images from your image by using OpenAI's CLIP "
    "and Facebooks Detr implemented on HuggingFace Transformers"
)
article = (
    "<p style='text-align: center'>"
    "<a href='https://github.com/Vishnunkumar/clipcrop' target='_blank'>clipcrop</a>"
    "</p>"
)

# Each example row is [image file, text prompt, threshold].
examples = [
    ['ex3.jpg', 'black bag', 0.96],
    ['ex2.jpg', 'man in red dress', 0.85],
]

# Build the interface around extract_image and start serving it.
demo = gr.Interface(
    fn=extract_image,
    inputs=[i1, i2, i3],
    outputs=[o1, o2],
    title=title,
    description=description,
    article=article,
    examples=examples,
    enable_queue=True,
)
demo.launch()