Update app.py
app.py
CHANGED
@@ -1,76 +1,73 @@
-import streamlit as st
-# IMAGE_INPUT = st.file_uploader(type=["jpg", "png"], label="Input image")
-# TEXT_INPUT = st.text_input(label="Description for section to extracted")
-# NUMBER_INPUT = st.number_input(value=0.96, label="Threshold percentage score")
-# dmodel = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
-# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
-# SUBMIT_BUTTON = st.button("SUBMIT")
-# def extract_image(image, text, prob, num=1):
-#
+import gradio as gr
 import numpy as np
 from PIL import Image
 from transformers import CLIPProcessor, CLIPModel, YolosImageProcessor, YolosForObjectDetection
 import torch

+i1 = gr.Image(type="pil", label="Input image")
+i2 = gr.Textbox(label="Description for section to be extracted")
+i3 = gr.Number(value=0.96, label="Threshold percentage score")
+o1 = gr.Image(type="pil", label="Extracted crop")
+o2 = gr.Textbox(label="Similarity score")

+feature_extractor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
+dmodel = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')

+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

+def extract_image(image, text, prob, num=1):

+    inputs = feature_extractor(images=image, return_tensors="pt")
+    outputs = dmodel(**inputs)

+    # model predicts bounding boxes and corresponding COCO classes
+    logits = outputs.logits
+    bboxes = outputs.pred_boxes
+    probas = outputs.logits.softmax(-1)[0, :, :-1]  # drop the "no object" class, as in DETR post-processing

+    keep = probas.max(-1).values > prob
+    outs = feature_extractor.post_process(outputs, torch.tensor(image.size[::-1]).unsqueeze(0))
+    bboxes_scaled = outs[0]['boxes'][keep].detach().numpy()
+    labels = outs[0]['labels'][keep].detach().numpy()
+    scores = outs[0]['scores'][keep].detach().numpy()

+    images_list = []
+    for i, j in enumerate(bboxes_scaled):

+        xmin = int(j[0])
+        ymin = int(j[1])
+        xmax = int(j[2])
+        ymax = int(j[3])

+        im_arr = np.array(image)
+        roi = im_arr[ymin:ymax, xmin:xmax]
+        roi_im = Image.fromarray(roi)

+        images_list.append(roi_im)

+    inpu = processor(text=[text], images=images_list, return_tensors="pt", padding=True)
+    output = model(**inpu)
+    logits_per_image = output.logits_per_text  # one CLIP score per candidate crop for the query text
+    probs = logits_per_image.softmax(-1)
+    l_idx = np.argsort(probs[-1].detach().numpy())[::-1][0:num]

+    final_ims = []
+    for i, j in enumerate(images_list):
+        json_dict = {}
+        if i in l_idx:
+            json_dict['image'] = images_list[i]
+            json_dict['score'] = probs[-1].detach().numpy()[i]

+            final_ims.append(json_dict)

+    fi = sorted(final_ims, key=lambda item: item.get("score"), reverse=True)
+    return fi[0]['image'], fi[0]['score']

+title = "ClipnCrop"
+description = "<p style='color:white'>Extract sections from your image using OpenAI's CLIP and a DETR-style detector (YOLOS) from Hugging Face Transformers. If the similarity score is low, please consider the prediction void.</p>"
+examples = [['ex3.jpg', 'black bag', 0.96], ['ex2.jpg', 'man in red dress', 0.85]]
+article = "<p style='color:white; text-align:center;'><a href='https://github.com/Vishnunkumar/clipcrop' target='_blank'>clipcrop</a></p>"
+gr_app = gr.Interface(fn=extract_image, inputs=[i1, i2, i3], outputs=[o1, o2], title=title, description=description, article=article, examples=examples)
+gr_app.launch()
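For quick local testing, a minimal sketch of calling extract_image directly, outside the Gradio interface. It assumes the function and models above are already defined in the running session, and that one of the example files listed in examples (e.g. ex3.jpg) is present in the working directory; the file name and output path here are illustrative only.

from PIL import Image

# Load one of the bundled example images (assumed to exist locally).
image = Image.open("ex3.jpg").convert("RGB")

# Ask for the region that best matches the text query; 0.96 is the
# detection-confidence threshold used as the app's default.
crop, score = extract_image(image, "black bag", prob=0.96)

crop.save("black_bag_crop.png")  # best YOLOS proposal as ranked by CLIP
print("similarity score:", float(score))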