import gradio as gr
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, YolosImageProcessor, YolosForObjectDetection
import torch
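
# ClipnCrop: a Gradio app that extracts the region of an image best matching a
# text description. YOLOS proposes candidate object boxes, the crops are ranked
# against the text with CLIP, and the best-scoring crop is returned.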

i1 = gr.Image(type="pil", label="Input image")
i2 = gr.Textbox(label="Description of the section to extract")
i3 = gr.Number(value=0.96, label="Detection confidence threshold")
o1 = gr.Image(type="pil", label="Extracted crop")
o2 = gr.Textbox(label="Similarity score")


# YOLOS (tiny) for object detection
feature_extractor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
dmodel = YolosForObjectDetection.from_pretrained("hustvl/yolos-tiny")

# CLIP for matching the cropped regions against the text description
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

def extract_image(image, text, prob, num=1):
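    """Detect objects in `image` with YOLOS, crop each detection, rank the
    crops against `text` with CLIP, and return the best crop and its score."""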
    
    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = dmodel(**inputs)
    
    # The model predicts bounding boxes and class logits over the COCO classes;
    # drop the last logit, which corresponds to the "no object" background class
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    
    # Keep detections above the confidence threshold and rescale their boxes
    # to the original image size
    keep = probas.max(-1).values > prob
    outs = feature_extractor.post_process(outputs, torch.tensor(image.size[::-1]).unsqueeze(0))
    bboxes_scaled = outs[0]['boxes'][keep].detach().numpy()
    labels = outs[0]['labels'][keep].detach().numpy()
    scores = outs[0]['scores'][keep].detach().numpy()
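
    # If nothing clears the confidence threshold there is nothing to crop,
    # so return early instead of failing later in the CLIP ranking step
    if len(bboxes_scaled) == 0:
        return None, "No objects detected above the given threshold"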
    
    # Crop every kept bounding box out of the original image
    images_list = []
    for j in bboxes_scaled:
      
      xmin = int(j[0])
      ymin = int(j[1])
      xmax = int(j[2])
      ymax = int(j[3])
    
      im_arr = np.array(image)
      roi = im_arr[ymin:ymax, xmin:xmax]
      roi_im = Image.fromarray(roi)
    
      images_list.append(roi_im)
    
    # Score every crop against the text prompt with CLIP
    inputs_clip = processor(text=[text], images=images_list, return_tensors="pt", padding=True)
    output = model(**inputs_clip)
    logits_per_text = output.logits_per_text  # shape: (1, num_crops)
    probs = logits_per_text.softmax(-1)
    # Indices of the `num` highest-scoring crops
    l_idx = np.argsort(probs[-1].detach().numpy())[::-1][0:num]
    
    # Collect the selected crops with their scores
    final_ims = []
    for i in l_idx:
      json_dict = {}
      json_dict['image'] = images_list[i]
      json_dict['score'] = probs[-1].detach().numpy()[i]
      final_ims.append(json_dict)
    
    # Return the highest-scoring crop and its similarity score
    fi = sorted(final_ims, key=lambda item: item.get("score"), reverse=True)
    return fi[0]['image'], fi[0]['score']

title = "ClipnCrop"
description = "<p style='color:white'>Extract a section of an image by describing it in text, using OpenAI's CLIP and a YOLOS object detector via Hugging Face Transformers. If the similarity score is low, consider the prediction unreliable.</p>"
examples=[['ex3.jpg', 'black bag', 0.96],['ex2.jpg', 'man in red dress', 0.85]]
article = "<p style= 'color:white; text-align:center;'><a href='https://github.com/Vishnunkumar/clipcrop' target='_blank'>clipcrop</a></p>"
gr_app = gr.Interface(fn=extract_image, inputs=[i1, i2, i3], outputs=[o1, o2], title=title, description=description, article=article, examples=examples)
gr_app.launch()
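
# Running this script (e.g. `python <this_file>.py`, assuming the imports above
# are installed) starts a local Gradio server and prints a URL to open in the browser.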