File size: 1,807 Bytes
d60d34b
acda6c7
 
d60d34b
acda6c7
fc5e177
acda6c7
 
73f9f45
acda6c7
3601eff
 
 
 
 
c890be1
3601eff
c890be1
 
3601eff
c890be1
 
 
3601eff
acda6c7
d60d34b
 
c890be1
 
d60d34b
 
 
c890be1
d60d34b
c890be1
d60d34b
c890be1
 
 
d60d34b
c890be1
d60d34b
c890be1
 
 
d60d34b
c890be1
d60d34b
 
 
 
 
 
 
3601eff
d60d34b
 
3601eff
 
c890be1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces

# Single checkpoint id shared by the model weights and their processor so
# the two can never drift apart.
_CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"

# Load the CLIP weights, then move them onto the GPU.
model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
model = model.to("cuda")

# The processor turns raw text/image pairs into model-ready tensors.
processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)

@spaces.GPU
def calculate_score(image, text):
    """Score an image against semicolon-separated descriptions using CLIP.

    Args:
        image: Image handed unchanged to the CLIP processor. ``None`` (no
            image currently set in the UI) produces an empty result.
        text: Candidate descriptions separated by ``;``. Surrounding
            whitespace is stripped and blank entries are dropped. ``None``
            is treated like an empty string.

    Returns:
        A dict mapping each description to its image-text logit divided by
        100 (as a plain Python float), or an empty dict when there is no
        image or no usable description.
    """
    # Gradio passes None when no image is set (e.g. the text is submitted
    # before an upload); the processor cannot work without an image, so
    # return the same empty result used for "no labels".
    if image is None:
        return {}

    labels = [part.strip() for part in (text or "").split(";")]
    labels = [label for label in labels if label]
    if not labels:
        return {}

    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Inference only: disable autograd so no graph is built and GPU memory
    # is not held for gradients that are never used.
    with torch.no_grad():
        outputs = model(**inputs)

    # One image was supplied, so row 0 holds its logit for every label.
    # tolist() yields plain Python floats, which serialize cleanly.
    image_logits = outputs.logits_per_image[0].cpu().tolist()

    return {label: score / 100.0 for label, score in zip(labels, image_logits)}

# Build the single-page UI: image + result label side by side, a textbox for
# the candidate descriptions underneath, and one clickable example.
with gr.Blocks() as demo:
    gr.Markdown("# CLIP Score")
    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text")

    with gr.Row():
        image_input = gr.Image()
        output_label = gr.Label()

    text_input = gr.Textbox(label="Descriptions (separated by semicolons)")

    # Re-score whenever the image changes or the text is submitted; both
    # triggers use the same handler and the same input/output wiring.
    for _register in (image_input.change, text_input.submit):
        _register(
            fn=calculate_score,
            inputs=[image_input, text_input],
            outputs=output_label,
        )

    # Sample row shown under the inputs: [image path, descriptions].
    _example_image = "cat.jpg"
    _example_text = "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void"
    gr.Examples(
        examples=[[_example_image, _example_text]],
        fn=calculate_score,
        inputs=[image_input, text_input],
        outputs=output_label,
    )

demo.launch()