import torch
import gradio as gr
from transformers import CLIPProcessor, CLIPModel
import spaces

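# Load the model and processor once at startup; the model stays on CPU
# until a GPU is attached inside the request handler below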
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

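# spaces.GPU requests a GPU for the duration of each call (ZeroGPU on Hugging Face Spaces)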
@spaces.GPU
def calculate_score(image, text):
    model.to("cuda")  # Move model to CUDA inside the GPU-decorated function
    
    # Parse the semicolon-separated descriptions, dropping empty entries
    labels = [l.strip() for l in text.split(";") if l.strip()]
    if not labels:
        return dict()
    
    # Preprocess the image and tokenize all labels in a single batch
    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    
    # Inference only, so skip gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image.cpu().numpy()
    
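    # logits_per_image is cosine similarity scaled by the model's learned
    # logit_scale (about 100 for this checkpoint), so dividing by 100
    # recovers an approximate per-label cosine similarity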
    results_dict = {label: score / 100.0 for label, score in zip(labels, logits_per_image[0])}
    return results_dict

with gr.Blocks() as demo:
    gr.Markdown("# CLIP Score")
    gr.Markdown("Calculate the [CLIP](https://openai.com/blog/clip/) score of a given image and text")
    
    with gr.Row():
        image_input = gr.Image()
        output_label = gr.Label()
    
    text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
    
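    # Recompute scores whenever the image changes or the text is submitted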
    image_input.change(
        fn=calculate_score,
        inputs=[image_input, text_input],
        outputs=output_label
    )
    
    text_input.submit(
        fn=calculate_score,
        inputs=[image_input, text_input],
        outputs=output_label
    )
    
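    # Preloaded example pair; cat.jpg must be available at this relative path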
    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
            ]
        ],
        fn=calculate_score,
        inputs=[image_input, text_input],
        outputs=output_label,
    )

demo.launch()