import torch
import torch.nn.functional as F
import gradio as gr
from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel
import spaces

# Available models: display name -> (checkpoint path, input image size, model family)
MODELS = {
    "CLIP ViT-B/32": ("openai/clip-vit-base-patch32", 224, "clip"),
    "CLIP ViT-B/16": ("openai/clip-vit-base-patch16", 224, "clip"),
    "CLIP ViT-L/14": ("openai/clip-vit-large-patch14", 224, "clip"),
    "CLIP ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336, "clip"),
    "SigLIP Large/16-256": ("google/siglip-large-patch16-256", 256, "siglip"),
    "SigLIP Base/16-384": ("google/siglip-base-patch16-384", 384, "siglip"),
    "SigLIP Large/16-384": ("google/siglip-large-patch16-384", 384, "siglip"),
}

# Initialize models and processors
models = {}
processors = {}

for model_name, (model_path, _, model_type) in MODELS.items():
    if model_type == "clip":
        models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
        processors[model_name] = CLIPProcessor.from_pretrained(model_path)
    elif model_type == "siglip":
        models[model_name] = AutoModel.from_pretrained(model_path).to("cuda")
        processors[model_name] = AutoProcessor.from_pretrained(model_path)
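
# Note (added for context): on Hugging Face ZeroGPU Spaces, the @spaces.GPU
# decorator below allocates a GPU only for the duration of each scoring call.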


@spaces.GPU
def calculate_score(image, text, model_name):
    labels = text.split(";")
    labels = [l.strip() for l in labels]
    labels = list(filter(None, labels))
    if len(labels) == 0:
        return dict()

    model = models[model_name]
    processor = processors[model_name]
    model_type = MODELS[model_name][2]

    # Preprocess the image and text
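    # padding="max_length" matches how the SigLIP text encoder was trained;
    # the CLIP tokenizer accepts it as well (its model_max_length is 77).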
    inputs = processor(text=labels, images=[image], return_tensors="pt", padding="max_length")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    # Calculate embeddings
    with torch.no_grad():
        outputs = model(**inputs)
        # Both CLIP and SigLIP outputs expose pooled, projected embeddings
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

    # Normalize embeddings
    image_embeds = F.normalize(image_embeds, p=2, dim=1)
    text_embeds = F.normalize(text_embeds, p=2, dim=1)

    # Calculate similarity
    if model_type == "clip":
        # For CLIP, use cosine similarity, clamped to [0, 1] for display
        similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
        similarities = torch.clamp(similarities, min=0, max=1)
    elif model_type == "siglip":
        # For SigLIP, use sigmoid on dot product
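        # Note: SigLIP applies a learned logit scale and bias to this dot product
        # during training, so sigmoid over the raw dot product gives relative,
        # uncalibrated scores rather than the model's own matching probabilities.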
        logits = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
        similarities = torch.sigmoid(logits)

    # Convert to numpy array
    similarities = similarities.cpu().numpy()

    results_dict = {label: float(score) for label, score in zip(labels, similarities)}
    return results_dict
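
# Illustrative usage (hypothetical values): the scoring function can also be
# called directly with a PIL image, outside the Gradio UI, e.g.:
#
#   from PIL import Image
#   scores = calculate_score(Image.open("cat.jpg"), "a cat sitting; a dog running", "CLIP ViT-B/16")
#   # -> {"a cat sitting": <score>, "a dog running": <score>}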


with gr.Blocks() as demo:
    gr.Markdown("# Multi-Model CLIP and SigLIP Score")
    gr.Markdown(
        "Calculate how well an image matches each text description using different CLIP and SigLIP model variants: cosine similarity for CLIP, a sigmoid-based score for SigLIP."
    )

    with gr.Row():
        image_input = gr.Image(type="pil")
        output_label = gr.Label()

    with gr.Row():
        text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()), label="Model", value="CLIP ViT-B/16"
        )

    def process_inputs(image, text, model_name):
        if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)

    inputs = [image_input, text_input, model_dropdown]
    outputs = output_label

    image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
    text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
    model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)

    gr.Examples(
        examples=[
            [
                "cat.jpg",
                "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
                "CLIP ViT-B/16",
            ]
        ],
        fn=process_inputs,
        inputs=inputs,
        outputs=outputs,
    )

demo.launch()