taesiri committed
Commit 9270f3d
1 Parent(s): ba78c79
Files changed (1)
  1. app.py  +56 -33
app.py CHANGED
@@ -1,24 +1,34 @@
 import torch
 import torch.nn.functional as F
 import gradio as gr
-from transformers import CLIPProcessor, CLIPModel
+from transformers import CLIPProcessor, CLIPModel, AutoProcessor, AutoModel
 import spaces
 
-# Dictionary of available CLIP models with their image sizes
-CLIP_MODELS = {
-    "ViT-B/32": ("openai/clip-vit-base-patch32", 224),
-    "ViT-B/16": ("openai/clip-vit-base-patch16", 224),
-    "ViT-L/14": ("openai/clip-vit-large-patch14", 224),
-    "ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336),
+# Dictionary of available models with their image sizes
+MODELS = {
+    "CLIP ViT-B/32": ("openai/clip-vit-base-patch32", 224, "clip"),
+    "CLIP ViT-B/16": ("openai/clip-vit-base-patch16", 224, "clip"),
+    "CLIP ViT-L/14": ("openai/clip-vit-large-patch14", 224, "clip"),
+    "CLIP ViT-L/14@336px": ("openai/clip-vit-large-patch14-336", 336, "clip"),
+    "SigLIP SO400M/14-384": ("google/siglip-so400m-patch14-384", 384, "siglip"),
+    "SigLIP Large/16-256": ("google/siglip-large-patch16-256", 256, "siglip"),
+    "SigLIP SO400M/14-224": ("google/siglip-so400m-patch14-224", 224, "siglip"),
+    "SigLIP Base/16-384": ("google/siglip-base-patch16-384", 384, "siglip"),
+    "SigLIP Large/16-384": ("google/siglip-large-patch16-384", 384, "siglip"),
 }
 
 # Initialize models and processors
 models = {}
 processors = {}
 
-for model_name, (model_path, _) in CLIP_MODELS.items():
-    models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
-    processors[model_name] = CLIPProcessor.from_pretrained(model_path)
+for model_name, (model_path, _, model_type) in MODELS.items():
+    if model_type == "clip":
+        models[model_name] = CLIPModel.from_pretrained(model_path).to("cuda")
+        processors[model_name] = CLIPProcessor.from_pretrained(model_path)
+    elif model_type == "siglip":
+        models[model_name] = AutoModel.from_pretrained(model_path).to("cuda")
+        processors[model_name] = AutoProcessor.from_pretrained(model_path)
+
 
 @spaces.GPU
 def calculate_score(image, text, model_name):
@@ -27,63 +37,76 @@ def calculate_score(image, text, model_name):
     labels = list(filter(None, labels))
     if len(labels) == 0:
         return dict()
-
+
     model = models[model_name]
     processor = processors[model_name]
-
+    model_type = MODELS[model_name][2]
+
     # Preprocess the image and text
     inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
     inputs = {k: v.to("cuda") for k, v in inputs.items()}
-
+
     # Calculate embeddings
     with torch.no_grad():
         outputs = model(**inputs)
-        image_embeds = outputs.image_embeds
-        text_embeds = outputs.text_embeds
-
+        if model_type == "clip":
+            image_embeds = outputs.image_embeds
+            text_embeds = outputs.text_embeds
+        elif model_type == "siglip":
+            image_embeds = outputs.image_embeds
+            text_embeds = outputs.text_embeds
+
     # Normalize embeddings
     image_embeds = F.normalize(image_embeds, p=2, dim=1)
     text_embeds = F.normalize(text_embeds, p=2, dim=1)
-
+
     # Calculate cosine similarity
     cosine_similarities = torch.mm(text_embeds, image_embeds.t()).squeeze(1)
-
-    # Convert to percentages
-    percentages = ((cosine_similarities + 1) / 2).cpu().numpy()
-
-    results_dict = {label: float(score) for label, score in zip(labels, percentages)}
+
+    # Ensure values are between 0 and 1
+    cosine_similarities = torch.clamp(cosine_similarities, min=0, max=1)
+
+    # Convert to numpy array
+    similarities = cosine_similarities.cpu().numpy()
+
+    results_dict = {label: float(score) for label, score in zip(labels, similarities)}
     return results_dict
 
+
 with gr.Blocks() as demo:
-    gr.Markdown("# Multi-Model CLIP Score")
-    gr.Markdown("Calculate the CLIP score (cosine similarity) between the given image and text descriptions using different CLIP model variants")
-
+    gr.Markdown("# Multi-Model CLIP and SigLIP Score")
+    gr.Markdown(
+        "Calculate the score (cosine similarity) between the given image and text descriptions using different CLIP and SigLIP model variants"
+    )
+
     with gr.Row():
         image_input = gr.Image(type="pil")
         output_label = gr.Label()
-
+
     with gr.Row():
         text_input = gr.Textbox(label="Descriptions (separated by semicolons)")
-        model_dropdown = gr.Dropdown(choices=list(CLIP_MODELS.keys()), label="CLIP Model", value="ViT-B/16")
-
+        model_dropdown = gr.Dropdown(
+            choices=list(MODELS.keys()), label="Model", value="CLIP ViT-B/16"
+        )
+
     def process_inputs(image, text, model_name):
         if image is None or text.strip() == "":
            return None
        return calculate_score(image, text, model_name)
-
+
     inputs = [image_input, text_input, model_dropdown]
     outputs = output_label
-
+
     image_input.change(fn=process_inputs, inputs=inputs, outputs=outputs)
     text_input.submit(fn=process_inputs, inputs=inputs, outputs=outputs)
     model_dropdown.change(fn=process_inputs, inputs=inputs, outputs=outputs)
-
+
     gr.Examples(
         examples=[
             [
                 "cat.jpg",
                 "a cat stuck in a door; a cat in the air; a cat sitting; a cat standing; a cat is entering the matrix; a cat is entering the void",
-                "ViT-B/16"
+                "CLIP ViT-B/16",
             ]
         ],
         fn=process_inputs,
@@ -91,4 +114,4 @@ with gr.Blocks() as demo:
         outputs=outputs,
     )
 
-demo.launch()
+demo.launch()
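
For reference, a minimal standalone sketch of the scoring path that the updated app.py implements, stripped of the Gradio UI. The assumptions here are not part of the commit: CPU execution instead of the Space's CUDA transfer, only the CLIP ViT-B/16 checkpoint, a local cat.jpg, and a hand-picked label list.

import torch
import torch.nn.functional as F
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# Load one of the checkpoints listed in MODELS (CLIP ViT-B/16 here).
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

image = Image.open("cat.jpg")
labels = ["a cat sitting", "a cat standing", "a cat in the air"]

# Same preprocessing and scoring steps as calculate_score, minus the .to("cuda") calls.
inputs = processor(text=labels, images=[image], return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# L2-normalize both embeddings, take per-label cosine similarity, clamp to [0, 1].
image_embeds = F.normalize(outputs.image_embeds, p=2, dim=1)
text_embeds = F.normalize(outputs.text_embeds, p=2, dim=1)
scores = torch.clamp(torch.mm(text_embeds, image_embeds.t()).squeeze(1), min=0, max=1)

print({label: float(score) for label, score in zip(labels, scores)})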