import torch
import requests
from torch import nn
from PIL import Image
class CLIP(nn.Module):
    def __init__(self, model_name):
        super(CLIP, self).__init__()
        # model_name: e.g. openai/clip-vit-base-patch32
        print('Initializing CLIP model...')
        from transformers import CLIPProcessor, CLIPModel
        self.model = CLIPModel.from_pretrained(model_name)
        self.model.eval()
        self.processor = CLIPProcessor.from_pretrained(model_name)
        from transformers import CLIPTokenizer
        self.tokenizer = CLIPTokenizer.from_pretrained(model_name)
        self.cuda_has_been_checked = False
        print('CLIP model initialized.')

    def check_cuda(self):
        self.cuda_available = next(self.model.parameters()).is_cuda
        self.device = next(self.model.parameters()).get_device()
        if self.cuda_available:
            print('Cuda is available.')
            print('Device is {}'.format(self.device))
        else:
            print('Cuda is not available.')
            print('Device is {}'.format(self.device))

    @torch.no_grad()
    def compute_image_representation_from_image_path(self, image_path):
        if not self.cuda_has_been_checked:
            self.check_cuda()
            self.cuda_has_been_checked = True
        # image_path: the path of the image
        image = Image.open(image_path)
        inputs = self.processor(images=image, return_tensors="pt")
        pixel_values = inputs['pixel_values']
        if self.cuda_available:
            pixel_values = pixel_values.cuda(self.device)
        visual_outputs = self.model.vision_model(pixel_values=pixel_values)
        image_embeds = visual_outputs[1]  # pooled output of the vision encoder
        image_embeds = self.model.visual_projection(image_embeds)  # [1 x embed_dim]
        return image_embeds

    def compute_image_representation_from_image_instance(self, image):
        if not self.cuda_has_been_checked:
            self.check_cuda()
            self.cuda_has_been_checked = True
        # image: a PIL.Image instance (not a path)
        inputs = self.processor(images=image, return_tensors="pt")
        pixel_values = inputs['pixel_values']
        if self.cuda_available:
            pixel_values = pixel_values.cuda(self.device)
        visual_outputs = self.model.vision_model(pixel_values=pixel_values)
        image_embeds = visual_outputs[1]
        image_embeds = self.model.visual_projection(image_embeds)  # [1 x embed_dim]
        return image_embeds

    def compute_text_representation(self, text_list):
        if not self.cuda_has_been_checked:
            self.check_cuda()
            self.cuda_has_been_checked = True
        # text_list: a list of text
        text_inputs = self.tokenizer(text_list, padding=True, return_tensors="pt",
            max_length=self.tokenizer.max_len_single_sentence + 2, truncation=True)
        # self.tokenizer.max_len_single_sentence + 2 = 77
        input_ids, attention_mask = text_inputs['input_ids'], text_inputs['attention_mask']
        if self.cuda_available:
            input_ids = input_ids.cuda(self.device)
            attention_mask = attention_mask.cuda(self.device)
        text_outputs = self.model.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        text_embeds = text_outputs[1]  # pooled output of the text encoder
        text_embeds = self.model.text_projection(text_embeds)
        return text_embeds

    def compute_image_text_similarity_via_embeddings(self, image_embeds, text_embeds):
        '''
            image_embeds: 1 x embed_dim
            text_embeds: len(text_list) x embed_dim
        '''
        # normalize both embeddings to unit length before taking the dot product
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)
        logit_scale = self.model.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.T
        # returns (softmax probabilities, unscaled cosine similarities), each 1 x len(text_list)
        return logits_per_image.softmax(dim=1), logits_per_image / logit_scale

    def compute_image_text_similarity_via_raw_text(self, image_embeds, text_list):
        text_embeds = self.compute_text_representation(text_list)
        return self.compute_image_text_similarity_via_embeddings(image_embeds, text_embeds)

    ### -------------------- functions for building index ---------------------- ###
    def compute_batch_index_image_features(self, image_list):
        '''
            image_list: a list of image instances
        '''
        if not self.cuda_has_been_checked:
            self.check_cuda()
            self.cuda_has_been_checked = True
        inputs = self.processor(images=image_list, return_tensors="pt")
        pixel_values = inputs['pixel_values']
        if self.cuda_available:
            pixel_values = pixel_values.cuda(self.device)
        visual_outputs = self.model.vision_model(pixel_values=pixel_values)
        image_embeds = visual_outputs[1]
        image_embeds = self.model.visual_projection(image_embeds)
        return image_embeds  # len(image_list) x embed_dim

    def compute_batch_index_text_representation(self, text_list):
        if not self.cuda_has_been_checked:
            self.check_cuda()
            self.cuda_has_been_checked = True
        # text_list: a list of text
        #text_inputs = self.tokenizer(text_list, padding=True, return_tensors="pt")
        text_inputs = self.tokenizer(text_list, padding=True, return_tensors="pt",
            max_length=self.tokenizer.max_len_single_sentence + 2, truncation=True)
        input_ids, attention_mask = text_inputs['input_ids'], text_inputs['attention_mask']
        if self.cuda_available:
            input_ids = input_ids.cuda(self.device)
            attention_mask = attention_mask.cuda(self.device)
        text_outputs = self.model.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        text_embeds = text_outputs[1]
        text_embeds = self.model.text_projection(text_embeds)
        return text_embeds
        #logit_scale = self.model.logit_scale.exp()
        #text_embeds = text_embeds * logit_scale
        #return text_embeds
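
# --------------------------------------------------------------------------- #
# Minimal usage sketch (not part of the original file). It assumes the
# openai/clip-vit-base-patch32 checkpoint mentioned in __init__ above, a
# hypothetical local image at 'example.jpg', and made-up candidate captions
# for illustration only.
if __name__ == '__main__':
    clip = CLIP('openai/clip-vit-base-patch32')
    if torch.cuda.is_available():
        clip = clip.cuda()
    # embed the image once, then score a list of candidate captions against it
    image_embeds = clip.compute_image_representation_from_image_path('example.jpg')
    captions = ['a dog running on the beach', 'a plate of food on a table']
    probs, sims = clip.compute_image_text_similarity_via_raw_text(image_embeds, captions)
    print(probs)  # softmax over the candidate captions, shape [1 x len(captions)]
    print(sims)   # unscaled cosine similarities, same shape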