# NOTE: the lines "Spaces:" / "Runtime error" were paste artifacts from the
# hosting page (not part of this module) and have been commented out.
import os

import torch
import torch.nn as nn

import open_clip
from einops import rearrange
def exists(val):
    """Return ``True`` when *val* is not ``None`` (falsy values count as existing)."""
    return val is not None
class CLIPEncoder(nn.Module):
    # NOTE(review): this class is currently a *stub* — see forward(); it emits
    # random placeholder tensors and never runs a real CLIP text encoder.
    def __init__(self, model, pretrained):
        super().__init__()
        #ViT_H_14_laion2b_s32b_b79k
        # If a local checkpoint named after the (model, pretrained) pair exists
        # under models/, use its path as the `pretrained` spec instead.
        fname = "models/" + model.replace("-", "_") + "_" + pretrained + ".pt"
        if os.path.exists(fname):
            print(fname)
            pretrained = fname
        #model = "ViT-B-32"
        #pretrained = "openai"
        # NOTE(review): model construction below is commented out, so self.model
        # holds the model *name string*, not an nn.Module — the unreachable code
        # in forward() would crash if re-enabled without restoring this line.
        self.model = model
        self.pretrained = pretrained
        #self.model, _, _ = open_clip.create_model_and_transforms(model)#, pretrained=pretrained)
        #print(self.model)
        # Hard-coded embedding width — presumably matches ViT-H-14's text width;
        # TODO confirm (the dynamic lookup below is commented out).
        self.output_size = 1024
        #self.output_size = self.model.transformer.width

    def forward(self, texts, return_only_pooled=False):
        # NOTE(review): debug stub — unconditionally returns random values shaped
        # (pooled [B, output_size], tokenwise [B, 77, output_size], mask [B, 77]);
        # `return_only_pooled` is ignored and everything below is unreachable.
        return torch.randn(len(texts), self.output_size), torch.randn(len(texts), 77, self.output_size), torch.ones(len(texts), 77).bool()
        # --- unreachable: the real open_clip text-encoding path ---
        device = next(self.parameters()).device
        toks = open_clip.tokenize(texts).to(device)
        x = self.model.token_embedding(toks)  # [batch_size, n_ctx, d_model]
        x = x + self.model.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.model.transformer(x, attn_mask=self.model.attn_mask)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.model.ln_final(x)
        # Non-pad positions (token id 0 is padding in the CLIP vocab).
        mask = (toks!=0)
        # Pooled embedding: hidden state at the highest token id per row
        # (the EOT token), projected into the joint embedding space.
        pooled = x[torch.arange(x.shape[0]), toks.argmax(dim=-1)] @ self.model.text_projection
        if return_only_pooled:
            return pooled
        else:
            return pooled, x, mask
class CLIPImageEncoder(nn.Module):
    """Embed images/texts with an OpenAI CLIP model.

    ``forward_image`` assumes float image batches in NCHW layout, scaled to
    [0, 1] — TODO confirm with callers; normalization happens here, not in the
    transform returned by ``clip.load`` (which is deliberately discarded so the
    whole pipeline stays batched-tensor-friendly).
    """

    # CLIP's published preprocessing statistics (RGB mean/std).
    CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
    CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

    def __init__(self, model_type="ViT-B/32"):
        super().__init__()
        import clip  # local import keeps `clip` an optional dependency
        # The preprocess transform is unused (see class docstring) — drop it.
        self.model, _ = clip.load(model_type, device="cpu", jit=False)
        mean = torch.tensor(self.CLIP_MEAN).view(1, 3, 1, 1)
        std = torch.tensor(self.CLIP_STD).view(1, 3, 1, 1)
        # Buffers so the stats move with .to(device) but are not trained.
        self.register_buffer("mean", mean)
        self.register_buffer("std", std)
        # Embedding width — hard-coded for ViT-B/32; TODO confirm for other models.
        self.output_size = 512

    def forward_image(self, x):
        """Resize to 224x224 (bicubic), normalize, and return image embeddings."""
        x = torch.nn.functional.interpolate(x, mode='bicubic', size=(224, 224))
        x = (x - self.mean) / self.std
        return self.model.encode_image(x)

    def forward_text(self, texts):
        """Tokenize a list of strings (truncating to context length) and embed."""
        import clip
        toks = clip.tokenize(texts, truncate=True).to(self.mean.device)
        return self.model.encode_text(toks)
class OpenCLIPImageEncoder(nn.Module):
    """Embed images/texts with an ``open_clip`` model.

    ``forward_image`` assumes float image batches in NCHW layout, scaled to
    [0, 1] — TODO confirm with callers.
    """

    def __init__(self, model="ViT-B/32", pretrained="openai"):
        super().__init__()
        # get_tokenizer expects the model *name string*; build the tokenizer
        # before rebinding anything. (BUG FIX: the original passed the
        # instantiated model object here.)
        self.tokenizer = open_clip.get_tokenizer(model)
        clip_model, _, _ = open_clip.create_model_and_transforms(model, pretrained=pretrained)
        # BUG FIX: the original rebound the local `model` and never assigned
        # self.model, so forward_image/forward_text raised AttributeError.
        self.model = clip_model
        # CLIP's published preprocessing statistics (RGB mean/std), kept as
        # buffers so they follow .to(device) without being trained.
        CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
        CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
        self.register_buffer("mean", torch.tensor(CLIP_MEAN).view(1, 3, 1, 1))
        self.register_buffer("std", torch.tensor(CLIP_STD).view(1, 3, 1, 1))

    def forward_image(self, x):
        """Resize to 224x224 (bicubic), normalize, and return image embeddings."""
        x = torch.nn.functional.interpolate(x, mode='bicubic', size=(224, 224))
        x = (x - self.mean) / self.std
        return self.model.encode_image(x)

    def forward_text(self, texts):
        """Tokenize a list of strings and return text embeddings."""
        # BUG FIX: open_clip tokenizers are *callables* that pad/truncate to the
        # model's context length; there is no .tokenize(..., truncate=) method.
        toks = self.tokenizer(texts).to(self.mean.device)
        return self.model.encode_text(toks)