text_to_image_ddgan / clip_encoder.py
Mehdi Cherti
update
9935a02
raw
history blame
3.49 kB
import torch
import torch.nn as nn
import open_clip
from einops import rearrange
import os
def exists(val):
    """Tell whether ``val`` holds a value, i.e. is anything other than ``None``.

    Falsy values such as ``0``, ``""`` or ``[]`` still count as existing.
    """
    if val is None:
        return False
    return True
class CLIPEncoder(nn.Module):
    """Text encoder exposing the interface of a CLIP text tower.

    The constructor does NOT load any CLIP weights: it only records the
    model name and the resolved ``pretrained`` tag/path. While ``self.model``
    is still that name string, ``forward`` returns random placeholder
    tensors of the expected shapes. The real encoding path is kept and runs
    only if ``self.model`` has been replaced by a loaded open_clip model
    after construction.
    """

    def __init__(self, model, pretrained):
        """Record which CLIP model and weights this encoder stands for.

        Args:
            model: Model name, e.g. ``"ViT-H-14"``.
            pretrained: Pretrained tag, e.g. ``"laion2b_s32b_b79k"``. It is
                replaced by a local checkpoint path when
                ``models/<model with '-' -> '_'>_<pretrained>.pt`` exists.
        """
        super().__init__()
        # Example resolved name: models/ViT_H_14_laion2b_s32b_b79k.pt
        fname = "models/" + model.replace("-", "_") + "_" + pretrained + ".pt"
        if os.path.exists(fname):
            print(fname)
            pretrained = fname
        # Model loading is intentionally disabled here. To enable it, assign
        #   self.model, _, _ = open_clip.create_model_and_transforms(
        #       model, pretrained=pretrained)
        # and derive output_size from self.model.transformer.width.
        self.model = model
        self.pretrained = pretrained
        # Feature width of the tensors returned by the placeholder path.
        self.output_size = 1024

    def forward(self, texts, return_only_pooled=False):
        """Encode a batch of strings.

        Args:
            texts: Sequence of strings; only its length matters while no
                model is loaded.
            return_only_pooled: When true, return just the pooled features.

        Returns:
            ``pooled`` if ``return_only_pooled`` is true, otherwise the
            tuple ``(pooled, per_token_features, mask)``.
        """
        if isinstance(self.model, str):
            # No weights were loaded (see __init__): emit placeholders.
            return self._placeholder_features(len(texts), return_only_pooled)
        return self._encode(texts, return_only_pooled)

    def _placeholder_features(self, batch_size, return_only_pooled):
        """Return random tensors shaped like the real encoder outputs."""
        # NOTE(review): 77 is presumably CLIP's text context length -- confirm.
        context_length = 77
        pooled = torch.randn(batch_size, self.output_size)
        if return_only_pooled:
            # Honour the flag so callers get a tensor, not a 3-tuple.
            return pooled
        per_token = torch.randn(batch_size, context_length, self.output_size)
        mask = torch.ones(batch_size, context_length).bool()
        return pooled, per_token, mask

    def _encode(self, texts, return_only_pooled):
        """Run the text transformer of a loaded open_clip model."""
        device = next(self.parameters()).device
        toks = open_clip.tokenize(texts).to(device)
        x = self.model.token_embedding(toks)  # [batch_size, n_ctx, d_model]
        x = x + self.model.positional_embedding
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.model.transformer(x, attn_mask=self.model.attn_mask)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.model.ln_final(x)
        # True where the token id is non-zero (zero is presumably padding).
        mask = toks != 0
        # Pool at the position of the largest token id in each sequence
        # (presumably the end-of-text token), then project.
        pooled = (
            x[torch.arange(x.shape[0]), toks.argmax(dim=-1)]
            @ self.model.text_projection
        )
        if return_only_pooled:
            return pooled
        return pooled, x, mask
class CLIPImageEncoder(nn.Module):
    """Image/text feature extractor backed by a model from the ``clip`` package.

    Images are resized to 224x224 and normalized with the per-channel
    ``mean``/``std`` buffers before being passed to the model.
    """

    def __init__(self, model_type="ViT-B/32"):
        """Load ``model_type`` on CPU (non-JIT) and register the statistics.

        Args:
            model_type: Model name handed to ``clip.load``.
        """
        super().__init__()
        import clip

        # clip.load also returns a preprocessing transform; it is not kept
        # because normalization is done on tensors in forward_image.
        self.model, _ = clip.load(model_type, device="cpu", jit=False)

        # Per-channel statistics, shaped (1, 3, 1, 1) so they broadcast
        # over a batch of 3-channel images.
        channel_stats = {
            "mean": [0.48145466, 0.4578275, 0.40821073],
            "std": [0.26862954, 0.26130258, 0.27577711],
        }
        for buffer_name, values in channel_stats.items():
            self.register_buffer(
                buffer_name, torch.tensor(values).view(1, 3, 1, 1)
            )

        # NOTE(review): hard-coded, not read from the loaded model --
        # confirm it matches every model_type that callers pass.
        self.output_size = 512

    def forward_image(self, x):
        """Resize, normalize and encode a batch of images.

        Args:
            x: Image batch; assumed to be a float tensor of shape
                (N, 3, H, W) -- TODO confirm the expected value range.

        Returns:
            Whatever ``self.model.encode_image`` returns for the batch.
        """
        resized = torch.nn.functional.interpolate(
            x, size=(224, 224), mode='bicubic'
        )
        normalized = (resized - self.mean) / self.std
        return self.model.encode_image(normalized)

    def forward_text(self, texts):
        """Tokenize ``texts`` (with truncation enabled) and encode them.

        Tokens are moved to the device of the ``mean`` buffer.
        """
        import clip

        target_device = self.mean.device
        tokens = clip.tokenize(texts, truncate=True)
        return self.model.encode_text(tokens.to(target_device))
class OpenCLIPImageEncoder(nn.Module):
    """Image/text feature extractor backed by an open_clip model.

    Images are resized to 224x224 and normalized with the per-channel
    ``mean``/``std`` buffers before being passed to the model.
    """

    def __init__(self, model="ViT-B/32", pretrained="openai"):
        """Create the open_clip model and its matching tokenizer.

        Args:
            model: Model name. Slash-style names such as ``"ViT-B/32"``
                are converted to the dashed form (``"ViT-B-32"``).
            pretrained: Pretrained tag or checkpoint path passed to
                ``open_clip.create_model_and_transforms``.
        """
        super().__init__()
        # The tokenizer lookup is keyed by the model NAME, so normalize it
        # once and keep it separate from the loaded model object.
        model_name = model.replace("/", "-")
        # The model must live on self: forward_image/forward_text use it.
        # The two returned preprocessing transforms are not needed because
        # normalization is done on tensors in forward_image.
        self.model, _, _ = open_clip.create_model_and_transforms(
            model_name, pretrained=pretrained
        )
        self.tokenizer = open_clip.get_tokenizer(model_name)

        # Per-channel statistics, shaped (1, 3, 1, 1) so they broadcast
        # over a batch of 3-channel images.
        CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
        CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
        mean = torch.tensor(CLIP_MEAN).view(1, 3, 1, 1)
        std = torch.tensor(CLIP_STD).view(1, 3, 1, 1)
        self.register_buffer("mean", mean)
        self.register_buffer("std", std)

    def forward_image(self, x):
        """Resize, normalize and encode a batch of images.

        Args:
            x: Image batch; assumed to be a float tensor of shape
                (N, 3, H, W) -- TODO confirm the expected value range.

        Returns:
            Whatever ``self.model.encode_image`` returns for the batch.
        """
        x = torch.nn.functional.interpolate(x, mode='bicubic', size=(224, 224))
        x = (x - self.mean) / self.std
        return self.model.encode_image(x)

    def forward_text(self, texts):
        """Tokenize ``texts`` and encode them with the text tower.

        Tokens are moved to the device of the ``mean`` buffer.
        """
        # open_clip tokenizers are called directly; they expose no
        # ``.tokenize(..., truncate=True)`` method.
        # NOTE(review): truncation to the context length is presumably the
        # tokenizer's default -- confirm for the installed version.
        toks = self.tokenizer(texts).to(self.mean.device)
        return self.model.encode_text(toks)