import logging
from collections import OrderedDict
from typing import Optional

import torch
from torch import nn

logger = logging.getLogger(__name__)


class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16 inputs."""

    def forward(self, x: torch.Tensor):
        if self.weight.dtype != x.dtype:
            # Run the normalization in the parameter dtype, then cast the
            # result back to the input dtype.
            orig_type = x.dtype
            ret = super().forward(x.type(self.weight.dtype))
            return ret.type(orig_type)
        return super().forward(x)


class QuickGELU(nn.Module):
    """Sigmoid-based approximation of GELU: x * sigmoid(1.702 * x)."""

    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
    def __init__(
        self,
        d_model: int,
        n_head: int,
        attn_mask: Optional[torch.Tensor] = None,
    ):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(
            OrderedDict(
                [
                    ("c_fc", nn.Linear(d_model, d_model * 4)),
                    ("gelu", QuickGELU()),
                    ("c_proj", nn.Linear(d_model * 4, d_model)),
                ]
            )
        )
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        # Keep the mask on the same dtype/device as the activations.
        self.attn_mask = (
            self.attn_mask.to(dtype=x.dtype, device=x.device)
            if self.attn_mask is not None
            else None
        )
        return self.attn(
            x,
            x,
            x,
            need_weights=False,
            attn_mask=self.attn_mask,
        )[0]

    def forward(self, x: torch.Tensor):
        # Pre-norm residual connections around attention and the MLP.
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):
    def __init__(
        self,
        width: int,
        layers: int,
        heads: int,
        attn_mask: Optional[torch.Tensor] = None,
    ):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(
            *[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]
        )

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)


class ConditionalViT(nn.Module):
    def __init__(
        self,
        input_resolution: int,
        patch_size: int,
        width: int,
        layers: int,
        heads: int,
        output_dim: int,
    ):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(
            in_channels=3,
            out_channels=width,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
        )

        scale = width**-0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        # Positional embedding for the optional conditioning token.
        self.c_pos_embedding = nn.Parameter(scale * torch.randn(1, width))
        # Positional embeddings for the class token and the image patches.
        self.positional_embedding = nn.Parameter(
            scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)
        )
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)
        self.ln_post = LayerNorm(width)
        # Learnable logit scale, initialised to ln(100) ~= 4.6052.
        self.logit_scale = torch.nn.Parameter(torch.ones([]) * 4.6052)

        self.proj = nn.Linear(width, output_dim, bias=False)

    def forward(self, imgs: torch.Tensor, c: Optional[torch.Tensor] = None):
        """
        imgs: batch of images, shape (batch, 3, input_resolution, input_resolution).
        c: optional text embedding of shape (batch, width), appended as an extra token.
        """
        # Patchify: (batch, width, grid, grid) -> (batch, grid**2, width).
        x = self.conv1(imgs)
        x = x.reshape(x.shape[0], x.shape[1], -1)
        x = x.permute(0, 2, 1)

        # Prepend the class token; optionally append the conditioning token.
        tokens = [self.class_embedding.tile(x.shape[0], 1, 1), x]
        pos_embed = [self.positional_embedding]

        if c is not None:
            pos_embed += [self.c_pos_embedding]
            tokens += [c.unsqueeze(1)]

        x = torch.cat(tokens, dim=1)
        pos_embed = torch.cat(pos_embed, dim=0).unsqueeze(0)

        x = x + pos_embed
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # (batch, seq, width) -> (seq, batch, width)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # (seq, batch, width) -> (batch, seq, width)

        # Project the class-token representation to the output dimension.
        x = self.ln_post(x[:, 0, :])
        x = self.proj(x)

        return x
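
# Minimal usage sketch (illustrative only): the hyper-parameters and tensor
# shapes below are assumptions for demonstration, not values defined by this
# module. It runs the encoder once without and once with a conditioning
# embedding; both calls return a (batch, output_dim) tensor.
if __name__ == "__main__":
    model = ConditionalViT(
        input_resolution=224,
        patch_size=32,
        width=768,
        layers=12,
        heads=12,
        output_dim=512,
    )
    imgs = torch.randn(2, 3, 224, 224)  # dummy batch of two RGB images
    c = torch.randn(2, 768)  # dummy text embeddings, one per image, of size `width`
    out_unconditional = model(imgs)  # shape (2, 512)
    out_conditional = model(imgs, c)  # shape (2, 512)
    print(out_unconditional.shape, out_conditional.shape)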