gokaygokay
committed on
Upload vision_encoder.py with huggingface_hub
Browse files- vision_encoder.py +138 -0
vision_encoder.py
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from torch import nn
|
3 |
+
from PIL import Image
|
4 |
+
from einops import rearrange
|
5 |
+
from torchvision.transforms.v2 import (
|
6 |
+
Compose,
|
7 |
+
Resize,
|
8 |
+
InterpolationMode,
|
9 |
+
ToImage,
|
10 |
+
ToDtype,
|
11 |
+
Normalize,
|
12 |
+
)
|
13 |
+
import timm
|
14 |
+
|
15 |
+
|
16 |
+
class VisualHolder(nn.Module):
    """Pass-through wrapper that stores a module under the name ``visual``.

    When ``model`` is an ``nn.Module`` it is registered as the ``visual``
    submodule, so its parameters are exposed under the ``visual.`` prefix in
    ``state_dict()``.
    """

    def __init__(self, model):
        super().__init__()
        # The attribute name is part of every parameter key; do not rename it.
        self.visual = model

    def forward(self, x):
        """Return the wrapped module's output for ``x`` unchanged."""
        wrapped = self.visual
        return wrapped(x)
|
23 |
+
|
24 |
+
|
25 |
+
class ModelHolder(nn.Module):
    """Pass-through wrapper that stores a module under the name ``model``.

    When ``model`` is an ``nn.Module`` it is registered as the ``model``
    submodule, so its parameters are exposed under the ``model.`` prefix in
    ``state_dict()``.
    """

    def __init__(self, model):
        super().__init__()
        # The attribute name is part of every parameter key; do not rename it.
        self.model = model

    def forward(self, x):
        """Return the wrapped module's output for ``x`` unchanged."""
        wrapped = self.model
        return wrapped(x)
|
32 |
+
|
33 |
+
|
34 |
+
class LinearPatchEmbedding(nn.Module):
    """Linear layer that reuses the weights of a patch-embedding convolution.

    The convolution kernel is flattened to ``(out_channels, -1)`` and used as
    the weight of an ``nn.Linear``, so the layer can be applied to inputs whose
    last dimension holds one already-flattened patch.

    The layer sizes are derived from the kernel itself instead of being fixed
    to ``588 -> 1152``; a kernel of shape ``(1152, 3, 14, 14)`` still yields
    exactly ``nn.Linear(588, 1152)``.

    Args:
        conv: Module exposing ``weight`` and ``bias`` attributes; presumably
            an ``nn.Conv2d`` whose stride equals its kernel size (verify
            against the caller). ``bias`` may be ``None``.
    """

    def __init__(self, conv):
        super().__init__()
        kernel = conv.weight.data
        out_features = kernel.shape[0]
        # reshape (not view) so a non-contiguous kernel, e.g. channels_last,
        # is flattened instead of raising.
        flat_kernel = kernel.reshape(out_features, -1)

        self.linear = nn.Linear(flat_kernel.shape[1], out_features)
        self.linear.weight.data = flat_kernel
        if conv.bias is not None:
            self.linear.bias.data = conv.bias.data
        else:
            # A bias-free conv must not inherit nn.Linear's random bias.
            # Zeroing it keeps the parameter (and the state_dict layout)
            # while matching the convolution's output.
            self.linear.bias.data = kernel.new_zeros(out_features)

    def forward(self, x):
        """Apply the linear projection over the last dimension of ``x``."""
        return self.linear(x)
|
44 |
+
|
45 |
+
|
46 |
+
class MLP(nn.Module):
    """Two-layer perceptron: ``fc1`` -> activation -> ``fc2``.

    Args:
        in_features: Size of the last input dimension.
        hidden_features: Width of the hidden layer; a falsy value (``None``
            or ``0``) falls back to ``in_features``.
        out_features: Size of the last output dimension; a falsy value falls
            back to ``in_features``.
        act_layer: Callable invoked with no arguments to build the
            activation module (defaults to ``nn.GELU``).
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: int = None,
        out_features: int = None,
        act_layer: nn.Module = nn.GELU,
    ) -> None:
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        # Registration order (fc1, act, fc2) is kept so state_dict keys and
        # random-number consumption stay the same.
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)

        # Both weight matrices are re-initialised with Kaiming-normal
        # (fan-in, "relu" gain); biases keep nn.Linear's default init.
        for layer in (self.fc1, self.fc2):
            nn.init.kaiming_normal_(
                layer.weight, mode="fan_in", nonlinearity="relu"
            )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``fc2(act(fc1(x)))``."""
        return self.fc2(self.act(self.fc1(x)))
|
73 |
+
|
74 |
+
|
75 |
+
class VisionProjection(nn.Module):
    """Project image embeddings to the model width with a two-layer MLP.

    The hidden layer is four times as wide as the output. The sizes that used
    to be fixed inside the constructor are now keyword parameters whose
    defaults reproduce the original ``1152 -> 8192 -> 2048`` layout, so
    ``VisionProjection()`` builds the same module as before.

    Args:
        image_embedding_dim: Size of the last dimension of the input.
        model_dim: Size of the last dimension of the output.
    """

    def __init__(self, image_embedding_dim: int = 1152, model_dim: int = 2048):
        super().__init__()

        hidden_dim = model_dim * 4

        self.mlp = MLP(image_embedding_dim, hidden_dim, model_dim)

    @property
    def device(self):
        """Device on which the first MLP weight currently lives."""
        return self.mlp.fc1.weight.device

    def forward(self, x):
        """Return the MLP projection of ``x``."""
        return self.mlp(x)
|
91 |
+
|
92 |
+
|
93 |
+
class VisionEncoder(nn.Module):
    """Encode images with a timm ViT and project the result with an MLP.

    Construction:
      * creates ``vit_so400m_patch14_siglip_384`` through ``timm``, wrapped so
        it is reachable as ``self.encoder.model.visual``;
      * replaces its ``patch_embed`` with a ``LinearPatchEmbedding`` built
        from the original ``patch_embed.proj``, so the backbone is fed
        pre-flattened patches;
      * replaces ``attn_pool`` with ``nn.Identity``;
      * adds a ``VisionProjection`` applied to the encoder output.

    The inference logic lives in ``forward`` rather than in an overridden
    ``__call__``: ``encoder(images)`` behaves as before, while
    ``nn.Module.__call__`` (and therefore module hooks) keeps working and
    ``encoder.forward(images)`` is no longer unimplemented.
    """

    def __init__(self) -> None:
        super().__init__()

        self.encoder = ModelHolder(
            VisualHolder(timm.create_model("vit_so400m_patch14_siglip_384"))
        )
        visual = self.encoder.model.visual
        visual.patch_embed = LinearPatchEmbedding(visual.patch_embed.proj)
        visual.attn_pool = nn.Identity()

        self.projection = VisionProjection()

        # 378 = 27 * 14, so the resized image splits evenly into 14x14 patches.
        self.preprocess = Compose(
            [
                Resize(size=(378, 378), interpolation=InterpolationMode.BICUBIC),
                ToImage(),
                ToDtype(torch.float32, scale=True),
                Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
            ]
        )

    @property
    def device(self):
        """Device of the projection's first weight; inputs are moved here."""
        return self.projection.mlp.fc1.weight.device

    @property
    def dtype(self):
        """Dtype of the projection's first weight; inputs are cast to it."""
        return self.projection.mlp.fc1.weight.dtype

    def forward(self, images) -> torch.Tensor:
        """Encode one image or a sequence of images without tracking gradients.

        Args:
            images: A single image object exposing ``.convert("RGB")``
                (presumably a ``PIL.Image.Image``), or a list or tuple of
                such objects.

        Returns:
            The output of ``self.projection`` for the batch. With the default
            configuration the input to the encoder has shape
            ``(batch, 729, 588)``; the encoder's own output shape is decided
            by timm and is not visible here.
        """
        # Tuples are treated like lists; previously a tuple was wrapped as if
        # it were one image and then failed on ``.convert``.
        if not isinstance(images, (list, tuple)):
            images = [images]

        # NOTE(review): the whole pipeline is kept inside no_grad; the scraped
        # source carries no indentation, so confirm the intended extent.
        with torch.no_grad():
            x = torch.stack(
                [self.preprocess(image.convert("RGB")) for image in images]
            ).to(self.device, dtype=self.dtype)

            # Cut into 14x14 patches and flatten each to c * 14 * 14 values,
            # the layout the linear patch embedding expects.
            x = rearrange(x, "b c (h p1) (w p2) -> b (h w) (c p1 p2)", p1=14, p2=14)

            x = self.encoder(x)
            x = self.projection(x)

            return x
|