Jingkang Yang committed
Commit 09ed94e · 1 Parent(s): 75a2e8a

update: clip

app.py CHANGED
@@ -5,7 +5,7 @@ try:
  except:
      import os
      # os.system('cd /home/user/app/third_party/CLIP && pip install -Ue .')
-     # os.system('pip install git+https://github.com/openai/CLIP.git')
+     os.system('pip install git+https://github.com/Jun-CEN/CLIP.git')
      os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
      os.system('pip install git+https://github.com/facebookresearch/pytorch3d.git')
      os.system('pip install git+https://github.com/facebookresearch/segment-anything.git')
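
With this change the Space installs CLIP at runtime from the Jun-CEN fork instead of relying on the vendored copy under open_vocab_seg/modeling/clip_adapter (deleted below). A minimal sketch of what the pip-installed package provides, assuming the fork keeps the upstream OpenAI interface (clip.available_models(), clip.load(), clip.tokenize()); this is not part of the commit itself:

# Sketch only; model name and prompts are illustrative.
import torch
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14", device=device)

tokens = clip.tokenize(["a photo of a cat", "a photo of a dog"]).to(device)
with torch.no_grad():
    text_features = model.encode_text(tokens)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)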
open_vocab_seg/modeling/clip_adapter/bpe_simple_vocab_16e6.txt.gz DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
- size 1356917
open_vocab_seg/modeling/clip_adapter/clip.py DELETED
@@ -1,285 +0,0 @@
- import hashlib
- import os
- import urllib
- import warnings
- from collections import OrderedDict
- from typing import Union, List
-
- import torch
- from PIL import Image
- from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
- from tqdm import tqdm
-
- from .model import build_model
- from .simple_tokenizer import SimpleTokenizer as _Tokenizer
-
- try:
-     from torchvision.transforms import InterpolationMode
-
-     BICUBIC = InterpolationMode.BICUBIC
- except ImportError:
-     BICUBIC = Image.BICUBIC
-
-
- if torch.__version__.split(".") < ["1", "7", "1"]:
-     warnings.warn("PyTorch version 1.7.1 or higher is recommended")
-
-
- __all__ = ["available_models", "load", "tokenize"]
- _tokenizer = _Tokenizer()
-
- _MODELS = {
-     "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
-     "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
-     "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
-     "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
-     "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
-     "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
-     "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
-     "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
- }
-
-
- def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
-     os.makedirs(root, exist_ok=True)
-     filename = os.path.basename(url)
-
-     expected_sha256 = url.split("/")[-2]
-     download_target = os.path.join(root, filename)
-
-     if os.path.exists(download_target) and not os.path.isfile(download_target):
-         raise RuntimeError(f"{download_target} exists and is not a regular file")
-
-     if os.path.isfile(download_target):
-         if (
-             hashlib.sha256(open(download_target, "rb").read()).hexdigest()
-             == expected_sha256
-         ):
-             return download_target
-         else:
-             warnings.warn(
-                 f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file"
-             )
-
-     with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
-         with tqdm(
-             total=int(source.info().get("Content-Length")),
-             ncols=80,
-             unit="iB",
-             unit_scale=True,
-         ) as loop:
-             while True:
-                 buffer = source.read(8192)
-                 if not buffer:
-                     break
-
-                 output.write(buffer)
-                 loop.update(len(buffer))
-
-     if (
-         hashlib.sha256(open(download_target, "rb").read()).hexdigest()
-         != expected_sha256
-     ):
-         raise RuntimeError(
-             f"Model has been downloaded but the SHA256 checksum does not not match"
-         )
-
-     return download_target
-
-
- def _transform(n_px):
-     return Compose(
-         [
-             Resize(n_px, interpolation=BICUBIC),
-             CenterCrop(n_px),
-             lambda image: image.convert("RGB"),
-             ToTensor(),
-             Normalize(
-                 (0.48145466, 0.4578275, 0.40821073),
-                 (0.26862954, 0.26130258, 0.27577711),
-             ),
-         ]
-     )
-
-
- def available_models() -> List[str]:
-     """Returns the names of available CLIP models"""
-     return list(_MODELS.keys())
-
-
- def load(
-     name: str,
-     mask_prompt_depth: int = 0,
-     device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
-     jit=False,
- ):
-     """Load a CLIP model
-
-     Parameters
-     ----------
-     name : str
-         A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
-
-     device : Union[str, torch.device]
-         The device to put the loaded model
-
-     jit : bool
-         Whether to load the optimized JIT model or more hackable non-JIT model (default).
-
-     Returns
-     -------
-     model : torch.nn.Module
-         The CLIP model
-
-     preprocess : Callable[[PIL.Image], torch.Tensor]
-         A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
-     """
-     if name in _MODELS:
-         model_path = _download(_MODELS[name])
-     elif os.path.isfile(name):
-         model_path = name
-     else:
-         raise RuntimeError(
-             f"Model {name} not found; available models = {available_models()}"
-         )
-
-     try:
-         # loading JIT archive
-         model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
-         state_dict = None
-     except RuntimeError:
-         # loading saved state dict
-         if jit:
-             warnings.warn(
-                 f"File {model_path} is not a JIT archive. Loading as a state dict instead"
-             )
-             jit = False
-         state_dict = torch.load(model_path, map_location="cpu")
-         if 'state_dict' in state_dict:
-             new_state_dict = OrderedDict()
-             for k, v in state_dict['state_dict'].items():
-                 if k.startswith('module.'):
-                     name = k[7:] # remove `module.`
-                     new_state_dict[name] = v
-             state_dict = new_state_dict
-
-     if not jit:
-         model = build_model(state_dict or model.state_dict(), mask_prompt_depth).to(device)
-         if str(device) == "cpu":
-             model.float()
-         return model, _transform(model.visual.input_resolution)
-
-     # patch the device names
-     device_holder = torch.jit.trace(
-         lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]
-     )
-     device_node = [
-         n
-         for n in device_holder.graph.findAllNodes("prim::Constant")
-         if "Device" in repr(n)
-     ][-1]
-
-     def patch_device(module):
-         try:
-             graphs = [module.graph] if hasattr(module, "graph") else []
-         except RuntimeError:
-             graphs = []
-
-         if hasattr(module, "forward1"):
-             graphs.append(module.forward1.graph)
-
-         for graph in graphs:
-             for node in graph.findAllNodes("prim::Constant"):
-                 if "value" in node.attributeNames() and str(node["value"]).startswith(
-                     "cuda"
-                 ):
-                     node.copyAttributes(device_node)
-
-     model.apply(patch_device)
-     patch_device(model.encode_image)
-     patch_device(model.encode_text)
-
-     # patch dtype to float32 on CPU
-     if str(device) == "cpu":
-         float_holder = torch.jit.trace(
-             lambda: torch.ones([]).float(), example_inputs=[]
-         )
-         float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
-         float_node = float_input.node()
-
-         def patch_float(module):
-             try:
-                 graphs = [module.graph] if hasattr(module, "graph") else []
-             except RuntimeError:
-                 graphs = []
-
-             if hasattr(module, "forward1"):
-                 graphs.append(module.forward1.graph)
-
-             for graph in graphs:
-                 for node in graph.findAllNodes("aten::to"):
-                     inputs = list(node.inputs())
-                     for i in [
-                         1,
-                         2,
-                     ]: # dtype can be the second or third argument to aten::to()
-                         if inputs[i].node()["value"] == 5:
-                             inputs[i].node().copyAttributes(float_node)
-
-         model.apply(patch_float)
-         patch_float(model.encode_image)
-         patch_float(model.encode_text)
-
-         model.float()
-
-     return model, _transform(model.input_resolution.item())
-
-
- def tokenize(
-     texts: Union[str, List[str]],
-     context_length: int = 77,
-     truncate: bool = False,
-     return_length: bool = False,
- ) -> torch.LongTensor:
-     """
-     Returns the tokenized representation of given input string(s)
-
-     Parameters
-     ----------
-     texts : Union[str, List[str]]
-         An input string or a list of input strings to tokenize
-
-     context_length : int
-         The context length to use; all CLIP models use 77 as the context length
-
-     truncate: bool
-         Whether to truncate the text in case its encoding is longer than the context length
-
-     Returns
-     -------
-     A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
-     """
-     if isinstance(texts, str):
-         texts = [texts]
-
-     sot_token = _tokenizer.encoder["<|startoftext|>"]
-     eot_token = _tokenizer.encoder["<|endoftext|>"]
-     all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
-     result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
-     length = []
-     for i, tokens in enumerate(all_tokens):
-         if len(tokens) > context_length:
-             if truncate:
-                 tokens = tokens[:context_length]
-                 tokens[-1] = eot_token
-                 length.append(context_length)
-             else:
-                 raise RuntimeError(
-                     f"Input {texts[i]} is too long for context length {context_length}"
-                 )
-         else:
-             length.append(len(tokens))
-         result[i, : len(tokens)] = torch.tensor(tokens)
-     if return_length:
-         return result, length
-     return result
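
The deleted clip.py exposed available_models(), load() (with an OVSeg-specific mask_prompt_depth argument) and tokenize() (with truncate/return_length additions). A short sketch of how that module was called, based on its own docstrings and signatures, assuming the clip_adapter package imports resolved this way before the commit:

# Sketch only; the import path reflects the pre-commit layout.
from open_vocab_seg.modeling.clip_adapter import clip

model, preprocess = clip.load("ViT-B/16", mask_prompt_depth=3, device="cpu", jit=False)
tokens, lengths = clip.tokenize(["a photo of a tree"], truncate=True, return_length=True)
text_features = model.encode_text(tokens)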
open_vocab_seg/modeling/clip_adapter/model.py DELETED
@@ -1,613 +0,0 @@
- # Copyright (c) Facebook, Inc. and its affiliates.
- # Copyright (c) Meta Platforms, Inc. All Rights Reserved
- # Modified by Feng Liang from https://github.com/openai/CLIP/blob/main/clip/model.py
-
- from collections import OrderedDict
- from typing import Tuple, Union
-
- import numpy as np
- import torch
- import torch.nn.functional as F
- from torch import nn
-
-
- class Bottleneck(nn.Module):
-     expansion = 4
-
-     def __init__(self, inplanes, planes, stride=1):
-         super().__init__()
-
-         # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
-         self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
-         self.bn1 = nn.BatchNorm2d(planes)
-
-         self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
-         self.bn2 = nn.BatchNorm2d(planes)
-
-         self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
-
-         self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
-         self.bn3 = nn.BatchNorm2d(planes * self.expansion)
-
-         self.relu = nn.ReLU(inplace=True)
-         self.downsample = None
-         self.stride = stride
-
-         if stride > 1 or inplanes != planes * Bottleneck.expansion:
-             # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
-             self.downsample = nn.Sequential(
-                 OrderedDict(
-                     [
-                         ("-1", nn.AvgPool2d(stride)),
-                         (
-                             "0",
-                             nn.Conv2d(
-                                 inplanes,
-                                 planes * self.expansion,
-                                 1,
-                                 stride=1,
-                                 bias=False,
-                             ),
-                         ),
-                         ("1", nn.BatchNorm2d(planes * self.expansion)),
-                     ]
-                 )
-             )
-
-     def forward(self, x: torch.Tensor):
-         identity = x
-
-         out = self.relu(self.bn1(self.conv1(x)))
-         out = self.relu(self.bn2(self.conv2(out)))
-         out = self.avgpool(out)
-         out = self.bn3(self.conv3(out))
-
-         if self.downsample is not None:
-             identity = self.downsample(x)
-
-         out += identity
-         out = self.relu(out)
-         return out
-
-
- class AttentionPool2d(nn.Module):
-     def __init__(
-         self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None
-     ):
-         super().__init__()
-         self.positional_embedding = nn.Parameter(
-             torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5
-         )
-         self.k_proj = nn.Linear(embed_dim, embed_dim)
-         self.q_proj = nn.Linear(embed_dim, embed_dim)
-         self.v_proj = nn.Linear(embed_dim, embed_dim)
-         self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
-         self.num_heads = num_heads
-         self.grid_size = spacial_dim
-
-     def forward(self, x, mask=None, return_cls=True):
-         b, c, gh, gw = x.shape
-         # remove irrelated feature
-         if mask is not None:
-             mask = F.interpolate(mask[:, None, ...], size=(gh, gw)).squeeze(
-                 1
-             ) # [N,H,W] -> [N,grid,grid]
-             mask = (mask > 0.5).reshape(mask.shape[0], -1)
-             mask = torch.cat([mask, mask.new_ones(mask.shape[0], 1)], dim=1)
-             if x.size()[0] == 1:
-                 x = x.expand(mask.shape[0], c, gh, gw)
-
-         x = x.reshape(x.shape[0], c, gh * gw).permute(2, 0, 1) # NCHW -> (HW)NC
-
-         x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
-         positional_embedding = self.positional_embedding
-         if not (self.positional_embedding.shape[0] == x.shape[0]):
-             cls_pos = positional_embedding[0:1, :]
-             per_pos_embedding = (
-                 F.interpolate(
-                     positional_embedding[1:, :]
-                     .permute(1, 0)
-                     .view(1, -1, self.grid_size, self.grid_size),
-                     size=(gh, gw),
-                     mode="bicubic",
-                 )
-                 .reshape(-1, gh * gw)
-                 .permute(1, 0)
-             )
-             positional_embedding = torch.cat([cls_pos, per_pos_embedding])
-
-         x = x + positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
-         x, _ = F.multi_head_attention_forward(
-             query=x,
-             key=x,
-             value=x,
-             embed_dim_to_check=x.shape[-1],
-             num_heads=self.num_heads,
-             q_proj_weight=self.q_proj.weight,
-             k_proj_weight=self.k_proj.weight,
-             v_proj_weight=self.v_proj.weight,
-             in_proj_weight=None,
-             in_proj_bias=torch.cat(
-                 [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]
-             ),
-             bias_k=None,
-             bias_v=None,
-             add_zero_attn=False,
-             dropout_p=0,
-             out_proj_weight=self.c_proj.weight,
-             out_proj_bias=self.c_proj.bias,
-             use_separate_proj_weight=True,
-             training=self.training,
-             need_weights=False,
-             key_padding_mask=mask,
-         )
-
-         if return_cls:
-             return x[0]
-         else:
-             return x
-
-
- class ModifiedResNet(nn.Module):
-     """
-     A ResNet class that is similar to torchvision's but contains the following changes:
-     - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
-     - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
-     - The final pooling layer is a QKV attention instead of an average pool
-     """
-
-     def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
-         super().__init__()
-         self.output_dim = output_dim
-         self.input_resolution = input_resolution
-
-         # the 3-layer stem
-         self.conv1 = nn.Conv2d(
-             3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
-         )
-         self.bn1 = nn.BatchNorm2d(width // 2)
-         self.conv2 = nn.Conv2d(
-             width // 2, width // 2, kernel_size=3, padding=1, bias=False
-         )
-         self.bn2 = nn.BatchNorm2d(width // 2)
-         self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
-         self.bn3 = nn.BatchNorm2d(width)
-         self.avgpool = nn.AvgPool2d(2)
-         self.relu = nn.ReLU(inplace=True)
-
-         # residual layers
-         self._inplanes = width # this is a *mutable* variable used during construction
-         self.layer1 = self._make_layer(width, layers[0])
-         self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
-         self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
-         self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
-
-         embed_dim = width * 32 # the ResNet feature dimension
-         self.attnpool = AttentionPool2d(
-             input_resolution // 32, embed_dim, heads, output_dim
-         )
-
-     def _make_layer(self, planes, blocks, stride=1):
-         layers = [Bottleneck(self._inplanes, planes, stride)]
-
-         self._inplanes = planes * Bottleneck.expansion
-         for _ in range(1, blocks):
-             layers.append(Bottleneck(self._inplanes, planes))
-
-         return nn.Sequential(*layers)
-
-     def forward(self, x, mask: torch.Tensor = None, return_cls=True):
-         def stem(x):
-             for conv, bn in [
-                 (self.conv1, self.bn1),
-                 (self.conv2, self.bn2),
-                 (self.conv3, self.bn3),
-             ]:
-                 x = self.relu(bn(conv(x)))
-             x = self.avgpool(x)
-             return x
-
-         x = x.type(self.conv1.weight.dtype)
-         x = stem(x) # 1/4,1/4
-         x = self.layer1(x)
-         x = self.layer2(x) # 1/8,1/8
-         x = self.layer3(x) # 1/16,1/16
-         x = self.layer4(x) # 1/32,1/32
-         b, c, gh, gw = x.shape
-         x = self.attnpool(x, mask, return_cls)
-         if not return_cls:
-             return x[1:].permute(1, 0, 2).reshape(b, gh, gw, x.shape[-1]) # N,L,C
-         return x
-
-
- class LayerNorm(nn.LayerNorm):
-     """Subclass torch's LayerNorm to handle fp16."""
-
-     def forward(self, x: torch.Tensor):
-         orig_type = x.dtype
-         ret = super().forward(x.type(torch.float32))
-         return ret.type(orig_type)
-
-
- class QuickGELU(nn.Module):
-     def forward(self, x: torch.Tensor):
-         return x * torch.sigmoid(1.702 * x)
-
-
- class ResidualAttentionBlock(nn.Module):
-     def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
-         super().__init__()
-
-         self.attn = nn.MultiheadAttention(d_model, n_head)
-         self.ln_1 = LayerNorm(d_model)
-         self.mlp = nn.Sequential(
-             OrderedDict(
-                 [
-                     ("c_fc", nn.Linear(d_model, d_model * 4)),
-                     ("gelu", QuickGELU()),
-                     ("c_proj", nn.Linear(d_model * 4, d_model)),
-                 ]
-             )
-         )
-         self.ln_2 = LayerNorm(d_model)
-         self.attn_mask = attn_mask
-
-     def attention(self, x: torch.Tensor, **kwargs):
-         self.attn_mask = (
-             self.attn_mask.to(dtype=x.dtype, device=x.device)
-             if self.attn_mask is not None
-             else None
-         )
-         return self.attn(
-             x, x, x, need_weights=False, attn_mask=self.attn_mask, **kwargs
-         )[0]
-
-     def forward(self, x: torch.Tensor, **kwargs):
-         x = x + self.attention(self.ln_1(x), **kwargs)
-         x = x + self.mlp(self.ln_2(x))
-         return x
-
-
- class Transformer(nn.Module):
-     def __init__(
-         self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None
-     ):
-         super().__init__()
-         self.width = width
-         self.layers = layers
-         self.resblocks = nn.Sequential(
-             *[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]
-         )
-
-     def forward(self, x: torch.Tensor, **kwargs):
-         for block in self.resblocks:
-             x = block(x, **kwargs)
-         return x
-
-
- class VisionTransformer(nn.Module):
-     def __init__(
-         self,
-         input_resolution: int,
-         patch_size: int,
-         mask_prompt_depth: int,
-         width: int,
-         layers: int,
-         heads: int,
-         output_dim: int,
-     ):
-         super().__init__()
-         self.input_resolution = input_resolution
-         self.output_dim = output_dim
-         self.conv1 = nn.Conv2d(
-             in_channels=3,
-             out_channels=width,
-             kernel_size=patch_size,
-             stride=patch_size,
-             bias=False,
-         )
-
-         scale = width ** -0.5
-         self.class_embedding = nn.Parameter(scale * torch.randn(width))
-         self.positional_embedding = nn.Parameter(
-             scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)
-         )
-         self.grid_size = input_resolution // patch_size
-         self.ln_pre = LayerNorm(width)
-
-         self.transformer = Transformer(width, layers, heads)
-
-         self.ln_post = LayerNorm(width)
-         self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
-
-         self.mask_pool = nn.AvgPool2d(patch_size, stride=patch_size)
-         self.mask_prompt_depth = mask_prompt_depth
-         self.mask_embedding = nn.Parameter(torch.zeros(self.mask_prompt_depth, self.grid_size * self.grid_size, width))
-
-     def forward(self, x: torch.Tensor, m: torch.Tensor = None):
-         x = self.conv1(x) # shape = [*, width, grid, grid]
-         x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
-         x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
-         if m is not None:
-             m = self.mask_pool(m.to(torch.float).squeeze()).reshape(m.shape[0], -1).unsqueeze(-1)
-             m = torch.ceil(m)
-             if self.mask_embedding.shape[1] == 1:
-                 mask_embedding = self.mask_embedding.to(x.dtype).repeat(1, x.shape[1], 1)
-             else:
-                 mask_embedding = self.mask_embedding.to(x.dtype)
-             x = x * m + mask_embedding[0].unsqueeze(0) * (1 - m)
-
-         x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
-         x = x + self.positional_embedding.to(x.dtype)
-         x = self.ln_pre(x)
-
-         x = x.permute(1, 0, 2) # NLD -> LND
-         if m is not None:
-             for i, blk in enumerate(self.transformer.resblocks):
-                 d = i + 1
-                 x = blk(x)
-                 if d < self.mask_prompt_depth:
-                     masked_x = x[1:, :, :] * m.permute(1, 0, 2) + \
-                         mask_embedding[d].unsqueeze(0).permute(1, 0, 2) * (1 - m.permute(1, 0, 2))
-                     x = torch.cat([x[:1, :, :], masked_x], dim=0)
-         else:
-             x = self.transformer(x)
-         x = x.permute(1, 0, 2) # LND -> NLD
-
-         x = self.ln_post(x[:, 0, :])
-
-         if self.proj is not None:
-             x = x @ self.proj
-
-         return x
-
-
-
- class CLIP(nn.Module):
-     def __init__(
-         self,
-         embed_dim: int,
-         # vision
-         image_resolution: int,
-         vision_layers: Union[Tuple[int, int, int, int], int],
-         vision_width: int,
-         vision_patch_size: int,
-         mask_prompt_depth: int,
-         # text
-         context_length: int,
-         vocab_size: int,
-         transformer_width: int,
-         transformer_heads: int,
-         transformer_layers: int,
-     ):
-         super().__init__()
-
-         self.context_length = context_length
-
-         if isinstance(vision_layers, (tuple, list)):
-             vision_heads = vision_width * 32 // 64
-             self.visual = ModifiedResNet(
-                 layers=vision_layers,
-                 output_dim=embed_dim,
-                 heads=vision_heads,
-                 input_resolution=image_resolution,
-                 width=vision_width,
-             )
-         else:
-             vision_heads = vision_width // 64
-             self.visual = VisionTransformer(
-                 input_resolution=image_resolution,
-                 patch_size=vision_patch_size,
-                 mask_prompt_depth=mask_prompt_depth,
-                 width=vision_width,
-                 layers=vision_layers,
-                 heads=vision_heads,
-                 output_dim=embed_dim,
-             )
-
-         self.transformer = Transformer(
-             width=transformer_width,
-             layers=transformer_layers,
-             heads=transformer_heads,
-             attn_mask=self.build_attention_mask(),
-         )
-
-         self.vocab_size = vocab_size
-         self.token_embedding = nn.Embedding(vocab_size, transformer_width)
-         self.positional_embedding = nn.Parameter(
-             torch.empty(self.context_length, transformer_width)
-         )
-         self.ln_final = LayerNorm(transformer_width)
-
-         self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
-         self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
-
-         self.initialize_parameters()
-
-     def initialize_parameters(self):
-         nn.init.normal_(self.token_embedding.weight, std=0.02)
-         nn.init.normal_(self.positional_embedding, std=0.01)
-
-         if isinstance(self.visual, ModifiedResNet):
-             if self.visual.attnpool is not None:
-                 std = self.visual.attnpool.c_proj.in_features ** -0.5
-                 nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
-                 nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
-                 nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
-                 nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
-
-             for resnet_block in [
-                 self.visual.layer1,
-                 self.visual.layer2,
-                 self.visual.layer3,
-                 self.visual.layer4,
-             ]:
-                 for name, param in resnet_block.named_parameters():
-                     if name.endswith("bn3.weight"):
-                         nn.init.zeros_(param)
-
-         proj_std = (self.transformer.width ** -0.5) * (
-             (2 * self.transformer.layers) ** -0.5
-         )
-         attn_std = self.transformer.width ** -0.5
-         fc_std = (2 * self.transformer.width) ** -0.5
-         for block in self.transformer.resblocks:
-             nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
-             nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
-             nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
-             nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
-
-         if self.text_projection is not None:
-             nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
-
-     def build_attention_mask(self):
-         # lazily create causal attention mask, with full attention between the vision tokens
-         # pytorch uses additive attention mask; fill with -inf
-         mask = torch.empty(self.context_length, self.context_length)
-         mask.fill_(float("-inf"))
-         mask.triu_(1) # zero out the lower diagonal
-         return mask
-
-     @property
-     def dtype(self):
-         return self.visual.conv1.weight.dtype
-
-     def encode_image(self, image, **kwargs):
-         return self.visual(image.type(self.dtype), **kwargs)
-
-     def encode_text(self, text):
-         x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
-
-         x = x + self.positional_embedding.type(self.dtype)
-         x = x.permute(1, 0, 2) # NLD -> LND
-         x = self.transformer(x)
-         x = x.permute(1, 0, 2) # LND -> NLD
-         x = self.ln_final(x).type(self.dtype)
-
-         # x.shape = [batch_size, n_ctx, transformer.width]
-         # take features from the eot embedding (eot_token is the highest number in each sequence)
-         x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
-
-         return x
-
-     def forward(self, image, text):
-         image_features = self.encode_image(image)
-         text_features = self.encode_text(text)
-
-         # normalized features
-         image_features = image_features / image_features.norm(dim=-1, keepdim=True)
-         text_features = text_features / text_features.norm(dim=-1, keepdim=True)
-
-         # cosine similarity as logits
-         logit_scale = self.logit_scale.exp()
-         logits_per_image = logit_scale * image_features @ text_features.t()
-         logits_per_text = logit_scale * text_features @ image_features.t()
-
-         # shape = [global_batch_size, global_batch_size]
-         return logits_per_image, logits_per_text
-
-
- def convert_weights(model: nn.Module):
-     """Convert applicable model parameters to fp16"""
-
-     def _convert_weights_to_fp16(l):
-         if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
-             l.weight.data = l.weight.data.half()
-             if l.bias is not None:
-                 l.bias.data = l.bias.data.half()
-
-         if isinstance(l, nn.MultiheadAttention):
-             for attr in [
-                 *[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]],
-                 "in_proj_bias",
-                 "bias_k",
-                 "bias_v",
-             ]:
-                 tensor = getattr(l, attr)
-                 if tensor is not None:
-                     tensor.data = tensor.data.half()
-
-         for name in ["text_projection", "proj"]:
-             if hasattr(l, name):
-                 attr = getattr(l, name)
-                 if attr is not None:
-                     attr.data = attr.data.half()
-
-     model.apply(_convert_weights_to_fp16)
-
-
- def build_model(state_dict: dict, mask_prompt_depth: int = 0):
-     vit = "visual.proj" in state_dict
-
-     if vit:
-         vision_width = state_dict["visual.conv1.weight"].shape[0]
-         vision_layers = len(
-             [
-                 k
-                 for k in state_dict.keys()
-                 if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")
-             ]
-         )
-         vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
-         grid_size = round(
-             (state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5
-         )
-         image_resolution = vision_patch_size * grid_size
-     else:
-         assert mask_prompt_depth == 0, 'ResNets do not support mask prompt tuning'
-         counts: list = [
-             len(
-                 set(
-                     k.split(".")[2]
-                     for k in state_dict
-                     if k.startswith(f"visual.layer{b}")
-                 )
-             )
-             for b in [1, 2, 3, 4]
-         ]
-         vision_layers = tuple(counts)
-         vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
-         output_width = round(
-             (state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5
-         )
-         vision_patch_size = None
-         assert (
-             output_width ** 2 + 1
-             == state_dict["visual.attnpool.positional_embedding"].shape[0]
-         )
-         image_resolution = output_width * 32
-
-     embed_dim = state_dict["text_projection"].shape[1]
-     context_length = state_dict["positional_embedding"].shape[0]
-     vocab_size = state_dict["token_embedding.weight"].shape[0]
-     transformer_width = state_dict["ln_final.weight"].shape[0]
-     transformer_heads = transformer_width // 64
-     transformer_layers = len(
-         set(
-             k.split(".")[2]
-             for k in state_dict
-             if k.startswith(f"transformer.resblocks")
-         )
-     )
-
-     model = CLIP(
-         embed_dim,
-         image_resolution,
-         vision_layers,
-         vision_width,
-         vision_patch_size,
-         mask_prompt_depth,
-         context_length,
-         vocab_size,
-         transformer_width,
-         transformer_heads,
-         transformer_layers,
-     )
-
-     for key in ["input_resolution", "context_length", "vocab_size"]:
-         if key in state_dict:
-             del state_dict[key]
-
-     convert_weights(model)
-     model.load_state_dict(state_dict, strict=False)
-     return model.eval()
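
The deleted model.py ended with build_model(state_dict, mask_prompt_depth), which infers the vision and text hyperparameters from the checkpoint keys, instantiates CLIP, converts weights to fp16 and loads the state dict non-strictly. A sketch of how it was typically driven (this mirrors what the deleted clip.load() did with a downloaded JIT archive; the local checkpoint path is illustrative):

# Sketch only; "ViT-B-16.pt" stands in for a downloaded OpenAI CLIP checkpoint.
import torch
from open_vocab_seg.modeling.clip_adapter.model import build_model  # pre-commit path

jit_archive = torch.jit.load("ViT-B-16.pt", map_location="cpu").eval()
model = build_model(jit_archive.state_dict(), mask_prompt_depth=3).float()  # ViT only; ResNets assert depth == 0
image_features = model.encode_image(torch.randn(1, 3, 224, 224))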
open_vocab_seg/modeling/clip_adapter/simple_tokenizer.py DELETED
@@ -1,150 +0,0 @@
- import gzip
- import html
- import os
- from functools import lru_cache
-
- import ftfy
- import regex as re
-
-
- @lru_cache()
- def default_bpe():
-     return os.path.join(
-         os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz"
-     )
-
-
- @lru_cache()
- def bytes_to_unicode():
-     """
-     Returns list of utf-8 byte and a corresponding list of unicode strings.
-     The reversible bpe codes work on unicode strings.
-     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-     This is a signficant percentage of your normal, say, 32K bpe vocab.
-     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-     And avoids mapping to whitespace/control characters the bpe code barfs on.
-     """
-     bs = (
-         list(range(ord("!"), ord("~") + 1))
-         + list(range(ord("¡"), ord("¬") + 1))
-         + list(range(ord("®"), ord("ÿ") + 1))
-     )
-     cs = bs[:]
-     n = 0
-     for b in range(2 ** 8):
-         if b not in bs:
-             bs.append(b)
-             cs.append(2 ** 8 + n)
-             n += 1
-     cs = [chr(n) for n in cs]
-     return dict(zip(bs, cs))
-
-
- def get_pairs(word):
-     """Return set of symbol pairs in a word.
-     Word is represented as tuple of symbols (symbols being variable-length strings).
-     """
-     pairs = set()
-     prev_char = word[0]
-     for char in word[1:]:
-         pairs.add((prev_char, char))
-         prev_char = char
-     return pairs
-
-
- def basic_clean(text):
-     text = ftfy.fix_text(text)
-     text = html.unescape(html.unescape(text))
-     return text.strip()
-
-
- def whitespace_clean(text):
-     text = re.sub(r"\s+", " ", text)
-     text = text.strip()
-     return text
-
-
- class SimpleTokenizer(object):
-     def __init__(self, bpe_path: str = default_bpe()):
-         self.byte_encoder = bytes_to_unicode()
-         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-         merges = gzip.open(bpe_path).read().decode("utf-8").split("\n")
-         merges = merges[1 : 49152 - 256 - 2 + 1]
-         merges = [tuple(merge.split()) for merge in merges]
-         vocab = list(bytes_to_unicode().values())
-         vocab = vocab + [v + "</w>" for v in vocab]
-         for merge in merges:
-             vocab.append("".join(merge))
-         vocab.extend(["<|startoftext|>", "<|endoftext|>"])
-         self.encoder = dict(zip(vocab, range(len(vocab))))
-         self.decoder = {v: k for k, v in self.encoder.items()}
-         self.bpe_ranks = dict(zip(merges, range(len(merges))))
-         self.cache = {
-             "<|startoftext|>": "<|startoftext|>",
-             "<|endoftext|>": "<|endoftext|>",
-         }
-         self.pat = re.compile(
-             r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
-             re.IGNORECASE,
-         )
-
-     def bpe(self, token):
-         if token in self.cache:
-             return self.cache[token]
-         word = tuple(token[:-1]) + (token[-1] + "</w>",)
-         pairs = get_pairs(word)
-
-         if not pairs:
-             return token + "</w>"
-
-         while True:
-             bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-             if bigram not in self.bpe_ranks:
-                 break
-             first, second = bigram
-             new_word = []
-             i = 0
-             while i < len(word):
-                 try:
-                     j = word.index(first, i)
-                     new_word.extend(word[i:j])
-                     i = j
-                 except:
-                     new_word.extend(word[i:])
-                     break
-
-                 if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                     new_word.append(first + second)
-                     i += 2
-                 else:
-                     new_word.append(word[i])
-                     i += 1
-             new_word = tuple(new_word)
-             word = new_word
-             if len(word) == 1:
-                 break
-             else:
-                 pairs = get_pairs(word)
-         word = " ".join(word)
-         self.cache[token] = word
-         return word
-
-     def encode(self, text):
-         bpe_tokens = []
-         text = whitespace_clean(basic_clean(text)).lower()
-         for token in re.findall(self.pat, text):
-             token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
-             bpe_tokens.extend(
-                 self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
-             )
-         return bpe_tokens
-
-     def decode(self, tokens):
-         text = "".join([self.decoder[token] for token in tokens])
-         text = (
-             bytearray([self.byte_decoder[c] for c in text])
-             .decode("utf-8", errors="replace")
-             .replace("</w>", " ")
-         )
-         return text
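
The deleted simple_tokenizer.py implemented the byte-level BPE tokenizer (49,152-entry vocabulary read from the bpe_simple_vocab_16e6.txt.gz pointer also deleted above, with a </w> end-of-word marker). A sketch of its encode/decode round trip, assuming the pre-commit import path; clip.py wrapped this with the <|startoftext|> / <|endoftext|> tokens and fixed-length padding to 77:

# Sketch only; not part of the commit.
from open_vocab_seg.modeling.clip_adapter.simple_tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer()            # reads bpe_simple_vocab_16e6.txt.gz next to the module
ids = tokenizer.encode("A photo of a cat")
print(ids)                               # byte-level BPE ids
print(tokenizer.decode(ids))             # lower-cased text, </w> turned back into spaces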