runninglsy committed
Commit f5cfd8b
1 Parent(s): a2db297
clean code

Files changed:
- base_visual_tokenizer.py +0 -138
- clip_visual_tokenizer.py +0 -120
base_visual_tokenizer.py
DELETED
@@ -1,138 +0,0 @@
-from typing import Union, Optional

import PIL.Image
import torch
from torch.nn.functional import softmax, gumbel_softmax
from transformers import PretrainedConfig, PreTrainedModel, AutoImageProcessor, AutoModel, AutoConfig


class BaseVisualTokenizerConfig(PretrainedConfig):
    def __init__(self,
                 vocab_size=16384,
                 tokenize_function="softmax",
                 tau=1.0,
                 depths=None,
                 use_indicators=False,
                 drop_cls_token=False,
                 backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
                 hidden_stride: int = 1,
                 **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.tokenize_function = tokenize_function
        self.tau = tau
        if isinstance(depths, str):
            depths = [int(x) for x in depths.split('|')]
        self.depths = depths
        self.backbone_kwargs = {}
        self.use_indicators = use_indicators
        self.drop_cls_token = drop_cls_token
        if backbone_config is not None:
            assert isinstance(backbone_config, (PretrainedConfig, dict)), \
                f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
            if not isinstance(backbone_config, PretrainedConfig):
                model_type = backbone_config['model_type']
                backbone_config.pop('model_type')
                backbone_config = AutoConfig.for_model(model_type, **backbone_config)
        self.backbone_config = backbone_config
        self.hidden_stride = hidden_stride


class BaseVisualTokenizer(PreTrainedModel):
    base_model_prefix = "backbone"
    main_input_name = None
    _image_processor_class = None
    _image_processor_kwargs = {}
    _backbone_class = None
    _backbone_name_or_path = None

    def __init__(self, config: BaseVisualTokenizerConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        if kwargs.get('train_from_scratch'):
            self.image_processor = self._image_processor_class.from_pretrained(self._backbone_name_or_path,
                                                                               **self._image_processor_kwargs)
            self.backbone = self._backbone_class.from_pretrained(self._backbone_name_or_path,
                                                                 **self.config.backbone_kwargs)
            self.config.backbone_config = self.backbone.config
        else:
            self.image_processor = AutoImageProcessor.from_pretrained(kwargs['image_processor_name_or_path'])
            self.backbone = AutoModel.from_config(self.config.backbone_config)
        self.head = None

        assert all((self.image_processor.do_resize,
                    not getattr(self.image_processor, 'do_center_crop', False),
                    self.image_processor.do_rescale,
                    self.image_processor.do_normalize
                    )), f"image_processor `{self.image_processor}` is not supported currently"

    def get_backbone(self):
        return self.backbone

    def get_monitor_tensors(self):
        raise NotImplementedError

    def get_image_processor(self):
        return self.image_processor

    def get_head(self):
        return self.head

    def get_image_size(self):
        raise NotImplementedError

    def preprocess_image(self, image: PIL.Image.Image, convert_to_rgb=True):
        if convert_to_rgb and image.mode != 'RGB':
            image = image.convert('RGB')

        # first resize and preprocess
        sides = self.get_image_size()
        if sides[0] != sides[1]:
            raise ValueError('get_image_size() returns non-square size')
        side = sides[0]

        width, height = image.size
        if width == height:
            new_width = new_height = side
        elif width > height:
            new_width = side
            new_height = int(height / width * new_width)
        else:
            new_height = side
            new_width = int(width / height * new_height)
        new_size = dict(height=new_height, width=new_width)
        pixel_values = self.image_processor.preprocess(image, size=new_size, return_tensors='pt')['pixel_values']

        # then pad to square
        square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
        new_height, new_width = pixel_values.shape[2:]
        if new_height == new_width:
            square_values[:, :, :, :] = pixel_values
        elif new_height > new_width:
            from_index = (side - new_width) // 2
            square_values[:, :, :, from_index:from_index + new_width] = pixel_values
        else:
            from_index = (side - new_height) // 2
            square_values[:, :, from_index:from_index + new_height, :] = pixel_values

        return square_values

    def get_layer_norm(self):
        return self.layer_norm

    def tokenize(self, logits):
        def st_argmax(y_soft, dim):  # straight-through softmax
            index = y_soft.max(dim, keepdim=True)[1]
            y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
            ret = y_hard - y_soft.detach() + y_soft
            return ret

        if self.config.tokenize_function == 'softmax':
            tokens = softmax(logits, dim=-1)
        elif self.config.tokenize_function == 'gumbel_argmax':
            tokens = gumbel_softmax(logits, tau=self.config.tau, hard=True)
        elif self.config.tokenize_function == 'st_argmax':
            tokens = st_argmax(logits, dim=-1)
        else:
            raise ValueError(
                f'Invalid `max_type`, expected softmax or gumbel_argmax or st_argmax, but got {self.config.tokenize_function}')
        return tokens
clip_visual_tokenizer.py
DELETED
@@ -1,120 +0,0 @@
-from datetime import datetime
from typing import Dict

import deepspeed
import torch
from torch import Tensor
from transformers import AutoConfig, AutoModel
from transformers import CLIPVisionModel, CLIPImageProcessor
from transformers.integrations import is_deepspeed_zero3_enabled

from .utils import BEGIN_LINE, END_LINE, rank0_print
from .base_visual_tokenizer import BaseVisualTokenizerConfig, BaseVisualTokenizer

MODEL_TYPE = "clip_visual_tokenizer"


class ClipVisualTokenizerConfig(BaseVisualTokenizerConfig):
    model_type = MODEL_TYPE

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        if self.depths:
            assert len(self.depths) == 1
            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]


class ClipVisualTokenizer(BaseVisualTokenizer):
    config_class = ClipVisualTokenizerConfig
    supports_gradient_checkpointing = True
    _no_split_modules = ["CLIPEncoderLayer"]
    _image_processor_class = CLIPImageProcessor
    _image_processor_kwargs = dict(do_center_crop=False)
    _backbone_class = CLIPVisionModel
    _backbone_name_or_path = "openai/clip-vit-large-patch14-336"

    def __init__(self, config: ClipVisualTokenizerConfig = None, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        head_dim = self.config.vocab_size
        if self.config.use_indicators:
            head_dim -= 2  # reserved for two image indicator tokens
        self.head = torch.nn.Sequential(
            torch.nn.Linear(self.backbone.config.hidden_size, head_dim, bias=False),
            torch.nn.LayerNorm(head_dim)
        )

    def re_init_layers(self, re_init_layer_begin):
        layer_dict = self.get_re_init_layer_dict(re_init_layer_begin)
        for name, layer in layer_dict.items():
            rank0_print(BEGIN_LINE)
            rank0_print(f'[{datetime.now()}] Before layer re-initialization of {name}: ')
            for k, v in layer.named_parameters():
                with deepspeed.zero.GatheredParameters([v]):
                    rank0_print(f'{k}: {v}')
            with deepspeed.zero.GatheredParameters(list(layer.parameters(recurse=True)), modifier_rank=0):
                if not is_deepspeed_zero3_enabled() or deepspeed.comm.get_rank() == 0:
                    layer.apply(self.backbone._init_weights)
            rank0_print(f'[{datetime.now()}] After layer re-initialization of {name}:')
            for k, v in layer.named_parameters():
                with deepspeed.zero.GatheredParameters([v]):
                    rank0_print(f'{k}: {v}')
            rank0_print(END_LINE)

    def get_re_init_layer_dict(self, re_init_layer_begin: int) -> Dict[str, torch.nn.Module]:
        assert re_init_layer_begin >= 0, "negative index is prohibited"
        layer_dict = dict()
        for i in range(re_init_layer_begin, self.backbone.config.num_hidden_layers):
            layer_dict[f'backbone.vision_model.encoder.layers.{i}'] = self.backbone.vision_model.encoder.layers[i]
        return layer_dict

    def get_monitor_tensors(self):
        return dict(
            backbone_bottom=self.backbone.vision_model.encoder.layers[0].self_attn.k_proj.weight,
            backbone_top=self.backbone.vision_model.encoder.layers[-1].self_attn.out_proj.weight,
            head=self.head[0].weight
        )

    def get_image_size(self):
        height = self.image_processor.crop_size["height"]
        width = self.image_processor.crop_size["width"]
        return height, width

    def forward(self, pixel_values) -> Tensor:  # [BatchSize, ImageShape] -> [BatchSize, #Token, VocabSize]
        output = self.backbone(
            pixel_values, output_hidden_states=True, return_dict=True)
        features = output.last_hidden_state
        if self.config.drop_cls_token:
            features = features[:, 1:, :]
        logits = self.head(features)
        tokens = self.tokenize(logits)
        if self.config.use_indicators:
            # tokens' shape is [BatchSize, #Token, VocabSize-2], so padding with [BatchSize, #Token, 2], after
            # which, tokens' shape should become [BatchSize, #Token, VocabSize]
            batch_size, token_len, _ = tokens.shape
            padding_tensor = torch.zeros(size=(batch_size, token_len, 2),
                                         dtype=tokens.dtype,
                                         device=tokens.device,
                                         layout=tokens.layout,
                                         requires_grad=False)
            tokens = torch.cat((tokens, padding_tensor), dim=2)

            # adding indicator tokens, after which tokens' shape should become [BatchSize, 1+#Token+1, VocabSize]
            begin_indicator = torch.zeros(size=(batch_size, 1),
                                          dtype=torch.long,
                                          device=tokens.device,
                                          requires_grad=False) + self.config.vocab_size - 2
            begin_indicator_token = torch.nn.functional.one_hot(begin_indicator,
                                                                num_classes=self.config.vocab_size).to(
                dtype=tokens.dtype)
            end_indicator = torch.zeros(size=(batch_size, 1),
                                        dtype=torch.long,
                                        device=tokens.device,
                                        requires_grad=False) + self.config.vocab_size - 1
            end_indicator_token = torch.nn.functional.one_hot(end_indicator,
                                                              num_classes=self.config.vocab_size).to(dtype=tokens.dtype)
            tokens = torch.cat((begin_indicator_token, tokens, end_indicator_token), dim=1)
        return tokens


AutoConfig.register(MODEL_TYPE, ClipVisualTokenizerConfig)
AutoModel.register(ClipVisualTokenizerConfig, ClipVisualTokenizer)
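
The `use_indicators` branch of `forward` in the deleted file above reserves the last two vocabulary entries for begin/end image indicators and pads the head output accordingly. A small standalone sketch of that layout (toy sizes, independent of the deleted class):

import torch

vocab_size, batch_size, token_len = 8, 2, 3                  # toy sizes for illustration
tokens = torch.rand(batch_size, token_len, vocab_size - 2)   # head output spans VocabSize-2 entries
tokens = torch.cat((tokens, torch.zeros(batch_size, token_len, 2)), dim=2)  # pad to full VocabSize

begin = torch.full((batch_size, 1), vocab_size - 2)          # second-to-last slot: begin indicator
end = torch.full((batch_size, 1), vocab_size - 1)            # last slot: end indicator
begin_tok = torch.nn.functional.one_hot(begin, num_classes=vocab_size).float()
end_tok = torch.nn.functional.one_hot(end, num_classes=vocab_size).float()

tokens = torch.cat((begin_tok, tokens, end_tok), dim=1)
print(tokens.shape)  # torch.Size([2, 5, 8]) == [BatchSize, 1 + #Token + 1, VocabSize]

The padded head output occupies indices 0 through VocabSize-3, while the two indicator one-hots occupy the reserved slots VocabSize-2 and VocabSize-1.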