runninglsy committed
Commit: f5cfd8b
Parent: a2db297

clean code

Files changed (2)
  1. base_visual_tokenizer.py +0 -138
  2. clip_visual_tokenizer.py +0 -120
base_visual_tokenizer.py DELETED
@@ -1,138 +0,0 @@
- from typing import Union, Optional
-
- import PIL.Image
- import torch
- from torch.nn.functional import softmax, gumbel_softmax
- from transformers import PretrainedConfig, PreTrainedModel, AutoImageProcessor, AutoModel, AutoConfig
-
-
- class BaseVisualTokenizerConfig(PretrainedConfig):
-     def __init__(self,
-                  vocab_size=16384,
-                  tokenize_function="softmax",
-                  tau=1.0,
-                  depths=None,
-                  use_indicators=False,
-                  drop_cls_token=False,
-                  backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
-                  hidden_stride: int = 1,
-                  **kwargs):
-         super().__init__(**kwargs)
-         self.vocab_size = vocab_size
-         self.tokenize_function = tokenize_function
-         self.tau = tau
-         if isinstance(depths, str):
-             depths = [int(x) for x in depths.split('|')]
-         self.depths = depths
-         self.backbone_kwargs = {}
-         self.use_indicators = use_indicators
-         self.drop_cls_token = drop_cls_token
-         if backbone_config is not None:
-             assert isinstance(backbone_config, (PretrainedConfig, dict)), \
-                 f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
-             if not isinstance(backbone_config, PretrainedConfig):
-                 model_type = backbone_config['model_type']
-                 backbone_config.pop('model_type')
-                 backbone_config = AutoConfig.for_model(model_type, **backbone_config)
-             self.backbone_config = backbone_config
-         self.hidden_stride = hidden_stride
-
-
- class BaseVisualTokenizer(PreTrainedModel):
-     base_model_prefix = "backbone"
-     main_input_name = None
-     _image_processor_class = None
-     _image_processor_kwargs = {}
-     _backbone_class = None
-     _backbone_name_or_path = None
-
-     def __init__(self, config: BaseVisualTokenizerConfig, *inputs, **kwargs):
-         super().__init__(config, *inputs, **kwargs)
-         if kwargs.get('train_from_scratch'):
-             self.image_processor = self._image_processor_class.from_pretrained(self._backbone_name_or_path,
-                                                                                **self._image_processor_kwargs)
-             self.backbone = self._backbone_class.from_pretrained(self._backbone_name_or_path,
-                                                                  **self.config.backbone_kwargs)
-             self.config.backbone_config = self.backbone.config
-         else:
-             self.image_processor = AutoImageProcessor.from_pretrained(kwargs['image_processor_name_or_path'])
-             self.backbone = AutoModel.from_config(self.config.backbone_config)
-         self.head = None
-
-         assert all((self.image_processor.do_resize,
-                     not getattr(self.image_processor, 'do_center_crop', False),
-                     self.image_processor.do_rescale,
-                     self.image_processor.do_normalize
-                     )), f"image_processor `{self.image_processor}` is not supported currently"
-
-     def get_backbone(self):
-         return self.backbone
-
-     def get_monitor_tensors(self):
-         raise NotImplementedError
-
-     def get_image_processor(self):
-         return self.image_processor
-
-     def get_head(self):
-         return self.head
-
-     def get_image_size(self):
-         raise NotImplementedError
-
-     def preprocess_image(self, image: PIL.Image.Image, convert_to_rgb=True):
-         if convert_to_rgb and image.mode != 'RGB':
-             image = image.convert('RGB')
-
-         # first resize and preprocess
-         sides = self.get_image_size()
-         if sides[0] != sides[1]:
-             raise ValueError('get_image_size() returns non-square size')
-         side = sides[0]
-
-         width, height = image.size
-         if width == height:
-             new_width = new_height = side
-         elif width > height:
-             new_width = side
-             new_height = int(height / width * new_width)
-         else:
-             new_height = side
-             new_width = int(width / height * new_height)
-         new_size = dict(height=new_height, width=new_width)
-         pixel_values = self.image_processor.preprocess(image, size=new_size, return_tensors='pt')['pixel_values']
-
-         # then pad to square
-         square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
-         new_height, new_width = pixel_values.shape[2:]
-         if new_height == new_width:
-             square_values[:, :, :, :] = pixel_values
-         elif new_height > new_width:
-             from_index = (side - new_width) // 2
-             square_values[:, :, :, from_index:from_index + new_width] = pixel_values
-         else:
-             from_index = (side - new_height) // 2
-             square_values[:, :, from_index:from_index + new_height, :] = pixel_values
-
-         return square_values
-
-     def get_layer_norm(self):
-         return self.layer_norm
-
-     def tokenize(self, logits):
-         def st_argmax(y_soft, dim):  # straight-through softmax
-             index = y_soft.max(dim, keepdim=True)[1]
-             y_hard = torch.zeros_like(y_soft, memory_format=torch.legacy_contiguous_format).scatter_(dim, index, 1.0)
-             ret = y_hard - y_soft.detach() + y_soft
-             return ret
-
-         if self.config.tokenize_function == 'softmax':
-             tokens = softmax(logits, dim=-1)
-         elif self.config.tokenize_function == 'gumbel_argmax':
-             tokens = gumbel_softmax(logits, tau=self.config.tau, hard=True)
-         elif self.config.tokenize_function == 'st_argmax':
-             tokens = st_argmax(logits, dim=-1)
-         else:
-             raise ValueError(
-                 f'Invalid `max_type`, expected softmax or gumbel_argmax or st_argmax, but got {self.config.tokenize_function}')
-         return tokens
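
For readers skimming the removed code, the `tokenize` dispatch at the end of `base_visual_tokenizer.py` is the easiest piece to misread, so here is a minimal standalone sketch of the same straight-through argmax trick on dummy logits. The shapes are made up, and the trick is applied to a softmax of the logits purely for readability (the deleted code applies `st_argmax` to the raw logits).

import torch
from torch.nn.functional import softmax

def st_argmax(y_soft: torch.Tensor, dim: int) -> torch.Tensor:
    # Hard one-hot in the forward pass; gradients follow y_soft in the backward pass.
    index = y_soft.max(dim, keepdim=True)[1]
    y_hard = torch.zeros_like(y_soft).scatter_(dim, index, 1.0)
    return y_hard - y_soft.detach() + y_soft

logits = torch.randn(2, 4, 16, requires_grad=True)   # [batch, #tokens, vocab]
tokens = st_argmax(softmax(logits, dim=-1), dim=-1)
print(tokens.shape, tokens.sum(dim=-1))               # one-hot rows summing to 1.0

The subtract-and-add pattern keeps the forward output hard one-hot while letting gradients flow as if the soft distribution had been used.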
clip_visual_tokenizer.py DELETED
@@ -1,120 +0,0 @@
- from datetime import datetime
- from typing import Dict
-
- import deepspeed
- import torch
- from torch import Tensor
- from transformers import AutoConfig, AutoModel
- from transformers import CLIPVisionModel, CLIPImageProcessor
- from transformers.integrations import is_deepspeed_zero3_enabled
-
- from .utils import BEGIN_LINE, END_LINE, rank0_print
- from .base_visual_tokenizer import BaseVisualTokenizerConfig, BaseVisualTokenizer
-
- MODEL_TYPE = "clip_visual_tokenizer"
-
-
- class ClipVisualTokenizerConfig(BaseVisualTokenizerConfig):
-     model_type = MODEL_TYPE
-
-     def __init__(self, **kwargs):
-         super().__init__(**kwargs)
-         if self.depths:
-             assert len(self.depths) == 1
-             self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
-
-
- class ClipVisualTokenizer(BaseVisualTokenizer):
-     config_class = ClipVisualTokenizerConfig
-     supports_gradient_checkpointing = True
-     _no_split_modules = ["CLIPEncoderLayer"]
-     _image_processor_class = CLIPImageProcessor
-     _image_processor_kwargs = dict(do_center_crop=False)
-     _backbone_class = CLIPVisionModel
-     _backbone_name_or_path = "openai/clip-vit-large-patch14-336"
-
-     def __init__(self, config: ClipVisualTokenizerConfig = None, *inputs, **kwargs):
-         super().__init__(config, *inputs, **kwargs)
-         head_dim = self.config.vocab_size
-         if self.config.use_indicators:
-             head_dim -= 2  # reserved for two image indicator tokens
-         self.head = torch.nn.Sequential(
-             torch.nn.Linear(self.backbone.config.hidden_size, head_dim, bias=False),
-             torch.nn.LayerNorm(head_dim)
-         )
-
-     def re_init_layers(self, re_init_layer_begin):
-         layer_dict = self.get_re_init_layer_dict(re_init_layer_begin)
-         for name, layer in layer_dict.items():
-             rank0_print(BEGIN_LINE)
-             rank0_print(f'[{datetime.now()}] Before layer re-initialization of {name}: ')
-             for k, v in layer.named_parameters():
-                 with deepspeed.zero.GatheredParameters([v]):
-                     rank0_print(f'{k}: {v}')
-             with deepspeed.zero.GatheredParameters(list(layer.parameters(recurse=True)), modifier_rank=0):
-                 if not is_deepspeed_zero3_enabled() or deepspeed.comm.get_rank() == 0:
-                     layer.apply(self.backbone._init_weights)
-             rank0_print(f'[{datetime.now()}] After layer re-initialization of {name}:')
-             for k, v in layer.named_parameters():
-                 with deepspeed.zero.GatheredParameters([v]):
-                     rank0_print(f'{k}: {v}')
-             rank0_print(END_LINE)
-
-     def get_re_init_layer_dict(self, re_init_layer_begin: int) -> Dict[str, torch.nn.Module]:
-         assert re_init_layer_begin >= 0, "negative index is prohibited"
-         layer_dict = dict()
-         for i in range(re_init_layer_begin, self.backbone.config.num_hidden_layers):
-             layer_dict[f'backbone.vision_model.encoder.layers.{i}'] = self.backbone.vision_model.encoder.layers[i]
-         return layer_dict
-
-     def get_monitor_tensors(self):
-         return dict(
-             backbone_bottom=self.backbone.vision_model.encoder.layers[0].self_attn.k_proj.weight,
-             backbone_top=self.backbone.vision_model.encoder.layers[-1].self_attn.out_proj.weight,
-             head=self.head[0].weight
-         )
-
-     def get_image_size(self):
-         height = self.image_processor.crop_size["height"]
-         width = self.image_processor.crop_size["width"]
-         return height, width
-
-     def forward(self, pixel_values) -> Tensor:  # [BatchSize, ImageShape] -> [BatchSize, #Token, VocabSize]
-         output = self.backbone(
-             pixel_values, output_hidden_states=True, return_dict=True)
-         features = output.last_hidden_state
-         if self.config.drop_cls_token:
-             features = features[:, 1:, :]
-         logits = self.head(features)
-         tokens = self.tokenize(logits)
-         if self.config.use_indicators:
-             # tokens' shape is [BatchSize, #Token, VocabSize-2], so padding with [BatchSize, #Token, 2], after
-             # which, tokens' shape should become [BatchSize, #Token, VocabSize]
-             batch_size, token_len, _ = tokens.shape
-             padding_tensor = torch.zeros(size=(batch_size, token_len, 2),
-                                          dtype=tokens.dtype,
-                                          device=tokens.device,
-                                          layout=tokens.layout,
-                                          requires_grad=False)
-             tokens = torch.cat((tokens, padding_tensor), dim=2)
-
-             # adding indicator tokens, after which tokens' shape should become [BatchSize, 1+#Token+1, VocabSize]
-             begin_indicator = torch.zeros(size=(batch_size, 1),
-                                           dtype=torch.long,
-                                           device=tokens.device,
-                                           requires_grad=False) + self.config.vocab_size - 2
-             begin_indicator_token = torch.nn.functional.one_hot(begin_indicator,
-                                                                 num_classes=self.config.vocab_size).to(
-                 dtype=tokens.dtype)
-             end_indicator = torch.zeros(size=(batch_size, 1),
-                                         dtype=torch.long,
-                                         device=tokens.device,
-                                         requires_grad=False) + self.config.vocab_size - 1
-             end_indicator_token = torch.nn.functional.one_hot(end_indicator,
-                                                               num_classes=self.config.vocab_size).to(dtype=tokens.dtype)
-             tokens = torch.cat((begin_indicator_token, tokens, end_indicator_token), dim=1)
-         return tokens
-
-
- AutoConfig.register(MODEL_TYPE, ClipVisualTokenizerConfig)
- AutoModel.register(ClipVisualTokenizerConfig, ClipVisualTokenizer)
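
As a quick sanity check on the indicator-token bookkeeping in `forward()` above, the following standalone sketch pads a soft token tensor with the two reserved vocabulary slots and then wraps the sequence with one-hot begin/end indicators; the tiny shapes are illustrative only, not values used by the deleted module.

import torch

vocab_size, batch_size, num_tokens = 8, 2, 3
# Soft tokens over vocab_size - 2 entries, as produced by the head followed by tokenize().
tokens = torch.softmax(torch.randn(batch_size, num_tokens, vocab_size - 2), dim=-1)

# Pad with the two reserved slots so the last dimension matches the full vocabulary.
padding = torch.zeros(batch_size, num_tokens, 2, dtype=tokens.dtype)
tokens = torch.cat((tokens, padding), dim=2)                      # [B, T, V]

# One-hot begin/end indicators use the reserved ids V-2 and V-1.
begin = torch.full((batch_size, 1), vocab_size - 2)
end = torch.full((batch_size, 1), vocab_size - 1)
begin_token = torch.nn.functional.one_hot(begin, vocab_size).to(tokens.dtype)
end_token = torch.nn.functional.one_hot(end, vocab_size).to(tokens.dtype)
tokens = torch.cat((begin_token, tokens, end_token), dim=1)       # [B, 1 + T + 1, V]
print(tokens.shape)                                               # torch.Size([2, 5, 8])

This mirrors why the head's output dimension is `vocab_size - 2` when `use_indicators` is enabled: the last two vocabulary ids are never produced by the head and are reserved for the begin/end image markers.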