tinyllava committed on
Commit 648dcc8
1 Parent(s): 66f6d6a

Update modeling_tinyllava_phi.py

Files changed (1)
  1. modeling_tinyllava_phi.py +473 -473
modeling_tinyllava_phi.py CHANGED
@@ -1,474 +1,474 @@
  from dataclasses import dataclass
  from typing import List, Optional, Tuple, Union
  import ast
  import re

  import torch
  import torch.utils.checkpoint
  from torch import nn, Tensor
  from torch.nn import functional as F

  from transformers import PreTrainedModel
  from transformers.modeling_outputs import CausalLMOutputWithPast
  from transformers.generation.utils import GenerateOutput
  from transformers import CLIPVisionModel, CLIPImageProcessor, SiglipVisionModel, SiglipImageProcessor

  from .configuration import TinyLlavaConfig, IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN

  from transformers import AutoConfig, AutoModelForCausalLM, PhiForCausalLM
- from data_preprocess import *
+ from .data_preprocess import *

  # from tinyllava.utils.data_utils import get_value_from_kwargs
  CONTROLLER_HEART_BEAT_EXPIRATION = 30
  WORKER_HEART_BEAT_INTERVAL = 15

  LOGDIR = "."
  #
  # For licensing see accompanying LICENSE file.
  # Copyright (C) 2024 Apple Inc. All Rights Reserved.
  #
  from transformers.utils import logging

  logger = logging.get_logger(__name__)

  # this import has to be relative, otherwise, when setting trust_remote_code=True
  # huggingface transformers won't be able to load the module correctly
  from numbers import Number
  from typing import List, Optional, Union




  ACT_TYPE = {
      'relu': nn.ReLU,
      'gelu': nn.GELU
  }

  class Connector(nn.Module):
      def __init__(self, config=None):
          super().__init__()
          mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', config.connector_type)
          act_type = config.connector_type.split('_')[-1]
          mlp_depth = int(mlp_gelu_match.group(1))
          modules = [nn.Linear(config.vision_hidden_size, config.hidden_size)]
          for _ in range(1, mlp_depth):
              modules.append(ACT_TYPE[act_type]())
              modules.append(nn.Linear(config.hidden_size, config.hidden_size))

          self._connector = nn.Sequential(*modules)

      def forward(self, x):
          return self._connector(x)

  class VisionTower(nn.Module):
      def __init__(self, cfg, model_name_or_path = 'clip'):
          super().__init__()
          if 'clip' in model_name_or_path:
              self._vision_tower = CLIPVisionModel(cfg)
              self._image_processor = CLIPImageProcessor.from_pretrained(cfg.model_name_or_path)
          else:
              self._vision_tower = SiglipVisionModel(cfg)
              self._image_processor = SiglipImageProcessor.from_pretrained(cfg.model_name_or_path)

          self.config = cfg

      def forward(self, x, **kwargs):
          image_features = self._vision_tower(x, output_hidden_states=True)
          image_features = image_features.hidden_states[kwargs.get('vision_feature_layer', -2)]

          if kwargs.get('vision_feature_select_strategy', 'patch') == 'patch':
              image_features = image_features[:, 1:]
          elif kwargs.get('vision_feature_select_strategy', 'patch') == 'cls_patch':
              image_features = image_features
          else:
              raise ValueError(f"Unexpected select feature: {kwargs.get('vision_feature_select_strategy')}")

          return image_features

      @property
      def vision_tower(self):
          return self._vision_tower

      @vision_tower.setter
      def vision_tower(self, vision_tower):
          self._vision_tower = vision_tower

  def get_value_from_kwargs(kwargs, name):
      if name in kwargs:
          return kwargs.pop(name)
      else:
          return None


  class TinyLlavaPreTrainedModel(PreTrainedModel):
      config_class = TinyLlavaConfig
      base_model_prefix = "model"
      supports_gradient_checkpointing = True
      _no_split_modules = ["LlavaVisionAttention"]
      _skip_keys_device_placement = "past_key_values"
      _supports_flash_attn_2 = True

      def _init_weights(self, module):
          std = (
              self.config.initializer_range
              if hasattr(self.config, "initializer_range")
              else self.config.text_config.initializer_range
          )

          if hasattr(module, "class_embedding"):
              module.class_embedding.data.normal_(mean=0.0, std=std)

          if isinstance(module, (nn.Linear, nn.Conv2d)):
              module.weight.data.normal_(mean=0.0, std=std)
              if module.bias is not None:
                  module.bias.data.zero_()
          elif isinstance(module, nn.Embedding):
              module.weight.data.normal_(mean=0.0, std=std)
              if module.padding_idx is not None:
                  module.weight.data[module.padding_idx].zero_()

      @property
      def _supports_sdpa(self):
          return self.language_model._supports_sdpa


  class TinyLlavaForConditionalGeneration(TinyLlavaPreTrainedModel):
      def __init__(self, config: TinyLlavaConfig):

          super().__init__(config)

          self.language_model = PhiForCausalLM(config.text_config)
          self.vision_tower = VisionTower(config.vision_config, config.vision_model_name_or_path)
          self.connector = Connector(config)
          self.post_init()


      def get_input_embeddings(self):
          return self.language_model.get_input_embeddings()

      def set_input_embeddings(self, value):
          self.language_model.set_input_embeddings(value)

      def get_output_embeddings(self):
          return self.language_model.get_output_embeddings()

      def set_output_embeddings(self, new_embeddings):
          self.language_model.set_output_embeddings(new_embeddings)

      def set_decoder(self, decoder):
          self.language_model.set_decoder(decoder)

      def get_decoder(self):
          return self.language_model.get_decoder()

      def tie_weights(self):
          return self.language_model.tie_weights()

      def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
          model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
          # update vocab size
          self.config.text_config.vocab_size = model_embeds.num_embeddings
          self.config.vocab_size = model_embeds.num_embeddings
          self.vocab_size = model_embeds.num_embeddings
          return model_embeds


      def forward(
          self,
          input_ids: torch.LongTensor = None,
          attention_mask: Optional[torch.Tensor] = None,
          position_ids: Optional[torch.LongTensor] = None,
          past_key_values: Optional[List[torch.FloatTensor]] = None,
          inputs_embeds: Optional[torch.FloatTensor] = None,
          labels: Optional[torch.LongTensor] = None,
          use_cache: Optional[bool] = None,
          output_attentions: Optional[bool] = None,
          output_hidden_states: Optional[bool] = None,
          images: Optional[torch.FloatTensor] = None,
          image_sizes: Optional[List[List[int]]] = None,
          return_dict: Optional[bool] = None,
      ) -> Union[Tuple, CausalLMOutputWithPast]:
          use_cache = use_cache if use_cache is not None else self.config.use_cache
          if inputs_embeds is None:
              (
                  input_ids,
                  position_ids,
                  attention_mask,
                  past_key_values,
                  inputs_embeds,
                  labels
              ) = self.prepare_inputs_labels_for_multimodal(
                  input_ids,
                  position_ids,
                  attention_mask,
                  past_key_values,
                  labels,
                  images,
                  image_sizes
              )
          return self.language_model.forward(
              input_ids=input_ids,
              attention_mask=attention_mask,
              position_ids=position_ids,
              past_key_values=past_key_values,
              inputs_embeds=inputs_embeds,
              labels=labels,
              use_cache=use_cache,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict
          )

      @torch.no_grad()
      def generate(
          self,
          inputs: Optional[torch.Tensor] = None,
          images: Optional[torch.Tensor] = None,
          image_sizes: Optional[torch.Tensor] = None,
          **kwargs,
      ) -> Union[GenerateOutput, torch.LongTensor]:
          position_ids = kwargs.pop("position_ids", None)
          attention_mask = kwargs.pop("attention_mask", None)
          if "inputs_embeds" in kwargs:
              raise NotImplementedError("`inputs_embeds` is not supported")

          if images is not None:
              (
                  inputs,
                  position_ids,
                  attention_mask,
                  _,
                  inputs_embeds,
                  _
              ) = self.prepare_inputs_labels_for_multimodal(
                  inputs,
                  position_ids,
                  attention_mask,
                  None,
                  None,
                  images,
                  image_sizes=image_sizes
              )
          else:
              inputs_embeds = self.language_model.get_input_embeddings()(inputs)

          return self.language_model.generate(
              position_ids=position_ids,
              attention_mask=attention_mask,
              inputs_embeds=inputs_embeds,
              **kwargs
          )

      def encode_images(self, images):
          kwargs = {}
          kwargs['vision_feature_layer'] = self.config.vision_feature_layer
          kwargs['vision_feature_select_strategy'] = self.config.vision_feature_select_strategy
          images = images.to(device=self.device, dtype=self.dtype)
          image_features = self.vision_tower(images, **kwargs)
          image_features = self.connector(image_features)
          return image_features



      def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                        inputs_embeds=None, **kwargs):
          images = kwargs.pop("images", None)
          image_sizes = kwargs.pop("image_sizes", None)
          inputs = self.language_model.prepare_inputs_for_generation(
              input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
          )
          if images is not None:
              inputs['images'] = images
          if image_sizes is not None:
              inputs['image_sizes'] = image_sizes
          return inputs

      def prepare_inputs_labels_for_multimodal(
          self, input_ids, position_ids, attention_mask, past_key_values, labels,
          images, image_sizes=None
      ):
          vision_tower = self.vision_tower
          if vision_tower is None or images is None or input_ids.shape[1] == 1:
              return input_ids, position_ids, attention_mask, past_key_values, None, labels


          image_features = self.encode_images(images)

          # TODO: image start / end is not implemented here to support pretraining.
          if getattr(self.config, 'tune_mm_mlp_adapter', False):
              raise NotImplementedError

          # Let's just add dummy tensors if they do not exist,
          # it is a headache to deal with None all the time.
          # But it is not ideal, and if you have a better idea,
          # please open an issue / submit a PR, thanks.
          _labels = labels
          _position_ids = position_ids
          _attention_mask = attention_mask
          if attention_mask is None:
              attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
          else:
              attention_mask = attention_mask.bool()
          if position_ids is None:
              position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
          if labels is None:
              labels = torch.full_like(input_ids, IGNORE_INDEX)

          # remove the padding using attention_mask -- FIXME
          _input_ids = input_ids
          input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
          labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]

          new_input_embeds = []
          new_labels = []
          cur_image_idx = 0
          for batch_idx, cur_input_ids in enumerate(input_ids):
              num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
              if num_images == 0:
                  cur_image_features = image_features[cur_image_idx]
                  cur_input_embeds_1 = self.language_model.get_input_embeddings()(cur_input_ids)
                  cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
                  new_input_embeds.append(cur_input_embeds)
                  new_labels.append(labels[batch_idx])
                  cur_image_idx += 1
                  continue

              image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
              cur_input_ids_noim = []
              cur_labels = labels[batch_idx]
              cur_labels_noim = []
              for i in range(len(image_token_indices) - 1):
                  cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
                  cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
              split_sizes = [x.shape[0] for x in cur_labels_noim]
              cur_input_embeds = self.language_model.get_input_embeddings()(torch.cat(cur_input_ids_noim))
              cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
              cur_new_input_embeds = []
              cur_new_labels = []

              for i in range(num_images + 1):
                  cur_new_input_embeds.append(cur_input_embeds_no_im[i])
                  cur_new_labels.append(cur_labels_noim[i])
                  if i < num_images:
                      cur_image_features = image_features[cur_image_idx]
                      cur_image_idx += 1
                      cur_new_input_embeds.append(cur_image_features)
                      cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))

              cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]

              cur_new_input_embeds = torch.cat(cur_new_input_embeds)
              cur_new_labels = torch.cat(cur_new_labels)

              new_input_embeds.append(cur_new_input_embeds)
              new_labels.append(cur_new_labels)

          # Truncate sequences to max length as image embeddings can make the sequence longer
          tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
          if tokenizer_model_max_length is not None:
              new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
              new_labels = [x[:tokenizer_model_max_length] for x in new_labels]

          # Combine them
          max_len = max(x.shape[0] for x in new_input_embeds)
          batch_size = len(new_input_embeds)

          new_input_embeds_padded = []
          new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
          attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
          position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)

          for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
              cur_len = cur_new_embed.shape[0]
              if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
                  new_input_embeds_padded.append(torch.cat((
                      torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
                      cur_new_embed
                  ), dim=0))
                  if cur_len > 0:
                      new_labels_padded[i, -cur_len:] = cur_new_labels
                      attention_mask[i, -cur_len:] = True
                      position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
              else:
                  new_input_embeds_padded.append(torch.cat((
                      cur_new_embed,
                      torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
                  ), dim=0))
                  if cur_len > 0:
                      new_labels_padded[i, :cur_len] = cur_new_labels
                      attention_mask[i, :cur_len] = True
                      position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)

          new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)

          if _labels is None:
              new_labels = None
          else:
              new_labels = new_labels_padded

          if _attention_mask is None:
              attention_mask = None
          else:
              attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

          if _position_ids is None:
              position_ids = None

          return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels

      def chat(
          self,
          prompt: str,
          tokenizer = None,
          image: str = None,
          max_new_tokens: int = 512,
          num_beams = 1,
          top_p=None,
          temperature=0
      ):
          image_processor = self.vision_tower._image_processor

          if image is not None:
              prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
          conv = conv_phi_v0.copy()
          conv.append_message(conv.roles[0], prompt)
          conv.append_message(conv.roles[1], None)
          prompt = conv.get_prompt()
          if image is not None:
              image = load_image(image)
              image_tensor = process_images(image, image_processor, self.config).to(self.device)

          input_ids = (
              tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
              .unsqueeze(0).to(self.device)
          )
          # Generate
          stime = time.time()

          with torch.inference_mode():
              output_ids = self.generate(
                  input_ids,
                  images=image_tensor,
                  do_sample=True if temperature > 0 else False,
                  temperature=temperature,
                  top_p=top_p,
                  num_beams=num_beams,
                  pad_token_id=tokenizer.pad_token_id,
                  max_new_tokens=max_new_tokens,
                  use_cache=True,
                  # stopping_criteria=[stopping_criteria],
              )

          # print('inference over')
          generation_time = time.time() - stime
          outputs = tokenizer.batch_decode(
              output_ids, skip_special_tokens=True
          )[0]

          outputs = outputs.strip()

          return outputs, generation_time


  AutoConfig.register("tinyllava", TinyLlavaConfig)
  AutoModelForCausalLM.register(TinyLlavaConfig, TinyLlavaForConditionalGeneration)
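
The file registers TinyLlavaConfig and TinyLlavaForConditionalGeneration with the Auto classes, and its module comment notes that imports must be relative for trust_remote_code=True loading, which is what the changed data_preprocess import addresses. A minimal usage sketch under those assumptions follows; the repository id and image path are placeholders, not values taken from this commit.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "org/tinyllava-phi-model"  # hypothetical repo id that ships this modeling file
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="cuda",
    trust_remote_code=True,  # required so modeling_tinyllava_phi.py is loaded from the repo
)
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)

# chat() builds the phi-v0 conversation prompt, preprocesses the image,
# and returns the decoded answer together with the generation time.
answer, seconds = model.chat(
    prompt="What is shown in this image?",
    tokenizer=tokenizer,
    image="example.jpg",  # placeholder path, resolved by load_image()
    max_new_tokens=256,
)
print(answer)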