katuni4ka committed
Commit 62c40a0 (verified)
1 Parent(s): bc034e1

Upload 15 files

config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "_commit_hash": null,
-  "_name_or_path": "internvl2-tiny",
+  "_name_or_path": "/home/ea/work/my_optimum_intel/optimum-intel/internvl2-tiny",
   "architectures": [
     "InternVLChatModel"
   ],
@@ -14,6 +14,7 @@
   "force_image_size": 28,
   "img_context_token_id": 151648,
   "llm_config": {
+    "_attn_implementation_autoset": true,
     "_name_or_path": "Qwen/Qwen2-0.5B-Instruct",
     "add_cross_attention": false,
     "architectures": [
@@ -89,7 +90,7 @@
     "top_p": 1.0,
     "torch_dtype": "bfloat16",
     "torchscript": false,
-    "transformers_version": "4.45.2",
+    "transformers_version": "4.46.2",
     "typical_p": 1.0,
     "use_bfloat16": true,
     "use_cache": true,
@@ -108,6 +109,7 @@
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
+    "_attn_implementation_autoset": true,
     "_name_or_path": "",
     "add_cross_attention": false,
     "architectures": [
@@ -186,7 +188,7 @@
     "top_p": 1.0,
     "torch_dtype": "bfloat16",
     "torchscript": false,
-    "transformers_version": "4.45.2",
+    "transformers_version": "4.46.2",
     "typical_p": 1.0,
     "use_bfloat16": true,
     "use_flash_attn": false
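The config change is easy to sanity-check without running any model code. A minimal sketch, assuming a hypothetical local clone at ./internvl2-tiny, that reads the raw config.json and checks the fields touched by this commit:

import json

# Read the raw config.json directly; no custom modeling code is executed.
with open("./internvl2-tiny/config.json") as f:  # hypothetical local clone path
    cfg = json.load(f)

# Fields touched by this commit:
print(cfg["llm_config"]["transformers_version"])                  # expected "4.46.2"
print(cfg["vision_config"]["transformers_version"])               # expected "4.46.2"
print(cfg["llm_config"].get("_attn_implementation_autoset"))      # expected True
print(cfg["vision_config"].get("_attn_implementation_autoset"))   # expected True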
generation_config.json CHANGED
@@ -1,4 +1,4 @@
 {
   "_from_model_config": true,
-  "transformers_version": "4.45.2"
+  "transformers_version": "4.46.2"
 }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c91dc9c94bbae4e6d8d16aab28765983e8b12cd0565b9e8a98f80073b0b5e14
-size 34321392
+oid sha256:e9f0331124d4efa1050990abf1755529dc4c161abcd9efa40dab5df83495c290
+size 68292216
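The safetensors entry is a Git LFS pointer, so the weights themselves are addressed by the sha256 and size above. A minimal sketch, assuming the real model.safetensors has already been fetched (e.g. via git lfs pull) into a hypothetical local clone, that verifies the file against the updated pointer:

import hashlib
import os

path = "./internvl2-tiny/model.safetensors"  # hypothetical local clone path

# Values taken from the updated LFS pointer in this commit.
expected_size = 68292216
expected_sha256 = "e9f0331124d4efa1050990abf1755529dc4c161abcd9efa40dab5df83495c290"

assert os.path.getsize(path) == expected_size, "size mismatch"

h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == expected_sha256, "sha256 mismatch"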
modeling_internvl_chat.py CHANGED
@@ -20,7 +20,7 @@ from .conversation import get_conv_template
 from .modeling_intern_vit import InternVisionModel, has_flash_attn


-logger = logging.get_logger(__name__)
+#logger = logging.get_logger(__name__)


 def version_cmp(v1, v2, op="eq"):
@@ -39,7 +39,7 @@ class InternVLChatModel(PreTrainedModel):
     _supports_flash_attn_2 = True
     _no_split_modules = ["InternVisionModel", "LlamaDecoderLayer", "Qwen2DecoderLayer"]

-    def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
+    def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=False):
         super().__init__(config)

         assert version_cmp(transformers.__version__, "4.37.0", "ge")
@@ -55,8 +55,8 @@ class InternVLChatModel(PreTrainedModel):
         config.vision_config.use_flash_attn = True if use_flash_attn else False
         config.llm_config._attn_implementation = "flash_attention_2" if use_flash_attn else "eager"

-        logger.info(f"num_image_token: {self.num_image_token}")
-        logger.info(f"ps_version: {self.ps_version}")
+        #logger.info(f"num_image_token: {self.num_image_token}")
+        #logger.info(f"ps_version: {self.ps_version}")
         if vision_model is not None:
             self.vision_model = vision_model
         else:
@@ -103,7 +103,7 @@ class InternVLChatModel(PreTrainedModel):
         input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()

         vit_embeds = self.extract_feature(pixel_values)
-        vit_batch_size = pixel_values.shape[0]
+        pixel_values.shape[0]

         B, N, C = input_embeds.shape
         input_embeds = input_embeds.reshape(B * N, C)
@@ -289,9 +289,6 @@ class InternVLChatModel(PreTrainedModel):
         template.append_message(template.roles[1], None)
         query = template.get_prompt()

-        print(self.num_image_token)
-        print(num_patches_list)
-
         if verbose and pixel_values is not None:
             image_bs = pixel_values.shape[0]
             print(f"dynamic ViT batch size: {image_bs}")
@@ -300,7 +297,7 @@ class InternVLChatModel(PreTrainedModel):
             image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
             query = query.replace("<image>", image_tokens, 1)

-        print(query)
+
         model_inputs = tokenizer(query, return_tensors="pt")
         input_ids = model_inputs["input_ids"].to(self.device)
         attention_mask = model_inputs["attention_mask"].to(self.device)
@@ -356,6 +353,7 @@ class InternVLChatModel(PreTrainedModel):
             attention_mask=attention_mask,
             generation_config=generation_config,
             output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
             use_cache=True,
             **generate_kwargs,
         )
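The main behavioural change in this file is the constructor default: use_flash_attn now defaults to False, so both sub-configs fall back to eager attention unless the caller opts in. A minimal sketch, assuming a hypothetical local clone at ./internvl2-tiny and that the repo's auto_map exposes InternVLChatModel through AutoModel with trust_remote_code:

from transformers import AutoModel

local_dir = "./internvl2-tiny"  # hypothetical local clone path

# Loads the custom InternVLChatModel code shipped in the repo.
model = AutoModel.from_pretrained(local_dir, trust_remote_code=True)

# With the new default (use_flash_attn=False) __init__ switches the LLM
# sub-config to eager attention and disables flash attention for the ViT.
print(model.config.llm_config._attn_implementation)  # expected "eager"
print(model.config.vision_config.use_flash_attn)     # expected False

# Constructing InternVLChatModel directly with use_flash_attn=True would
# restore the previous behaviour, provided flash-attn is installed.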