czczup committed on
Commit
c391cab
·
verified ·
1 Parent(s): acc9f3d

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/red-panda.mp4 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -23,6 +23,8 @@ tags:
23
 
24
  [\[πŸ—¨οΈ Chat Demo\]](https://internvl.opengvlab.com/) [\[πŸ€— HF Demo\]](https://huggingface.co/spaces/OpenGVLab/InternVL) [\[πŸš€ Quick Start\]](#quick-start) [\[πŸ“– 中文解读\]](https://zhuanlan.zhihu.com/p/706547971) [\[πŸ“– Documents\]](https://internvl.readthedocs.io/en/latest/)
25
 
 
 
26
  InternVL-Chat-V1-2-Plus uses the same model architecture as [InternVL-Chat-V1-2](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-2), but the difference lies in the SFT dataset. InternVL-Chat-V1-2 only utilizes an SFT dataset with 1.2M samples, while **our plus version employs an SFT dataset with 12M samples**.
27
 
28
  <p align="center">
 
23
 
24
  [\[πŸ—¨οΈ Chat Demo\]](https://internvl.opengvlab.com/) [\[πŸ€— HF Demo\]](https://huggingface.co/spaces/OpenGVLab/InternVL) [\[πŸš€ Quick Start\]](#quick-start) [\[πŸ“– 中文解读\]](https://zhuanlan.zhihu.com/p/706547971) [\[πŸ“– Documents\]](https://internvl.readthedocs.io/en/latest/)
25
 
26
+ ## Introduction
27
+
28
  InternVL-Chat-V1-2-Plus uses the same model architecture as [InternVL-Chat-V1-2](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-2), but the difference lies in the SFT dataset. InternVL-Chat-V1-2 only utilizes an SFT dataset with 1.2M samples, while **our plus version employs an SFT dataset with 12M samples**.
29
 
30
  <p align="center">
examples/image1.jpg ADDED
examples/image2.jpg ADDED
examples/red-panda.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d921c07bb97224d65a37801541d246067f0d506f08723ffa1ad85c217907ccb8
3
+ size 1867237
modeling_internvl_chat.py CHANGED
@@ -33,6 +33,7 @@ def version_cmp(v1, v2, op='eq'):
33
  class InternVLChatModel(PreTrainedModel):
34
  config_class = InternVLChatConfig
35
  main_input_name = 'pixel_values'
 
36
  _supports_flash_attn_2 = True
37
  _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer']
38
 
@@ -97,7 +98,7 @@ class InternVLChatModel(PreTrainedModel):
97
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
98
 
99
  image_flags = image_flags.squeeze(-1)
100
- input_embeds = self.language_model.get_input_embeddings()(input_ids)
101
 
102
  vit_embeds = self.extract_feature(pixel_values)
103
  vit_embeds = vit_embeds[image_flags == 1]
@@ -230,8 +231,8 @@ class InternVLChatModel(PreTrainedModel):
230
 
231
  tokenizer.padding_side = 'left'
232
  model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
233
- input_ids = model_inputs['input_ids'].cuda()
234
- attention_mask = model_inputs['attention_mask'].cuda()
235
  eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
236
  generation_config['eos_token_id'] = eos_token_id
237
  generation_output = self.generate(
@@ -279,8 +280,8 @@ class InternVLChatModel(PreTrainedModel):
279
  query = query.replace('<image>', image_tokens, 1)
280
 
281
  model_inputs = tokenizer(query, return_tensors='pt')
282
- input_ids = model_inputs['input_ids'].cuda()
283
- attention_mask = model_inputs['attention_mask'].cuda()
284
  generation_config['eos_token_id'] = eos_token_id
285
  generation_output = self.generate(
286
  pixel_values=pixel_values,
 
33
  class InternVLChatModel(PreTrainedModel):
34
  config_class = InternVLChatConfig
35
  main_input_name = 'pixel_values'
36
+ base_model_prefix = 'language_model'
37
  _supports_flash_attn_2 = True
38
  _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer']
39
 
 
98
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
99
 
100
  image_flags = image_flags.squeeze(-1)
101
+ input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
102
 
103
  vit_embeds = self.extract_feature(pixel_values)
104
  vit_embeds = vit_embeds[image_flags == 1]
 
231
 
232
  tokenizer.padding_side = 'left'
233
  model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
234
+ input_ids = model_inputs['input_ids'].to(self.device)
235
+ attention_mask = model_inputs['attention_mask'].to(self.device)
236
  eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
237
  generation_config['eos_token_id'] = eos_token_id
238
  generation_output = self.generate(
 
280
  query = query.replace('<image>', image_tokens, 1)
281
 
282
  model_inputs = tokenizer(query, return_tensors='pt')
283
+ input_ids = model_inputs['input_ids'].to(self.device)
284
+ attention_mask = model_inputs['attention_mask'].to(self.device)
285
  generation_config['eos_token_id'] = eos_token_id
286
  generation_output = self.generate(
287
  pixel_values=pixel_values,