czczup committed on
Commit
d0cf52b
1 Parent(s): 71b72f5

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. modeling_internvl_chat.py +6 -5
  2. modeling_phi3.py +1 -1
modeling_internvl_chat.py CHANGED
@@ -35,6 +35,7 @@ def version_cmp(v1, v2, op='eq'):
35
  class InternVLChatModel(PreTrainedModel):
36
  config_class = InternVLChatConfig
37
  main_input_name = 'pixel_values'
 
38
  _supports_flash_attn_2 = True
39
  _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'Phi3DecoderLayer']
40
 
@@ -101,7 +102,7 @@ class InternVLChatModel(PreTrainedModel):
101
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
102
 
103
  image_flags = image_flags.squeeze(-1)
104
- input_embeds = self.language_model.get_input_embeddings()(input_ids)
105
 
106
  vit_embeds = self.extract_feature(pixel_values)
107
  vit_embeds = vit_embeds[image_flags == 1]
@@ -234,8 +235,8 @@ class InternVLChatModel(PreTrainedModel):
234
 
235
  tokenizer.padding_side = 'left'
236
  model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
237
- input_ids = model_inputs['input_ids'].cuda()
238
- attention_mask = model_inputs['attention_mask'].cuda()
239
  eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
240
  generation_config['eos_token_id'] = eos_token_id
241
  generation_output = self.generate(
@@ -283,8 +284,8 @@ class InternVLChatModel(PreTrainedModel):
283
  query = query.replace('<image>', image_tokens, 1)
284
 
285
  model_inputs = tokenizer(query, return_tensors='pt')
286
- input_ids = model_inputs['input_ids'].cuda()
287
- attention_mask = model_inputs['attention_mask'].cuda()
288
  generation_config['eos_token_id'] = eos_token_id
289
  generation_output = self.generate(
290
  pixel_values=pixel_values,
 
35
  class InternVLChatModel(PreTrainedModel):
36
  config_class = InternVLChatConfig
37
  main_input_name = 'pixel_values'
38
+ base_model_prefix = 'language_model'
39
  _supports_flash_attn_2 = True
40
  _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'Phi3DecoderLayer']
41
 
 
102
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
103
 
104
  image_flags = image_flags.squeeze(-1)
105
+ input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
106
 
107
  vit_embeds = self.extract_feature(pixel_values)
108
  vit_embeds = vit_embeds[image_flags == 1]
 
235
 
236
  tokenizer.padding_side = 'left'
237
  model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
238
+ input_ids = model_inputs['input_ids'].to(self.device)
239
+ attention_mask = model_inputs['attention_mask'].to(self.device)
240
  eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
241
  generation_config['eos_token_id'] = eos_token_id
242
  generation_output = self.generate(
 
284
  query = query.replace('<image>', image_tokens, 1)
285
 
286
  model_inputs = tokenizer(query, return_tensors='pt')
287
+ input_ids = model_inputs['input_ids'].to(self.device)
288
+ attention_mask = model_inputs['attention_mask'].to(self.device)
289
  generation_config['eos_token_id'] = eos_token_id
290
  generation_output = self.generate(
291
  pixel_values=pixel_values,
modeling_phi3.py CHANGED
@@ -1370,7 +1370,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
1370
  position_ids = position_ids[:, -input_ids.shape[1] :]
1371
 
1372
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1373
- if inputs_embeds is not None and past_key_values is None:
1374
  model_inputs = {'inputs_embeds': inputs_embeds}
1375
  else:
1376
  model_inputs = {'input_ids': input_ids}
 
1370
  position_ids = position_ids[:, -input_ids.shape[1] :]
1371
 
1372
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1373
+ if (inputs_embeds is not None and past_key_values is None) or (inputs_embeds is not None and len(past_key_values) == 0):
1374
  model_inputs = {'inputs_embeds': inputs_embeds}
1375
  else:
1376
  model_inputs = {'input_ids': input_ids}