msr2000 committed
Commit e0828e3
Parent: 3d3dec2

Update code & readme

Files changed (2):
  1. README.md +4 -2
  2. modeling_deepseek.py +0 -12
README.md CHANGED
@@ -189,7 +189,8 @@ model_name = "deepseek-ai/DeepSeek-V2"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 # `max_memory` should be set based on your devices
 max_memory = {i: "75GB" for i in range(8)}
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16, max_memory=max_memory)
+# `device_map` cannot be set to `auto`
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id

@@ -210,7 +211,8 @@ model_name = "deepseek-ai/DeepSeek-V2-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 # `max_memory` should be set based on your devices
 max_memory = {i: "75GB" for i in range(8)}
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto", torch_dtype=torch.bfloat16, max_memory=max_memory)
+# `device_map` cannot be set to `auto`
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
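
For readers applying this change, the updated README snippet can be assembled into a runnable script roughly as follows. This is a minimal sketch: the imports, the 8-GPU `max_memory` map, and the short generation call at the end are assumptions added for illustration and are not part of this diff.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_name = "deepseek-ai/DeepSeek-V2"  # or "deepseek-ai/DeepSeek-V2-Chat"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# `max_memory` should be set based on your devices (assumed here: 8 GPUs with ~80 GB each)
max_memory = {i: "75GB" for i in range(8)}

# `device_map` cannot be set to `auto`; with the SDPA path removed in this commit,
# load the model sequentially and request the eager attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="sequential",
    torch_dtype=torch.bfloat16,
    max_memory=max_memory,
    attn_implementation="eager",
)
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Illustrative generation call (not part of this commit)
inputs = tokenizer("Explain mixture-of-experts in one sentence.", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```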
218
 
modeling_deepseek.py CHANGED
@@ -34,7 +34,6 @@ from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter,
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -1295,7 +1294,6 @@ class DeepseekV2PreTrainedModel(PreTrainedModel):
     _no_split_modules = ["DeepseekV2DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
-    _supports_sdpa = True
     _supports_cache_class = True

     def _init_weights(self, module):
@@ -1406,7 +1404,6 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
-        self._use_sdpa = config._attn_implementation == "sdpa"
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

@@ -1495,15 +1492,6 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
                 if (attention_mask is not None and 0 in attention_mask)
                 else None
             )
-        elif self._use_sdpa and not output_attentions:
-            # output_attentions=True can not be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-            )
         else:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
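
Because this commit removes the SDPA code path (`_supports_sdpa`, `_use_sdpa`, and the `_prepare_4d_causal_attention_mask_for_sdpa` branch), only the eager and FlashAttention-2 implementations remain selectable. The sketch below is a hedged sanity check; the expectation that Transformers rejects an explicit `attn_implementation="sdpa"` request for a model class that does not advertise SDPA support is an assumption about the library's loading checks, not something shown in this diff.

```python
import torch
from transformers import AutoModelForCausalLM

model_name = "deepseek-ai/DeepSeek-V2"

# After this commit, explicitly requesting SDPA should be rejected at load time,
# since the remote modeling code no longer sets `_supports_sdpa`.
try:
    AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="sdpa",  # expected to fail now
    )
except ValueError as err:
    print(f"SDPA path unavailable: {err}")

# The remaining supported choices are "eager" (used in the README snippet above)
# and "flash_attention_2" (the class still sets `_supports_flash_attn_2 = True`).
```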