Shaltiel committed on
Commit 69acbf1
1 Parent(s): 85f616e

Upload e2.5 + instruct

modeling_megatron_gpt.py CHANGED
 
@@ -20,6 +20,7 @@
 
 """ PyTorch MegatronGPT model."""
 
+from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 
 import torch
 
@@ -42,7 +43,12 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
-from .configuration_megatron_gpt import MegatronGPTConfig
+# try to load using a relative path, but if it fails try loading it directly
+try:
+    from .configuration_megatron_gpt import MegatronGPTConfig
+except:
+    from configuration_megatron_gpt import MegatronGPTConfig
+
 
 def get_activation(act):
     if act in ["gelu", "geglu", "fast-geglu"]:
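Note on the import fallback above: it keeps the modeling file usable both when it is imported as part of the repo package (the relative import) and when it is loaded or run standalone (the plain import). A minimal loading sketch, assuming the checkpoint is fetched from the Hub with remote code enabled; the repo id below is a placeholder, not something named in this commit:

```python
# Hypothetical usage sketch -- the repo id is a placeholder, not named in this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "some-org/megatron-gpt-checkpoint"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,  # lets transformers fetch and import modeling_megatron_gpt.py
)
```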
 
@@ -57,6 +63,10 @@ logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "MegatronGPTConfig"
 
+@dataclass
+class CausalLMOutputWithPastAndEncoding(CausalLMOutputWithPast):
+    encoding_states: Optional[torch.FloatTensor] = None
+
 class MegatronGPTPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
 
@@ -187,7 +197,7 @@ class MegatronGPTAttention(nn.Module):
         # Compute token offset for rotary embeddings (when decoding)
         seq_len = key.shape[-2]
         if has_layer_past:
-            seq_len += layer_past[0].shape[-2]
+            seq_len = seq_len + layer_past[0].shape[-2]
         cos, sin = self.rotary_emb(value, seq_len=seq_len)
         query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
         query = torch.cat((query, query_pass), dim=-1)
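The rewritten line computes the same offset as before: during incremental decoding only the new token's key is projected, so the rotary table length has to be extended by the number of cached positions in `layer_past`, while the out-of-place form avoids an augmented in-place assignment. An illustrative arithmetic check (values are made up, not from the model):

```python
# Illustrative numbers only (not from the model): one decoding step with a KV cache.
key_len = 1        # key.shape[-2] -- only the newly generated token is projected
past_len = 12      # layer_past[0].shape[-2] -- positions already cached
seq_len = key_len + past_len   # rotary cos/sin tables must cover the cache plus the new token
assert seq_len == 13
```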
 
@@ -420,8 +430,8 @@ class MegatronGPTMLP(nn.Module):
 class MegatronGPTLayer(nn.Module):
     def __init__(self, config, layer_idx):
         super().__init__()
-        self.input_layernorm = MegatronGPTLPLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
-        self.post_attention_layernorm = MegatronGPTLPLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
+        self.input_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
         self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
         self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
         self.self_attention = MegatronGPTAttention(config)
 
@@ -466,23 +476,36 @@ class MegatronGPTLayer(nn.Module):
 
         return outputs
 
-class MegatronGPTLPLayerNorm(torch.nn.LayerNorm):
+class MegatronGPTLayerNorm(torch.nn.LayerNorm):
     def __init__(self, normalization, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
-        super().__init__(
-            normalized_shape=normalized_shape,
-            eps=eps,
-            elementwise_affine=elementwise_affine,
-            device=device,
-            dtype=dtype,
-        )
-        assert normalization in ['layernorm', 'layernorm1p']
+        normalization = normalization.lower()
+        assert normalization in ['layernorm', 'layernorm1p', 'rmsnorm']
+        if normalization == 'rmsnorm':
+            torch.nn.Module.__init__(self)
+            self.weight = nn.Parameter(torch.ones(normalized_shape))
+            self.variance_epsilon = eps
+        else:
+            super().__init__(
+                normalized_shape=normalized_shape,
+                eps=eps,
+                elementwise_affine=elementwise_affine,
+                device=device,
+                dtype=dtype,
+            )
         self.normalization = normalization
 
     def forward(self, x):
-        weight_bias = 1 if self.normalization == 'layernorm1p' else 0
-        return torch.nn.functional.layer_norm(
-            x, self.normalized_shape, self.weight + weight_bias, self.bias, self.eps
-        )
+        if self.normalization == 'rmsnorm':
+            input_dtype = x.dtype
+            x = x.to(torch.float32)
+            variance = x.pow(2).mean(-1, keepdim=True)
+            x = x * torch.rsqrt(variance + self.variance_epsilon)
+            return self.weight * x.to(input_dtype)
+        else:
+            weight_bias = 1 if self.normalization == 'layernorm1p' else 0
+            return torch.nn.functional.layer_norm(
+                x, self.normalized_shape, self.weight + weight_bias, self.bias, self.eps
+            )
 
 
 
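For reference, a standalone sketch of the three normalization modes the renamed `MegatronGPTLayerNorm` now accepts ('layernorm', 'layernorm1p', 'rmsnorm'), rewritten as plain functions so the formulas can be checked in isolation. This is not part of the commit; it only mirrors the math in the hunk above:

```python
import torch
import torch.nn.functional as F

# Standalone sketch (not from the commit): the three normalization variants that
# MegatronGPTLayerNorm dispatches on, written as plain functions over the last dim.

def layernorm(x, weight, bias, eps=1e-5):
    # regular LayerNorm: learned weight used as-is
    return F.layer_norm(x, (x.shape[-1],), weight, bias, eps)

def layernorm1p(x, weight, bias, eps=1e-5):
    # 'layernorm1p': the stored weight is an offset around 1, so 1 is added back
    return F.layer_norm(x, (x.shape[-1],), weight + 1, bias, eps)

def rmsnorm(x, weight, eps=1e-5):
    # 'rmsnorm': no mean subtraction and no bias, normalization done in float32
    dtype = x.dtype
    x = x.to(torch.float32)
    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return weight * x.to(dtype)

x = torch.randn(2, 4, 8)
w, b = torch.zeros(8), torch.zeros(8)

# layernorm1p with zero-initialized weights behaves like LayerNorm with unit weights
assert torch.allclose(layernorm1p(x, w, b), layernorm(x, torch.ones(8), b))

# on zero-mean inputs, RMSNorm coincides with bias-free, unit-weight LayerNorm
x0 = x - x.mean(-1, keepdim=True)
assert torch.allclose(rmsnorm(x0, torch.ones(8)), layernorm(x0, torch.ones(8), b), atol=1e-5)
```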
 
 
@@ -551,7 +574,7 @@ class MegatronGPTModel(MegatronGPTPreTrainedModel):
         self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
         self.emb_dropout = nn.Dropout(config.hidden_dropout)
         self.layers = nn.ModuleList([MegatronGPTLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
-        self.final_layernorm = MegatronGPTLPLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
+        self.final_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
 
         self.gradient_checkpointing = False
 
 
@@ -748,7 +771,7 @@ class MegatronGPTForCausalLM(MegatronGPTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
+    ) -> Union[Tuple, CausalLMOutputWithPastAndEncoding]:
         r"""
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
 
@@ -804,12 +827,13 @@ class MegatronGPTForCausalLM(MegatronGPTPreTrainedModel):
             output = (lm_logits,) + outputs[1:]
             return ((lm_loss,) + output) if lm_loss is not None else output
 
-        return CausalLMOutputWithPast(
+        return CausalLMOutputWithPastAndEncoding(
            loss=lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
+           encoding_states=hidden_states
        )
 
    def prepare_inputs_for_generation(
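With the new return type, a `return_dict` forward pass also exposes the decoder's final hidden states (the tensor the LM head reads from) as `encoding_states`, alongside the usual causal-LM fields. A hedged usage sketch, assuming `model` and `tokenizer` were loaded as in the earlier placeholder example; the pooling at the end is an illustration, not something the commit prescribes:

```python
import torch

# Usage sketch, assuming `model` and `tokenizer` were loaded as in the earlier sketch.
inputs = tokenizer("a sentence to encode", return_tensors="pt")

with torch.no_grad():
    out = model(**inputs, return_dict=True)

# out.encoding_states: [batch, seq_len, hidden_size] -- the decoder's final hidden
# states, now returned alongside the usual logits / past_key_values fields.
print(out.logits.shape, out.encoding_states.shape)

# One possible pooling into a single vector per sequence (an illustrative choice):
sentence_embedding = out.encoding_states[:, -1]
```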
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9be72e7e2eca0bc35760a307f1ee166d396a658c54f6a19ea6ac0469ed178b18
+oid sha256:69fceaa3477ed790a9f3506f717a58db7f328ffed532d468bdd82098f3433dce
 size 9970836963
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2c87cbe98bb83d5b16c6e8ba81776b02fe4d0454f0398a669ba721c64f2f464
+oid sha256:7a95c1cd54a63f3ba01e2283528f431948cd2014426efc1e08403cdf99bf3084
 size 950158711