Shaltiel committed on
Commit 69acbf1
1 Parent(s): 85f616e

Upload e2.5 + instruct

modeling_megatron_gpt.py CHANGED
 
@@ -20,6 +20,7 @@
 
 """ PyTorch MegatronGPT model."""
 
+from dataclasses import dataclass
 from typing import Optional, Tuple, Union
 
 import torch
 
@@ -42,7 +43,12 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
-from .configuration_megatron_gpt import MegatronGPTConfig
+# try to load using a relative path, but if it fails try loading it directly
+try:
+    from .configuration_megatron_gpt import MegatronGPTConfig
+except:
+    from configuration_megatron_gpt import MegatronGPTConfig
+
 
 def get_activation(act):
     if act in ["gelu", "geglu", "fast-geglu"]:
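Note on the import fallback above: it keeps the modeling file usable both when it is imported as part of the repo package (the relative import) and when it is loaded or run standalone (the plain import). A minimal loading sketch, assuming the checkpoint is fetched from the Hub with remote code enabled; the repo id below is a placeholder, not something named in this commit:

```python
# Hypothetical usage sketch -- the repo id is a placeholder, not named in this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "some-org/megatron-gpt-checkpoint"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,  # lets transformers fetch and import modeling_megatron_gpt.py
)
```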
 
@@ -57,6 +63,10 @@ logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "MegatronGPTConfig"
 
+@dataclass
+class CausalLMOutputWithPastAndEncoding(CausalLMOutputWithPast):
+    encoding_states: Optional[torch.FloatTensor] = None
+
 class MegatronGPTPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
 
@@ -187,7 +197,7 @@ class MegatronGPTAttention(nn.Module):
         # Compute token offset for rotary embeddings (when decoding)
         seq_len = key.shape[-2]
         if has_layer_past:
-            seq_len += layer_past[0].shape[-2]
+            seq_len = seq_len + layer_past[0].shape[-2]
         cos, sin = self.rotary_emb(value, seq_len=seq_len)
         query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
         query = torch.cat((query, query_pass), dim=-1)
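The rewritten line computes the same offset as before: during incremental decoding only the new token's key is projected, so the rotary table length has to be extended by the number of cached positions in `layer_past`, while the out-of-place form avoids an augmented in-place assignment. An illustrative arithmetic check (values are made up, not from the model):

```python
# Illustrative numbers only (not from the model): one decoding step with a KV cache.
key_len = 1        # key.shape[-2] -- only the newly generated token is projected
past_len = 12      # layer_past[0].shape[-2] -- positions already cached
seq_len = key_len + past_len   # rotary cos/sin tables must cover the cache plus the new token
assert seq_len == 13
```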
 
@@ -420,8 +430,8 @@ class MegatronGPTMLP(nn.Module):
 class MegatronGPTLayer(nn.Module):
     def __init__(self, config, layer_idx):
         super().__init__()
-        self.input_layernorm = MegatronGPTLPLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
-        self.post_attention_layernorm = MegatronGPTLPLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
+        self.input_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
+        self.post_attention_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
         self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
         self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
         self.self_attention = MegatronGPTAttention(config)
 
@@ -466,23 +476,36 @@ class MegatronGPTLayer(nn.Module):
 
         return outputs
 
-class MegatronGPTLPLayerNorm(torch.nn.LayerNorm):
+class MegatronGPTLayerNorm(torch.nn.LayerNorm):
     def __init__(self, normalization, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
-        super().__init__(
-            normalized_shape=normalized_shape,
-            eps=eps,
-            elementwise_affine=elementwise_affine,
-            device=device,
-            dtype=dtype,
-        )
-        assert normalization in ['layernorm', 'layernorm1p']
+        normalization = normalization.lower()
+        assert normalization in ['layernorm', 'layernorm1p', 'rmsnorm']
+        if normalization == 'rmsnorm':
+            torch.nn.Module.__init__(self)
+            self.weight = nn.Parameter(torch.ones(normalized_shape))
+            self.variance_epsilon = eps
+        else:
+            super().__init__(
+                normalized_shape=normalized_shape,
+                eps=eps,
+                elementwise_affine=elementwise_affine,
+                device=device,
+                dtype=dtype,
+            )
         self.normalization = normalization
 
     def forward(self, x):
-        weight_bias = 1 if self.normalization == 'layernorm1p' else 0
-        return torch.nn.functional.layer_norm(
-            x, self.normalized_shape, self.weight + weight_bias, self.bias, self.eps
-        )
+        if self.normalization == 'rmsnorm':
+            input_dtype = x.dtype
+            x = x.to(torch.float32)
+            variance = x.pow(2).mean(-1, keepdim=True)
+            x = x * torch.rsqrt(variance + self.variance_epsilon)
+            return self.weight * x.to(input_dtype)
+        else:
+            weight_bias = 1 if self.normalization == 'layernorm1p' else 0
+            return torch.nn.functional.layer_norm(
+                x, self.normalized_shape, self.weight + weight_bias, self.bias, self.eps
+            )
 
 
 
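For reference, a standalone sketch of the three normalization modes the renamed `MegatronGPTLayerNorm` now accepts ('layernorm', 'layernorm1p', 'rmsnorm'), rewritten as plain functions so the formulas can be checked in isolation. This is not part of the commit; it only mirrors the math in the hunk above:

```python
import torch
import torch.nn.functional as F

# Standalone sketch (not from the commit): the three normalization variants that
# MegatronGPTLayerNorm dispatches on, written as plain functions over the last dim.

def layernorm(x, weight, bias, eps=1e-5):
    # regular LayerNorm: learned weight used as-is
    return F.layer_norm(x, (x.shape[-1],), weight, bias, eps)

def layernorm1p(x, weight, bias, eps=1e-5):
    # 'layernorm1p': the stored weight is an offset around 1, so 1 is added back
    return F.layer_norm(x, (x.shape[-1],), weight + 1, bias, eps)

def rmsnorm(x, weight, eps=1e-5):
    # 'rmsnorm': no mean subtraction and no bias, normalization done in float32
    dtype = x.dtype
    x = x.to(torch.float32)
    x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    return weight * x.to(dtype)

x = torch.randn(2, 4, 8)
w, b = torch.zeros(8), torch.zeros(8)

# layernorm1p with zero-initialized weights behaves like LayerNorm with unit weights
assert torch.allclose(layernorm1p(x, w, b), layernorm(x, torch.ones(8), b))

# on zero-mean inputs, RMSNorm coincides with bias-free, unit-weight LayerNorm
x0 = x - x.mean(-1, keepdim=True)
assert torch.allclose(rmsnorm(x0, torch.ones(8)), layernorm(x0, torch.ones(8), b), atol=1e-5)
```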
 
 
@@ -551,7 +574,7 @@ class MegatronGPTModel(MegatronGPTPreTrainedModel):
         self.embed_in = nn.Embedding(config.vocab_size, config.hidden_size)
         self.emb_dropout = nn.Dropout(config.hidden_dropout)
         self.layers = nn.ModuleList([MegatronGPTLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)])
-        self.final_layernorm = MegatronGPTLPLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
+        self.final_layernorm = MegatronGPTLayerNorm(config.normalization, config.hidden_size, eps=config.layer_norm_eps)
 
         self.gradient_checkpointing = False
 
 
@@ -748,7 +771,7 @@ class MegatronGPTForCausalLM(MegatronGPTPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
+    ) -> Union[Tuple, CausalLMOutputWithPastAndEncoding]:
         r"""
         past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
 
@@ -804,12 +827,13 @@ class MegatronGPTForCausalLM(MegatronGPTPreTrainedModel):
             output = (lm_logits,) + outputs[1:]
             return ((lm_loss,) + output) if lm_loss is not None else output
 
-        return CausalLMOutputWithPast(
+        return CausalLMOutputWithPastAndEncoding(
            loss=lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
+           encoding_states=hidden_states
        )
 
    def prepare_inputs_for_generation(
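With the new return type, a `return_dict` forward pass also exposes the decoder's final hidden states (the tensor the LM head reads from) as `encoding_states`, alongside the usual causal-LM fields. A hedged usage sketch, assuming `model` and `tokenizer` were loaded as in the earlier placeholder example; the pooling at the end is an illustration, not something the commit prescribes:

```python
import torch

# Usage sketch, assuming `model` and `tokenizer` were loaded as in the earlier sketch.
inputs = tokenizer("a sentence to encode", return_tensors="pt")

with torch.no_grad():
    out = model(**inputs, return_dict=True)

# out.encoding_states: [batch, seq_len, hidden_size] -- the decoder's final hidden
# states, now returned alongside the usual logits / past_key_values fields.
print(out.logits.shape, out.encoding_states.shape)

# One possible pooling into a single vector per sequence (an illustrative choice):
sentence_embedding = out.encoding_states[:, -1]
```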
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9be72e7e2eca0bc35760a307f1ee166d396a658c54f6a19ea6ac0469ed178b18
+oid sha256:69fceaa3477ed790a9f3506f717a58db7f328ffed532d468bdd82098f3433dce
 size 9970836963
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b2c87cbe98bb83d5b16c6e8ba81776b02fe4d0454f0398a669ba721c64f2f464
+oid sha256:7a95c1cd54a63f3ba01e2283528f431948cd2014426efc1e08403cdf99bf3084
 size 950158711