dinalt committed
Commit 35354b2
1 Parent(s): fc26b50

Update modelling_walsh.py


- Added support for inference cache (see the usage sketch below).
- Refactored common code in attention.
- Removed unused code (fragments from another project).
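For reference, the cache path is exercised through the standard transformers generation API. The sketch below is not part of this commit: the repository id is a placeholder, and it assumes the checkpoint ships this custom code so it can be loaded with trust_remote_code=True; use_cache=True routes past_key_values through the new cache support. Since attention masking is not yet reimplemented (see the TODO in prepare_inputs_for_generation), it assumes a single unpadded sequence.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = "your-namespace/walsh-causal-v1"  # placeholder; substitute the actual model repo
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
    model.eval()

    inputs = tokenizer("Once upon a time", return_tensors="pt")
    with torch.no_grad():
        # With use_cache=True, generate() feeds past_key_values back into forward(),
        # so after the prompt only the newly sampled token passes through the layers.
        output_ids = model.generate(**inputs, max_new_tokens=32, use_cache=True)
    print(tokenizer.decode(output_ids[0], skip_special_tokens=True))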

Files changed (1)
  1. modelling_walsh.py +369 -296
modelling_walsh.py CHANGED
@@ -1,5 +1,5 @@
1
  # See: https://huggingface.co/docs/transformers/custom_models
2
- from typing import Optional, Tuple, Union
3
  import math
4
  import copy
5
  import sys
@@ -9,7 +9,7 @@ import torch
9
  from torch import nn, Tensor
10
  import torch.nn.init as init
11
  from torch.nn import functional as F
12
- from transformers.modeling_outputs import CausalLMOutput
13
  from transformers import (
14
  PreTrainedModel,
15
  PretrainedConfig,
@@ -18,6 +18,10 @@ from transformers import (
18
  AutoModelForCausalLM,
19
  )
20
21
  from transformers.utils import (
22
  is_flash_attn_2_available,
23
  is_flash_attn_greater_or_equal_2_10,
@@ -26,6 +30,8 @@ from transformers.utils import (
26
  if is_flash_attn_2_available():
27
  from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
28
29
  # The model type string to bind.
30
  model_type = "walsh-causal-v1"
31
 
@@ -78,6 +84,10 @@ class Config(PretrainedConfig):
78
  layer_args=dict(),
79
  embedding_args=dict(),
80
  output_proj_args=dict(),
81
 
82
  **kwargs,
83
  ):
@@ -113,6 +123,10 @@ class Config(PretrainedConfig):
113
  self.layer_args = layer_args
114
  self.embedding_args = embedding_args
115
  self.output_proj_args = output_proj_args
116
 
117
  super().__init__(**kwargs)
118
 
@@ -204,6 +218,8 @@ class HFCausalModel(PreTrainedModel):
204
  _no_split_modules = ["DeepNetLayer"]
205
  _supports_flash_attn_2 = True
206
  _supports_sdpa = True
207
 
208
  def __init__(self, config):
209
  super().__init__(config)
@@ -221,40 +237,143 @@ class HFCausalModel(PreTrainedModel):
221
  token_type_ids: Optional[torch.LongTensor] = None,
222
  position_ids: Optional[torch.LongTensor] = None,
223
  labels: Optional[torch.LongTensor] = None,
224
  output_attentions: Optional[bool] = None,
225
  output_hidden_states: Optional[bool] = None,
226
  return_dict: Optional[bool] = None,
227
  **kwargs,
228
  ) -> (Tensor, dict[str, Tensor]):
229
230
  if self.gradient_checkpointing and self.training:
231
  gradient_checkpointing_func = self._gradient_checkpointing_func
232
  else:
233
  gradient_checkpointing_func = None
 
234
 
235
- logits, attentions = self.transformer_head(
236
  input_ids=input_ids,
237
- need_weights=output_attentions,
 
238
  gradient_checkpointing_func=gradient_checkpointing_func,
239
  )
240
 
241
  # Compute loss.
242
  if labels is not None:
243
  loss = self.loss_function(logits=logits, labels=labels, input_ids=input_ids)
244
  else:
245
  loss = None
246
 
247
- return CausalLMOutput(loss=loss, logits=logits, attentions=attentions)
248
-
249
- # Needed for generate() method.
250
- def prepare_inputs_for_generation(self, input_ids, **kwargs):
251
- attention_mask = kwargs.get("attention_mask", None)
252
- model_inputs = {
253
- "input_ids": input_ids,
254
- "attention_mask": attention_mask,
255
- }
256
  return model_inputs
257
258
  def _make_embedding(self, config):
259
  embedding_cls = get_dynamic_class(config.embdding_cls)
260
  return embedding_cls(config.vocab_size, self.d_model, config.pad_index, **config.embedding_args)
@@ -278,7 +397,7 @@ class HFCausalModel(PreTrainedModel):
278
  norm_cls = get_dynamic_class(config.norm_cls)
279
  return norm_cls(self.d_model)
280
 
281
- def _make_self_attention(self, config):
282
  attention_cls = get_dynamic_class(config.attention_cls)
283
  # Map HF _attn_implementation to attn_type
284
  match config._attn_implementation:
@@ -299,28 +418,32 @@ class HFCausalModel(PreTrainedModel):
299
  d_model=self.d_model,
300
  num_heads=config.num_attention_heads,
301
  attn_type=attn_type,
302
  **config.attention_args,
303
  )
304
 
305
- def _make_feedforward(self, config):
306
  feedforward_cls = get_dynamic_class(config.feedforward_cls)
307
  return feedforward_cls(
308
  d_model=self.d_model,
309
  feedforward_dim=config.dim_feedforward,
310
  dropout=config.dropout,
311
  activation=self._make_activation(config),
 
312
  **config.feedforward_args,
313
  )
314
 
315
- def _make_layer(self, config):
316
  layer_cls = get_dynamic_class(config.layer_cls)
317
  return layer_cls(
318
  d_model=self.d_model,
319
  dropout=self._make_dropout(config),
320
- attention=self._make_self_attention(config),
321
- feedforward=self._make_feedforward(config),
322
  norm1=self._make_norm(config),
323
  norm2=self._make_norm(config),
 
324
  **config.layer_args,
325
  )
326
 
@@ -328,7 +451,7 @@ class HFCausalModel(PreTrainedModel):
328
  layer_stack_cls = get_dynamic_class(config.layer_stack_cls)
329
  return layer_stack_cls(
330
  layers=nn.ModuleList([
331
- self._make_layer(config) for _ in range(config.num_hidden_layers)
332
  ]),
333
  **config.layer_stack_args,
334
  )
@@ -364,43 +487,35 @@ class Transformer(nn.Module):
364
  self.sqrt_d_model = d_model**0.5
365
  self.reset_parameters()
366
 
367
- def forward(self, input_ids, need_weights, gradient_checkpointing_func):
368
- x = self.positional_encoder(self.embedding(input_ids) * self.sqrt_d_model)
369
-
370
- x, attentions = self.layer_stack(
371
- x,
372
- need_weights,
373
- gradient_checkpointing_func,
374
  )
375
 
376
- # Translate output embedding ot logits.
377
- logits = self.output_projection(x)
378
- return logits, attentions
 
379
 
380
  def reset_parameters(self):
381
  init.xavier_uniform_(self.output_projection.weight)
382
  init.constant_(self.output_projection.bias, 0.)
383
  init.normal_(self.embedding.weight, std=self.d_model**-0.5)
384
 
385
- # A vanilla positional encoder
386
- class PositionalEncoder(nn.Module):
387
- def __init__(self, d_embed, max_seq):
388
- super().__init__()
389
- self.d_embed = d_embed
390
- self.max_seq = max_seq
391
-
392
- weight = torch.zeros(max_seq, d_embed)
393
- position = torch.arange(0, max_seq, dtype=torch.float).unsqueeze(1)
394
- div_term = torch.exp(torch.arange(0, d_embed, 2).float() * (-math.log(10000.0) / d_embed))
395
- weight[:, 0::2] = torch.sin(position * div_term)
396
- weight[:, 1::2] = torch.cos(position * div_term)
397
- weight = weight.unsqueeze(0)
398
- self.register_buffer('weight', weight)
399
-
400
- def forward(self, x):
401
- seq_len = x.size(-2)
402
- return x + self.weight[:, :seq_len]
403
-
404
  # Converts a torch array of integers into their equivalent binary codes.
405
  def binary_tensor(x, bits):
406
  mask = 2**torch.arange(bits).to(x.device, x.dtype)
@@ -472,7 +587,7 @@ class RSWalshPositionalEncoder(nn.Module):
472
  # walsh = (hadamard_walsh_matrix(k)[:bits,:d_embed] -0.5) * self.gain
473
  self.register_buffer('walsh', walsh, persistent=False)
474
 
475
- def forward(self, x):
476
  seq_len = x.size(-2)
477
 
478
  # Get sequence of binary codes...
@@ -486,6 +601,12 @@ class RSWalshPositionalEncoder(nn.Module):
486
  shift = torch.randint(self.max_seq - seq_len + 1, (1,)).item()
487
  seq = self.binary_code[shift:seq_len + shift,:]
488
 
489
  # Disable shifting when not training. This does not appear to change the evaluation loss, but
490
  # it does makes predictions easier to analyse when the attention weights are not shifting with each step.
491
  else:
@@ -508,25 +629,58 @@ class TransformerLayerStack(nn.Module):
508
  super().__init__()
509
  self.layers = layers
510
 
511
- def forward(self, x, need_weights, gradient_checkpointing_func=None):
512
- attentions = []
513
  for layer in self.layers:
514
  if gradient_checkpointing_func is not None:
515
- x, attention_weights = gradient_checkpointing_func(
516
  layer.__call__,
517
- x,
518
- need_weights,
519
- use_reentrant=False
520
  )
521
  else:
522
- x, attention_weights = layer(x, need_weights=need_weights)
523
- if need_weights:
524
- attentions.append(attention_weights)
525
 
526
- return x, attentions
527
 
528
  # DeepNet: Scaling Transformers to 1,000 Layers
529
  # https://arxiv.org/abs/2203.00555
 
530
  class DeepnetLayer(nn.Module):
531
  def __init__(
532
  self,
@@ -536,6 +690,7 @@ class DeepnetLayer(nn.Module):
536
  norm1,
537
  norm2,
538
  dropout,
 
539
  alpha=1.0,
540
  ):
541
  super().__init__()
@@ -547,27 +702,45 @@ class DeepnetLayer(nn.Module):
547
  self.dropout = dropout
548
  # Deepnet alpha
549
  self.alpha = alpha
 
550
 
551
- def forward(self, x, need_weights=False):
552
  # Keep input as residual
553
- residual = x * self.alpha
554
 
555
  # Compute attention
556
- x, attention_weights = self.attention(x, need_weights)
557
 
558
  # Add attention with residual and normalize.
559
- x = self.norm1(residual + self.dropout(x))
560
 
561
  # Keep output as next residual.
562
- residual = x * self.alpha
563
 
564
  # Pass through feedforward network.
565
- x = self.feedforward(x)
566
 
567
  # Combine residual and ff output, then normalize again.
568
- x = self.norm2(residual + self.dropout(x))
569
 
570
- return x, attention_weights
571
 
572
  # A vanilla MLP transfomer layer.
573
  class FeedforwardLayer(nn.Module):
@@ -576,6 +749,7 @@ class FeedforwardLayer(nn.Module):
576
  d_model: int,
577
  feedforward_dim: int,
578
  dropout,
 
579
  activation=nn.ReLU(),
580
  beta=1.0,
581
  bias=True,
@@ -598,41 +772,6 @@ class FeedforwardLayer(nn.Module):
598
  init.constant_(self.linear1.bias, 0.)
599
  init.constant_(self.linear2.bias, 0.)
600
 
601
- # GLU Variants Improve Transformer
602
- # https://arxiv.org/pdf/2002.05202v1.pdf
603
- class SwiGLUFeedforwardLayer(nn.Module):
604
- def __init__(
605
- self,
606
- d_model,
607
- d_feedforward,
608
- beta=1.0,
609
- dropout=0.1
610
- ):
611
- super().__init__()
612
- self.d_model = d_model
613
- self.d_feedforward = d_feedforward
614
- self.beta = 1.0
615
-
616
- self.linear1 = nn.Linear(self.d_model, self.d_feedforward * 2, bias=False)
617
- self.linear2 = nn.Linear(self.d_feedforward, self.d_model, bias=False)
618
- self.dropout = nn.Dropout(dropout)
619
- self.reset_parameters()
620
-
621
- def forward(self, x):
622
- x, gate = self.linear1(x).chunk(2, dim=-1)
623
- x = x * F.silu(gate)
624
- x = self.dropout(x)
625
- x = self.linear2(x)
626
- return x
627
-
628
- def reset_parameters(self):
629
- # Deepnet initialization
630
- # https://arxiv.org/pdf/2203.00555.pdf
631
- w, g = self.linear1.weight.chunk(2, dim=0)
632
- init.xavier_uniform_(w, gain=self.beta)
633
- init.xavier_uniform_(g, gain=self.beta)
634
- init.xavier_uniform_(self.linear2.weight, gain=self.beta)
635
-
636
  class CausalSelfAttention(nn.Module):
637
  def __init__(
638
  self,
@@ -643,6 +782,8 @@ class CausalSelfAttention(nn.Module):
643
  # torch: Use pytorch "scaled_dot_product_attention()"; faster; generally good compatibility; does not support returning attn weights.
644
  # flash2: Use Flash-Attention2 implementation; fastest; limited to int16 and bfloat16 types; least memory usage.
645
  attn_type,
646
  beta=1.0,
647
  dropout=0.1,
648
  ):
@@ -651,6 +792,8 @@ class CausalSelfAttention(nn.Module):
651
  self.num_heads = num_heads
652
  self.beta = beta
653
  self.attn_type = attn_type
654
 
655
  assert d_model % num_heads == 0, "d_model must be evenly divisible by num_heads"
656
 
@@ -681,29 +824,56 @@ class CausalSelfAttention(nn.Module):
681
  init.constant_(self.in_proj.bias, 0.)
682
  init.constant_(self.output_linear.bias, 0.)
683
 
684
- def project_input(self, qkv):
685
- proj = self.in_proj(qkv)
686
- return proj.chunk(chunks=3, dim=-1)
687
-
688
- def forward(self, qkv, need_weights):
689
- if self.attn_type == "flash2":
690
- return self.flash2_forward(qkv)
691
-
692
- # qkv: (batch_size, seq_len, d_embed)
693
  batch_size, seq_len, d_embed = qkv.shape
694
 
695
- # Feed the inputs through the K, Q, V matrices.
696
- query, key, value = self.project_input(qkv)
697
-
698
  # Split projections into multiple heads and swap position of sequence / heads dimension
699
  query = query.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
700
  key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
701
  value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
702
 
703
  # Default to returning empty attention weights.
704
- attention_weights = None
705
 
706
- if self.attn_type == "torch":
707
  # This context manager can be used to force which implementation to use.
708
  #with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
709
  attended_values = F.scaled_dot_product_attention(
@@ -712,7 +882,7 @@ class CausalSelfAttention(nn.Module):
712
  value,
713
  attn_mask=None,
714
  dropout_p=self.dropout.p if self.training else 0.0,
715
- is_causal=True,
716
  scale=self.dot_product_scale
717
  )
718
  # "native" scaled-dot-product attention implementation.
@@ -721,44 +891,57 @@ class CausalSelfAttention(nn.Module):
721
  scores = torch.matmul(query, key.transpose(-2, -1)) * self.dot_product_scale
722
 
723
  # Mask future positions from the past
724
- scores.masked_fill_(
725
- torch.tril(
726
- torch.ones(seq_len, seq_len, dtype=torch.bool, device=qkv.device),
727
- diagonal=0,
728
- ).logical_not(),
729
- float('-inf'),
730
- )
 
731
 
732
  # Calculate the attention weights; avoid NANs that might emerge from zeros in softmax's denominator
733
- attention_weights = self.dropout(torch.softmax(scores, dim=-1).clamp(min=1e-10))
734
  del scores
735
 
736
  # Use the attention weights to get a weighted combination of value vectors
737
- attended_values = torch.matmul(attention_weights, value)
738
- if not need_weights:
739
- del attention_weights
740
- attention_weights = None
741
 
742
  # Concatenate attention heads and project to original embedding size using the output linear layer
743
  attended_values = attended_values.transpose(1, 2).contiguous().view(batch_size, seq_len, d_embed)
744
 
745
  # Project the concatenated output through the output matrix.
746
  attended_values = self.output_linear(attended_values)
747
- return attended_values, attention_weights
748
-
749
750
  batch_size, seq_len, d_embed = qkv.shape
751
 
752
  # Feed the inputs through the K, Q, V matrices.
753
  # query : (batch_size, seq_len, d_model)
754
  # qkv : (batch_size, seq_len, 3, num_heads, d_kq)
755
  qkv = self.in_proj(qkv).unflatten(
756
  -1,
757
  (3, self.num_heads, self.d_head)
758
  )
759
-
760
  attended_values = flash_attn_qkvpacked_func(
761
- qkv.bfloat16(),
762
  dropout_p=self.dropout.p if self.training else 0.0,
763
  softmax_scale=self.dot_product_scale,
764
  causal=True,
@@ -770,180 +953,70 @@ class CausalSelfAttention(nn.Module):
770
 
771
  # Project the concatenated output through the output matrix.
772
  attended_values = self.output_linear(attended_values)
773
- return attended_values, None
774
-
775
- # Attention layer with ALiBi relative positional encoding
776
- # TRAIN SHORT, TEST LONG: ATTENTION WITH LINEAR BIASES ENABLES INPUT LENGTH EXTRAPOLATION
777
- # https://arxiv.org/pdf/2108.12409.pdf
778
- def alibi_biases(query_len, key_len, device='cpu'):
779
- x = torch.arange(key_len, device=device)[None, :]
780
- y = torch.arange(query_len, device=device)[:, None]
781
- return x - y
782
 
783
- class CausalAlibiAttention(nn.Module):
784
- def __init__(
 
785
  self,
786
- d_model,
787
- num_heads,
788
- beta=1.0,
789
- dropout=0.1,
790
- # values:
791
- # native: Use local impementation; slowest option; good for debugging; useful when experimenting with non-standard stuff.
792
- # torch: Use pytorch "scaled_dot_product_attention()"; faster; generally good compatibility; does not support returning attn weights.
793
- # flash2: Use Flash-Attention2 implementation; fastest; limited to int16 and bfloat16 types; can't train Alibi weights; least memory usage.
794
- # Note: You can perform initial training with "torch," then switch to "flash2," after the Alibi weights have settled.
795
- window_size=None,
796
- attn_type="native",
797
- freeze_alibi=True,
798
  ):
799
- super().__init__()
800
- self.d_model = d_model
801
- self.num_heads = num_heads
802
- self.beta = beta
803
- self.attn_type = attn_type
804
-
805
- assert d_model % num_heads == 0, "d_model must be evenly divisible by num_heads"
806
-
807
- # The dimension of each head.
808
- self.d_head = d_model // num_heads
809
-
810
- # We scale the attention scores by the inverse-square-root of the head dimension
811
- # this shifts the temerature of softmax.
812
- self.dot_product_scale = 1.0 / math.sqrt(self.d_head)
813
-
814
- self.in_proj = nn.Parameter(torch.empty(3 * self.d_model, self.d_model))
815
- self.output_linear = nn.Linear(self.d_model, self.d_model, bias=False)
816
-
817
- if window_size is not None:
818
- self.window_size=(window_size, -1)
819
- else:
820
- self.window_size = (-1, -1)
821
-
822
- self.dropout = nn.Dropout(dropout)
823
-
824
- # This generates the original slope distribution from the paper.
825
- # Observations with trainable slopes suggest that the high half of the slopes shift
826
- # towards / past 1.0 and the low half approach zero or even go slightly negative.
827
- # alibi_slopes = 1.0 / torch.logspace(1, 8, self.num_heads, base=2, dtype=torch.float)
828
-
829
- # These appear to work better, as initial values, in practice.
830
- alibi_slopes = 1.0 / torch.logspace(0, 7, self.num_heads, base=2, dtype=torch.float)
831
-
832
- # If not trainable, it can improve performance somewhat if the low half are set to zero. Apparently
833
- # making roughly half of the slopes position-agnostic is somehow closer to optimal?
834
- # alibi_slopes.masked_fill_(torch.where(torch.arange(0, self.num_heads) >= (self.num_heads / 2), True, False), 0)
835
-
836
- self.alibi_slopes = nn.Parameter(alibi_slopes)
837
-
838
- # Optionally, allow/disallow training of ALiBi slopes.
839
- self.alibi_slopes.requires_grad = (not freeze_alibi)
840
- self.reset_parameters()
841
-
842
- def extra_repr(self) -> str:
843
- return f'd_model={self.d_model}, num_heads={self.num_heads}, beta={self.beta}, attn_type={self.attn_type}, window_size={self.window_size}, dropout={self.dropout}'
844
-
845
- def reset_parameters(self):
846
- # Deepnet initialization
847
- # https://arxiv.org/pdf/2203.00555.pdf
848
-
849
- q, k, v = self.in_proj.chunk(3)
850
- init.xavier_uniform_(q, gain=1.0)
851
- init.xavier_uniform_(k, gain=1.0)
852
- init.xavier_uniform_(v, gain=self.beta)
853
- init.xavier_uniform_(self.output_linear.weight, gain=self.beta)
854
-
855
- def project_input(self, qkv):
856
- proj = F.linear(qkv, self.in_proj)
857
- return proj.chunk(chunks=3, dim=-1)
858
-
859
- def forward(self, qkv, need_weights):
860
- if self.attn_type == "flash2":
861
- return self.flash2_forward(qkv)
862
-
863
- # qkv: (batch_size, seq_len, d_embed)
864
- batch_size, seq_len, d_embed = qkv.shape
865
-
866
- # Feed the inputs through the K, Q, V matrices.
867
- query, key, value = self.project_input(qkv)
868
-
869
- # Split projections into multiple heads and swap position of sequence / heads dimension
870
- query = query.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
871
- key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
872
- value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
873
-
874
- # Apply Alibi relative positional biases.
875
- attn_bias = alibi_biases(seq_len, seq_len, device=query.device) * self.alibi_slopes.view(-1, 1, 1)
876
-
877
- # Mask future positions from the past
878
- causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=qkv.device), diagonal=0)
879
- attn_bias.masked_fill_(causal_mask.logical_not(), float('-inf'))
880
- del causal_mask
881
-
882
- # Default to returning empty attention weights.
883
- attention_weights = None
884
-
885
- if self.attn_type == "torch":
886
- # This context manager can be used to force which implementation to use.
887
- #with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
888
- attended_values = F.scaled_dot_product_attention(
889
- query,
890
- key,
891
- value,
892
- attn_mask=attn_bias.to(dtype=query.dtype),
893
- dropout_p=self.dropout.p if self.training else 0.0,
894
- is_causal=False,
895
- scale=self.dot_product_scale
896
- )
897
- # "native" scaled-dot-product attention implementation.
898
- else:
899
- # Compute attention scores
900
- scores = torch.matmul(query, key.transpose(-2, -1)) * self.dot_product_scale
901
-
902
- # Adjust scores with attn_mask
903
- scores += attn_bias
904
-
905
- # Calculate the attention weights; avoid NANs that might emerge from zeros in softmax's denominator
906
- attention_weights = self.dropout(torch.softmax(scores, dim=-1).clamp(min=1e-10))
907
-
908
- # Use the attention weights to get a weighted combination of value vectors
909
- attended_values = torch.matmul(attention_weights, value)
910
- if not need_weights:
911
- attention_weights = None
912
-
913
- # Concatenate attention heads and project to original embedding size using the output linear layer
914
- attended_values = attended_values.transpose(1, 2).contiguous().view(batch_size, seq_len, d_embed)
915
-
916
- # Project the concatenated output through the output matrix.
917
- attended_values = self.output_linear(attended_values)
918
- return attended_values, attention_weights
919
-
920
- def flash2_forward(self, qkv):
921
  batch_size, seq_len, d_embed = qkv.shape
922
 
923
  # Feed the inputs through the K, Q, V matrices.
924
- # query : (batch_size, seq_len, d_model)
925
- # qkv : (batch_size, seq_len, 3, num_heads, d_kq)
926
- qkv = F.linear(
927
- qkv,
928
- self.in_proj,
929
- ).unflatten(
930
- -1,
931
- (3, self.num_heads, self.d_head)
932
- )
933
-
934
- attended_values = flash_attn_qkvpacked_func(
935
- qkv.bfloat16(),
936
  dropout_p=self.dropout.p if self.training else 0.0,
937
  softmax_scale=self.dot_product_scale,
938
  causal=True,
939
- window_size=self.window_size,
940
- alibi_slopes=self.alibi_slopes.float(),
941
- ).to(dtype=qkv.dtype)
942
  # attended_values: (batch_size, seqlen, nheads, headdim)
943
-
944
  # Concatentate heads back into d_embed
945
  attended_values = attended_values.view(batch_size, seq_len, d_embed)
946
 
947
  # Project the concatenated output through the output matrix.
948
  attended_values = self.output_linear(attended_values)
949
- return attended_values, None
1
  # See: https://huggingface.co/docs/transformers/custom_models
2
+ from typing import Optional, Tuple, Union, List
3
  import math
4
  import copy
5
  import sys
 
9
  from torch import nn, Tensor
10
  import torch.nn.init as init
11
  from torch.nn import functional as F
12
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutput, CausalLMOutputWithPast
13
  from transformers import (
14
  PreTrainedModel,
15
  PretrainedConfig,
 
18
  AutoModelForCausalLM,
19
  )
20
 
21
+ from transformers.utils import logging
22
+
23
+ from transformers.cache_utils import Cache, DynamicCache
24
+
25
  from transformers.utils import (
26
  is_flash_attn_2_available,
27
  is_flash_attn_greater_or_equal_2_10,
 
30
  if is_flash_attn_2_available():
31
  from flash_attn import flash_attn_qkvpacked_func, flash_attn_func
32
 
33
+ logger = logging.get_logger(__name__)
34
+
35
  # The model type string to bind.
36
  model_type = "walsh-causal-v1"
37
84
  layer_args=dict(),
85
  embedding_args=dict(),
86
  output_proj_args=dict(),
87
+
88
+ output_attentions=False,
89
+ output_hidden_states=False,
90
+ use_cache=True,
91
 
92
  **kwargs,
93
  ):
 
123
  self.layer_args = layer_args
124
  self.embedding_args = embedding_args
125
  self.output_proj_args = output_proj_args
126
+
127
+ self.output_attentions = output_attentions
128
+ self.output_hidden_states = output_hidden_states
129
+ self.use_cache = use_cache
130
 
131
  super().__init__(**kwargs)
132
 
 
218
  _no_split_modules = ["DeepNetLayer"]
219
  _supports_flash_attn_2 = True
220
  _supports_sdpa = True
221
+ _supports_cache_class = True
222
+ _skip_keys_device_placement = "past_key_values"
223
 
224
  def __init__(self, config):
225
  super().__init__(config)
 
237
  token_type_ids: Optional[torch.LongTensor] = None,
238
  position_ids: Optional[torch.LongTensor] = None,
239
  labels: Optional[torch.LongTensor] = None,
240
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
241
+ use_cache: Optional[bool] = None,
242
  output_attentions: Optional[bool] = None,
243
  output_hidden_states: Optional[bool] = None,
244
  return_dict: Optional[bool] = None,
245
  **kwargs,
246
  ) -> (Tensor, dict[str, Tensor]):
247
 
248
+ batch_size, seq_len = input_ids.shape
249
+
250
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
251
+ output_hidden_states = (
252
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
253
+ )
254
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
255
+
256
+ if use_cache:
257
+ # If legacy cache, convert to DynamicCache
258
+ use_legacy_cache = not isinstance(past_key_values, Cache)
259
+ if use_legacy_cache:
260
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
261
+
262
+
263
  if self.gradient_checkpointing and self.training:
264
+ if use_cache:
265
+ logger.warning_once(
266
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
267
+ )
268
+ use_cache = False
269
  gradient_checkpointing_func = self._gradient_checkpointing_func
270
  else:
271
  gradient_checkpointing_func = None
272
+
273
 
274
+ outputs = self.transformer_head(
275
  input_ids=input_ids,
276
+ position_ids=position_ids,
277
+ output_attentions=output_attentions,
278
  gradient_checkpointing_func=gradient_checkpointing_func,
279
+ past_key_values=past_key_values,
280
+ use_cache=use_cache,
281
+ output_hidden_states=output_hidden_states,
282
  )
283
+
284
+ logits = outputs["logits"].float()
285
+ attentions = outputs["attentions"]
286
 
287
  # Compute loss.
288
  if labels is not None:
289
  loss = self.loss_function(logits=logits, labels=labels, input_ids=input_ids)
290
  else:
291
  loss = None
292
+
293
+ # Convert back to legacy cache, if that's what we received
294
+ new_cache = outputs["past_key_values"]
295
+ if use_cache and new_cache is not None and use_legacy_cache:
296
+ new_cache = new_cache.to_legacy_cache()
297
 
298
+ return CausalLMOutputWithPast(
299
+ loss=loss,
300
+ logits=logits,
301
+ past_key_values=new_cache,
302
+ hidden_states=outputs["hidden_states"],
303
+ attentions=outputs["attentions"],
304
+ )
305
+
306
+ # Implementation from Huggingface Transformers,
307
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/mistral/modeling_mistral.py
308
+ # Note: We do not implement attention mask at present, so some of this code is not applicable
309
+ # TODO: Reenable attention mask support for batch inference..
310
+ def prepare_inputs_for_generation(
311
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
312
+ ):
313
+ # Omit tokens covered by past_key_values
314
+ if past_key_values is not None:
315
+ if isinstance(past_key_values, Cache):
316
+ cache_length = past_key_values.get_seq_length()
317
+ past_length = past_key_values.seen_tokens
318
+ max_cache_length = past_key_values.get_max_length()
319
+ else:
320
+ cache_length = past_length = past_key_values[0][0].shape[2]
321
+ max_cache_length = None
322
+
323
+ # Keep only the unprocessed tokens:
324
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
325
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
326
+ # input)
327
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
328
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
329
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
330
+ # input_ids based on the past_length.
331
+ elif past_length < input_ids.shape[1]:
332
+ input_ids = input_ids[:, past_length:]
333
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
334
+
335
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
336
+ if (
337
+ max_cache_length is not None
338
+ and attention_mask is not None
339
+ and cache_length + input_ids.shape[1] > max_cache_length
340
+ ):
341
+ attention_mask = attention_mask[:, -max_cache_length:]
342
+
343
+ position_ids = kwargs.get("position_ids", None)
344
+ if attention_mask is not None and position_ids is None:
345
+ # create position_ids on the fly for batch generation
346
+ position_ids = attention_mask.long().cumsum(-1) - 1
347
+ position_ids.masked_fill_(attention_mask == 0, 1)
348
+ if past_key_values:
349
+ position_ids = position_ids[:, -input_ids.shape[1] :]
350
+
351
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
352
+ # NOTE: Injecting positional embeddings is not yet supported.
353
+ if inputs_embeds is not None and past_key_values is None:
354
+ model_inputs = {"inputs_embeds": inputs_embeds}
355
+ else:
356
+ model_inputs = {"input_ids": input_ids}
357
+
358
+ model_inputs.update(
359
+ {
360
+ "position_ids": position_ids,
361
+ "past_key_values": past_key_values,
362
+ "use_cache": kwargs.get("use_cache"),
363
+ "attention_mask": attention_mask,
364
+ }
365
+ )
366
  return model_inputs
367
 
368
+ @staticmethod
369
+ def _reorder_cache(past_key_values, beam_idx):
370
+ reordered_past = ()
371
+ for layer_past in past_key_values:
372
+ reordered_past += (
373
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
374
+ )
375
+ return reordered_past
376
+
377
  def _make_embedding(self, config):
378
  embedding_cls = get_dynamic_class(config.embdding_cls)
379
  return embedding_cls(config.vocab_size, self.d_model, config.pad_index, **config.embedding_args)
 
397
  norm_cls = get_dynamic_class(config.norm_cls)
398
  return norm_cls(self.d_model)
399
 
400
+ def _make_self_attention(self, layer_idx, config):
401
  attention_cls = get_dynamic_class(config.attention_cls)
402
  # Map HF _attn_implementation to attn_type
403
  match config._attn_implementation:
 
418
  d_model=self.d_model,
419
  num_heads=config.num_attention_heads,
420
  attn_type=attn_type,
421
+ layer_idx=layer_idx,
422
+ config=config,
423
  **config.attention_args,
424
  )
425
 
426
+ def _make_feedforward(self, layer_idx, config):
427
  feedforward_cls = get_dynamic_class(config.feedforward_cls)
428
  return feedforward_cls(
429
  d_model=self.d_model,
430
  feedforward_dim=config.dim_feedforward,
431
  dropout=config.dropout,
432
  activation=self._make_activation(config),
433
+ layer_idx=layer_idx,
434
  **config.feedforward_args,
435
  )
436
 
437
+ def _make_layer(self, layer_idx, config):
438
  layer_cls = get_dynamic_class(config.layer_cls)
439
  return layer_cls(
440
  d_model=self.d_model,
441
  dropout=self._make_dropout(config),
442
+ attention=self._make_self_attention(layer_idx, config),
443
+ feedforward=self._make_feedforward(layer_idx, config),
444
  norm1=self._make_norm(config),
445
  norm2=self._make_norm(config),
446
+ layer_idx=layer_idx,
447
  **config.layer_args,
448
  )
449
 
 
451
  layer_stack_cls = get_dynamic_class(config.layer_stack_cls)
452
  return layer_stack_cls(
453
  layers=nn.ModuleList([
454
+ self._make_layer(layer_idx, config) for layer_idx in range(config.num_hidden_layers)
455
  ]),
456
  **config.layer_stack_args,
457
  )
 
487
  self.sqrt_d_model = d_model**0.5
488
  self.reset_parameters()
489
 
490
+ def forward(
491
+ self,
492
+ input_ids,
493
+ position_ids,
494
+ output_attentions,
495
+ gradient_checkpointing_func,
496
+ past_key_values,
497
+ use_cache,
498
+ output_hidden_states,
499
+ ):
500
+ outputs = self.layer_stack(
501
+ self.positional_encoder(self.embedding(input_ids) * self.sqrt_d_model, position_ids),
502
+ output_attentions=output_attentions,
503
+ gradient_checkpointing_func=gradient_checkpointing_func,
504
+ past_key_values=past_key_values,
505
+ use_cache=use_cache,
506
+ output_hidden_states=output_hidden_states,
507
  )
508
 
509
+ # Translate output states to logits.
510
+ outputs["logits"] = self.output_projection(outputs["last_hidden_state"])
511
+ del outputs["last_hidden_state"]
512
+ return outputs
513
 
514
  def reset_parameters(self):
515
  init.xavier_uniform_(self.output_projection.weight)
516
  init.constant_(self.output_projection.bias, 0.)
517
  init.normal_(self.embedding.weight, std=self.d_model**-0.5)
518
 
519
  # Converts a torch array of integers into their equivalent binary codes.
520
  def binary_tensor(x, bits):
521
  mask = 2**torch.arange(bits).to(x.device, x.dtype)
 
587
  # walsh = (hadamard_walsh_matrix(k)[:bits,:d_embed] -0.5) * self.gain
588
  self.register_buffer('walsh', walsh, persistent=False)
589
 
590
+ def forward(self, x, position_ids=None):
591
  seq_len = x.size(-2)
592
 
593
  # Get sequence of binary codes...
 
601
  shift = torch.randint(self.max_seq - seq_len + 1, (1,)).item()
602
  seq = self.binary_code[shift:seq_len + shift,:]
603
 
604
+ # When the cache is used for generation, after the first call, we are only passed a single token at a time,
605
+ # with the remaining tokens being in the cache. We need to make sure that the newly injected tokens have the
606
+ # correct relative position by indexing the codes with the position_ids.
607
+ elif position_ids != None:
608
+ seq = self.binary_code[position_ids, :]
609
+
610
  # Disable shifting when not training. This does not appear to change the evaluation loss, but
611
  # it does makes predictions easier to analyse when the attention weights are not shifting with each step.
612
  else:
 
629
  super().__init__()
630
  self.layers = layers
631
 
632
+ def forward(
633
+ self,
634
+ hidden_states,
635
+ output_attentions,
636
+ past_key_values,
637
+ use_cache,
638
+ output_hidden_states,
639
+ gradient_checkpointing_func=None,
640
+ ):
641
+ present_key_value = None
642
+ all_attentions = [] if output_attentions else None
643
+ all_hidden_states = [hidden_states] if output_hidden_states else None
644
+
645
  for layer in self.layers:
646
  if gradient_checkpointing_func is not None:
647
+ layer_outputs = gradient_checkpointing_func(
648
  layer.__call__,
649
+ hidden_states,
650
+ output_attentions,
651
+ past_key_values,
652
+ use_cache,
653
+ use_reentrant=False,
654
  )
655
  else:
656
+ layer_outputs = layer(
657
+ hidden_states,
658
+ output_attentions,
659
+ past_key_values,
660
+ use_cache,
661
+ )
662
 
663
+ hidden_states = layer_outputs["hidden_states"]
664
+
665
+ if output_hidden_states:
666
+ all_hidden_states.append(hidden_states)
667
+
668
+ if use_cache:
669
+ present_key_value = layer_outputs["past_key_values"]
670
+
671
+ if output_attentions:
672
+ all_attentions.append(layer_outputs["attentions"])
673
+
674
+ return dict(
675
+ last_hidden_state=hidden_states,
676
+ past_key_values=present_key_value,
677
+ hidden_states=hidden_states,
678
+ attentions=all_attentions,
679
+ )
680
 
681
  # DeepNet: Scaling Transformers to 1,000 Layers
682
  # https://arxiv.org/abs/2203.00555
683
+ # Note: This is a type of Pre-Layer-Norm Transformer layer.
684
  class DeepnetLayer(nn.Module):
685
  def __init__(
686
  self,
 
690
  norm1,
691
  norm2,
692
  dropout,
693
+ layer_idx,
694
  alpha=1.0,
695
  ):
696
  super().__init__()
 
702
  self.dropout = dropout
703
  # Deepnet alpha
704
  self.alpha = alpha
705
+ self.layer_idx = layer_idx
706
 
707
+ def forward(
708
+ self,
709
+ hidden_states,
710
+ output_attentions,
711
+ past_key_values,
712
+ use_cache,
713
+ ):
714
  # Keep input as residual
715
+ residual = hidden_states * self.alpha
716
 
717
  # Compute attention
718
+ attn_outputs = self.attention(
719
+ hidden_states,
720
+ past_key_values=past_key_values,
721
+ use_cache=use_cache,
722
+ output_attentions=output_attentions
723
+ )
724
+
725
+ hidden_states = attn_outputs["hidden_states"]
726
 
727
  # Add attention with residual and normalize.
728
+ hidden_states = self.norm1(residual + self.dropout(hidden_states))
729
 
730
  # Keep output as next residual.
731
+ residual = hidden_states * self.alpha
732
 
733
  # Pass through feedforward network.
734
+ hidden_states = self.feedforward(hidden_states)
735
 
736
  # Combine residual and ff output, then normalize again.
737
+ hidden_states = self.norm2(residual + self.dropout(hidden_states))
738
 
739
+ return dict(
740
+ hidden_states=hidden_states,
741
+ attentions=attn_outputs["attentions"],
742
+ past_key_values=attn_outputs["past_key_values"]
743
+ )
744
 
745
  # A vanilla MLP transfomer layer.
746
  class FeedforwardLayer(nn.Module):
 
749
  d_model: int,
750
  feedforward_dim: int,
751
  dropout,
752
+ layer_idx,
753
  activation=nn.ReLU(),
754
  beta=1.0,
755
  bias=True,
 
772
  init.constant_(self.linear1.bias, 0.)
773
  init.constant_(self.linear2.bias, 0.)
774
 
775
  class CausalSelfAttention(nn.Module):
776
  def __init__(
777
  self,
 
782
  # torch: Use pytorch "scaled_dot_product_attention()"; faster; generally good compatibility; does not support returning attn weights.
783
  # flash2: Use Flash-Attention2 implementation; fastest; limited to int16 and bfloat16 types; least memory usage.
784
  attn_type,
785
+ layer_idx,
786
+ config,
787
  beta=1.0,
788
  dropout=0.1,
789
  ):
 
792
  self.num_heads = num_heads
793
  self.beta = beta
794
  self.attn_type = attn_type
795
+ self.layer_idx = layer_idx
796
+ self.config = config
797
 
798
  assert d_model % num_heads == 0, "d_model must be evenly divisible by num_heads"
799
 
 
824
  init.constant_(self.in_proj.bias, 0.)
825
  init.constant_(self.output_linear.bias, 0.)
826
 
827
+ # Project QKV input through input matrices, reshape to (batch_size, n_heads, seq_len, d_model), and apply cache.
828
+ def project_input(self, qkv, past_key_values):
829
  batch_size, seq_len, d_embed = qkv.shape
830
+ proj = self.in_proj(qkv)
831
+ query, key, value = proj.chunk(chunks=3, dim=-1)
832
 
  # Split projections into multiple heads and swap position of sequence / heads dimension
834
  query = query.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
835
  key = key.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
836
  value = value.view(batch_size, seq_len, self.num_heads, self.d_head).transpose(1, 2)
837
 
838
+ # Update the cache values.
839
+ if past_key_values is not None:
840
+ key, value = past_key_values.update(key, value, self.layer_idx)
841
+ return query, key, value
842
+
843
+ def forward(
844
+ self,
845
+ qkv,
846
+ output_attentions,
847
+ past_key_values,
848
+ use_cache,
849
+ ):
850
+ attn_type = self.attn_type
851
+ if output_attentions and attn_type != "native":
852
+ logger.warning_once(
853
+ "CausalSelfAttention(output_attentions=True) and attn_type is not 'native': "
854
+ "Forcing native attention."
855
+ )
856
+ attn_type = "native"
857
+
858
+ if attn_type == "flash2":
859
+ if use_cache is None or use_cache == False:
860
+ return self.flash2_forward(qkv)
861
+ else:
862
+ return self.flash2_forward_cached(qkv, past_key_values)
863
+
864
+ # qkv: (batch_size, seq_len, d_embed)
865
+ batch_size, seq_len, d_embed = qkv.shape
866
+
867
+ # Feed the inputs through the K, Q, V matrices.
868
+ query, key, value = self.project_input(qkv, past_key_values)
869
+ kv_seq_len = key.shape[-2]
870
+
871
  # Default to returning empty attention weights.
872
+ attentions = None
873
+
874
+ # https://github.com/pytorch/pytorch/issues/112577
875
 
876
+ if attn_type == "torch":
877
  # This context manager can be used to force which implementation to use.
878
  #with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
879
  attended_values = F.scaled_dot_product_attention(
 
882
  value,
883
  attn_mask=None,
884
  dropout_p=self.dropout.p if self.training else 0.0,
885
+ is_causal=(seq_len > 1),
886
  scale=self.dot_product_scale
887
  )
888
  # "native" scaled-dot-product attention implementation.
 
891
  scores = torch.matmul(query, key.transpose(-2, -1)) * self.dot_product_scale
892
 
893
  # Mask future positions from the past
894
+ if seq_len > 1:
895
+ scores.masked_fill_(
896
+ torch.tril(
897
+ torch.ones(seq_len, kv_seq_len, dtype=torch.bool, device=qkv.device),
898
+ diagonal=0,
899
+ ).logical_not(),
900
+ float('-inf'),
901
+ )
902
 
903
  # Calculate the attention weights; avoid NANs that might emerge from zeros in softmax's denominator
904
+ attentions = self.dropout(torch.softmax(scores, dim=-1).clamp(min=1e-10))
905
  del scores
906
 
907
  # Use the attention weights to get a weighted combination of value vectors
908
+ attended_values = torch.matmul(attentions, value)
909
+ if not output_attentions:
910
+ del attentions
911
+ attentions = None
912
 
913
  # Concatenate attention heads and project to original embedding size using the output linear layer
914
  attended_values = attended_values.transpose(1, 2).contiguous().view(batch_size, seq_len, d_embed)
915
 
916
  # Project the concatenated output through the output matrix.
917
  attended_values = self.output_linear(attended_values)
918
+ return dict(
919
+ hidden_states=attended_values,
920
+ attentions=attentions,
921
+ past_key_values=past_key_values
922
+ )
923
+
924
+ # No cache support, but faster
925
+ def flash2_forward(
926
+ self,
927
+ qkv,
928
+ ):
929
  batch_size, seq_len, d_embed = qkv.shape
930
 
931
  # Feed the inputs through the K, Q, V matrices.
932
  # query : (batch_size, seq_len, d_model)
933
  # qkv : (batch_size, seq_len, 3, num_heads, d_kq)
934
+ # Feed the inputs through the K, Q, V matrices.
935
+ # query : (batch_size, seq_len, d_model)
936
+ # qkv : (batch_size, seq_len, 3, num_heads, d_kq)
937
+
938
  qkv = self.in_proj(qkv).unflatten(
939
  -1,
940
  (3, self.num_heads, self.d_head)
941
  )
942
+
943
  attended_values = flash_attn_qkvpacked_func(
944
+ self._downcast_to_float16(qkv)[0],
945
  dropout_p=self.dropout.p if self.training else 0.0,
946
  softmax_scale=self.dot_product_scale,
947
  causal=True,
 
953
 
954
  # Project the concatenated output through the output matrix.
955
  attended_values = self.output_linear(attended_values)
956
+ return dict(
957
+ hidden_states=attended_values,
958
+ attentions=None,
959
+ past_key_values=None
960
+ )
961
 
962
+ # See https://github.com/huggingface/transformers/blob/main/src/transformers/cache_utils.py
963
+ #https://huggingface.co/docs/transformers/internal/generation_utils
964
+ def flash2_forward_cached(
965
  self,
966
+ qkv,
967
+ past_key_values,
968
  ):
969
  batch_size, seq_len, d_embed = qkv.shape
970
 
971
  # Feed the inputs through the K, Q, V matrices.
972
+ query, key, value = self.project_input(qkv, past_key_values)
973
+ query, key, value = self._downcast_to_float16(query, key, value)
974
+
975
+ # Expected inputs to flash2:
976
+ # q: (batch_size, seqlen, nheads, headdim)
977
+ # k: (batch_size, seqlen, nheads_k, headdim)
978
+ # v: (batch_size, seqlen, nheads_k, headdim)
979
+ query = query.transpose(1, 2)
980
+ key = key.transpose(1, 2)
981
+ value = value.transpose(1, 2)
982
+
983
+ attended_values = flash_attn_func(
984
+ q=query,
985
+ k=key,
986
+ v=value,
987
  dropout_p=self.dropout.p if self.training else 0.0,
988
  softmax_scale=self.dot_product_scale,
989
  causal=True,
990
+ )
991
  # attended_values: (batch_size, seqlen, nheads, headdim)
992
+
993
  # Concatentate heads back into d_embed
994
  attended_values = attended_values.view(batch_size, seq_len, d_embed)
995
 
996
  # Project the concatenated output through the output matrix.
997
  attended_values = self.output_linear(attended_values)
998
+ return dict(
999
+ hidden_states=attended_values,
1000
+ attentions=None,
1001
+ past_key_values=past_key_values
1002
+ )
1003
+
1004
+ def _downcast_to_float16(self, *args):
1005
+ if args[0].dtype != torch.float32:
1006
+ return args
1007
+
1008
+ if torch.is_autocast_enabled():
1009
+ target_dtype = torch.get_autocast_gpu_dtype()
1010
+ # Handle the case where the model is quantized
1011
+ elif hasattr(self.config, "_pre_quantization_dtype"):
1012
+ target_dtype = self.config._pre_quantization_dtype
1013
+ else:
1014
+ target_dtype = self.output_linear.weight.dtype
1015
+
1016
+ logger.warning_once(
1017
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
1018
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
1019
+ f" {target_dtype}."
1020
+ )
1021
+
1022
+ return (arg.to(target_dtype) for arg in args)