Commit f5c007b (1 parent: cbc604b) by Shaltiel

Added flash attention

configuration_megatron_gpt.py CHANGED
@@ -81,6 +81,8 @@ class MegatronGPTConfig(PretrainedConfig):
             Whether to calculate and apply the relative position bias within the attention function.
             If this is False, then model.generate will require you to calculate the triangular attention
             mask and pass it through in the attention mask.
+        use_flash_attention (`bool`, *optional*, defaults to `False`):
+            When calculating attention, whether to attempt to use flash attention if it's installed, or to always skip and use the regular method.
         rope_scaling (`Dict`, *optional*):
             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling
             strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format
@@ -118,6 +120,7 @@ class MegatronGPTConfig(PretrainedConfig):
         eos_token_id=2,
         tie_word_embeddings=False,
         rope_scaling=None,
+        use_flash_attention=False,
         **kwargs,
     ):
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -141,6 +144,7 @@ class MegatronGPTConfig(PretrainedConfig):
         self.use_cache = use_cache
         self.self_attention_relative_position_bias = self_attention_relative_position_bias
         self.tie_word_embeddings = tie_word_embeddings
+        self.use_flash_attention = use_flash_attention
         self.rope_scaling = rope_scaling
         self._rope_scaling_validation()

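Usage note (not part of the commit): a minimal sketch of how the new `use_flash_attention` flag could be switched on when loading the model with `trust_remote_code`. The checkpoint path and dtype below are placeholders; the flag only has an effect at runtime when the `flash_attn` package is importable, otherwise the regular attention path is used.

# Hypothetical usage sketch -- checkpoint path and dtype are placeholders, not part of this repo.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("path/to/megatron-gpt-checkpoint", trust_remote_code=True)
config.use_flash_attention = True  # new flag introduced by this commit; defaults to False

model = AutoModelForCausalLM.from_pretrained(
    "path/to/megatron-gpt-checkpoint",
    config=config,
    torch_dtype=torch.float16,  # the flash path casts q/k/v to half precision in any case
    trust_remote_code=True,
)
# If flash_attn is not installed (HAS_FLASH is False), attention silently falls back to _attn.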
modeling_megatron_gpt.py CHANGED
@@ -21,6 +21,7 @@
 """ PyTorch MegatronGPT model."""

 from dataclasses import dataclass
+import math
 from typing import Optional, Tuple, Union

 import torch
@@ -43,8 +44,21 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
+# try to load using a relative path, but if it fails try loading it directly
 from .configuration_megatron_gpt import MegatronGPTConfig

+try:
+    from flash_attn.bert_padding import unpad_input, pad_input
+    from flash_attn import flash_attn_varlen_func as flash_attn_func
+    HAS_FLASH = True
+except:
+    try:
+        from flash_attn.flash_attn_interface import flash_attn_unpadded_func as flash_attn_func
+        HAS_FLASH = True
+    except:
+        HAS_FLASH = False
+
+
 def get_activation(act):
     if act in ["gelu", "geglu", "fast-geglu"]:
         act = 'gelu'
@@ -111,9 +125,10 @@ class MegatronGPTAttention(nn.Module):
         self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False)
         self._init_rope()

+        self.norm_factor_float = math.sqrt(self.head_size if config.normalize_attention_scores else 1.0)
         self.register_buffer(
             "norm_factor",
-            torch.sqrt(torch.tensor(self.head_size if config.normalize_attention_scores else 1.0, dtype=torch.float32)).to(torch.get_default_dtype()),
+            torch.tensor(self.norm_factor_float, dtype=torch.float32).to(torch.get_default_dtype()),
             persistent=False,
         )
         self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.bias)
@@ -207,7 +222,10 @@ class MegatronGPTAttention(nn.Module):
         present = (key, value) if use_cache else None

         # Compute attention
-        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+        if not HAS_FLASH or output_attentions or head_mask is not None or not self.config.use_flash_attention:
+            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+        else:
+            attn_output = self._flash_attn(query, key, value, attention_mask)

         # Reshape outputs
         attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size)
@@ -244,6 +262,34 @@ class MegatronGPTAttention(nn.Module):
         # -> [bs, seq_len, hidden_size]
         return tensor

+    def _flash_attn(self, query, key, value, attention_mask=None):
+        # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size]
+        # compute causal mask from causal mask buffer
+        batch_size, num_attention_heads, query_seq_length, attn_head_size = query.size()
+
+        # transpose_for_scores_flash returns b s h d
+        query_layer = query.transpose(1, 2).half()
+        key_layer = key.transpose(1, 2).half()
+        value_layer = value.transpose(1, 2).half()
+
+        # fix the mask
+        attention_mask = (attention_mask == 0).int().squeeze(1).squeeze(1)
+        query_layer, query_indicies, cu_seqlens_q, max_seqlen_q = unpad_input(query_layer, attention_mask[:, -query_seq_length:])
+        key_layer, _, cu_seqlens_k, max_seqlen_k = unpad_input(key_layer, attention_mask)
+        value_layer, _, _, _ = unpad_input(value_layer, attention_mask)
+
+        # returns [batch * seq, nheads, headdim]
+        context_layer = flash_attn_func(query_layer, key_layer, value_layer,
+                                        cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
+                                        dropout_p=self.config.attention_dropout, softmax_scale=1 / self.norm_factor_float, causal=self.self_attention_relative_position_bias if max_seqlen_q > 1 else False)
+
+        # fix the shape to be [bs, num_attention_heads, seq_len, attn_head_size]
+        context_layer = pad_input(context_layer, query_indicies, batch_size, query_seq_length)
+        context_layer = context_layer.view(batch_size, query_seq_length, num_attention_heads, attn_head_size) \
+            .transpose(1, 2)
+
+        return context_layer.to(value.dtype)
+
     def _attn(self, query, key, value, attention_mask=None, head_mask=None):
         # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size]
         # compute causal mask from causal mask buffer
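For reference, a self-contained sketch (not part of the commit) of the variable-length kernel that `_flash_attn` calls. It builds the `cu_seqlens` boundaries by hand instead of going through `unpad_input`, assumes flash-attn 2.x and a CUDA device, and uses made-up shapes; the softmax scale plays the same role as `1 / self.norm_factor_float` in the diff.

# Standalone sketch of flash_attn_varlen_func on two packed sequences (lengths 5 and 3).
# Assumes flash-attn 2.x is installed and a CUDA device is available.
import torch
from flash_attn import flash_attn_varlen_func

num_heads, head_dim = 4, 64
seq_lens = [5, 3]
total_tokens = sum(seq_lens)

# cumulative sequence boundaries replace the padding mask: cumsum of seq_lens with a leading 0
cu_seqlens = torch.tensor([0, 5, 8], dtype=torch.int32, device="cuda")
max_seqlen = max(seq_lens)

# packed [total_tokens, num_heads, head_dim] layout, the same layout unpad_input produces
q = torch.randn(total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)

out = flash_attn_varlen_func(
    q, k, v,
    cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
    max_seqlen_q=max_seqlen, max_seqlen_k=max_seqlen,
    dropout_p=0.0,
    softmax_scale=1.0 / head_dim ** 0.5,  # same role as 1 / self.norm_factor_float above
    causal=True,
)
print(out.shape)  # torch.Size([8, 4, 64]) -- still packed; pad_input would restore [bs, seq, ...]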