test-flex-gpt / attention.py
oweller2
fix
9bfde91
# Copyright 2024 **AUTHORS_TODO**
# License: Apache-2.0
# Copyright 2022 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0
# Copyright 2023 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2023, Tri Dao.
import torch
import torch.nn as nn
import torch.nn.functional as F
import warnings
from typing import Optional
import importlib.metadata
import logging
import math
from .bert_padding import pad_input, unpad_input_only, index_first_axis
from .configuration_bert import FlexBertConfig, maybe_add_padding
from .normalization import get_norm_layer
from .initialization import ModuleType, init_weights
IMPL_USE_FLASH3 = False
IMPL_USE_FLASH2 = False
try:
from flash_attn_interface import flash_attn_varlen_func
IMPL_USE_FLASH3 = True
except ImportError:
pass
# Import Flash Attention 2, which supports ALiBi https://github.com/Dao-AILab/flash-attention
try:
from flash_attn import flash_attn_varlen_qkvpacked_func, flash_attn_qkvpacked_func # type: ignore
installed_version = importlib.metadata.version("flash_attn") # type: ignore
if installed_version < "2.5.7":
raise ImportError("newer version of flash_attn required (>= 2.5.7)")
IMPL_USE_FLASH2 = True
except ImportError:
pass
try:
from flash_attn.layers.rotary import RotaryEmbedding # type: ignore
from .rotary import UnpaddedRotaryEmbedding # type: ignore
except ImportError:
RotaryEmbedding = None
UnpaddedRotaryEmbedding = None
logger = logging.getLogger(__name__)
class BertAlibiUnpadSelfAttention(nn.Module):
"""Performs multi-headed self attention on a batch of unpadded sequences.
If Flash Attention 2 is installed, this module uses Flash Attention to greatly improve throughput.
The Flash Attention implementation used in MosaicBERT supports arbitrary attention biases (which
we use to implement ALiBi). If either Flash Attention 2 is not installed the implementation will
default to a math-equivalent pytorch version, which is much slower.
See `forward` method for additional details.
"""
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.p_dropout = config.attention_probs_dropout_prob
self.Wqkv = nn.Linear(self.all_head_size, 3 * config.hidden_size)
self.deterministic_fa2 = getattr(config, "deterministic_fa2", False)
# Warn if defaulting to pytorch because of import issues
if not IMPL_USE_FLASH2:
warnings.warn(
"Unable to import flash_attn; defaulting MosaicBERT attention implementation to "
"vanilla PyTorch (this will reduce throughput when using this model)."
)
def forward(
self,
hidden_states: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: int,
indices: torch.Tensor,
attn_mask: torch.Tensor,
bias: torch.Tensor,
slopes: torch.Tensor,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations: vanilla attention with ALiBi, and Flash Attention 2 with ALiBi
The arguments are unpadded. The vanilla implementation of attention requires padded arguments while the
Flash Attention implementation does not. If using vanilla we first call `pad_input`. Once we compute
attention, we re-unpad our outputs for the other layers. The pad/unpad operations add overhead, but not
sending pad tokens through ffs saves compute.
Args:
hidden_states: (total_nnz, dim)
cu_seqlens: (batch + 1,)
max_seqlen: int
indices: (total_nnz,)
attn_mask: (batch, max_seqlen)
bias: (batch, heads, max_seqlen, max_seqlen)
slopes: (heads) or (batch, heads)
Returns:
attention: (total_nnz, dim)
"""
bs, dim = hidden_states.shape
qkv = self.Wqkv(hidden_states)
# Option 1: Flash Attention with ALiBi
if IMPL_USE_FLASH2:
qkv = qkv.view(-1, 3, self.num_attention_heads, self.attention_head_size)
assert 1 <= len(slopes.shape) <= 2, f"{slopes=}"
assert slopes.shape[-1] == self.num_attention_heads, f"{slopes=}"
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16
# If FA2 is supported, bfloat16 must be supported
# as of FA2 2.4.2. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attention = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
alibi_slopes=slopes,
causal=self.is_causal
)
attention = attention.to(orig_dtype) # type: ignore
else:
attention = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
alibi_slopes=slopes,
causal = self.is_causal
)
else:
assert not self.is_causal, f"causal mask not implemented here yet"
assert False
qkv = bert_padding.pad_input(qkv, indices, cu_seqlens.shape[0] - 1, max_seqlen) # batch, max_seqlen, thd
unpad_bs, *_ = qkv.shape
qkv = qkv.view(unpad_bs, -1, 3, self.num_attention_heads, self.attention_head_size)
# if we have nonzero attention dropout (e.g. during fine-tuning) or no Triton, compute attention in PyTorch
q = qkv[:, :, 0, :, :].permute(0, 2, 1, 3) # b h s d
k = qkv[:, :, 1, :, :].permute(0, 2, 3, 1) # b h d s
v = qkv[:, :, 2, :, :].permute(0, 2, 1, 3) # b h s d
attention_scores = torch.matmul(q, k) / math.sqrt(self.attention_head_size)
attention_scores = attention_scores + bias
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
attention_probs = self.dropout(attention_probs)
attention = torch.matmul(attention_probs, v).permute(0, 2, 1, 3) # b s h d
attention = bert_padding.unpad_input_only(attention, torch.squeeze(attn_mask) == 1)
return attention.view(bs, dim)
# Copy of transformer's library BertSelfOutput that will not be caught by surgery methods looking for HF BERT modules.
class BertSelfOutput(nn.Module):
"""Computes the output of the attention layer.
This module is modeled after the Hugging Face BERT's
:class:`~transformers.model.bert.modeling_bert.BertSelfOutput`.
The implementation is identical. Rather than use the original module
directly, we re-implement it here so that Mosaic BERT's modules will not
be affected by any Composer surgery algorithm that modifies Hugging Face
BERT modules.
"""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.LayerNorm = get_norm_layer(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
class BertAlibiUnpadAttention(nn.Module):
"""Chains attention, Dropout, and LayerNorm for Mosaic BERT."""
def __init__(self, config):
super().__init__()
self.self = BertAlibiUnpadSelfAttention(config)
self.output = BertSelfOutput(config)
def forward(
self,
input_tensor: torch.Tensor,
cu_seqlens: torch.Tensor,
max_s: int,
subset_idx: Optional[torch.Tensor] = None,
indices: Optional[torch.Tensor] = None,
attn_mask: Optional[torch.Tensor] = None,
bias: Optional[torch.Tensor] = None,
slopes: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass for scaled self-attention without padding.
Arguments:
input_tensor: (total_nnz, dim)
cu_seqlens: (batch + 1,)
max_s: int
subset_idx: () set of indices whose values we care about at the end of the layer
(e.g., the masked tokens, if this is the final layer).
indices: None or (total_nnz,)
attn_mask: None or (batch, max_seqlen)
bias: None or (batch, heads, max_seqlen, max_seqlen)
slopes: None or (batch, heads) or (heads,)
"""
assert (bias is None) == (slopes is None), f"{bias=}, {slopes=}"
assert False
self_output = self.self(input_tensor, cu_seqlens, max_s, indices, attn_mask, bias, slopes)
if subset_idx is not None:
return self.output(
bert_padding.index_first_axis(self_output, subset_idx),
bert_padding.index_first_axis(input_tensor, subset_idx),
)
else:
return self.output(self_output, input_tensor)
class FlexBertAttentionBase(nn.Module):
"""A FlexBERT attention base class for type hints."""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__()
self.config = config
self.layer_id = layer_id
def _init_weights(self, reset_params: bool = False):
raise NotImplementedError("This is a base class and should not be used directly.")
def forward(self, hidden_states: torch.Tensor, attn_mask: torch.Tensor, **kwargs) -> torch.Tensor:
raise NotImplementedError("This is a base class and should not be used directly.")
def extra_repr(self) -> str:
repr = ""
if hasattr(self, "num_attention_heads"):
repr += f"num_attention_heads={self.num_attention_heads}"
if hasattr(self, "attn_head_size"):
repr += f", attn_head_size={self.attn_head_size}"
if hasattr(self, "sliding_window"):
repr += f", sliding_window={self.sliding_window if self.sliding_window != (-1, -1) else 'False'}"
if hasattr(self, "use_fa2"):
repr += f", use_fa2={self.use_fa2}"
if hasattr(self, "deterministic_fa2"):
repr += f", deterministic_fa2={self.deterministic_fa2}"
return repr
class FlexBertUnpadAttention(FlexBertAttentionBase):
"""Performs multi-headed self attention on a batch of unpadded sequences.
If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.
See `forward` method for additional detail.
"""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__(config=config, layer_id=layer_id)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attn_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attn_head_size
self.p_dropout = config.attention_probs_dropout_prob
self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attn_qkv_bias)
self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attn_out_bias)
self.out_drop = (
nn.Dropout(config.attn_out_dropout_prob) if config.attn_out_dropout_prob > 0.0 else nn.Identity()
)
self.use_fa2 = config.use_fa2
self.deterministic_fa2 = config.deterministic_fa2
self.use_sdpa_attn_mask = config.use_sdpa_attn_mask
if config.global_attn_every_n_layers > 0:
if config.sliding_window == -1:
raise ValueError("global_attn_every_n_layers` requires `sliding_window` to be set")
if layer_id % config.global_attn_every_n_layers != 0:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
else:
self.sliding_window = (-1, -1)
else:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
# Warn if defaulting to pytorch because of import issues
if not IMPL_USE_FLASH2 and self.use_fa2:
logger.warn_once(
"Unable to import flash_attn; defaulting FlexBERT attention implementation to PyTorch's"
" SDPA kernel. This requires padding and unpadding inputs, which will add some overhead."
)
self.use_fa2 = False
if not self.use_fa2:
if not self.use_sdpa_attn_mask:
logger.warn_once(
"SDPA attention is being used without an attention mask. Including padding in the "
" attention calculation may cause differences from the Flash Attention implementation."
)
else:
logger.warn_once(
"SDPA attention with an attention mask doesn't use the Flash Attention kernel and will"
" use more memory during the backward pass. Use the FA2 backend for linear memory scaling"
" with sequence length."
)
if self.sliding_window[0] > 0:
raise ValueError("Sliding window is not implemented for the PyTorch SDPA path. Use the FA2 backend.")
def _init_weights(self, reset_params: bool = False):
init_weights(
self.config,
self.Wqkv,
layer_dim=self.config.hidden_size,
layer_id=None,
type_of_module=ModuleType.in_module,
)
init_weights(
self.config,
self.Wo,
layer_dim=self.config.hidden_size,
layer_id=self.layer_id,
type_of_module=ModuleType.out_module,
)
def forward(
self,
hidden_states: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: int,
indices: torch.Tensor,
attn_mask: torch.Tensor,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations supported: PyTorch's SDPA attention and Flash Attention 2.
The arguments are unpadded. The SDPA implementation of attention requires padded arguments while the
Flash Attention implementation does not. If using SDPA we first call `pad_input`. Once we compute
attention, we re-unpad our outputs for the other layers. The pad/unpad operations add overhead, but not
sending pad tokens through ffs saves compute.
Args:
hidden_states: (total_nnz, dim)
cu_seqlens: (batch + 1,)
max_seqlen: int
indices: (total_nnz,)
attn_mask: (batch, max_seqlen)
Returns:
attention: (total_nnz, dim)
"""
bs, dim = hidden_states.shape
qkv = self.Wqkv(hidden_states)
if self.use_fa2:
qkv = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size)
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attn = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
attn = attn.to(orig_dtype) # type: ignore
else:
attn = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
attn = attn.view(bs, dim)
else:
assert not self.is_causal, f"causal mask not implemented here yet"
assert False
qkv = bert_padding.pad_input(qkv, indices, cu_seqlens.shape[0] - 1, max_seqlen) # batch, max_seqlen, thd
unpad_bs, seqlen, _ = qkv.shape
qkv = qkv.view(unpad_bs, -1, 3, self.num_attention_heads, self.attn_head_size)
q, k, v = qkv.transpose(3, 1).unbind(dim=2) # b h s d
attn = F.scaled_dot_product_attention(
q,
k,
v,
dropout_p=self.p_dropout,
attn_mask=attn_mask[:, None, None, :seqlen].to(torch.bool).expand(unpad_bs, 1, seqlen, seqlen)
if self.use_sdpa_attn_mask
else None,
)
attn = attn.transpose(1, 2).view(unpad_bs, -1, dim) # b s h d
attn = bert_padding.unpad_input_only(attn, torch.squeeze(attn_mask) == 1)
return self.out_drop(self.Wo(attn))
class FlexBertUnpadParallelAttention(FlexBertAttentionBase):
"""Computes the output of the multi-headed self parallel attention on a batch of unpadded sequences
If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.
See `forward` method for additional detail.
"""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__(config=config, layer_id=layer_id)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attn_head_size = int(config.hidden_size / config.num_attention_heads)
self.hidden_size = config.hidden_size
self.p_dropout = config.attention_probs_dropout_prob
self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attn_out_bias)
self.out_drop = (
nn.Dropout(config.attn_out_dropout_prob) if config.attn_out_dropout_prob > 0.0 else nn.Identity()
)
self.use_fa2 = config.use_fa2
self.deterministic_fa2 = config.deterministic_fa2
self.use_sdpa_attn_mask = config.use_sdpa_attn_mask
if config.global_attn_every_n_layers > 0:
if config.sliding_window == -1:
raise ValueError("global_attn_every_n_layers` requires `sliding_window` to be set")
if layer_id % config.global_attn_every_n_layers != 0:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
else:
self.sliding_window = (-1, -1)
else:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
# Warn if defaulting to pytorch because of import issues
if not IMPL_USE_FLASH2 and self.use_fa2:
logger.warn_once(
"Unable to import flash_attn; defaulting FlexBERT attention implementation to PyTorch's"
" SDPA kernel. This requires padding and unpadding inputs, which will add some overhead."
)
self.use_fa2 = False
if not self.use_fa2:
if not self.use_sdpa_attn_mask:
logger.warn_once(
"SDPA attention is being used without an attention mask. Including padding in the "
" attention calculation may cause differences from the Flash Attention implementation."
)
else:
logger.warn_once(
"SDPA attention with an attention mask doesn't use the Flash Attention kernel and will"
" use more memory during the backward pass. Use the FA2 backend for linear memory scaling"
" with sequence length."
)
if self.sliding_window[0] > 0:
raise ValueError("Sliding window is not implemented for the PyTorch SDPA path. Use the FA2 backend.")
def _init_weights(self, reset_params: bool = False):
init_weights(
self.config,
self.Wo,
layer_dim=self.config.hidden_size,
layer_id=self.layer_id,
type_of_module=ModuleType.out_module,
)
def forward(
self,
qkv: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: int,
indices: torch.Tensor,
attn_mask: torch.Tensor,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations supported: PyTorch's SDPA attention and Flash Attention 2.
The arguments are unpadded. The SDPA implementation of attention requires padded arguments while the
Flash Attention implementation does not. If using SDPA we first call `pad_input`. Once we compute
attention, we re-unpad our outputs for the other layers. The pad/unpad operations add overhead, but not
sending pad tokens through ffs saves compute.
Args:
qkv: (total_nnz, 3 * dim)
cu_seqlens: (batch + 1,)
max_seqlen: int
indices: (total_nnz,)
attn_mask: (batch, max_seqlen)
Returns:
attention: (total_nnz, dim)
"""
bs = qkv.shape[0]
dim = self.hidden_size
if self.use_fa2:
qkv = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size)
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attn = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
attn = attn.to(orig_dtype) # type: ignore
else:
attn = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
attn = attn.view(bs, dim)
else:
assert not self.is_causal, f"causal mask not implemented here yet"
assert False
qkv = bert_padding.pad_input(qkv, indices, cu_seqlens.shape[0] - 1, max_seqlen) # batch, max_seqlen, thd
unpad_bs, seqlen, _ = qkv.shape
qkv = qkv.view(unpad_bs, -1, 3, self.num_attention_heads, self.attn_head_size)
q, k, v = qkv.transpose(3, 1).unbind(dim=2) # b h s d
attn = F.scaled_dot_product_attention(
q,
k,
v,
dropout_p=self.p_dropout,
attn_mask=attn_mask[:, None, None, :seqlen].to(torch.bool).expand(unpad_bs, 1, seqlen, seqlen)
if self.use_sdpa_attn_mask
else None,
)
attn = attn.transpose(1, 2).view(unpad_bs, -1, dim) # b s h d
attn = bert_padding.unpad_input_only(attn, torch.squeeze(attn_mask) == 1)
return self.out_drop(self.Wo(attn.view(bs, dim)))
class FlexBertPaddedAttention(FlexBertAttentionBase):
"""Performs multi-headed self attention on a batch of padded sequences.
This module supports two attention implementations:
1. Flash Attention 2 (if installed), which improves throughput.
2. PyTorch's scaled_dot_product_attention.
See `forward` method for additional detail.
"""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__(config=config, layer_id=layer_id)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attn_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attn_head_size
self.p_dropout = config.attention_probs_dropout_prob
self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attn_qkv_bias)
self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attn_out_bias)
self.out_drop = (
nn.Dropout(config.attn_out_dropout_prob) if config.attn_out_dropout_prob > 0.0 else nn.Identity()
)
self.use_fa2 = config.use_fa2
self.deterministic_fa2 = config.deterministic_fa2
self.use_sdpa_attn_mask = config.use_sdpa_attn_mask
if config.global_attn_every_n_layers > 0:
if config.sliding_window == -1:
raise ValueError("global_attn_every_n_layers` requires `sliding_window` to be set")
if layer_id % config.global_attn_every_n_layers != 0:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
else:
self.sliding_window = (-1, -1)
else:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
if not IMPL_USE_FLASH2 and self.use_fa2:
self.use_fa2 = False
if self.use_fa2 and self.use_sdpa_attn_mask:
logger.warn_once(
"Flash Attention 2 does not support attention masks. Use unpadded attention "
"the equivalent functionality of masking out padding tokens."
)
if not self.use_fa2 and self.sliding_window[0] > 0:
raise ValueError("Sliding window is not implemented for the PyTorch SDPA path. Use the FA2 backend.")
def _init_weights(self, reset_params: bool = False):
init_weights(
self.config,
self.Wqkv,
layer_dim=self.config.hidden_size,
layer_id=None,
type_of_module=ModuleType.in_module,
)
init_weights(
self.config,
self.Wo,
layer_dim=self.config.hidden_size,
layer_id=self.layer_id,
type_of_module=ModuleType.out_module,
)
def forward(
self,
hidden_states: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations supported:
Flash Attention 2 and PyTorch's scaled_dot_product_attention.
Args:
hidden_states: (batch, seqlen, dim)
attn_mask: (batch, seqlen)
Returns:
attention: (batch, seqlen, dim)
"""
bs, seqlen, dim = hidden_states.shape
qkv = self.Wqkv(hidden_states)
if self.use_fa2:
qkv = qkv.view(bs, seqlen, 3, self.num_attention_heads, self.attn_head_size)
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attn = flash_attn_qkvpacked_func(
qkv,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
attn = attn.to(orig_dtype) # type: ignore
else:
attn = flash_attn_qkvpacked_func(
qkv,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
else:
assert not self.is_causal, f"causal mask not implemented here yet"
assert False
qkv = qkv.view(bs, seqlen, 3, self.num_attention_heads, self.attn_head_size)
q, k, v = qkv.transpose(3, 1).unbind(dim=2)
attn = F.scaled_dot_product_attention(
q,
k,
v,
dropout_p=self.p_dropout,
attn_mask=attn_mask[:, None, None, :seqlen].to(torch.bool).expand(bs, 1, seqlen, seqlen)
if self.use_sdpa_attn_mask
else None,
).transpose(1, 2)
attn = attn.view(bs, seqlen, dim)
return self.out_drop(self.Wo(attn))
class FlexBertUnpadRopeAttention(FlexBertAttentionBase):
"""Performs multi-headed self attention on a batch of unpadded sequences.
If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.
See `forward` method for additional details.
"""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__(config=config, layer_id=layer_id)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attn_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attn_head_size
self.p_dropout = config.attention_probs_dropout_prob
self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attn_qkv_bias)
self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attn_out_bias)
self.out_drop = (
nn.Dropout(config.attn_out_dropout_prob) if config.attn_out_dropout_prob > 0.0 else nn.Identity()
)
if config.global_attn_every_n_layers > 0:
if config.sliding_window == -1:
raise ValueError("global_attn_every_n_layers` requires `sliding_window` to be set")
if layer_id % config.global_attn_every_n_layers != 0:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
else:
self.sliding_window = (-1, -1)
else:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
if config.rotary_emb_dim is None:
config.rotary_emb_dim = self.attn_head_size
rotary_base = config.rotary_emb_base
rotary_dim = config.rotary_emb_dim
if self.sliding_window != (-1, -1):
if config.local_attn_rotary_emb_base != -1:
rotary_base = config.local_attn_rotary_emb_base
if config.local_attn_rotary_emb_dim is not None:
rotary_dim = config.local_attn_rotary_emb_dim
assert UnpaddedRotaryEmbedding is not None, "rotary_emb is not installed"
self.rotary_emb = UnpaddedRotaryEmbedding(
dim=rotary_dim,
base=rotary_base,
scale_base=config.rotary_emb_scale_base, # If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
interleaved=config.rotary_emb_interleaved,
)
self.use_fa2 = config.use_fa2
# flash attention 3 only supports global attention
self.use_fa3 = config.use_fa2 and self.sliding_window == (-1, -1) and IMPL_USE_FLASH3
self.deterministic_fa2 = config.deterministic_fa2
self.use_sdpa_attn_mask = config.use_sdpa_attn_mask
# Warn if defaulting to pytorch because of import issues
if not IMPL_USE_FLASH2 and self.use_fa2:
logger.warn_once(
"Unable to import flash_attn; defaulting FlexBERT attention implementation to PyTorch's"
" SDPA kernel. This requires padding and unpadding inputs, which will add some overhead."
)
self.use_fa2 = False
if not self.use_fa2:
if not self.use_sdpa_attn_mask:
logger.warn_once(
"SDPA attention is being used without an attention mask. Including padding in the "
" attention calculation may cause differences from the Flash Attention implementation."
)
else:
logger.warn_once(
"SDPA attention with an attention mask doesn't use the Flash Attention kernel and will"
" use more memory during the backward pass. Use the FA2 backend for linear memory scaling"
" with sequence length."
)
if self.sliding_window[0] > 0:
raise ValueError("Sliding window is not implemented for the PyTorch SDPA path. Use the FA2 backend.")
def _init_weights(self, reset_params: bool = False):
init_weights(
self.config,
self.Wqkv,
layer_dim=self.config.hidden_size,
layer_id=None,
type_of_module=ModuleType.in_module,
)
init_weights(
self.config,
self.Wo,
layer_dim=self.config.hidden_size,
layer_id=self.layer_id,
type_of_module=ModuleType.out_module,
)
def forward(
self,
hidden_states: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: int,
indices: torch.Tensor,
attn_mask: torch.Tensor,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations supported: PyTorch's SDPA attention and Flash Attention 2.
The arguments are unpadded. The SDPA implementation of attention requires padded arguments while the
Flash Attention implementation does not. If using SDPA we first call `pad_input`. Once we compute
attention, we re-unpad our outputs for the other layers. The pad/unpad operations add overhead, but not
sending pad tokens through ffs saves compute.
Args:
hidden_states: (total_nnz, dim)
cu_seqlens: (batch + 1,)
max_seqlen: int
indices: (total_nnz,)
attn_mask: (batch, max_seqlen)
Returns:
attention: (total_nnz, dim)
"""
bs, dim = hidden_states.shape
qkv = self.Wqkv(hidden_states)
# only needed for inference when we have KV cache
seqlen_offset = 0
# (total_seqlen, 3, nheads, headdim)
qkv = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size)
qkv = self.rotary_emb(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, seqlen_offset=seqlen_offset)
if self.use_fa3:
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
q, k, v = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size).unbind(dim=1)
attn, _ = flash_attn_varlen_func(
q=q,
k=k,
v=v,
cu_seqlens_q=cu_seqlens,
cu_seqlens_k=cu_seqlens,
max_seqlen_q=max_seqlen,
max_seqlen_k=max_seqlen,
deterministic=self.deterministic_fa2,
causal=self.is_causal,
)
attn = attn.to(orig_dtype) # type: ignore
else:
q, k, v = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size).unbind(dim=1)
attn, _ = flash_attn_varlen_func(
q=q,
k=k,
v=v,
cu_seqlens_q=cu_seqlens,
cu_seqlens_k=cu_seqlens,
max_seqlen_q=max_seqlen,
max_seqlen_k=max_seqlen,
deterministic=self.deterministic_fa2,
causal=self.is_causal,
)
attn = attn.view(bs, dim)
elif self.use_fa2:
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attn = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal,
)
attn = attn.to(orig_dtype) # type: ignore
else:
attn = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal,
)
attn = attn.view(bs, dim)
else:
assert not self.is_causal, f"causal mask not implemented here yet"
assert False
qkv = bert_padding.pad_input(
qkv, indices, cu_seqlens.shape[0] - 1, attn_mask.shape[-1]
) # batch, max_seqlen, thd
unpad_bs, seqlen, *_ = qkv.shape
q, k, v = qkv.transpose(3, 1).unbind(dim=2) # b h s d
attn = F.scaled_dot_product_attention(
q,
k,
v,
dropout_p=self.p_dropout,
attn_mask=attn_mask[:, None, None, :seqlen].to(torch.bool).expand(unpad_bs, 1, seqlen, seqlen)
if self.use_sdpa_attn_mask
else None,
)
attn = attn.transpose(1, 2).view(unpad_bs, -1, dim) # b s h d
attn = bert_padding.unpad_input_only(attn, torch.squeeze(attn_mask) == 1)
return self.out_drop(self.Wo(attn))
class FlexBertPaddedRopeAttention(FlexBertAttentionBase):
"""Performs multi-headed self attention on a batch of padded sequences.
This module supports two attention implementations:
1. Flash Attention 2 (if installed), which improves throughput.
2. PyTorch's scaled_dot_product_attention.
See `forward` method for additional details.
"""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__(config=config, layer_id=layer_id)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attn_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attn_head_size
self.p_dropout = config.attention_probs_dropout_prob
self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attn_qkv_bias)
self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attn_out_bias)
self.out_drop = (
nn.Dropout(config.attn_out_dropout_prob) if config.attn_out_dropout_prob > 0.0 else nn.Identity()
)
self.use_fa2 = config.use_fa2
self.deterministic_fa2 = config.deterministic_fa2
self.use_sdpa_attn_mask = config.use_sdpa_attn_mask
if config.global_attn_every_n_layers > 0:
if config.sliding_window == -1:
raise ValueError("global_attn_every_n_layers` requires `sliding_window` to be set")
if layer_id % config.global_attn_every_n_layers != 0:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
else:
self.sliding_window = (-1, -1)
else:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
if config.rotary_emb_dim is None:
config.rotary_emb_dim = self.attn_head_size
rotary_base = config.rotary_emb_base
rotary_dim = config.rotary_emb_dim
if self.sliding_window != (-1, -1):
if config.local_attn_rotary_emb_base != -1:
rotary_base = config.local_attn_rotary_emb_base
if config.local_attn_rotary_emb_dim is not None:
rotary_dim = config.local_attn_rotary_emb_dim
assert RotaryEmbedding is not None, "rotary_emb is not installed"
self.rotary_emb = RotaryEmbedding(
dim=rotary_dim,
base=rotary_base,
scale_base=config.rotary_emb_scale_base, # If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
interleaved=config.rotary_emb_interleaved,
)
if not IMPL_USE_FLASH2 and self.use_fa2:
self.use_fa2 = False
if self.use_fa2 and self.use_sdpa_attn_mask:
logger.warn_once(
"Flash Attention 2 does not support attention masks. Use unpadded attention "
"the equivalent functionality of masking out padding tokens."
)
if not self.use_fa2 and self.sliding_window[0] > 0:
raise ValueError("Sliding window is not implemented for the PyTorch SDPA path. Use the FA2 backend.")
def _init_weights(self, reset_params: bool = False):
init_weights(
self.config,
self.Wqkv,
layer_dim=self.config.hidden_size,
layer_id=None,
type_of_module=ModuleType.in_module,
)
init_weights(
self.config,
self.Wo,
layer_dim=self.config.hidden_size,
layer_id=self.layer_id,
type_of_module=ModuleType.out_module,
)
def forward(
self,
hidden_states: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations supported:
Flash Attention 2 and PyTorch's scaled_dot_product_attention.
Args:
hidden_states: (batch, seqlen, dim)
attn_mask: (batch, seqlen)
Returns:
attention: (batch, seqlen, dim)
"""
bs, seqlen, dim = hidden_states.shape
qkv = self.Wqkv(hidden_states)
seqlen_offset = 0
# Reshape to (batch, seqlen, 3, nheads, headdim)
qkv = qkv.view(bs, seqlen, 3, self.num_attention_heads, self.attn_head_size)
if IMPL_USE_FLASH2:
# Apply RoPE
qkv = self.rotary_emb(qkv, seqlen_offset=seqlen_offset, max_seqlen=None)
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attn = flash_attn_qkvpacked_func(
qkv,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal,
)
attn = attn.to(orig_dtype) # type: ignore
else:
attn = flash_attn_qkvpacked_func(
qkv,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
else:
assert not self.is_causal, f"causal mask not implemented here yet"
assert False
qkv = self.rotary_emb(qkv, seqlen_offset=seqlen_offset, max_seqlen=None)
q, k, v = qkv.transpose(3, 1).unbind(dim=2)
attn = F.scaled_dot_product_attention(
q,
k,
v,
dropout_p=self.p_dropout,
attn_mask=attn_mask[:, None, None, :seqlen].to(torch.bool).expand(bs, 1, seqlen, seqlen)
if self.use_sdpa_attn_mask
else None,
).transpose(1, 2)
attn = attn.view(bs, seqlen, dim)
return self.out_drop(self.Wo(attn))
class FlexBertUnpadRopeParallelAttention(FlexBertAttentionBase):
"""Performs multi-headed self attention on a batch of unpadded sequences.
If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.
See `forward` method for additional details.
"""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__(config=config, layer_id=layer_id)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attn_head_size = int(config.hidden_size / config.num_attention_heads)
self.hidden_size = config.hidden_size
self.p_dropout = config.attention_probs_dropout_prob
self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attn_out_bias)
self.out_drop = (
nn.Dropout(config.attn_out_dropout_prob) if config.attn_out_dropout_prob > 0.0 else nn.Identity()
)
if config.global_attn_every_n_layers > 0:
if config.sliding_window == -1:
raise ValueError("global_attn_every_n_layers` requires `sliding_window` to be set")
if layer_id % config.global_attn_every_n_layers != 0:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
else:
self.sliding_window = (-1, -1)
else:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
if config.rotary_emb_dim is None:
config.rotary_emb_dim = self.attn_head_size
rotary_base = config.rotary_emb_base
rotary_dim = config.rotary_emb_dim
if self.sliding_window != (-1, -1):
if config.local_attn_rotary_emb_base != -1:
rotary_base = config.local_attn_rotary_emb_base
if config.local_attn_rotary_emb_dim is not None:
rotary_dim = config.local_attn_rotary_emb_dim
assert UnpaddedRotaryEmbedding is not None, "rotary_emb is not installed"
self.rotary_emb = UnpaddedRotaryEmbedding(
dim=rotary_dim,
base=rotary_base,
scale_base=config.rotary_emb_scale_base, # If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
interleaved=config.rotary_emb_interleaved,
)
self.use_fa2 = config.use_fa2
self.deterministic_fa2 = config.deterministic_fa2
self.use_sdpa_attn_mask = config.use_sdpa_attn_mask
# Warn if defaulting to pytorch because of import issues
if not IMPL_USE_FLASH2 and self.use_fa2:
logger.warn_once(
"Unable to import flash_attn; defaulting FlexBERT attention implementation to PyTorch's"
" SDPA kernel. This requires padding and unpadding inputs, which will add some overhead."
)
self.use_fa2 = False
if not self.use_fa2:
if not self.use_sdpa_attn_mask:
logger.warn_once(
"SDPA attention is being used without an attention mask. Including padding in the "
" attention calculation may cause differences from the Flash Attention implementation."
)
else:
logger.warn_once(
"SDPA attention with an attention mask doesn't use the Flash Attention kernel and will"
" use more memory during the backward pass. Use the FA2 backend for linear memory scaling"
" with sequence length."
)
if self.sliding_window[0] > 0:
raise ValueError("Sliding window is not implemented for the PyTorch SDPA path. Use the FA2 backend.")
def _init_weights(self, reset_params: bool = False):
init_weights(
self.config,
self.Wo,
layer_dim=self.config.hidden_size,
layer_id=self.layer_id,
type_of_module=ModuleType.out_module,
)
def forward(
self,
qkv: torch.Tensor,
cu_seqlens: torch.Tensor,
max_seqlen: int,
indices: torch.Tensor,
attn_mask: torch.Tensor,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations supported: PyTorch's SDPA attention and Flash Attention 2.
The arguments are unpadded. The SDPA implementation of attention requires padded arguments while the
Flash Attention implementation does not. If using SDPA we first call `pad_input`. Once we compute
attention, we re-unpad our outputs for the other layers. The pad/unpad operations add overhead, but not
sending pad tokens through ffs saves compute.
Args:
qkv: (total_nnz, 3 * dim)
cu_seqlens: (batch + 1,)
max_seqlen: int
indices: (total_nnz,)
attn_mask: (batch, max_seqlen)
Returns:
attention: (total_nnz, dim)
"""
bs = qkv.shape[0]
dim = self.hidden_size
# only needed for inference when we have KV cache
seqlen_offset = 0
# (total_seqlen, 3, nheads, headdim)
qkv = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size)
qkv = self.rotary_emb(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, seqlen_offset=seqlen_offset)
if self.use_fa2:
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attn = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal,
)
attn = attn.to(orig_dtype) # type: ignore
else:
attn = flash_attn_varlen_qkvpacked_func(
qkv,
cu_seqlens=cu_seqlens,
max_seqlen=max_seqlen,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal,
)
attn = attn.view(bs, dim)
else:
assert not self.is_causal, f"causal mask not implemented here yet"
assert False
qkv = bert_padding.pad_input(
qkv, indices, cu_seqlens.shape[0] - 1, attn_mask.shape[-1]
) # batch, max_seqlen, thd
unpad_bs, seqlen, *_ = qkv.shape
q, k, v = qkv.transpose(3, 1).unbind(dim=2) # b h s d
attn = F.scaled_dot_product_attention(
q,
k,
v,
dropout_p=self.p_dropout,
attn_mask=attn_mask[:, None, None, :seqlen].to(torch.bool).expand(unpad_bs, 1, seqlen, seqlen)
if self.use_sdpa_attn_mask
else None,
)
attn = attn.transpose(1, 2).view(unpad_bs, -1, dim) # b s h d
attn = bert_padding.unpad_input_only(attn, torch.squeeze(attn_mask) == 1)
return self.out_drop(self.Wo(attn))
class FlexBertPaddedRopeParallelAttention(FlexBertAttentionBase):
"""Performs multi-headed self attention on a batch of padded sequences.
This module supports two attention implementations:
1. Flash Attention 2 (if installed), which improves throughput.
2. PyTorch's scaled_dot_product_attention.
See `forward` method for additional details.
"""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__(config=config, layer_id=layer_id)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attn_head_size = int(config.hidden_size / config.num_attention_heads)
self.hidden_size = config.hidden_size
self.p_dropout = config.attention_probs_dropout_prob
self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attn_out_bias)
self.out_drop = (
nn.Dropout(config.attn_out_dropout_prob) if config.attn_out_dropout_prob > 0.0 else nn.Identity()
)
self.use_fa2 = config.use_fa2
self.deterministic_fa2 = config.deterministic_fa2
self.use_sdpa_attn_mask = config.use_sdpa_attn_mask
if not IMPL_USE_FLASH2 and self.use_fa2:
self.use_fa2 = False
if config.global_attn_every_n_layers > 0:
if config.sliding_window == -1:
raise ValueError("global_attn_every_n_layers` requires `sliding_window` to be set")
if layer_id % config.global_attn_every_n_layers != 0:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
else:
self.sliding_window = (-1, -1)
else:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
if config.rotary_emb_dim is None:
config.rotary_emb_dim = self.attn_head_size
rotary_base = config.rotary_emb_base
rotary_dim = config.rotary_emb_dim
if self.sliding_window != (-1, -1):
if config.local_attn_rotary_emb_base != -1:
rotary_base = config.local_attn_rotary_emb_base
if config.local_attn_rotary_emb_dim is not None:
rotary_dim = config.local_attn_rotary_emb_dim
assert RotaryEmbedding is not None, "rotary_emb is not installed"
self.rotary_emb = RotaryEmbedding(
dim=rotary_dim,
base=rotary_base,
scale_base=config.rotary_emb_scale_base, # If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
interleaved=config.rotary_emb_interleaved,
)
if not IMPL_USE_FLASH2 and self.use_fa2:
self.use_fa2 = False
if self.use_fa2 and self.use_sdpa_attn_mask:
logger.warn_once(
"Flash Attention 2 does not support attention masks. Use unpadded attention "
"the equivalent functionality of masking out padding tokens."
)
if not self.use_fa2 and self.sliding_window[0] > 0:
raise ValueError("Sliding window is not implemented for the PyTorch SDPA path. Use the FA2 backend.")
def _init_weights(self, reset_params: bool = False):
init_weights(
self.config,
self.Wo,
layer_dim=self.config.hidden_size,
layer_id=self.layer_id,
type_of_module=ModuleType.out_module,
)
def forward(
self,
qkv: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations supported:
Flash Attention 2 and PyTorch's scaled_dot_product_attention.
Args:
qkv: (batch, seqlen, 3 * dim)
attn_mask: (batch, seqlen)
Returns:
attention: (batch, seqlen, dim)
"""
bs, seqlen, _ = qkv.shape
dim = self.hidden_size
seqlen_offset = 0
# Reshape to (batch, seqlen, 3, nheads, headdim)
qkv = qkv.view(bs, seqlen, 3, self.num_attention_heads, self.attn_head_size)
if self.use_fa2:
# Apply RoPE
qkv = self.rotary_emb(qkv, seqlen_offset=seqlen_offset, max_seqlen=None)
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attn = flash_attn_qkvpacked_func(
qkv,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
attn = attn.to(orig_dtype) # type: ignore
else:
attn = flash_attn_qkvpacked_func(
qkv,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
else:
assert not self.is_causal, f"causal mask not implemented here yet"
assert False
qkv = self.rotary_emb(qkv, seqlen_offset=seqlen_offset, max_seqlen=None)
q, k, v = qkv.transpose(3, 1).unbind(dim=2)
attn = F.scaled_dot_product_attention(
q,
k,
v,
dropout_p=self.p_dropout,
attn_mask=attn_mask[:, None, None, :seqlen].to(torch.bool).expand(bs, 1, seqlen, seqlen)
if self.use_sdpa_attn_mask
else None,
).transpose(1, 2)
attn = attn.view(bs, seqlen, dim)
return self.out_drop(self.Wo(attn))
class FlexBertPaddedParallelAttention(FlexBertAttentionBase):
"""Performs multi-headed self attention on a batch of padded sequences.
This module supports two attention implementations:
1. Flash Attention 2 (if installed), which improves throughput.
2. PyTorch's scaled_dot_product_attention.
See `forward` method for additional detail.
"""
def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None):
super().__init__(config=config, layer_id=layer_id)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.is_causal = config.causal_mask
self.num_attention_heads = config.num_attention_heads
self.attn_head_size = int(config.hidden_size / config.num_attention_heads)
self.hidden_size = config.hidden_size
self.p_dropout = config.attention_probs_dropout_prob
self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attn_out_bias)
self.out_drop = (
nn.Dropout(config.attn_out_dropout_prob) if config.attn_out_dropout_prob > 0.0 else nn.Identity()
)
self.use_fa2 = config.use_fa2
self.deterministic_fa2 = config.deterministic_fa2
self.use_sdpa_attn_mask = config.use_sdpa_attn_mask
if config.global_attn_every_n_layers > 0:
if config.sliding_window == -1:
raise ValueError("global_attn_every_n_layers` requires `sliding_window` to be set")
if layer_id % config.global_attn_every_n_layers != 0:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
else:
self.sliding_window = (-1, -1)
else:
self.sliding_window = (config.sliding_window // 2, config.sliding_window // 2)
if not IMPL_USE_FLASH2 and self.use_fa2:
self.use_fa2 = False
if self.use_fa2 and self.use_sdpa_attn_mask:
logger.warn_once(
"Flash Attention 2 does not support attention masks. Use unpadded attention "
"the equivalent functionality of masking out padding tokens."
)
if not self.use_fa2 and self.sliding_window[0] > 0:
raise ValueError("Sliding window is not implemented for the PyTorch SDPA path. Use the FA2 backend.")
def _init_weights(self, reset_params: bool = False):
init_weights(
self.config,
self.Wo,
layer_dim=self.config.hidden_size,
layer_id=self.layer_id,
type_of_module=ModuleType.out_module,
)
def forward(
self,
qkv: torch.Tensor,
attn_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Perform self-attention.
There are two attention implementations supported:
Flash Attention 2 and PyTorch's scaled_dot_product_attention.
Args:
qkv: (batch, seqlen, 3 * dim)
attn_mask: (batch, seqlen)
Returns:
attention: (batch, seqlen, dim)
"""
bs, seqlen, _ = qkv.shape
dim = self.hidden_size
if self.use_fa2:
qkv = qkv.view(bs, seqlen, 3, self.num_attention_heads, self.attn_head_size)
convert_dtype = qkv.dtype not in (torch.float16, torch.bfloat16)
if convert_dtype:
# FA2 implementation only supports fp16 and bf16. If FA2 is supported,
# bfloat16 must be supported as of FA2 2.5.7. (Turing GPUs not supported)
orig_dtype = qkv.dtype
qkv = qkv.to(torch.bfloat16)
attn = flash_attn_qkvpacked_func(
qkv,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
attn = attn.to(orig_dtype) # type: ignore
else:
attn = flash_attn_qkvpacked_func(
qkv,
dropout_p=self.p_dropout,
deterministic=self.deterministic_fa2,
window_size=self.sliding_window,
causal=self.is_causal
)
else:
assert not self.is_causal, f"causal attention mask not yet implemented here"
assert False
qkv = qkv.view(bs, seqlen, 3, self.num_attention_heads, self.attn_head_size)
q, k, v = qkv.transpose(3, 1).unbind(dim=2) # b h s d
attn = F.scaled_dot_product_attention(
q,
k,
v,
dropout_p=self.p_dropout,
attn_mask=attn_mask[:, None, None, :seqlen].to(torch.bool).expand(bs, 1, seqlen, seqlen)
if self.use_sdpa_attn_mask
else None,
).transpose(1, 2)
attn = attn.view(bs, seqlen, dim)
return self.out_drop(self.Wo(attn))
ATTN2CLS = {
"unpadded_base": FlexBertUnpadAttention,
"padded_base": FlexBertPaddedAttention,
"unpadded_parallel": FlexBertUnpadParallelAttention,
"padded_parallel": FlexBertPaddedParallelAttention,
"unpadded_rope": FlexBertUnpadRopeAttention,
"padded_rope": FlexBertPaddedRopeAttention,
"unpadded_rope_parallel": FlexBertUnpadRopeParallelAttention,
"padded_rope_parallel": FlexBertPaddedRopeParallelAttention,
}
def get_attention_layer(config: FlexBertConfig, layer_id: Optional[int] = None) -> FlexBertAttentionBase:
try:
attention_layer = (
config.initial_attention_layer
if layer_id < config.num_initial_layers and getattr(config, "initial_attention_layer", None) is not None
else config.attention_layer
)
return ATTN2CLS[maybe_add_padding(config, attention_layer)](config, layer_id=layer_id)
except KeyError:
if layer_id < config.num_initial_layers and getattr(config, "initial_attention_layer", None) is not None:
raise ValueError(
f"Invalid attention layer type: {config.initial_attention_layer=}, must be one of {ATTN2CLS.keys()}."
f"{config.padding=} will be automatically prepended to `config.attention_layer` if unspecified."
)
else:
raise ValueError(
f"Invalid attention layer type: {config.attention_layer=}, must be one of {ATTN2CLS.keys()}. "
f"{config.padding=} will be automatically prepended to `config.attention_layer` if unspecified."
)