# Copyright 2024 **AUTHORS_TODO** # License: Apache-2.0 # Copyright 2022 MosaicML Examples authors # SPDX-License-Identifier: Apache-2.0 # Copyright 2023 MosaicML Examples authors # SPDX-License-Identifier: Apache-2.0 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2023, Tri Dao. import copy import math import warnings from typing import Optional, Union, List import torch import torch.nn as nn from .bert_padding import unpad_input, pad_input from .activation import get_act_fn from .attention import FlexBertAttentionBase, BertAlibiUnpadAttention, get_attention_layer from .mlp import FlexBertMLPBase, BertResidualGLU, get_mlp_layer from .configuration_bert import FlexBertConfig, maybe_add_padding from .normalization import get_norm_layer from .initialization import ModuleType, init_weights class BertAlibiLayer(nn.Module): """Composes the Mosaic BERT attention and FFN blocks into a single layer.""" def __init__(self, config): super().__init__() self.attention = BertAlibiUnpadAttention(config) self.mlp = BertResidualGLU(config) def forward( self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, seqlen: int, subset_idx: Optional[torch.Tensor] = None, indices: Optional[torch.Tensor] = None, attn_mask: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, slopes: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass for a BERT layer, including both attention and MLP. Args: hidden_states: (total_nnz, dim) cu_seqlens: (batch + 1,) seqlen: int subset_idx: () set of indices whose values we care about at the end of the layer (e.g., the masked tokens, if this is the final layer). indices: None or (total_nnz,) attn_mask: None or (batch, max_seqlen_in_batch) bias: None or (batch, heads, max_seqlen_in_batch, max_seqlen_in_batch) slopes: None or (batch, heads) or (heads,) """ assert (bias is None) == (slopes is None), f"{bias=}, {slopes=}" attention_output = self.attention( hidden_states, cu_seqlens, seqlen, subset_idx, indices, attn_mask, bias, slopes ) layer_output = self.mlp(attention_output) return layer_output class BertAlibiEncoder(nn.Module): """A stack of BERT layers providing the backbone of Mosaic BERT. This module is modeled after the Hugging Face BERT's :class:`~transformers.model.bert.modeling_bert.BertAlibiEncoder`, but with substantial modifications to implement unpadding and ALiBi. Compared to the analogous Hugging Face BERT module, this module handles unpadding to reduce unnecessary computation at padded tokens, and pre-computes attention biases to implement ALiBi. """ def __init__(self, config): super().__init__() layer = BertAlibiLayer(config) self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) self.num_attention_heads = config.num_attention_heads # The alibi mask will be dynamically expanded if it is too small for # the input the model receives. But it generally helps to initialize it # to a reasonably large size to help pre-allocate CUDA memory. # The default `alibi_starting_size` is 512. self._current_alibi_size = int(config.alibi_starting_size) self.alibi = torch.zeros((1, self.num_attention_heads, self._current_alibi_size, self._current_alibi_size)) self.rebuild_alibi_tensor(size=config.alibi_starting_size) def rebuild_alibi_tensor(self, size: int, device: Optional[Union[torch.device, str]] = None): # Alibi # Following https://github.com/ofirpress/attention_with_linear_biases/issues/5 (Implementation 1) # In the causal case, you can exploit the fact that softmax is invariant to a uniform translation # of the logits, which makes the math work out *after* applying causal masking. If no causal masking # will be applied, it is necessary to construct the diagonal mask. n_heads = self.num_attention_heads def _get_alibi_head_slopes(n_heads: int) -> List[float]: def get_slopes_power_of_2(n_heads: int) -> List[float]: start = 2 ** (-(2 ** -(math.log2(n_heads) - 3))) ratio = start return [start * ratio**i for i in range(n_heads)] # In the paper, they only train models that have 2^a heads for some a. This function # has some good properties that only occur when the input is a power of 2. To # maintain that even when the number of heads is not a power of 2, we use a # workaround. if math.log2(n_heads).is_integer(): return get_slopes_power_of_2(n_heads) closest_power_of_2 = 2 ** math.floor(math.log2(n_heads)) slopes_a = get_slopes_power_of_2(closest_power_of_2) slopes_b = _get_alibi_head_slopes(2 * closest_power_of_2) slopes_b = slopes_b[0::2][: n_heads - closest_power_of_2] return slopes_a + slopes_b context_position = torch.arange(size, device=device)[:, None] memory_position = torch.arange(size, device=device)[None, :] relative_position = torch.abs(memory_position - context_position) # [n_heads, max_token_length, max_token_length] relative_position = relative_position.unsqueeze(0).expand(n_heads, -1, -1) slopes = torch.Tensor(_get_alibi_head_slopes(n_heads)).to(device) self.slopes = slopes alibi = slopes.unsqueeze(1).unsqueeze(1) * -relative_position # [1, n_heads, max_token_length, max_token_length] alibi = alibi.unsqueeze(0) assert alibi.shape == torch.Size([1, n_heads, size, size]) self._current_alibi_size = size self.alibi = alibi def forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_all_encoded_layers: Optional[bool] = True, subset_mask: Optional[torch.Tensor] = None, ) -> List[torch.Tensor]: extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 attention_mask_bool = attention_mask.bool() batch, seqlen = hidden_states.shape[:2] # Unpad inputs and mask. It will remove tokens that are padded. # Assume ntokens is total number of tokens (padded and non-padded) # and ntokens_unpad is total number of non-padded tokens. # Then unpadding performs the following compression of the inputs: # hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden] hidden_states, indices, cu_seqlens, _ = unpad_input(hidden_states, attention_mask_bool) # Add alibi matrix to extended_attention_mask if self._current_alibi_size < seqlen: # Rebuild the alibi tensor when needed warnings.warn(f"Increasing alibi size from {self._current_alibi_size} to {seqlen}") self.rebuild_alibi_tensor(size=seqlen, device=hidden_states.device) elif self.alibi.device != hidden_states.device: # Device catch-up self.alibi = self.alibi.to(hidden_states.device) self.slopes = self.slopes.to(hidden_states.device) # type: ignore alibi_bias = self.alibi[:, :, :seqlen, :seqlen] attn_bias = extended_attention_mask[:, :, :seqlen, :seqlen] alibi_attn_mask = attn_bias + alibi_bias all_encoder_layers = [] if subset_mask is None: for layer_module in self.layer: hidden_states = layer_module( hidden_states, cu_seqlens, seqlen, None, indices, attn_mask=attention_mask, bias=alibi_attn_mask, slopes=self.slopes, ) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) # Pad inputs and mask. It will insert back zero-padded tokens. # Assume ntokens is total number of tokens (padded and non-padded) # and ntokens_unpad is total number of non-padded tokens. # Then padding performs the following de-compression: # hidden_states[ntokens_unpad,hidden] -> hidden_states[ntokens,hidden] hidden_states = pad_input(hidden_states, indices, batch, seqlen) else: for i in range(len(self.layer) - 1): layer_module = self.layer[i] hidden_states = layer_module( hidden_states, cu_seqlens, seqlen, None, indices, attn_mask=attention_mask, bias=alibi_attn_mask, slopes=self.slopes, ) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) subset_idx = torch.nonzero(subset_mask[attention_mask_bool], as_tuple=False).flatten() hidden_states = self.layer[-1]( hidden_states, cu_seqlens, seqlen, subset_idx=subset_idx, indices=indices, attn_mask=attention_mask, bias=alibi_attn_mask, slopes=self.slopes, ) if not output_all_encoded_layers: all_encoder_layers.append(hidden_states) return all_encoder_layers class BertPooler(nn.Module): def __init__(self, config): super(BertPooler, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() def forward(self, hidden_states: torch.Tensor, pool: Optional[bool] = True) -> torch.Tensor: # We "pool" the model by simply taking the hidden state corresponding # to the first token. first_token_tensor = hidden_states[:, 0] if pool else hidden_states pooled_output = self.dense(first_token_tensor) pooled_output = self.activation(pooled_output) return pooled_output class BertPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) if isinstance(config.hidden_act, str): self.transform_act_fn = get_act_fn(config.head_pred_act) else: self.transform_act_fn = config.hidden_act self.LayerNorm = get_norm_layer(config) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.dense(hidden_states) hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states class FlexBertLayerBase(nn.Module): """A FlexBERT Layer base class for type hints.""" attn: FlexBertAttentionBase mlp: FlexBertMLPBase def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None): super().__init__() self.config = config self.layer_id = layer_id def _init_weights(self, reset_params: bool = False): if hasattr(self, "attn"): self.attn._init_weights(reset_params) if hasattr(self, "mlp"): self.mlp._init_weights(reset_params) def reset_parameters(self): self._init_weights(reset_params=True) def forward(self, hidden_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor: raise NotImplementedError("This is a base class and should not be used directly.") class FlexBertCompileUnpadPreNormLayer(FlexBertLayerBase): """Composes the FlexBERT attention and MLP blocks into a single layer using pre-normalization.""" def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None): super().__init__(config=config, layer_id=layer_id) if config.skip_first_prenorm and config.embed_norm and layer_id == 0: self.attn_norm = nn.Identity() else: self.attn_norm = get_norm_layer(config) self.attn = get_attention_layer(config, layer_id=layer_id) self.mlp_norm = get_norm_layer(config, compiled_norm=config.compile_model) self.mlp = get_mlp_layer(config, layer_id=layer_id) self.compile_model = config.compile_model def _init_weights(self, reset_params: bool = False): super()._init_weights(reset_params) if reset_params: self.attn_norm.reset_parameters() self.mlp_norm.reset_parameters() @torch.compile(dynamic=True) def compiled_mlp(self, hidden_states: torch.Tensor) -> torch.Tensor: return self.mlp(self.mlp_norm(hidden_states)) def forward( self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: int, indices: Optional[torch.Tensor] = None, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass for a BERT layer, including both attention and MLP. Args: hidden_states: (total_nnz, dim) cu_seqlens: (batch + 1,) max_seqlen: int indices: None or (total_nnz,) attn_mask: None or (batch, max_seqlen) """ attn_out = hidden_states + self.attn(self.attn_norm(hidden_states), cu_seqlens, max_seqlen, indices, attn_mask) return attn_out + self.compiled_mlp(attn_out) class FlexBertUnpadPreNormLayer(FlexBertLayerBase): """Composes the FlexBERT attention and MLP blocks into a single layer using pre-normalization.""" def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None): super().__init__(config=config, layer_id=layer_id) if config.skip_first_prenorm and config.embed_norm and layer_id == 0: self.attn_norm = nn.Identity() else: self.attn_norm = get_norm_layer(config) self.attn = get_attention_layer(config, layer_id=layer_id) self.mlp_norm = get_norm_layer(config) self.mlp = get_mlp_layer(config, layer_id=layer_id) def _init_weights(self, reset_params: bool = False): super()._init_weights(reset_params) if reset_params: self.attn_norm.reset_parameters() self.mlp_norm.reset_parameters() def forward( self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: int, indices: Optional[torch.Tensor] = None, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass for a BERT layer, including both attention and MLP. Args: hidden_states: (total_nnz, dim) cu_seqlens: (batch + 1,) max_seqlen: int indices: None or (total_nnz,) attn_mask: None or (batch, max_seqlen) """ attn_out = hidden_states + self.attn(self.attn_norm(hidden_states), cu_seqlens, max_seqlen, indices, attn_mask) return attn_out + self.mlp(self.mlp_norm(attn_out)) class FlexBertUnpadParallelPreNormLayer(FlexBertLayerBase): """Composes the FlexBERT parallel attention and MLP blocks into a single layer using pre-normalization.""" def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None): super().__init__(config=config, layer_id=layer_id) self.attn_size = config.hidden_size * 3 self.mlp_size = config.intermediate_size * 2 # Compute QKV and FF outputs at once self.Wqkvff = nn.Linear(config.hidden_size, self.attn_size + self.mlp_size, bias=config.attn_qkv_bias) if config.skip_first_prenorm and config.embed_norm and layer_id == 0: self.norm = nn.Identity() else: self.norm = get_norm_layer(config) self.attn = get_attention_layer(config, layer_id=layer_id) self.mlp = get_mlp_layer(config, layer_id=layer_id) def _init_weights(self, reset_params: bool = False): super()._init_weights(reset_params) if reset_params and hasattr(self.norm, "reset_parameters"): self.norm.reset_parameters() init_weights( self.config, self.Wqkvff, layer_dim=self.config.hidden_size, layer_id=None, type_of_module=ModuleType.in_module, ) def forward( self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: int, indices: Optional[torch.Tensor] = None, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass for a BERT layer, including both attention and MLP. Args: hidden_states: (total_nnz, dim) attn_mask: None or (batch, max_seqlen) """ # Compute QKV and FF outputs at once and split them qkv, intermediate_ff = self.Wqkvff(self.norm(hidden_states)).split([self.attn_size, self.mlp_size], dim=1) return hidden_states + self.attn(qkv, cu_seqlens, max_seqlen, indices, attn_mask) + self.mlp(intermediate_ff) class FlexBertPaddedPreNormLayer(FlexBertLayerBase): """Composes the FlexBERT attention and MLP blocks into a single layer using pre-normalization.""" def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None): super().__init__(config=config, layer_id=layer_id) if config.skip_first_prenorm and config.embed_norm and layer_id == 0: self.attn_norm = nn.Identity() else: self.attn_norm = get_norm_layer(config) self.attn = get_attention_layer(config, layer_id=layer_id) self.mlp_norm = get_norm_layer(config) self.mlp = get_mlp_layer(config, layer_id=layer_id) def _init_weights(self, reset_params: bool = False): super()._init_weights(reset_params) if reset_params: self.attn_norm.reset_parameters() self.mlp_norm.reset_parameters() def forward( self, hidden_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass for a BERT layer, including both attention and MLP. Args: hidden_states: (batch, max_seqlen, dim) attn_mask: None or (batch, max_seqlen) """ attn_out = hidden_states + self.attn(self.attn_norm(hidden_states), attn_mask) return attn_out + self.mlp(self.mlp_norm(attn_out)) class FlexBertPaddedParallelPreNormLayer(FlexBertLayerBase): """Composes the FlexBERT attention and MLP blocks into a single layer using pre-normalization.""" def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None): super().__init__(config=config, layer_id=layer_id) self.attn_size = config.hidden_size * 3 self.mlp_size = config.intermediate_size * 2 # Compute QKV and FF outputs at once self.Wqkvff = nn.Linear(config.hidden_size, self.attn_size + self.mlp_size, bias=config.attn_qkv_bias) if config.skip_first_prenorm and config.embed_norm and layer_id == 0: self.norm = nn.Identity() else: self.norm = get_norm_layer(config) self.attn = get_attention_layer(config, layer_id=layer_id) self.mlp = get_mlp_layer(config, layer_id=layer_id) def _init_weights(self, reset_params: bool = False): super()._init_weights(reset_params) if reset_params: self.norm.reset_parameters() init_weights( self.config, self.Wqkvff, layer_dim=self.config.hidden_size, layer_id=None, type_of_module=ModuleType.in_module, ) def forward( self, hidden_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass for a BERT layer, including both attention and MLP. Args: hidden_states: (batch, max_seqlen, dim) attn_mask: None or (batch, max_seqlen) """ # Compute QKV and FF outputs at once and split them qkv, intermediate_ff = self.Wqkvff(self.norm(hidden_states)).split([self.attn_size, self.mlp_size], dim=2) return hidden_states + self.attn(qkv, attn_mask) + self.mlp(intermediate_ff) class FlexBertUnpadPostNormLayer(FlexBertLayerBase): """Composes the FlexBERT attention and MLP blocks into a single layer using post-normalization.""" def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None): super().__init__(config=config, layer_id=layer_id) self.attn = get_attention_layer(config, layer_id=layer_id) self.attn_norm = get_norm_layer(config) self.mlp = get_mlp_layer(config, layer_id=layer_id) self.mlp_norm = get_norm_layer(config) def _init_weights(self, reset_params: bool = False): super()._init_weights(reset_params) if reset_params: self.attn_norm.reset_parameters() self.mlp_norm.reset_parameters() def forward( self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, max_seqlen: int, indices: Optional[torch.Tensor] = None, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass for a BERT layer, including both attention and MLP. Args: hidden_states: (total_nnz, dim) cu_seqlens: (batch + 1,) max_seqlen: int indices: None or (total_nnz,) attn_mask: None or (batch, max_seqlen) """ attn_out = self.attn_norm(hidden_states + self.attn(hidden_states, cu_seqlens, max_seqlen, indices, attn_mask)) return self.mlp_norm(attn_out + self.mlp(attn_out)) class FlexBertPaddedPostNormLayer(FlexBertLayerBase): """Composes the FlexBERT attention and MLP blocks into a single layer using post-normalization.""" def __init__(self, config: FlexBertConfig, layer_id: Optional[int] = None): super().__init__(config=config, layer_id=layer_id) self.attn = get_attention_layer(config, layer_id=layer_id) self.attn_norm = get_norm_layer(config) self.mlp = get_mlp_layer(config, layer_id=layer_id) self.mlp_norm = get_norm_layer(config) def _init_weights(self, reset_params: bool = False): super()._init_weights(reset_params) if reset_params: self.mlp_norm.reset_parameters() def forward( self, hidden_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass for a BERT layer, including both attention and MLP. Args: hidden_states: (batch, max_seqlen, dim) attn_mask: None or (batch, max_seqlen) """ attn_out = self.attn_norm(hidden_states + self.attn(hidden_states, attn_mask)) return self.mlp_norm(attn_out + self.mlp(attn_out)) LAYER2CLS = { "unpadded_prenorm": FlexBertUnpadPreNormLayer, "unpadded_compile_prenorm": FlexBertCompileUnpadPreNormLayer, "unpadded_parallel_prenorm": FlexBertUnpadParallelPreNormLayer, "unpadded_postnorm": FlexBertUnpadPostNormLayer, "padded_prenorm": FlexBertPaddedPreNormLayer, "padded_parallel_prenorm": FlexBertPaddedParallelPreNormLayer, "padded_postnorm": FlexBertPaddedPostNormLayer, } def get_bert_layer(config: FlexBertConfig, layer_id: Optional[int] = None) -> FlexBertLayerBase: try: bert_layer = ( config.initial_bert_layer if layer_id < config.num_initial_layers and getattr(config, "initial_bert_layer", None) is not None else config.bert_layer ) bert_layer = maybe_add_padding(config, bert_layer) if config.compile_model and bert_layer == "unpadded_prenorm": bert_layer = "unpadded_compile_prenorm" return LAYER2CLS[bert_layer](config, layer_id=layer_id) except KeyError: if layer_id < config.num_initial_layers and getattr(config, "initial_bert_layer", None) is not None: raise ValueError( f"Invalid BERT layer type: {config.initial_bert_layer=}, must be one of {LAYER2CLS.keys()}." f"{config.padding=} will be automatically prepended to `config.bert_layer` if unspecified." ) else: raise ValueError( f"Invalid BERT layer type: {config.bert_layer=}, must be one of {LAYER2CLS.keys()}. " f"{config.padding=} will be automatically prepended to `config.bert_layer` if unspecified." ) class FlexBertEncoderBase(nn.Module): """A FlexBERT base class for type hints.""" layers: nn.ModuleList def _init_weights(self, reset_params: bool = False): if hasattr(self, "layers"): for layer in self.layers: layer._init_weights(reset_params=reset_params) def reset_parameters(self): self._init_weights(reset_params=True) def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: raise NotImplementedError("This is a base class and should not be used directly.") class FlexBertUnpadEncoder(FlexBertEncoderBase): """A stack of BERT layers providing the backbone of FlexBERT. This module is modeled after the Hugging Face BERT's :class:`~transformers.model.bert.modeling_bert.BertAlibiEncoder`, but with substantial modifications to implement unpadding and ALiBi. Compared to the analogous Hugging Face BERT module, this module handles unpadding to reduce unnecessary computation at padded tokens, and pre-computes attention biases to implement ALiBi. """ def __init__(self, config: FlexBertConfig): super().__init__() self.layers = nn.ModuleList([get_bert_layer(config, layer_id=i) for i in range(config.num_hidden_layers)]) self.num_attention_heads = config.num_attention_heads def forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, indices: Optional[torch.Tensor] = None, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None, ) -> torch.Tensor: if indices is None and cu_seqlens is None and max_seqlen is None: attention_mask_bool = attention_mask.bool() batch, seqlen = hidden_states.shape[:2] hidden_states, indices, cu_seqlens, max_seqlen = unpad_input( hidden_states, attention_mask_bool ) for layer_module in self.layers: hidden_states = layer_module( hidden_states, cu_seqlens, max_seqlen, indices, attn_mask=attention_mask, ) return pad_input(hidden_states, indices, batch, seqlen) else: for layer_module in self.layers: hidden_states = layer_module( hidden_states, cu_seqlens, max_seqlen, indices, attn_mask=attention_mask, ) return hidden_states class FlexBertPaddedEncoder(FlexBertEncoderBase): """A stack of BERT layers providing the backbone of FlexBERT. This module is modeled after the Hugging Face BERT's :class:`~transformers.model.bert.modeling_bert.BertAlibiEncoder`, but with substantial modifications to implement unpadding and ALiBi. Compared to the analogous Hugging Face BERT module, this module handles unpadding to reduce unnecessary computation at padded tokens, and pre-computes attention biases to implement ALiBi. """ def __init__(self, config: FlexBertConfig): super().__init__() self.layers = nn.ModuleList([get_bert_layer(config, layer_id=i) for i in range(config.num_hidden_layers)]) self.num_attention_heads = config.num_attention_heads def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, **kwargs) -> torch.Tensor: for layer_module in self.layers: hidden_states = layer_module(hidden_states, attn_mask=attention_mask) return hidden_states ENC2CLS = { "unpadded_base": FlexBertUnpadEncoder, "padded_base": FlexBertPaddedEncoder, } def get_encoder_layer(config: FlexBertConfig) -> FlexBertEncoderBase: try: return ENC2CLS[maybe_add_padding(config, config.encoder_layer)](config) except KeyError: raise ValueError( f"Invalid encoder layer type: {config.encoder_layer=}, must be one of {ENC2CLS.keys()}. " f"{config.padding=} will be automatically prepended to `config.encoder_layer` if unspecified." )