""" PyTorch Mixtral model.""" import importlib import inspect import math import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import scattermoe import stk import torch import torch.distributed as dist import torch.nn.functional as F import torch.utils.checkpoint from megablocks import grouped_gemm_util as gg from megablocks.layers.activation_fn import act_fn from megablocks.layers.arguments import Arguments as MegablocksArguments from megablocks.layers.dmlp_registry import _REGISTRY from megablocks.layers.dmoe import ParallelDroplessMLP from megablocks.layers.glu import memory_optimized_grouped_glu from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from transformers import ( BeamSearchScorer, ConstrainedBeamSearchScorer, DisjunctiveConstraint, LogitsProcessorList, PhrasalConstraint, QuantizedCacheConfig, StoppingCriteriaList, ) from transformers.activations import ACT2FN from transformers.cache_utils import Cache, DynamicCache from transformers.generation.configuration_utils import GenerationConfig, GenerationMode from transformers.generation.utils import ( NEED_SETUP_CACHE_CLASSES_MAPPING, QUANT_BACKEND_CLASSES_MAPPING, GenerateOutput, ) from transformers.integrations import is_deepspeed_zero3_enabled from transformers.modeling_outputs import ModelOutput, SequenceClassifierOutputWithPast from transformers.modeling_utils import PreTrainedModel from transformers.utils import is_torch_available, logging from transformers.utils.import_utils import ( is_hqq_available, is_quanto_available, is_torch_fx_available, is_torchdynamo_compiling, ) from .configuration_mixtral import MixtralConfig logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "MixtralConfig" parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version) is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse("1.13") def _is_package_available( pkg_name: str, return_version: bool = False ) -> Union[Tuple[bool, str], bool]: # Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version package_exists = importlib.util.find_spec(pkg_name) is not None package_version = "N/A" if package_exists: try: package_version = importlib.metadata.version(pkg_name) package_exists = True except importlib.metadata.PackageNotFoundError: package_exists = False logger.debug(f"Detected {pkg_name} version {package_version}") if return_version: return package_exists, package_version else: return package_exists def is_flash_attn_2_available(): if not is_torch_available(): return False if not _is_package_available("flash_attn"): return False # Let's add an extra check to see if cuda is available import torch if not torch.cuda.is_available(): return False if torch.version.cuda: return version.parse(importlib.metadata.version("flash_attn")) >= version.parse( "2.1.0" ) elif torch.version.hip: # TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention return version.parse(importlib.metadata.version("flash_attn")) >= version.parse( "2.0.4" ) else: return False def is_flash_attn_greater_or_equal_2_10(): if not _is_package_available("flash_attn"): return False return version.parse(importlib.metadata.version("flash_attn")) >= version.parse( "2.1.0" ) def is_flash_attn_available(): logger.warning( "Using `is_flash_attn_available` is deprecated and will be removed in v4.38. " "Please use `is_flash_attn_2_available` instead." ) return is_flash_attn_2_available() @dataclass class AttentionMaskConverter: """ A utility attention mask class that allows one to: - Create a causal 4d mask - Create a causal 4d mask with slided window - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length, key_value_length) that can be multiplied with attention scores Examples: ```python >>> import torch >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter >>> converter = AttentionMaskConverter(True) >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32) tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, -3.4028e+38], [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, 0.0000e+00]]]]) ``` Parameters: is_causal (`bool`): Whether the attention mask should be a uni-directional (causal) or bi-directional mask. sliding_window (`int`, *optional*): Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer. """ is_causal: bool sliding_window: int def __init__(self, is_causal: bool, sliding_window: Optional[int] = None): self.is_causal = is_causal self.sliding_window = sliding_window if self.sliding_window is not None and self.sliding_window <= 0: raise ValueError( f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`" ) def to_causal_4d( self, batch_size: int, query_length: int, key_value_length: int, dtype: torch.dtype, device: Union[torch.device, "str"] = "cpu", ) -> Optional[torch.Tensor]: """ Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative bias to upper right hand triangular matrix (causal mask). """ if not self.is_causal: raise ValueError( f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True." ) # If shape is not cached, create a new causal mask and cache it input_shape = (batch_size, query_length) past_key_values_length = key_value_length - query_length # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] causal_4d_mask = None if input_shape[-1] > 1 or self.sliding_window is not None: causal_4d_mask = self._make_causal_mask( input_shape, dtype, device=device, past_key_values_length=past_key_values_length, sliding_window=self.sliding_window, ) return causal_4d_mask def to_4d( self, attention_mask_2d: torch.Tensor, query_length: int, dtype: torch.dtype, key_value_length: Optional[int] = None, ) -> torch.Tensor: """ Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length, key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is causal, a causal mask will be added. """ input_shape = (attention_mask_2d.shape[0], query_length) # create causal mask # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] causal_4d_mask = None if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal: if key_value_length is None: raise ValueError( "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask." ) past_key_values_length = key_value_length - query_length causal_4d_mask = self._make_causal_mask( input_shape, dtype, device=attention_mask_2d.device, past_key_values_length=past_key_values_length, sliding_window=self.sliding_window, ) elif self.sliding_window is not None: raise NotImplementedError( "Sliding window is currently only implemented for causal masking" ) # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] expanded_attn_mask = self._expand_mask( attention_mask_2d, dtype, tgt_len=input_shape[-1] ).to(attention_mask_2d.device) if causal_4d_mask is not None: expanded_attn_mask = causal_4d_mask.masked_fill( expanded_attn_mask.bool(), torch.finfo(dtype).min ) # expanded_attn_mask + causal_4d_mask can cause some overflow expanded_4d_mask = expanded_attn_mask return expanded_4d_mask @staticmethod def _make_causal_mask( input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0, sliding_window: Optional[int] = None, ): """ Make causal mask used for bi-directional self-attention. """ bsz, tgt_len = input_ids_shape mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) mask_cond = torch.arange(mask.size(-1), device=device) mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) mask = mask.to(dtype) if past_key_values_length > 0: mask = torch.cat( [ torch.zeros( tgt_len, past_key_values_length, dtype=dtype, device=device ), mask, ], dim=-1, ) # add lower triangular sliding window mask if necessary if sliding_window is not None: diagonal = past_key_values_length - sliding_window + 1 context_mask = 1 - torch.triu( torch.ones_like(mask, dtype=torch.int), diagonal=diagonal ) mask.masked_fill_(context_mask.bool(), torch.finfo(dtype).min) return mask[None, None, :, :].expand( bsz, 1, tgt_len, tgt_len + past_key_values_length ) @staticmethod def _expand_mask( mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None ): """ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. """ bsz, src_len = mask.size() tgt_len = tgt_len if tgt_len is not None else src_len expanded_mask = ( mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) ) inverted_mask = 1.0 - expanded_mask return inverted_mask.masked_fill( inverted_mask.to(torch.bool), torch.finfo(dtype).min ) @staticmethod def _unmask_unattended( expanded_mask: torch.Tensor, attention_mask: torch.Tensor, unmasked_value: Union[bool, float], ): # fmt: off """ Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. Details: https://github.com/pytorch/pytorch/issues/110213 `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len]. `attention_mask` is [bsz, src_seq_len]. The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias. For example, if `attention_mask` is ``` [[0, 0, 1], [1, 1, 1], [0, 1, 1]] ``` and `expanded_mask` is (e.g. here left-padding case) ``` [[[[0, 0, 0], [0, 0, 0], [0, 0, 1]]], [[[1, 0, 0], [1, 1, 0], [1, 1, 1]]], [[[0, 0, 0], [0, 1, 0], [0, 1, 1]]]] ``` then the modified `expanded_mask` will be ``` [[[[1, 1, 1], <-- modified [1, 1, 1], <-- modified [0, 0, 1]]], [[[1, 0, 0], [1, 1, 0], [1, 1, 1]]], [[[1, 1, 1], <-- modified [0, 1, 0], [0, 1, 1]]]] ``` """ # fmt: on # Get the index of the first non-zero value for every sample in the batch. # In the above example, indices = [[2], [0], [1]]] tmp = torch.arange(attention_mask.shape[1], 0, -1) indices = torch.argmax(attention_mask.cpu() * tmp, 1, keepdim=True) # Find the batch indexes that have unattended tokens on the leftmost side (e.g. [0, 0, 1, 1, 1]), for which the first rows of the # expanded mask will be completely unattended. left_masked_rows = torch.where(indices > 0)[0] if left_masked_rows.shape[0] == 0: return expanded_mask indices = indices[left_masked_rows] max_len = torch.max(indices) range_tensor = torch.arange(max_len).unsqueeze(0) range_tensor = range_tensor.repeat(indices.size(0), 1) # Avoid unmasking tokens at relevant target positions (on the row axis), by rather unmasking possibly several times the first row that should always be unmasked as we filtered out the batch above. range_tensor[range_tensor >= indices] = 0 # TODO: we may drop support for 3D attention mask as the refactor from Patrick maybe dropped this case if expanded_mask.dim() == 4: num_masks = expanded_mask.shape[1] if num_masks == 1: # Broadcast [left_masked_rows, 1], [left_masked_rows, max_len] mask_slice = (left_masked_rows[:, None], 0, range_tensor) else: # Broadcast [left_masked_rows, 1, 1], [1, num_masks, 1], [left_masked_rows, 1, max_len] mask_slice = ( left_masked_rows[:, None, None], torch.arange(num_masks)[None, :, None], range_tensor[:, None, :], ) else: # Broadcast [left_masked_rows, 1], [left_masked_rows, max_len] mask_slice = (left_masked_rows[:, None], range_tensor) expanded_mask[mask_slice] = unmasked_value return expanded_mask def _prepare_4d_causal_attention_mask( attention_mask: Optional[torch.Tensor], input_shape: Union[torch.Size, Tuple, List], inputs_embeds: torch.Tensor, past_key_values_length: int, sliding_window: Optional[int] = None, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape `(batch_size, key_value_length)` Args: attention_mask (`torch.Tensor` or `None`): A 2D attention mask of shape `(batch_size, key_value_length)` input_shape (`tuple(int)` or `list(int)` or `torch.Size`): The input shape should be a tuple that defines `(batch_size, query_length)`. inputs_embeds (`torch.Tensor`): The embedded inputs as a torch Tensor. past_key_values_length (`int`): The length of the key value cache. sliding_window (`int`, *optional*): If the model uses windowed attention, a sliding window should be passed. """ attn_mask_converter = AttentionMaskConverter( is_causal=True, sliding_window=sliding_window ) key_value_length = input_shape[-1] + past_key_values_length # 4d mask is passed through the layers if attention_mask is not None: attention_mask = attn_mask_converter.to_4d( attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype, ) else: attention_mask = attn_mask_converter.to_causal_4d( input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device, ) return attention_mask @dataclass class MoeCausalLMOutputWithPast(ModelOutput): """ Base class for causal language model (or autoregressive) with mixture of experts outputs. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): Language modeling loss (for next-token prediction). logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided): aux_loss for the sparse modules. router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary loss for Mixture of Experts models. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ loss: Optional[torch.FloatTensor] = None aux_loss: Optional[torch.FloatTensor] = None logits: torch.FloatTensor = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None router_logits: Optional[Tuple[torch.FloatTensor]] = None @property def balance_loss(self): return self.aux_loss @property def num_dropped_tokens(self): return [torch.tensor(-1)] * 32 @property def gate_load(self): return [torch.tensor(-1)] * 32 @property def gate_importance(self): return [torch.tensor(-1)] * 32 @dataclass class MoeModelOutputWithPast(ModelOutput): """ Base class for model's outputs, with potential hidden states and attentions. Args: last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`): Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary loss for Mixture of Experts models. """ last_hidden_state: torch.FloatTensor = None past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None router_logits: Optional[Tuple[torch.FloatTensor]] = None attn_router_logits: Optional[Tuple[torch.FloatTensor]] = None # 🔍 if is_flash_attn_2_available(): from flash_attn import flash_attn_func, flash_attn_varlen_func from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa _flash_supports_window_size = "window_size" in list( inspect.signature(flash_attn_func).parameters ) # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. # It means that the function will not be traced through and simply appear as a node in the graph. if is_torch_fx_available(): if not is_torch_greater_or_equal_than_1_13: import torch.fx _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) def load_balancing_loss_func( gate_logits: Union[torch.Tensor, Tuple], num_experts: torch.Tensor = None, top_k=2, use_layer_wise_balance=False, ) -> torch.FloatTensor: r""" Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch. See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between experts is too unbalanced. Args: gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]): Logits from the `gate`, should be a tuple of tensors. Shape: [batch_size, seqeunce_length, num_experts]. num_experts (`int`, *optional*): Number of experts Returns: The auxiliary loss. """ if gate_logits is None or (isinstance(gate_logits, Iterable) and len(gate_logits) == 0): return 0 # ✨ Here is the fix for balance loss in Mixtral. # We should calculate the balance loss in a layer-wise manner otherwise it may lead to degenerated solutions. if use_layer_wise_balance: if not isinstance(gate_logits, Iterable): gate_logits = (gate_logits,) else: if isinstance(gate_logits, Iterable): gate_logits = (torch.cat(gate_logits, dim=0),) else: gate_logits = (gate_logits,) all_balance_losses = [] for logits in gate_logits: routing_weights, selected_experts = torch.topk(logits, top_k, dim=-1) routing_weights = routing_weights.softmax(dim=-1) # cast the expert indices to int64, otherwise one-hot encoding will fail if selected_experts.dtype != torch.int64: selected_experts = selected_experts.to(torch.int64) if len(selected_experts.shape) == 2: selected_experts = selected_experts.unsqueeze(2) expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts) # For a given token, determine if it was routed to a given expert. expert_mask = torch.max(expert_mask, axis=-2).values # cast to float32 otherwise mean will fail expert_mask = expert_mask.to(torch.float32) tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2) router_prob_per_group_and_expert = torch.mean(routing_weights, axis=-1) # ✨ balance loss for this layer balance_loss = torch.mean( tokens_per_group_and_expert * router_prob_per_group_and_expert.unsqueeze(-1) ) * (num_experts**2) all_balance_losses.append(balance_loss.reshape(1)) all_balance_losses = torch.cat(all_balance_losses).mean() # ✨ return all_balance_losses # Copied from transformers.models.llama.modeling_llama._get_unpad_data def _get_unpad_data(attention_mask): seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() max_seqlen_in_batch = seqlens_in_batch.max().item() cu_seqlens = F.pad( torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0) ) return ( indices, cu_seqlens, max_seqlen_in_batch, ) # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mixtral class MixtralRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ MixtralRMSNorm is equivalent to T5LayerNorm """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps def forward(self, hidden_states): input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) variance = hidden_states.pow(2).mean(-1, keepdim=True) hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mixtral class MixtralRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): super().__init__() self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base inv_freq = 1.0 / ( self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) ) self.register_buffer("inv_freq", inv_freq, persistent=False) # Build here to make `torch.jit.trace` work. self._set_cos_sin_cache( seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype(), ) def _set_cos_sin_cache(self, seq_len, device, dtype): self.max_seq_len_cached = seq_len t = torch.arange( self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype ) freqs = torch.outer(t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) def forward(self, x, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] if seq_len > self.max_seq_len_cached: self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) return ( self.cos_cached[:seq_len].to(dtype=x.dtype), self.sin_cached[:seq_len].to(dtype=x.dtype), ) # Copied from transformers.models.llama.modeling_llama.rotate_half def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. Args: q (`torch.Tensor`): The query tensor. k (`torch.Tensor`): The key tensor. cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. position_ids (`torch.Tensor`): The position indices of the tokens corresponding to the query and key tensors. For example, this can be used to pass offsetted position ids when working with a KV-cache. unsqueeze_dim (`int`, *optional*, defaults to 1): The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ cos = cos[position_ids].unsqueeze(unsqueeze_dim) sin = sin[position_ids].unsqueeze(unsqueeze_dim) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed # Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) """ batch, num_key_value_heads, slen, head_dim = hidden_states.shape if n_rep == 1: return hidden_states hidden_states = hidden_states[:, :, None, :, :].expand( batch, num_key_value_heads, n_rep, slen, head_dim ) return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) # Copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral class MixtralAttention(nn.Module): """ Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer and "Generating Long Sequences with Sparse Transformers". """ def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None): super().__init__() self.config = config self.layer_idx = layer_idx if layer_idx is None: logger.warning_once( f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " "when creating this class." ) self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta self.is_causal = True self.attention_dropout = config.attention_dropout if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" f" and `num_heads`: {self.num_heads})." ) self.q_proj = nn.Linear( self.hidden_size, self.num_heads * self.head_dim, bias=False ) self.k_proj = nn.Linear( self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False ) self.v_proj = nn.Linear( self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False ) self.o_proj = nn.Linear( self.num_heads * self.head_dim, self.hidden_size, bias=False ) self.rotary_emb = MixtralRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return ( tensor.view(bsz, seq_len, self.num_heads, self.head_dim) .transpose(1, 2) .contiguous() ) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" ) bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = query_states.view( bsz, q_len, self.num_heads, self.head_dim ).transpose(1, 2) key_states = key_states.view( bsz, q_len, self.num_key_value_heads, self.head_dim ).transpose(1, 2) value_states = value_states.view( bsz, q_len, self.num_key_value_heads, self.head_dim ).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: if self.layer_idx is None: raise ValueError( f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " "with a layer index." ) kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) query_states, key_states = apply_rotary_pos_emb( query_states, key_states, cos, sin, position_ids ) if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models key_states, value_states = past_key_value.update( key_states, value_states, self.layer_idx, cache_kwargs ) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) attn_weights = torch.matmul( query_states, key_states.transpose(2, 3) ) / math.sqrt(self.head_dim) if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): raise ValueError( f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" f" {attn_weights.size()}" ) if attention_mask is not None: if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights + attention_mask # upcast attention to fp32 attn_weights = nn.functional.softmax( attn_weights, dim=-1, dtype=torch.float32 ).to(query_states.dtype) attn_weights = nn.functional.dropout( attn_weights, p=self.attention_dropout, training=self.training ) attn_output = torch.matmul(attn_weights, value_states) if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): raise ValueError( f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" f" {attn_output.size()}" ) attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None return attn_output, attn_weights, past_key_value # fmt: off # 🔍 Modified from DynamicCache class MoECache(Cache): """ Modified from the `DynamicCache`!!! A cache that grows dynamically as more tokens are generated. This cache adds extra support for Attention MoE. It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is `[batch_size, num_heads, seq_len, head_dim]`. """ def __init__(self, num_experts: int) -> None: # 🔍 multi-experts support self.num_experts = num_experts self.key_cache: List[Dict[int, torch.Tensor]] = [{} for _ in range(num_experts)] self.value_cache: List[Dict[int, torch.Tensor]] = [{} for _ in range(num_experts)] self._seen_tokens: List[Dict[int, int]] = [{} for _ in range(num_experts)] # Used in `generate` to keep tally of how many tokens the cache has seen self._seen_tokens_total = 0 # 🔍 the total number of individual tokens that at least one expert has seen, this is for `get_seq_length` globally self.attention_mask_cache: List[Dict[int, torch.BoolTensor]] = [{} for _ in range(num_experts)] # 🔍 this is a new cache for attention mask that records the state of previous tokens def __getitem__(self, layer_idx: int, expert_idx: int) -> Tuple[torch.Tensor, torch.Tensor]: """ Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0][0].shape[2]` to get the sequence length. """ if layer_idx < len(self): if expert_idx < self.num_experts: # 🔍 return (self.key_cache[expert_idx][layer_idx], self.value_cache[expert_idx][layer_idx]) else: # 🔍 raise KeyError(f"Cache only has {self.num_experts} experts, attempted to access expert with index {expert_idx}") else: raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") def __iter__(self): """ Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over keys and values """ for layer_idx in range(len(self)): for expert_idx in range(self.num_experts): # 🔍 if layer_idx in self.key_cache[expert_idx]: yield (self.key_cache[expert_idx][layer_idx], self.value_cache[expert_idx][layer_idx]) def __len__(self): """ Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds to the number of layers in the model. """ all_index_list = [key for i in range(self.num_experts) for key in self.key_cache[i].keys()] if len(all_index_list) == 0: return 0 else: return max(all_index_list) + 1 # 🔍 the maximum layer index among all experts def update( self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int, expert_idx: int, # 🔍 cache_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """ Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. Parameters: key_states (`torch.Tensor`): The new key states to cache. value_states (`torch.Tensor`): The new value states to cache. layer_idx (`int`): The index of the layer to cache the states for. expert_idx (`int`): 🔍 The index of the expert to cache the states for. cache_kwargs (`Dict[str, Any]`, `optional`): Additional arguments for the cache subclass. No additional arguments are used in `MoECache`. Return: A tuple containing the updated key and value states. """ if layer_idx not in self._seen_tokens[expert_idx]: # 🔍 # Update the number of seen tokens self._seen_tokens[expert_idx][layer_idx] = key_states.shape[-2] # Update the cache self.key_cache[expert_idx][layer_idx] = key_states self.value_cache[expert_idx][layer_idx] = value_states else: # 🔍 # Update the number of seen tokens self._seen_tokens[expert_idx][layer_idx] += key_states.shape[-2] # Update the cache self.key_cache[expert_idx][layer_idx] = torch.cat([self.key_cache[expert_idx][layer_idx], key_states], dim=-2) self.value_cache[expert_idx][layer_idx] = torch.cat([self.value_cache[expert_idx][layer_idx], value_states], dim=-2) return self.key_cache[expert_idx][layer_idx], self.value_cache[expert_idx][layer_idx] def add_seen_tokens_total(self, new_token_num: int = 0) -> None: """🔍 Add the number of new tokens to the total number of seen tokens.""" # THIS FUNCTION IS EXCLUSIVE FOR `MoECache`! self._seen_tokens_total += new_token_num def get_seq_length(self, layer_idx: Optional[int] = None, expert_idx: Optional[int] = None) -> Union[List[List[int]], int]: # 🔍 """Returns the sequence length of the cached states. A layer & expert index can be optionally passed.""" if layer_idx is not None and expert_idx is not None: # 🔍 return the length for specific layer & expert if self.num_experts <= expert_idx or layer_idx not in self.key_cache[expert_idx]: # 🔍 return 0 else: return self.key_cache[expert_idx][layer_idx].shape[-2] else: # 🔍 return the total number of individual tokens the cache has seen return self._seen_tokens_total def get_max_length(self) -> Optional[int]: """Returns the maximum sequence length of the cached states. MoECache does not have a maximum length.""" return None def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = None, expert_idx: Optional[int] = None) -> int: """Given the sequence length of the new inputs, returns the usable length of the cache.""" # Cache without size limit -> all cache is usable # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache # length, we will need to evict part of the cache (and thus not all cache is usable) max_length = self.get_max_length() previous_seq_length = self.get_seq_length(layer_idx, expert_idx) # 🔍 if max_length is not None and previous_seq_length + new_seq_length > max_length: return max_length - new_seq_length return previous_seq_length def reorder_cache(self, beam_idx: torch.LongTensor): """Reorders the cache for beam search, given the selected beam indices.""" # TODO: support for beam search print("MoECache, reorder_cache", beam_idx) raise NotImplementedError # for layer_idx in range(len(self.key_cache)): # device = self.key_cache[layer_idx].device # self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) # device = self.value_cache[layer_idx].device # self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: raise NotImplementedError @classmethod def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "MoECache": raise NotImplementedError def update_attention_mask( self, new_attention_mask: torch.BoolTensor, layer_idx: int, expert_idx: int, ) -> torch.BoolTensor: """ 🔍 Updates the attention mask cache with the new `attention_mask`. Parameters: new_attention_mask (`torch.Tensor`): The new key states to cache. layer_idx (`int`): The index of the layer to cache the states for. expert_idx (`int`): The index of the expert to cache the states for. Return: A tensor containing the updated attention_mask. """ # Update the cache if layer_idx not in self.attention_mask_cache[expert_idx]: # 🔍 no attention mask cached, this is the first stroke self.attention_mask_cache[expert_idx][layer_idx] = new_attention_mask else: # 🔍 concatenate along the seq_len dim self.attention_mask_cache[expert_idx][layer_idx] = torch.cat([self.attention_mask_cache[expert_idx][layer_idx], new_attention_mask], dim=-1) return self.attention_mask_cache[expert_idx][layer_idx] # 🔍 Modified from MixtralAttention class MixtralAttentionMoE(MixtralAttention): def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None): super(MixtralAttention, self).__init__() # 🔍 init using nn.Module self.config = config self.layer_idx = layer_idx if layer_idx is None: logger.warning_once( f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " "when creating this class." ) self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta self.is_causal = True self.attention_dropout = config.attention_dropout if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" f" and `num_heads`: {self.num_heads})." ) # 🔍 self.softmax = nn.Softmax(dim=-1) self.top_k_attn = config.top_k_attn self.attn_experts = config.attn_experts self.scale_factor_attn = config.scale_factor_attn self.split_ratio = self.attn_experts // self.num_key_value_heads self.gate = nn.Linear(self.hidden_size, self.attn_experts, bias=False) # 🔍 self.q_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.num_key_value_groups * self.head_dim // self.split_ratio, bias=False) for _ in range(self.attn_experts)]) self.k_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.head_dim, bias=False) for _ in range(self.attn_experts)]) self.v_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.head_dim, bias=False) for _ in range(self.attn_experts)]) self.o_proj = nn.ModuleList([nn.Linear(self.num_key_value_groups * self.head_dim // self.split_ratio, self.hidden_size, bias=config.add_rescale_bias) for _ in range(self.attn_experts)]) # 🔍 (may add bias for rescaling) self.rotary_emb = MixtralRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, # 🔍 This should be the Tensor with shape(bsz, seqlen) that represents the padding mask position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[MoECache] = None, output_attentions: bool = False, use_cache: bool = False, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" ) if past_key_value is not None and not isinstance(past_key_value, MoECache): # 🔍 type check raise TypeError( "`past_key_value` must be a `MoECache` instance for attention MoE!" ) # print("attention_mask", attention_mask, attention_mask.shape) device = hidden_states.device dtype = hidden_states.dtype bsz, q_len, hidden_dim = hidden_states.size() hidden_states = hidden_states.reshape(-1, hidden_dim) # 🔍 flatten the dim # 🔍 topk gating router_logits = self.gate(hidden_states) # (bsz * q_len, num_key_value_heads) scores = F.softmax(router_logits, dim=1, dtype=torch.float) routing_weights, selected_experts = torch.topk(scores, self.top_k_attn, dim=-1) # (bsz * q_len, top_k_attn) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights = routing_weights.to(dtype) # we cast back to the input dtype # 🔍 moe selection final_attn_output = torch.zeros_like(hidden_states).reshape(-1, hidden_dim) # One hot encode the selected experts to create an expert mask # this will be used to easily index which expert is going to be sollicitated expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.attn_experts) # (bsz * q_len, top_k_attn, num_key_value_heads) expert_mask = expert_mask.permute(2, 1, 0) # (num_key_value_heads, top_k_attn, bsz * q_len) # Loop over all available experts in the model and perform the computation on each expert all_attn_weights = [] if output_attentions else None for expert_idx in range(self.attn_experts): # expert_mask[expert_idx]: (top_k_attn, bsz * q_len) # idx: the topk position. (selected_num) # top_x: token index. (selected_num) idx, top_x = torch.nonzero(expert_mask[expert_idx], as_tuple=True) if top_x.shape[0] == 0 and not self.training: # skip during training will lead to asynchrony among different GPUs and blocks the training! if output_attentions: all_attn_weights.append(None) continue # 🔍 Comment (DDZ): This is useless and even lags the speed, so I get it removed. # in torch it is faster to index using lists than torch tensors # top_x_list = top_x.tolist() # idx_list = idx.tolist() # 🔍 get routing info for this expert current_batch_ids = (top_x // q_len) # batch ids for current_state, (selected_num) each_batch_selected_token_num = torch.bincount(current_batch_ids, minlength=bsz) # (bsz) this_q_len = each_batch_selected_token_num.max().item() # 🔍 get the indices of each token in the hidden_state of this expert selection_mask = torch.zeros((bsz * q_len,), device=device, dtype=torch.bool) # the selection mask for this expert (this helps specify the position to put for each token) selection_mask[top_x] = True selection_mask = selection_mask.reshape(bsz, q_len) token_position_indices = torch.cumsum(selection_mask, dim=1) - 1 # the sequence ids of all tokens in the current state, (bsz, q_len) token_position_indices = token_position_indices.flatten() current_seq_ids = token_position_indices[top_x] # sequence ids for current_state, (selected_num) # 🔍 initialize hidden_states for this expert current_state = torch.zeros((bsz, this_q_len, hidden_dim), dtype=dtype, device=device) current_state[current_batch_ids, current_seq_ids] = hidden_states[top_x] # assign tokens sparsely # Normal Attention Forward # ---------------------------------------------- # query_states = self.q_proj[expert_idx](current_state) # 🔍 specify expert key_states = self.k_proj[expert_idx](current_state) # 🔍 specify expert value_states = self.v_proj[expert_idx](current_state) # 🔍 specify expert query_states = query_states.view(bsz, this_q_len, self.num_key_value_groups // self.split_ratio, self.head_dim).transpose(1, 2) # 🔍 q_len -> this_q_len, num_heads -> num_key_value_groups key_states = key_states.view(bsz, this_q_len, 1, self.head_dim).transpose(1, 2) # 🔍 q_len -> this_q_len, num_key_value_heads -> 1 value_states = value_states.view(bsz, this_q_len, 1, self.head_dim).transpose(1, 2) # 🔍 q_len -> this_q_len, num_key_value_heads -> 1 past_key_values_length = 0 kv_seq_len = key_states.shape[-2] if past_key_value is not None: if self.layer_idx is None: raise ValueError( f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " "with a layer index." ) past_key_values_length = past_key_value.get_usable_length(kv_seq_len, self.layer_idx, expert_idx) # 🔍 specify expert index kv_seq_len += past_key_values_length # 🔍 create position_ids for selected tokens current_position_ids = torch.zeros((bsz, this_q_len), device=device, dtype=torch.long) current_position_ids[current_batch_ids, current_seq_ids] = position_ids.expand(bsz, q_len).flatten()[top_x] if top_x.shape[0] > 0: # apply only when there are tokens cos, sin = self.rotary_emb(value_states, seq_len=current_position_ids.max().item() + 1) # 🔍 adjust the seq_len to the maximum possible value query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, current_position_ids) if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, expert_idx, cache_kwargs) # 🔍 specify expert index # repeat k/v heads if n_kv_heads < n_heads # Note (DDZ): here the dim is expanded internally, rather than concat-repeat. (Disable for Attention MoE) # key_states = repeat_kv(key_states, self.num_key_value_groups) # value_states = repeat_kv(value_states, self.num_key_value_groups) attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) # softmax temperature if attn_weights.size() != (bsz, self.num_key_value_groups // self.split_ratio, this_q_len, kv_seq_len): # 🔍 q_len -> this_q_len, num_heads -> num_key_value_groups raise ValueError(f"Attention weights should be of size {(bsz, self.num_key_value_groups // self.split_ratio, this_q_len, kv_seq_len)}, but is {attn_weights.size()}") # 🔍 create `current_attention_mask` with reduced `seq_len` # Notice that the `attention_mask` is passed intact during both training & generation, so we need to adjust the `top_x` by `past_key_values_length`. # However, we don't have the routing information of previous tokens, which makes it impossible to create `current_attention_mask` for previous tokens. # So here we need an extra "attention mask cache" to record the `attention_mask` for previous tokens, and update for new tokens accordingly during generation. current_attention_mask = torch.zeros((bsz, this_q_len), dtype=torch.bool, device=device) if attention_mask is not None: if past_key_values_length > 0: # 🔍 we need to exclude previous tokens previous_seen_tokens_total = past_key_value._seen_tokens_total - q_len temp_attention_mask = attention_mask[:, previous_seen_tokens_total:].flatten() # select along dimension 1 so that we get tokens in this iteration else: temp_attention_mask = attention_mask.flatten() # flatten the dim current_attention_mask[current_batch_ids, current_seq_ids] = temp_attention_mask[top_x].bool() # assign masks sparsely else: current_attention_mask[current_batch_ids, current_seq_ids] = True # assign masks sparsely # print("current_attention_mask", current_attention_mask, current_attention_mask.shape) if past_key_value is not None: # 🔍 we need to update with cached attention mask current_attention_mask = past_key_value.update_attention_mask(current_attention_mask, self.layer_idx, expert_idx) # if self.layer_idx == 0 and expert_idx == 0: # print("current_attention_mask", current_attention_mask.sum(-1), current_attention_mask.shape, current_attention_mask[0]) current_attention_mask = _prepare_4d_causal_attention_mask( current_attention_mask, (bsz, this_q_len), current_state, past_key_values_length, sliding_window=self.config.sliding_window, ) if current_attention_mask.size() != (bsz, 1, this_q_len, kv_seq_len): # 🔍 q_len -> this_q_len raise ValueError(f"Attention mask should be of size {(bsz, 1, this_q_len, kv_seq_len)}, but is {current_attention_mask.size()}") attn_weights = attn_weights + current_attention_mask # 🔍 # print("current_attention_mask", current_attention_mask.shape, current_attention_mask[0]) # upcast attention to fp32 attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) attn_output = torch.matmul(attn_weights, value_states) # if attn_output.size() != (bsz, self.num_key_value_groups // self.split_ratio, this_q_len, self.head_dim): # 🔍 q_len -> this_q_len, num_heads -> num_key_value_groups # raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is {attn_output.size()}") attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, this_q_len, self.num_key_value_groups * self.head_dim // self.split_ratio) # 🔍 q_len -> this_q_len, hidden_size -> num_key_value_groups * head_dim attn_output = self.o_proj[expert_idx](attn_output) # ---------------------------------------------- # # 🔍 select & rescale the outputs by softmax scores attn_output = attn_output[current_batch_ids, current_seq_ids] * (routing_weights[top_x, idx, None] * self.scale_factor_attn) # attn_output = attn_output[current_batch_ids, current_seq_ids] # this line for debug only # 🔍 add to the final outputs final_attn_output.index_add_(0, top_x, attn_output) if output_attentions: all_attn_weights.append(attn_weights) # 🔍 reshape final_attn_output = final_attn_output.reshape(bsz, q_len, hidden_dim) if output_attentions: all_attn_weights = tuple(all_attn_weights) return final_attn_output, all_attn_weights, past_key_value, router_logits # 🔍 return an extra `router_logits` @torch.no_grad() def from_vanilla_attention(attention: MixtralAttention, top_k_attn, scale_factor_attn): # config layer_idx = attention.layer_idx config = attention.config config.top_k_attn = top_k_attn config.scale_factor_attn = scale_factor_attn # init attention_moe = MixtralAttentionMoE(config, layer_idx) split = 1 # split the hidden_size, support split=1 --> 8/2, split=2 --> 16/4, split=4 --> 32/8 # copy weights num_key_value_groups = attention_moe.num_key_value_groups // split head_dim = attention_moe.head_dim for i in range(config.num_key_value_heads * split): indices_q_o = [j for j in range(head_dim * num_key_value_groups * i, head_dim * num_key_value_groups * (i + 1))] indices_k_v = [j for j in range(head_dim * (i // split), head_dim * ((i // split) + 1))] print(i, "indices_q_o", indices_q_o) # print(i, "indices_k_v", indices_k_v) attention_moe.q_proj[i].weight.data = attention.q_proj.weight.data[indices_q_o].clone() attention_moe.k_proj[i].weight.data = attention.k_proj.weight.data[indices_k_v].clone() attention_moe.v_proj[i].weight.data = attention.v_proj.weight.data[indices_k_v].clone() attention_moe.o_proj[i].weight.data = attention.o_proj.weight.data[:, indices_q_o].clone() return attention_moe # fmt: on # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral class MixtralFlashAttention2(MixtralAttention): """ Mixtral flash attention module. This module inherits from `MixtralAttention` as the weights of the module stays untouched. The only required change would be on the forward pass where it needs to correctly call the public API of flash attention and deal with padding tokens in case the input contains any of them. """ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, **kwargs, ): if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" ) # overwrite attention_mask with padding_mask attention_mask = kwargs.pop("padding_mask") bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = query_states.view( bsz, q_len, self.num_heads, self.head_dim ).transpose(1, 2) key_states = key_states.view( bsz, q_len, self.num_key_value_heads, self.head_dim ).transpose(1, 2) value_states = value_states.view( bsz, q_len, self.num_key_value_heads, self.head_dim ).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: if self.layer_idx is None: raise ValueError( f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " "with a layer index." ) kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) # Because the input can be padded, the absolute sequence length depends on the max position id. rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) query_states, key_states = apply_rotary_pos_emb( query_states, key_states, cos, sin, position_ids ) use_sliding_windows = ( _flash_supports_window_size and getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window ) if not _flash_supports_window_size: logger.warning_once( "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" " make sure to upgrade flash-attn library." ) if past_key_value is not None: # Activate slicing cache only if the config has a value `sliding_windows` attribute cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 if ( getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window and cache_has_contents ): slicing_tokens = 1 - self.config.sliding_window past_key = past_key_value[self.layer_idx][0] past_value = past_key_value[self.layer_idx][1] past_key = past_key[:, :, slicing_tokens:, :].contiguous() past_value = past_value[:, :, slicing_tokens:, :].contiguous() if past_key.shape[-2] != self.config.sliding_window - 1: raise ValueError( f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" f" {past_key.shape}" ) if attention_mask is not None: attention_mask = attention_mask[:, slicing_tokens:] attention_mask = torch.cat( [attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1, ) cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models key_states, value_states = past_key_value.update( key_states, value_states, self.layer_idx, cache_kwargs ) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) dropout_rate = 0.0 if not self.training else self.attention_dropout # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. Hence, we need # cast them back in float16 just to be sure everything works as expected. input_dtype = query_states.dtype if input_dtype == torch.float32: # Handle the case where the model is quantized if hasattr(self.config, "_pre_quantization_dtype"): target_dtype = self.config._pre_quantization_dtype else: target_dtype = self.q_proj.weight.dtype logger.warning_once( f"The input hidden states seems to be silently casted in float32, this might be related to" f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" f" {target_dtype}." ) query_states = query_states.to(target_dtype) key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) # Reashape to the expected shape for Flash Attention query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) # print("attention_mask", attention_mask, attention_mask.shape) attn_output = self._flash_attention_forward( query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate, use_sliding_windows=use_sliding_windows, ) attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None return attn_output, attn_weights, past_key_value def _flash_attention_forward( self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None, use_sliding_windows=False, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token first unpad the input, then computes the attention scores and pad the final attention scores. Args: query_states (`torch.Tensor`): Input query states to be passed to Flash Attention API key_states (`torch.Tensor`): Input key states to be passed to Flash Attention API value_states (`torch.Tensor`): Input value states to be passed to Flash Attention API attention_mask (`torch.Tensor`): The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the position of padding tokens and 1 for the position of non-padding tokens. dropout (`int`, *optional*): Attention dropout softmax_scale (`float`, *optional*): The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) use_sliding_windows (`bool`, *optional*): Whether to activate sliding window attention. """ if not self._flash_attn_uses_top_left_mask: causal = self.is_causal else: # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. causal = self.is_causal and query_length != 1 # Contains at least one padding token in the sequence if attention_mask is not None: batch_size = query_states.shape[0] ( query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens, ) = self._upad_input( query_states, key_states, value_states, attention_mask, query_length ) cu_seqlens_q, cu_seqlens_k = cu_seq_lens max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens if not use_sliding_windows: attn_output_unpad = flash_attn_varlen_func( query_states, key_states, value_states, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_in_batch_q, max_seqlen_k=max_seqlen_in_batch_k, dropout_p=dropout, softmax_scale=softmax_scale, causal=causal, ) else: attn_output_unpad = flash_attn_varlen_func( query_states, key_states, value_states, cu_seqlens_q=cu_seqlens_q, cu_seqlens_k=cu_seqlens_k, max_seqlen_q=max_seqlen_in_batch_q, max_seqlen_k=max_seqlen_in_batch_k, dropout_p=dropout, softmax_scale=softmax_scale, causal=causal, window_size=( self.config.sliding_window, self.config.sliding_window, ), ) attn_output = pad_input( attn_output_unpad, indices_q, batch_size, query_length ) else: if not use_sliding_windows: attn_output = flash_attn_func( query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, ) else: attn_output = flash_attn_func( query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, window_size=( self.config.sliding_window, self.config.sliding_window, ), ) return attn_output def _upad_input( self, query_layer, key_layer, value_layer, attention_mask, query_length ): batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape # On the first iteration we need to properly re-create the padding mask # by slicing it on the proper place if kv_seq_len != attention_mask.shape[-1]: attention_mask_num_tokens = attention_mask.shape[-1] attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) key_layer = index_first_axis( key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k ) value_layer = index_first_axis( value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k ) if query_length == kv_seq_len: query_layer = index_first_axis( query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k, ) cu_seqlens_q = cu_seqlens_k max_seqlen_in_batch_q = max_seqlen_in_batch_k indices_q = indices_k elif query_length == 1: max_seqlen_in_batch_q = 1 cu_seqlens_q = torch.arange( batch_size + 1, dtype=torch.int32, device=query_layer.device ) # There is a memcpy here, that is very bad. indices_q = cu_seqlens_q[:-1] query_layer = query_layer.squeeze(1) else: # The -q_len: slice assumes left padding. attention_mask = attention_mask[:, -query_length:] query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( query_layer, attention_mask ) return ( query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k), (max_seqlen_in_batch_q, max_seqlen_in_batch_k), ) class MixtralFlashAttention2MoE(MixtralFlashAttention2): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.top_k_attn = self.config.top_k_attn self.attn_experts = self.config.attn_experts self.scale_factor_attn = self.config.scale_factor_attn self.split_ratio = self.attn_experts // self.num_key_value_heads self.gate = nn.Linear(self.hidden_size, self.attn_experts, bias=False) self.q_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.num_key_value_groups * self.head_dim // self.split_ratio, bias=False) for _ in range(self.attn_experts)]) self.k_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.head_dim, bias=False) for _ in range(self.attn_experts)]) self.v_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.head_dim, bias=False) for _ in range(self.attn_experts)]) self.o_proj = nn.ModuleList([nn.Linear(self.num_key_value_groups * self.head_dim // self.split_ratio, self.hidden_size, bias=self.config.add_rescale_bias) for _ in range(self.attn_experts)]) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, **kwargs, ): if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" ) # overwrite attention_mask with padding_mask # attention_mask = kwargs.pop("padding_mask") if past_key_value is not None and not isinstance(past_key_value, MoECache): # 🔍 type check raise TypeError( "`past_key_value` must be a `MoECache` instance for attention MoE!" ) bsz, q_len, hidden_dim = hidden_states.size() device = hidden_states.device dtype = hidden_states.dtype hidden_states = hidden_states.reshape(-1, hidden_dim) # gate compute router_logits = self.gate(hidden_states) router_scores = F.softmax(router_logits, dim=1, dtype=torch.float) routing_weights, selected_experts = torch.topk(router_scores, self.top_k_attn, dim=-1) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights = routing_weights.to(dtype) final_attn_output = torch.zeros_like(hidden_states).reshape(-1, hidden_dim) expert_mask = F.one_hot(selected_experts, num_classes=self.num_heads).permute(2, 1, 0) all_attn_weights = [] if output_attentions else None for expert_idx in range(self.attn_experts): idx, top_x = torch.nonzero(expert_mask[expert_idx], as_tuple=True) # top_x_list = top_x.tolist() # idx_list = idx.tolist() if top_x.shape[0] == 0 and not self.training: # skip during training will lead to asynchrony among different GPUs and blocks the training! if output_attentions: all_attn_weights.append(None) continue # create position_ids for selected tokens current_batch_ids = (top_x // q_len) each_batch_selected_token_num = torch.bincount(current_batch_ids, minlength=bsz) # (bsz) this_q_len = each_batch_selected_token_num.max().item() selection_mask = torch.zeros((bsz * q_len,), device=device, dtype=torch.bool) selection_mask[top_x] = True selection_mask = selection_mask.reshape(bsz, q_len) token_position_indices = torch.cumsum(selection_mask, dim=1) - 1 token_position_indices = token_position_indices.flatten() current_seq_ids = token_position_indices[top_x] # 🔍 initialize hidden_states for this expert current_state = torch.zeros((bsz, this_q_len, hidden_dim), dtype=dtype, device=device) current_state[current_batch_ids, current_seq_ids] = hidden_states[top_x] # assign tokens sparsely # for attention forward # expert_inputs = viewed_hidden_states[None, top_x_list].reshape(-1, self.hidden_size) query_states = self.q_proj[expert_idx](current_state) key_states = self.k_proj[expert_idx](current_state) value_states = self.v_proj[expert_idx](current_state) # seq_len = query_states.numel() // (bsz * self.num_key_value_groups * self.head_dim) query_states = query_states.view(bsz, -1, self.num_key_value_groups // self.split_ratio, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, -1, 1, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, -1, 1, self.head_dim).transpose(1, 2) # for moe kv cache past_key_values_length = 0 kv_seq_len = key_states.shape[-2] if past_key_value is not None: if self.layer_idx is None: raise ValueError( f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " "with a layer index." ) past_key_values_length = past_key_value.get_usable_length(kv_seq_len, self.layer_idx, expert_idx) # 🔍 specify expert index kv_seq_len += past_key_values_length current_position_ids = torch.zeros((bsz, this_q_len), device=hidden_states.device, dtype=torch.long) current_position_ids[current_batch_ids, current_seq_ids] = position_ids.expand(bsz, q_len).flatten()[top_x] if top_x.shape[0] > 0: # apply only when there are tokens cos, sin = self.rotary_emb(value_states, seq_len=current_position_ids.max().item() + 1) # 🔍 adjust the seq_len to the maximum possible value query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, current_position_ids) if past_key_value is not None: cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, expert_idx, cache_kwargs) # 🔍 specify expert index # print("attention_mask", attention_mask.shape, attention_mask) # for current attention mask ''' current_attention_mask = torch.zeros((bsz, this_q_len), dtype=torch.bool, device=device) if attention_mask is not None: if past_key_values_length > 0: # 🔍 we need to exclude previous tokens previous_seen_tokens_total = past_key_value._seen_tokens_total - q_len temp_attention_mask = attention_mask[:, previous_seen_tokens_total:].flatten() # select along dimension 1 so that we get tokens in this iteration else: temp_attention_mask = attention_mask.flatten() # flatten the dim current_attention_mask[current_batch_ids, current_seq_ids] = temp_attention_mask[top_x] # bug here !!! else: current_attention_mask[current_batch_ids, current_seq_ids] = True # assign masks sparsely if past_key_value is not None: # 🔍 we need to update with cached attention mask current_attention_mask = past_key_value.update_attention_mask(current_attention_mask, self.layer_idx, expert_idx) current_attention_mask = _prepare_4d_causal_attention_mask( current_attention_mask, (bsz, this_q_len), current_state, past_key_values_length, sliding_window=self.config.sliding_window, ) if current_attention_mask.size() != (bsz, 1, this_q_len, kv_seq_len): # 🔍 q_len -> this_q_len raise ValueError(f"Attention mask should be of size {(bsz, 1, this_q_len, kv_seq_len)}, but is {current_attention_mask.size()}") ''' # for sliding window use_sliding_windows = ( _flash_supports_window_size and getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window ) if not _flash_supports_window_size: logger.warning_once( "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" " make sure to upgrade flash-attn library." ) # wait for change! sliding_window=4096 if past_key_value is not None: # Activate slicing cache only if the config has a value `sliding_windows` attribute cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 if ( getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window and cache_has_contents ): slicing_tokens = 1 - self.config.sliding_window past_key = past_key_value[self.layer_idx][0] past_value = past_key_value[self.layer_idx][1] past_key = past_key[:, :, slicing_tokens:, :].contiguous() past_value = past_value[:, :, slicing_tokens:, :].contiguous() if past_key.shape[-2] != self.config.sliding_window - 1: raise ValueError( f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" f" {past_key.shape}" ) if attention_mask is not None: attention_mask = attention_mask[:, slicing_tokens:] attention_mask = torch.cat( [attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1, ) cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models key_states, value_states = past_key_value.update( key_states, value_states, self.layer_idx, cache_kwargs ) # for input dtype input_dtype = query_states.dtype if input_dtype == torch.float32: # Handle the case where the model is quantized if hasattr(self.config, "_pre_quantization_dtype"): target_dtype = self.config._pre_quantization_dtype else: target_dtype = self.q_proj[0].weight.dtype logger.warning_once( f"The input hidden states seems to be silently casted in float32, this might be related to" f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" f" {target_dtype}." ) query_states = query_states.to(target_dtype) key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) dropout_rate = 0.0 if not self.training else self.attention_dropout repeat_num = query_states.shape[1] key_states = repeat_kv(key_states, repeat_num) value_states = repeat_kv(value_states, repeat_num) # print("repeat_num", repeat_num) # print("query_states shape", query_states.shape, key_states.shape, value_states.shape) # Reashape to the expected shape for Flash Attention query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) attn_output = self._flash_attention_forward( query_states, key_states, value_states, attention_mask, this_q_len, dropout=dropout_rate, use_sliding_windows=use_sliding_windows, ) attn_output = attn_output.reshape(bsz, this_q_len, self.num_key_value_groups * self.head_dim // self.split_ratio).contiguous() attn_output = self.o_proj[expert_idx](attn_output) attn_output = attn_output[current_batch_ids, current_seq_ids] * (routing_weights[top_x, idx, None] * self.scale_factor_attn) final_attn_output.index_add_(0, top_x, attn_output) final_attn_output = final_attn_output.reshape(bsz, q_len, hidden_dim) if not output_attentions: attn_weights = None return final_attn_output, attn_weights, past_key_value, router_logits # 🔍 return an extra `router_logits` class MixtralFlashAttention2MoE_zt(MixtralFlashAttention2): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.top_k_attn = self.config.top_k_attn self.scale_factor_attn = self.config.scale_factor_attn # self.num_heads # self.head_dim # self.num_key_value_heads # self.num_key_value_groups # total number of experts assert self.top_k_attn <= self.num_key_value_groups # assert self.top_k_attn % self.num_key_value_heads == 0 self.attn_hsz = self.hidden_size // self.num_key_value_groups * self.top_k_attn self.kv_repeat_num = self.attn_hsz // (self.num_key_value_heads * self.head_dim) self.simulated_attn_head_num = self.attn_hsz // self.head_dim assert self.attn_hsz % (self.num_key_value_heads * self.head_dim) == 0 assert self.simulated_attn_head_num == self.num_heads * (self.top_k_attn / self.num_key_value_groups) assert self.kv_repeat_num * self.num_key_value_heads == self.simulated_attn_head_num self.gate = nn.Linear(self.hidden_size, self.num_key_value_groups, bias=False) # tzhu: there are self.num_key_value_groups experts # each expert has a size of self.attn_hsz self.q_proj = nn.ModuleList( [nn.Linear(self.hidden_size, self.attn_hsz) for _ in range(self.num_key_value_groups)] ) self.o_proj = nn.ModuleList( [nn.Linear(self.attn_hsz, self.hidden_size) for _ in range(self.num_key_value_groups)] ) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, **kwargs, ): if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" ) # overwrite attention_mask with padding_mask attention_mask = kwargs.pop("padding_mask") bsz, q_len, _ = hidden_states.size() key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) # tzhu: attn-moe on q_proj viewed_hidden_states = hidden_states.view(bsz * q_len, self.hidden_size) # router router_logits = self.gate(viewed_hidden_states) router_scores = F.softmax(router_logits, dim=-1, dtype=torch.float) routing_weights, selected_experts = torch.topk(router_scores, self.top_k_attn, dim=-1) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights = routing_weights.to(hidden_states.dtype) query_states = torch.zeros( (bsz * q_len, self.attn_hsz), dtype=hidden_states.dtype, device=hidden_states.device, ) # expert_mask: (num_experts, top_k_attn, bsz * q_len) expert_mask = F.one_hot(selected_experts, num_classes=self.num_heads).permute(2, 1, 0) for expert_idx in range(self.num_key_value_groups): expert_layer = self.q_proj[expert_idx] idx, top_x = torch.where(expert_mask[expert_idx]) top_x_list = top_x.tolist() idx_list = idx.tolist() expert_inputs = viewed_hidden_states[None, top_x_list].reshape(-1, self.hidden_size) # inputs (-1, hidden_size) -> outputs (-1, attn_hsz) expert_outs = expert_layer(expert_inputs) * routing_weights[top_x_list, idx_list, None] * self.scale_factor_attn query_states.index_add_(0, top_x, expert_outs.to(query_states.dtype)) query_states = query_states.view(bsz, q_len, self.attn_hsz) # query_states = query_states.view( # bsz, q_len, self.num_heads, self.simulated_attn_head_num # ).transpose(1, 2) query_states = query_states.view( bsz, q_len, self.simulated_attn_head_num, self.head_dim ).transpose(1, 2) key_states = key_states.view( bsz, q_len, self.num_key_value_heads, self.head_dim ).transpose(1, 2) value_states = value_states.view( bsz, q_len, self.num_key_value_heads, self.head_dim ).transpose(1, 2) kv_seq_len = key_states.shape[-2] if past_key_value is not None: if self.layer_idx is None: raise ValueError( f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " "with a layer index." ) kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) # Because the input can be padded, the absolute sequence length depends on the max position id. rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len) query_states, key_states = apply_rotary_pos_emb( query_states, key_states, cos, sin, position_ids ) use_sliding_windows = ( _flash_supports_window_size and getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window ) if not _flash_supports_window_size: logger.warning_once( "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" " make sure to upgrade flash-attn library." ) if past_key_value is not None: # Activate slicing cache only if the config has a value `sliding_windows` attribute cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 if ( getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window and cache_has_contents ): slicing_tokens = 1 - self.config.sliding_window past_key = past_key_value[self.layer_idx][0] past_value = past_key_value[self.layer_idx][1] past_key = past_key[:, :, slicing_tokens:, :].contiguous() past_value = past_value[:, :, slicing_tokens:, :].contiguous() if past_key.shape[-2] != self.config.sliding_window - 1: raise ValueError( f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" f" {past_key.shape}" ) if attention_mask is not None: attention_mask = attention_mask[:, slicing_tokens:] attention_mask = torch.cat( [attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1, ) cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models key_states, value_states = past_key_value.update( key_states, value_states, self.layer_idx, cache_kwargs ) # repeat k/v heads if n_kv_heads < n_heads key_states = repeat_kv(key_states, self.kv_repeat_num) value_states = repeat_kv(value_states, self.kv_repeat_num) dropout_rate = 0.0 if not self.training else self.attention_dropout # In PEFT, usually we cast the layer norms in float32 for training stability reasons # therefore the input hidden states gets silently casted in float32. Hence, we need # cast them back in float16 just to be sure everything works as expected. input_dtype = query_states.dtype if input_dtype == torch.float32: # Handle the case where the model is quantized if hasattr(self.config, "_pre_quantization_dtype"): target_dtype = self.config._pre_quantization_dtype else: target_dtype = self.q_proj.weight.dtype logger.warning_once( f"The input hidden states seems to be silently casted in float32, this might be related to" f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" f" {target_dtype}." ) query_states = query_states.to(target_dtype) key_states = key_states.to(target_dtype) value_states = value_states.to(target_dtype) # Reashape to the expected shape for Flash Attention query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) attn_output = self._flash_attention_forward( query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate, use_sliding_windows=use_sliding_windows, ) attn_output = attn_output.reshape(bsz * q_len, self.attn_hsz).contiguous() final_attn_output = torch.zeros( (bsz * q_len, self.hidden_size), dtype=hidden_states.dtype, device=hidden_states.device, ) for expert_idx in range(self.num_key_value_groups): expert_layer = self.o_proj[expert_idx] idx, top_x = torch.where(expert_mask[expert_idx]) top_x_list = top_x.tolist() idx_list = idx.tolist() expert_inputs = attn_output[None, top_x_list].reshape(-1, self.attn_hsz) expert_outs = expert_layer(expert_inputs) * routing_weights[top_x_list, idx_list, None] * self.scale_factor_attn final_attn_output.index_add_(0, top_x, expert_outs.to(final_attn_output.dtype)) final_attn_output = final_attn_output.view(bsz, q_len, self.hidden_size) if not output_attentions: attn_weights = None return final_attn_output, attn_weights, past_key_value, router_logits @torch.no_grad() def from_vanilla_attention(attention: MixtralAttention, top_k_attn, scale_factor_attn): # config layer_idx = attention.layer_idx config = attention.config config.top_k_attn = top_k_attn config.scale_factor_attn = scale_factor_attn # init attention_moe = MixtralFlashAttention2MoE(config, layer_idx) # copy weights num_key_value_groups = attention_moe.num_key_value_groups head_dim = attention_moe.head_dim for i in range(num_key_value_groups): indices_q_o = [] for j in range(attention_moe.num_key_value_heads): k = i + j * num_key_value_groups indices_q_o.extend( list(range(k * head_dim, (k + 1) * head_dim)) ) print(i, "indices_q_o", indices_q_o) attention_moe.q_proj[i].weight.data = attention.q_proj.weight.data[indices_q_o].clone() attention_moe.o_proj[i].weight.data = attention.o_proj.weight.data[:, indices_q_o].clone() return attention_moe class MixtralBLockSparseTop2MLP(nn.Module): def __init__(self, config: MixtralConfig, ffn_dim, add_rescale_bias=False): # 🔍 super().__init__() self.ffn_dim = ffn_dim # 🔍 self.hidden_dim = config.hidden_size self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) # gate self.w2 = nn.Linear( self.ffn_dim, self.hidden_dim, bias=add_rescale_bias ) # 🔍 down (may add bias for rescaling) self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) # up self.act_fn = ACT2FN[config.hidden_act] def forward(self, hidden_states): current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3( hidden_states ) current_hidden_states = self.w2(current_hidden_states) return current_hidden_states MISTRAL_ATTENTION_CLASSES = { "eager": MixtralAttention, "flash_attention_2": MixtralFlashAttention2, } # 🔍 MISTRAL_ATTENTION_MOE_CLASSES = { "eager": MixtralAttentionMoE, "flash_attention_2": MixtralFlashAttention2MoE, } class SimplifiedSparseGLU(nn.Module): def __init__(self, args: MegablocksArguments): super().__init__() self.args = args if args.bf16: torch_dtype = torch.bfloat16 elif args.fp16: torch_dtype = torch.float16 else: torch_dtype = None # gate self.w1 = nn.Parameter( torch.empty( args.ffn_hidden_size * args.moe_num_experts, args.hidden_size, dtype=torch_dtype, ) ) # down self.w2 = nn.Parameter( torch.empty( args.ffn_hidden_size * args.moe_num_experts, args.hidden_size, dtype=torch_dtype, ) ) # up self.v1 = nn.Parameter( torch.empty( args.ffn_hidden_size * args.moe_num_experts, args.hidden_size, dtype=torch_dtype, ) ) self.act_fn = args.activation_fn def forward(self, x, topo): if self.args.memory_optimized_mlp: raise NotImplementedError( "Memory optimized implementation not yet supported with GLU with sparse kernels." ) # TODO (tzhu): test if OOM comes from dtensor conversion # TODO (tzhu): return x directly to see if it still encounters OOM # w1, v1, w2 = ( # resolve_dtensor(self.w1), # resolve_dtensor(self.v1), # resolve_dtensor(self.w2), # ) # Compute the GLU. x1 = stk.ops.sdd(x, self.w1.t(), topo) x2 = stk.ops.sdd(x, self.v1.t(), topo) activation_fn_out = act_fn(x1, self.act_fn) x1 = stk.ops.mul(activation_fn_out, x2) return stk.ops.dsd(x1, self.w2) class SimplifiedGroupedSparseGLU(SimplifiedSparseGLU): def forward(self, x, tokens_per_expert): batch_sizes = tokens_per_expert.cpu().to(torch.long) # w1, v1, w2 = ( # resolve_dtensor(self.w1), # resolve_dtensor(self.v1), # resolve_dtensor(self.w2), # ) # Re-shape the weights for the grouped GEMMs. # ne = mpu.experts_per_rank(self.args) # w1 = self.w1.view(ne, -1, self.args.hidden_size) # v1 = self.v1.view(ne, -1, self.args.hidden_size) # w2 = self.w2.view(ne, -1, self.args.hidden_size) ne = self.args.moe_num_experts w1 = self.w1.view(ne, -1, self.args.hidden_size) v1 = self.v1.view(ne, -1, self.args.hidden_size) w2 = self.w2.view(ne, -1, self.args.hidden_size) if self.args.memory_optimized_mlp: return memory_optimized_grouped_glu( x, w1, v1, w2, batch_sizes, self.args.quantize_inputs_num_bits, self.args.quantize_rematerialize_num_bits, self.args.activation_fn, ) # Compute the MLP. x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True) x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True) x1 = self.act_fn(x1) * x2 return gg.ops.gmm(x1, w2, batch_sizes) _REGISTRY["simplified_glu"] = { "grouped": SimplifiedGroupedSparseGLU, "sparse": SimplifiedSparseGLU, } class SimplifiedParallelDroplessMLP(ParallelDroplessMLP): def forward(self, x, expert_weights, top_experts): in_shape = x.size() # Compute the experts. x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts) x = x.view(in_shape) if self.bias is not None: if self.args.return_bias: return x, self.bias return x + self.bias return x class MixtralSparseMoeBlock(nn.Module): """ This implementation is strictly equivalent to standard MoE with full capacity (no dropped tokens). It's faster since it formulates MoE operations in terms of block-sparse operations to accomodate imbalanced assignments of tokens to experts, whereas standard MoE either (1) drop tokens at the cost of reduced performance or (2) set capacity factor to number of experts and thus waste computation and memory on padding. """ def __init__(self, config): super().__init__() self.hidden_dim = config.hidden_size self.ffn_dim = config.intermediate_size self.num_experts = config.num_local_experts self.top_k = config.num_experts_per_tok # specialized for llama-moe-v2 self.scale_factor = config.scale_factor self.moe_type = config.moe_type # gating self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False) if self.moe_type == "modulelist": self.experts = nn.ModuleList( [ MixtralBLockSparseTop2MLP( config, config.intermediate_size, add_rescale_bias=config.add_rescale_bias, ) for _ in range(self.num_experts) ] # 🔍 ) elif self.moe_type == "megablocks": if config.add_rescale_bias: raise NotImplementedError( "RescaleBias not yet supported with megablocks." ) is_fp16 = self.gate.weight.dtype == torch.float16 is_bf16 = self.gate.weight.dtype == torch.bfloat16 args = MegablocksArguments( hidden_size=self.hidden_dim, ffn_hidden_size=self.ffn_dim, moe_num_experts=self.num_experts, moe_top_k=self.top_k, activation_fn={"silu": F.silu}[config.hidden_act], mlp_type="simplified_glu", mlp_impl="sparse", memory_optimized_mlp=False, bias=False, fp16=is_fp16, bf16=is_bf16, ) self.experts = SimplifiedParallelDroplessMLP(args) elif self.moe_type == "scattermoe": if config.add_rescale_bias: raise NotImplementedError( "RescaleBias not yet supported with scattermoe." ) self.experts = scattermoe.mlp.GLUMLP( input_size=self.hidden_dim, hidden_size=self.ffn_dim, num_experts=self.num_experts, top_k=self.top_k, activation={"silu": F.silu}[config.hidden_act], ) else: raise NotImplementedError(f"Unsupported moe_type: {self.moe_type}") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: """ """ batch_size, sequence_length, hidden_dim = hidden_states.shape hidden_states = hidden_states.view(-1, hidden_dim) # router_logits: (batch * sequence_length, n_experts) router_logits = self.gate(hidden_states) scores = F.softmax(router_logits, dim=1, dtype=torch.float) routing_weights, selected_experts = torch.topk(scores, self.top_k, dim=-1) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) # we cast back to the input dtype routing_weights = routing_weights.to(hidden_states.dtype) if self.moe_type == "megablocks": final_hidden_states = self.experts( hidden_states, routing_weights, selected_experts ) elif self.moe_type == "scattermoe": final_hidden_states = self.experts( hidden_states, routing_weights, selected_experts ) else: final_hidden_states = torch.zeros( (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device, ) # One hot encode the selected experts to create an expert mask # this will be used to easily index which expert is going to be sollicitated expert_mask = torch.nn.functional.one_hot( selected_experts, num_classes=self.num_experts ).permute(2, 1, 0) # Loop over all available experts in the model and perform the computation on each expert for expert_idx in range(self.num_experts): expert_layer = self.experts[expert_idx] idx, top_x = torch.where(expert_mask[expert_idx]) if ( top_x.shape[0] == 0 and not self.training ): # skip during training will lead to asynchrony among different GPUs and blocks the training! continue # in torch it is faster to index using lists than torch tensors top_x_list = top_x.tolist() idx_list = idx.tolist() # Index the correct hidden states and compute the expert hidden state for # the current expert. We need to make sure to multiply the output hidden # states by `routing_weights` on the corresponding tokens (top-1 and top-2) current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim) current_hidden_states = expert_layer(current_state) * ( routing_weights[top_x_list, idx_list, None] * self.scale_factor ) # However `index_add_` only support torch tensors for indexing so we'll use # the `top_x` tensor here. final_hidden_states.index_add_( 0, top_x, current_hidden_states.to(hidden_states.dtype) ) final_hidden_states = final_hidden_states.reshape( batch_size, sequence_length, hidden_dim ) return final_hidden_states, router_logits class MixtralDecoderLayer(nn.Module): def __init__(self, config: MixtralConfig, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size # 🔍 self.is_moe = (layer_idx >= config.num_moe_contract_layers) and ( layer_idx < config.num_hidden_layers - config.num_moe_contract_layers ) self.use_attn_moe = config.use_attn_moe if self.use_attn_moe: attn_class = MISTRAL_ATTENTION_MOE_CLASSES[config._attn_implementation] else: attn_class = MISTRAL_ATTENTION_CLASSES[config._attn_implementation] self.self_attn = attn_class(config, layer_idx) if self.is_moe: self.block_sparse_moe = MixtralSparseMoeBlock(config) self.mlp_residual = ( MixtralBLockSparseTop2MLP(config, config.intermediate_size_residual) if config.intermediate_size_residual is not None else None ) else: self.block_sparse_moe = MixtralBLockSparseTop2MLP( config, config.intermediate_size * config.num_local_experts ) self.mlp_residual = None self.input_layernorm = MixtralRMSNorm( config.hidden_size, eps=config.rms_norm_eps ) self.post_attention_layernorm = MixtralRMSNorm( config.hidden_size, eps=config.rms_norm_eps ) def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: Optional[bool] = False, output_router_logits: Optional[bool] = False, use_cache: Optional[bool] = False, **kwargs, ) -> Tuple[ torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] ]: if "padding_mask" in kwargs: warnings.warn( "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" ) """ Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`, *optional*): attention mask of size `(batch, sequence_length)` where padding elements are indicated by 0. past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_router_logits (`bool`, *optional*): Whether or not to return the logits of all the routers. They are useful for computing the router loss, and should not be returned during inference. use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). """ residual = hidden_states hidden_states = self.input_layernorm(hidden_states) # 🔍 Self Attention if self.use_attn_moe: ( hidden_states, self_attn_weights, present_key_value, attn_router_logits, ) = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, ) else: hidden_states, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, ) attn_router_logits = None hidden_states = residual + hidden_states # Fully Connected residual = hidden_states hidden_states_input = self.post_attention_layernorm(hidden_states) # 🔍 if self.is_moe: hidden_states, router_logits = self.block_sparse_moe(hidden_states_input) else: hidden_states = self.block_sparse_moe(hidden_states_input) router_logits = None if self.mlp_residual is not None: hidden_states += self.mlp_residual(hidden_states_input) # hidden_states = residual + hidden_states outputs = (hidden_states,) if output_attentions: outputs += (self_attn_weights,) if use_cache: outputs += (present_key_value,) if output_router_logits: outputs += (router_logits, attn_router_logits) # 🔍 return outputs # Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->Mixtral class MixtralPreTrainedModel(PreTrainedModel): config_class = MixtralConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["MixtralDecoderLayer"] _skip_keys_device_placement = "past_key_values" _supports_flash_attn_2 = True _supports_cache_class = True def _init_weights(self, module): std = self.config.initializer_range if isinstance(module, nn.Linear): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=std) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() # Copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral class MixtralModel(MixtralPreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`] Args: config: MixtralConfig """ def __init__(self, config: MixtralConfig): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = nn.Embedding( config.vocab_size, config.hidden_size, self.padding_idx ) self.layers = nn.ModuleList( [ MixtralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers) ] ) self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" self.norm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.embed_tokens def set_input_embeddings(self, value): self.embed_tokens = value # Ignore copy def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_router_logits: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, MoeModelOutputWithPast]: output_attentions = ( output_attentions if output_attentions is not None else self.config.output_attentions ) output_router_logits = ( output_router_logits if output_router_logits is not None else self.config.output_router_logits ) output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = ( return_dict if return_dict is not None else self.config.use_return_dict ) # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: raise ValueError( "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" ) elif input_ids is not None: batch_size, seq_length = input_ids.shape elif inputs_embeds is not None: batch_size, seq_length, _ = inputs_embeds.shape else: raise ValueError( "You have to specify either decoder_input_ids or decoder_inputs_embeds" ) past_key_values_length = 0 if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." ) use_cache = False if use_cache: use_legacy_cache = not isinstance(past_key_values, Cache) if use_legacy_cache: if self.config.use_attn_moe: # 🔍 past_key_values = MoECache.from_legacy_cache(past_key_values) else: # 🔍 past_key_values = DynamicCache.from_legacy_cache(past_key_values) past_key_values_length = past_key_values.get_usable_length(seq_length) # 🔍 add total seen tokens, this is VERY important for getting correct `past_key_values_length`! if self.config.use_attn_moe: past_key_values.add_seen_tokens_total(seq_length) if position_ids is None: device = input_ids.device if input_ids is not None else inputs_embeds.device position_ids = torch.arange( past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device, ) position_ids = position_ids.unsqueeze(0).view(-1, seq_length) else: position_ids = position_ids.view(-1, seq_length).long() if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) if attention_mask is not None and self._use_flash_attention_2 and use_cache: is_padding_right = attention_mask[:, -1].sum().item() != batch_size if is_padding_right: raise ValueError( "You are attempting to perform batched generation with padding_side='right'" " this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to " " call `tokenizer.padding_side = 'left'` before tokenizing the input. " ) if ( self._use_flash_attention_2 or self.config.use_attn_moe ): # 🔍 added special case for attention MoE # 2d mask is passed through the layers attention_mask = ( attention_mask if (attention_mask is not None and 0 in attention_mask) else None ) else: # 4d mask is passed through the layers attention_mask = _prepare_4d_causal_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length, sliding_window=self.config.sliding_window, ) # print("attention_mask" , attention_mask) hidden_states = inputs_embeds # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None all_router_logits = () if output_router_logits else None all_attn_router_logits = () if output_router_logits else None # 🔍 next_decoder_cache = None for decoder_layer in self.layers: if output_hidden_states: all_hidden_states += (hidden_states,) if self.gradient_checkpointing and self.training: def create_custom_forward(module): def custom_forward(*inputs): # None for past_key_value return module(*inputs) return custom_forward layer_outputs: tuple = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, position_ids, past_key_values, output_attentions, output_router_logits, use_cache, ) else: layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, position_ids=position_ids, past_key_value=past_key_values, output_attentions=output_attentions, output_router_logits=output_router_logits, use_cache=use_cache, ) hidden_states = layer_outputs[0] if use_cache: next_decoder_cache = layer_outputs[2 if output_attentions else 1] if output_attentions: all_self_attns += (layer_outputs[1],) if output_router_logits: all_router_logits += (layer_outputs[-2],) all_attn_router_logits += (layer_outputs[-1],) hidden_states = self.norm(hidden_states) # add hidden states from the last decoder layer if output_hidden_states: all_hidden_states += (hidden_states,) next_cache = None if use_cache: next_cache = ( next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache ) if not return_dict: return tuple( v for v in [ hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits, ] if v is not None ) return MoeModelOutputWithPast( last_hidden_state=hidden_states, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns, router_logits=all_router_logits, attn_router_logits=all_attn_router_logits, # 🔍 ) class MixtralForCausalLM(MixtralPreTrainedModel): _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): super().__init__(config) self.model = MixtralModel(config) self.vocab_size = config.vocab_size self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) self.router_aux_loss_coef = config.router_aux_loss_coef self.num_experts = config.num_local_experts self.num_experts_per_tok = config.num_experts_per_tok # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.model.embed_tokens def set_input_embeddings(self, value): self.model.embed_tokens = value def get_output_embeddings(self): return self.lm_head def set_output_embeddings(self, new_embeddings): self.lm_head = new_embeddings def set_decoder(self, decoder): self.model = decoder def get_decoder(self): return self.model def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, MixtralModel): module.gradient_checkpointing = value # Ignore copy def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_router_logits: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: output_attentions = ( output_attentions if output_attentions is not None else self.config.output_attentions ) output_router_logits = ( output_router_logits if output_router_logits is not None else self.config.output_router_logits ) output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = ( return_dict if return_dict is not None else self.config.use_return_dict ) # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs: MoeModelOutputWithPast = self.model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, output_router_logits=output_router_logits, return_dict=return_dict, ) hidden_states = outputs[0] logits = self.lm_head(hidden_states) logits = logits.float() loss = None if labels is not None: # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() shift_logits = shift_logits.view(-1, self.config.vocab_size) shift_labels = shift_labels.view(-1) # Enable model parallelism shift_labels = shift_labels.to(shift_logits.device) loss = loss_fct(shift_logits, shift_labels) # print("MixtralForCausalLM, cross entropy loss", loss) aux_loss = None if output_router_logits: valid_router_logits = tuple( logits for logits in (outputs.router_logits if return_dict else outputs[-2]) if logits is not None ) aux_loss = load_balancing_loss_func( valid_router_logits, self.num_experts, self.num_experts_per_tok, use_layer_wise_balance=self.config.use_layer_wise_balance, # ✨ ) if labels is not None: loss += self.router_aux_loss_coef * aux_loss # loss_mlp = self.router_aux_loss_coef * aux_loss # loss = loss + loss_mlp # print("MixtralForCausalLM, mlp aux_loss", loss_mlp) # 🔍 for Attention MoE ################################# valid_attn_router_logits = tuple( logits for logits in ( outputs.attn_router_logits if return_dict else outputs[-1] ) if logits is not None ) if len(valid_attn_router_logits) > 0: # exist logits that is not None attn_aux_loss = load_balancing_loss_func( valid_attn_router_logits, self.config.attn_experts, self.config.top_k_attn, use_layer_wise_balance=self.config.use_layer_wise_balance, # ✨ ) if labels is not None: loss += self.router_aux_loss_coef * attn_aux_loss # loss_attn = self.router_aux_loss_coef * attn_aux_loss # loss = loss + loss_attn # print("MixtralForCausalLM, attn aux_loss", loss_attn) ################################# if not return_dict: output = (logits,) + outputs[1:] if output_router_logits: output = (aux_loss,) + output return (loss,) + output if loss is not None else output return MoeCausalLMOutputWithPast( loss=loss, aux_loss=aux_loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, router_logits=outputs.router_logits, ) def prepare_inputs_for_generation( self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs, ): # Omit tokens covered by past_key_values if past_key_values is not None: if isinstance(past_key_values, MoECache): # 🔍 for MoECache only cache_length = past_key_values.get_seq_length() past_length = past_key_values._seen_tokens_total # 🔍 max_cache_length = past_key_values.get_max_length() elif isinstance(past_key_values, Cache): cache_length = past_key_values.get_seq_length() past_length = past_key_values.seen_tokens max_cache_length = past_key_values.get_max_length() else: cache_length = past_length = past_key_values[0][0].shape[2] max_cache_length = None # Keep only the unprocessed tokens: # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as # input) if ( attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1] ): input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard # input_ids based on the past_length. elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. if ( max_cache_length is not None and attention_mask is not None and cache_length + input_ids.shape[1] > max_cache_length ): attention_mask = attention_mask[:, -max_cache_length:] position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: position_ids = position_ids[:, -input_ids.shape[1] :] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: model_inputs = {"inputs_embeds": inputs_embeds} else: model_inputs = {"input_ids": input_ids} model_inputs.update( { "position_ids": position_ids, "past_key_values": past_key_values, "use_cache": kwargs.get("use_cache"), "attention_mask": attention_mask, } ) return model_inputs @staticmethod def _reorder_cache(past_key_values, beam_idx): # TODO: support for beam search print("MixtralForCausalLM, _reorder_cache", beam_idx) raise NotImplementedError # reordered_past = () # for layer_past in past_key_values: # reordered_past += ( # tuple( # past_state.index_select(0, beam_idx.to(past_state.device)) # for past_state in layer_past # ), # ) # return reordered_past @torch.no_grad() def generate( self, inputs: Optional[torch.Tensor] = None, generation_config: Optional[GenerationConfig] = None, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[ Callable[[int, torch.Tensor], List[int]] ] = None, synced_gpus: Optional[bool] = None, assistant_model: Optional["PreTrainedModel"] = None, streamer: Optional["BaseStreamer"] = None, negative_prompt_ids: Optional[torch.Tensor] = None, negative_prompt_attention_mask: Optional[torch.Tensor] = None, **kwargs, ) -> Union[GenerateOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head. Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the model's default generation configuration. You can override any `generation_config` by passing the corresponding parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation strategies and code examples, check out the [following guide](../generation_strategies). Parameters: inputs (`torch.Tensor` of varying shape depending on the modality, *optional*): The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs` should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`. generation_config ([`~generation.GenerationConfig`], *optional*): The generation configuration to be used as base parametrization for the generation call. `**kwargs` passed to generate matching the attributes of `generation_config` will override them. If `generation_config` is not provided, the default will be used, which has the following loading priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s default values, whose documentation should be checked to parameterize generation. logits_processor (`LogitsProcessorList`, *optional*): Custom logits processors that complement the default logits processors built from arguments and generation config. If a logit processor is passed that is already created with the arguments or a generation config an error is thrown. This feature is intended for advanced users. stopping_criteria (`StoppingCriteriaList`, *optional*): Custom stopping criteria that complements the default stopping criteria built from arguments and a generation config. If a stopping criteria is passed that is already created with the arguments or a generation config an error is thrown. If your stopping criteria depends on the `scores` input, make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is intended for advanced users. prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): If provided, this function constraints the beam search to allowed tokens only at each step. If not provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful for constrained generation conditioned on the prefix, as described in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904). synced_gpus (`bool`, *optional*): Whether to continue running the while loop until max_length. Unless overridden this flag will be set to `True` under DeepSpeed ZeRO Stage 3 multiple GPUs environment to avoid hanging if one GPU finished generating before other GPUs. Otherwise it'll be set to `False`. assistant_model (`PreTrainedModel`, *optional*): An assistant model that can be used to accelerate generation. The assistant model must have the exact same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model is much faster than running generation with the model you're calling generate from. As such, the assistant model should be much smaller. streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): The negative prompt needed for some processors such as CFG. The batch size must match the input batch size. This is an experimental feature, subject to breaking API changes in future versions. negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Attention_mask for `negative_prompt_ids`. kwargs (`Dict[str, Any]`, *optional*): Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*. Return: [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`. If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible [`~utils.ModelOutput`] types are: - [`~generation.GenerateDecoderOnlyOutput`], - [`~generation.GenerateBeamDecoderOnlyOutput`] If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible [`~utils.ModelOutput`] types are: - [`~generation.GenerateEncoderDecoderOutput`], - [`~generation.GenerateBeamEncoderDecoderOutput`] """ # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call self._validate_model_class() tokenizer = kwargs.pop( "tokenizer", None ) # Pull this out first, we only use it for stopping criteria generation_config, model_kwargs = self._prepare_generation_config( generation_config, **kwargs ) self._validate_model_kwargs(model_kwargs.copy()) self._validate_assistant(assistant_model) # 2. Set generation parameters if not already defined if synced_gpus is None: if is_deepspeed_zero3_enabled() and dist.get_world_size() > 1: synced_gpus = True else: synced_gpus = False logits_processor = ( logits_processor if logits_processor is not None else LogitsProcessorList() ) stopping_criteria = ( stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() ) accepts_attention_mask = "attention_mask" in set( inspect.signature(self.forward).parameters.keys() ) requires_attention_mask = "encoder_outputs" not in model_kwargs kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None # 3. Define model inputs inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( inputs, generation_config.bos_token_id, model_kwargs ) batch_size = inputs_tensor.shape[0] device = inputs_tensor.device self._prepare_special_tokens( generation_config, kwargs_has_attention_mask, device=device ) # decoder-only models must use left-padding for batched generation. if not self.config.is_encoder_decoder and not is_torchdynamo_compiling(): # If `input_ids` was given, check if the last id in any sequence is `pad_token_id` # Note: If using, `inputs_embeds` this check does not work, because we want to be more hands-off. if ( generation_config.pad_token_id is not None and batch_size > 1 and len(inputs_tensor.shape) == 2 and torch.sum(inputs_tensor[:, -1] == generation_config.pad_token_id) > 0 ): logger.warning( "A decoder-only architecture is being used, but right-padding was detected! For correct " "generation results, please set `padding_side='left'` when initializing the tokenizer." ) # 4. Define other model kwargs # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are # generating the first new token or not, and we only want to use the embeddings for the first new token) if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds": model_kwargs["use_cache"] = True else: model_kwargs["use_cache"] = generation_config.use_cache if ( not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask ): model_kwargs[ "attention_mask" ] = self._prepare_attention_mask_for_generation( inputs_tensor, generation_config.pad_token_id, generation_config.eos_token_id, ) if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: # if model is encoder decoder encoder_outputs are created and added to `model_kwargs` model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( inputs_tensor, model_kwargs, model_input_name, generation_config ) # 5. Prepare `input_ids` which will be used for auto-regressive generation if self.config.is_encoder_decoder: input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation( batch_size=batch_size, model_input_name=model_input_name, model_kwargs=model_kwargs, decoder_start_token_id=generation_config.decoder_start_token_id, device=inputs_tensor.device, ) else: input_ids = ( inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids") ) if generation_config.token_healing: input_ids = self.heal_tokens(input_ids, tokenizer) if streamer is not None: streamer.put(input_ids.cpu()) # 6. Prepare `max_length` depending on other stopping criteria. input_ids_length = input_ids.shape[-1] has_default_max_length = ( kwargs.get("max_length") is None and generation_config.max_length is not None ) has_default_min_length = ( kwargs.get("min_length") is None and generation_config.min_length is not None ) generation_config = self._prepare_generated_length( generation_config=generation_config, has_default_max_length=has_default_max_length, has_default_min_length=has_default_min_length, model_input_name=model_input_name, inputs_tensor=inputs_tensor, input_ids_length=input_ids_length, ) use_dynamic_cache_by_default = False if ( generation_config.cache_implementation is not None and model_kwargs.get("past_key_values") is not None ): raise ValueError( "Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a " "Cache object) is unsupported. Please use only one of the two." ) elif generation_config.cache_implementation is not None: if self.config.use_attn_moe: # 🔍 raise ValueError( "Attention MoE doesn't support specifying the cache type! You can only use `MoECache`" ) if ( generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING ): if ( generation_config.cache_implementation == "static" and not self._supports_static_cache ): raise ValueError( "This model does not support `cache_implementation='static'`. Please check the following " "issue: https://github.com/huggingface/transformers/issues/28981" ) model_kwargs["past_key_values"] = self._get_cache( generation_config.cache_implementation, getattr(generation_config, "num_beams", 1) * batch_size, generation_config.max_length, ) elif generation_config.cache_implementation == "quantized": if not self._supports_quantized_cache: raise ValueError( "This model does not support the quantized cache. If you want your model to support quantized " "cache, please open an issue." ) cache_config = ( generation_config.cache_config if generation_config.cache_config is not None else QuantizedCacheConfig() ) cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] if cache_config.backend == "quanto" and not is_quanto_available(): raise ImportError( "You need to install `quanto` in order to use KV cache quantization with quanto backend. " "Please install it via with `pip install quanto`" ) elif cache_config.backend == "HQQ" and not is_hqq_available(): raise ImportError( "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. " "Please install it via with `pip install hqq`" ) model_kwargs["past_key_values"] = cache_class(cache_config) # Use DynamicCache() instance by default. This will avoid back and forth from legacy format that # keeps copying the cache thus using much more memory elif ( generation_config.cache_implementation is None and self._supports_default_dynamic_cache() ): past = model_kwargs.get("past_key_values", None) if past is None: if self.config.use_attn_moe: # 🔍 model_kwargs["past_key_values"] = MoECache( # self.config.num_key_value_heads self.config.attn_experts ) else: # 🔍 model_kwargs["past_key_values"] = DynamicCache() use_dynamic_cache_by_default = True elif isinstance(past, tuple): if self.config.use_attn_moe: # 🔍 model_kwargs["past_key_values"] = MoECache.from_legacy_cache(past) else: # 🔍 model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache( past ) use_dynamic_cache_by_default = True self._validate_generated_length( generation_config, input_ids_length, has_default_max_length ) # 7. determine generation mode generation_mode = generation_config.get_generation_mode(assistant_model) if streamer is not None and (generation_config.num_beams > 1): raise ValueError( "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." ) if self.device.type != input_ids.device.type: warnings.warn( "You are calling .generate() with the `input_ids` being on a device type different" f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" f" is on {self.device.type}. You may experience unexpected behaviors or slower generation." " Please make sure that you have put `input_ids` to the" f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before" " running `.generate()`.", UserWarning, ) # 8. prepare distribution pre_processing samplers prepared_logits_processor = self._get_logits_processor( generation_config=generation_config, input_ids_seq_length=input_ids_length, encoder_input_ids=inputs_tensor, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, logits_processor=logits_processor, device=inputs_tensor.device, model_kwargs=model_kwargs, negative_prompt_ids=negative_prompt_ids, negative_prompt_attention_mask=negative_prompt_attention_mask, ) # 9. prepare stopping criteria prepared_stopping_criteria = self._get_stopping_criteria( generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs, ) # 10. go into different generation modes if generation_mode == GenerationMode.ASSISTED_GENERATION: if generation_config.num_return_sequences > 1: raise ValueError( "num_return_sequences has to be 1 when doing assisted generate, " f"but is {generation_config.num_return_sequences}." ) if batch_size > 1: raise ValueError( "assisted generate is only supported for batch_size = 1" ) if not model_kwargs["use_cache"]: raise ValueError("assisted generate requires `use_cache=True`") if generation_config.cache_implementation == "static": raise ValueError( "assisted generate is not supported with `static_cache`" ) if self._is_stateful: # In assisted generation we need the ability to confirm whether the model would pick certain tokens, # which is not possible with stateful models (they can't reset to a previous subset of generated text) raise ValueError( f"assisted generation is not supported with stateful models, such as {self.__class__.__name__}" ) # 11. Get the candidate generator, given the parameterization candidate_generator = self._get_candidate_generator( generation_config=generation_config, input_ids=input_ids, inputs_tensor=inputs_tensor, assistant_model=assistant_model, logits_processor=logits_processor, model_kwargs=model_kwargs, ) # 12. prepare logits warper (if `do_sample` is `True`) prepared_logits_warper = ( self._get_logits_warper( generation_config, device=input_ids.device, ) if generation_config.do_sample else None ) # 13. run assisted generate result = self._assisted_decoding( input_ids, candidate_generator=candidate_generator, logits_processor=prepared_logits_processor, logits_warper=prepared_logits_warper, stopping_criteria=prepared_stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, streamer=streamer, **model_kwargs, ) elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH: if not model_kwargs["use_cache"]: raise ValueError("Contrastive search requires `use_cache=True`") if self._is_stateful: # Just like assisted generation, we need to be able to rollback to a previous state (see comment above) raise ValueError( f"contrastive search is not supported with stateful models, such as {self.__class__.__name__}" ) result = self._contrastive_search( input_ids, logits_processor=prepared_logits_processor, stopping_criteria=prepared_stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, streamer=streamer, **model_kwargs, ) elif generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH): # 11. prepare logits warper prepared_logits_warper = ( self._get_logits_warper(generation_config, device=input_ids.device) if generation_config.do_sample else None ) # 12. expand input_ids with `num_return_sequences` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( input_ids=input_ids, expand_size=generation_config.num_return_sequences, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs, ) # 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`) result = self._sample( input_ids, logits_processor=prepared_logits_processor, logits_warper=prepared_logits_warper, stopping_criteria=prepared_stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, streamer=streamer, **model_kwargs, ) elif generation_mode in ( GenerationMode.BEAM_SAMPLE, GenerationMode.BEAM_SEARCH, ): # 11. prepare logits warper prepared_logits_warper = ( self._get_logits_warper(generation_config, device=input_ids.device) if generation_config.do_sample else None ) # 12. prepare beam search scorer beam_scorer = BeamSearchScorer( batch_size=batch_size, num_beams=generation_config.num_beams, device=inputs_tensor.device, length_penalty=generation_config.length_penalty, do_early_stopping=generation_config.early_stopping, num_beam_hyps_to_keep=generation_config.num_return_sequences, max_length=generation_config.max_length, ) # 13. interleave input_ids with `num_beams` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( input_ids=input_ids, expand_size=generation_config.num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs, ) # 14. run beam sample result = self._beam_search( input_ids, beam_scorer, logits_processor=prepared_logits_processor, logits_warper=prepared_logits_warper, stopping_criteria=prepared_stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, **model_kwargs, ) elif generation_mode == GenerationMode.GROUP_BEAM_SEARCH: # 11. prepare beam search scorer beam_scorer = BeamSearchScorer( batch_size=batch_size, num_beams=generation_config.num_beams, device=inputs_tensor.device, length_penalty=generation_config.length_penalty, do_early_stopping=generation_config.early_stopping, num_beam_hyps_to_keep=generation_config.num_return_sequences, num_beam_groups=generation_config.num_beam_groups, max_length=generation_config.max_length, ) # 12. interleave input_ids with `num_beams` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( input_ids=input_ids, expand_size=generation_config.num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs, ) # 13. run beam search result = self._group_beam_search( input_ids, beam_scorer, logits_processor=prepared_logits_processor, stopping_criteria=prepared_stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, **model_kwargs, ) elif generation_mode == GenerationMode.CONSTRAINED_BEAM_SEARCH: final_constraints = [] if generation_config.constraints is not None: final_constraints = generation_config.constraints if generation_config.force_words_ids is not None: def typeerror(): raise ValueError( "`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` " f"of positive integers, but is {generation_config.force_words_ids}." ) if ( not isinstance(generation_config.force_words_ids, list) or len(generation_config.force_words_ids) == 0 ): typeerror() for word_ids in generation_config.force_words_ids: if isinstance(word_ids[0], list): if not isinstance(word_ids, list) or len(word_ids) == 0: typeerror() if any( not isinstance(token_ids, list) for token_ids in word_ids ): typeerror() if any( any( (not isinstance(token_id, int) or token_id < 0) for token_id in token_ids ) for token_ids in word_ids ): typeerror() constraint = DisjunctiveConstraint(word_ids) else: if not isinstance(word_ids, list) or len(word_ids) == 0: typeerror() if any( (not isinstance(token_id, int) or token_id < 0) for token_id in word_ids ): typeerror() constraint = PhrasalConstraint(word_ids) final_constraints.append(constraint) # 11. prepare beam search scorer constrained_beam_scorer = ConstrainedBeamSearchScorer( constraints=final_constraints, batch_size=batch_size, num_beams=generation_config.num_beams, device=inputs_tensor.device, length_penalty=generation_config.length_penalty, do_early_stopping=generation_config.early_stopping, num_beam_hyps_to_keep=generation_config.num_return_sequences, max_length=generation_config.max_length, ) # 12. interleave input_ids with `num_beams` additional sequences per batch input_ids, model_kwargs = self._expand_inputs_for_generation( input_ids=input_ids, expand_size=generation_config.num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs, ) # 13. run beam search result = self._constrained_beam_search( input_ids, constrained_beam_scorer=constrained_beam_scorer, logits_processor=prepared_logits_processor, stopping_criteria=prepared_stopping_criteria, generation_config=generation_config, synced_gpus=synced_gpus, **model_kwargs, ) # Convert to legacy cache if needed if use_dynamic_cache_by_default and generation_config.return_legacy_cache: if isinstance(result, ModelOutput) and hasattr(result, "past_key_values"): if isinstance(result.past_key_values, DynamicCache): result.past_key_values = result.past_key_values.to_legacy_cache() return result # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mixtral, LLAMA->MIXTRAL class MixtralForSequenceClassification(MixtralPreTrainedModel): def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels self.model = MixtralModel(config) self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self): return self.model.embed_tokens def set_input_embeddings(self, value): self.model.embed_tokens = value def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, SequenceClassifierOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = ( return_dict if return_dict is not None else self.config.use_return_dict ) transformer_outputs = self.model( input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] logits = self.score(hidden_states) if input_ids is not None: batch_size = input_ids.shape[0] else: batch_size = inputs_embeds.shape[0] if self.config.pad_token_id is None and batch_size != 1: raise ValueError( "Cannot handle batch sizes > 1 if no padding token is defined." ) if self.config.pad_token_id is None: sequence_lengths = -1 else: if input_ids is not None: sequence_lengths = ( torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 ).to(logits.device) else: sequence_lengths = -1 pooled_logits = logits[ torch.arange(batch_size, device=logits.device), sequence_lengths ] loss = None if labels is not None: labels = labels.to(logits.device) if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and ( labels.dtype == torch.long or labels.dtype == torch.int ): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) else: loss = loss_fct(pooled_logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct( pooled_logits.view(-1, self.num_labels), labels.view(-1) ) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, logits=pooled_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )