""" PyTorch Mixtral model."""
import importlib
import inspect
import math
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
import scattermoe
import stk
import torch
import torch.distributed as dist
import torch.nn.functional as F
import torch.utils.checkpoint
from megablocks import grouped_gemm_util as gg
from megablocks.layers.activation_fn import act_fn
from megablocks.layers.arguments import Arguments as MegablocksArguments
from megablocks.layers.dmlp_registry import _REGISTRY
from megablocks.layers.dmoe import ParallelDroplessMLP
from megablocks.layers.glu import memory_optimized_grouped_glu
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import (
BeamSearchScorer,
ConstrainedBeamSearchScorer,
DisjunctiveConstraint,
LogitsProcessorList,
PhrasalConstraint,
QuantizedCacheConfig,
StoppingCriteriaList,
)
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache
from transformers.generation.configuration_utils import GenerationConfig, GenerationMode
from transformers.generation.utils import (
NEED_SETUP_CACHE_CLASSES_MAPPING,
QUANT_BACKEND_CLASSES_MAPPING,
GenerateOutput,
)
from transformers.integrations import is_deepspeed_zero3_enabled
from transformers.modeling_outputs import ModelOutput, SequenceClassifierOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import is_torch_available, logging
from transformers.utils.import_utils import (
is_hqq_available,
is_quanto_available,
is_torch_fx_available,
is_torchdynamo_compiling,
)
from .configuration_mixtral import MixtralConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "MixtralConfig"
parsed_torch_version_base = version.parse(version.parse(torch.__version__).base_version)
is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse("1.13")
def _is_package_available(
pkg_name: str, return_version: bool = False
) -> Union[Tuple[bool, str], bool]:
# Check we're not importing a "pkg_name" directory somewhere but the actual library by trying to grab the version
package_exists = importlib.util.find_spec(pkg_name) is not None
package_version = "N/A"
if package_exists:
try:
package_version = importlib.metadata.version(pkg_name)
package_exists = True
except importlib.metadata.PackageNotFoundError:
package_exists = False
logger.debug(f"Detected {pkg_name} version {package_version}")
if return_version:
return package_exists, package_version
else:
return package_exists
def is_flash_attn_2_available():
if not is_torch_available():
return False
if not _is_package_available("flash_attn"):
return False
# Let's add an extra check to see if cuda is available
import torch
if not torch.cuda.is_available():
return False
if torch.version.cuda:
return version.parse(importlib.metadata.version("flash_attn")) >= version.parse(
"2.1.0"
)
elif torch.version.hip:
# TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention
return version.parse(importlib.metadata.version("flash_attn")) >= version.parse(
"2.0.4"
)
else:
return False
def is_flash_attn_greater_or_equal_2_10():
if not _is_package_available("flash_attn"):
return False
return version.parse(importlib.metadata.version("flash_attn")) >= version.parse(
"2.1.0"
)
def is_flash_attn_available():
logger.warning(
"Using `is_flash_attn_available` is deprecated and will be removed in v4.38. "
"Please use `is_flash_attn_2_available` instead."
)
return is_flash_attn_2_available()
@dataclass
class AttentionMaskConverter:
"""
A utility attention mask class that allows one to:
- Create a causal 4d mask
        - Create a causal 4d mask with sliding window
- Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
key_value_length) that can be multiplied with attention scores
Examples:
```python
>>> import torch
>>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter
>>> converter = AttentionMaskConverter(True)
>>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
[-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, -3.4028e+38],
[-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, 0.0000e+00]]]])
```
Parameters:
is_causal (`bool`):
Whether the attention mask should be a uni-directional (causal) or bi-directional mask.
sliding_window (`int`, *optional*):
Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer.
"""
is_causal: bool
sliding_window: int
def __init__(self, is_causal: bool, sliding_window: Optional[int] = None):
self.is_causal = is_causal
self.sliding_window = sliding_window
if self.sliding_window is not None and self.sliding_window <= 0:
raise ValueError(
f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`"
)
def to_causal_4d(
self,
batch_size: int,
query_length: int,
key_value_length: int,
dtype: torch.dtype,
device: Union[torch.device, "str"] = "cpu",
) -> Optional[torch.Tensor]:
"""
Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
bias to upper right hand triangular matrix (causal mask).
"""
if not self.is_causal:
raise ValueError(
f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True."
)
# If shape is not cached, create a new causal mask and cache it
input_shape = (batch_size, query_length)
past_key_values_length = key_value_length - query_length
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
causal_4d_mask = None
if input_shape[-1] > 1 or self.sliding_window is not None:
causal_4d_mask = self._make_causal_mask(
input_shape,
dtype,
device=device,
past_key_values_length=past_key_values_length,
sliding_window=self.sliding_window,
)
return causal_4d_mask
def to_4d(
self,
attention_mask_2d: torch.Tensor,
query_length: int,
dtype: torch.dtype,
key_value_length: Optional[int] = None,
) -> torch.Tensor:
"""
Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
causal, a causal mask will be added.
"""
input_shape = (attention_mask_2d.shape[0], query_length)
# create causal mask
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
causal_4d_mask = None
if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
if key_value_length is None:
raise ValueError(
"This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask."
)
past_key_values_length = key_value_length - query_length
causal_4d_mask = self._make_causal_mask(
input_shape,
dtype,
device=attention_mask_2d.device,
past_key_values_length=past_key_values_length,
sliding_window=self.sliding_window,
)
elif self.sliding_window is not None:
raise NotImplementedError(
"Sliding window is currently only implemented for causal masking"
)
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
expanded_attn_mask = self._expand_mask(
attention_mask_2d, dtype, tgt_len=input_shape[-1]
).to(attention_mask_2d.device)
if causal_4d_mask is not None:
expanded_attn_mask = causal_4d_mask.masked_fill(
expanded_attn_mask.bool(), torch.finfo(dtype).min
)
# expanded_attn_mask + causal_4d_mask can cause some overflow
expanded_4d_mask = expanded_attn_mask
return expanded_4d_mask
@staticmethod
def _make_causal_mask(
input_ids_shape: torch.Size,
dtype: torch.dtype,
device: torch.device,
past_key_values_length: int = 0,
sliding_window: Optional[int] = None,
):
"""
        Make causal mask used for causal (uni-directional) self-attention.
"""
bsz, tgt_len = input_ids_shape
mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
mask_cond = torch.arange(mask.size(-1), device=device)
mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
mask = mask.to(dtype)
if past_key_values_length > 0:
mask = torch.cat(
[
torch.zeros(
tgt_len, past_key_values_length, dtype=dtype, device=device
),
mask,
],
dim=-1,
)
# add lower triangular sliding window mask if necessary
if sliding_window is not None:
diagonal = past_key_values_length - sliding_window + 1
context_mask = 1 - torch.triu(
torch.ones_like(mask, dtype=torch.int), diagonal=diagonal
)
mask.masked_fill_(context_mask.bool(), torch.finfo(dtype).min)
return mask[None, None, :, :].expand(
bsz, 1, tgt_len, tgt_len + past_key_values_length
)
@staticmethod
def _expand_mask(
mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
):
"""
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
"""
bsz, src_len = mask.size()
tgt_len = tgt_len if tgt_len is not None else src_len
expanded_mask = (
mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
)
inverted_mask = 1.0 - expanded_mask
return inverted_mask.masked_fill(
inverted_mask.to(torch.bool), torch.finfo(dtype).min
)
@staticmethod
def _unmask_unattended(
expanded_mask: torch.Tensor,
attention_mask: torch.Tensor,
unmasked_value: Union[bool, float],
):
# fmt: off
"""
Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
Details: https://github.com/pytorch/pytorch/issues/110213
`expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
`attention_mask` is [bsz, src_seq_len].
The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.
For example, if `attention_mask` is
```
[[0, 0, 1],
[1, 1, 1],
[0, 1, 1]]
```
and `expanded_mask` is (e.g. here left-padding case)
```
[[[[0, 0, 0],
[0, 0, 0],
[0, 0, 1]]],
[[[1, 0, 0],
[1, 1, 0],
[1, 1, 1]]],
[[[0, 0, 0],
[0, 1, 0],
[0, 1, 1]]]]
```
then the modified `expanded_mask` will be
```
[[[[1, 1, 1], <-- modified
[1, 1, 1], <-- modified
[0, 0, 1]]],
[[[1, 0, 0],
[1, 1, 0],
[1, 1, 1]]],
[[[1, 1, 1], <-- modified
[0, 1, 0],
[0, 1, 1]]]]
```
"""
# fmt: on
# Get the index of the first non-zero value for every sample in the batch.
        # In the above example, indices = [[2], [0], [1]]
tmp = torch.arange(attention_mask.shape[1], 0, -1)
indices = torch.argmax(attention_mask.cpu() * tmp, 1, keepdim=True)
# Find the batch indexes that have unattended tokens on the leftmost side (e.g. [0, 0, 1, 1, 1]), for which the first rows of the
# expanded mask will be completely unattended.
left_masked_rows = torch.where(indices > 0)[0]
if left_masked_rows.shape[0] == 0:
return expanded_mask
indices = indices[left_masked_rows]
max_len = torch.max(indices)
range_tensor = torch.arange(max_len).unsqueeze(0)
range_tensor = range_tensor.repeat(indices.size(0), 1)
# Avoid unmasking tokens at relevant target positions (on the row axis), by rather unmasking possibly several times the first row that should always be unmasked as we filtered out the batch above.
range_tensor[range_tensor >= indices] = 0
# TODO: we may drop support for 3D attention mask as the refactor from Patrick maybe dropped this case
if expanded_mask.dim() == 4:
num_masks = expanded_mask.shape[1]
if num_masks == 1:
# Broadcast [left_masked_rows, 1], [left_masked_rows, max_len]
mask_slice = (left_masked_rows[:, None], 0, range_tensor)
else:
# Broadcast [left_masked_rows, 1, 1], [1, num_masks, 1], [left_masked_rows, 1, max_len]
mask_slice = (
left_masked_rows[:, None, None],
torch.arange(num_masks)[None, :, None],
range_tensor[:, None, :],
)
else:
# Broadcast [left_masked_rows, 1], [left_masked_rows, max_len]
mask_slice = (left_masked_rows[:, None], range_tensor)
expanded_mask[mask_slice] = unmasked_value
return expanded_mask
def _prepare_4d_causal_attention_mask(
attention_mask: Optional[torch.Tensor],
input_shape: Union[torch.Size, Tuple, List],
inputs_embeds: torch.Tensor,
past_key_values_length: int,
sliding_window: Optional[int] = None,
):
"""
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`
Args:
attention_mask (`torch.Tensor` or `None`):
A 2D attention mask of shape `(batch_size, key_value_length)`
input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
The input shape should be a tuple that defines `(batch_size, query_length)`.
inputs_embeds (`torch.Tensor`):
The embedded inputs as a torch Tensor.
past_key_values_length (`int`):
The length of the key value cache.
sliding_window (`int`, *optional*):
If the model uses windowed attention, a sliding window should be passed.
"""
attn_mask_converter = AttentionMaskConverter(
is_causal=True, sliding_window=sliding_window
)
key_value_length = input_shape[-1] + past_key_values_length
# 4d mask is passed through the layers
if attention_mask is not None:
attention_mask = attn_mask_converter.to_4d(
attention_mask,
input_shape[-1],
key_value_length=key_value_length,
dtype=inputs_embeds.dtype,
)
else:
attention_mask = attn_mask_converter.to_causal_4d(
input_shape[0],
input_shape[-1],
key_value_length,
dtype=inputs_embeds.dtype,
device=inputs_embeds.device,
)
return attention_mask
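# Illustrative sketch (not part of the original model code): shows how a 2D padding mask
# is expanded into the additive 4D causal mask consumed by the attention layers above.
# The shapes and values below are assumptions chosen for readability; the helper is never
# called at import time.
def _demo_prepare_4d_causal_attention_mask():
    bsz, q_len, hidden = 1, 4, 8
    inputs_embeds = torch.zeros(bsz, q_len, hidden, dtype=torch.float32)
    padding_mask_2d = torch.tensor([[0, 1, 1, 1]])  # first position is padding
    mask_4d = _prepare_4d_causal_attention_mask(
        padding_mask_2d, (bsz, q_len), inputs_embeds, past_key_values_length=0
    )
    # mask_4d has shape (bsz, 1, q_len, q_len); padded and future positions hold a large
    # negative value while attended positions hold 0, so it can be added to attention scores.
    assert mask_4d.shape == (bsz, 1, q_len, q_len)
    return mask_4d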
@dataclass
class MoeCausalLMOutputWithPast(ModelOutput):
"""
Base class for causal language model (or autoregressive) with mixture of experts outputs.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
aux_loss for the sparse modules.
router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
            Raw router logits (post-softmax) that are computed by MoE routers. These terms are used to compute the auxiliary
            loss for Mixture of Experts models.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""
loss: Optional[torch.FloatTensor] = None
aux_loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
router_logits: Optional[Tuple[torch.FloatTensor]] = None
@property
def balance_loss(self):
return self.aux_loss
@property
def num_dropped_tokens(self):
return [torch.tensor(-1)] * 32
@property
def gate_load(self):
return [torch.tensor(-1)] * 32
@property
def gate_importance(self):
return [torch.tensor(-1)] * 32
@dataclass
class MoeModelOutputWithPast(ModelOutput):
"""
Base class for model's outputs, with potential hidden states and attentions.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
input) to speed up sequential decoding.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
            Raw router logits (post-softmax) that are computed by MoE routers. These terms are used to compute the auxiliary
            loss for Mixture of Experts models.
"""
last_hidden_state: torch.FloatTensor = None
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
router_logits: Optional[Tuple[torch.FloatTensor]] = None
attn_router_logits: Optional[Tuple[torch.FloatTensor]] = None # 🔍
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
_flash_supports_window_size = "window_size" in list(
inspect.signature(flash_attn_func).parameters
)
# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
# It means that the function will not be traced through and simply appear as a node in the graph.
if is_torch_fx_available():
if not is_torch_greater_or_equal_than_1_13:
import torch.fx
_prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
def load_balancing_loss_func(
gate_logits: Union[torch.Tensor, Tuple],
num_experts: torch.Tensor = None,
top_k=2,
use_layer_wise_balance=False,
) -> torch.FloatTensor:
r"""
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.
Args:
        gate_logits (Union[`torch.Tensor`, Tuple[`torch.Tensor`]]):
            Logits from the `gate`, should be a tuple of tensors. Shape: [batch_size, sequence_length, num_experts].
num_experts (`int`, *optional*):
Number of experts
Returns:
The auxiliary loss.
"""
if gate_logits is None or (isinstance(gate_logits, Iterable) and len(gate_logits) == 0):
return 0
# ✨ Here is the fix for balance loss in Mixtral.
# We should calculate the balance loss in a layer-wise manner otherwise it may lead to degenerated solutions.
if use_layer_wise_balance:
if not isinstance(gate_logits, Iterable):
gate_logits = (gate_logits,)
else:
if isinstance(gate_logits, Iterable):
gate_logits = (torch.cat(gate_logits, dim=0),)
else:
gate_logits = (gate_logits,)
all_balance_losses = []
for logits in gate_logits:
routing_weights, selected_experts = torch.topk(logits, top_k, dim=-1)
routing_weights = routing_weights.softmax(dim=-1)
# cast the expert indices to int64, otherwise one-hot encoding will fail
if selected_experts.dtype != torch.int64:
selected_experts = selected_experts.to(torch.int64)
if len(selected_experts.shape) == 2:
selected_experts = selected_experts.unsqueeze(2)
expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
# For a given token, determine if it was routed to a given expert.
expert_mask = torch.max(expert_mask, axis=-2).values
# cast to float32 otherwise mean will fail
expert_mask = expert_mask.to(torch.float32)
tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2)
router_prob_per_group_and_expert = torch.mean(routing_weights, axis=-1)
# ✨ balance loss for this layer
balance_loss = torch.mean(
tokens_per_group_and_expert * router_prob_per_group_and_expert.unsqueeze(-1)
) * (num_experts**2)
all_balance_losses.append(balance_loss.reshape(1))
all_balance_losses = torch.cat(all_balance_losses).mean() # ✨
return all_balance_losses
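# Illustrative sketch (not part of the original model code): computes the auxiliary balance
# loss on random router logits for two layers using the layer-wise variant described above.
# All sizes are made-up assumptions for the example.
def _demo_load_balancing_loss():
    num_experts, top_k = 8, 2
    # one logits tensor per layer, each of shape (batch_size * sequence_length, num_experts)
    gate_logits = tuple(torch.randn(16, num_experts) for _ in range(2))
    aux_loss = load_balancing_loss_func(
        gate_logits, num_experts=num_experts, top_k=top_k, use_layer_wise_balance=True
    )
    # `aux_loss` is a scalar tensor; it is what `MoeCausalLMOutputWithPast.aux_loss` carries.
    return aux_loss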
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(
        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
)
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
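# Illustrative sketch (not part of the original model code): shows the unpadding metadata
# that the flash-attention path consumes, for a small right-padded batch.
def _demo_get_unpad_data():
    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]], dtype=torch.int32)
    indices, cu_seqlens, max_seqlen_in_batch = _get_unpad_data(attention_mask)
    # indices             -> flat positions of real tokens: tensor([0, 1, 2, 4, 5])
    # cu_seqlens          -> cumulative sequence lengths:   tensor([0, 3, 5], dtype=torch.int32)
    # max_seqlen_in_batch -> 3
    return indices, cu_seqlens, max_seqlen_in_batch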
# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mixtral
class MixtralRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
MixtralRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self, hidden_states):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
return self.weight * hidden_states.to(input_dtype)
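# Illustrative sketch (not part of the original model code): checks that MixtralRMSNorm
# rescales each hidden vector to roughly unit root mean square before applying the learned
# weight (initialized to ones). Sizes are assumptions for the example.
def _demo_rms_norm():
    norm = MixtralRMSNorm(8, eps=1e-6)
    x = torch.randn(2, 3, 8)
    y = norm(x)
    rms = y.pow(2).mean(-1).sqrt()  # close to 1.0 everywhere with the default weight
    return y, rms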
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mixtral
class MixtralRotaryEmbedding(nn.Module):
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
super().__init__()
self.dim = dim
self.max_position_embeddings = max_position_embeddings
self.base = base
inv_freq = 1.0 / (
self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
)
self.register_buffer("inv_freq", inv_freq, persistent=False)
# Build here to make `torch.jit.trace` work.
self._set_cos_sin_cache(
seq_len=max_position_embeddings,
device=self.inv_freq.device,
dtype=torch.get_default_dtype(),
)
def _set_cos_sin_cache(self, seq_len, device, dtype):
self.max_seq_len_cached = seq_len
t = torch.arange(
self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
)
freqs = torch.outer(t, self.inv_freq)
# Different from paper, but it uses a different permutation in order to obtain the same calculation
emb = torch.cat((freqs, freqs), dim=-1)
self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
def forward(self, x, seq_len=None):
# x: [bs, num_attention_heads, seq_len, head_size]
if seq_len > self.max_seq_len_cached:
self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
return (
self.cos_cached[:seq_len].to(dtype=x.dtype),
self.sin_cached[:seq_len].to(dtype=x.dtype),
)
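# Illustrative sketch (not part of the original model code): shows the shapes of the cos/sin
# caches returned by MixtralRotaryEmbedding for a given sequence length. Sizes are assumptions.
def _demo_rotary_embedding():
    head_dim, seq_len = 16, 10
    rope = MixtralRotaryEmbedding(head_dim, max_position_embeddings=32)
    dummy = torch.zeros(1, 1, seq_len, head_dim)
    cos, sin = rope(dummy, seq_len=seq_len)
    # cos and sin both have shape (seq_len, head_dim); they are indexed by `position_ids`
    # inside `apply_rotary_pos_emb` below.
    assert cos.shape == (seq_len, head_dim) and sin.shape == (seq_len, head_dim)
    return cos, sin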
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
"""Rotates half the hidden dims of the input."""
x1 = x[..., : x.shape[-1] // 2]
x2 = x[..., x.shape[-1] // 2 :]
return torch.cat((-x2, x1), dim=-1)
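# Illustrative sketch (not part of the original model code): rotate_half on a tiny vector,
# showing that the second half is negated and moved in front of the first half.
def _demo_rotate_half():
    x = torch.tensor([1.0, 2.0, 3.0, 4.0])
    # rotate_half(x) -> tensor([-3., -4., 1., 2.])
    return rotate_half(x)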
# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`):
The position indices of the tokens corresponding to the query and key tensors. For example, this can be
used to pass offsetted position ids when working with a KV-cache.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
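# Illustrative sketch (not part of the original model code): applies the rotary embedding to
# random query/key tensors with offset position ids, as would happen when decoding with a
# KV cache. All sizes are assumptions for the example.
def _demo_apply_rotary_pos_emb():
    bsz, num_heads, q_len, head_dim, past_len = 1, 2, 3, 16, 5
    rope = MixtralRotaryEmbedding(head_dim, max_position_embeddings=64)
    q = torch.randn(bsz, num_heads, q_len, head_dim)
    k = torch.randn(bsz, num_heads, q_len, head_dim)
    cos, sin = rope(k, seq_len=past_len + q_len)
    position_ids = torch.arange(past_len, past_len + q_len).unsqueeze(0)  # offset by cache length
    q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
    assert q_embed.shape == q.shape and k_embed.shape == k.shape
    return q_embed, k_embed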
# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
"""
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
"""
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
if n_rep == 1:
return hidden_states
hidden_states = hidden_states[:, :, None, :, :].expand(
batch, num_key_value_heads, n_rep, slen, head_dim
)
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
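# Illustrative sketch (not part of the original model code): repeat_kv expands grouped-query
# key/value heads so that they line up with the attention heads. Sizes are assumptions.
def _demo_repeat_kv():
    batch, num_key_value_heads, seqlen, head_dim, n_rep = 2, 4, 5, 16, 2
    kv = torch.randn(batch, num_key_value_heads, seqlen, head_dim)
    out = repeat_kv(kv, n_rep)
    assert out.shape == (batch, num_key_value_heads * n_rep, seqlen, head_dim)
    return out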
# Copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral
class MixtralAttention(nn.Module):
"""
Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
and "Generating Long Sequences with Sparse Transformers".
"""
def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
"to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.is_causal = True
self.attention_dropout = config.attention_dropout
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(
self.hidden_size, self.num_heads * self.head_dim, bias=False
)
self.k_proj = nn.Linear(
self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False
)
self.v_proj = nn.Linear(
self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False
)
self.o_proj = nn.Linear(
self.num_heads * self.head_dim, self.hidden_size, bias=False
)
self.rotary_emb = MixtralRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return (
tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
.transpose(1, 2)
.contiguous()
)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(
bsz, q_len, self.num_heads, self.head_dim
).transpose(1, 2)
key_states = key_states.view(
bsz, q_len, self.num_key_value_heads, self.head_dim
).transpose(1, 2)
value_states = value_states.view(
bsz, q_len, self.num_key_value_heads, self.head_dim
).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_rotary_pos_emb(
query_states, key_states, cos, sin, position_ids
)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(
key_states, value_states, self.layer_idx, cache_kwargs
)
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(
query_states, key_states.transpose(2, 3)
) / math.sqrt(self.head_dim)
if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
raise ValueError(
f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights + attention_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(
attn_weights, dim=-1, dtype=torch.float32
).to(query_states.dtype)
attn_weights = nn.functional.dropout(
attn_weights, p=self.attention_dropout, training=self.training
)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
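# Illustrative sketch (not part of the original model code): runs the vanilla MixtralAttention
# layer on random hidden states. It assumes the `MixtralConfig` shipped with this repo accepts
# the standard Mixtral fields used below; the sizes are small, made-up values. No causal mask
# is passed here, so this only demonstrates the input/output shapes.
def _demo_mixtral_attention():
    config = MixtralConfig(hidden_size=64, num_attention_heads=8, num_key_value_heads=4)
    attn = MixtralAttention(config, layer_idx=0)
    bsz, q_len = 1, 6
    hidden_states = torch.randn(bsz, q_len, config.hidden_size)
    position_ids = torch.arange(q_len).unsqueeze(0)
    attn_output, attn_weights, _ = attn(
        hidden_states, attention_mask=None, position_ids=position_ids
    )
    assert attn_output.shape == (bsz, q_len, config.hidden_size)
    return attn_output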
# fmt: off
# 🔍 Modified from DynamicCache
class MoECache(Cache):
"""
Modified from the `DynamicCache`!!!
A cache that grows dynamically as more tokens are generated.
This cache adds extra support for Attention MoE.
It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
`[batch_size, num_heads, seq_len, head_dim]`.
"""
def __init__(self, num_experts: int) -> None:
# 🔍 multi-experts support
self.num_experts = num_experts
self.key_cache: List[Dict[int, torch.Tensor]] = [{} for _ in range(num_experts)]
self.value_cache: List[Dict[int, torch.Tensor]] = [{} for _ in range(num_experts)]
self._seen_tokens: List[Dict[int, int]] = [{} for _ in range(num_experts)] # Used in `generate` to keep tally of how many tokens the cache has seen
self._seen_tokens_total = 0 # 🔍 the total number of individual tokens that at least one expert has seen, this is for `get_seq_length` globally
self.attention_mask_cache: List[Dict[int, torch.BoolTensor]] = [{} for _ in range(num_experts)] # 🔍 this is a new cache for attention mask that records the state of previous tokens
def __getitem__(self, layer_idx: int, expert_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0][0].shape[2]` to get the
sequence length.
"""
if layer_idx < len(self):
if expert_idx < self.num_experts: # 🔍
return (self.key_cache[expert_idx][layer_idx], self.value_cache[expert_idx][layer_idx])
else: # 🔍
raise KeyError(f"Cache only has {self.num_experts} experts, attempted to access expert with index {expert_idx}")
else:
raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
def __iter__(self):
"""
Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
keys and values
"""
for layer_idx in range(len(self)):
for expert_idx in range(self.num_experts): # 🔍
if layer_idx in self.key_cache[expert_idx]:
yield (self.key_cache[expert_idx][layer_idx], self.value_cache[expert_idx][layer_idx])
def __len__(self):
"""
Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
to the number of layers in the model.
"""
all_index_list = [key for i in range(self.num_experts) for key in self.key_cache[i].keys()]
if len(all_index_list) == 0:
return 0
else:
return max(all_index_list) + 1 # 🔍 the maximum layer index among all experts
def update(
self,
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
expert_idx: int, # 🔍
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
key_states (`torch.Tensor`):
The new key states to cache.
value_states (`torch.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
expert_idx (`int`):
🔍 The index of the expert to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. No additional arguments are used in `MoECache`.
Return:
A tuple containing the updated key and value states.
"""
if layer_idx not in self._seen_tokens[expert_idx]: # 🔍
# Update the number of seen tokens
self._seen_tokens[expert_idx][layer_idx] = key_states.shape[-2]
# Update the cache
self.key_cache[expert_idx][layer_idx] = key_states
self.value_cache[expert_idx][layer_idx] = value_states
else: # 🔍
# Update the number of seen tokens
self._seen_tokens[expert_idx][layer_idx] += key_states.shape[-2]
# Update the cache
self.key_cache[expert_idx][layer_idx] = torch.cat([self.key_cache[expert_idx][layer_idx], key_states], dim=-2)
self.value_cache[expert_idx][layer_idx] = torch.cat([self.value_cache[expert_idx][layer_idx], value_states], dim=-2)
return self.key_cache[expert_idx][layer_idx], self.value_cache[expert_idx][layer_idx]
def add_seen_tokens_total(self, new_token_num: int = 0) -> None:
"""🔍 Add the number of new tokens to the total number of seen tokens."""
# THIS FUNCTION IS EXCLUSIVE FOR `MoECache`!
self._seen_tokens_total += new_token_num
def get_seq_length(self, layer_idx: Optional[int] = None, expert_idx: Optional[int] = None) -> Union[List[List[int]], int]: # 🔍
"""Returns the sequence length of the cached states. A layer & expert index can be optionally passed."""
if layer_idx is not None and expert_idx is not None: # 🔍 return the length for specific layer & expert
if self.num_experts <= expert_idx or layer_idx not in self.key_cache[expert_idx]: # 🔍
return 0
else:
return self.key_cache[expert_idx][layer_idx].shape[-2]
else: # 🔍 return the total number of individual tokens the cache has seen
return self._seen_tokens_total
def get_max_length(self) -> Optional[int]:
"""Returns the maximum sequence length of the cached states. MoECache does not have a maximum length."""
return None
def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = None, expert_idx: Optional[int] = None) -> int:
"""Given the sequence length of the new inputs, returns the usable length of the cache."""
# Cache without size limit -> all cache is usable
        # Cache with size limit -> if the cache length plus the length of the new inputs is larger than the maximum cache
# length, we will need to evict part of the cache (and thus not all cache is usable)
max_length = self.get_max_length()
previous_seq_length = self.get_seq_length(layer_idx, expert_idx) # 🔍
if max_length is not None and previous_seq_length + new_seq_length > max_length:
return max_length - new_seq_length
return previous_seq_length
def reorder_cache(self, beam_idx: torch.LongTensor):
"""Reorders the cache for beam search, given the selected beam indices."""
# TODO: support for beam search
print("MoECache, reorder_cache", beam_idx)
raise NotImplementedError
# for layer_idx in range(len(self.key_cache)):
# device = self.key_cache[layer_idx].device
# self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
# device = self.value_cache[layer_idx].device
# self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
raise NotImplementedError
@classmethod
def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "MoECache":
raise NotImplementedError
def update_attention_mask(
self,
new_attention_mask: torch.BoolTensor,
layer_idx: int,
expert_idx: int,
) -> torch.BoolTensor:
"""
🔍 Updates the attention mask cache with the new `attention_mask`.
Parameters:
new_attention_mask (`torch.Tensor`):
The new key states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
expert_idx (`int`):
The index of the expert to cache the states for.
Return:
A tensor containing the updated attention_mask.
"""
# Update the cache
        if layer_idx not in self.attention_mask_cache[expert_idx]:  # 🔍 no attention mask cached, this is the first forward pass for this layer & expert
self.attention_mask_cache[expert_idx][layer_idx] = new_attention_mask
else: # 🔍 concatenate along the seq_len dim
self.attention_mask_cache[expert_idx][layer_idx] = torch.cat([self.attention_mask_cache[expert_idx][layer_idx], new_attention_mask], dim=-1)
return self.attention_mask_cache[expert_idx][layer_idx]
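# Illustrative sketch (not part of the original model code): exercises the per-expert MoECache
# API, which indexes key/value states by (expert_idx, layer_idx) instead of only layer_idx.
# Sizes are assumptions for the example.
def _demo_moe_cache():
    bsz, num_heads, seq_len, head_dim = 1, 1, 4, 16
    cache = MoECache(num_experts=2)
    cache.add_seen_tokens_total(seq_len)
    key = torch.randn(bsz, num_heads, seq_len, head_dim)
    value = torch.randn(bsz, num_heads, seq_len, head_dim)
    cache.update(key, value, layer_idx=0, expert_idx=1)
    assert cache.get_seq_length(layer_idx=0, expert_idx=1) == seq_len
    assert cache.get_seq_length(layer_idx=0, expert_idx=0) == 0
    assert cache.get_seq_length() == seq_len  # global count comes from add_seen_tokens_total
    return cache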
# 🔍 Modified from MixtralAttention
class MixtralAttentionMoE(MixtralAttention):
def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None):
super(MixtralAttention, self).__init__() # 🔍 init using nn.Module
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
"to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.is_causal = True
self.attention_dropout = config.attention_dropout
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
# 🔍
self.softmax = nn.Softmax(dim=-1)
self.top_k_attn = config.top_k_attn
self.attn_experts = config.attn_experts
self.scale_factor_attn = config.scale_factor_attn
self.split_ratio = self.attn_experts // self.num_key_value_heads
self.gate = nn.Linear(self.hidden_size, self.attn_experts, bias=False)
# 🔍
self.q_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.num_key_value_groups * self.head_dim // self.split_ratio, bias=False) for _ in range(self.attn_experts)])
self.k_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.head_dim, bias=False) for _ in range(self.attn_experts)])
self.v_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.head_dim, bias=False) for _ in range(self.attn_experts)])
self.o_proj = nn.ModuleList([nn.Linear(self.num_key_value_groups * self.head_dim // self.split_ratio, self.hidden_size, bias=config.add_rescale_bias) for _ in range(self.attn_experts)]) # 🔍 (may add bias for rescaling)
self.rotary_emb = MixtralRotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None, # 🔍 This should be the Tensor with shape(bsz, seqlen) that represents the padding mask
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[MoECache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
if past_key_value is not None and not isinstance(past_key_value, MoECache): # 🔍 type check
raise TypeError(
"`past_key_value` must be a `MoECache` instance for attention MoE!"
)
# print("attention_mask", attention_mask, attention_mask.shape)
device = hidden_states.device
dtype = hidden_states.dtype
bsz, q_len, hidden_dim = hidden_states.size()
hidden_states = hidden_states.reshape(-1, hidden_dim) # 🔍 flatten the dim
# 🔍 topk gating
        router_logits = self.gate(hidden_states)  # (bsz * q_len, attn_experts)
scores = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(scores, self.top_k_attn, dim=-1) # (bsz * q_len, top_k_attn)
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
routing_weights = routing_weights.to(dtype) # we cast back to the input dtype
# 🔍 moe selection
final_attn_output = torch.zeros_like(hidden_states).reshape(-1, hidden_dim)
# One hot encode the selected experts to create an expert mask
        # this will be used to easily index which expert is going to be solicited
        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.attn_experts)  # (bsz * q_len, top_k_attn, attn_experts)
        expert_mask = expert_mask.permute(2, 1, 0)  # (attn_experts, top_k_attn, bsz * q_len)
# Loop over all available experts in the model and perform the computation on each expert
all_attn_weights = [] if output_attentions else None
for expert_idx in range(self.attn_experts):
# expert_mask[expert_idx]: (top_k_attn, bsz * q_len)
# idx: the topk position. (selected_num)
# top_x: token index. (selected_num)
idx, top_x = torch.nonzero(expert_mask[expert_idx], as_tuple=True)
            if top_x.shape[0] == 0 and not self.training:  # skipping during training would lead to asynchrony among different GPUs and block the training!
if output_attentions:
all_attn_weights.append(None)
continue
            # 🔍 Comment (DDZ): This is useless and even slows things down, so it has been removed.
# in torch it is faster to index using lists than torch tensors
# top_x_list = top_x.tolist()
# idx_list = idx.tolist()
# 🔍 get routing info for this expert
current_batch_ids = (top_x // q_len) # batch ids for current_state, (selected_num)
each_batch_selected_token_num = torch.bincount(current_batch_ids, minlength=bsz) # (bsz)
this_q_len = each_batch_selected_token_num.max().item()
# 🔍 get the indices of each token in the hidden_state of this expert
selection_mask = torch.zeros((bsz * q_len,), device=device, dtype=torch.bool) # the selection mask for this expert (this helps specify the position to put for each token)
selection_mask[top_x] = True
selection_mask = selection_mask.reshape(bsz, q_len)
token_position_indices = torch.cumsum(selection_mask, dim=1) - 1 # the sequence ids of all tokens in the current state, (bsz, q_len)
token_position_indices = token_position_indices.flatten()
current_seq_ids = token_position_indices[top_x] # sequence ids for current_state, (selected_num)
# 🔍 initialize hidden_states for this expert
current_state = torch.zeros((bsz, this_q_len, hidden_dim), dtype=dtype, device=device)
current_state[current_batch_ids, current_seq_ids] = hidden_states[top_x] # assign tokens sparsely
# Normal Attention Forward
# ---------------------------------------------- #
query_states = self.q_proj[expert_idx](current_state) # 🔍 specify expert
key_states = self.k_proj[expert_idx](current_state) # 🔍 specify expert
value_states = self.v_proj[expert_idx](current_state) # 🔍 specify expert
query_states = query_states.view(bsz, this_q_len, self.num_key_value_groups // self.split_ratio, self.head_dim).transpose(1, 2) # 🔍 q_len -> this_q_len, num_heads -> num_key_value_groups
key_states = key_states.view(bsz, this_q_len, 1, self.head_dim).transpose(1, 2) # 🔍 q_len -> this_q_len, num_key_value_heads -> 1
value_states = value_states.view(bsz, this_q_len, 1, self.head_dim).transpose(1, 2) # 🔍 q_len -> this_q_len, num_key_value_heads -> 1
past_key_values_length = 0
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
past_key_values_length = past_key_value.get_usable_length(kv_seq_len, self.layer_idx, expert_idx) # 🔍 specify expert index
kv_seq_len += past_key_values_length
# 🔍 create position_ids for selected tokens
current_position_ids = torch.zeros((bsz, this_q_len), device=device, dtype=torch.long)
current_position_ids[current_batch_ids, current_seq_ids] = position_ids.expand(bsz, q_len).flatten()[top_x]
if top_x.shape[0] > 0: # apply only when there are tokens
cos, sin = self.rotary_emb(value_states, seq_len=current_position_ids.max().item() + 1) # 🔍 adjust the seq_len to the maximum possible value
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, current_position_ids)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, expert_idx, cache_kwargs) # 🔍 specify expert index
# repeat k/v heads if n_kv_heads < n_heads
# Note (DDZ): here the dim is expanded internally, rather than concat-repeat. (Disable for Attention MoE)
# key_states = repeat_kv(key_states, self.num_key_value_groups)
# value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) # softmax temperature
if attn_weights.size() != (bsz, self.num_key_value_groups // self.split_ratio, this_q_len, kv_seq_len): # 🔍 q_len -> this_q_len, num_heads -> num_key_value_groups
raise ValueError(f"Attention weights should be of size {(bsz, self.num_key_value_groups // self.split_ratio, this_q_len, kv_seq_len)}, but is {attn_weights.size()}")
# 🔍 create `current_attention_mask` with reduced `seq_len`
# Notice that the `attention_mask` is passed intact during both training & generation, so we need to adjust the `top_x` by `past_key_values_length`.
# However, we don't have the routing information of previous tokens, which makes it impossible to create `current_attention_mask` for previous tokens.
# So here we need an extra "attention mask cache" to record the `attention_mask` for previous tokens, and update for new tokens accordingly during generation.
current_attention_mask = torch.zeros((bsz, this_q_len), dtype=torch.bool, device=device)
if attention_mask is not None:
if past_key_values_length > 0: # 🔍 we need to exclude previous tokens
previous_seen_tokens_total = past_key_value._seen_tokens_total - q_len
temp_attention_mask = attention_mask[:, previous_seen_tokens_total:].flatten() # select along dimension 1 so that we get tokens in this iteration
else:
temp_attention_mask = attention_mask.flatten() # flatten the dim
current_attention_mask[current_batch_ids, current_seq_ids] = temp_attention_mask[top_x].bool() # assign masks sparsely
else:
current_attention_mask[current_batch_ids, current_seq_ids] = True # assign masks sparsely
# print("current_attention_mask", current_attention_mask, current_attention_mask.shape)
if past_key_value is not None: # 🔍 we need to update with cached attention mask
current_attention_mask = past_key_value.update_attention_mask(current_attention_mask, self.layer_idx, expert_idx)
# if self.layer_idx == 0 and expert_idx == 0:
# print("current_attention_mask", current_attention_mask.sum(-1), current_attention_mask.shape, current_attention_mask[0])
current_attention_mask = _prepare_4d_causal_attention_mask(
current_attention_mask,
(bsz, this_q_len),
current_state,
past_key_values_length,
sliding_window=self.config.sliding_window,
)
if current_attention_mask.size() != (bsz, 1, this_q_len, kv_seq_len): # 🔍 q_len -> this_q_len
raise ValueError(f"Attention mask should be of size {(bsz, 1, this_q_len, kv_seq_len)}, but is {current_attention_mask.size()}")
attn_weights = attn_weights + current_attention_mask # 🔍
# print("current_attention_mask", current_attention_mask.shape, current_attention_mask[0])
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
# if attn_output.size() != (bsz, self.num_key_value_groups // self.split_ratio, this_q_len, self.head_dim): # 🔍 q_len -> this_q_len, num_heads -> num_key_value_groups
# raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is {attn_output.size()}")
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, this_q_len, self.num_key_value_groups * self.head_dim // self.split_ratio) # 🔍 q_len -> this_q_len, hidden_size -> num_key_value_groups * head_dim
attn_output = self.o_proj[expert_idx](attn_output)
# ---------------------------------------------- #
# 🔍 select & rescale the outputs by softmax scores
attn_output = attn_output[current_batch_ids, current_seq_ids] * (routing_weights[top_x, idx, None] * self.scale_factor_attn)
# attn_output = attn_output[current_batch_ids, current_seq_ids] # this line for debug only
# 🔍 add to the final outputs
final_attn_output.index_add_(0, top_x, attn_output)
if output_attentions:
all_attn_weights.append(attn_weights)
# 🔍 reshape
final_attn_output = final_attn_output.reshape(bsz, q_len, hidden_dim)
if output_attentions:
all_attn_weights = tuple(all_attn_weights)
return final_attn_output, all_attn_weights, past_key_value, router_logits # 🔍 return an extra `router_logits`
@torch.no_grad()
def from_vanilla_attention(attention: MixtralAttention, top_k_attn, scale_factor_attn):
# config
layer_idx = attention.layer_idx
config = attention.config
config.top_k_attn = top_k_attn
config.scale_factor_attn = scale_factor_attn
# init
attention_moe = MixtralAttentionMoE(config, layer_idx)
split = 1 # split the hidden_size, support split=1 --> 8/2, split=2 --> 16/4, split=4 --> 32/8
# copy weights
num_key_value_groups = attention_moe.num_key_value_groups // split
head_dim = attention_moe.head_dim
for i in range(config.num_key_value_heads * split):
indices_q_o = [j for j in range(head_dim * num_key_value_groups * i, head_dim * num_key_value_groups * (i + 1))]
indices_k_v = [j for j in range(head_dim * (i // split), head_dim * ((i // split) + 1))]
print(i, "indices_q_o", indices_q_o)
# print(i, "indices_k_v", indices_k_v)
attention_moe.q_proj[i].weight.data = attention.q_proj.weight.data[indices_q_o].clone()
attention_moe.k_proj[i].weight.data = attention.k_proj.weight.data[indices_k_v].clone()
attention_moe.v_proj[i].weight.data = attention.v_proj.weight.data[indices_k_v].clone()
attention_moe.o_proj[i].weight.data = attention.o_proj.weight.data[:, indices_q_o].clone()
return attention_moe
# fmt: on
# Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral
class MixtralFlashAttention2(MixtralAttention):
"""
    Mixtral flash attention module. This module inherits from `MixtralAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
):
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
# overwrite attention_mask with padding_mask
attention_mask = kwargs.pop("padding_mask")
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(
bsz, q_len, self.num_heads, self.head_dim
).transpose(1, 2)
key_states = key_states.view(
bsz, q_len, self.num_key_value_heads, self.head_dim
).transpose(1, 2)
value_states = value_states.view(
bsz, q_len, self.num_key_value_heads, self.head_dim
).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
# Because the input can be padded, the absolute sequence length depends on the max position id.
rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
query_states, key_states = apply_rotary_pos_emb(
query_states, key_states, cos, sin, position_ids
)
use_sliding_windows = (
_flash_supports_window_size
and getattr(self.config, "sliding_window", None) is not None
and kv_seq_len > self.config.sliding_window
)
if not _flash_supports_window_size:
logger.warning_once(
"The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
" make sure to upgrade flash-attn library."
)
if past_key_value is not None:
# Activate slicing cache only if the config has a valid `sliding_window` attribute
cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
if (
getattr(self.config, "sliding_window", None) is not None
and kv_seq_len > self.config.sliding_window
and cache_has_contents
):
slicing_tokens = 1 - self.config.sliding_window
past_key = past_key_value[self.layer_idx][0]
past_value = past_key_value[self.layer_idx][1]
past_key = past_key[:, :, slicing_tokens:, :].contiguous()
past_value = past_value[:, :, slicing_tokens:, :].contiguous()
if past_key.shape[-2] != self.config.sliding_window - 1:
raise ValueError(
f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
f" {past_key.shape}"
)
if attention_mask is not None:
attention_mask = attention_mask[:, slicing_tokens:]
attention_mask = torch.cat(
[attention_mask, torch.ones_like(attention_mask[:, -1:])],
dim=-1,
)
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(
key_states, value_states, self.layer_idx, cache_kwargs
)
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
dropout_rate = 0.0 if not self.training else self.attention_dropout
# In PEFT, we usually cast the layer norms to float32 for training stability reasons;
# therefore the input hidden states get silently cast to float32. Hence, we need to
# cast them back to float16 just to be sure everything works as expected.
input_dtype = query_states.dtype
if input_dtype == torch.float32:
# Handle the case where the model is quantized
if hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
# Reshape to the expected shape for Flash Attention
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
# print("attention_mask", attention_mask, attention_mask.shape)
attn_output = self._flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
dropout=dropout_rate,
use_sliding_windows=use_sliding_windows,
)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
def _flash_attention_forward(
self,
query_states,
key_states,
value_states,
attention_mask,
query_length,
dropout=0.0,
softmax_scale=None,
use_sliding_windows=False,
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
first unpad the input, then compute the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`, *optional*):
Attention dropout probability
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Defaults to `1 / sqrt(head_dim)`.
use_sliding_windows (`bool`, *optional*):
Whether to activate sliding window attention.
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
# TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
causal = self.is_causal and query_length != 1
# Contains at least one padding token in the sequence
if attention_mask is not None:
batch_size = query_states.shape[0]
(
query_states,
key_states,
value_states,
indices_q,
cu_seq_lens,
max_seq_lens,
) = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
if not use_sliding_windows:
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
else:
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
window_size=(
self.config.sliding_window,
self.config.sliding_window,
),
)
attn_output = pad_input(
attn_output_unpad, indices_q, batch_size, query_length
)
else:
if not use_sliding_windows:
attn_output = flash_attn_func(
query_states,
key_states,
value_states,
dropout,
softmax_scale=softmax_scale,
causal=causal,
)
else:
attn_output = flash_attn_func(
query_states,
key_states,
value_states,
dropout,
softmax_scale=softmax_scale,
causal=causal,
window_size=(
self.config.sliding_window,
self.config.sliding_window,
),
)
return attn_output
def _upad_input(
self, query_layer, key_layer, value_layer, attention_mask, query_length
):
batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
# On the first iteration we need to properly re-create the padding mask
# by slicing it at the proper place
if kv_seq_len != attention_mask.shape[-1]:
attention_mask_num_tokens = attention_mask.shape[-1]
attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
indices_k,
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
) # There is a memcpy here, that is very bad.
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
# The -q_len: slice assumes left padding.
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
query_layer, attention_mask
)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
class MixtralFlashAttention2MoE(MixtralFlashAttention2):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.top_k_attn = self.config.top_k_attn
self.attn_experts = self.config.attn_experts
self.scale_factor_attn = self.config.scale_factor_attn
self.split_ratio = self.attn_experts // self.num_key_value_heads
self.gate = nn.Linear(self.hidden_size, self.attn_experts, bias=False)
self.q_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.num_key_value_groups * self.head_dim // self.split_ratio, bias=False) for _ in range(self.attn_experts)])
self.k_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.head_dim, bias=False) for _ in range(self.attn_experts)])
self.v_proj = nn.ModuleList([nn.Linear(self.hidden_size, self.head_dim, bias=False) for _ in range(self.attn_experts)])
self.o_proj = nn.ModuleList([nn.Linear(self.num_key_value_groups * self.head_dim // self.split_ratio, self.hidden_size, bias=self.config.add_rescale_bias) for _ in range(self.attn_experts)])
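# Layout note: the gate routes each token to `top_k_attn` of the `attn_experts` attention experts.
# Each expert owns a (num_key_value_groups // split_ratio)-head query projection, a single-head
# key/value projection, and its own output projection back to `hidden_size`, so each expert is
# effectively a slice of the original multi-head attention.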
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
):
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
# overwrite attention_mask with padding_mask
# attention_mask = kwargs.pop("padding_mask")
if past_key_value is not None and not isinstance(past_key_value, MoECache): # 🔍 type check
raise TypeError(
"`past_key_value` must be a `MoECache` instance for attention MoE!"
)
bsz, q_len, hidden_dim = hidden_states.size()
device = hidden_states.device
dtype = hidden_states.dtype
hidden_states = hidden_states.reshape(-1, hidden_dim)
# gate compute
router_logits = self.gate(hidden_states)
router_scores = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(router_scores, self.top_k_attn, dim=-1)
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
routing_weights = routing_weights.to(dtype)
final_attn_output = torch.zeros_like(hidden_states).reshape(-1, hidden_dim)
expert_mask = F.one_hot(selected_experts, num_classes=self.attn_experts).permute(2, 1, 0)
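# expert_mask[e, k, t] == 1 iff token t selected expert e as its k-th routing choice
# (shape: num_experts x top_k_attn x bsz * q_len)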
all_attn_weights = [] if output_attentions else None
for expert_idx in range(self.attn_experts):
idx, top_x = torch.nonzero(expert_mask[expert_idx], as_tuple=True)
# top_x_list = top_x.tolist()
# idx_list = idx.tolist()
if top_x.shape[0] == 0 and not self.training: # skipping during training would lead to asynchrony among different GPUs and block the training!
if output_attentions:
all_attn_weights.append(None)
continue
# create position_ids for selected tokens
current_batch_ids = (top_x // q_len)
each_batch_selected_token_num = torch.bincount(current_batch_ids, minlength=bsz) # (bsz)
this_q_len = each_batch_selected_token_num.max().item()
selection_mask = torch.zeros((bsz * q_len,), device=device, dtype=torch.bool)
selection_mask[top_x] = True
selection_mask = selection_mask.reshape(bsz, q_len)
token_position_indices = torch.cumsum(selection_mask, dim=1) - 1
token_position_indices = token_position_indices.flatten()
current_seq_ids = token_position_indices[top_x]
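# (current_batch_ids[i], current_seq_ids[i]) is the slot of the i-th selected token inside a
# compact per-expert buffer of shape (bsz, this_q_len), where this_q_len is the largest number
# of tokens any batch element routed to this expert.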
# 🔍 initialize hidden_states for this expert
current_state = torch.zeros((bsz, this_q_len, hidden_dim), dtype=dtype, device=device)
current_state[current_batch_ids, current_seq_ids] = hidden_states[top_x] # assign tokens sparsely
# for attention forward
# expert_inputs = viewed_hidden_states[None, top_x_list].reshape(-1, self.hidden_size)
query_states = self.q_proj[expert_idx](current_state)
key_states = self.k_proj[expert_idx](current_state)
value_states = self.v_proj[expert_idx](current_state)
# seq_len = query_states.numel() // (bsz * self.num_key_value_groups * self.head_dim)
query_states = query_states.view(bsz, -1, self.num_key_value_groups // self.split_ratio, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, -1, 1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, -1, 1, self.head_dim).transpose(1, 2)
# for moe kv cache
past_key_values_length = 0
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
past_key_values_length = past_key_value.get_usable_length(kv_seq_len, self.layer_idx, expert_idx) # 🔍 specify expert index
kv_seq_len += past_key_values_length
current_position_ids = torch.zeros((bsz, this_q_len), device=hidden_states.device, dtype=torch.long)
current_position_ids[current_batch_ids, current_seq_ids] = position_ids.expand(bsz, q_len).flatten()[top_x]
if top_x.shape[0] > 0: # apply only when there are tokens
cos, sin = self.rotary_emb(value_states, seq_len=current_position_ids.max().item() + 1) # 🔍 adjust the seq_len to the maximum possible value
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, current_position_ids)
if past_key_value is not None:
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, expert_idx, cache_kwargs) # 🔍 specify expert index
# print("attention_mask", attention_mask.shape, attention_mask)
# for current attention mask
'''
current_attention_mask = torch.zeros((bsz, this_q_len), dtype=torch.bool, device=device)
if attention_mask is not None:
if past_key_values_length > 0: # 🔍 we need to exclude previous tokens
previous_seen_tokens_total = past_key_value._seen_tokens_total - q_len
temp_attention_mask = attention_mask[:, previous_seen_tokens_total:].flatten() # select along dimension 1 so that we get tokens in this iteration
else:
temp_attention_mask = attention_mask.flatten() # flatten the dim
current_attention_mask[current_batch_ids, current_seq_ids] = temp_attention_mask[top_x] # bug here !!!
else:
current_attention_mask[current_batch_ids, current_seq_ids] = True # assign masks sparsely
if past_key_value is not None: # 🔍 we need to update with cached attention mask
current_attention_mask = past_key_value.update_attention_mask(current_attention_mask, self.layer_idx, expert_idx)
current_attention_mask = _prepare_4d_causal_attention_mask(
current_attention_mask,
(bsz, this_q_len),
current_state,
past_key_values_length,
sliding_window=self.config.sliding_window,
)
if current_attention_mask.size() != (bsz, 1, this_q_len, kv_seq_len): # 🔍 q_len -> this_q_len
raise ValueError(f"Attention mask should be of size {(bsz, 1, this_q_len, kv_seq_len)}, but is {current_attention_mask.size()}")
'''
# for sliding window
use_sliding_windows = (
_flash_supports_window_size
and getattr(self.config, "sliding_window", None) is not None
and kv_seq_len > self.config.sliding_window
)
if not _flash_supports_window_size:
logger.warning_once(
"The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
" make sure to upgrade flash-attn library."
)
# wait for change! sliding_window=4096
if past_key_value is not None:
# Activate slicing cache only if the config has a valid `sliding_window` attribute
cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
if (
getattr(self.config, "sliding_window", None) is not None
and kv_seq_len > self.config.sliding_window
and cache_has_contents
):
slicing_tokens = 1 - self.config.sliding_window
past_key = past_key_value[self.layer_idx][0]
past_value = past_key_value[self.layer_idx][1]
past_key = past_key[:, :, slicing_tokens:, :].contiguous()
past_value = past_value[:, :, slicing_tokens:, :].contiguous()
if past_key.shape[-2] != self.config.sliding_window - 1:
raise ValueError(
f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
f" {past_key.shape}"
)
if attention_mask is not None:
attention_mask = attention_mask[:, slicing_tokens:]
attention_mask = torch.cat(
[attention_mask, torch.ones_like(attention_mask[:, -1:])],
dim=-1,
)
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(
key_states, value_states, self.layer_idx, expert_idx, cache_kwargs # 🔍 MoECache.update expects the expert index
)
# for input dtype
input_dtype = query_states.dtype
if input_dtype == torch.float32:
# Handle the case where the model is quantized
if hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj[0].weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
dropout_rate = 0.0 if not self.training else self.attention_dropout
repeat_num = query_states.shape[1]
key_states = repeat_kv(key_states, repeat_num)
value_states = repeat_kv(value_states, repeat_num)
# print("repeat_num", repeat_num)
# print("query_states shape", query_states.shape, key_states.shape, value_states.shape)
# Reshape to the expected shape for Flash Attention
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
attn_output = self._flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
this_q_len,
dropout=dropout_rate,
use_sliding_windows=use_sliding_windows,
)
attn_output = attn_output.reshape(bsz, this_q_len, self.num_key_value_groups * self.head_dim // self.split_ratio).contiguous()
attn_output = self.o_proj[expert_idx](attn_output)
attn_output = attn_output[current_batch_ids, current_seq_ids] * (routing_weights[top_x, idx, None] * self.scale_factor_attn)
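# scatter-add the routing-weighted expert outputs back to their original (flattened) token positions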
final_attn_output.index_add_(0, top_x, attn_output)
final_attn_output = final_attn_output.reshape(bsz, q_len, hidden_dim)
if not output_attentions:
attn_weights = None
return final_attn_output, attn_weights, past_key_value, router_logits # 🔍 return an extra `router_logits`
class MixtralFlashAttention2MoE_zt(MixtralFlashAttention2):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.top_k_attn = self.config.top_k_attn
self.scale_factor_attn = self.config.scale_factor_attn
# self.num_heads
# self.head_dim
# self.num_key_value_heads
# self.num_key_value_groups # total number of experts
assert self.top_k_attn <= self.num_key_value_groups
# assert self.top_k_attn % self.num_key_value_heads == 0
self.attn_hsz = self.hidden_size // self.num_key_value_groups * self.top_k_attn
self.kv_repeat_num = self.attn_hsz // (self.num_key_value_heads * self.head_dim)
self.simulated_attn_head_num = self.attn_hsz // self.head_dim
assert self.attn_hsz % (self.num_key_value_heads * self.head_dim) == 0
assert self.simulated_attn_head_num == self.num_heads * (self.top_k_attn / self.num_key_value_groups)
assert self.kv_repeat_num * self.num_key_value_heads == self.simulated_attn_head_num
self.gate = nn.Linear(self.hidden_size, self.num_key_value_groups, bias=False)
# tzhu: there are self.num_key_value_groups experts
# each expert has a size of self.attn_hsz
self.q_proj = nn.ModuleList(
[nn.Linear(self.hidden_size, self.attn_hsz) for _ in range(self.num_key_value_groups)]
)
self.o_proj = nn.ModuleList(
[nn.Linear(self.attn_hsz, self.hidden_size) for _ in range(self.num_key_value_groups)]
)
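# Layout note (zt variant): only q_proj and o_proj are turned into experts; k_proj and v_proj stay
# shared across experts. Each of the num_key_value_groups experts projects to attn_hsz query
# dimensions, i.e. simulated_attn_head_num heads of size head_dim, and the shared KV heads are
# repeated kv_repeat_num times to match.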
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
**kwargs,
):
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
# overwrite attention_mask with padding_mask
attention_mask = kwargs.pop("padding_mask")
bsz, q_len, _ = hidden_states.size()
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# tzhu: attn-moe on q_proj
viewed_hidden_states = hidden_states.view(bsz * q_len, self.hidden_size)
# router
router_logits = self.gate(viewed_hidden_states)
router_scores = F.softmax(router_logits, dim=-1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(router_scores, self.top_k_attn, dim=-1)
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
routing_weights = routing_weights.to(hidden_states.dtype)
query_states = torch.zeros(
(bsz * q_len, self.attn_hsz),
dtype=hidden_states.dtype,
device=hidden_states.device,
)
# expert_mask: (num_experts, top_k_attn, bsz * q_len)
expert_mask = F.one_hot(selected_experts, num_classes=self.num_key_value_groups).permute(2, 1, 0)
for expert_idx in range(self.num_key_value_groups):
expert_layer = self.q_proj[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx])
top_x_list = top_x.tolist()
idx_list = idx.tolist()
expert_inputs = viewed_hidden_states[None, top_x_list].reshape(-1, self.hidden_size)
# inputs (-1, hidden_size) -> outputs (-1, attn_hsz)
expert_outs = expert_layer(expert_inputs) * routing_weights[top_x_list, idx_list, None] * self.scale_factor_attn
query_states.index_add_(0, top_x, expert_outs.to(query_states.dtype))
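# after the expert loop, query_states holds, for every token, the routing-weighted sum of its
# top-k experts' query projections (scaled by scale_factor_attn)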
query_states = query_states.view(bsz, q_len, self.attn_hsz)
# query_states = query_states.view(
# bsz, q_len, self.num_heads, self.simulated_attn_head_num
# ).transpose(1, 2)
query_states = query_states.view(
bsz, q_len, self.simulated_attn_head_num, self.head_dim
).transpose(1, 2)
key_states = key_states.view(
bsz, q_len, self.num_key_value_heads, self.head_dim
).transpose(1, 2)
value_states = value_states.view(
bsz, q_len, self.num_key_value_heads, self.head_dim
).transpose(1, 2)
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
if self.layer_idx is None:
raise ValueError(
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
"with a layer index."
)
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
# Because the input can be padded, the absolute sequence length depends on the max position id.
rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
query_states, key_states = apply_rotary_pos_emb(
query_states, key_states, cos, sin, position_ids
)
use_sliding_windows = (
_flash_supports_window_size
and getattr(self.config, "sliding_window", None) is not None
and kv_seq_len > self.config.sliding_window
)
if not _flash_supports_window_size:
logger.warning_once(
"The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
" make sure to upgrade flash-attn library."
)
if past_key_value is not None:
# Activate slicing cache only if the config has a valid `sliding_window` attribute
cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
if (
getattr(self.config, "sliding_window", None) is not None
and kv_seq_len > self.config.sliding_window
and cache_has_contents
):
slicing_tokens = 1 - self.config.sliding_window
past_key = past_key_value[self.layer_idx][0]
past_value = past_key_value[self.layer_idx][1]
past_key = past_key[:, :, slicing_tokens:, :].contiguous()
past_value = past_value[:, :, slicing_tokens:, :].contiguous()
if past_key.shape[-2] != self.config.sliding_window - 1:
raise ValueError(
f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
f" {past_key.shape}"
)
if attention_mask is not None:
attention_mask = attention_mask[:, slicing_tokens:]
attention_mask = torch.cat(
[attention_mask, torch.ones_like(attention_mask[:, -1:])],
dim=-1,
)
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
key_states, value_states = past_key_value.update(
key_states, value_states, self.layer_idx, cache_kwargs
)
# repeat k/v heads if n_kv_heads < n_heads
key_states = repeat_kv(key_states, self.kv_repeat_num)
value_states = repeat_kv(value_states, self.kv_repeat_num)
dropout_rate = 0.0 if not self.training else self.attention_dropout
# In PEFT, we usually cast the layer norms to float32 for training stability reasons;
# therefore the input hidden states get silently cast to float32. Hence, we need to
# cast them back to float16 just to be sure everything works as expected.
input_dtype = query_states.dtype
if input_dtype == torch.float32:
# Handle the case where the model is quantized
if hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj[0].weight.dtype # q_proj is an nn.ModuleList of experts here
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
# Reshape to the expected shape for Flash Attention
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
attn_output = self._flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
dropout=dropout_rate,
use_sliding_windows=use_sliding_windows,
)
attn_output = attn_output.reshape(bsz * q_len, self.attn_hsz).contiguous()
final_attn_output = torch.zeros(
(bsz * q_len, self.hidden_size),
dtype=hidden_states.dtype,
device=hidden_states.device,
)
for expert_idx in range(self.num_key_value_groups):
expert_layer = self.o_proj[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx])
top_x_list = top_x.tolist()
idx_list = idx.tolist()
expert_inputs = attn_output[None, top_x_list].reshape(-1, self.attn_hsz)
expert_outs = expert_layer(expert_inputs) * routing_weights[top_x_list, idx_list, None] * self.scale_factor_attn
final_attn_output.index_add_(0, top_x, expert_outs.to(final_attn_output.dtype))
final_attn_output = final_attn_output.view(bsz, q_len, self.hidden_size)
if not output_attentions:
attn_weights = None
return final_attn_output, attn_weights, past_key_value, router_logits
@torch.no_grad()
def from_vanilla_attention(attention: MixtralAttention, top_k_attn, scale_factor_attn):
# config
layer_idx = attention.layer_idx
config = attention.config
config.top_k_attn = top_k_attn
config.scale_factor_attn = scale_factor_attn
# init
attention_moe = MixtralFlashAttention2MoE(config, layer_idx)
# copy weights
num_key_value_groups = attention_moe.num_key_value_groups
head_dim = attention_moe.head_dim
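# Expert i gathers the i-th query head from every KV-head group, i.e. head indices
# i, i + G, i + 2G, ... where G = num_key_value_groups, and copies the matching rows/columns
# of the dense q_proj/o_proj weights.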
for i in range(num_key_value_groups):
indices_q_o = []
for j in range(attention_moe.num_key_value_heads):
k = i + j * num_key_value_groups
indices_q_o.extend(
list(range(k * head_dim, (k + 1) * head_dim))
)
print(i, "indices_q_o", indices_q_o)
attention_moe.q_proj[i].weight.data = attention.q_proj.weight.data[indices_q_o].clone()
attention_moe.o_proj[i].weight.data = attention.o_proj.weight.data[:, indices_q_o].clone()
return attention_moe
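# Usage sketch (illustrative values; assumes `dense_attn` is an existing MixtralAttention module
# and that the MoE fields such as `attn_experts` are already set on its config):
#   attn_moe = from_vanilla_attention(dense_attn, top_k_attn=2, scale_factor_attn=4.0)
#   decoder_layer.self_attn = attn_moe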
class MixtralBLockSparseTop2MLP(nn.Module):
def __init__(self, config: MixtralConfig, ffn_dim, add_rescale_bias=False): # 🔍
super().__init__()
self.ffn_dim = ffn_dim # 🔍
self.hidden_dim = config.hidden_size
self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) # gate
self.w2 = nn.Linear(
self.ffn_dim, self.hidden_dim, bias=add_rescale_bias
) # 🔍 down (may add bias for rescaling)
self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) # up
self.act_fn = ACT2FN[config.hidden_act]
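# GLU-style MLP: down(act(gate(x)) * up(x)), i.e. w2(act_fn(w1(x)) * w3(x))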
def forward(self, hidden_states):
current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(
hidden_states
)
current_hidden_states = self.w2(current_hidden_states)
return current_hidden_states
MISTRAL_ATTENTION_CLASSES = {
"eager": MixtralAttention,
"flash_attention_2": MixtralFlashAttention2,
}
# 🔍
MISTRAL_ATTENTION_MOE_CLASSES = {
"eager": MixtralAttentionMoE,
"flash_attention_2": MixtralFlashAttention2MoE,
}
class SimplifiedSparseGLU(nn.Module):
def __init__(self, args: MegablocksArguments):
super().__init__()
self.args = args
if args.bf16:
torch_dtype = torch.bfloat16
elif args.fp16:
torch_dtype = torch.float16
else:
torch_dtype = None
# gate
self.w1 = nn.Parameter(
torch.empty(
args.ffn_hidden_size * args.moe_num_experts,
args.hidden_size,
dtype=torch_dtype,
)
)
# down
self.w2 = nn.Parameter(
torch.empty(
args.ffn_hidden_size * args.moe_num_experts,
args.hidden_size,
dtype=torch_dtype,
)
)
# up
self.v1 = nn.Parameter(
torch.empty(
args.ffn_hidden_size * args.moe_num_experts,
args.hidden_size,
dtype=torch_dtype,
)
)
self.act_fn = args.activation_fn
def forward(self, x, topo):
if self.args.memory_optimized_mlp:
raise NotImplementedError(
"Memory optimized implementation not yet supported with GLU with sparse kernels."
)
# TODO (tzhu): test if OOM comes from dtensor conversion
# TODO (tzhu): return x directly to see if it still encounters OOM
# w1, v1, w2 = (
# resolve_dtensor(self.w1),
# resolve_dtensor(self.v1),
# resolve_dtensor(self.w2),
# )
# Compute the GLU.
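# sdd: dense x dense matmul producing a block-sparse output with topology `topo`;
# mul: elementwise product of the two block-sparse operands;
# dsd: block-sparse x dense matmul producing a dense output.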
x1 = stk.ops.sdd(x, self.w1.t(), topo)
x2 = stk.ops.sdd(x, self.v1.t(), topo)
activation_fn_out = act_fn(x1, self.act_fn)
x1 = stk.ops.mul(activation_fn_out, x2)
return stk.ops.dsd(x1, self.w2)
class SimplifiedGroupedSparseGLU(SimplifiedSparseGLU):
def forward(self, x, tokens_per_expert):
batch_sizes = tokens_per_expert.cpu().to(torch.long)
# w1, v1, w2 = (
# resolve_dtensor(self.w1),
# resolve_dtensor(self.v1),
# resolve_dtensor(self.w2),
# )
# Re-shape the weights for the grouped GEMMs.
# ne = mpu.experts_per_rank(self.args)
# w1 = self.w1.view(ne, -1, self.args.hidden_size)
# v1 = self.v1.view(ne, -1, self.args.hidden_size)
# w2 = self.w2.view(ne, -1, self.args.hidden_size)
ne = self.args.moe_num_experts
w1 = self.w1.view(ne, -1, self.args.hidden_size)
v1 = self.v1.view(ne, -1, self.args.hidden_size)
w2 = self.w2.view(ne, -1, self.args.hidden_size)
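# grouped GEMM path: w1/v1/w2 are viewed as (num_experts, ffn_hidden_size, hidden_size) and gmm
# below runs one GEMM per expert over its `batch_sizes[e]` tokens, computing the same GLU as above.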
if self.args.memory_optimized_mlp:
return memory_optimized_grouped_glu(
x,
w1,
v1,
w2,
batch_sizes,
self.args.quantize_inputs_num_bits,
self.args.quantize_rematerialize_num_bits,
self.args.activation_fn,
)
# Compute the MLP.
x1 = gg.ops.gmm(x, w1, batch_sizes, trans_b=True)
x2 = gg.ops.gmm(x, v1, batch_sizes, trans_b=True)
x1 = self.act_fn(x1) * x2
return gg.ops.gmm(x1, w2, batch_sizes)
_REGISTRY["simplified_glu"] = {
"grouped": SimplifiedGroupedSparseGLU,
"sparse": SimplifiedSparseGLU,
}
class SimplifiedParallelDroplessMLP(ParallelDroplessMLP):
def forward(self, x, expert_weights, top_experts):
in_shape = x.size()
# Compute the experts.
x, tokens_per_expert = self.forward_fn(x, expert_weights, top_experts)
x = x.view(in_shape)
if self.bias is not None:
if self.args.return_bias:
return x, self.bias
return x + self.bias
return x
class MixtralSparseMoeBlock(nn.Module):
"""
This implementation is
strictly equivalent to standard MoE with full capacity (no
dropped tokens). It's faster since it formulates MoE operations
in terms of block-sparse operations to accommodate imbalanced
assignments of tokens to experts, whereas standard MoE either
(1) drops tokens at the cost of reduced performance or (2) sets the
capacity factor to the number of experts and thus wastes computation
and memory on padding.
"""
def __init__(self, config):
super().__init__()
self.hidden_dim = config.hidden_size
self.ffn_dim = config.intermediate_size
self.num_experts = config.num_local_experts
self.top_k = config.num_experts_per_tok
# specialized for llama-moe-v2
self.scale_factor = config.scale_factor
self.moe_type = config.moe_type
# gating
self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
if self.moe_type == "modulelist":
self.experts = nn.ModuleList(
[
MixtralBLockSparseTop2MLP(
config,
config.intermediate_size,
add_rescale_bias=config.add_rescale_bias,
)
for _ in range(self.num_experts)
] # 🔍
)
elif self.moe_type == "megablocks":
if config.add_rescale_bias:
raise NotImplementedError(
"RescaleBias not yet supported with megablocks."
)
is_fp16 = self.gate.weight.dtype == torch.float16
is_bf16 = self.gate.weight.dtype == torch.bfloat16
args = MegablocksArguments(
hidden_size=self.hidden_dim,
ffn_hidden_size=self.ffn_dim,
moe_num_experts=self.num_experts,
moe_top_k=self.top_k,
activation_fn={"silu": F.silu}[config.hidden_act],
mlp_type="simplified_glu",
mlp_impl="sparse",
memory_optimized_mlp=False,
bias=False,
fp16=is_fp16,
bf16=is_bf16,
)
self.experts = SimplifiedParallelDroplessMLP(args)
elif self.moe_type == "scattermoe":
if config.add_rescale_bias:
raise NotImplementedError(
"RescaleBias not yet supported with scattermoe."
)
self.experts = scattermoe.mlp.GLUMLP(
input_size=self.hidden_dim,
hidden_size=self.ffn_dim,
num_experts=self.num_experts,
top_k=self.top_k,
activation={"silu": F.silu}[config.hidden_act],
)
else:
raise NotImplementedError(f"Unsupported moe_type: {self.moe_type}")
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
""" """
batch_size, sequence_length, hidden_dim = hidden_states.shape
hidden_states = hidden_states.view(-1, hidden_dim)
# router_logits: (batch * sequence_length, n_experts)
router_logits = self.gate(hidden_states)
scores = F.softmax(router_logits, dim=1, dtype=torch.float)
routing_weights, selected_experts = torch.topk(scores, self.top_k, dim=-1)
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
# we cast back to the input dtype
routing_weights = routing_weights.to(hidden_states.dtype)
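# routing_weights: (num_tokens, top_k), renormalized so each token's selected expert weights sum to 1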
if self.moe_type == "megablocks":
final_hidden_states = self.experts(
hidden_states, routing_weights, selected_experts
)
elif self.moe_type == "scattermoe":
final_hidden_states = self.experts(
hidden_states, routing_weights, selected_experts
)
else:
final_hidden_states = torch.zeros(
(batch_size * sequence_length, hidden_dim),
dtype=hidden_states.dtype,
device=hidden_states.device,
)
# One hot encode the selected experts to create an expert mask
# this will be used to easily index which expert is going to be solicited
expert_mask = torch.nn.functional.one_hot(
selected_experts, num_classes=self.num_experts
).permute(2, 1, 0)
# Loop over all available experts in the model and perform the computation on each expert
for expert_idx in range(self.num_experts):
expert_layer = self.experts[expert_idx]
idx, top_x = torch.where(expert_mask[expert_idx])
if (
top_x.shape[0] == 0 and not self.training
): # skipping during training would lead to asynchrony among different GPUs and block the training!
continue
# in torch it is faster to index using lists than torch tensors
top_x_list = top_x.tolist()
idx_list = idx.tolist()
# Index the correct hidden states and compute the expert hidden state for
# the current expert. We need to make sure to multiply the output hidden
# states by `routing_weights` on the corresponding tokens (top-1 and top-2)
current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
current_hidden_states = expert_layer(current_state) * (
routing_weights[top_x_list, idx_list, None] * self.scale_factor
)
# However `index_add_` only supports torch tensors for indexing so we'll use
# the `top_x` tensor here.
final_hidden_states.index_add_(
0, top_x, current_hidden_states.to(hidden_states.dtype)
)
final_hidden_states = final_hidden_states.reshape(
batch_size, sequence_length, hidden_dim
)
return final_hidden_states, router_logits
class MixtralDecoderLayer(nn.Module):
def __init__(self, config: MixtralConfig, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
# 🔍
self.is_moe = (layer_idx >= config.num_moe_contract_layers) and (
layer_idx < config.num_hidden_layers - config.num_moe_contract_layers
)
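# The first and last `num_moe_contract_layers` layers keep a dense MLP (sized
# intermediate_size * num_local_experts); only the middle layers use the sparse MoE block.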
self.use_attn_moe = config.use_attn_moe
if self.use_attn_moe:
attn_class = MISTRAL_ATTENTION_MOE_CLASSES[config._attn_implementation]
else:
attn_class = MISTRAL_ATTENTION_CLASSES[config._attn_implementation]
self.self_attn = attn_class(config, layer_idx)
if self.is_moe:
self.block_sparse_moe = MixtralSparseMoeBlock(config)
self.mlp_residual = (
MixtralBLockSparseTop2MLP(config, config.intermediate_size_residual)
if config.intermediate_size_residual is not None
else None
)
else:
self.block_sparse_moe = MixtralBLockSparseTop2MLP(
config, config.intermediate_size * config.num_local_experts
)
self.mlp_residual = None
self.input_layernorm = MixtralRMSNorm(
config.hidden_size, eps=config.rms_norm_eps
)
self.post_attention_layernorm = MixtralRMSNorm(
config.hidden_size, eps=config.rms_norm_eps
)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
output_router_logits: Optional[bool] = False,
use_cache: Optional[bool] = False,
**kwargs,
) -> Tuple[
torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
]:
if "padding_mask" in kwargs:
warnings.warn(
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
)
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, sequence_length)` where padding elements are indicated by 0.
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_router_logits (`bool`, *optional*):
Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
should not be returned during inference.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# 🔍 Self Attention
if self.use_attn_moe:
(
hidden_states,
self_attn_weights,
present_key_value,
attn_router_logits,
) = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
else:
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
attn_router_logits = None
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states_input = self.post_attention_layernorm(hidden_states)
# 🔍
if self.is_moe:
hidden_states, router_logits = self.block_sparse_moe(hidden_states_input)
else:
hidden_states = self.block_sparse_moe(hidden_states_input)
router_logits = None
if self.mlp_residual is not None:
hidden_states += self.mlp_residual(hidden_states_input)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
if output_router_logits:
outputs += (router_logits, attn_router_logits) # 🔍
return outputs
# Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->Mixtral
class MixtralPreTrainedModel(PreTrainedModel):
config_class = MixtralConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["MixtralDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
_supports_flash_attn_2 = True
_supports_cache_class = True
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
# Copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral
class MixtralModel(MixtralPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`]
Args:
config: MixtralConfig
"""
def __init__(self, config: MixtralConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(
config.vocab_size, config.hidden_size, self.padding_idx
)
self.layers = nn.ModuleList(
[
MixtralDecoderLayer(config, layer_idx)
for layer_idx in range(config.num_hidden_layers)
]
)
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self.norm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
# Ignore copy
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MoeModelOutputWithPast]:
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_router_logits = (
output_router_logits
if output_router_logits is not None
else self.config.output_router_logits
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
)
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError(
"You have to specify either decoder_input_ids or decoder_inputs_embeds"
)
past_key_values_length = 0
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
if use_cache:
use_legacy_cache = not isinstance(past_key_values, Cache)
if use_legacy_cache:
if self.config.use_attn_moe: # 🔍
past_key_values = MoECache.from_legacy_cache(past_key_values)
else: # 🔍
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
past_key_values_length = past_key_values.get_usable_length(seq_length)
# 🔍 add total seen tokens, this is VERY important for getting correct `past_key_values_length`!
if self.config.use_attn_moe:
past_key_values.add_seen_tokens_total(seq_length)
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(
past_key_values_length,
seq_length + past_key_values_length,
dtype=torch.long,
device=device,
)
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
else:
position_ids = position_ids.view(-1, seq_length).long()
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
if attention_mask is not None and self._use_flash_attention_2 and use_cache:
is_padding_right = attention_mask[:, -1].sum().item() != batch_size
if is_padding_right:
raise ValueError(
"You are attempting to perform batched generation with padding_side='right'"
" this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to "
" call `tokenizer.padding_side = 'left'` before tokenizing the input. "
)
if (
self._use_flash_attention_2 or self.config.use_attn_moe
): # 🔍 added special case for attention MoE
# 2d mask is passed through the layers
attention_mask = (
attention_mask
if (attention_mask is not None and 0 in attention_mask)
else None
)
else:
# 4d mask is passed through the layers
attention_mask = _prepare_4d_causal_attention_mask(
attention_mask,
(batch_size, seq_length),
inputs_embeds,
past_key_values_length,
sliding_window=self.config.sliding_window,
)
# print("attention_mask" , attention_mask)
hidden_states = inputs_embeds
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
all_router_logits = () if output_router_logits else None
all_attn_router_logits = () if output_router_logits else None # 🔍
next_decoder_cache = None
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.gradient_checkpointing and self.training:
def create_custom_forward(module):
def custom_forward(*inputs):
# None for past_key_value
return module(*inputs)
return custom_forward
layer_outputs: tuple = torch.utils.checkpoint.checkpoint(
create_custom_forward(decoder_layer),
hidden_states,
attention_mask,
position_ids,
past_key_values,
output_attentions,
output_router_logits,
use_cache,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
output_router_logits=output_router_logits,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache = layer_outputs[2 if output_attentions else 1]
if output_attentions:
all_self_attns += (layer_outputs[1],)
if output_router_logits:
all_router_logits += (layer_outputs[-2],)
all_attn_router_logits += (layer_outputs[-1],)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = None
if use_cache:
next_cache = (
next_decoder_cache.to_legacy_cache()
if use_legacy_cache
else next_decoder_cache
)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_cache,
all_hidden_states,
all_self_attns,
all_router_logits,
]
if v is not None
)
return MoeModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
router_logits=all_router_logits,
attn_router_logits=all_attn_router_logits, # 🔍
)
class MixtralForCausalLM(MixtralPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = MixtralModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.router_aux_loss_coef = config.router_aux_loss_coef
self.num_experts = config.num_local_experts
self.num_experts_per_tok = config.num_experts_per_tok
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, MixtralModel):
module.gradient_checkpointing = value
# Ignore copy
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
output_router_logits: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, MoeCausalLMOutputWithPast]:
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_router_logits = (
output_router_logits
if output_router_logits is not None
else self.config.output_router_logits
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs: MoeModelOutputWithPast = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
output_router_logits=output_router_logits,
return_dict=return_dict,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states)
logits = logits.float()
loss = None
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
# print("MixtralForCausalLM, cross entropy loss", loss)
aux_loss = None
if output_router_logits:
valid_router_logits = tuple(
logits
for logits in (outputs.router_logits if return_dict else outputs[-2])
if logits is not None
)
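# Dense (non-MoE) layers return `None` router logits; filter them out before computing the
# load-balancing auxiliary loss.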
aux_loss = load_balancing_loss_func(
valid_router_logits,
self.num_experts,
self.num_experts_per_tok,
use_layer_wise_balance=self.config.use_layer_wise_balance, # ✨
)
if labels is not None:
loss += self.router_aux_loss_coef * aux_loss
# loss_mlp = self.router_aux_loss_coef * aux_loss
# loss = loss + loss_mlp
# print("MixtralForCausalLM, mlp aux_loss", loss_mlp)
# 🔍 for Attention MoE
#################################
valid_attn_router_logits = tuple(
logits
for logits in (
outputs.attn_router_logits if return_dict else outputs[-1]
)
if logits is not None
)
if len(valid_attn_router_logits) > 0: # there exist logits that are not None
attn_aux_loss = load_balancing_loss_func(
valid_attn_router_logits,
self.config.attn_experts,
self.config.top_k_attn,
use_layer_wise_balance=self.config.use_layer_wise_balance, # ✨
)
if labels is not None:
loss += self.router_aux_loss_coef * attn_aux_loss
# loss_attn = self.router_aux_loss_coef * attn_aux_loss
# loss = loss + loss_attn
# print("MixtralForCausalLM, attn aux_loss", loss_attn)
#################################
if not return_dict:
output = (logits,) + outputs[1:]
if output_router_logits:
output = (aux_loss,) + output
return (loss,) + output if loss is not None else output
return MoeCausalLMOutputWithPast(
loss=loss,
aux_loss=aux_loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
router_logits=outputs.router_logits,
)
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
inputs_embeds=None,
**kwargs,
):
# Omit tokens covered by past_key_values
if past_key_values is not None:
if isinstance(past_key_values, MoECache): # 🔍 for MoECache only
cache_length = past_key_values.get_seq_length()
past_length = past_key_values._seen_tokens_total # 🔍
max_cache_length = past_key_values.get_max_length()
elif isinstance(past_key_values, Cache):
cache_length = past_key_values.get_seq_length()
past_length = past_key_values.seen_tokens
max_cache_length = past_key_values.get_max_length()
else:
cache_length = past_length = past_key_values[0][0].shape[2]
max_cache_length = None
# Keep only the unprocessed tokens:
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
# some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
# input)
if (
attention_mask is not None
and attention_mask.shape[1] > input_ids.shape[1]
):
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
# input_ids based on the past_length.
elif past_length < input_ids.shape[1]:
input_ids = input_ids[:, past_length:]
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
# If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
if (
max_cache_length is not None
and attention_mask is not None
and cache_length + input_ids.shape[1] > max_cache_length
):
attention_mask = attention_mask[:, -max_cache_length:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
# TODO: support for beam search
print("MixtralForCausalLM, _reorder_cache", beam_idx)
raise NotImplementedError
# reordered_past = ()
# for layer_past in past_key_values:
# reordered_past += (
# tuple(
# past_state.index_select(0, beam_idx.to(past_state.device))
# for past_state in layer_past
# ),
# )
# return reordered_past
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
prefix_allowed_tokens_fn: Optional[
Callable[[int, torch.Tensor], List[int]]
] = None,
synced_gpus: Optional[bool] = None,
assistant_model: Optional["PreTrainedModel"] = None,
streamer: Optional["BaseStreamer"] = None,
negative_prompt_ids: Optional[torch.Tensor] = None,
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
**kwargs,
) -> Union[GenerateOutput, torch.LongTensor]:
r"""
Generates sequences of token ids for models with a language modeling head.
<Tip warning={true}>
Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
model's default generation configuration. You can override any `generation_config` by passing the corresponding
parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
For an overview of generation strategies and code examples, check out the [following
guide](../generation_strategies).
</Tip>
Parameters:
inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
`input_ids`, `input_values`, `input_features`, or `pixel_values`.
generation_config ([`~generation.GenerationConfig`], *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
`generation_config` is not provided, the default will be used, which has the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
logits_processor (`LogitsProcessorList`, *optional*):
Custom logits processors that complement the default logits processors built from arguments and
generation config. If a logit processor is passed that is already created with the arguments or a
generation config an error is thrown. This feature is intended for advanced users.
stopping_criteria (`StoppingCriteriaList`, *optional*):
Custom stopping criteria that complements the default stopping criteria built from arguments and a
generation config. If a stopping criteria is passed that is already created with the arguments or a
generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
If provided, this function constrains the beam search to allowed tokens only at each step. If not
provided, no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
`input_ids`. It has to return a list with the allowed tokens for the next generation step, conditioned
on the batch ID `batch_id` and the previously generated tokens `input_ids`. This argument is useful
for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
Retrieval](https://arxiv.org/abs/2010.00904).
synced_gpus (`bool`, *optional*):
Whether to continue running the generation loop until `max_length` is reached. Unless overridden, this
flag will be set to `True` in a DeepSpeed ZeRO Stage 3 multi-GPU environment to avoid hanging if one GPU
finishes generating before the others. Otherwise it will be set to `False`.
assistant_model (`PreTrainedModel`, *optional*):
An assistant model that can be used to accelerate generation. The assistant model must have the exact
same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
is much faster than running generation with the model you're calling `generate` from. As such, the
assistant model should be much smaller.
streamer (`BaseStreamer`, *optional*):
Streamer object that will be used to stream the generated sequences. Generated tokens are passed
through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
The negative prompt needed for some processors such as CFG. The batch size must match the input batch
size. This is an experimental feature, subject to breaking API changes in future versions.
negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Attention mask for `negative_prompt_ids`.
kwargs (`Dict[str, Any]`, *optional*):
Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
Return:
[`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.
If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
[`~utils.ModelOutput`] types are:
- [`~generation.GenerateDecoderOnlyOutput`],
- [`~generation.GenerateBeamDecoderOnlyOutput`]
If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
[`~utils.ModelOutput`] types are:
- [`~generation.GenerateEncoderDecoderOutput`],
- [`~generation.GenerateBeamEncoderDecoderOutput`]
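Example:

This is an illustrative sketch only: the checkpoint path below is a placeholder (substitute the actual
LLaMA-MoE-v2 checkpoint directory or Hub id you are using), and `trust_remote_code=True` is assumed so
that this custom modeling file is loaded. Beam search may not be fully supported by this model (see
`_reorder_cache`), so the example keeps the default `num_beams=1` and samples instead.

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer

>>> # Placeholder checkpoint path; replace with the real LLaMA-MoE-v2-3_8B-2_8-sft location.
>>> checkpoint = "path/to/LLaMA-MoE-v2-3_8B-2_8-sft"
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)

>>> inputs = tokenizer("The key advantage of mixture-of-experts models is", return_tensors="pt")
>>> outputs = model.generate(**inputs, max_new_tokens=32, do_sample=True, top_p=0.9)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```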
"""
# 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
self._validate_model_class()
tokenizer = kwargs.pop(
"tokenizer", None
) # Pull this out first, we only use it for stopping criteria
generation_config, model_kwargs = self._prepare_generation_config(
generation_config, **kwargs
)
self._validate_model_kwargs(model_kwargs.copy())
self._validate_assistant(assistant_model)
# 2. Set generation parameters if not already defined
if synced_gpus is None:
if is_deepspeed_zero3_enabled() and dist.get_world_size() > 1:
synced_gpus = True
else:
synced_gpus = False
logits_processor = (
logits_processor if logits_processor is not None else LogitsProcessorList()
)
stopping_criteria = (
stopping_criteria
if stopping_criteria is not None
else StoppingCriteriaList()
)
accepts_attention_mask = "attention_mask" in set(
inspect.signature(self.forward).parameters.keys()
)
requires_attention_mask = "encoder_outputs" not in model_kwargs
kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
# 3. Define model inputs
inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
inputs, generation_config.bos_token_id, model_kwargs
)
batch_size = inputs_tensor.shape[0]
device = inputs_tensor.device
self._prepare_special_tokens(
generation_config, kwargs_has_attention_mask, device=device
)
# decoder-only models must use left-padding for batched generation.
if not self.config.is_encoder_decoder and not is_torchdynamo_compiling():
# If `input_ids` was given, check if the last id in any sequence is `pad_token_id`
# Note: if `inputs_embeds` is used, this check does not work, because we want to be more hands-off.
if (
generation_config.pad_token_id is not None
and batch_size > 1
and len(inputs_tensor.shape) == 2
and torch.sum(inputs_tensor[:, -1] == generation_config.pad_token_id)
> 0
):
logger.warning(
"A decoder-only architecture is being used, but right-padding was detected! For correct "
"generation results, please set `padding_side='left'` when initializing the tokenizer."
)
# 4. Define other model kwargs
# decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are
# generating the first new token or not, and we only want to use the embeddings for the first new token)
if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds":
model_kwargs["use_cache"] = True
else:
model_kwargs["use_cache"] = generation_config.use_cache
if (
not kwargs_has_attention_mask
and requires_attention_mask
and accepts_attention_mask
):
model_kwargs[
"attention_mask"
] = self._prepare_attention_mask_for_generation(
inputs_tensor,
generation_config.pad_token_id,
generation_config.eos_token_id,
)
if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
# if model is encoder decoder encoder_outputs are created and added to `model_kwargs`
model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
inputs_tensor, model_kwargs, model_input_name, generation_config
)
# 5. Prepare `input_ids` which will be used for auto-regressive generation
if self.config.is_encoder_decoder:
input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
batch_size=batch_size,
model_input_name=model_input_name,
model_kwargs=model_kwargs,
decoder_start_token_id=generation_config.decoder_start_token_id,
device=inputs_tensor.device,
)
else:
input_ids = (
inputs_tensor
if model_input_name == "input_ids"
else model_kwargs.pop("input_ids")
)
if generation_config.token_healing:
input_ids = self.heal_tokens(input_ids, tokenizer)
if streamer is not None:
streamer.put(input_ids.cpu())
# 6. Prepare `max_length` depending on other stopping criteria.
input_ids_length = input_ids.shape[-1]
has_default_max_length = (
kwargs.get("max_length") is None
and generation_config.max_length is not None
)
has_default_min_length = (
kwargs.get("min_length") is None
and generation_config.min_length is not None
)
generation_config = self._prepare_generated_length(
generation_config=generation_config,
has_default_max_length=has_default_max_length,
has_default_min_length=has_default_min_length,
model_input_name=model_input_name,
inputs_tensor=inputs_tensor,
input_ids_length=input_ids_length,
)
use_dynamic_cache_by_default = False
if (
generation_config.cache_implementation is not None
and model_kwargs.get("past_key_values") is not None
):
raise ValueError(
"Passing both `cache_implementation` (used to initialize certain caches) and `past_key_values` (a "
"Cache object) is unsupported. Please use only one of the two."
)
elif generation_config.cache_implementation is not None:
if self.config.use_attn_moe: # 🔍
raise ValueError(
"Attention MoE doesn't support specifying the cache type! You can only use `MoECache`"
)
if (
generation_config.cache_implementation
in NEED_SETUP_CACHE_CLASSES_MAPPING
):
if (
generation_config.cache_implementation == "static"
and not self._supports_static_cache
):
raise ValueError(
"This model does not support `cache_implementation='static'`. Please check the following "
"issue: https://github.com/huggingface/transformers/issues/28981"
)
model_kwargs["past_key_values"] = self._get_cache(
generation_config.cache_implementation,
getattr(generation_config, "num_beams", 1) * batch_size,
generation_config.max_length,
)
elif generation_config.cache_implementation == "quantized":
if not self._supports_quantized_cache:
raise ValueError(
"This model does not support the quantized cache. If you want your model to support quantized "
"cache, please open an issue."
)
cache_config = (
generation_config.cache_config
if generation_config.cache_config is not None
else QuantizedCacheConfig()
)
cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend]
if cache_config.backend == "quanto" and not is_quanto_available():
raise ImportError(
"You need to install `quanto` in order to use KV cache quantization with quanto backend. "
"Please install it via with `pip install quanto`"
)
elif cache_config.backend == "HQQ" and not is_hqq_available():
raise ImportError(
"You need to install `HQQ` in order to use KV cache quantization with HQQ backend. "
"Please install it via with `pip install hqq`"
)
model_kwargs["past_key_values"] = cache_class(cache_config)
# Use a DynamicCache() instance by default. This avoids back-and-forth conversion to the legacy format,
# which keeps copying the cache and thus uses much more memory.
elif (
generation_config.cache_implementation is None
and self._supports_default_dynamic_cache()
):
past = model_kwargs.get("past_key_values", None)
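# 🔍 When attention MoE is enabled, each attention expert keeps its own key/value states, so the
# expert-aware `MoECache` (constructed with `config.attn_experts`) is used instead of the standard
# `DynamicCache`.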
if past is None:
if self.config.use_attn_moe: # 🔍
model_kwargs["past_key_values"] = MoECache(
# self.config.num_key_value_heads
self.config.attn_experts
)
else: # 🔍
model_kwargs["past_key_values"] = DynamicCache()
use_dynamic_cache_by_default = True
elif isinstance(past, tuple):
if self.config.use_attn_moe: # 🔍
model_kwargs["past_key_values"] = MoECache.from_legacy_cache(past)
else: # 🔍
model_kwargs["past_key_values"] = DynamicCache.from_legacy_cache(
past
)
use_dynamic_cache_by_default = True
self._validate_generated_length(
generation_config, input_ids_length, has_default_max_length
)
# 7. determine generation mode
generation_mode = generation_config.get_generation_mode(assistant_model)
if streamer is not None and (generation_config.num_beams > 1):
raise ValueError(
"`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1."
)
if self.device.type != input_ids.device.type:
warnings.warn(
"You are calling .generate() with the `input_ids` being on a device type different"
f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
" Please make sure that you have put `input_ids` to the"
f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
" running `.generate()`.",
UserWarning,
)
# 8. prepare distribution pre_processing samplers
prepared_logits_processor = self._get_logits_processor(
generation_config=generation_config,
input_ids_seq_length=input_ids_length,
encoder_input_ids=inputs_tensor,
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
logits_processor=logits_processor,
device=inputs_tensor.device,
model_kwargs=model_kwargs,
negative_prompt_ids=negative_prompt_ids,
negative_prompt_attention_mask=negative_prompt_attention_mask,
)
# 9. prepare stopping criteria
prepared_stopping_criteria = self._get_stopping_criteria(
generation_config=generation_config,
stopping_criteria=stopping_criteria,
tokenizer=tokenizer,
**kwargs,
)
# 10. go into different generation modes
if generation_mode == GenerationMode.ASSISTED_GENERATION:
if generation_config.num_return_sequences > 1:
raise ValueError(
"num_return_sequences has to be 1 when doing assisted generate, "
f"but is {generation_config.num_return_sequences}."
)
if batch_size > 1:
raise ValueError(
"assisted generate is only supported for batch_size = 1"
)
if not model_kwargs["use_cache"]:
raise ValueError("assisted generate requires `use_cache=True`")
if generation_config.cache_implementation == "static":
raise ValueError(
"assisted generate is not supported with `static_cache`"
)
if self._is_stateful:
# In assisted generation we need the ability to confirm whether the model would pick certain tokens,
# which is not possible with stateful models (they can't reset to a previous subset of generated text)
raise ValueError(
f"assisted generation is not supported with stateful models, such as {self.__class__.__name__}"
)
# 11. Get the candidate generator, given the parameterization
candidate_generator = self._get_candidate_generator(
generation_config=generation_config,
input_ids=input_ids,
inputs_tensor=inputs_tensor,
assistant_model=assistant_model,
logits_processor=logits_processor,
model_kwargs=model_kwargs,
)
# 12. prepare logits warper (if `do_sample` is `True`)
prepared_logits_warper = (
self._get_logits_warper(
generation_config,
device=input_ids.device,
)
if generation_config.do_sample
else None
)
# 13. run assisted generate
result = self._assisted_decoding(
input_ids,
candidate_generator=candidate_generator,
logits_processor=prepared_logits_processor,
logits_warper=prepared_logits_warper,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=synced_gpus,
streamer=streamer,
**model_kwargs,
)
elif generation_mode == GenerationMode.CONTRASTIVE_SEARCH:
if not model_kwargs["use_cache"]:
raise ValueError("Contrastive search requires `use_cache=True`")
if self._is_stateful:
# Just like assisted generation, we need to be able to rollback to a previous state (see comment above)
raise ValueError(
f"contrastive search is not supported with stateful models, such as {self.__class__.__name__}"
)
result = self._contrastive_search(
input_ids,
logits_processor=prepared_logits_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=synced_gpus,
streamer=streamer,
**model_kwargs,
)
elif generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
# 11. prepare logits warper
prepared_logits_warper = (
self._get_logits_warper(generation_config, device=input_ids.device)
if generation_config.do_sample
else None
)
# 12. expand input_ids with `num_return_sequences` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_return_sequences,
is_encoder_decoder=self.config.is_encoder_decoder,
**model_kwargs,
)
# 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
result = self._sample(
input_ids,
logits_processor=prepared_logits_processor,
logits_warper=prepared_logits_warper,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=synced_gpus,
streamer=streamer,
**model_kwargs,
)
elif generation_mode in (
GenerationMode.BEAM_SAMPLE,
GenerationMode.BEAM_SEARCH,
):
# 11. prepare logits warper
prepared_logits_warper = (
self._get_logits_warper(generation_config, device=input_ids.device)
if generation_config.do_sample
else None
)
# 12. prepare beam search scorer
beam_scorer = BeamSearchScorer(
batch_size=batch_size,
num_beams=generation_config.num_beams,
device=inputs_tensor.device,
length_penalty=generation_config.length_penalty,
do_early_stopping=generation_config.early_stopping,
num_beam_hyps_to_keep=generation_config.num_return_sequences,
max_length=generation_config.max_length,
)
# 13. interleave input_ids with `num_beams` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_beams,
is_encoder_decoder=self.config.is_encoder_decoder,
**model_kwargs,
)
# 14. run beam search (beam sample when `generation_config.do_sample=True`)
result = self._beam_search(
input_ids,
beam_scorer,
logits_processor=prepared_logits_processor,
logits_warper=prepared_logits_warper,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=synced_gpus,
**model_kwargs,
)
elif generation_mode == GenerationMode.GROUP_BEAM_SEARCH:
# 11. prepare beam search scorer
beam_scorer = BeamSearchScorer(
batch_size=batch_size,
num_beams=generation_config.num_beams,
device=inputs_tensor.device,
length_penalty=generation_config.length_penalty,
do_early_stopping=generation_config.early_stopping,
num_beam_hyps_to_keep=generation_config.num_return_sequences,
num_beam_groups=generation_config.num_beam_groups,
max_length=generation_config.max_length,
)
# 12. interleave input_ids with `num_beams` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_beams,
is_encoder_decoder=self.config.is_encoder_decoder,
**model_kwargs,
)
# 13. run beam search
result = self._group_beam_search(
input_ids,
beam_scorer,
logits_processor=prepared_logits_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=synced_gpus,
**model_kwargs,
)
elif generation_mode == GenerationMode.CONSTRAINED_BEAM_SEARCH:
final_constraints = []
if generation_config.constraints is not None:
final_constraints = generation_config.constraints
if generation_config.force_words_ids is not None:
def typeerror():
raise ValueError(
"`force_words_ids` has to either be a `List[List[List[int]]]` or `List[List[int]]` "
f"of positive integers, but is {generation_config.force_words_ids}."
)
if (
not isinstance(generation_config.force_words_ids, list)
or len(generation_config.force_words_ids) == 0
):
typeerror()
for word_ids in generation_config.force_words_ids:
if isinstance(word_ids[0], list):
if not isinstance(word_ids, list) or len(word_ids) == 0:
typeerror()
if any(
not isinstance(token_ids, list) for token_ids in word_ids
):
typeerror()
if any(
any(
(not isinstance(token_id, int) or token_id < 0)
for token_id in token_ids
)
for token_ids in word_ids
):
typeerror()
constraint = DisjunctiveConstraint(word_ids)
else:
if not isinstance(word_ids, list) or len(word_ids) == 0:
typeerror()
if any(
(not isinstance(token_id, int) or token_id < 0)
for token_id in word_ids
):
typeerror()
constraint = PhrasalConstraint(word_ids)
final_constraints.append(constraint)
# 11. prepare beam search scorer
constrained_beam_scorer = ConstrainedBeamSearchScorer(
constraints=final_constraints,
batch_size=batch_size,
num_beams=generation_config.num_beams,
device=inputs_tensor.device,
length_penalty=generation_config.length_penalty,
do_early_stopping=generation_config.early_stopping,
num_beam_hyps_to_keep=generation_config.num_return_sequences,
max_length=generation_config.max_length,
)
# 12. interleave input_ids with `num_beams` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids=input_ids,
expand_size=generation_config.num_beams,
is_encoder_decoder=self.config.is_encoder_decoder,
**model_kwargs,
)
# 13. run beam search
result = self._constrained_beam_search(
input_ids,
constrained_beam_scorer=constrained_beam_scorer,
logits_processor=prepared_logits_processor,
stopping_criteria=prepared_stopping_criteria,
generation_config=generation_config,
synced_gpus=synced_gpus,
**model_kwargs,
)
# Convert to legacy cache if needed
if use_dynamic_cache_by_default and generation_config.return_legacy_cache:
if isinstance(result, ModelOutput) and hasattr(result, "past_key_values"):
if isinstance(result.past_key_values, DynamicCache):
result.past_key_values = result.past_key_values.to_legacy_cache()
return result
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mixtral, LLAMA->MIXTRAL
class MixtralForSequenceClassification(MixtralPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.model = MixtralModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1`, a regression loss is computed (Mean-Square loss); if
`config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
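Example (illustrative sketch only; the checkpoint path is a placeholder and `num_labels=2` is an assumed
setting, not something this file prescribes):

```python
>>> import torch
>>> from transformers import AutoTokenizer

>>> # Placeholder checkpoint path; substitute a classification checkpoint based on this model.
>>> checkpoint = "path/to/llama-moe-v2-classifier"
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> model = MixtralForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

>>> inputs = tokenizer("This movie was great!", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_class_id = logits.argmax(dim=-1).item()
```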
"""
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
transformer_outputs = self.model(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
logits = self.score(hidden_states)
if input_ids is not None:
batch_size = input_ids.shape[0]
else:
batch_size = inputs_embeds.shape[0]
if self.config.pad_token_id is None and batch_size != 1:
raise ValueError(
"Cannot handle batch sizes > 1 if no padding token is defined."
)
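# Pool the logits at the last non-padding token of each sequence; fall back to the final position
# when no pad token is defined or only `inputs_embeds` is provided.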
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
if input_ids is not None:
sequence_lengths = (
torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
).to(logits.device)
else:
sequence_lengths = -1
pooled_logits = logits[
torch.arange(batch_size, device=logits.device), sequence_lengths
]
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (
labels.dtype == torch.long or labels.dtype == torch.int
):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(pooled_logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(
pooled_logits.view(-1, self.num_labels), labels.view(-1)
)
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(pooled_logits, labels)
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)