tomeras1 committed on May 5, 2024

Commit

886484a

verified ·

1 Parent(s): 8ee14c3

Move to in-library checkpoint

Browse files

Files changed (27) hide show

config.json +3 -4
configuration_jamba.py +27 -17
generation_config.json +1 -1
model-00001-of-00021.safetensors +2 -2
model-00002-of-00021.safetensors +2 -2
model-00003-of-00021.safetensors +2 -2
model-00004-of-00021.safetensors +2 -2
model-00005-of-00021.safetensors +2 -2
model-00006-of-00021.safetensors +2 -2
model-00007-of-00021.safetensors +2 -2
model-00008-of-00021.safetensors +2 -2
model-00009-of-00021.safetensors +2 -2
model-00010-of-00021.safetensors +2 -2
model-00011-of-00021.safetensors +2 -2
model-00012-of-00021.safetensors +2 -2
model-00013-of-00021.safetensors +2 -2
model-00014-of-00021.safetensors +2 -2
model-00015-of-00021.safetensors +2 -2
model-00016-of-00021.safetensors +2 -2
model-00017-of-00021.safetensors +2 -2
model-00018-of-00021.safetensors +2 -2
model-00019-of-00021.safetensors +2 -2
model-00020-of-00021.safetensors +2 -2
model-00021-of-00021.safetensors +2 -2
model.safetensors.index.json +0 -0
modeling_jamba.py +397 -632
special_tokens_map.json +28 -4

config.json CHANGED Viewed

@@ -12,7 +12,6 @@
     "AutoModelForSequenceClassification": "model.JambaForSequenceClassification"
   },
   "bos_token_id": 1,
-  "calc_logits_for_entire_prompt": false,
   "eos_token_id": 2,
   "expert_layer_offset": 1,
   "expert_layer_period": 2,
@@ -25,15 +24,15 @@
   "mamba_d_state": 16,
   "mamba_dt_rank": 256,
   "mamba_expand": 2,
-  "mamba_inner_layernorms": true,
   "mamba_proj_bias": false,
   "model_type": "jamba",
-  "n_ctx": 262144,
   "num_attention_heads": 32,
   "num_experts": 16,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "output_router_logits": false,
   "pad_token_id": 0,
   "rms_norm_eps": 1e-06,
@@ -41,7 +40,7 @@
   "sliding_window": null,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.40.0.dev0",
   "use_cache": true,
   "use_mamba_kernels": true,
   "vocab_size": 65536

     "AutoModelForSequenceClassification": "model.JambaForSequenceClassification"
   },
   "bos_token_id": 1,
   "eos_token_id": 2,
   "expert_layer_offset": 1,
   "expert_layer_period": 2,
   "mamba_d_state": 16,
   "mamba_dt_rank": 256,
   "mamba_expand": 2,
   "mamba_proj_bias": false,
+  "max_position_embeddings": 262144,
   "model_type": "jamba",
   "num_attention_heads": 32,
   "num_experts": 16,
   "num_experts_per_tok": 2,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
+  "num_logits_to_keep": 1,
   "output_router_logits": false,
   "pad_token_id": 0,
   "rms_norm_eps": 1e-06,
   "sliding_window": null,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.1",
   "use_cache": true,
   "use_mamba_kernels": true,
   "vocab_size": 65536

configuration_jamba.py CHANGED Viewed

@@ -26,9 +26,9 @@ class JambaConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`JambaModel`]. It is used to instantiate a
     Jamba model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of the jamba-small architecture.
-    [ai21labs/jamba-small](https://huggingface.co/ai21labs/Jamba-v0.1)
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -65,12 +65,12 @@ class JambaConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
-        calc_logits_for_entire_prompt (`bool`, *optional*, defaults to `False`):
-            Whether or not to calculate logits for entire prompt during generation. If `False`, only the logits of the
-            last prompt token will be calculated, which are the only logits needed for generation. For long sequences,
-            the logits for the entire sequence may use a lot of memory so setting `calc_logits_for_entire_prompt=False`
-            will reduce memory footprint significantly.
-            Note: some generation features may not be available if this is set to `False`.
         output_router_logits (`bool`, *optional*, defaults to `False`):
             Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss. See [here]() for more details
@@ -84,7 +84,7 @@ class JambaConfig(PretrainedConfig):
             The id of the "end-of-sequence" token.
         sliding_window (`int`, *optional*):
             Sliding window attention window size. If not specified, will default to `None`.
-        n_ctx (`int`, *optional*, defaults to 262144):
             This value doesn't have any real effect. The maximum sequence length that this model is intended to be
             used with. It can be used with longer sequences, but performance may degrade.
         attention_dropout (`float`, *optional*, defaults to 0.0):
@@ -118,8 +118,6 @@ class JambaConfig(PretrainedConfig):
             Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
         mamba_proj_bias (`bool`, *optional*, defaults to `False`):
             Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
-        mamba_inner_layernorms (`bool`, *optional*, defaults to `True`):
-            Flag indicating whether or not to apply layernorms to internal mamba activations
     """
@@ -139,14 +137,14 @@ class JambaConfig(PretrainedConfig):
             initializer_range=0.02,
             rms_norm_eps=1e-6,
             use_cache=True,
-            calc_logits_for_entire_prompt=False,
             output_router_logits=False,
             router_aux_loss_coef=0.001,
             pad_token_id=0,
             bos_token_id=1,
             eos_token_id=2,
             sliding_window=None,
-            n_ctx=262144,
             attention_dropout=0.0,
             num_experts_per_tok=2,
             num_experts=16,
@@ -161,7 +159,6 @@ class JambaConfig(PretrainedConfig):
             mamba_dt_rank="auto",
             mamba_conv_bias=True,
             mamba_proj_bias=False,
-            mamba_inner_layernorms=True,
             **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -171,7 +168,7 @@ class JambaConfig(PretrainedConfig):
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.sliding_window = sliding_window
-        self.n_ctx = n_ctx
         self.attention_dropout = attention_dropout
         # for backward compatibility
@@ -184,7 +181,7 @@ class JambaConfig(PretrainedConfig):
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
-        self.calc_logits_for_entire_prompt = calc_logits_for_entire_prompt
         self.output_router_logits = output_router_logits
         self.router_aux_loss_coef = router_aux_loss_coef
@@ -202,7 +199,6 @@ class JambaConfig(PretrainedConfig):
         self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
         self.mamba_conv_bias = mamba_conv_bias
         self.mamba_proj_bias = mamba_proj_bias
-        self.mamba_inner_layernorms = mamba_inner_layernorms
         super().__init__(
             pad_token_id=pad_token_id,
@@ -211,3 +207,17 @@ class JambaConfig(PretrainedConfig):
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )

     r"""
     This is the configuration class to store the configuration of a [`JambaModel`]. It is used to instantiate a
     Jamba model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Jamba-v0.1 model.
+    [ai21labs/Jamba-v0.1](https://huggingface.co/ai21labs/Jamba-v0.1)
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
+            logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
+            sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
+            significantly.
         output_router_logits (`bool`, *optional*, defaults to `False`):
             Whether or not the router logits should be returned by the model. Enabling this will also
             allow the model to output the auxiliary loss. See [here]() for more details
             The id of the "end-of-sequence" token.
         sliding_window (`int`, *optional*):
             Sliding window attention window size. If not specified, will default to `None`.
+        max_position_embeddings (`int`, *optional*, defaults to 262144):
             This value doesn't have any real effect. The maximum sequence length that this model is intended to be
             used with. It can be used with longer sequences, but performance may degrade.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
         mamba_proj_bias (`bool`, *optional*, defaults to `False`):
             Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
     """
             initializer_range=0.02,
             rms_norm_eps=1e-6,
             use_cache=True,
+            num_logits_to_keep=1,
             output_router_logits=False,
             router_aux_loss_coef=0.001,
             pad_token_id=0,
             bos_token_id=1,
             eos_token_id=2,
             sliding_window=None,
+            max_position_embeddings=262144,
             attention_dropout=0.0,
             num_experts_per_tok=2,
             num_experts=16,
             mamba_dt_rank="auto",
             mamba_conv_bias=True,
             mamba_proj_bias=False,
             **kwargs,
     ):
         self.vocab_size = vocab_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.sliding_window = sliding_window
+        self.max_position_embeddings = max_position_embeddings
         self.attention_dropout = attention_dropout
         # for backward compatibility
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
         self.output_router_logits = output_router_logits
         self.router_aux_loss_coef = router_aux_loss_coef
         self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
         self.mamba_conv_bias = mamba_conv_bias
         self.mamba_proj_bias = mamba_proj_bias
         super().__init__(
             pad_token_id=pad_token_id,
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
+    @property
+    def layers_block_type(self):
+        return [
+            "attention" if i % self.attn_layer_period == self.attn_layer_offset else "mamba"
+            for i in range(self.num_hidden_layers)
+        ]
+    @property
+    def layers_num_experts(self):
+        return [
+            self.num_experts if i % self.expert_layer_period == self.expert_layer_offset else 1
+            for i in range(self.num_hidden_layers)
+        ]

generation_config.json CHANGED Viewed

@@ -3,5 +3,5 @@
   "bos_token_id": 1,
   "eos_token_id": 2,
   "pad_token_id": 0,
-  "transformers_version": "4.40.0.dev0"
 }

   "bos_token_id": 1,
   "eos_token_id": 2,
   "pad_token_id": 0,
+  "transformers_version": "4.40.1"
 }

model-00001-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce46bbcbda10cfac6b5855da022777a0387e2b729cbdd219081fa3f69cb214a2
-size 4951236864

 version https://git-lfs.github.com/spec/v1
+oid sha256:1aace34ee0da3bf95605bd150fff6d3e78110be4048a3c389b0a740354b2ccb7
+size 4951761424

model-00002-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7ae0b82247f3164270f151fa12ea1ceb63992e8827c739319fe20342eadafa8a
-size 4884145024

 version https://git-lfs.github.com/spec/v1
+oid sha256:0ba1de67a86329431f14f7ffa165d84055d32ce57a6d2314e3b2464eac3732dc
+size 4884669624

model-00003-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d2d44419116a65b3617fa35d20b69e2060449b53c0ac36192a3ec4b0a60b0a8d
-size 4992294632

 version https://git-lfs.github.com/spec/v1
+oid sha256:1abc4f16865fb78241c9453292ee3b2ca2c1e2d54ee945631da625834b95c9b2
+size 4992557120

model-00004-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:70fe04d7dc1124871ca1f6071504ba019174db27cd57c625938e6383ebee5fee
-size 4958591040

 version https://git-lfs.github.com/spec/v1
+oid sha256:45fab97739a58e924791572ea3d06f9c90b9ff2a299460aaa4bd87c6e9d424f3
+size 4958853560

model-00005-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:402079425e45a01c256a080cae3ab39be3f3cfae56dba7c815a44f0c58b3a442
-size 4975501296

 version https://git-lfs.github.com/spec/v1
+oid sha256:c4b0ec6e8f33e6d7b1f837cd4c25818487dcc7e478734606da28110507e51c97
+size 4975763832

model-00006-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2cc9971c058d95a8f13966a3aa82294564381937902634c0c064be68104821ae
-size 4884145016

 version https://git-lfs.github.com/spec/v1
+oid sha256:ed98d5c3c8d7ab7352944bea09b0d54d98066cf567ba3d069da12c05575d56ed
+size 4884669616

model-00007-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d9ba83c87790cdb6fb9f7861a712f315469edbf065ab64bdaa35cc99b4ec8746
-size 4884144968

 version https://git-lfs.github.com/spec/v1
+oid sha256:735be2bc568711bf42a4caebcda8288dd300b31b48fa098b00df3cf1a98e10e2
+size 4884669640

model-00008-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b45331970c155ca74f509576cb050d006997bef08a99189cf047aa1a3a4b254e
-size 4992294696

 version https://git-lfs.github.com/spec/v1
+oid sha256:d0c8d817b2b47661d361e8b520128b3194185f756cc2204a95d642e24895ee51
+size 4992557176

model-00009-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe5a1e58d58598a64a59a3ca87c170a171a7bba2102138c71047d5b5458cdebf
-size 4932506800

 version https://git-lfs.github.com/spec/v1
+oid sha256:e50222cf865ca5678d22574b131294303c46b249478cf70113c701f70331e999
+size 4932507176

model-00010-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bd37916e35b2b3b98e7a9bb790a779ac51ad0bbcff92428c0ed11c8839379205
-size 4884145056

 version https://git-lfs.github.com/spec/v1
+oid sha256:b1b4b69b24ae55827b6c8b1e4a10807aa3525bc85f4d34dc002ac7440757fbf4
+size 4884669672

model-00011-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:392435bc85f4c90bf129c30260da8c820f35bca91610aa0e682cb915f1d855c6
-size 4884145088

 version https://git-lfs.github.com/spec/v1
+oid sha256:60213cac13b92ed34b93ce48e670434f22e3bf8b2b8df20c60b7bf8a9515c35c
+size 4884669696

model-00012-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5141158f7a755a7e0f60c73f4c25ba02c2bfdab548944f8d4146f41391c621a
-size 4884145088

 version https://git-lfs.github.com/spec/v1
+oid sha256:05805eacd3bb40cc9da802350409f1cb078e8b276da7e06c7a8a5ca5b26cc887
+size 4884669688

model-00013-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:14ce5aabc4a17e54e40b30fba322104dd19bad512bab6e554fa56bafe4433da7
-size 4932506800

 version https://git-lfs.github.com/spec/v1
+oid sha256:201df979a1b34ced6cdbb7a790163412636779f1119e3845a704c489181d03d2
+size 4932507176

model-00014-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd79a548e39ee02f6a9b553f93f6652783c9dbc895ab685848d9e1655903965f
-size 4992294648

 version https://git-lfs.github.com/spec/v1
+oid sha256:d0a7eb42a9ea3a385442c2e758dd5efd5dc5b913f1d10bfd37792cc963a33c93
+size 4992557152

model-00015-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb5af1275e6a0c5bc2c195e1802a64cee6aa92e3a11fcff5acd8b7bbf720ef75
-size 4884145088

 version https://git-lfs.github.com/spec/v1
+oid sha256:a4b9afe4398000c28b36e3aa40c87086af673d4f8a64bfc5767941ab2008bcc9
+size 4884669688

model-00016-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e9efc22a654010417091851b00277db7116e8c532ae5410cacc13bfa49b99c06
-size 4884145088

 version https://git-lfs.github.com/spec/v1
+oid sha256:dd1ac6cc861971c43bdf0c9c6d4c9fe72d33e5227e054a621e2e68f001419763
+size 4884669688

model-00017-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ec7ab387e62b0c65a3567cc4d17d13166b577cf89ff59a8d5d7b248fdbbc68da
-size 4908260352

 version https://git-lfs.github.com/spec/v1
+oid sha256:52d9eea696dd29ef413d617bbcb62a9f159e8fe8170d36e018932cef45ee281d
+size 4908522856

model-00018-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f65a30ff1d8e1fc086460839056e7bc7a6a2ef81f0df35dc1a752bf951f92df
-size 4908391496

 version https://git-lfs.github.com/spec/v1
+oid sha256:77acada7c098e81280645ea0a9dbfa00196dca6da8946498b9907e9e376fb42d
+size 4908654000

model-00019-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7975019ffa4bb6f502e3406a53ef61ee08085330502ba32fb3e9883b7033c8c7
-size 4992294688

 version https://git-lfs.github.com/spec/v1
+oid sha256:09e10dfd6c6459cd3460b1d667639717d3657274c1694c19a6fdbac1be6a76bf
+size 4992557168

model-00020-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6363d3d6f89d09a971af839cd923a206a06e73d090ae74a605ed27e97fab93cf
-size 4884145088

 version https://git-lfs.github.com/spec/v1
+oid sha256:2bd5c27b2cca6e06f7b4497ce8c9b1522a64846817a871bad274d08507960ed0
+size 4884669696

model-00021-of-00021.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a21e65470d7dbe4ae849be427eb5366cc7cc311138cc7f943f3d71d84b7c7ffd
-size 4647318256

 version https://git-lfs.github.com/spec/v1
+oid sha256:a47ef23db8deb5364da676a40dc3dcb011fb9d9ceef13ba044c176e9a83ac1e3
+size 4647318576

model.safetensors.index.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

modeling_jamba.py CHANGED Viewed

@@ -20,8 +20,6 @@
 """ PyTorch Jamba model."""
 import inspect
 import math
-import warnings
-from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple, Union
 import torch
@@ -31,10 +29,9 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
-from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import (
-    _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import (
     MoeCausalLMOutputWithPast,
@@ -42,7 +39,6 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
-from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_13
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -50,11 +46,15 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
-from transformers.utils.import_utils import is_torch_fx_available
 from .configuration_jamba import JambaConfig
-# try except block so it'll work with trust_remote_code. Later we can have `if is_flash_attn_2_available():`
 try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
@@ -63,22 +63,15 @@ try:
 except ImportError:
     pass
-# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
-# It means that the function will not be traced through and simply appear as a node in the graph.
-if is_torch_fx_available():
-    if not is_torch_greater_or_equal_than_1_13:
-        import torch.fx
-    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
-# try except block so it'll work with trust_remote_code. Later we can have `if is_mamba_ssm_available():`
 try:
     from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
     from mamba_ssm.ops.triton.selective_state_update import selective_state_update
 except ImportError:
     selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
-# try except block so it'll work with trust_remote_code. Later we can have `if is_causal_conv1d_available():`
 try:
     from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
 except ImportError:
@@ -94,9 +87,12 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "JambaConfig"
-# Adapted from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
 def load_balancing_loss_func(
-        gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
 ) -> float:
     r"""
     Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
@@ -106,7 +102,7 @@ def load_balancing_loss_func(
     experts is too unbalanced.
     Args:
-        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
             Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
             shape [batch_size X sequence_length, num_experts].
         attention_mask (`torch.Tensor`, None):
@@ -118,16 +114,16 @@ def load_balancing_loss_func(
     Returns:
         The auxiliary loss.
     """
-    if gate_logits is None or not isinstance(gate_logits, tuple):
         return 0
-    if isinstance(gate_logits, tuple):
-        compute_device = gate_logits[0].device
-        concatenated_gate_logits = torch.cat(
-            [layer_gate.to(compute_device) for layer_gate in gate_logits if layer_gate.shape[1] > 1], dim=0
         )
-    routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
     _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
@@ -141,7 +137,7 @@ def load_balancing_loss_func(
         router_prob_per_expert = torch.mean(routing_weights, dim=0)
     else:
         batch_size, sequence_length = attention_mask.shape
-        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
         # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
         expert_attention_mask = (
@@ -217,6 +213,82 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 # Adapted from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Jamba
 class JambaAttention(nn.Module):
     """
@@ -253,23 +325,16 @@ class JambaAttention(nn.Module):
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
     def forward(
             self,
             hidden_states: torch.Tensor,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
-            past_key_value: Optional[Cache] = None,
             output_attentions: bool = False,
             use_cache: bool = False,
-            **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states)
@@ -280,16 +345,6 @@ class JambaAttention(nn.Module):
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            if self.layer_idx is None:
-                raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                    "with a layer index."
-                )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         if past_key_value is not None:
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
@@ -299,19 +354,9 @@ class JambaAttention(nn.Module):
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
-        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
-            attn_weights = attn_weights + attention_mask
         # upcast attention to fp32
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
@@ -357,37 +402,26 @@ class JambaFlashAttention2(JambaAttention):
             hidden_states: torch.Tensor,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
-            past_key_value: Optional[Cache] = None,
             output_attentions: bool = False,
             use_cache: bool = False,
             **kwargs,
     ):
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-            # overwrite attention_mask with padding_mask
-            attention_mask = kwargs.pop("padding_mask")
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            if self.layer_idx is None:
-                raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                    "with a layer index."
-                )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         use_sliding_windows = (
                 _flash_supports_window_size
@@ -403,7 +437,7 @@ class JambaFlashAttention2(JambaAttention):
         if past_key_value is not None:
             # Activate slicing cache only if the config has a value `sliding_windows` attribute
-            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
             if (
                     getattr(self.config, "sliding_window", None) is not None
                     and kv_seq_len > self.config.sliding_window
@@ -505,7 +539,7 @@ class JambaFlashAttention2(JambaAttention):
             attention_mask (`torch.Tensor`):
                 The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                 position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`int`, *optional*):
                 Attention dropout
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
@@ -580,6 +614,7 @@ class JambaFlashAttention2(JambaAttention):
         return attn_output
     def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
         batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
@@ -637,9 +672,10 @@ class JambaSdpaAttention(JambaAttention):
             hidden_states: torch.Tensor,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
-            past_key_value: Optional[Cache] = None,
             output_attentions: bool = False,
             use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
@@ -666,21 +702,15 @@ class JambaSdpaAttention(JambaAttention):
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         if past_key_value is not None:
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
         if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                )
         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
@@ -693,7 +723,7 @@ class JambaSdpaAttention(JambaAttention):
             query_states,
             key_states,
             value_states,
-            attn_mask=attention_mask,
             dropout_p=self.attention_dropout if self.training else 0.0,
             # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
             is_causal=self.is_causal and attention_mask is None and q_len > 1,
@@ -714,99 +744,6 @@ JAMBA_ATTENTION_CLASSES = {
 }
-class HybridMambaAttentionDynamicCache(DynamicCache):
-    """
-    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
-    (which has a constant shape regardless of seq_len).
-    It stores the Key and Value states as a list of tensors, one for each layer.
-    The expected shape for each tensor for attention layers is `[batch_size, num_heads, seq_len, head_dim]`.
-    For the mamba layers, the `key_cache` represents the convolution state and has a shape of `[batch_size, d_inner, 1, d_conv]`,
-    and the `value_cache` represents the ssm state and has a shape of `[batch_size, d_inner, 1, d_state]`. Mamba cache
-    shape[2] is a dummy "seqlen" dimension to match the number of attention cache dimensions. For mamba, the cache
-    doesn't grow with seqlen so this dimension is always 1.
-    """
-    def __init__(self) -> None:
-        super().__init__()
-        self.attention_layer_idx = None  # used to know which layer has data on seqlen in the cache shape
-    def update(
-            self,
-            key_states: torch.Tensor,
-            value_states: torch.Tensor,
-            layer_idx: int,
-            cache_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
-        Parameters:
-            key_states (`torch.Tensor`):
-                The new key states to cache.
-            value_states (`torch.Tensor`):
-                The new value states to cache.
-            layer_idx (`int`):
-                The index of the layer to cache the states for.
-            cache_kwargs (`Dict[str, Any]`, `optional`):
-                Additional arguments for the cache subclass. No additional arguments are used in `HybridMambaAttentionDynamicCache`.
-        Return:
-            A tuple containing the updated key and value states.
-        """
-        # Update the number of seen tokens
-        if self.attention_layer_idx is None and self._is_attn_layer(key_states, value_states):
-            self.attention_layer_idx = layer_idx
-        if self.attention_layer_idx is not None and layer_idx == self.attention_layer_idx:
-            if hasattr(self, "_seen_tokens"):
-                self._seen_tokens += key_states.shape[-2]
-            else:
-                self.seen_tokens += key_states.shape[-2]
-        # Update the cache
-        if len(self.key_cache) <= layer_idx:
-            self.key_cache.append(key_states)
-            self.value_cache.append(value_states)
-        else:
-            if self._is_attn_layer(self.key_cache[layer_idx], self.value_cache[layer_idx]):
-                # attention layer - append the new states to the existing cache on the seqlen dimension
-                self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
-                self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
-            else:
-                # mamba layer - replace the cache with the new states
-                self.key_cache[layer_idx] = key_states
-                self.value_cache[layer_idx] = value_states
-        return self.key_cache[layer_idx], self.value_cache[layer_idx]
-    def get_seq_length(self, layer_idx: Optional[int] = None) -> int:
-        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
-        if layer_idx is not None:
-            if len(self.key_cache) <= layer_idx:
-                return 0
-            if self._is_attn_layer(self.key_cache[layer_idx], self.value_cache[layer_idx]):
-                return self.key_cache[layer_idx].shape[-2]
-            else:
-                warnings.warn(
-                    f"Asked to get the sequence length from cache of layer {layer_idx} which is not an attention layer. "
-                    f"Ignoring that and using an attention layer cache"
-                )
-        if self.attention_layer_idx is None or len(self.key_cache) <= self.attention_layer_idx:
-            return 0
-        return self.key_cache[self.attention_layer_idx].shape[-2]
-    @staticmethod
-    def _is_attn_layer(key_states: torch.Tensor, value_states: torch.Tensor):
-        return key_states.shape[-1] == value_states.shape[-1]
-@dataclass
-class MambaCacheParams:
-    seqlen_offset: int = 0
-    conv_states: Dict[int, torch.Tensor] = field(default_factory=dict)
-    ssm_states: Dict[int, torch.Tensor] = field(default_factory=dict)
 # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
 class JambaMambaMixer(nn.Module):
     """
@@ -838,7 +775,6 @@ class JambaMambaMixer(nn.Module):
         self.activation = config.hidden_act
         self.act = ACT2FN[config.hidden_act]
-        self.apply_inner_layernorms = config.mamba_inner_layernorms
         self.use_fast_kernels = config.use_mamba_kernels
@@ -858,14 +794,9 @@ class JambaMambaMixer(nn.Module):
         self.D = nn.Parameter(torch.ones(self.intermediate_size))
         self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)
-        if self.apply_inner_layernorms:
-            self.dt_layernorm = JambaRMSNorm(self.time_step_rank, eps=config.rms_norm_eps)
-            self.B_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)
-            self.C_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)
-        else:
-            self.dt_layernorm = None
-            self.B_layernorm = None
-            self.C_layernorm = None
         if not is_fast_path_available:
             logger.warning_once(
@@ -874,145 +805,121 @@ class JambaMambaMixer(nn.Module):
                 " https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config"
             )
-    def _apply_layernorms(self, dt, B, C):
-        if self.dt_layernorm is not None:
-            dt = self.dt_layernorm(dt)
-        if self.B_layernorm is not None:
-            B = self.B_layernorm(B)
-        if self.C_layernorm is not None:
-            C = self.C_layernorm(C)
-        return dt, B, C
-    def cuda_kernels_forward(self, hidden_states: torch.Tensor, cache_params: MambaCacheParams = None):
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states).transpose(1, 2)
-        if (
-                self.training and cache_params is None and not self.apply_inner_layernorms
-        ):  # Doesn't support outputting the states -> used for training
-            contextualized_states = mamba_inner_fn(
-                projected_states,
-                self.conv1d.weight,
-                self.conv1d.bias if self.use_conv_bias else None,
-                self.x_proj.weight,
-                self.dt_proj.weight,
-                self.out_proj.weight,
-                self.out_proj.bias.float() if self.use_bias else None,
-                -torch.exp(self.A_log.float()),
-                None,  # input-dependent B
-                None,  # input-dependent C
-                self.D.float(),
-                delta_bias=self.dt_proj.bias.float(),
-                delta_softplus=True,
-            )
         else:
-            hidden_states, gate = projected_states.chunk(2, dim=1)
-            # 2. Convolution sequence transformation
-            conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
-            if cache_params is not None and cache_params.seqlen_offset > 0:
-                hidden_states = causal_conv1d_update(
-                    hidden_states.squeeze(-1),
-                    cache_params.conv_states[self.layer_idx],
-                    conv_weights,
-                    self.conv1d.bias,
-                    self.activation,
-                )
-                hidden_states = hidden_states.unsqueeze(-1)
-            else:
-                if cache_params is not None:
-                    conv_states = nn.functional.pad(
-                        hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)
-                    )
-                    cache_params.conv_states[self.layer_idx].copy_(conv_states)
-                hidden_states = causal_conv1d_fn(
-                    hidden_states, conv_weights, self.conv1d.bias, activation=self.activation
-                )
-            # 3. State Space Model sequence transformation
-            # 3.a. input varying initialization of time_step, B and C
-            ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
-            time_step, B, C = torch.split(
-                ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
             )
-            time_step, B, C = self._apply_layernorms(time_step, B, C)
-            # Here we need to apply dt_proj without the bias, as the bias is added in the selective scan kernel.
-            # This is a hack to apply dt_proj while still using the forward pass of `torch.nn.Linear`, which is needed
-            # in order to make quantization work. Quantization code replaces `torch.nn.Linear` layers with quantized
-            # linear layers, and requires to call the forward pass directly.
-            # The original code here was: ```discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)```
-            if hasattr(self.dt_proj, "base_layer"):
-                # In case of LoRA, we need to access the base layer to get the weight
-                time_proj_bias = self.dt_proj.base_layer.bias
-                self.dt_proj.base_layer.bias = None
-            else:
-                time_proj_bias = self.dt_proj.bias
-                self.dt_proj.bias = None
-            discrete_time_step = self.dt_proj(time_step).transpose(1, 2)
-            if hasattr(self.dt_proj, "base_layer"):
-                self.dt_proj.base_layer.bias = time_proj_bias
-            else:
-                self.dt_proj.bias = time_proj_bias
-            A = -torch.exp(self.A_log.float())
-            # 3.c perform the recurrence y ← SSM(A, B, C)(x)
-            time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
-            if cache_params is not None and cache_params.seqlen_offset > 0:
-                scan_outputs = selective_state_update(
-                    cache_params.ssm_states[self.layer_idx],
-                    hidden_states[..., 0],
-                    discrete_time_step[..., 0],
-                    A,
-                    B[:, 0],
-                    C[:, 0],
-                    self.D,
-                    gate[..., 0],
-                    time_proj_bias,
-                    dt_softplus=True,
-                ).unsqueeze(-1)
-            else:
-                scan_outputs, ssm_state = selective_scan_fn(
-                    hidden_states,
-                    discrete_time_step,
-                    A,
-                    B.transpose(1, 2),
-                    C.transpose(1, 2),
-                    self.D.float(),
-                    gate,
-                    time_proj_bias,
-                    delta_softplus=True,
-                    return_last_state=True,
-                )
-                if ssm_state is not None and cache_params is not None:
-                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
-            # 4. Final linear projection
-            contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
         return contextualized_states
     # fmt: off
-    def slow_forward(self, input_states, cache_params: MambaCacheParams = None):
         batch_size, seq_len, _ = input_states.shape
         dtype = input_states.dtype
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(input_states).transpose(1, 2)                   # [batch, 2 * intermediate_size, seq_len]
         hidden_states, gate = projected_states.chunk(2, dim=1)
         # 2. Convolution sequence transformation
-        if cache_params is not None:
             if self.training:
                 # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass
                 ssm_state = cache_params.ssm_states[self.layer_idx].clone()
             else:
                 ssm_state = cache_params.ssm_states[self.layer_idx]
-            if cache_params.seqlen_offset > 0:
                 conv_state = cache_params.conv_states[self.layer_idx]                   # [batch, intermediate_size, conv_kernel_size]
                 conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
                 conv_state[:, :, -1] = hidden_states[:, :, 0]
-                cache_params.conv_states[self.layer_idx].copy_(conv_state)
                 hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
                 if self.use_conv_bias:
                     hidden_states += self.conv1d.bias
@@ -1022,7 +929,7 @@ class JambaMambaMixer(nn.Module):
                     hidden_states,
                     (self.conv_kernel_size - hidden_states.shape[-1], 0)
                 )
-                cache_params.conv_states[self.layer_idx].copy_(conv_state)
                 hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])     # [batch, intermediate_size, seq_len]
         else:
             ssm_state = torch.zeros(
@@ -1037,7 +944,11 @@ class JambaMambaMixer(nn.Module):
         time_step, B, C = torch.split(
             ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
         )
-        time_step, B, C = self._apply_layernorms(time_step, B, C)
         discrete_time_step = self.dt_proj(time_step)                                    # [batch, seq_len, intermediate_size]
         discrete_time_step = nn.functional.softplus(discrete_time_step).transpose(1, 2) # [batch, intermediate_size, seq_len]
@@ -1057,15 +968,15 @@ class JambaMambaMixer(nn.Module):
         scan_output = scan_output + (hidden_states * self.D[None, :, None])
         scan_output = (scan_output * self.act(gate))
-        if cache_params is not None:
-            cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
         # 4. Final linear projection
         contextualized_states = self.out_proj(scan_output.transpose(1, 2))             # [batch, seq_len, hidden_size]
         return contextualized_states
     # fmt: on
-    def mixer_forward(self, hidden_states, cache_params: MambaCacheParams = None):
         if self.use_fast_kernels:
             if not is_fast_path_available or "cuda" not in self.x_proj.weight.device.type:
                 raise ValueError(
@@ -1074,64 +985,17 @@ class JambaMambaMixer(nn.Module):
             return self.cuda_kernels_forward(hidden_states, cache_params)
         return self.slow_forward(hidden_states, cache_params)
-    def forward(
-            self,
-            hidden_states: torch.Tensor,
-            past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
-            **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
-        if past_key_value is not None:
-            cache_params = MambaCacheParams(
-                seqlen_offset=0 if hidden_states.shape[1] > 1 else past_key_value.seen_tokens,
-            )
-            if len(past_key_value.key_cache) > self.layer_idx:
-                # we already have cache for this layer, use it
-                # remove the dummy seqlen dim (dim=2)
-                cache_params.conv_states[self.layer_idx] = past_key_value.key_cache[self.layer_idx].squeeze(2)
-                cache_params.ssm_states[self.layer_idx] = past_key_value.value_cache[self.layer_idx].squeeze(2)
-            else:
-                # we don't have cache for this layer, initialize it with zeros
-                batch_size = hidden_states.shape[0]
-                cache_params.conv_states[self.layer_idx] = torch.zeros(
-                    batch_size,
-                    self.intermediate_size,
-                    self.conv_kernel_size,
-                    device=hidden_states.device,
-                    dtype=hidden_states.dtype,
-                )
-                cache_params.ssm_states[self.layer_idx] = torch.zeros(
-                    batch_size,
-                    self.intermediate_size,
-                    self.ssm_state_size,
-                    device=hidden_states.device,
-                    dtype=hidden_states.dtype,
-                )
-        else:
-            cache_params = None
-        res = self.mixer_forward(hidden_states, cache_params)
-        if past_key_value is not None:
-            past_key_value.update(
-                # add dummy seqlen dim (dim=2) to match the number of dimensions of the attention cache
-                cache_params.conv_states[self.layer_idx].unsqueeze(2),
-                cache_params.ssm_states[self.layer_idx].unsqueeze(2),
-                self.layer_idx,
-            )
-        return res, past_key_value
 class JambaMLP(nn.Module):
-    def __init__(self, config: JambaConfig):
         super().__init__()
-        self.ffn_dim = config.intermediate_size
-        self.hidden_dim = config.hidden_size
-        self.gate_proj = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
-        self.down_proj = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
-        self.up_proj = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
@@ -1151,39 +1015,20 @@ class JambaSparseMoeBlock(nn.Module):
     and memory on padding.
     """
-    def __init__(self, config: JambaConfig, num_experts: int, num_experts_per_tok: int):
         super().__init__()
         self.hidden_dim = config.hidden_size
         self.ffn_dim = config.intermediate_size
-        #   these values are decided on runtime depending on the layer index
-        self.num_experts = num_experts
-        self.top_k = num_experts_per_tok
-        if num_experts > 1:
-            # expert routing
-            self.router = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
-        else:
-            self.router = None
         self.experts = nn.ModuleList([JambaMLP(config) for _ in range(self.num_experts)])
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """ """
         batch_size, sequence_length, hidden_dim = hidden_states.shape
-        if self.num_experts == 1:
-            # in this case we have a single MLP block and don't need to do any routing
-            final_hidden_states = self.experts[0](hidden_states)
-            router_logits = torch.ones(
-                (batch_size * sequence_length, 1),
-                device=hidden_states.device,
-                dtype=hidden_states.dtype,
-                requires_grad=hidden_states.requires_grad,
-            )
-            return final_hidden_states, router_logits
-        # in this case we have multiple experts and need to do routing
         hidden_states = hidden_states.view(-1, hidden_dim)
         # router_logits: (batch * sequence_length, n_experts)
         router_logits = self.router(hidden_states)
@@ -1208,15 +1053,11 @@ class JambaSparseMoeBlock(nn.Module):
             if top_x.shape[0] == 0:
                 continue
-            # in torch it is faster to index using lists than torch tensors
-            top_x_list = top_x.tolist()
-            idx_list = idx.tolist()
             # Index the correct hidden states and compute the expert hidden state for
             # the current expert. We need to make sure to multiply the output hidden
             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
-            current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
-            current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None]
             # However `index_add_` only support torch tensors for indexing so we'll use
             # the `top_x` tensor here.
@@ -1226,37 +1067,33 @@ class JambaSparseMoeBlock(nn.Module):
 class JambaAttentionDecoderLayer(nn.Module):
-    def __init__(self, config: JambaConfig, num_experts: int, layer_idx: int):
         super().__init__()
         self.self_attn = JAMBA_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
-        num_experts_per_tok = config.num_experts_per_tok if num_experts > 1 else 1
-        self.moe = JambaSparseMoeBlock(config, num_experts=num_experts, num_experts_per_tok=num_experts_per_tok)
         self.input_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.pre_moe_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
             self,
             hidden_states: torch.Tensor,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
-            past_key_value: Optional[Tuple[torch.Tensor]] = None,
             output_attentions: Optional[bool] = False,
             output_router_logits: Optional[bool] = False,
             use_cache: Optional[bool] = False,
-            **kwargs,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
         """
         Args:
             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                 `(batch, sequence_length)` where padding elements are indicated by 0.
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
@@ -1266,6 +1103,8 @@ class JambaAttentionDecoderLayer(nn.Module):
             use_cache (`bool`, *optional*):
                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                 (see `past_key_values`).
         """
         residual = hidden_states
@@ -1279,15 +1118,20 @@ class JambaAttentionDecoderLayer(nn.Module):
             past_key_value=past_key_value,
             output_attentions=output_attentions,
             use_cache=use_cache,
         )
         # residual connection after attention
         hidden_states = residual + hidden_states
-        # Experts
         residual = hidden_states
-        hidden_states = self.pre_moe_layernorm(hidden_states)
-        hidden_states, router_logits = self.moe(hidden_states)
         hidden_states = residual + hidden_states
         outputs = (hidden_states,)
@@ -1305,15 +1149,15 @@ class JambaAttentionDecoderLayer(nn.Module):
 class JambaMambaDecoderLayer(nn.Module):
-    def __init__(self, config: JambaConfig, num_experts: int, layer_idx: int):
         super().__init__()
         self.mamba = JambaMambaMixer(config=config, layer_idx=layer_idx)
-        num_experts_per_tok = config.num_experts_per_tok if num_experts > 1 else 1
-        self.moe = JambaSparseMoeBlock(config, num_experts=num_experts, num_experts_per_tok=num_experts_per_tok)
         self.input_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.pre_moe_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
             self,
@@ -1324,18 +1168,14 @@ class JambaMambaDecoderLayer(nn.Module):
             output_attentions: Optional[bool] = False,
             output_router_logits: Optional[bool] = False,
             use_cache: Optional[bool] = False,
-            **kwargs,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
         """
         Args:
             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                 `(batch, sequence_length)` where padding elements are indicated by 0.
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
@@ -1345,28 +1185,31 @@ class JambaMambaDecoderLayer(nn.Module):
             use_cache (`bool`, *optional*):
                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                 (see `past_key_values`).
         """
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
-        hidden_states, present_key_value = self.mamba(
             hidden_states=hidden_states,
-            past_key_value=past_key_value,
         )
-        bs, seqlen, _ = hidden_states.shape
-        past_seqlen = self._get_past_seqlen(past_key_value, seqlen)
-        num_attention_heads = self.mamba.config.num_attention_heads
-        self_attn_weights = torch.empty(bs, num_attention_heads, seqlen, past_seqlen, device="meta")
         # residual connection after mamba
         hidden_states = residual + hidden_states
-        # Experts
         residual = hidden_states
-        hidden_states = self.pre_moe_layernorm(hidden_states)
-        hidden_states, router_logits = self.moe(hidden_states)
         hidden_states = residual + hidden_states
         outputs = (hidden_states,)
@@ -1375,25 +1218,13 @@ class JambaMambaDecoderLayer(nn.Module):
             outputs += (self_attn_weights,)
         if use_cache:
-            outputs += (present_key_value,)
         if output_router_logits:
             outputs += (router_logits,)
         return outputs
-    def _get_past_seqlen(self, past_key_value, seqlen):
-        if past_key_value is None:
-            return seqlen
-        past_seqlen = past_key_value.get_seq_length()
-        if past_seqlen == 0:
-            return seqlen
-        if past_key_value.attention_layer_idx is None:
-            return seqlen
-        if self.mamba.layer_idx < past_key_value.attention_layer_idx:
-            return past_seqlen + 1
-        return past_seqlen
 JAMBA_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
@@ -1416,7 +1247,6 @@ JAMBA_START_DOCSTRING = r"""
     "The bare Jamba Model outputting raw hidden-states without any specific head on top.",
     JAMBA_START_DOCSTRING,
 )
-# Adapted from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->Jamba
 class JambaPreTrainedModel(PreTrainedModel):
     config_class = JambaConfig
     base_model_prefix = "model"
@@ -1438,42 +1268,6 @@ class JambaPreTrainedModel(PreTrainedModel):
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
-    @staticmethod
-    def _convert_to_standard_cache(
-            past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int
-    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Standardizes the format of the cache so as to match most implementations, i.e. have the seqlen as the third dim
-        also for mamba layers
-        """
-        attn_layer_index = [k.shape == v.shape for k, v in past_key_value].index(True)
-        seqlen = past_key_value[attn_layer_index][0].shape[2]
-        standard_past_key_value = ()
-        for k, v in past_key_value:
-            if k.shape != v.shape:
-                # mamba layer
-                # expand doesn't use more memory, so it's fine to do it here
-                standard_past_key_value += ((k.expand(-1, -1, seqlen, -1), v.expand(-1, -1, seqlen, -1)),)
-            else:
-                standard_past_key_value += ((k, v),)
-        return standard_past_key_value
-    @staticmethod
-    def _convert_to_jamba_cache(
-            past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
-    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Converts the cache to the format expected by Jamba, i.e. dummy seqlen dimesion with size 1 for mamba layers
-        """
-        jamba_past_key_value = ()
-        for k, v in past_key_value:
-            if k.shape != v.shape:
-                # mamba layer
-                jamba_past_key_value += ((k[:, :, :1, :], v[:, :, :1, :]),)
-            else:
-                jamba_past_key_value += ((k, v),)
-        return jamba_past_key_value
 JAMBA_INPUTS_DOCSTRING = r"""
     Args:
@@ -1510,17 +1304,14 @@ JAMBA_INPUTS_DOCSTRING = r"""
             config.n_positions - 1]`.
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.num_hidden_layers`, with each tuple having 2 tensors
-            corresponding to the cache of the layer.
-            For attention layers, both tensors have shape of `(batch_size, num_kv_heads, sequence_length, embed_size_per_head)`
-            For mamba layers, the first tensor represents the convolution state and has shape of `(batch_size, d_inner, 1, d_conv)`,
-            and the second tensor represents the ssm state and has shape of `(batch_size, d_inner, 1, d_state)`. Mamba
-            cache shape[2] is a dummy "seqlen" dimension to match the number of attention cache dimensions. For mamba,
-            the cache doesn't grow with seqlen so this dimension is always 1.
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and convolution and
-            ssm states in the mamba blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
@@ -1543,8 +1334,14 @@ JAMBA_INPUTS_DOCSTRING = r"""
             should not be returned during inference.
         return_dict (`bool`, *optional*):
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
 @add_start_docstrings(
     "The bare Jamba Model outputting raw hidden-states without any specific head on top.",
@@ -1565,35 +1362,10 @@ class JambaModel(JambaPreTrainedModel):
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        # init each model layer, decide if it's mamba/attention and has experts or not
         decoder_layers = []
         for i in range(config.num_hidden_layers):
-            is_attn = True if (i - self.config.attn_layer_offset) % self.config.attn_layer_period == 0 else False
-            is_expert = True if (i - self.config.expert_layer_offset) % self.config.expert_layer_period == 0 else False
-            num_experts = self.config.num_experts if is_expert else 1
-            if is_attn:
-                decoder_layers.append(JambaAttentionDecoderLayer(config, num_experts=num_experts, layer_idx=i))
-            else:
-                decoder_layers.append(JambaMambaDecoderLayer(config, num_experts=num_experts, layer_idx=i))
-        if not any(isinstance(layer, JambaAttentionDecoderLayer) for layer in decoder_layers):
-            raise ValueError("At least one layer in the decoder must be an attention layer")
-        self._attn_layer_index = [isinstance(layer, JambaAttentionDecoderLayer) for layer in decoder_layers].index(
-            True
-        )
-        if not any(isinstance(layer, JambaMambaDecoderLayer) for layer in decoder_layers):
-            raise ValueError("At least one layer in the decoder must be a Mamba layer")
-        self._mamba_layer_index = [isinstance(layer, JambaMambaDecoderLayer) for layer in decoder_layers].index(True)
-        if (
-                decoder_layers[self._mamba_layer_index].mamba.ssm_state_size
-                == decoder_layers[self._mamba_layer_index].mamba.conv_kernel_size
-        ):
-            raise ValueError("Mamba state size and convolution size must be different")
         self.layers = nn.ModuleList(decoder_layers)
         self._attn_implementation = config._attn_implementation
@@ -1609,20 +1381,20 @@ class JambaModel(JambaPreTrainedModel):
     def set_input_embeddings(self, value):
         self.embed_tokens = value
-    # Ignore copy
     @add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING)
     def forward(
             self,
             input_ids: torch.LongTensor = None,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
-            past_key_values: Optional[Union[List[torch.FloatTensor], HybridMambaAttentionDynamicCache]] = None,
             inputs_embeds: Optional[torch.FloatTensor] = None,
             use_cache: Optional[bool] = None,
             output_attentions: Optional[bool] = None,
             output_hidden_states: Optional[bool] = None,
             output_router_logits: Optional[bool] = None,
             return_dict: Optional[bool] = None,
     ) -> Union[Tuple, MoeModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_router_logits = (
@@ -1635,85 +1407,37 @@ class JambaModel(JambaPreTrainedModel):
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            batch_size, seq_length = input_ids.shape
-        elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-        past_key_values_length = 0
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-        if use_cache:
-            if isinstance(past_key_values, Cache) and not isinstance(
-                    past_key_values, HybridMambaAttentionDynamicCache
-            ):
-                past_key_values = HybridMambaAttentionDynamicCache.from_legacy_cache(past_key_values.to_legacy_cache())
-            use_legacy_cache = not isinstance(past_key_values, HybridMambaAttentionDynamicCache)
-            if use_legacy_cache:
-                past_key_values = HybridMambaAttentionDynamicCache.from_legacy_cache(past_key_values)
-            past_key_values_length = past_key_values.get_usable_length(seq_length, self._attn_layer_index)
-        if position_ids is None:
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            position_ids = torch.arange(
-                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
             )
-            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-        else:
-            position_ids = position_ids.view(-1, seq_length).long()
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
-        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
-            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
-            if is_padding_right:
-                raise ValueError(
-                    "You are attempting to perform batched generation with padding_side='right'"
-                    " this may lead to unexpected behaviour for Flash Attention version of Jamba. Make sure to "
-                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
-                )
-        if self._attn_implementation == "flash_attention_2":
-            # 2d mask is passed through the layers
-            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._attn_implementation == "sdpa" and not output_attentions:
-            # output_attentions=True can not be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-            )
-        else:
-            # 4d mask is passed through the layers
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-                sliding_window=self.config.sliding_window,
             )
-        hidden_states = inputs_embeds
-        # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
         all_router_logits = () if output_router_logits else None
-        next_decoder_cache = None
         for decoder_layer in self.layers:
             if output_hidden_states:
@@ -1723,34 +1447,37 @@ class JambaModel(JambaPreTrainedModel):
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
-                    attention_mask,
                     position_ids,
                     past_key_values,
                     output_attentions,
                     output_router_logits,
                     use_cache,
                 )
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
-                    attention_mask=attention_mask,
                     position_ids=position_ids,
                     past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     output_router_logits=output_router_logits,
                     use_cache=use_cache,
                 )
             hidden_states = layer_outputs[0]
-            if use_cache:
-                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
             if output_attentions:
-                all_self_attns += (layer_outputs[1],)
             if output_router_logits:
-                all_router_logits += (layer_outputs[-1],)
         hidden_states = self.final_layernorm(hidden_states)
@@ -1758,9 +1485,10 @@ class JambaModel(JambaPreTrainedModel):
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
-        next_cache = None
-        if use_cache:
-            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
         if not return_dict:
             return tuple(
@@ -1776,6 +1504,41 @@ class JambaModel(JambaPreTrainedModel):
             router_logits=all_router_logits,
         )
 # Adapted from transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM with MIXTRAL->JAMBA, Mixtral->Jamba
 class JambaForCausalLM(JambaPreTrainedModel):
@@ -1818,7 +1581,7 @@ class JambaForCausalLM(JambaPreTrainedModel):
             input_ids: torch.LongTensor = None,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
-            past_key_values: Optional[List[torch.FloatTensor]] = None,
             inputs_embeds: Optional[torch.FloatTensor] = None,
             labels: Optional[torch.LongTensor] = None,
             use_cache: Optional[bool] = None,
@@ -1826,7 +1589,8 @@ class JambaForCausalLM(JambaPreTrainedModel):
             output_hidden_states: Optional[bool] = None,
             output_router_logits: Optional[bool] = None,
             return_dict: Optional[bool] = None,
-            calc_logits_for_entire_prompt: Optional[bool] = True,
     ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
         r"""
         Args:
@@ -1835,12 +1599,28 @@ class JambaForCausalLM(JambaPreTrainedModel):
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-            calc_logits_for_entire_prompt (`bool`, *optional*):
-                Whether or not to calculate the logits for the entire prompt, or just the last token. Only last token
-                logits are needed for generation, and calculating them only for that token can save memory,
-                which becomes pretty significant for long sequences.
         Returns:
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1864,14 +1644,15 @@ class JambaForCausalLM(JambaPreTrainedModel):
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             output_router_logits=output_router_logits,
             return_dict=return_dict,
         )
         hidden_states = outputs[0]
-        if calc_logits_for_entire_prompt:
             logits = self.lm_head(hidden_states)
         else:
-            logits = self.lm_head(hidden_states[..., -1:, :])
         logits = logits.float()
         loss = None
@@ -1921,27 +1702,15 @@ class JambaForCausalLM(JambaPreTrainedModel):
             attention_mask=None,
             inputs_embeds=None,
             output_router_logits=False,
             **kwargs,
     ):
-        # Omit tokens covered by past_key_values
-        if past_key_values is not None:
-            # the cache may be in the stardard format (e.g. in contrastive search), convert to Jamba's format if needed
-            if isinstance(past_key_values, Tuple):
-                if past_key_values[self.model._mamba_layer_index][0].shape[2] > 1:
-                    past_key_values = self._convert_to_jamba_cache(past_key_values)
-            if isinstance(past_key_values, Cache):
-                if not isinstance(past_key_values, HybridMambaAttentionDynamicCache):
-                    past_key_values = HybridMambaAttentionDynamicCache.from_legacy_cache(
-                        past_key_values.to_legacy_cache()
-                    )
-                cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
-            else:
-                cache_length = past_length = past_key_values[self.model._attn_layer_index][0].shape[2]
-                max_cache_length = None
             # Keep only the unprocessed tokens:
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
@@ -1958,20 +1727,24 @@ class JambaForCausalLM(JambaPreTrainedModel):
             if (
                     max_cache_length is not None
                     and attention_mask is not None
-                    and cache_length + input_ids.shape[1] > max_cache_length
             ):
                 attention_mask = attention_mask[:, -max_cache_length:]
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
             # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
                 position_ids = position_ids[:, -input_ids.shape[1] :]
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and past_key_values is None:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids}
@@ -1983,20 +1756,12 @@ class JambaForCausalLM(JambaPreTrainedModel):
                 "use_cache": kwargs.get("use_cache"),
                 "attention_mask": attention_mask,
                 "output_router_logits": output_router_logits,
-                "calc_logits_for_entire_prompt": self.config.calc_logits_for_entire_prompt,
             }
         )
         return model_inputs
-    @staticmethod
-    def _reorder_cache(past_key_values, beam_idx):
-        reordered_past = ()
-        for layer_past in past_key_values:
-            reordered_past += (
-                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
-            )
-        return reordered_past
 @add_start_docstrings(
     """

 """ PyTorch Jamba model."""
 import inspect
 import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 import torch
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
+from transformers.cache_utils import DynamicCache  # we need __iter__ and __len__ of pkv
 from transformers.modeling_attn_mask_utils import (
+    AttentionMaskConverter,
 )
 from transformers.modeling_outputs import (
     MoeCausalLMOutputWithPast,
     SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     logging,
     replace_return_docstrings,
 )
+from transformers.utils.import_utils import (
+    is_causal_conv1d_available,
+    is_flash_attn_2_available,
+    is_mamba_ssm_available,
+)
 from .configuration_jamba import JambaConfig
+# try except block so it'll work with trust_remote_code.
 try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 except ImportError:
     pass
+# try except block so it'll work with trust_remote_code.
 try:
     from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
     from mamba_ssm.ops.triton.selective_state_update import selective_state_update
 except ImportError:
     selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
+# try except block so it'll work with trust_remote_code.
 try:
     from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
 except ImportError:
 _CONFIG_FOR_DOC = "JambaConfig"
+# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router
 def load_balancing_loss_func(
+        router_logits: torch.Tensor,
+        num_experts: torch.Tensor = None,
+        top_k=2,
+        attention_mask: Optional[torch.Tensor] = None,
 ) -> float:
     r"""
     Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
     experts is too unbalanced.
     Args:
+        router_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]]):
             Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
             shape [batch_size X sequence_length, num_experts].
         attention_mask (`torch.Tensor`, None):
     Returns:
         The auxiliary loss.
     """
+    if router_logits is None or not isinstance(router_logits, tuple):
         return 0
+    if isinstance(router_logits, tuple):
+        compute_device = router_logits[0].device
+        concatenated_router_logits = torch.cat(
+            [layer_router.to(compute_device) for layer_router in router_logits], dim=0
         )
+    routing_weights = torch.nn.functional.softmax(concatenated_router_logits, dim=-1)
     _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
         router_prob_per_expert = torch.mean(routing_weights, dim=0)
     else:
         batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_router_logits.shape[0] // (batch_size * sequence_length)
         # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
         expert_attention_mask = (
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+class HybridMambaAttentionDynamicCache(DynamicCache):
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
+    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
+    depends on the type of the layer at that index:
+    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
+    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
+    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
+    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
+    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
+    Every `key_cache`/`value_cache` entry starts out as a `(batch_size, 0)` placeholder and is only replaced by
+    real key/value states the first time `update` is called for that layer.
+    """
+    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
+        """
+        Allocate the per-layer state lists.
+
+        Args:
+            config: model config; this constructor reads `layers_block_type`, `mamba_expand`, `hidden_size`,
+                `mamba_d_state`, `mamba_d_conv` and `num_hidden_layers` from it.
+            batch_size: leading dimension of every tensor created here.
+            dtype: dtype of the zero-initialised mamba state tensors. The empty placeholder tensors are created
+                without an explicit dtype.
+            device: device on which all tensors are created.
+        """
+        # NOTE(review): `DynamicCache.__init__` is not called here, so anything the parent constructor would set
+        # besides `key_cache`/`value_cache` is not initialised by this class - confirm this is intended.
+        self.dtype = dtype
+        self.layers_block_type = config.layers_block_type
+        self.has_previous_state = False  # only used by mamba
+        # d_inner of the mamba mixer: the hidden size scaled by the expansion factor
+        intermediate_size = config.mamba_expand * config.hidden_size
+        ssm_state_size = config.mamba_d_state
+        conv_kernel_size = config.mamba_d_conv
+        self.conv_states = []
+        self.ssm_states = []
+        for i in range(config.num_hidden_layers):
+            if self.layers_block_type[i] == "mamba":
+                # mamba layer: fixed-size states, zero-initialised
+                self.conv_states += [
+                    torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype)
+                ]
+                self.ssm_states += [
+                    torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
+                ]
+            else:
+                # non-mamba layer: `(batch_size, 0)` placeholders keep the lists indexable by layer_idx
+                self.conv_states += [torch.tensor([[]] * batch_size, device=device)]
+                self.ssm_states += [torch.tensor([[]] * batch_size, device=device)]
+        # key/value entries start as `(batch_size, 0)` placeholders for every layer; `update` fills them in
+        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+    def update(
+            self,
+            key_states: torch.Tensor,
+            value_states: torch.Tensor,
+            layer_idx: int,
+            cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Store `key_states`/`value_states` for layer `layer_idx` and return the full cached key and value tensors
+        for that layer.
+
+        `cache_kwargs` is accepted but not used by this implementation.
+        """
+        # Update the cache
+        # A last dimension of size 0 identifies the empty placeholder created in `__init__`: first write for the layer
+        if self.key_cache[layer_idx].shape[-1] == 0:
+            self.key_cache[layer_idx] = key_states
+            self.value_cache[layer_idx] = value_states
+        else:
+            # Subsequent writes are appended along dim 2 (the seq_len axis in the layout given in the class docstring)
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        # All four per-layer tensors are re-indexed along dim 0 (batch); `beam_idx` is moved to each tensor's
+        # own device first, so the tensors of a layer do not have to share a device.
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.conv_states[layer_idx].device
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.ssm_states[layer_idx].device
+            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
+        """Not supported: always raises `NotImplementedError`."""
+        raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")
+    @classmethod
+    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
+        """Not supported: always raises `NotImplementedError`."""
+        raise NotImplementedError("HybridMambaAttentionDynamicCache does not have a legacy cache equivalent.")
 # Adapted from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Jamba
 class JambaAttention(nn.Module):
     """
         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
     def forward(
             self,
             hidden_states: torch.Tensor,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
             output_attentions: bool = False,
             use_cache: bool = False,
+            cache_position: Optional[torch.LongTensor] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states)
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         if past_key_value is not None:
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
         # upcast attention to fp32
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
             hidden_states: torch.Tensor,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
             output_attentions: bool = False,
             use_cache: bool = False,
+            cache_position: Optional[torch.LongTensor] = None,
             **kwargs,
     ):
         bsz, q_len, _ = hidden_states.size()
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim
+        # therefore we just need to keep the original shape
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = cache_position[-1]
         use_sliding_windows = (
                 _flash_supports_window_size
         if past_key_value is not None:
             # Activate slicing cache only if the config has a non-None `sliding_window` attribute
+            cache_has_contents = cache_position[0] > 0
             if (
                     getattr(self.config, "sliding_window", None) is not None
                     and kv_seq_len > self.config.sliding_window
             attention_mask (`torch.Tensor`):
                 The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                 position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`float`, *optional*):
                 Attention dropout
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         return attn_output
+    # Copied from transformers.models.mixtral.modeling_mixtral.MixtralFlashAttention2._upad_input
     def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
         batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
             hidden_states: torch.Tensor,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
             output_attentions: bool = False,
             use_cache: bool = False,
+            cache_position: Optional[torch.LongTensor] = None,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         if output_attentions:
             # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         if past_key_value is not None:
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
+        causal_mask = attention_mask
         if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
             query_states,
             key_states,
             value_states,
+            attn_mask=causal_mask,
             dropout_p=self.attention_dropout if self.training else 0.0,
             # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
             is_causal=self.is_causal and attention_mask is None and q_len > 1,
 }
 # Adapted from transformers.models.mamba.modeling_mamba.MambaMixer
 class JambaMambaMixer(nn.Module):
     """
         self.activation = config.hidden_act
         self.act = ACT2FN[config.hidden_act]
         self.use_fast_kernels = config.use_mamba_kernels
         self.D = nn.Parameter(torch.ones(self.intermediate_size))
         self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)
+        self.dt_layernorm = JambaRMSNorm(self.time_step_rank, eps=config.rms_norm_eps)
+        self.b_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)
+        self.c_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps)
         if not is_fast_path_available:
             logger.warning_once(
                 " https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config"
             )
+    def cuda_kernels_forward(self, hidden_states: torch.Tensor, cache_params: HybridMambaAttentionDynamicCache = None):
+        batch_size, seq_len, _ = hidden_states.shape
+        use_precomputed_states = (
+                cache_params is not None
+                and cache_params.has_previous_state
+                and seq_len == 1
+                and cache_params.conv_states[self.layer_idx].shape[0]
+                == cache_params.ssm_states[self.layer_idx].shape[0]
+                == batch_size
+        )
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(hidden_states).transpose(1, 2)
+        # We can't use `mamba_inner_fn` even if in training and without cache params because we have the
+        # inner layernorms, which aren't supported by this fused kernel
+        hidden_states, gate = projected_states.chunk(2, dim=1)
+        # 2. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
+        if use_precomputed_states:
+            hidden_states = causal_conv1d_update(
+                hidden_states.squeeze(-1),
+                cache_params.conv_states[self.layer_idx],
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+            )
+            hidden_states = hidden_states.unsqueeze(-1)
         else:
+            if cache_params is not None:
+                conv_states = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
+                cache_params.conv_states[self.layer_idx].copy_(conv_states)
+            hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation)
+        # 3. State Space Model sequence transformation
+        # 3.a. input varying initialization of time_step, B and C
+        ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
+        time_step, B, C = torch.split(
+            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+        )
+        time_step = self.dt_layernorm(time_step)
+        B = self.b_layernorm(B)
+        C = self.c_layernorm(C)
+        # Here we need to apply dt_proj without the bias, as the bias is added in the selective scan kernel.
+        # This is a hack to apply dt_proj while still using the forward pass of `torch.nn.Linear`, which is needed
+        # in order to make quantization work. Quantization code replaces `torch.nn.Linear` layers with quantized
+        # linear layers, and requires to call the forward pass directly.
+        # The original code here was: ```discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)```
+        time_proj_bias = self.dt_proj.bias
+        self.dt_proj.bias = None
+        discrete_time_step = self.dt_proj(time_step).transpose(1, 2)
+        self.dt_proj.bias = time_proj_bias
+        A = -torch.exp(self.A_log.float())
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        time_proj_bias = time_proj_bias.float() if time_proj_bias is not None else None
+        if use_precomputed_states:
+            scan_outputs = selective_state_update(
+                cache_params.ssm_states[self.layer_idx],
+                hidden_states[..., 0],
+                discrete_time_step[..., 0],
+                A,
+                B[:, 0],
+                C[:, 0],
+                self.D,
+                gate[..., 0],
+                time_proj_bias,
+                dt_softplus=True,
+            ).unsqueeze(-1)
+        else:
+            scan_outputs, ssm_state = selective_scan_fn(
+                hidden_states,
+                discrete_time_step,
+                A,
+                B.transpose(1, 2),
+                C.transpose(1, 2),
+                self.D.float(),
+                gate,
+                time_proj_bias,
+                delta_softplus=True,
+                return_last_state=True,
             )
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
         return contextualized_states
     # fmt: off
+    def slow_forward(self, input_states, cache_params: HybridMambaAttentionDynamicCache = None):
         batch_size, seq_len, _ = input_states.shape
         dtype = input_states.dtype
         # 1. Gated MLP's linear projection
         projected_states = self.in_proj(input_states).transpose(1, 2)                   # [batch, 2 * intermediate_size, seq_len]
         hidden_states, gate = projected_states.chunk(2, dim=1)
+        use_cache = isinstance(cache_params,HybridMambaAttentionDynamicCache)
         # 2. Convolution sequence transformation
+        if use_cache and cache_params.ssm_states[self.layer_idx].shape[0] == batch_size:
             if self.training:
                 # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass
                 ssm_state = cache_params.ssm_states[self.layer_idx].clone()
             else:
                 ssm_state = cache_params.ssm_states[self.layer_idx]
+            if cache_params.has_previous_state and seq_len == 1 and \
+                    cache_params.conv_states[self.layer_idx].shape[0] == batch_size:
                 conv_state = cache_params.conv_states[self.layer_idx]                   # [batch, intermediate_size, conv_kernel_size]
                 conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
                 conv_state[:, :, -1] = hidden_states[:, :, 0]
+                cache_params.conv_states[self.layer_idx] = conv_state
                 hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
                 if self.use_conv_bias:
                     hidden_states += self.conv1d.bias
                     hidden_states,
                     (self.conv_kernel_size - hidden_states.shape[-1], 0)
                 )
+                cache_params.conv_states[self.layer_idx] = conv_state
                 hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])     # [batch, intermediate_size, seq_len]
         else:
             ssm_state = torch.zeros(
         time_step, B, C = torch.split(
             ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
         )
+        time_step = self.dt_layernorm(time_step)
+        B = self.b_layernorm(B)
+        C = self.c_layernorm(C)
         discrete_time_step = self.dt_proj(time_step)                                    # [batch, seq_len, intermediate_size]
         discrete_time_step = nn.functional.softplus(discrete_time_step).transpose(1, 2) # [batch, intermediate_size, seq_len]
         scan_output = scan_output + (hidden_states * self.D[None, :, None])
         scan_output = (scan_output * self.act(gate))
+        if use_cache:
+            cache_params.ssm_states[self.layer_idx] = ssm_state
         # 4. Final linear projection
         contextualized_states = self.out_proj(scan_output.transpose(1, 2))             # [batch, seq_len, hidden_size]
         return contextualized_states
     # fmt: on
+    def forward(self, hidden_states, cache_params: HybridMambaAttentionDynamicCache = None):
         if self.use_fast_kernels:
             if not is_fast_path_available or "cuda" not in self.x_proj.weight.device.type:
                 raise ValueError(
             return self.cuda_kernels_forward(hidden_states, cache_params)
         return self.slow_forward(hidden_states, cache_params)
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Jamba
 class JambaMLP(nn.Module):
+    def __init__(self, config):
         # Build the gated MLP: reads `hidden_size`, `intermediate_size` and `hidden_act`
         # from `config` and creates three bias-free linear projections.
         super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
         # gate/up map hidden_size -> intermediate_size; down maps back to hidden_size.
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         # Activation is looked up by name in the ACT2FN table.
         self.act_fn = ACT2FN[config.hidden_act]
     def forward(self, x):
     and memory on padding.
     """
+    def __init__(self, config: JambaConfig):
         # Build the sparse mixture-of-experts block: a bias-free linear router over
         # `config.num_experts` experts plus one `JambaMLP` per expert.
         super().__init__()
         self.hidden_dim = config.hidden_size
         self.ffn_dim = config.intermediate_size
+        self.num_experts = config.num_experts
         # Taken from `config.num_experts_per_tok`; stored here, not used in `__init__`.
+        self.top_k = config.num_experts_per_tok
         # Router produces one logit per expert from a hidden_dim-sized input.
+        self.router = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
         self.experts = nn.ModuleList([JambaMLP(config) for _ in range(self.num_experts)])
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
         """ """
         batch_size, sequence_length, hidden_dim = hidden_states.shape
         hidden_states = hidden_states.view(-1, hidden_dim)
         # router_logits: (batch * sequence_length, n_experts)
         router_logits = self.router(hidden_states)
             if top_x.shape[0] == 0:
                 continue
             # Index the correct hidden states and compute the expert hidden state for
             # the current expert. We need to make sure to multiply the output hidden
             # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
             # However `index_add_` only support torch tensors for indexing so we'll use
             # the `top_x` tensor here.
 class JambaAttentionDecoderLayer(nn.Module):
+    def __init__(self, config: JambaConfig, layer_idx: int):
         # Build an attention decoder layer: attention module, feed-forward module and
         # two RMSNorm layers. `layer_idx` selects this layer's entry in
         # `config.layers_num_experts` and is forwarded to the attention class.
         super().__init__()
+        num_experts = config.layers_num_experts[layer_idx]
         # Attention implementation is chosen by `config._attn_implementation`.
         self.self_attn = JAMBA_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
         # More than one expert -> sparse MoE block; otherwise a plain MLP.
+        ffn_layer_class = JambaSparseMoeBlock if num_experts > 1 else JambaMLP
+        self.feed_forward = ffn_layer_class(config)
         self.input_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
             self,
             hidden_states: torch.Tensor,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[HybridMambaAttentionDynamicCache] = None,
             output_attentions: Optional[bool] = False,
             output_router_logits: Optional[bool] = False,
             use_cache: Optional[bool] = False,
+            cache_position: Optional[torch.LongTensor] = None,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                 `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
             use_cache (`bool`, *optional*):
                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                 (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
         """
         residual = hidden_states
             past_key_value=past_key_value,
             output_attentions=output_attentions,
             use_cache=use_cache,
+            cache_position=cache_position,
         )
         # residual connection after attention
         hidden_states = residual + hidden_states
+        # feed-forward (experts/MLP)
         residual = hidden_states
+        hidden_states = self.pre_ff_layernorm(hidden_states)
+        ff_outputs = self.feed_forward(hidden_states)
+        if isinstance(ff_outputs, tuple):
+            hidden_states, router_logits = ff_outputs
+        else:
+            hidden_states, router_logits = ff_outputs, None
         hidden_states = residual + hidden_states
         outputs = (hidden_states,)
 class JambaMambaDecoderLayer(nn.Module):
+    def __init__(self, config: JambaConfig, layer_idx: int):
         # Build a Mamba decoder layer: Mamba mixer, feed-forward module and two RMSNorm
         # layers. `layer_idx` selects this layer's entry in `config.layers_num_experts`
         # and is forwarded to the mixer.
         super().__init__()
+        num_experts = config.layers_num_experts[layer_idx]
         self.mamba = JambaMambaMixer(config=config, layer_idx=layer_idx)
         # More than one expert -> sparse MoE block; otherwise a plain MLP.
+        ffn_layer_class = JambaSparseMoeBlock if num_experts > 1 else JambaMLP
+        self.feed_forward = ffn_layer_class(config)
         self.input_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
     def forward(
             self,
             output_attentions: Optional[bool] = False,
             output_router_logits: Optional[bool] = False,
             use_cache: Optional[bool] = False,
+            cache_position: Optional[torch.LongTensor] = None,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                 `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
             use_cache (`bool`, *optional*):
                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                 (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
         """
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.mamba(
             hidden_states=hidden_states,
+            cache_params=past_key_value,
         )
+        self_attn_weights = None
         # residual connection after mamba
         hidden_states = residual + hidden_states
+        # feed-forward (experts/MLP)
         residual = hidden_states
+        hidden_states = self.pre_ff_layernorm(hidden_states)
+        ff_outputs = self.feed_forward(hidden_states)
+        if isinstance(ff_outputs, tuple):
+            hidden_states, router_logits = ff_outputs
+        else:
+            hidden_states, router_logits = ff_outputs, None
         hidden_states = residual + hidden_states
         outputs = (hidden_states,)
             outputs += (self_attn_weights,)
         if use_cache:
+            outputs += (past_key_value,)
         if output_router_logits:
             outputs += (router_logits,)
         return outputs
 JAMBA_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     "The bare Jamba Model outputting raw hidden-states without any specific head on top.",
     JAMBA_START_DOCSTRING,
 )
 class JambaPreTrainedModel(PreTrainedModel):
     config_class = JambaConfig
     base_model_prefix = "model"
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
 JAMBA_INPUTS_DOCSTRING = r"""
     Args:
             config.n_positions - 1]`.
             [What are position IDs?](../glossary#position-ids)
+        past_key_values (`HybridMambaAttentionDynamicCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            A HybridMambaAttentionDynamicCache object containing pre-computed hidden-states (keys and values in the
+            self-attention blocks and convolution and ssm states in the mamba blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+            Key and value cache tensors have shape `(batch_size, num_heads, seq_len, head_dim)`.
+            Convolution and ssm states tensors have shape `(batch_size, d_inner, d_conv)` and
+            `(batch_size, d_inner, d_state)` respectively.
+            See the `HybridMambaAttentionDynamicCache` class for more details.
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
             should not be returned during inference.
         return_dict (`bool`, *optional*):
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
 """
+ALL_DECODER_LAYER_TYPES = {"attention": JambaAttentionDecoderLayer, "mamba": JambaMambaDecoderLayer}  # block-type name -> decoder layer class
 @add_start_docstrings(
     "The bare Jamba Model outputting raw hidden-states without any specific head on top.",
         self.vocab_size = config.vocab_size
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         decoder_layers = []
         for i in range(config.num_hidden_layers):
+            layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[i]]
+            decoder_layers.append(layer_class(config, layer_idx=i))
         self.layers = nn.ModuleList(decoder_layers)
         self._attn_implementation = config._attn_implementation
     def set_input_embeddings(self, value):
         # Replace the token embedding module with `value`.
         self.embed_tokens = value
     @add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING)
     def forward(
             self,
             input_ids: torch.LongTensor = None,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
             inputs_embeds: Optional[torch.FloatTensor] = None,
             use_cache: Optional[bool] = None,
             output_attentions: Optional[bool] = None,
             output_hidden_states: Optional[bool] = None,
             output_router_logits: Optional[bool] = None,
             return_dict: Optional[bool] = None,
+            cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple, MoeModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_router_logits = (
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
             )
+            use_cache = False
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
+        hidden_states = inputs_embeds
+        if use_cache and past_key_values is None:
+            logger.warning_once(
+                "Jamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was "
+                "provided, so no cache will be returned."
             )
+        if cache_position is None:
+            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
         all_router_logits = () if output_router_logits else None
         for decoder_layer in self.layers:
             if output_hidden_states:
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
+                    causal_mask,
                     position_ids,
                     past_key_values,
                     output_attentions,
                     output_router_logits,
                     use_cache,
+                    cache_position,
                 )
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
+                    attention_mask=causal_mask,
                     position_ids=position_ids,
                     past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     output_router_logits=output_router_logits,
                     use_cache=use_cache,
+                    cache_position=cache_position,
                 )
             hidden_states = layer_outputs[0]
             if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
             if output_router_logits:
+                if layer_outputs[-1] is not None:
+                    # append router logits only of expert layers. Regular MLP layers return `None` as the router logits
+                    all_router_logits += (layer_outputs[-1],)
         hidden_states = self.final_layernorm(hidden_states)
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
+        if past_key_values and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+        next_cache = None if not use_cache else past_key_values
         if not return_dict:
             return tuple(
             router_logits=all_router_logits,
         )
+    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
         # Build the mask handed to the decoder layers.
         #
         # Args:
         #     attention_mask: optional mask; a 2D mask is merged into the causal mask below.
         #     input_tensor: supplies dtype, device, batch size (shape[0]) and the query
         #         length (shape[1]).
         #     cache_position: positions of the current tokens; the last entry + 1 is used
         #         as the key length.
         #
         # Returns:
         #     For flash_attention_2: `attention_mask` unchanged if it contains a 0.0,
         #     otherwise None. For other implementations: a 4D additive mask expanded to
         #     (batch, 1, sequence_length, target_length) with the dtype minimum at
         #     masked positions.
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        target_length = cache_position[-1] + 1
         # Start fully masked, keep only the strict upper triangle masked when more than one
         # query token is present, then zero out every key position that is not strictly
         # after the query's cache position.
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
             # Only 2D attention masks are merged; for any other rank the causal mask is
             # left as built above.
+            if attention_mask.dim() == 2:
+                mask_length = attention_mask.shape[-1]
                 # Positions that are causally visible (0.0) but marked 0 in the attention
                 # mask are filled with the dtype minimum.
+                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
+                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
+        if (
+                self.config._attn_implementation == "sdpa"
+                and attention_mask is not None
+                and attention_mask.device.type == "cuda"
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+        return causal_mask
 # Adapted from transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM with MIXTRAL->JAMBA, Mixtral->Jamba
 class JambaForCausalLM(JambaPreTrainedModel):
             input_ids: torch.LongTensor = None,
             attention_mask: Optional[torch.Tensor] = None,
             position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
             inputs_embeds: Optional[torch.FloatTensor] = None,
             labels: Optional[torch.LongTensor] = None,
             use_cache: Optional[bool] = None,
             output_hidden_states: Optional[bool] = None,
             output_router_logits: Optional[bool] = None,
             return_dict: Optional[bool] = None,
+            cache_position: Optional[torch.LongTensor] = None,
+            num_logits_to_keep: Optional[Union[int, None]] = None,
     ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
         r"""
         Args:
                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+            num_logits_to_keep (`int` or `None`, *optional*):
+                Calculate logits for the last `num_logits_to_keep` tokens. If `None`, calculate logits for all
+                `input_ids`. Only last token logits are needed for generation, and calculating them only for that token
+                can save memory, which becomes pretty significant for long sequences.
         Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, JambaForCausalLM
+        >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
+        >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             output_router_logits=output_router_logits,
+            cache_position=cache_position,
             return_dict=return_dict,
         )
         hidden_states = outputs[0]
+        if num_logits_to_keep is None:
             logits = self.lm_head(hidden_states)
         else:
+            logits = self.lm_head(hidden_states[..., -num_logits_to_keep:, :])
         logits = logits.float()
         loss = None
             attention_mask=None,
             inputs_embeds=None,
             output_router_logits=False,
+            cache_position=None,
             **kwargs,
     ):
+        empty_past_kv = past_key_values is None
+        # Omit tokens covered by past_key_values
+        if not empty_past_kv:
+            past_length = cache_position[0] if cache_position is not None else attention_mask.shape[1]
+            max_cache_length = self.config.sliding_window
             # Keep only the unprocessed tokens:
             # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
             # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
             if (
                     max_cache_length is not None
                     and attention_mask is not None
+                    and past_length + input_ids.shape[1] > max_cache_length
             ):
                 attention_mask = attention_mask[:, -max_cache_length:]
+        else:
+            past_key_values = HybridMambaAttentionDynamicCache(
+                self.config, input_ids.shape[0], self.dtype, device=self.device
+            )
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
             # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
                 position_ids = position_ids[:, -input_ids.shape[1] :]
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids}
                 "use_cache": kwargs.get("use_cache"),
                 "attention_mask": attention_mask,
                 "output_router_logits": output_router_logits,
+                "num_logits_to_keep": self.config.num_logits_to_keep,
+                "cache_position": cache_position,
             }
         )
         return model_inputs
 @add_start_docstrings(
     """

special_tokens_map.json CHANGED Viewed

@@ -1,6 +1,30 @@
 {
-  "bos_token": "<|startoftext|>",
-  "eos_token": "<|endoftext|>",
-  "pad_token": "<|pad|>",
-  "unk_token": "<|unk|>"
 }

 {
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|unk|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }