JingzeShi committed
Commit c271fb0
Parent: 24a7b6a

Upload DogeForCausalLM
Files changed (4)
  1. config.json +8 -10
  2. configuration_doge.py +16 -24
  3. model.safetensors +2 -2
  4. modeling_doge.py +254 -295
config.json CHANGED
@@ -1,33 +1,31 @@
 {
-  "_name_or_path": "./checkpoint-10000",
+  "_name_or_path": "./results/Doge-60M",
   "architectures": [
     "DogeForCausalLM"
   ],
+  "attention_dropout": 0.0,
   "auto_map": {
     "AutoConfig": "configuration_doge.DogeConfig",
     "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
   },
   "bos_token_id": 1,
   "eos_token_id": 2,
+  "expert_retrieval_size": 256,
   "hidden_act": "silu",
   "hidden_bias": false,
   "hidden_dropout": 0.0,
   "hidden_size": 512,
   "initializer_range": 0.02,
-  "inner_values_retrieval_size": 128,
   "intermediate_size": 2048,
-  "max_position_embeddings": 4096,
+  "is_moe": false,
+  "max_position_embeddings": 2048,
   "model_type": "doge",
   "num_attention_heads": 4,
-  "num_cdmmoe_experts": 2048,
-  "num_cdmmoe_experts_per_head": 4,
-  "num_cdmmoe_heads": 2,
+  "num_cdmmoe_experts": 4096,
+  "num_cdmmoe_experts_per_head": 8,
+  "num_cdmmoe_heads": 4,
   "num_hidden_layers": 8,
-  "num_inner_value_heads": 2,
-  "num_inner_values": 4,
-  "num_value_per_head": 2,
   "pad_token_id": 0,
-  "private_expert_retrieval_size": 256,
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000.0,
configuration_doge.py CHANGED
@@ -1,9 +1,9 @@
 # coding=utf-8
-# Copyright 2024 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on the Wonderful Matrices paper implementation.
 #
-# https://arxiv.org/abs/2407.16958
+# https://arxiv.org/abs/2412.11834
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ class DogeConfig(PretrainedConfig):
             Dropout probability for each sequence transformation and state transformation module.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 16384):
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
             The maximum sequence length that this model might ever be used with.
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
@@ -104,22 +104,18 @@ class DogeConfig(PretrainedConfig):
             Whether to tie weight embeddings
         num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer decoder.
-        num_inner_values (`int`, *optional*, defaults to 8):
-            Number of inner values for Inner Function Attention.
-        num_inner_value_heads (`int`, *optional*, defaults to 4):
-            Number of inner value heads for Inner Function Attention.
-        num_value_per_head (`int`, *optional*, defaults to 4):
-            Number of values per head, can't be greater than `num_inner_values`.
-        inner_values_retrieval_size (`int`, *optional*, defaults to 128):
-            Dimension of the inner values retrieval states for each attention layer in the Transformer decoder
-        private_expert_retrieval_size (`int`, *optional*, defaults to 256):
-            Dimension of the Private Expert retrieval states for the Cross Domain Mixture of Experts.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        is_moe (`bool`, *optional*, defaults to `False`):
+            Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
         num_cdmmoe_experts (`int`, *optional*, defaults to 4096):
            Number of Private Experts for the Cross Domain Mixture of Experts.
         num_cdmmoe_heads (`int`, *optional*, defaults to 4):
            Number of heads of Private Experts for the Cross Domain Mixture of Experts.
         num_cdmmoe_experts_per_head (`int`, *optional*, defaults to 8):
            Number of Private Experts per head for the Cross Domain Mixture of Experts.
+        expert_retrieval_size (`int`, *optional*, defaults to 256):
+            Dimension of the Expert retrieval states for the Cross Domain Mixture of Experts.
     """

     model_type = "doge"
@@ -134,7 +130,7 @@ class DogeConfig(PretrainedConfig):
         hidden_bias=False,
         hidden_dropout=0.0,
         hidden_act="silu",
-        max_position_embeddings=16384,
+        max_position_embeddings=2048,
         rope_theta=10000.0,
         rope_scaling=None,
         initializer_range=0.02,
@@ -145,14 +141,12 @@ class DogeConfig(PretrainedConfig):
         eos_token_id=2,
         tie_word_embeddings=False,
         num_attention_heads=8,
-        num_inner_values=8,
-        num_inner_value_heads=4,
-        num_value_per_head=4,
-        inner_values_retrieval_size=128,
-        private_expert_retrieval_size=256,
+        attention_dropout=0.0,
+        is_moe=False,
         num_cdmmoe_experts=4096,
         num_cdmmoe_heads=4,
         num_cdmmoe_experts_per_head=8,
+        expert_retrieval_size=256,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -173,14 +167,12 @@ class DogeConfig(PretrainedConfig):
         self.eos_token_id = eos_token_id
         self.tie_word_embeddings = tie_word_embeddings
         self.num_attention_heads = num_attention_heads
-        self.num_inner_values = num_inner_values
-        self.num_inner_value_heads = num_inner_value_heads
-        self.num_value_per_head = num_value_per_head
-        self.inner_values_retrieval_size = inner_values_retrieval_size
-        self.private_expert_retrieval_size = private_expert_retrieval_size
+        self.attention_dropout = attention_dropout
+        self.is_moe = is_moe
         self.num_cdmmoe_experts = num_cdmmoe_experts
         self.num_cdmmoe_heads = num_cdmmoe_heads
         self.num_cdmmoe_experts_per_head = num_cdmmoe_experts_per_head
+        self.expert_retrieval_size = expert_retrieval_size

         # Validate the correctness of rotary position embeddings parameters
         # BC: if there is a 'type' field, copy it it to 'rope_type'.
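In short, the five removed constructor arguments (`num_inner_values`, `num_inner_value_heads`, `num_value_per_head`, `inner_values_retrieval_size`, `private_expert_retrieval_size`) give way to `attention_dropout`, `is_moe`, and `expert_retrieval_size`, and the default context length drops from 16384 to 2048. A hedged sketch of building the updated config directly, assuming `configuration_doge.py` has been downloaded next to the script (arguments not shown keep their defaults):

from configuration_doge import DogeConfig

config = DogeConfig(
    max_position_embeddings=2048,   # default was 16384 before this commit
    num_attention_heads=4,
    attention_dropout=0.0,          # new argument
    is_moe=False,                   # new argument: use DogeCDMoE only when True
    num_cdmmoe_experts=4096,
    num_cdmmoe_heads=4,
    num_cdmmoe_experts_per_head=8,
    expert_retrieval_size=256,      # renamed from private_expert_retrieval_size
)
print(config.is_moe, config.expert_retrieval_size)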
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:794645ba99f640a813621b02d8f89f67a857deeb876d882059c2b01bcabb045a
-size 307592408
+oid sha256:26d80cdf90d4f053299b962b1ede76f0fe30ed31ebcb95e5dbd730ce23ffd36a
+size 268580408
modeling_doge.py CHANGED
@@ -1,9 +1,9 @@
1
  # coding=utf-8
2
- # Copyright 2024 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
3
  #
4
  # This code is based on the Wonderful Matrices paper implementation.
5
  #
6
- # https://arxiv.org/abs/2407.16958
7
  #
8
  # Licensed under the Apache License, Version 2.0 (the "License");
9
  # you may not use this file except in compliance with the License.
@@ -39,16 +39,15 @@ from transformers.modeling_utils import PreTrainedModel
39
  from transformers.utils import (
40
  add_start_docstrings,
41
  add_start_docstrings_to_model_forward,
42
- # is_einx_available,
43
  logging,
44
  replace_return_docstrings,
45
  )
46
  from .configuration_doge import DogeConfig
47
 
48
-
49
-
50
- from einx import add as einx_add
51
-
52
 
53
 
54
  logger = logging.get_logger(__name__)
@@ -76,6 +75,18 @@ class RMSNorm(nn.Module):
76
  return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
77
 
78
 
79
  class RotaryEmbedding(nn.Module):
80
  def __init__(self, config: Optional[DogeConfig] = None):
81
  super().__init__()
@@ -172,8 +183,8 @@ def apply_QK_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
172
  return q_embed, k_embed
173
 
174
 
175
- class DogeInnerFuncAttn(nn.Module):
176
- """Inner Function Attention from 'Wonderful Matrices' paper."""
177
 
178
  def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
179
  super().__init__()
@@ -189,15 +200,10 @@ class DogeInnerFuncAttn(nn.Module):
189
 
190
  self.hidden_dim = config.hidden_size
191
  self.num_attention_heads = config.num_attention_heads
192
-
193
- # for accuracy of attention scores, we do not use GQA
194
  self.attention_head_dim = self.hidden_dim // self.num_attention_heads
195
- self.num_inner_values = config.num_inner_values
196
- self.num_inner_value_heads = config.num_inner_value_heads
197
- self.num_value_per_head = config.num_value_per_head
198
- self.inner_values_retrieval_dim = config.inner_values_retrieval_size
199
 
200
- # Q and K projections
201
  self.q_proj = nn.Linear(
202
  self.hidden_dim,
203
  self.num_attention_heads * self.attention_head_dim,
@@ -208,157 +214,26 @@ class DogeInnerFuncAttn(nn.Module):
208
  self.num_attention_heads * self.attention_head_dim,
209
  bias=config.hidden_bias,
210
  )
211
-
212
  # dynamic mask for the QK^T attention score matrix
213
- self.dynamic_mask = nn.Parameter(
214
- torch.round(torch.ones(self.num_attention_heads, config.max_position_embeddings))
215
  )
216
-
217
- # queries and keys for retrieval V
218
- self.v_queries = nn.Linear(
219
  self.hidden_dim,
220
- self.num_inner_value_heads * self.inner_values_retrieval_dim,
221
  bias=config.hidden_bias,
222
  )
223
- self.v_keys = nn.Parameter(
224
- torch.zeros(
225
- self.num_inner_value_heads,
226
- self.inner_values_retrieval_dim,
227
- self.num_inner_values,
228
- )
229
- )
230
-
231
- # V for inner function
232
- self.v_embed = nn.Embedding(
233
- self.num_inner_values,
234
  self.hidden_dim,
 
 
235
  )
236
-
237
  self.o_proj = nn.Linear(
238
  self.hidden_dim,
239
  self.hidden_dim,
240
  bias=config.hidden_bias,
241
  )
242
 
243
- def _update_causal_mask(
244
- self,
245
- attention_mask: torch.Tensor = None,
246
- input_tensor: torch.Tensor = None,
247
- cache_position: torch.Tensor = None,
248
- past_key_values: Cache = None,
249
- output_attentions: bool = False,
250
- ):
251
- past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
252
- using_static_cache = isinstance(past_key_values, StaticCache)
253
-
254
- dtype, device = input_tensor.dtype, input_tensor.device
255
- sequence_length = input_tensor.shape[1]
256
- if using_static_cache:
257
- target_length = past_key_values.get_max_cache_shape()
258
- else:
259
- target_length = (
260
- attention_mask.shape[-1]
261
- if isinstance(attention_mask, torch.Tensor)
262
- else past_seen_tokens + sequence_length + 1
263
- )
264
-
265
- # in case the provided `attention` mask is 2D, we generate a causal mask here (4D).
266
- causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position_and_dynamic_mask(
267
- attention_mask=attention_mask,
268
- dynamic_mask=self.dynamic_mask,
269
- sequence_length=sequence_length,
270
- target_length=target_length,
271
- dtype=dtype,
272
- device=device,
273
- cache_position=cache_position,
274
- batch_size=input_tensor.shape[0],
275
- )
276
-
277
- return causal_mask
278
-
279
- @staticmethod
280
- def _prepare_4d_causal_attention_mask_with_cache_position_and_dynamic_mask(
281
- attention_mask: torch.Tensor = None,
282
- dynamic_mask: torch.Tensor = None,
283
- sequence_length: int = None,
284
- target_length: int = None,
285
- dtype: torch.dtype = None,
286
- device: torch.device = None,
287
- cache_position: torch.Tensor = None,
288
- batch_size: int = None,
289
- **kwargs,
290
- ):
291
- """
292
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
293
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
294
-
295
- Args:
296
- attention_mask (`torch.Tensor`):
297
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
298
- `(batch_size, 1, query_length, key_value_length)`.
299
- dynamic_mask (`torch.Tensor`):
300
- A 2D dynamic mask of shape `(num_heads, max_position_embeddings)`.
301
- sequence_length (`int`):
302
- The sequence length being processed.
303
- target_length (`int`):
304
- The target length: when generating with static cache, the mask should be as long as the static cache,
305
- to account for the 0 padding, the part of the cache that is not filled yet.
306
- dtype (`torch.dtype`):
307
- The dtype to use for the 4D attention mask.
308
- device (`torch.device`):
309
- The device to plcae the 4D attention mask on.
310
- cache_position (`torch.Tensor`):
311
- Indices depicting the position of the input sequence tokens in the sequence.
312
- batch_size (`torch.Tensor`):
313
- Batch size.
314
- """
315
- if attention_mask is not None and attention_mask.dim() == 4:
316
- # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
317
- causal_mask = attention_mask
318
- else:
319
- num_heads = 1 if dynamic_mask is None else dynamic_mask.size(0)
320
- min_dtype = torch.finfo(dtype).min
321
- causal_mask = torch.full(
322
- (sequence_length, target_length),
323
- fill_value=min_dtype,
324
- dtype=dtype,
325
- device=device,
326
- )
327
- if sequence_length != 1:
328
- causal_mask = torch.triu(causal_mask, diagonal=1)
329
- causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
330
- causal_mask = causal_mask[None, None, :, :].expand(batch_size, num_heads, -1, -1)
331
- if attention_mask is not None:
332
- causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
333
- mask_length = attention_mask.shape[-1]
334
- attention_mask = attention_mask[:, None, None, :].expand(-1, num_heads, 1, -1)
335
- if dynamic_mask is not None:
336
- dynamic_mask = dynamic_mask[None, :, None, :mask_length].expand(batch_size, -1, 1, -1)
337
- attention_mask = attention_mask.clone() * dynamic_mask
338
-
339
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask
340
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
341
- padding_mask == 0, min_dtype
342
- )
343
-
344
- return causal_mask
345
-
346
- def inner_func(
347
- self,
348
- hidden_states: torch.Tensor,
349
- ) -> torch.Tensor:
350
- """
351
- Each value can share weights with other values to increase the expressive power
352
- """
353
- bsz, seq_len, _ = hidden_states.shape
354
-
355
- v_queries = self.v_queries(hidden_states)
356
- v_queries = v_queries.view(bsz, seq_len, self.num_inner_value_heads, -1).transpose(1, 2)
357
- sim = torch.matmul(v_queries, self.v_keys).transpose(1, 2)
358
- v_embed = self.v_embed(sim.topk(k=self.num_value_per_head, dim=-1).indices)
359
- v = hidden_states * v_embed.sum(dim=-2).sum(dim=-2)
360
- return v
361
-
362
  def forward(
363
  self,
364
  hidden_states: torch.Tensor,
@@ -369,24 +244,24 @@ class DogeInnerFuncAttn(nn.Module):
369
  position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
370
  **kwargs,
371
  ) -> Tuple[torch.Tensor, Optional[Cache]]:
372
- bsz, seq_len, _ = hidden_states.shape
373
 
374
  query_states = self.q_proj(hidden_states)
375
  key_states = self.k_proj(hidden_states)
376
- value_states = self.inner_func(hidden_states)
377
 
378
- query_states = query_states.view(bsz, seq_len, self.num_attention_heads, self.attention_head_dim).transpose(
379
  1, 2
380
  )
381
- key_states = key_states.view(bsz, seq_len, self.num_attention_heads, self.attention_head_dim).transpose(
382
  1, 2
383
  )
384
- value_states = value_states.view(bsz, seq_len, self.num_attention_heads, self.attention_head_dim).transpose(
385
  1, 2
386
  )
387
 
388
  cos, sin = position_embeddings
389
- query_states, query_states = apply_QK_rotary_pos_emb(query_states, query_states, cos, sin)
390
 
391
  if past_key_value is not None:
392
  # sin and cos are specific to RoPE models; cache_position needed for the static cache
@@ -397,38 +272,101 @@ class DogeInnerFuncAttn(nn.Module):
397
  attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) / math.sqrt(self.attention_head_dim)
398
 
399
  # add mask to attention scores
400
- causal_mask = self._update_causal_mask(attention_mask, hidden_states, cache_position, past_key_value)
401
- causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
402
- attn_weights = attn_weights + causal_mask
 
 
 
403
 
404
  # upcast attention scores to fp32
405
  attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
 
406
 
407
  # apply attention scores to value states
408
  attn_output = torch.matmul(attn_weights, value_states)
409
 
410
  attn_output = attn_output.transpose(1, 2).contiguous()
411
- attn_output = attn_output.reshape(bsz, seq_len, -1)
412
  attn_output = self.o_proj(attn_output)
413
 
414
  return attn_output, past_key_value
415
 
416
 
417
- class DogeCDMoE(nn.Module):
418
- """Cross-Domain Mixture of Experts from 'Wonderful Matrices' paper."""
 
419
 
420
  def __init__(self, config: DogeConfig):
421
  super().__init__()
422
  self.hidden_dim = config.hidden_size
423
- self.act_fn = ACT2FN[config.hidden_act]
424
  self.intermediate_dim = config.intermediate_size
 
425
 
426
- self.private_expert_retrieval_dim = config.private_expert_retrieval_size
427
- self.num_cdmmoe_experts = config.num_cdmmoe_experts
428
- self.num_cdmmoe_heads = config.num_cdmmoe_heads
429
- self.num_cdmmoe_experts_per_head = config.num_cdmmoe_experts_per_head
430
-
431
- # cross domain
432
  self.up_proj = nn.Linear(
433
  self.hidden_dim,
434
  self.intermediate_dim,
@@ -440,24 +378,46 @@ class DogeCDMoE(nn.Module):
440
  bias=config.hidden_bias,
441
  )
442
 
443
- # queries and keys for retrieval private experts
444
  self.queries = nn.Linear(
445
  self.hidden_dim,
446
- self.num_cdmmoe_heads * self.private_expert_retrieval_dim,
447
  bias=False,
448
  )
449
- self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
450
  self.keys = nn.Parameter(
451
  torch.zeros(
452
  self.num_cdmmoe_heads,
453
  self.num_keys,
454
  2,
455
- self.private_expert_retrieval_dim // 2,
456
  )
457
  )
458
 
459
- # private experts
460
- self.down_embed = nn.Embedding(
461
  self.num_cdmmoe_experts,
462
  self.hidden_dim,
463
  )
@@ -471,7 +431,7 @@ class DogeCDMoE(nn.Module):
471
  self,
472
  hidden_states: torch.Tensor,
473
  **kwargs,
474
- ) -> Tuple[torch.Tensor, torch.Tensor]:
475
  bsz, seq_len, _ = hidden_states.shape
476
 
477
  # get similarity with queries and keys
@@ -479,7 +439,7 @@ class DogeCDMoE(nn.Module):
479
  queries = queries.view(bsz, seq_len, 2, self.num_cdmmoe_heads, -1).permute(2, 0, 1, 3, 4)
480
  sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
481
 
482
- # get expert scores and indices with the highest similarity
483
  (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmmoe_experts_per_head, dim=-1)
484
  if einx_add is not None:
485
  all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
@@ -491,17 +451,14 @@ class DogeCDMoE(nn.Module):
491
  all_indices = all_indices.view(*indices_x.shape[:-1], -1)
492
  scores, pk_indices = all_scores.topk(self.num_cdmmoe_experts_per_head, dim=-1)
493
  indices = all_indices.gather(-1, pk_indices)
494
-
495
- # get related expert embeddings based on indices
496
  down_embed = self.down_embed(indices)
497
  up_embed = self.up_embed(indices)
498
 
499
- # efficient retrieval of private experts
500
- experts_weights = self.act_fn(torch.einsum("b t d, b t h k d -> b t h k", hidden_states, down_embed) * scores.softmax(dim=-1))
 
501
  experts_states = torch.einsum("b t h k, b t h k d -> b t d", experts_weights, up_embed)
502
-
503
- # mix with shared parameters of cross domain
504
- hidden_states = self.down_proj(self.act_fn(self.up_proj(hidden_states)))
505
  hidden_states = hidden_states + experts_states
506
  return hidden_states
507
 
@@ -511,10 +468,13 @@ class DogeDecoderLayer(nn.Module):
511
  super().__init__()
512
  self.hidden_dropout = config.hidden_dropout
513
 
514
- self.in_attn_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
515
- self.attn = DogeInnerFuncAttn(config, layer_idx)
516
- self.in_ff_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
517
- self.feed_forward = DogeCDMoE(config)
 
 
 
518
 
519
  def forward(
520
  self,
@@ -553,7 +513,7 @@ class DogeDecoderLayer(nn.Module):
553
 
554
  # sequence transformation
555
  residual = hidden_states
556
- hidden_states = self.in_attn_layernorm(hidden_states)
557
  hidden_states, present_key_value = self.attn(
558
  hidden_states=hidden_states,
559
  attention_mask=attention_mask,
@@ -565,14 +525,14 @@ class DogeDecoderLayer(nn.Module):
565
  )
566
  self_attn_weights = None
567
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
568
- hidden_states = residual + hidden_states
569
 
570
  # state transformation
571
  residual = hidden_states
572
- hidden_states = self.in_ff_layernorm(hidden_states)
573
  hidden_states = self.feed_forward(hidden_states)
574
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
575
- hidden_states = residual + hidden_states
576
 
577
  outputs = (hidden_states,)
578
 
@@ -592,6 +552,7 @@ class DogePreTrainedModel(PreTrainedModel):
592
  supports_gradient_checkpointing = True
593
  _no_split_modules = ["DogeDecoderLayer"]
594
  _skip_keys_device_placement = ["past_key_values"]
 
595
  _supports_cache_class = True
596
  _supports_quantized_cache = True
597
  _supports_static_cache = True
@@ -765,9 +726,9 @@ class DogeModel(DogePreTrainedModel):
765
  if position_ids is None:
766
  position_ids = cache_position.unsqueeze(0)
767
 
768
- # causal_mask = self._update_causal_mask(
769
- # attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
770
- # )
771
  hidden_states = inputs_embeds
772
 
773
  # create position embeddings to be shared across the decoder layers
@@ -776,6 +737,7 @@ class DogeModel(DogePreTrainedModel):
776
  # decoder layers
777
  all_hidden_states = () if output_hidden_states else None
778
  all_self_attns = () if output_attentions else None
 
779
 
780
  for decoder_layer in self.layers:
781
  if output_hidden_states:
@@ -785,7 +747,7 @@ class DogeModel(DogePreTrainedModel):
785
  layer_outputs = self._gradient_checkpointing_func(
786
  decoder_layer.__call__,
787
  hidden_states,
788
- attention_mask,
789
  position_ids,
790
  past_key_values,
791
  output_attentions,
@@ -796,7 +758,7 @@ class DogeModel(DogePreTrainedModel):
796
  else:
797
  layer_outputs = decoder_layer(
798
  hidden_states,
799
- attention_mask=attention_mask,
800
  position_ids=position_ids,
801
  past_key_value=past_key_values,
802
  output_attentions=output_attentions,
@@ -833,100 +795,97 @@ class DogeModel(DogePreTrainedModel):
833
  attentions=all_self_attns,
834
  )
835
 
836
- """Move to DogeInnerFuncAttn"""
837
- # def _update_causal_mask(
838
- # self,
839
- # attention_mask: torch.Tensor,
840
- # input_tensor: torch.Tensor,
841
- # cache_position: torch.Tensor,
842
- # past_key_values: Cache,
843
- # output_attentions: bool,
844
- # ):
845
- # # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
846
- # # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
847
- # # to infer the attention mask.
848
- # past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
849
- # using_static_cache = isinstance(past_key_values, StaticCache)
850
-
851
- # dtype, device = input_tensor.dtype, input_tensor.device
852
- # sequence_length = input_tensor.shape[1]
853
- # if using_static_cache:
854
- # target_length = past_key_values.get_max_cache_shape()
855
- # else:
856
- # target_length = (
857
- # attention_mask.shape[-1]
858
- # if isinstance(attention_mask, torch.Tensor)
859
- # else past_seen_tokens + sequence_length + 1
860
- # )
861
-
862
- # # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
863
- # causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
864
- # attention_mask,
865
- # sequence_length=sequence_length,
866
- # target_length=target_length,
867
- # dtype=dtype,
868
- # device=device,
869
- # cache_position=cache_position,
870
- # batch_size=input_tensor.shape[0],
871
- # )
872
-
873
- # return causal_mask
874
-
875
- # @staticmethod
876
- # def _prepare_4d_causal_attention_mask_with_cache_position(
877
- # attention_mask: torch.Tensor,
878
- # sequence_length: int,
879
- # target_length: int,
880
- # dtype: torch.dtype,
881
- # device: torch.device,
882
- # cache_position: torch.Tensor,
883
- # batch_size: int,
884
- # **kwargs,
885
- # ):
886
- # """
887
- # Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
888
- # `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
889
-
890
- # Args:
891
- # attention_mask (`torch.Tensor`):
892
- # A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
893
- # `(batch_size, 1, query_length, key_value_length)`.
894
- # sequence_length (`int`):
895
- # The sequence length being processed.
896
- # target_length (`int`):
897
- # The target length: when generating with static cache, the mask should be as long as the static cache,
898
- # to account for the 0 padding, the part of the cache that is not filled yet.
899
- # dtype (`torch.dtype`):
900
- # The dtype to use for the 4D attention mask.
901
- # device (`torch.device`):
902
- # The device to plcae the 4D attention mask on.
903
- # cache_position (`torch.Tensor`):
904
- # Indices depicting the position of the input sequence tokens in the sequence.
905
- # batch_size (`torch.Tensor`):
906
- # Batch size.
907
- # """
908
- # if attention_mask is not None and attention_mask.dim() == 4:
909
- # # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
910
- # causal_mask = attention_mask
911
- # else:
912
- # min_dtype = torch.finfo(dtype).min
913
- # causal_mask = torch.full(
914
- # (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
915
- # )
916
- # if sequence_length != 1:
917
- # causal_mask = torch.triu(causal_mask, diagonal=1)
918
- # causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
919
- # causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
920
- # if attention_mask is not None:
921
- # causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
922
- # mask_length = attention_mask.shape[-1]
923
- # padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
924
- # padding_mask = padding_mask == 0
925
- # causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
926
- # padding_mask, min_dtype
927
- # )
928
-
929
- # return causal_mask
930
 
931
 
932
  class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
 
1
  # coding=utf-8
2
+ # Copyright 2024 Jingze Shi and the HuggingFace Inc. team. All rights reserved.
3
  #
4
  # This code is based on the Wonderful Matrices paper implementation.
5
  #
6
+ # https://arxiv.org/abs/2412.11834
7
  #
8
  # Licensed under the Apache License, Version 2.0 (the "License");
9
  # you may not use this file except in compliance with the License.
 
39
  from transformers.utils import (
40
  add_start_docstrings,
41
  add_start_docstrings_to_model_forward,
 
42
  logging,
43
  replace_return_docstrings,
44
  )
45
  from .configuration_doge import DogeConfig
46
 
47
+ try:
48
+ from einx import add as einx_add
49
+ except ImportError:
50
+ einx_add = None
51
 
52
 
53
  logger = logging.get_logger(__name__)
 
75
  return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
76
 
77
 
78
+ class Residual(nn.Module):
79
+ def __init__(self, hidden_size):
80
+ super().__init__()
81
+ self.weight = nn.Parameter(torch.ones(hidden_size))
82
+
83
+ def forward(self, residual_states, hidden_states):
84
+ return self.weight * residual_states + hidden_states
85
+
86
+ def extra_repr(self):
87
+ return f"{tuple(self.weight.shape)}"
88
+
89
+
90
  class RotaryEmbedding(nn.Module):
91
  def __init__(self, config: Optional[DogeConfig] = None):
92
  super().__init__()
 
183
  return q_embed, k_embed
184
 
185
 
186
+ class DogeDynamicMaskAttention(nn.Module):
187
+ """Dynamic Mask Attention from 'Wonderful Matrices' paper."""
188
 
189
  def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None):
190
  super().__init__()
 
200
 
201
  self.hidden_dim = config.hidden_size
202
  self.num_attention_heads = config.num_attention_heads
203
+ self.attention_dropout = config.attention_dropout
 
204
  self.attention_head_dim = self.hidden_dim // self.num_attention_heads
 
 
 
 
205
 
206
+ # Q K V O projections
207
  self.q_proj = nn.Linear(
208
  self.hidden_dim,
209
  self.num_attention_heads * self.attention_head_dim,
 
214
  self.num_attention_heads * self.attention_head_dim,
215
  bias=config.hidden_bias,
216
  )
 
217
  # dynamic mask for the QK^T attention score matrix
218
+ self.A = nn.Parameter(
219
+ torch.ones(self.num_attention_heads)
220
  )
221
+ self.dt_proj = nn.Linear(
 
 
222
  self.hidden_dim,
223
+ self.num_attention_heads,
224
  bias=config.hidden_bias,
225
  )
226
+ self.v_proj = nn.Linear(
227
  self.hidden_dim,
228
+ self.num_attention_heads * self.attention_head_dim,
229
+ bias=config.hidden_bias,
230
  )
 
231
  self.o_proj = nn.Linear(
232
  self.hidden_dim,
233
  self.hidden_dim,
234
  bias=config.hidden_bias,
235
  )
236
 
237
  def forward(
238
  self,
239
  hidden_states: torch.Tensor,
 
244
  position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
245
  **kwargs,
246
  ) -> Tuple[torch.Tensor, Optional[Cache]]:
247
+ bsz, q_len, _ = hidden_states.shape
248
 
249
  query_states = self.q_proj(hidden_states)
250
  key_states = self.k_proj(hidden_states)
251
+ value_states = self.v_proj(hidden_states)
252
 
253
+ query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
254
  1, 2
255
  )
256
+ key_states = key_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
257
  1, 2
258
  )
259
+ value_states = value_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
260
  1, 2
261
  )
262
 
263
  cos, sin = position_embeddings
264
+ query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
265
 
266
  if past_key_value is not None:
267
  # sin and cos are specific to RoPE models; cache_position needed for the static cache
 
272
  attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) / math.sqrt(self.attention_head_dim)
273
 
274
  # add mask to attention scores
275
+ if attention_mask is not None:
276
+ dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
277
+ dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
278
+ dynamic_mask = dynamic_mask < 1.0
279
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]].masked_fill(dynamic_mask[:, :, None, :], torch.finfo(hidden_states.dtype).min)
280
+ attn_weights = attn_weights + causal_mask
281
 
282
  # upcast attention scores to fp32
283
  attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
284
+ attn_weights = F.dropout(attn_weights, p=self.attention_dropout, training=self.training)
285
 
286
  # apply attention scores to value states
287
  attn_output = torch.matmul(attn_weights, value_states)
288
 
289
  attn_output = attn_output.transpose(1, 2).contiguous()
290
+ attn_output = attn_output.reshape(bsz, q_len, -1)
291
  attn_output = self.o_proj(attn_output)
292
 
293
  return attn_output, past_key_value
294
 
295
 
296
+ class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
297
+
298
+ def forward(
299
+ self,
300
+ hidden_states: torch.Tensor,
301
+ attention_mask: Optional[torch.Tensor] = None,
302
+ position_ids: Optional[torch.LongTensor] = None,
303
+ past_key_value: Optional[Cache] = None,
304
+ cache_position: Optional[torch.LongTensor] = None,
305
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
306
+ **kwargs,
307
+ ) -> Tuple[torch.Tensor, Optional[Cache]]:
308
+ bsz, q_len, _ = hidden_states.shape
309
+
310
+ query_states = self.q_proj(hidden_states)
311
+ key_states = self.k_proj(hidden_states)
312
+ value_states = self.v_proj(hidden_states)
313
+
314
+ query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
315
+ key_states = key_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
316
+ value_states = value_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
317
+
318
+ cos, sin = position_embeddings
319
+ query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
320
+
321
+ if past_key_value is not None:
322
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
323
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
324
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
325
+
326
+ if attention_mask is not None:
327
+ dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
328
+ dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
329
+ dynamic_mask = dynamic_mask < 1.0
330
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]].masked_fill(dynamic_mask[:, :, None, :], torch.finfo(hidden_states.dtype).min)
331
+
332
+ query_states = query_states.contiguous()
333
+ key_states = key_states.contiguous()
334
+ value_states = value_states.contiguous()
335
+
336
+ attn_output = F.scaled_dot_product_attention(
337
+ query_states,
338
+ key_states,
339
+ value_states,
340
+ attn_mask=causal_mask,
341
+ dropout_p=self.attention_dropout,
342
+ )
343
+
344
+ attn_output = attn_output.transpose(1, 2).contiguous()
345
+ attn_output = attn_output.view(bsz, q_len, -1)
346
+ attn_output = self.o_proj(attn_output)
347
+
348
+ return attn_output, past_key_value
349
+
350
+
351
+ DOGE_ATTENTION_CLASSES = {
352
+ "eager": DogeDynamicMaskAttention,
353
+ "sdpa": DogeSdpaDynamicMaskAttn,
354
+ }
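For intuition, `DogeDynamicMaskAttention` replaces the old learned `dynamic_mask` parameter with a data-dependent gate: `dt_proj` maps the value states to one scalar per head and position, `exp(A * softplus(dt))` makes it a positive gate, and key positions whose gate falls below 1 are masked out on top of the usual causal mask. A self-contained toy sketch of that eager-path masking (shapes and names are illustrative, not the repository code):

import torch
import torch.nn.functional as F

bsz, num_heads, seq_len, head_dim = 1, 4, 8, 16
hidden_dim = num_heads * head_dim

value_states = torch.randn(bsz, seq_len, hidden_dim)   # flattened per-head value states
A = torch.ones(num_heads)                              # learnable per-head scale, as in `self.A`
dt_proj = torch.nn.Linear(hidden_dim, num_heads)

dt_states = dt_proj(value_states)                                       # (bsz, seq_len, num_heads)
dynamic_mask = torch.exp(A * F.softplus(dt_states)).transpose(-1, -2)   # (bsz, num_heads, seq_len)
dropped = dynamic_mask < 1.0   # all False while A == 1; learned negative entries of A activate it

min_value = torch.finfo(torch.float32).min
causal = torch.triu(torch.full((seq_len, seq_len), min_value), diagonal=1)

attn_scores = torch.randn(bsz, num_heads, seq_len, seq_len) + causal
attn_scores = attn_scores.masked_fill(dropped[:, :, None, :], min_value)
attn_weights = F.softmax(attn_scores, dim=-1)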
355
+
356
+
357
+ class DogeMLP(nn.Module):
358
 
359
  def __init__(self, config: DogeConfig):
360
  super().__init__()
361
  self.hidden_dim = config.hidden_size
 
362
  self.intermediate_dim = config.intermediate_size
363
+ self.act_fn = ACT2FN[config.hidden_act]
364
 
365
+ self.gate_proj = nn.Linear(
366
+ self.hidden_dim,
367
+ self.intermediate_dim,
368
+ bias=config.hidden_bias,
369
+ )
 
370
  self.up_proj = nn.Linear(
371
  self.hidden_dim,
372
  self.intermediate_dim,
 
378
  bias=config.hidden_bias,
379
  )
380
 
381
+ def forward(
382
+ self,
383
+ hidden_states: torch.Tensor,
384
+ **kwargs,
385
+ ) -> torch.Tensor:
386
+ hidden_states = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
387
+ return hidden_states
388
+
389
+
390
+ class DogeCDMoE(DogeMLP):
391
+ """Cross Domain Mixture of Experts from 'Wonderful Matrices' paper."""
392
+
393
+ def __init__(self, config: DogeConfig):
394
+ super().__init__(config)
395
+ self.hidden_dim = config.hidden_size
396
+ self.act_fn = ACT2FN[config.hidden_act]
397
+
398
+ self.expert_retrieval_dim = config.expert_retrieval_size
399
+ self.num_cdmmoe_experts = config.num_cdmmoe_experts
400
+ self.num_cdmmoe_heads = config.num_cdmmoe_heads
401
+ self.num_cdmmoe_experts_per_head = config.num_cdmmoe_experts_per_head
402
+ self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
403
+
404
+ # queries and keys for retrieval experts
405
  self.queries = nn.Linear(
406
  self.hidden_dim,
407
+ self.num_cdmmoe_heads * self.expert_retrieval_dim,
408
  bias=False,
409
  )
 
410
  self.keys = nn.Parameter(
411
  torch.zeros(
412
  self.num_cdmmoe_heads,
413
  self.num_keys,
414
  2,
415
+ self.expert_retrieval_dim // 2,
416
  )
417
  )
418
 
419
+ # experts
420
+ self.down_embed = nn.Embedding(
421
  self.num_cdmmoe_experts,
422
  self.hidden_dim,
423
  )
 
431
  self,
432
  hidden_states: torch.Tensor,
433
  **kwargs,
434
+ ) -> torch.Tensor:
435
  bsz, seq_len, _ = hidden_states.shape
436
 
437
  # get similarity with queries and keys
 
439
  queries = queries.view(bsz, seq_len, 2, self.num_cdmmoe_heads, -1).permute(2, 0, 1, 3, 4)
440
  sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, self.keys)
441
 
442
+ # get experts with the highest similarity
443
  (scores_x, scores_y), (indices_x, indices_y) = sim.topk(self.num_cdmmoe_experts_per_head, dim=-1)
444
  if einx_add is not None:
445
  all_scores = einx_add("... i, ... j -> ... (i j)", scores_x, scores_y)
 
451
  all_indices = all_indices.view(*indices_x.shape[:-1], -1)
452
  scores, pk_indices = all_scores.topk(self.num_cdmmoe_experts_per_head, dim=-1)
453
  indices = all_indices.gather(-1, pk_indices)
 
 
454
  down_embed = self.down_embed(indices)
455
  up_embed = self.up_embed(indices)
456
 
457
+ # mix experts states with cross domain states
458
+ experts_weights = torch.einsum("b t d, b t h k d -> b t h k", hidden_states, down_embed)
459
+ experts_weights = self.act_fn(experts_weights) * scores.softmax(dim=-1)
460
  experts_states = torch.einsum("b t h k, b t h k d -> b t d", experts_weights, up_embed)
461
+ hidden_states = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states))
 
 
462
  hidden_states = hidden_states + experts_states
463
  return hidden_states
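The retrieval in `DogeCDMoE.forward` above is a product-key lookup: each query is split into two halves, each half is scored against `num_keys = sqrt(num_cdmmoe_experts)` sub-keys, and the per-half top-k results are combined into joint expert indices before a final top-k. A toy sketch of that index arithmetic with made-up sizes (illustrative only, not the repository code):

import math
import torch

num_experts, num_heads, retrieval_dim, top_k = 64, 2, 8, 4
num_keys = int(math.sqrt(num_experts))   # 8 sub-keys per half

bsz, seq_len, hidden_dim = 1, 3, 16
hidden_states = torch.randn(bsz, seq_len, hidden_dim)
queries_proj = torch.nn.Linear(hidden_dim, num_heads * retrieval_dim, bias=False)
keys = torch.randn(num_heads, num_keys, 2, retrieval_dim // 2)

queries = queries_proj(hidden_states).view(bsz, seq_len, 2, num_heads, -1).permute(2, 0, 1, 3, 4)
sim = torch.einsum("p b t h n, h k p n -> p b t h k", queries, keys)

(scores_x, scores_y), (indices_x, indices_y) = sim.topk(top_k, dim=-1)
all_scores = (scores_x.unsqueeze(-1) + scores_y.unsqueeze(-2)).flatten(-2)                # k*k candidate scores
all_indices = (indices_x.unsqueeze(-1) * num_keys + indices_y.unsqueeze(-2)).flatten(-2)  # joint expert ids
scores, positions = all_scores.topk(top_k, dim=-1)
expert_indices = all_indices.gather(-1, positions)   # (bsz, seq_len, num_heads, top_k)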
464
 
 
468
  super().__init__()
469
  self.hidden_dropout = config.hidden_dropout
470
 
471
+ self.pre_sequence_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
472
+ self.attn = DOGE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
473
+ self.post_sequence_residual = Residual(config.hidden_size)
474
+
475
+ self.pre_state_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
476
+ self.feed_forward = DogeMLP(config) if config.is_moe == False else DogeCDMoE(config)
477
+ self.post_state_residual = Residual(config.hidden_size)
478
 
479
  def forward(
480
  self,
 
513
 
514
  # sequence transformation
515
  residual = hidden_states
516
+ hidden_states = self.pre_sequence_layernorm(hidden_states)
517
  hidden_states, present_key_value = self.attn(
518
  hidden_states=hidden_states,
519
  attention_mask=attention_mask,
 
525
  )
526
  self_attn_weights = None
527
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
528
+ hidden_states = self.post_sequence_residual(residual, hidden_states)
529
 
530
  # state transformation
531
  residual = hidden_states
532
+ hidden_states = self.pre_state_layernorm(hidden_states)
533
  hidden_states = self.feed_forward(hidden_states)
534
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
535
+ hidden_states = self.post_state_residual(residual, hidden_states)
536
 
537
  outputs = (hidden_states,)
538
 
 
552
  supports_gradient_checkpointing = True
553
  _no_split_modules = ["DogeDecoderLayer"]
554
  _skip_keys_device_placement = ["past_key_values"]
555
+ _supports_sdpa = True
556
  _supports_cache_class = True
557
  _supports_quantized_cache = True
558
  _supports_static_cache = True
 
726
  if position_ids is None:
727
  position_ids = cache_position.unsqueeze(0)
728
 
729
+ causal_mask = self._update_causal_mask(
730
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
731
+ )
732
  hidden_states = inputs_embeds
733
 
734
  # create position embeddings to be shared across the decoder layers
 
737
  # decoder layers
738
  all_hidden_states = () if output_hidden_states else None
739
  all_self_attns = () if output_attentions else None
740
+ next_decoder_cache = None
741
 
742
  for decoder_layer in self.layers:
743
  if output_hidden_states:
 
747
  layer_outputs = self._gradient_checkpointing_func(
748
  decoder_layer.__call__,
749
  hidden_states,
750
+ causal_mask,
751
  position_ids,
752
  past_key_values,
753
  output_attentions,
 
758
  else:
759
  layer_outputs = decoder_layer(
760
  hidden_states,
761
+ attention_mask=causal_mask,
762
  position_ids=position_ids,
763
  past_key_value=past_key_values,
764
  output_attentions=output_attentions,
 
795
  attentions=all_self_attns,
796
  )
797
 
798
+ def _update_causal_mask(
799
+ self,
800
+ attention_mask: torch.Tensor = None,
801
+ input_tensor: torch.Tensor = None,
802
+ cache_position: torch.Tensor = None,
803
+ past_key_values: Cache = None,
804
+ output_attentions: bool = False,
805
+ ):
806
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
807
+ using_static_cache = isinstance(past_key_values, StaticCache)
808
+
809
+ dtype, device = input_tensor.dtype, input_tensor.device
810
+ sequence_length = input_tensor.shape[1]
811
+ if using_static_cache:
812
+ target_length = past_key_values.get_max_cache_shape()
813
+ else:
814
+ target_length = (
815
+ attention_mask.shape[-1]
816
+ if isinstance(attention_mask, torch.Tensor)
817
+ else past_seen_tokens + sequence_length + 1
818
+ )
819
+
820
+ # in case the provided `attention` mask is 2D, we generate a causal mask here (4D).
821
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
822
+ attention_mask=attention_mask,
823
+ sequence_length=sequence_length,
824
+ target_length=target_length,
825
+ dtype=dtype,
826
+ device=device,
827
+ cache_position=cache_position,
828
+ batch_size=input_tensor.shape[0],
829
+ )
830
+
831
+ return causal_mask
832
+
833
+ @staticmethod
834
+ def _prepare_4d_causal_attention_mask_with_cache_position(
835
+ attention_mask: torch.Tensor = None,
836
+ sequence_length: int = None,
837
+ target_length: int = None,
838
+ dtype: torch.dtype = None,
839
+ device: torch.device = None,
840
+ cache_position: torch.Tensor = None,
841
+ batch_size: int = None,
842
+ **kwargs,
843
+ ):
844
+ """
845
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
846
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
847
+
848
+ Args:
849
+ attention_mask (`torch.Tensor`):
850
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
851
+ `(batch_size, 1, query_length, key_value_length)`.
852
+ sequence_length (`int`):
853
+ The sequence length being processed.
854
+ target_length (`int`):
855
+ The target length: when generating with static cache, the mask should be as long as the static cache,
856
+ to account for the 0 padding, the part of the cache that is not filled yet.
857
+ dtype (`torch.dtype`):
858
+ The dtype to use for the 4D attention mask.
859
+ device (`torch.device`):
860
+ The device to plcae the 4D attention mask on.
861
+ cache_position (`torch.Tensor`):
862
+ Indices depicting the position of the input sequence tokens in the sequence.
863
+ batch_size (`torch.Tensor`):
864
+ Batch size.
865
+ """
866
+ if attention_mask is not None and attention_mask.dim() == 4:
867
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
868
+ causal_mask = attention_mask
869
+ else:
870
+ min_dtype = torch.finfo(dtype).min
871
+ causal_mask = torch.full(
872
+ (sequence_length, target_length),
873
+ fill_value=min_dtype, dtype=dtype, device=device,
874
+ )
875
+ if sequence_length != 1:
876
+ causal_mask = torch.triu(causal_mask, diagonal=1)
877
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
878
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
879
+ if attention_mask is not None:
880
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
881
+ mask_length = attention_mask.shape[-1]
882
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
883
+ padding_mask = padding_mask == 0
884
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
885
+ padding_mask, min_dtype
886
+ )
887
+
888
+ return causal_mask
889
 
890
 
891
  class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):