Crystalcareai committed
Commit • cfc4ccd
Parent(s): 45f7601
Update modeling_gemmoe.py

modeling_gemmoe.py CHANGED (+18 -48)
@@ -221,33 +221,16 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None):
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
 
-class GemmoeMLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
-
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
-
-        self.act_fn = approx_gelu
-
-    def forward(self, x):
-        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-
 def repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-
-
-
-
-
-
-
-
-
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 class GemmoeAttention(nn.Module):
     """
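Reviewer note on the restored helper above: repeat_kv mirrors torch.repeat_interleave(x, dim=1, repeats=n_rep) and is what lets grouped-query attention reuse a small set of key/value heads across all query heads. A minimal standalone sketch (plain PyTorch; the example shapes and the dropped self parameter are assumptions of this sketch, not part of the file):

import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads * n_rep, seqlen, head_dim)
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)

# 2 key/value heads shared by 8 query heads -> n_rep = 8 // 2 = 4
kv = torch.randn(1, 2, 5, 16)
expanded = repeat_kv(kv, n_rep=4)
assert expanded.shape == (1, 8, 5, 16)
assert torch.equal(expanded, torch.repeat_interleave(kv, repeats=4, dim=1))

The expand step stays a view, and only the final reshape materializes memory, which is typically cheaper than calling repeat_interleave directly.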
@@ -569,17 +552,7 @@ class GemmoeSdpaAttention(GemmoeAttention):
     GemmoeAttention as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
     SDPA API.
     """
-
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
+
     def forward(
         self,
         hidden_states: torch.Tensor,
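For context on the class being cleaned up here: the GemmoeSdpaAttention docstring says only the forward pass is adapted to the SDPA API, i.e. the (repeat_kv-expanded) key/value tensors are handed to torch.nn.functional.scaled_dot_product_attention. A rough, hypothetical illustration of that call pattern (not the file's actual forward; repeat_interleave stands in for repeat_kv and the shapes are made up):

import torch
import torch.nn.functional as F

def sdpa_attention_sketch(query, key, value, n_rep, attention_mask=None):
    # query: (batch, num_attention_heads, q_len, head_dim)
    # key / value: (batch, num_key_value_heads, kv_len, head_dim)
    key = torch.repeat_interleave(key, repeats=n_rep, dim=1)      # stand-in for repeat_kv
    value = torch.repeat_interleave(value, repeats=n_rep, dim=1)
    # SDPA picks a flash / memory-efficient / math kernel internally
    return F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask)

q = torch.randn(1, 8, 5, 16)
k = torch.randn(1, 2, 5, 16)
v = torch.randn(1, 2, 5, 16)
assert sdpa_attention_sketch(q, k, v, n_rep=4).shape == (1, 8, 5, 16)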
@@ -670,10 +643,12 @@ class GemmoeBlockSparseTop2MLP(nn.Module):
         super().__init__()
         self.ffn_dim = config.intermediate_size
         self.hidden_dim = config.hidden_size
+
         self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
         self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
         self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
-
+
+        self.act_fn = approx_gelu
 
     def forward(self, hidden_states):
         current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
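The restored self.act_fn = approx_gelu completes the gated expert MLP, which computes w2(act_fn(w1(x)) * w3(x)). A self-contained sketch of that computation (the class name, the sizes, and nn.GELU(approximate="tanh") as a stand-in for approx_gelu are assumptions of this sketch):

import torch
import torch.nn as nn

class ExpertMLPSketch(nn.Module):
    # gated expert: w2(act_fn(w1(x)) * w3(x))
    def __init__(self, hidden_dim: int, ffn_dim: int):
        super().__init__()
        self.w1 = nn.Linear(hidden_dim, ffn_dim, bias=False)  # gate projection
        self.w2 = nn.Linear(ffn_dim, hidden_dim, bias=False)  # down projection
        self.w3 = nn.Linear(hidden_dim, ffn_dim, bias=False)  # up projection
        self.act_fn = nn.GELU(approximate="tanh")             # stand-in for approx_gelu

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.w2(self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states))

expert = ExpertMLPSketch(hidden_dim=32, ffn_dim=64)
tokens = torch.randn(4, 32)  # (num_routed_tokens, hidden_dim)
assert expert(tokens).shape == (4, 32)

In the surrounding GemmoeSparseMoeBlock, Mixtral-style top-2 routing sends each token through blocks like this and scales the expert outputs by the router weights.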
@@ -734,20 +709,14 @@ class GemmoeSparseMoeBlock(nn.Module):
 
 
 class GemmoeDecoderLayer(nn.Module):
-    """
-    Decoder layer for the Gemmoe model.
-
-    Args:
-        config (GemmoeConfig): The configuration object for the Gemmoe model.
-        layer_idx (int): The index of the layer.
-    """
     def __init__(self, config: GemmoeConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-
-        self.
+
+        self.self_attn = GEMMOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+
         self.block_sparse_moe = GemmoeSparseMoeBlock(config)
-        self.input_layernorm = GemmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_layernorm = GemmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = GemmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
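The re-added self.self_attn line follows the usual transformers pattern of looking up the attention implementation in a class mapping keyed by config._attn_implementation (eager / sdpa / flash_attention_2). A hedged sketch of that dispatch with placeholder classes (GEMMOE_ATTENTION_CLASSES itself is defined elsewhere in the file; the stub names and DummyConfig below are illustrative only):

import torch.nn as nn

class EagerAttentionStub(nn.Module):
    # placeholder for GemmoeAttention / GemmoeSdpaAttention / GemmoeFlashAttention2
    def __init__(self, config, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx

class SdpaAttentionStub(EagerAttentionStub):
    pass

ATTENTION_CLASSES = {"eager": EagerAttentionStub, "sdpa": SdpaAttentionStub}

class DecoderLayerSketch(nn.Module):
    def __init__(self, config, layer_idx: int):
        super().__init__()
        # the attention backend is chosen once, at construction time, from the config flag
        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

class DummyConfig:
    _attn_implementation = "sdpa"

layer = DecoderLayerSketch(DummyConfig(), layer_idx=0)
assert isinstance(layer.self_attn, SdpaAttentionStub)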
@@ -901,6 +870,7 @@ class GemmoeModel(GemmoePreTrainedModel):
         self.layers = nn.ModuleList(
             [GemmoeDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
+
         self.norm = GemmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False