Crystalcareai committed: Update modeling_gemmoe.py

modeling_gemmoe.py CHANGED (+114 -31)
@@ -194,42 +194,54 @@ class GemmoeRMSNorm(nn.Module):
 
 ALL_LAYERNORM_LAYERS.append(GemmoeRMSNorm)
 
+class GemmoeRMSNorm(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.zeros(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        output = self._norm(x.float()).type_as(x)
+        return output * (self.weight + 1)
+
+ALL_LAYERNORM_LAYERS.append(GemmoeRMSNorm)
+
 class GemmoeRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
-
         self.dim = dim
         self.max_position_embeddings = max_position_embeddings
         self.base = base
-
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
-        )
-        self.max_seq_len_cached = None
-
+        self._set_cos_sin_cache(seq_len=max_position_embeddings, device=device, dtype=torch.get_default_dtype())
 
     def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
-
-
-
-
-
-
-
-
-
-
-
+        freq_exponents = (2.0 / self.dim) * (
+            torch.arange(self.dim // 2, dtype=torch.int64, device="cpu").float()
+        )
+        timescale = self.base ** freq_exponents
+        positions = torch.arange(self.max_seq_len_cached, device="cpu", dtype=torch.int64).float()
+        radians_new = positions[..., None] / timescale[None, None, :]
+        radians_new = radians_new.squeeze(0)
+        emb = torch.cat((radians_new, radians_new), dim=-1)
+        cos = emb.cos().to(device=device, non_blocking=True)
+        sin = emb.sin().to(device=device, non_blocking=True)
+        self.register_buffer("cos_cached", cos, persistent=False)
+        self.register_buffer("sin_cached", sin, persistent=False)
+
+    def forward(self, x, position_ids=None, seq_len=None):
+        if seq_len is None:
+            seq_len = x.size(2)
+        if seq_len > self.max_seq_len_cached:
             self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
         return (
-            self.cos_cached[:seq_len]
-            self.sin_cached[:seq_len]
+            self.cos_cached[:seq_len],
+            self.sin_cached[:seq_len],
         )
+
 
 class GemmoeLinearScalingRotaryEmbedding(GemmoeRotaryEmbedding):
     """GemmoeRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
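The hunk above does two things: it re-declares GemmoeRMSNorm with a zero-initialized weight applied as (weight + 1), and it rebuilds the rotary cos/sin caches eagerly from half-dimension frequencies (it also fixes the missing commas in the forward return tuple). Below is a minimal standalone sketch, with plain torch and toy sizes rather than the classes themselves, to sanity-check that arithmetic; nothing in it is part of the commit.

# Standalone sketch (not part of the commit): re-derives the math of the new GemmoeRMSNorm
# and GemmoeRotaryEmbedding cache so the hunk above can be checked without importing
# modeling_gemmoe.py. All sizes are toy values.
import torch

dim, eps = 8, 1e-6
x = torch.randn(2, 4, dim)

# GemmoeRMSNorm scales by (weight + 1) with weight initialized to zeros, so right after
# init it behaves as a plain RMS normalization.
weight = torch.zeros(dim)
normed = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
assert torch.allclose(normed * (weight + 1), normed)

# Rotary cache as built by the new _set_cos_sin_cache: dim // 2 timescales, angles
# duplicated so that cos_cached / sin_cached end up with shape (seq_len, dim).
base, max_pos = 10000, 16
freq_exponents = (2.0 / dim) * torch.arange(dim // 2, dtype=torch.int64).float()
timescale = base ** freq_exponents
positions = torch.arange(max_pos, dtype=torch.int64).float()
radians = (positions[..., None] / timescale[None, None, :]).squeeze(0)
emb = torch.cat((radians, radians), dim=-1)
cos_cached, sin_cached = emb.cos(), emb.sin()
assert cos_cached.shape == sin_cached.shape == (max_pos, dim)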
@@ -948,17 +960,78 @@ GEMMOE_ATTENTION_CLASSES = {
     "sdpa": GemmoeSdpaAttention,
 }
 
+class GemmoeBlockSparseTop2MLP(nn.Module):
+    def __init__(self, config: GemmoeConfig):
+        super().__init__()
+        self.ffn_dim = config.intermediate_size
+        self.hidden_dim = config.hidden_size
+
+        self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
+        self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
+        self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
+
+        self.act_fn = approx_gelu
+
+    def forward(self, hidden_states):
+        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
+        current_hidden_states = self.w2(current_hidden_states)
+        return current_hidden_states
+
+class GemmoeSparseMoeBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size
+        self.num_experts = config.num_local_experts
+        self.top_k = 2
+
+        # gating
+        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+
+        self.experts = nn.ModuleList([GemmoeBlockSparseTop2MLP(config) for _ in range(self.num_experts)])
+
+    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.gate(hidden_states)
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False)
+        topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
+
+        # we cast back to the input dtype
+        topk_weight = topk_weight.to(hidden_states.dtype)
+
+        hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
+
+        y = torch.empty_like(hidden_states)
+
+        flat_topk_idx = topk_idx.view(-1)
+        for i in range(self.num_experts):
+            expert = self.experts[i]
+            expert_output = expert(hidden_states[flat_topk_idx == i])
+            y[flat_topk_idx == i] = expert_output.to(y.dtype)  # Cast expert_output to the same dtype as y
+
+        y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
+
+        final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
+
 
 class GemmoeDecoderLayer(nn.Module):
     def __init__(self, config: GemmoeConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-
         self.self_attn = GEMMOE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
 
-
-
-
+        if config.n_routed_experts is not None and \
+           layer_idx >= config.first_k_dense_replace and \
+           layer_idx % config.moe_layer_freq == 0:
+            self.block_sparse_moe = GemmoeSparseMoeBlock(config)
+        else:
+            self.mlp = GemmoeMLP(config)
+
         self.input_layernorm = GemmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = GemmoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
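GemmoeSparseMoeBlock above routes each token to its top-2 experts, renormalizes the two gate weights, duplicates the token stream with repeat_interleave, and recombines the expert outputs as a weighted sum. Here is a toy sketch of just that routing arithmetic; identity functions stand in for GemmoeBlockSparseTop2MLP, and nothing below comes from the commit itself.

# Toy sketch of the top-2 routing used by GemmoeSparseMoeBlock. Shapes and variable
# names mirror the hunk above; the data is random and the "experts" are identities so
# the recombination is easy to verify.
import torch
import torch.nn.functional as F

num_experts, top_k, hidden_dim = 4, 2, 3
tokens = torch.randn(5, hidden_dim)            # flattened (batch * seq_len, hidden_dim)
router_logits = torch.randn(5, num_experts)    # what self.gate(hidden_states) would return

routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
topk_weight, topk_idx = torch.topk(routing_weights, top_k, dim=-1, sorted=False)
topk_weight = topk_weight / topk_weight.sum(dim=-1, keepdim=True)  # renormalize over the 2 chosen experts

# Every token is duplicated top_k times, sent to its chosen experts, then recombined.
expanded = tokens.repeat_interleave(top_k, dim=0)
y = torch.empty_like(expanded)
flat_topk_idx = topk_idx.view(-1)
for i in range(num_experts):
    y[flat_topk_idx == i] = expanded[flat_topk_idx == i]   # identity "expert"
combined = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)

# With identity experts and gate weights renormalized to sum to 1, routing is a no-op.
assert combined.shape == tokens.shape
assert torch.allclose(combined, tokens, atol=1e-6)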
@@ -969,6 +1042,7 @@ class GemmoeDecoderLayer(nn.Module):
         position_ids: Optional[torch.LongTensor] = None,
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         output_attentions: Optional[bool] = False,
+        output_router_logits: Optional[bool] = False,
         use_cache: Optional[bool] = False,
         **kwargs,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
@@ -985,13 +1059,15 @@ class GemmoeDecoderLayer(nn.Module):
                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                 (see `past_key_values`).
             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            output_router_logits (`bool`, *optional*):
+                Whether or not to return the logits of all the routers. They are useful for computing the router loss,
+                and should not be returned during inference.
         """
         if "padding_mask" in kwargs:
             warnings.warn(
                 "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
             )
         residual = hidden_states
-
         hidden_states = self.input_layernorm(hidden_states)
 
         # Self Attention
@@ -1009,7 +1085,12 @@ class GemmoeDecoderLayer(nn.Module):
         # Fully Connected
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
-
+
+        if hasattr(self, 'block_sparse_moe'):
+            hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+        else:
+            hidden_states = self.mlp(hidden_states)
+
         hidden_states = residual + hidden_states
 
         outputs = (hidden_states,)
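Which branch the hasattr(self, 'block_sparse_moe') dispatch above takes is fixed at construction time by the __init__ condition added earlier in this diff. A small sketch with hypothetical config values (the four numbers are made up, not taken from the Gemmoe config) shows which layer indices end up on the MoE path:

# Sketch of the MoE placement rule from the new GemmoeDecoderLayer.__init__.
# All four values below are hypothetical stand-ins for the config fields.
n_routed_experts = 8        # stands in for config.n_routed_experts
first_k_dense_replace = 1   # stands in for config.first_k_dense_replace
moe_layer_freq = 2          # stands in for config.moe_layer_freq
num_hidden_layers = 8       # stands in for config.num_hidden_layers

for layer_idx in range(num_hidden_layers):
    use_moe = (
        n_routed_experts is not None
        and layer_idx >= first_k_dense_replace
        and layer_idx % moe_layer_freq == 0
    )
    print(layer_idx, "block_sparse_moe" if use_moe else "mlp")
# -> layers 2, 4 and 6 get the sparse MoE block; 0, 1, 3, 5 and 7 keep the dense GemmoeMLP
#    (layer 0 is excluded by first_k_dense_replace even though 0 % moe_layer_freq == 0).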
@@ -1019,10 +1100,12 @@ class GemmoeDecoderLayer(nn.Module):
 
         if use_cache:
             outputs += (present_key_value,)
+
+        if output_router_logits and hasattr(self, 'block_sparse_moe'):
+            outputs += (router_logits,)
 
         return outputs
 
-
 GEMMOE_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads