Crystalcareai committed on
Commit
45f7601
1 Parent(s): 2de5917

Update modeling_gemmoe.py

Files changed (1)
  1. modeling_gemmoe.py +76 -80
modeling_gemmoe.py CHANGED
@@ -65,9 +65,82 @@ logger = logging.get_logger(__name__)
 
_CONFIG_FOR_DOC = "GemmoeConfig"
 
- class GemmoeDistributedDataParallel(nn.parallel.DistributedDataParallel):
-     def __init__(self, model, **kwargs):
-         super().__init__(model, find_unused_parameters=True, **kwargs)
+ def load_balancing_loss_func(
+     gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
+ ) -> float:
+     r"""
+     Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+     See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
+     function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+     experts is too unbalanced.
+
+     Args:
+         gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
+             Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+             shape [batch_size X sequence_length, num_experts].
+         attention_mask (`torch.Tensor`, None):
+             The attention_mask used in forward function
+             shape [batch_size X sequence_length] if not None.
+         num_experts (`int`, *optional*):
+             Number of experts
+
+     Returns:
+         The auxiliary loss.
+     """
+     if gate_logits is None or not isinstance(gate_logits, tuple):
+         return 0
+
+     if isinstance(gate_logits, tuple):
+         compute_device = gate_logits[0].device
+         concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+
+     routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
+
+     _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+
+     expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+     if attention_mask is None:
+         # Compute the percentage of tokens routed to each experts
+         tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+
+         # Compute the average probability of routing to these experts
+         router_prob_per_expert = torch.mean(routing_weights, dim=0)
+     else:
+         batch_size, sequence_length = attention_mask.shape
+         num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+
+         # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+         expert_attention_mask = (
+             attention_mask[None, :, :, None, None]
+             .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+             .reshape(-1, top_k, num_experts)
+             .to(compute_device)
+         )
+
+         # Compute the percentage of tokens routed to each experts
+         tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+             expert_attention_mask, dim=0
+         )
+
+         # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+         router_per_expert_attention_mask = (
+             attention_mask[None, :, :, None]
+             .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+             .reshape(-1, num_experts)
+             .to(compute_device)
+         )
+
+         # Compute the average probability of routing to these experts
+         router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+             router_per_expert_attention_mask, dim=0
+         )
+
+     overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+     return overall_loss * num_experts
+
+
 
def approx_gelu(x):
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x**3)))
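For reference, a minimal sketch of exercising the relocated, now module-level load_balancing_loss_func. The import path is an assumption (the file ships as remote code, so the module is normally loaded via trust_remote_code), and the sizes are dummies; with uniform router logits the auxiliary loss collapses to top_k, which makes a quick sanity check.

import torch
from modeling_gemmoe import load_balancing_loss_func  # assumed import path

num_layers, tokens, num_experts, top_k = 4, 8, 8, 2
# One router-logit tensor per layer, each of shape [batch_size * sequence_length, num_experts].
gate_logits = tuple(torch.zeros(tokens, num_experts) for _ in range(num_layers))

aux_loss = load_balancing_loss_func(gate_logits, num_experts=num_experts, top_k=top_k)
print(aux_loss)  # tensor(2.) -- uniform routing probabilities give exactly top_k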
@@ -164,76 +237,6 @@ class GemmoeMLP(nn.Module):
    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 
-     def load_balancing_loss_func(
-         self,
-         gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
-     ) -> float:
-         r"""
-         Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
-
-         See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
-         function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
-         experts is too unbalanced.
-
-         Args:
-             gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
-                 Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
-                 shape [batch_size X sequence_length, num_experts].
-             attention_mask (`torch.Tensor`, None):
-                 The attention_mask used in forward function
-                 shape [batch_size X sequence_length] if not None.
-             num_experts (`int`, *optional*):
-                 Number of experts
-
-         Returns:
-             The auxiliary loss.
-         """
-         if gate_logits is None or not isinstance(gate_logits, tuple):
-             return 0
-
-         if isinstance(gate_logits, tuple):
-             compute_device = gate_logits[0].device
-             concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
-
-         routing_weights = torch.nn.functional.softmax(concatenated_gate_logits, dim=-1)
-         _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
-         expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
-
-         if attention_mask is None:
-             # Compute the percentage of tokens routed to each experts
-             tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
-             # Compute the average probability of routing to these experts
-             router_prob_per_expert = torch.mean(routing_weights, dim=0)
-         else:
-             batch_size, sequence_length = attention_mask.shape
-             num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
-
-             # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
-             expert_attention_mask = (
-                 attention_mask[None, :, :, None, None]
-                 .expand((num_hidden_layers, batch_size, sequence_length, 2, num_experts))
-                 .reshape(-1, 2, num_experts)
-                 .to(compute_device)
-             )
-             # Compute the percentage of tokens routed to each experts
-             tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
-                 expert_attention_mask, dim=0
-             )
-
-             # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
-             router_per_expert_attention_mask = (
-                 attention_mask[None, :, :, None]
-                 .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
-                 .reshape(-1, num_experts)
-                 .to(compute_device)
-             )
-             # Compute the average probability of routing to these experts
-             router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
-                 router_per_expert_attention_mask, dim=0
-             )
-
-         overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
-         return overall_loss * num_experts
 
def repeat_kv(self, hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
@@ -1153,13 +1156,6 @@ class GemmoeForCausalLM(GemmoePreTrainedModel):
        # Initialize weights and apply final processing
        self.post_init()
 
-     def parallelize(self, device_map=None):
-         self.model = GemmoeDistributedDataParallel(
-             self.model,
-             device_ids=[torch.cuda.current_device()],
-             output_device=torch.cuda.current_device(),
-         )
-
    def get_input_embeddings(self):
        return self.model.embed_tokens
 
 
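With parallelize() and the GemmoeDistributedDataParallel subclass removed, multi-GPU training is left to the training script. A minimal sketch of the conventional replacement, assuming the process group is already initialised (e.g. by torchrun) and model is an instantiated GemmoeForCausalLM:

import torch
from torch.nn.parallel import DistributedDataParallel as DDP

local_rank = torch.cuda.current_device()
ddp_model = DDP(
    model.to(local_rank),
    device_ids=[local_rank],
    output_device=local_rank,
    find_unused_parameters=True,  # the flag the removed subclass hard-coded
)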