suayptalha
/

minGRU-LM

@@ -8,6 +8,26 @@ from .configuration_minGRULM import MinGRULMConfig
 from minGRU_pytorch.minGRULM import minGRULM
 class MinGRULMPreTrainedModel(PreTrainedModel):
     config_class = MinGRULMConfig
     base_model_prefix = "model"
@@ -28,24 +48,27 @@ class MinGRULMForCausalLM(MinGRULMPreTrainedModel):
     def __init__(self, config: MinGRULMConfig):
         super().__init__(config)
-        # Load model from minGRULM library
-        self.model = minGRULM(
             num_tokens=config.vocab_size,
             dim=config.d_model,
             depth=config.n_layer,
             ff_mult=config.ff_mult,
-            min_gru_expansion=config.expand,
             enable_conv=config.enable_conv,
         )
         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
         self.post_init()
     def get_input_embeddings(self):
-        return self.model.token_emb
     def set_input_embeddings(self, value):
-        self.model.token_emb = value
     def get_output_embeddings(self):
         return self.lm_head
@@ -56,7 +79,7 @@ class MinGRULMForCausalLM(MinGRULMPreTrainedModel):
         labels: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = True,
     ):
-        # Forward pass through the model
         logits = self.model(input_ids)
         loss = None
@@ -75,4 +98,4 @@ class MinGRULMForCausalLM(MinGRULMPreTrainedModel):
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
-        )

 from minGRU_pytorch.minGRULM import minGRULM
+# Wrapper class for device compatibility
class MinGRULMWrapped(nn.Module):
    """Wrap a minGRULM model so its tensor inputs follow the model's device.

    The target device is read from the wrapped model's own parameters (or
    buffers) every time it is needed instead of being cached.  A cached value
    goes stale whenever the module is moved by anything other than this
    class's own ``to`` -- ``.cuda()``, ``.cpu()``, or a parent module's
    ``.to()`` all go through ``nn.Module._apply`` and never call a child's
    ``to`` -- which would make ``forward`` send inputs to the wrong device.

    Args:
        min_gru_model: The ``nn.Module`` to wrap.  It is registered as the
            ``min_gru_model`` submodule.
    """

    def __init__(self, min_gru_model):
        super().__init__()
        self.min_gru_model = min_gru_model
        # Only consulted when the wrapped model owns no parameters or buffers.
        self._fallback_device = torch.device("cpu")

    @property
    def device(self):
        """Return the device of the wrapped model's first parameter or buffer.

        Falls back to the last explicitly requested device (CPU by default)
        when the wrapped model holds no tensors at all.
        """
        for tensor in self.min_gru_model.parameters():
            return tensor.device
        for tensor in self.min_gru_model.buffers():
            return tensor.device
        return self._fallback_device

    @device.setter
    def device(self, value):
        # Kept so that existing code assigning ``wrapper.device`` still works;
        # the value is only used when the wrapped model has no tensors.
        self._fallback_device = torch.device(value)

    def forward(self, *args, **kwargs):
        """Move tensor arguments to the model's device and call the model.

        Non-tensor positional and keyword arguments are passed through
        untouched.  Returns whatever the wrapped model returns.
        """
        target = self.device
        args = [arg.to(target) if isinstance(arg, torch.Tensor) else arg for arg in args]
        kwargs = {
            key: value.to(target) if isinstance(value, torch.Tensor) else value
            for key, value in kwargs.items()
        }
        return self.min_gru_model(*args, **kwargs)

    def to(self, *args, **kwargs):
        """Move and/or cast the module; accepts everything ``nn.Module.to`` does.

        Delegating to ``nn.Module.to`` keeps dtype-only calls such as
        ``to(torch.float16)`` from being mistaken for a device, and supports
        the ``to(device, dtype)`` and keyword forms.

        Returns:
            ``self``, to allow call chaining.
        """
        # Let nn.Module validate the arguments and move the registered
        # submodule before anything is recorded.
        moved = super().to(*args, **kwargs)
        requested = kwargs.get("device")
        if requested is None:
            requested = next(
                (arg for arg in args if isinstance(arg, (str, torch.device))),
                None,
            )
        if isinstance(requested, (str, torch.device)):
            self._fallback_device = torch.device(requested)
        return moved
 class MinGRULMPreTrainedModel(PreTrainedModel):
     config_class = MinGRULMConfig
     base_model_prefix = "model"
     def __init__(self, config: MinGRULMConfig):
         super().__init__(config)
+        # Load model from minGRULM library and wrap it
+        raw_min_gru = minGRULM(
             num_tokens=config.vocab_size,
             dim=config.d_model,
             depth=config.n_layer,
             ff_mult=config.ff_mult,
+            min_gru_expansion=config.min_gru_expansion,
             enable_conv=config.enable_conv,
         )
+        self.model = MinGRULMWrapped(raw_min_gru)
+        # Language modeling head
         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
         self.post_init()
     def get_input_embeddings(self):
+        return self.model.min_gru_model.token_emb
     def set_input_embeddings(self, value):
+        self.model.min_gru_model.token_emb = value
     def get_output_embeddings(self):
         return self.lm_head
         labels: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = True,
     ):
+        # Forward pass through the wrapped model
         logits = self.model(input_ids)
         loss = None
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
+        )