gugarosa committed
Commit 0ef07d7
1 parent: d931c54

Update modeling_phi3.py

Files changed (1)
  1. modeling_phi3.py +18 -28
modeling_phi3.py CHANGED
@@ -25,6 +25,7 @@ import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
@@ -43,9 +44,9 @@ from transformers.utils import (
     logging,
     replace_return_docstrings,
 )
-
 from .configuration_phi3 import Phi3Config
 
+
 logger = logging.get_logger(__name__)
 
 # Transformers scans dependencies in the modeling file, causing issues on conditional loading. The regex only ignores try/catch blocks, but not if statements
@@ -86,7 +87,7 @@ PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [
 
 # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3
 class Phi3RMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-5):
+    def __init__(self, hidden_size, eps=1e-6):
         """
         Phi3RMSNorm is equivalent to T5LayerNorm
         """
@@ -120,7 +121,7 @@ def _get_unpad_data(attention_mask):
 
 # Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Phi3
 class Phi3RotaryEmbedding(nn.Module):
-    def __init__(self, dim, max_position_embeddings=4096, base=10000, device=None):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
 
         self.dim = dim
@@ -228,7 +229,6 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
 
@@ -608,7 +608,7 @@ class Phi3FlashAttention2(Phi3Attention):
 
         return attn_output, attn_weights, past_key_value
 
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
+    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward
     def _flash_attention_forward(
         self,
         query_states,
@@ -650,14 +650,9 @@ class Phi3FlashAttention2(Phi3Attention):
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
-            (
-                query_states,
-                key_states,
-                value_states,
-                indices_q,
-                cu_seq_lens,
-                max_seq_lens,
-            ) = self._upad_input(query_states, key_states, value_states, attention_mask, query_length)
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
 
             cu_seqlens_q, cu_seqlens_k = cu_seq_lens
             max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
@@ -687,10 +682,7 @@ class Phi3FlashAttention2(Phi3Attention):
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
                 causal=causal,
-                window_size=(
-                    self.config.sliding_window,
-                    self.config.sliding_window,
-                ),
+                window_size=(self.config.sliding_window, self.config.sliding_window),
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
@@ -712,15 +704,12 @@ class Phi3FlashAttention2(Phi3Attention):
                 dropout,
                 softmax_scale=softmax_scale,
                 causal=causal,
-                window_size=(
-                    self.config.sliding_window,
-                    self.config.sliding_window,
-                ),
+                window_size=(self.config.sliding_window, self.config.sliding_window),
             )
 
         return attn_output
 
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
+    # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
 
@@ -737,8 +726,7 @@ class Phi3FlashAttention2(Phi3Attention):
 
         if query_length == kv_seq_len:
             query_layer = index_first_axis(
-                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim),
-                indices_k,
+                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
             )
             cu_seqlens_q = cu_seqlens_k
             max_seqlen_in_batch_q = max_seqlen_in_batch_k
@@ -1233,7 +1221,7 @@ class Phi3Model(Phi3PreTrainedModel):
 class Phi3ForCausalLM(Phi3PreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
-    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3,bias=False->bias=True
+    # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3
     def __init__(self, config):
         super().__init__(config)
         self.model = Phi3Model(config)
@@ -1439,7 +1427,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
     """,
     PHI3_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3 with self.transformer->self.model, transformer_outputs->model_outputs
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Phi3, LLAMA->PHI3, self.transformer->self.model, transformer_outputs->model_outputs
 class Phi3ForSequenceClassification(Phi3PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
@@ -1555,7 +1543,7 @@ class Phi3ForSequenceClassification(Phi3PreTrainedModel):
     """,
     PHI3_START_DOCSTRING,
 )
-# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,self.transformer->self.model,transformer_outputs->model_outputs
+# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with Mpt->Phi3,MPT->PHI3,self.transformer->self.model,transformer_outputs->model_outputs
 class Phi3ForTokenClassification(Phi3PreTrainedModel):
     def __init__(self, config: Phi3Config):
         super().__init__(config)
@@ -1622,7 +1610,9 @@ class Phi3ForTokenClassification(Phi3PreTrainedModel):
             labels = labels.to(logits.device)
             batch_size, seq_length = labels.shape
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length))
+            loss = loss_fct(
+                logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length)
+            )
 
         if not return_dict:
             output = (logits,) + model_outputs[2:]
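Note for readers skimming the diff: the hunks above either reflow code, re-point "Copied from" markers at the Mistral/Llama sources, or restore upstream signature defaults (RMS-norm eps back to 1e-6, rotary max_position_embeddings back to 2048). As a reading aid only, and not part of the commit, here is a minimal functional sketch of the RMS-norm computation that the eps default feeds into; it omits the float32 upcast the real Phi3RMSNorm performs, and in the model the value normally comes from config.rms_norm_eps rather than the signature default.

    import torch

    def rms_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # Root-mean-square normalization as in Phi3RMSNorm / T5LayerNorm: scale by the
        # reciprocal RMS over the last dimension; eps keeps the rsqrt finite for all-zero rows.
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        return weight * hidden_states * torch.rsqrt(variance + eps)

    x = torch.randn(2, 4, 8)
    y = rms_norm(x, torch.ones(8))  # same shape as x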
 
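The flash-attention hunks collapse the _upad_input unpacking and the window_size tuple onto single lines without changing behaviour. As context only (again, not from the commit), a small self-contained sketch of the unpadding bookkeeping those helpers rely on, mirroring _get_unpad_data from the same file: a padding mask becomes flattened token indices plus cumulative sequence lengths for the varlen flash-attention kernel, and window_size=(sliding_window, sliding_window) then bounds how far each token may attend on either side.

    import torch
    import torch.nn.functional as F

    # Two right-padded sequences of lengths 3 and 2 (toy example).
    attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)             # tensor([3, 2])
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # positions of real tokens
    max_seqlen_in_batch = int(seqlens_in_batch.max())                            # 3
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # tensor([0, 3, 5])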
 
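The final hunk only rewraps the token-classification loss call. The reshape it performs is the usual way to feed per-token logits to CrossEntropyLoss, which expects (N, C) inputs and (N,) targets; a toy illustration with made-up shapes (not from the commit):

    import torch
    from torch.nn import CrossEntropyLoss

    batch_size, seq_length, num_labels = 2, 5, 3  # toy shapes
    logits = torch.randn(batch_size, seq_length, num_labels)
    labels = torch.randint(0, num_labels, (batch_size, seq_length))

    # Fold batch and sequence dimensions together before the call.
    loss = CrossEntropyLoss()(
        logits.view(batch_size * seq_length, num_labels),
        labels.view(batch_size * seq_length),
    )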