chenlin committed
Commit • a285f80
1 Parent(s): 60d2df3

fix import

Files changed: modeling_InternLM.py  +44 -53

modeling_InternLM.py  CHANGED
@@ -2,12 +2,10 @@ import math
 from typing import List, Union
 from typing import Optional, Tuple
 
-import rotary_emb
 import torch
 import torch.utils.checkpoint
 import torch.utils.checkpoint
 from einops import rearrange
-from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers.activations import ACT2FN
@@ -23,51 +21,70 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "InternLMXComposerConfig"
 
 
-
-
-
-
+def rotary_embed(x1, x2, cos, sin, conj):
+    x1, x2 = x1.float(), x2.float()
+    if conj:
+        x1, x2 = x1 * cos + x2 * sin, x1 * sin + x2 * cos
+    else:
+        x1, x2 = x1 * cos - x2 * sin, x1 * sin + x2 * cos
+    return x1, x2
+
+
+class LegacyApplyRotaryEmbQKV_(torch.autograd.Function):
+
     @staticmethod
-    def forward(ctx, qkv, cos, sin, cos_k=None, sin_k=None):
+    def forward(ctx, qkv, cos, sin, cos_k=None, sin_k=None, interleaved=False):
         """
-        qkv: (
+        qkv: (batch_size, seqlen, 3, nheads, headdim)
         cos, sin: (seqlen, rotary_dim / 2)
         cos_k, sin_k: (seqlen, rotary_dim / 2), optional
+        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead of
+            1st half and 2nd half (GPT-NeoX style).
         rotary_dim must be <= headdim
         Apply rotary embedding *inplace* to the first rotary_dim of q and k.
         """
-
+        batch, seqlen, three, nheads, headdim = qkv.shape
         assert three == 3
         rotary_seqlen, rotary_dim = cos.shape
         rotary_dim *= 2
         assert rotary_dim <= headdim
+        assert seqlen <= rotary_seqlen
         cos_k = cos if cos_k is None else cos_k
        sin_k = sin if sin_k is None else sin_k
-        assert sin.shape == cos_k.shape == sin_k.shape == (rotary_seqlen,
-
-        q1, q2 =
-        rotary_emb.apply_rotary(q1, q2, rearrange(cos,
-
-
-
-
-
+        assert sin.shape == cos_k.shape == sin_k.shape == (rotary_seqlen, rotary_dim // 2)
+        q_ro = qkv[:, :, 0, :, :rotary_dim]
+        q1, q2 = q_ro.chunk(2, dim=-1) if not interleaved else (q_ro[..., ::2], q_ro[..., 1::2])
+        # rotary_emb.apply_rotary(q1, q2, rearrange(cos[:seqlen], 's d -> s 1 d'),
+        #                         rearrange(sin[:seqlen], 's d -> s 1 d'), q1, q2, False)
+        q1, q2 = rotary_embed(q1, q2, rearrange(cos[:seqlen], 's d -> s 1 d'), rearrange(sin[:seqlen], 's d -> s 1 d'), False)
+        qkv[:, :, 0, :, :rotary_dim] = torch.cat([q1, q2], dim=-1)
+        k_ro = qkv[:, :, 1, :, :rotary_dim]
+        k1, k2 = k_ro.chunk(2, dim=-1) if not interleaved else (k_ro[..., ::2], k_ro[..., 1::2])
+        # rotary_emb.apply_rotary(k1, k2, rearrange(cos_k[:seqlen], 's d -> s 1 d'),
+        #                         rearrange(sin_k[:seqlen], 's d -> s 1 d'), k1, k2, False)
+        k1, k2 = rotary_embed(k1, k2, rearrange(cos_k[:seqlen], 's d -> s 1 d'), rearrange(sin_k[:seqlen], 's d -> s 1 d'), False)
+        qkv[:, :, 1, :, :rotary_dim] = torch.cat([k1, k2], dim=-1)
         ctx.save_for_backward(cos, sin, cos_k, sin_k)
+        ctx.interleaved = interleaved
         return qkv
 
     @staticmethod
     def backward(ctx, dqkv):
         cos, sin, cos_k, sin_k = ctx.saved_tensors
+        _, seqlen, _, _, headdim = dqkv.shape
         rotary_dim = cos.shape[-1]
         rotary_dim *= 2
-
-
-
-
-
-
-
-
+        dq_ro = dqkv[:, :, 0, :, :rotary_dim]
+        dq1, dq2 = (dq_ro.chunk(2, dim=-1) if not ctx.interleaved
+                    else (dq_ro[..., ::2], dq_ro[..., 1::2]))
+        rotary_emb.apply_rotary(dq1, dq2, rearrange(cos[:seqlen], 's d -> s 1 d'),
+                                rearrange(sin[:seqlen], 's d -> s 1 d'), dq1, dq2, True)
+        dk_ro = dqkv[:, :, 1, :, :rotary_dim]
+        dk1, dk2 = (dk_ro.chunk(2, dim=-1) if not ctx.interleaved
+                    else (dk_ro[..., ::2], dk_ro[..., 1::2]))
+        rotary_emb.apply_rotary(dk1, dk2, rearrange(cos_k[:seqlen], 's d -> s 1 d'),
+                                rearrange(sin_k[:seqlen], 's d -> s 1 d'), dk1, dk2, True)
+        return dqkv, None, None, None, None, None
 
 
 class ConvertedInternLMRotaryEmbedding(torch.nn.Module):
@@ -120,23 +137,6 @@ class ConvertedInternLMRotaryEmbedding(torch.nn.Module):
         self._cos_k_cached = (torch.cos(freqs) / scale).to(x.dtype)
         self._sin_k_cached = (torch.sin(freqs) / scale).to(x.dtype)
 
-    def forward(self,
-                qkv: torch.Tensor,
-                indexes=0) -> Tuple[torch.Tensor, torch.Tensor]:
-        self._update_cos_sin_cache(qkv, indexes)
-        if self.scale is None:
-            return apply_rotary_emb_qkv_(qkv, self._cos_cached[indexes],
-                                         self._sin_cached[indexes]).to(
-                                             qkv.dtype)
-        else:
-            return apply_rotary_emb_qkv_(
-                qkv,
-                self._cos_cached[indexes],
-                self._sin_cached[indexes],
-                self._cos_k_cached[indexes],
-                self._sin_k_cached[indexes],
-            ).to(qkv.dtype)
-
     def eval_forward(self, qkv, seqlen_offset=0):
         """
         seqlen_offset: can be used in generation where the qkv being passed in is only the last
@@ -157,7 +157,6 @@ class ConvertedInternLMRotaryEmbedding(torch.nn.Module):
         )
 
 
-apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
 legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
 
 
@@ -487,7 +486,6 @@ class InternLMPreTrainedModel(PreTrainedModel):
 class InternLMModel(InternLMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLMDecoderLayer`]
-
     Args:
         config: InternLMXComposerConfig
     """
@@ -631,7 +629,7 @@ class InternLMModel(InternLMPreTrainedModel):
             past_key_value = past_key_values[
                 idx] if past_key_values is not None else None
 
-            if self.gradient_checkpointing and self.training
+            if self.gradient_checkpointing and self.training:
 
                 def create_custom_forward(module):
                     def custom_forward(*inputs):
@@ -696,7 +694,6 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
             setattr(config, 'kqvo_bias', config.kqvo_bias)
         else:
             setattr(config, 'kqvo_bias', False)
-
         self.model = InternLMModel(config)
 
         self.lm_head = nn.Linear(config.hidden_size,
@@ -762,20 +759,14 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import AutoTokenizer, InternLMForCausalLM
-
        >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
        >>> prompt = "Hey, are you consciours? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]