x54-729 committed on
Commit f3a2a3f
Parent(s): c823b34

Update modeling_internlm.py

Files changed (1)
  1. modeling_internlm.py +197 -105
modeling_internlm.py CHANGED
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Copyright (c) InternLM. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
@@ -19,26 +19,40 @@
 # limitations under the License.
 """ PyTorch InternLM model."""
 import math
+import queue
+import threading
 from typing import List, Optional, Tuple, Union
-import threading, queue
 
 import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-
 from transformers.activations import ACT2FN
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
 from transformers.modeling_utils import PreTrainedModel
-from transformers.generation.streamers import BaseStreamer
-from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
-from .configuration_internlm import InternLMConfig
+from transformers.utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+
+try:
+    from transformers.generation.streamers import BaseStreamer
+except:  # noqa # pylint: disable=bare-except
+    BaseStreamer = None
 
+from .configuration_internlm import InternLMConfig
 
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "InternLMConfig"
 
+
 # Copied from transformers.models.bart.modeling_bart._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
@@ -71,17 +85,10 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
 
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 class InternLMRMSNorm(nn.Module):
+    """RMSNorm implemention."""
+
     def __init__(self, hidden_size, eps=1e-6):
         """
         InternLMRMSNorm is equivalent to T5LayerNorm
@@ -102,6 +109,15 @@ class InternLMRMSNorm(nn.Module):
 
 
 class InternLMRotaryEmbedding(torch.nn.Module):
+    """Implement InternLM's rotary embedding.
+
+    Args:
+        dim (int): Characteristic dimension of each self-attentional head.
+        max_position_embeddings (int, optional): Model's training length. Defaults to 2048.
+        base (int, optional): The rotation position encodes the rotation Angle base number. Defaults to 10000.
+        device (Any, optional): Running device. Defaults to None.
+    """
+
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
@@ -113,8 +129,8 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+        self.register_buffer("cos_cached", emb.cos().to(torch.float32), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(torch.float32), persistent=False)
 
     def forward(self, x, seq_len=None):
         # x: [bs, num_attention_heads, seq_len, head_size]
@@ -125,11 +141,71 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+        self.register_buffer("cos_cached", emb.cos(), persistent=False)
+        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+        return (
+            self.cos_cached[:seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:seq_len, ...].to(dtype=x.dtype),
+        )
+
+
+class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
+    """Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
+
+    Args:
+        dim (int): Characteristic dimension of each self-attentional head.
+        max_position_embeddings (int, optional): Model's training length. Defaults to 2048.
+        base (int, optional): The rotation position encodes the rotation Angle base number. Defaults to 10000.
+        device (Any, optional): Running device. Defaults to None.
+        scaling_factor (float, optional): NTK method extrapolation coefficient. Defaults to 1.0.
+    """
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.dim = dim
+        self.base = base
+        self.scaling_factor = scaling_factor
+
+        # Build here to make `torch.jit.trace` work.
+        self.max_position_embeddings = max_position_embeddings
+        self.max_seq_len_cached = max_position_embeddings
+        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos(), persistent=False)
+        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+
+    def _update_cached(self, x, seq_len=None):
+        self.max_seq_len_cached = max(seq_len, self.max_position_embeddings)
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
+        else:
+            inv_freq = self.inv_freq
+        t = torch.arange(self.max_seq_len_cached, device=inv_freq.device, dtype=inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos(), persistent=False)
+        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+        if seq_len <= self.max_position_embeddings:
+            # Reset the tables if the sequence length has changed,
+            if self.max_seq_len_cached > self.max_position_embeddings:
+                self._update_cached(x, seq_len)
+        else:
+            self._update_cached(x, seq_len)
+
         return (
-            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.cos_cached[:seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:seq_len, ...].to(dtype=x.dtype),
         )
 
 
@@ -139,15 +215,23 @@ def rotate_half(x):
     x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
-
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
-    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
-    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
-    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
-    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
+    if position_ids.size(1) == 1:
+        q_cos = cos[position_ids].unsqueeze(1).expand(q.shape)
+        q_sin = sin[position_ids].unsqueeze(1).expand(q.shape)
+        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
+
+        position_ids = position_ids.flatten() + 1
+        max_length = max(position_ids)
+        position_ids = torch.stack([torch.cat([torch.ones(max_length - w, dtype=torch.long), torch.arange(w)]) for w in position_ids])
+        k_cos = cos[position_ids].unsqueeze(1).expand(k.shape)
+        k_sin = sin[position_ids].unsqueeze(1).expand(k.shape)
+        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
+    else:
+        cos = cos[position_ids].unsqueeze(1).expand(q.shape)
+        sin = sin[position_ids].unsqueeze(1).expand(q.shape)
+        q_embed = (q * cos) + (rotate_half(q) * sin)
+        k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
 
 
@@ -177,8 +261,6 @@ class InternLMAttention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
-        self.num_key_value_heads = config.num_key_value_heads
-        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         self.max_position_embeddings = config.max_position_embeddings
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
@@ -187,10 +269,28 @@ class InternLMAttention(nn.Module):
                 f" and `num_heads`: {self.num_heads})."
             )
         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
-        self.rotary_emb = InternLMRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
+        self.rotary_emb = self._init_rope()
+
+    def _init_rope(self):
+        if self.config.rotary["type"] == "origin":
+            self.rotary_emb = InternLMRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.config.rotary["base"],
+            )
+        elif self.config.rotary["type"] == "dynamic":
+            self.rotary_emb = InternLMDynamicNTKScalingRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.config.rotary["base"],
+                scaling_factor=self.config.rotary.get("scaling_factor", 1.0),
+            )
+        else:
+            raise ValueError("Currently we only support rotary embedding's type being one of ('origin', 'dynamic').")
+        return self.rotary_emb
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -207,15 +307,8 @@ class InternLMAttention(nn.Module):
         bsz, q_len, _ = hidden_states.size()
 
         query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-        # [bsz, nh, t, hd]
+        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
 
         if past_key_value is not None:
             # reuse k, v, self_attention
@@ -224,8 +317,9 @@ class InternLMAttention(nn.Module):
 
         past_key_value = (key_states, value_states) if use_cache else None
 
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        kv_seq_len = key_states.shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
@@ -336,11 +430,9 @@ INTERNLM_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
-
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.
-
    Parameters:
        config ([`InternLMConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -381,44 +473,34 @@ INTERNLM_INPUTS_DOCSTRING = r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.
-
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
-
            [What are attention masks?](../glossary#attention-mask)
-
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.
-
            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
-
            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
-
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.
-
            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
+            when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
@@ -447,10 +529,10 @@ INTERNLM_INPUTS_DOCSTRING = r"""
 class InternLMModel(InternLMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLMDecoderLayer`]
-
     Args:
         config: InternLMConfig
     """
+
     _auto_class = "AutoModel"
 
     def __init__(self, config: InternLMConfig):
@@ -676,20 +758,14 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
        Returns:
-
        Example:
-
        ```python
        >>> from transformers import AutoTokenizer, InternLMForCausalLM
-
        >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
        >>> prompt = "Hey, are you consciours? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -780,55 +856,73 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_past
 
-    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = []):
+    def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
        prompt = ""
+        if meta_instruction:
+            prompt += f"""<s><|System|>:{meta_instruction}\n"""
+        else:
+            prompt += "<s>"
        for record in history:
-            prompt += f"""<|User|>:{record[0]}<eoh>\n<|Bot|>:{record[1]}<eoa>\n"""
-        prompt += f"""<|User|>:{query}<eoh>\n<|Bot|>:"""
+            prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}<eoa>\n"""
+        prompt += f"""<|User|>:{query}\n<|Bot|>:"""
        return tokenizer([prompt], return_tensors="pt")
-
+
    @torch.no_grad()
-    def chat(self,
-             tokenizer,
-             query: str,
-             history: List[Tuple[str, str]] = [],
-             streamer: Optional[BaseStreamer] = None,
-             max_new_tokens: int = 1024,
-             do_sample: bool = True,
-             temperature: float = 0.8,
-             top_p: float = 0.8,
-             **kwargs):
-        inputs = self.build_inputs(tokenizer, query, history)
+    def chat(
+        self,
+        tokenizer,
+        query: str,
+        history: List[Tuple[str, str]] = [],
+        streamer: Optional[BaseStreamer] = None,
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
+        "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
+        "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.",
+        **kwargs,
+    ):
+        inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
        inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
-        outputs = self.generate(**inputs,
-                                streamer=streamer,
-                                max_new_tokens=max_new_tokens,
-                                do_sample=do_sample,
-                                temperature=temperature,
-                                top_p=top_p,
-                                **kwargs)
-        outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]):]
+        outputs = self.generate(
+            **inputs,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=do_sample,
+            temperature=temperature,
+            top_p=top_p,
+            **kwargs,
+        )
+        outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
        response = tokenizer.decode(outputs, skip_special_tokens=True)
        response = response.split("<eoa>")[0]
        history = history + [(query, response)]
        return response, history
-
+
    @torch.no_grad()
-    def stream_chat(self,
-                    tokenizer,
-                    query: str,
-                    history: List[Tuple[str, str]] = [],
-                    max_new_tokens: int = 1024,
-                    do_sample: bool = True,
-                    temperature: float = 0.8,
-                    top_p: float = 0.8,
-                    **kwargs):
+    def stream_chat(
+        self,
+        tokenizer,
+        query: str,
+        history: List[Tuple[str, str]] = [],
+        max_new_tokens: int = 1024,
+        do_sample: bool = True,
+        temperature: float = 0.8,
+        top_p: float = 0.8,
+        **kwargs,
+    ):
        """
        Return a generator in format: (response, history)
        Eg.
        ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
        ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
        """
+        if BaseStreamer is None:
+            raise ModuleNotFoundError(
+                "The version of `transformers` is too low. Please make sure "
+                "that you have installed `transformers>=4.28.0`."
+            )
 
        response_queue = queue.Queue(maxsize=20)
 
@@ -868,12 +962,12 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                tokenizer=tokenizer,
                query=query,
                streamer=ChatStreamer(tokenizer=tokenizer),
-                history=history,
+                history=history,
                max_new_tokens=max_new_tokens,
                do_sample=do_sample,
                temperature=temperature,
                top_p=top_p,
-                **kwargs
+                **kwargs,
            )
 
        def consumer():
@@ -891,10 +985,8 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
 @add_start_docstrings(
     """
     The InternLM Model transformer with a sequence classification head on top (linear layer).
-
     [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
-
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
@@ -1007,4 +1099,4 @@ class InternLMForSequenceClassification(InternLMPreTrainedModel):
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
-        )
+        )
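
Below is a minimal usage sketch (not part of the commit) for the refactored chat interface above: `chat()` now builds the `<s><|System|>:` / `<|User|>:` / `<|Bot|>:` prompt itself through `build_inputs()` and accepts an overridable `meta_instruction`, while `stream_chat()` yields `(response, history)` pairs and raises `ModuleNotFoundError` when the installed `transformers` does not provide `BaseStreamer`. The checkpoint id is a placeholder assumption; any repository that ships this modeling file via `trust_remote_code` should expose the same methods.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "internlm/internlm-chat-7b"  # placeholder checkpoint id; substitute your own
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = (
    AutoModelForCausalLM.from_pretrained(path, trust_remote_code=True, torch_dtype=torch.float16)
    .cuda()
    .eval()
)

# Single-turn call: the default meta_instruction (the InternLM system prompt above) is used.
response, history = model.chat(tokenizer, "hello", history=[])
print(response)

# Streaming call: each iteration yields the partial response plus the updated history.
for partial_response, history in model.stream_chat(tokenizer, "what can you do?", history=history):
    print(partial_response)
```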
 
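As a worked example of the NTK extrapolation path selected by `_init_rope()` when `config.rotary["type"] == "dynamic"`, the snippet below (not part of the commit) reproduces the base-rescaling formula from `InternLMDynamicNTKScalingRotaryEmbedding._update_cached()`. The `dim`, `base`, and `max_position_embeddings` values mirror the defaults in the code above; the sequence lengths are arbitrary.

```python
dim = 128                       # head_dim passed to the rotary embedding (assumed value)
base = 10000.0                  # default rotary base, as in config.rotary["base"]
max_position_embeddings = 2048  # trained context length
scaling_factor = 1.0            # config.rotary.get("scaling_factor", 1.0)

for seq_len in (2048, 4096, 16384):
    if seq_len > max_position_embeddings:
        # Same rescaling as _update_cached(): grow the rotary base with the context length.
        scaled_base = base * (
            (scaling_factor * seq_len / max_position_embeddings) - (scaling_factor - 1)
        ) ** (dim / (dim - 2))
    else:
        scaled_base = base
    print(f"seq_len={seq_len:>6}  rotary base={scaled_base:,.1f}")
```

Within the trained length the base (and therefore the cached cos/sin tables) is unchanged; beyond it, only the "dynamic" variant rescales the base and rebuilds the cache instead of reading past the end of the original tables.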