x54-729 committed
Commit: d5770d1
Parent(s): 28a9aa9

Fix batch generation

Files changed (1):
  1. modeling_internlm.py +162 -57
modeling_internlm.py CHANGED
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Copyright (c) InternLM. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
@@ -19,14 +19,14 @@
 # limitations under the License.
 """ PyTorch InternLM model."""
 import math
+import queue
+import threading
 from typing import List, Optional, Tuple, Union
-import threading, queue
 
 import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-
 from transformers.activations import ACT2FN
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -34,15 +34,19 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutputWithPast,
 )
 from transformers.modeling_utils import PreTrainedModel
-from transformers.generation.streamers import BaseStreamer
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     logging,
     replace_return_docstrings,
 )
-from .configuration_internlm import InternLMConfig
 
+try:
+    from transformers.generation.streamers import BaseStreamer
+except:  # noqa # pylint: disable=bare-except
+    BaseStreamer = None
+
+from .configuration_internlm import InternLMConfig
 
 logger = logging.get_logger(__name__)
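The guarded import above lets the module load on older `transformers` releases where `transformers.generation.streamers` does not exist; `stream_chat` later checks for the `None` fallback before streaming. A minimal sketch of the same pattern, with a helper name that is illustrative and not part of this file:

```python
# Sketch of the guarded optional-import pattern used above (assumption: the caller,
# not the import site, is the right place to fail when streaming is requested).
try:
    from transformers.generation.streamers import BaseStreamer
except ImportError:  # transformers < 4.28 has no streamers module
    BaseStreamer = None


def require_streamer():
    """Hypothetical helper: raise only when streaming is actually used."""
    if BaseStreamer is None:
        raise ModuleNotFoundError("streaming requires transformers>=4.28.0")
    return BaseStreamer
```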
 
@@ -82,7 +86,20 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
 
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
 class InternLMRMSNorm(nn.Module):
+    """RMSNorm implemention."""
+
     def __init__(self, hidden_size, eps=1e-6):
         """
         InternLMRMSNorm is equivalent to T5LayerNorm
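`repeat_kv` introduced above is the usual grouped-query attention helper: key/value tensors produced by `num_key_value_heads` heads are tiled `n_rep` times along the head axis so every query head has a matching key/value head. A small shape check with illustrative sizes (8 KV heads serving 32 query heads), not taken from any InternLM config:

```python
import torch


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Same logic as the helper added in the hunk above.
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


kv = torch.randn(2, 8, 16, 128)            # (batch, num_key_value_heads, seq_len, head_dim)
expanded = repeat_kv(kv, n_rep=4)
assert expanded.shape == (2, 32, 16, 128)  # (batch, num_attention_heads, seq_len, head_dim)
# Heads are repeated contiguously: query heads 0..3 read KV head 0, 4..7 read KV head 1, ...
assert torch.equal(expanded[:, 0], expanded[:, 3])
assert torch.equal(expanded[:, 4], kv[:, 1])
```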
@@ -103,10 +120,19 @@ class InternLMRMSNorm(nn.Module):
 
 
 class InternLMRotaryEmbedding(torch.nn.Module):
+    """Implement InternLM's rotary embedding.
+
+    Args:
+        dim (int): Characteristic dimension of each self-attentional head.
+        max_position_embeddings (int, optional): Model's training length. Defaults to 2048.
+        base (int, optional): The rotation position encodes the rotation Angle base number. Defaults to 10000.
+        device (Any, optional): Running device. Defaults to None.
+    """
+
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
-        self.register_buffer("inv_freq", inv_freq)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
 
         # Build here to make `torch.jit.trace` work.
         self.max_seq_len_cached = max_position_embeddings
@@ -114,8 +140,8 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         freqs = torch.einsum("i,j->ij", t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+        self.register_buffer("cos_cached", emb.cos().to(torch.float32), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(torch.float32), persistent=False)
 
     def forward(self, x, seq_len=None):
         # x: [bs, num_attention_heads, seq_len, head_size]
@@ -126,11 +152,71 @@ class InternLMRotaryEmbedding(torch.nn.Module):
             freqs = torch.einsum("i,j->ij", t, self.inv_freq)
             # Different from paper, but it uses a different permutation in order to obtain the same calculation
             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-            self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
-            self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
+            self.register_buffer("cos_cached", emb.cos(), persistent=False)
+            self.register_buffer("sin_cached", emb.sin(), persistent=False)
+        return (
+            self.cos_cached[:seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:seq_len, ...].to(dtype=x.dtype),
+        )
+
+
+class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
+    """Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
+
+    Args:
+        dim (int): Characteristic dimension of each self-attentional head.
+        max_position_embeddings (int, optional): Model's training length. Defaults to 2048.
+        base (int, optional): The rotation position encodes the rotation Angle base number. Defaults to 10000.
+        device (Any, optional): Running device. Defaults to None.
+        scaling_factor (float, optional): NTK method extrapolation coefficient. Defaults to 1.0.
+    """
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.dim = dim
+        self.base = base
+        self.scaling_factor = scaling_factor
+
+        # Build here to make `torch.jit.trace` work.
+        self.max_position_embeddings = max_position_embeddings
+        self.max_seq_len_cached = max_position_embeddings
+        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos(), persistent=False)
+        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+
+    def _update_cached(self, x, seq_len=None):
+        self.max_seq_len_cached = max(seq_len, self.max_position_embeddings)
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
+        else:
+            inv_freq = self.inv_freq
+        t = torch.arange(self.max_seq_len_cached, device=inv_freq.device, dtype=inv_freq.dtype)
+        freqs = torch.einsum("i,j->ij", t, inv_freq)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos(), persistent=False)
+        self.register_buffer("sin_cached", emb.sin(), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+        if seq_len <= self.max_position_embeddings:
+            # Reset the tables if the sequence length has changed,
+            if self.max_seq_len_cached > self.max_position_embeddings:
+                self._update_cached(x, seq_len)
+        else:
+            self._update_cached(x, seq_len)
+
         return (
-            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.cos_cached[:seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:seq_len, ...].to(dtype=x.dtype),
         )
 
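`InternLMDynamicNTKScalingRotaryEmbedding` above changes one thing relative to the plain rotary embedding: once the requested length exceeds the training length it recomputes the frequency base with the dynamic-NTK formula and rebuilds the cos/sin cache. The sketch below isolates that base adjustment; `dim=128` and a 2048-token training length are assumptions for illustration, not values read from a checkpoint:

```python
def ntk_scaled_base(seq_len, dim=128, base=10000.0, max_position_embeddings=2048, scaling_factor=1.0):
    """Mirrors the base update in `_update_cached` above."""
    if seq_len <= max_position_embeddings:
        return base  # within the training length the original base is kept
    return base * (
        (scaling_factor * seq_len / max_position_embeddings) - (scaling_factor - 1)
    ) ** (dim / (dim - 2))


for n in (2048, 4096, 8192, 16384):
    print(n, round(ntk_scaled_base(n)))
# The base, and with it every RoPE wavelength, grows as the requested context grows,
# which is what lets the same weights attend over sequences longer than they were trained on.
```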
 
@@ -140,15 +226,23 @@ def rotate_half(x):
     x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
-
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
-    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
-    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
-    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
-    cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-    sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
+    if position_ids.size(1) == 1:
+        q_cos = cos[position_ids].unsqueeze(1).expand(q.shape)
+        q_sin = sin[position_ids].unsqueeze(1).expand(q.shape)
+        q_embed = (q * q_cos) + (rotate_half(q) * q_sin)
+
+        position_ids = position_ids.flatten() + 1
+        max_length = max(position_ids)
+        position_ids = torch.stack([torch.cat([torch.ones(max_length - w, dtype=torch.long), torch.arange(w)]) for w in position_ids])
+        k_cos = cos[position_ids].unsqueeze(1).expand(k.shape)
+        k_sin = sin[position_ids].unsqueeze(1).expand(k.shape)
+        k_embed = (k * k_cos) + (rotate_half(k) * k_sin)
+    else:
+        cos = cos[position_ids].unsqueeze(1).expand(q.shape)
+        sin = sin[position_ids].unsqueeze(1).expand(q.shape)
+        q_embed = (q * cos) + (rotate_half(q) * sin)
+        k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed
 
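This rewrite of `apply_rotary_pos_emb` is the heart of the batch-generation fix. During incremental decoding `position_ids` has shape `(batch, 1)` and holds each sample's own current position, so the `q_len == 1` branch rotates the single new query at that position and then rebuilds a per-sample position vector for the cached keys: left-padding slots get a dummy position (they are masked anyway) and each sample's real tokens get `0..w-1`. A toy version of that key-position remapping, with made-up prompt lengths:

```python
import torch

# Decode step for a left-padded batch whose real lengths so far are 5, 3 and 1 tokens.
position_ids = torch.tensor([[4], [2], [0]])   # 0-based position of the new token per sample

w = (position_ids.flatten() + 1).tolist()      # real token counts: [5, 3, 1]
max_length = max(w)                            # padded key length shared by the batch
key_positions = torch.stack(
    [torch.cat([torch.ones(max_length - n, dtype=torch.long), torch.arange(n)]) for n in w]
)
print(key_positions)
# tensor([[0, 1, 2, 3, 4],      no padding: positions 0..4
#         [1, 1, 0, 1, 2],      2 pad slots (dummy position 1), then 0..2
#         [1, 1, 1, 1, 0]])     4 pad slots, the single real token sits at position 0
# Because the cache now stores un-rotated keys (see the attention hunks below), each sample's
# keys can be rotated with its own positions, so left padding no longer shifts the RoPE phase.
```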
 
@@ -178,6 +272,8 @@ class InternLMAttention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         self.max_position_embeddings = config.max_position_embeddings
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
@@ -186,10 +282,31 @@ class InternLMAttention(nn.Module):
                 f" and `num_heads`: {self.num_heads})."
             )
         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
-        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
-        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.bias)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
-        self.rotary_emb = InternLMRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
+        self.rotary_emb = self._init_rope()
+
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = InternLMRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.config.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "dynamic":
+                self.rotary_emb = InternLMDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    base=self.config.rope_theta,
+                    scaling_factor=scaling_factor,
+                )
+            else:
+                raise ValueError("Currently we only support rotary embedding's type being 'dynamic'.")
+        return self.rotary_emb
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
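`_init_rope` above chooses the embedding class from the model config, so the attention module now depends on config fields that `configuration_internlm.py` must provide: `num_key_value_heads`, `rope_theta`, and an optional `rope_scaling` dict. A hedged sketch of what that configuration side could look like; the concrete numbers are placeholders, not values from any released checkpoint:

```python
# Illustrative config fields consumed by the new attention code (values are examples only).
config_kwargs = {
    "num_attention_heads": 32,
    "num_key_value_heads": 32,   # == num_attention_heads, so repeat_kv is a no-op here
    "rope_theta": 10000,         # forwarded to the rotary embedding as `base`
    "rope_scaling": {"type": "dynamic", "factor": 1.0},  # None keeps InternLMRotaryEmbedding;
                                                         # any other type raises ValueError above
}
```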
@@ -206,15 +323,12 @@ class InternLMAttention(nn.Module):
         bsz, q_len, _ = hidden_states.size()
 
         query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-        # [bsz, nh, t, hd]
+        key_states = (
+            self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        )
+        value_states = (
+            self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        )
 
         if past_key_value is not None:
             # reuse k, v, self_attention
@@ -223,6 +337,13 @@
 
         past_key_value = (key_states, value_states) if use_cache else None
 
+        kv_seq_len = key_states.shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
         if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
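Note the reordering above: the new key/value states are concatenated with the cache first, and only then are the rotary embeddings applied over the full `kv_seq_len` and the KV heads expanded with `repeat_kv`. Storing un-rotated keys is what allows `apply_rotary_pos_emb` to re-derive per-sample key positions on every step. A shape walk-through of one decode step with illustrative sizes (not a reimplementation of the forward pass):

```python
import torch

bsz, n_heads, n_kv_heads, head_dim = 2, 32, 32, 128
past_len, q_len = 7, 1

query_states = torch.randn(bsz, n_heads, q_len, head_dim)
key_states = torch.randn(bsz, n_kv_heads, q_len, head_dim)     # new key, not yet rotated
past_key = torch.randn(bsz, n_kv_heads, past_len, head_dim)    # cached keys, also un-rotated

key_states = torch.cat([past_key, key_states], dim=2)          # "reuse k", as in the context above
kv_seq_len = key_states.shape[-2]                              # 8 -> cos/sin must cover all 8 positions
key_states = key_states.repeat_interleave(n_heads // n_kv_heads, dim=1)  # stand-in for repeat_kv
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / head_dim**0.5
assert attn_weights.shape == (bsz, n_heads, q_len, kv_seq_len)
```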
@@ -332,11 +453,9 @@ INTERNLM_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
-
     Parameters:
         config ([`InternLMConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -377,44 +496,34 @@ INTERNLM_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
-
             [What are attention masks?](../glossary#attention-mask)
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
             `past_key_values`).
-
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
-
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
-
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
+            when `config.use_cache=True`):
             Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
             `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
             `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
             Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
             If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
             don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
             `decoder_input_ids` of shape `(batch_size, sequence_length)`.
@@ -443,7 +552,6 @@ INTERNLM_INPUTS_DOCSTRING = r"""
 class InternLMModel(InternLMPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLMDecoderLayer`]
-
     Args:
         config: InternLMConfig
     """
@@ -673,20 +781,14 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import AutoTokenizer, InternLMForCausalLM
-
         >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
         >>> prompt = "Hey, are you consciours? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
-
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -776,7 +878,7 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         for layer_past in past_key_values:
             reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
         return reordered_past
-
+
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
         prompt = ""
         if meta_instruction:
@@ -839,6 +941,11 @@
         ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
         ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
         """
+        if BaseStreamer is None:
+            raise ModuleNotFoundError(
+                "The version of `transformers` is too low. Please make sure "
+                "that you have installed `transformers>=4.28.0`."
+            )
 
         response_queue = queue.Queue(maxsize=20)
 
@@ -891,7 +998,7 @@
         producer.start()
         while True:
             res = response_queue.get()
-            if res is not None:
+            if res is None:
                 return
             yield res
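The single-condition change above (`is not None` becomes `is None`) is the actual streaming fix: the producer thread pushes text chunks into `response_queue` and finally a `None` sentinel, so the consumer must stop on the sentinel and yield everything else, not the other way round. A self-contained sketch of that contract with illustrative names:

```python
import queue
import threading


def produce(q: queue.Queue) -> None:
    for chunk in ("Hello", " there", "!"):
        q.put(chunk)
    q.put(None)  # sentinel: generation finished


def consume(q: queue.Queue):
    while True:
        res = q.get()
        if res is None:   # the fixed check: stop on the sentinel ...
            return
        yield res         # ... and yield every real chunk


q = queue.Queue(maxsize=20)
threading.Thread(target=produce, args=(q,)).start()
print(list(consume(q)))   # ['Hello', ' there', '!']
# With the old `if res is not None: return`, the generator returned as soon as the first
# real chunk arrived, so callers of stream_chat never saw the streamed response.
```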
 
@@ -901,10 +1008,8 @@
 @add_start_docstrings(
     """
     The InternLM Model transformer with a sequence classification head on top (linear layer).
-
     [`InternLMForSequenceClassification`] uses the last token in order to do the classification, as other causal models
     (e.g. GPT-2) do.
-
     Since it does classification on the last token, it requires to know the position of the last token. If a
     `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
     no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
 
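Taken together, the changes above (caching un-rotated keys and giving each sample in a batch its own rotary positions) are what the commit title "Fix batch generation" refers to. A hedged end-to-end usage sketch; the checkpoint id, left-padding setup, and generation arguments are illustrative, not mandated by this commit:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "internlm/internlm-chat-7b"  # any checkpoint shipping this modeling file
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# Batch prompts of different lengths with left padding; the per-sample rotary position
# remapping added in this commit keeps the shorter prompt's output consistent with
# running it on its own.
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
inputs = tokenizer(
    ["Hi", "Explain rotary position embeddings in one sentence."],
    return_tensors="pt",
    padding=True,
)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```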