configuration_internlm.py CHANGED
@@ -1,10 +1,7 @@
 # coding=utf-8
-# Copyright (c) InternLM. All rights reserved.
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# This code is based on transformers/src/transformers/models/llama/configuration_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,6 +24,7 @@ logger = logging.get_logger(__name__)
 INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
 
+# Modified from transformers.model.llama.configuration_llama.LlamaConfig
 class InternLMConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
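For context, a configuration class that ships inside the model repository like this one is normally reached through AutoConfig with remote code enabled rather than imported directly. A minimal sketch, assuming the repo id internlm/internlm-chat-7b for illustration:

from transformers import AutoConfig

# trust_remote_code tells transformers to load configuration_internlm.py from the
# model repo; the repo id here is an assumption for illustration only.
config = AutoConfig.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
print(type(config).__name__, config.hidden_size, config.num_hidden_layers)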
 
modeling_internlm.py CHANGED
@@ -74,7 +74,7 @@ def _get_unpad_data(attention_mask):
     )
 
 
-# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+# Copied from transformers.models.llama.modeling_llama._make_causal_mask
 def _make_causal_mask(
     input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
 ):
@@ -92,7 +92,7 @@ def _make_causal_mask(
     return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
 
 
-# Copied from transformers.models.bart.modeling_bart._expand_mask
+# Copied from transformers.models.llama.modeling_llama._expand_mask
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     """
     Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
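These two helpers build the additive attention mask: _make_causal_mask produces a [bsz, 1, tgt_len, tgt_len + past] lower-triangular mask and _expand_mask broadcasts a [bsz, seq_len] padding mask to the same rank. A small standalone sketch of the causal part, mirroring the Llama-style implementation this file follows (shapes only, not an import from this file):

import torch

# Rebuild a tiny causal mask the same way the Llama-style _make_causal_mask does (bsz=1, tgt_len=3).
bsz, tgt_len, past = 1, 3, 0
mask = torch.full((tgt_len, tgt_len), torch.finfo(torch.float32).min)
cond = torch.arange(tgt_len)
mask.masked_fill_(cond < (cond + 1).view(tgt_len, 1), 0)  # zero out the lower triangle
mask = mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past)
print(mask.shape)  # torch.Size([1, 1, 3, 3]); future positions hold a large negative value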
@@ -106,6 +106,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
 
     return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM
 class InternLMRMSNorm(nn.Module):
     """RMSNorm implemention."""
 
@@ -128,6 +130,7 @@ class InternLMRMSNorm(nn.Module):
         return self.weight * hidden_states
 
 
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM
 class InternLMRotaryEmbedding(torch.nn.Module):
     """Implement InternLM's rotary embedding.
 
@@ -169,6 +172,7 @@ class InternLMRotaryEmbedding(torch.nn.Module):
         )
 
 
+# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM
 class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
     """Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
 
@@ -229,12 +233,15 @@ class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
         )
 
 
+# Copied from transformers.model.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
     x2 = x[..., x.shape[-1] // 2 :]
     return torch.cat((-x2, x1), dim=-1)
 
+
+# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     if position_ids.size(1) == 1:
         q_cos = cos[position_ids].unsqueeze(1).expand(q.shape)
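The rotary helpers above are pure tensor functions, so their behaviour is easy to check in isolation. A short sketch (a standalone copy of rotate_half, not an import from this file):

import torch

def rotate_half(x):
    # Same operation as the helper above: swap the two halves and negate the second.
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

x = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(rotate_half(x))  # tensor([-3., -4.,  1.,  2.])
# apply_rotary_pos_emb then combines this as q * cos + rotate_half(q) * sin
# (and likewise for k), which is the standard RoPE formulation.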
@@ -255,6 +262,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     return q_embed, k_embed
 
 
+# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->InternLM
 class InternLMMLP(nn.Module):
     def __init__(
         self,
@@ -272,6 +280,7 @@ class InternLMMLP(nn.Module):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 
 
+# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->InternLM
 class InternLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -377,10 +386,11 @@ class InternLMAttention(nn.Module):
             attn_weights = None
 
         return attn_output, attn_weights, past_key_value
-
+
+# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->InternLM
 class InternLMFlashAttention2(InternLMAttention):
     """
-    InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
+    InternLM flash attention module. This module inherits from `InternLMAttention` as the weights of the module stays
     untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
     flash attention and deal with padding tokens in case the input contains any of them.
     """
@@ -395,7 +405,7 @@ class InternLMFlashAttention2(InternLMAttention):
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # InternLM2FlashAttention2 attention does not support output_attentions
+        # InternLMFlashAttention2 attention does not support output_attentions
         bsz, q_len, _ = hidden_states.size()
 
         query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -526,6 +536,7 @@ INTERNLM_ATTENTION_CLASSES = {
     "flash_attention_2": InternLMFlashAttention2,
 }
 
+# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->InternLM
 class InternLMDecoderLayer(nn.Module):
     def __init__(self, config: InternLMConfig):
         super().__init__()
@@ -611,6 +622,7 @@ INTERNLM_START_DOCSTRING = r"""
 """
 
 
+# Copied from transformers.models.llama.modeling_llama.LlamaPretrainedModel with Llama->InternLM
 @add_start_docstrings(
     "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
     INTERNLM_START_DOCSTRING,
@@ -692,6 +704,7 @@ INTERNLM_INPUTS_DOCSTRING = r"""
 """
 
 
+# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->InternLM
 @add_start_docstrings(
     "The bare InternLM Model outputting raw hidden-states without any specific head on top.",
     INTERNLM_START_DOCSTRING,
@@ -884,6 +897,7 @@ class InternLMModel(InternLMPreTrainedModel):
         )
 
 
+# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->InternLM
 class InternLMForCausalLM(InternLMPreTrainedModel):
     _auto_class = "AutoModelForCausalLM"
 
@@ -1037,11 +1051,12 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
         return reordered_past
 
     def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
-        prompt = ""
-        if meta_instruction:
-            prompt += f"""<s><|System|>:{meta_instruction}\n"""
+        if tokenizer.add_bos_token:
+            prompt = ""
         else:
-            prompt += "<s>"
+            prompt = tokenizer.bos_token
+        if meta_instruction:
+            prompt += f"""<|System|>:{meta_instruction}\n"""
        for record in history:
             prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}<eoa>\n"""
         prompt += f"""<|User|>:{query}\n<|Bot|>:"""
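The build_inputs change above moves BOS handling onto the tokenizer: when tokenizer.add_bos_token is set, the prompt string starts empty and the tokenizer prepends BOS itself; otherwise tokenizer.bos_token is prepended explicitly instead of the old hard-coded "<s>". A sketch of the resulting string, using a hypothetical stand-in tokenizer object only to trace the string logic:

class DummyTokenizer:
    # Stand-in attributes; a real InternLM tokenizer provides these itself.
    add_bos_token = False
    bos_token = "<s>"

def build_prompt(tokenizer, query, history=[], meta_instruction=""):
    # Mirrors the new build_inputs string construction above.
    prompt = "" if tokenizer.add_bos_token else tokenizer.bos_token
    if meta_instruction:
        prompt += f"""<|System|>:{meta_instruction}\n"""
    for record in history:
        prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}<eoa>\n"""
    prompt += f"""<|User|>:{query}\n<|Bot|>:"""
    return prompt

print(build_prompt(DummyTokenizer(), "Hello", meta_instruction="You are a helpful assistant."))
# <s><|System|>:You are a helpful assistant.
# <|User|>:Hello
# <|Bot|>: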
@@ -1114,6 +1129,7 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                 self.query = query
                 self.history = history
                 self.response = ""
+                self.cache = []
                 self.received_inputs = False
                 self.queue.put((self.response, history + [(self.query, self.response)]))
 
@@ -1128,11 +1144,17 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
                     self.received_inputs = True
                     return
 
-                token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
+                self.cache.extend(value.tolist())
+                token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
+                if "�" in token and len(token) <= 5:
+                    return
                 if token.strip() != "<eoa>":
                     self.response = self.response + token
                     history = self.history + [(self.query, self.response)]
                     self.queue.put((self.response, history))
+                    self.cache = []
+                else:
+                    self.end()
 
             def end(self):
                 self.queue.put(None)
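The put change above stops decoding ids one at a time: ids are accumulated in self.cache and only flushed once they decode to complete text, because a single id (e.g. a byte-fallback piece of a SentencePiece vocabulary) can cover just part of a multi-byte UTF-8 character, which decodes to the replacement character "�". A standalone sketch of that buffering pattern with a toy byte-level decoder (the real code uses self.tokenizer.decode):

def stream_decode(token_ids, decode_ids):
    """Incrementally decode ids, holding back fragments of multi-byte characters."""
    cache, out = [], []
    for tid in token_ids:
        cache.append(tid)
        text = decode_ids(cache)
        if "�" in text and len(text) <= 5:
            continue  # partial character: wait for more ids, as in put() above
        out.append(text)
        cache = []
    return "".join(out)

# Toy decoder where every "token id" is one raw UTF-8 byte.
decode_bytes = lambda ids: bytes(ids).decode("utf-8", errors="replace")
print(stream_decode(list("né".encode("utf-8")), decode_bytes))  # prints: né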
 
tokenization_internlm.py CHANGED
@@ -1,10 +1,7 @@
 # coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tokenization classes for IntermLM."""
+"""Tokenization classes for InternLM."""
 import os
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
@@ -35,7 +32,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
 
 PRETRAINED_VOCAB_FILES_MAP = {}
 
-
+# Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer -> InternLM2Tokenizer
 class InternLMTokenizer(PreTrainedTokenizer):
     """
     Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
@@ -81,8 +78,6 @@ class InternLMTokenizer(PreTrainedTokenizer):
             **kwargs,
         )
 
-        """ Initialization"""
-
     @property
     def no_prefix_space_tokens(self):
         if self._no_prefix_space_tokens is None:
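As with the configuration file, this tokenizer class ships inside the model repository and is picked up through AutoTokenizer when remote code is trusted. A minimal sketch, again assuming the repo id internlm/internlm-chat-7b for illustration:

from transformers import AutoTokenizer

# trust_remote_code loads tokenization_internlm.py from the model repo;
# the repo id is an assumption for illustration only.
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
ids = tokenizer("Hello, InternLM!")["input_ids"]
print(ids)
print(tokenizer.decode(ids))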
 