x54-729
committed on
Commit
·
06197eb
1
Parent(s):
90ff09a
update copyright
Browse files
- configuration_internlm.py +3 -5
- modeling_internlm.py +32 -10
- tokenization_internlm.py +4 -9
configuration_internlm.py
CHANGED
@@ -1,10 +1,7 @@
|
|
1 |
# coding=utf-8
|
2 |
-
# Copyright (c) InternLM. All rights reserved.
|
3 |
#
|
4 |
-
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
-
# and OPT implementations in this library. It has been modified from its
|
6 |
-
# original forms to accommodate minor architectural differences compared
|
7 |
-
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
#
|
9 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
# you may not use this file except in compliance with the License.
|
@@ -27,6 +24,7 @@ logger = logging.get_logger(__name__)
|
|
27 |
INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
|
28 |
|
29 |
|
|
|
30 |
class InternLMConfig(PretrainedConfig):
|
31 |
r"""
|
32 |
This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
|
|
|
1 |
# coding=utf-8
|
2 |
+
# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
|
3 |
#
|
4 |
+
# This code is based on transformers/src/transformers/models/llama/configuration_llama.py
|
|
|
|
|
|
|
5 |
#
|
6 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
# you may not use this file except in compliance with the License.
|
|
|
24 |
INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
|
25 |
|
26 |
|
27 |
+
# Modified from transformers.models.llama.configuration_llama.LlamaConfig
|
28 |
class InternLMConfig(PretrainedConfig):
|
29 |
r"""
|
30 |
This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
|
modeling_internlm.py
CHANGED
@@ -74,7 +74,7 @@ def _get_unpad_data(attention_mask):
|
|
74 |
)
|
75 |
|
76 |
|
77 |
-
# Copied from transformers.models.
|
78 |
def _make_causal_mask(
|
79 |
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
|
80 |
):
|
@@ -92,7 +92,7 @@ def _make_causal_mask(
|
|
92 |
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
|
93 |
|
94 |
|
95 |
-
# Copied from transformers.models.
|
96 |
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
97 |
"""
|
98 |
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
@@ -106,6 +106,8 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
|
|
106 |
|
107 |
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
108 |
|
|
|
|
|
109 |
class InternLMRMSNorm(nn.Module):
|
110 |
"""RMSNorm implemention."""
|
111 |
|
@@ -128,6 +130,7 @@ class InternLMRMSNorm(nn.Module):
|
|
128 |
return self.weight * hidden_states
|
129 |
|
130 |
|
|
|
131 |
class InternLMRotaryEmbedding(torch.nn.Module):
|
132 |
"""Implement InternLM's rotary embedding.
|
133 |
|
@@ -169,6 +172,7 @@ class InternLMRotaryEmbedding(torch.nn.Module):
|
|
169 |
)
|
170 |
|
171 |
|
|
|
172 |
class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
|
173 |
"""Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
|
174 |
|
@@ -229,12 +233,15 @@ class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
|
|
229 |
)
|
230 |
|
231 |
|
|
|
232 |
def rotate_half(x):
|
233 |
"""Rotates half the hidden dims of the input."""
|
234 |
x1 = x[..., : x.shape[-1] // 2]
|
235 |
x2 = x[..., x.shape[-1] // 2 :]
|
236 |
return torch.cat((-x2, x1), dim=-1)
|
237 |
|
|
|
|
|
238 |
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
|
239 |
if position_ids.size(1) == 1:
|
240 |
q_cos = cos[position_ids].unsqueeze(1).expand(q.shape)
|
@@ -255,6 +262,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
|
|
255 |
return q_embed, k_embed
|
256 |
|
257 |
|
|
|
258 |
class InternLMMLP(nn.Module):
|
259 |
def __init__(
|
260 |
self,
|
@@ -272,6 +280,7 @@ class InternLMMLP(nn.Module):
|
|
272 |
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
273 |
|
274 |
|
|
|
275 |
class InternLMAttention(nn.Module):
|
276 |
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
277 |
|
@@ -377,10 +386,11 @@ class InternLMAttention(nn.Module):
|
|
377 |
attn_weights = None
|
378 |
|
379 |
return attn_output, attn_weights, past_key_value
|
380 |
-
|
|
|
381 |
class InternLMFlashAttention2(InternLMAttention):
|
382 |
"""
|
383 |
-
|
384 |
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
|
385 |
flash attention and deal with padding tokens in case the input contains any of them.
|
386 |
"""
|
@@ -395,7 +405,7 @@ class InternLMFlashAttention2(InternLMAttention):
|
|
395 |
use_cache: bool = False,
|
396 |
**kwargs,
|
397 |
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
398 |
-
#
|
399 |
bsz, q_len, _ = hidden_states.size()
|
400 |
|
401 |
query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
|
@@ -526,6 +536,7 @@ INTERNLM_ATTENTION_CLASSES = {
|
|
526 |
"flash_attention_2": InternLMFlashAttention2,
|
527 |
}
|
528 |
|
|
|
529 |
class InternLMDecoderLayer(nn.Module):
|
530 |
def __init__(self, config: InternLMConfig):
|
531 |
super().__init__()
|
@@ -611,6 +622,7 @@ INTERNLM_START_DOCSTRING = r"""
|
|
611 |
"""
|
612 |
|
613 |
|
|
|
614 |
@add_start_docstrings(
|
615 |
"The bare InternLM Model outputting raw hidden-states without any specific head on top.",
|
616 |
INTERNLM_START_DOCSTRING,
|
@@ -692,6 +704,7 @@ INTERNLM_INPUTS_DOCSTRING = r"""
|
|
692 |
"""
|
693 |
|
694 |
|
|
|
695 |
@add_start_docstrings(
|
696 |
"The bare InternLM Model outputting raw hidden-states without any specific head on top.",
|
697 |
INTERNLM_START_DOCSTRING,
|
@@ -884,6 +897,7 @@ class InternLMModel(InternLMPreTrainedModel):
|
|
884 |
)
|
885 |
|
886 |
|
|
|
887 |
class InternLMForCausalLM(InternLMPreTrainedModel):
|
888 |
_auto_class = "AutoModelForCausalLM"
|
889 |
|
@@ -1037,11 +1051,12 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
|
|
1037 |
return reordered_past
|
1038 |
|
1039 |
def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
|
1040 |
-
|
1041 |
-
|
1042 |
-
prompt += f"""<s><|System|>:{meta_instruction}\n"""
|
1043 |
else:
|
1044 |
-
prompt
|
|
|
|
|
1045 |
for record in history:
|
1046 |
prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}<eoa>\n"""
|
1047 |
prompt += f"""<|User|>:{query}\n<|Bot|>:"""
|
@@ -1114,6 +1129,7 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
|
|
1114 |
self.query = query
|
1115 |
self.history = history
|
1116 |
self.response = ""
|
|
|
1117 |
self.received_inputs = False
|
1118 |
self.queue.put((self.response, history + [(self.query, self.response)]))
|
1119 |
|
@@ -1128,11 +1144,17 @@ class InternLMForCausalLM(InternLMPreTrainedModel):
|
|
1128 |
self.received_inputs = True
|
1129 |
return
|
1130 |
|
1131 |
-
|
|
|
|
|
|
|
1132 |
if token.strip() != "<eoa>":
|
1133 |
self.response = self.response + token
|
1134 |
history = self.history + [(self.query, self.response)]
|
1135 |
self.queue.put((self.response, history))
|
|
|
|
|
|
|
1136 |
|
1137 |
def end(self):
|
1138 |
self.queue.put(None)
|
|
|
74 |
)
|
75 |
|
76 |
|
77 |
+
# Copied from transformers.models.llama.modeling_llama._make_causal_mask
|
78 |
def _make_causal_mask(
|
79 |
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
|
80 |
):
|
|
|
92 |
return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
|
93 |
|
94 |
|
95 |
+
# Copied from transformers.models.llama.modeling_llama._expand_mask
|
96 |
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
97 |
"""
|
98 |
Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
|
|
|
106 |
|
107 |
return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
|
108 |
|
109 |
+
|
110 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM
|
111 |
class InternLMRMSNorm(nn.Module):
|
112 |
"""RMSNorm implemention."""
|
113 |
|
|
|
130 |
return self.weight * hidden_states
|
131 |
|
132 |
|
133 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM
|
134 |
class InternLMRotaryEmbedding(torch.nn.Module):
|
135 |
"""Implement InternLM's rotary embedding.
|
136 |
|
|
|
172 |
)
|
173 |
|
174 |
|
175 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM
|
176 |
class InternLMDynamicNTKScalingRotaryEmbedding(torch.nn.Module):
|
177 |
"""Implement InternLM's DyanmicNTK extrapolation method, thereby broadening the model support context to 16K.
|
178 |
|
|
|
233 |
)
|
234 |
|
235 |
|
236 |
+
# Copied from transformers.models.llama.modeling_llama.rotate_half
|
237 |
def rotate_half(x):
|
238 |
"""Rotates half the hidden dims of the input."""
|
239 |
x1 = x[..., : x.shape[-1] // 2]
|
240 |
x2 = x[..., x.shape[-1] // 2 :]
|
241 |
return torch.cat((-x2, x1), dim=-1)
|
242 |
|
243 |
+
|
244 |
+
# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
|
245 |
def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
|
246 |
if position_ids.size(1) == 1:
|
247 |
q_cos = cos[position_ids].unsqueeze(1).expand(q.shape)
|
|
|
262 |
return q_embed, k_embed
|
263 |
|
264 |
|
265 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->InternLM
|
266 |
class InternLMMLP(nn.Module):
|
267 |
def __init__(
|
268 |
self,
|
|
|
280 |
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
|
281 |
|
282 |
|
283 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaAttention with Llama->InternLM
|
284 |
class InternLMAttention(nn.Module):
|
285 |
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
286 |
|
|
|
386 |
attn_weights = None
|
387 |
|
388 |
return attn_output, attn_weights, past_key_value
|
389 |
+
|
390 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->InternLM
|
391 |
class InternLMFlashAttention2(InternLMAttention):
|
392 |
"""
|
393 |
+
InternLM flash attention module. This module inherits from `InternLMAttention` as the weights of the module stays
|
394 |
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
|
395 |
flash attention and deal with padding tokens in case the input contains any of them.
|
396 |
"""
|
|
|
405 |
use_cache: bool = False,
|
406 |
**kwargs,
|
407 |
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
408 |
+
# InternLMFlashAttention2 attention does not support output_attentions
|
409 |
bsz, q_len, _ = hidden_states.size()
|
410 |
|
411 |
query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
|
|
|
536 |
"flash_attention_2": InternLMFlashAttention2,
|
537 |
}
|
538 |
|
539 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->InternLM
|
540 |
class InternLMDecoderLayer(nn.Module):
|
541 |
def __init__(self, config: InternLMConfig):
|
542 |
super().__init__()
|
|
|
622 |
"""
|
623 |
|
624 |
|
625 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM
|
626 |
@add_start_docstrings(
|
627 |
"The bare InternLM Model outputting raw hidden-states without any specific head on top.",
|
628 |
INTERNLM_START_DOCSTRING,
|
|
|
704 |
"""
|
705 |
|
706 |
|
707 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->InternLM
|
708 |
@add_start_docstrings(
|
709 |
"The bare InternLM Model outputting raw hidden-states without any specific head on top.",
|
710 |
INTERNLM_START_DOCSTRING,
|
|
|
897 |
)
|
898 |
|
899 |
|
900 |
+
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->InternLM
|
901 |
class InternLMForCausalLM(InternLMPreTrainedModel):
|
902 |
_auto_class = "AutoModelForCausalLM"
|
903 |
|
|
|
1051 |
return reordered_past
|
1052 |
|
1053 |
def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
|
1054 |
+
if tokenizer.add_bos_token:
|
1055 |
+
prompt = ""
|
|
|
1056 |
else:
|
1057 |
+
prompt = tokenizer.bos_token
|
1058 |
+
if meta_instruction:
|
1059 |
+
prompt += f"""<|System|>:{meta_instruction}\n"""
|
1060 |
for record in history:
|
1061 |
prompt += f"""<|User|>:{record[0]}\n<|Bot|>:{record[1]}<eoa>\n"""
|
1062 |
prompt += f"""<|User|>:{query}\n<|Bot|>:"""
|
|
|
1129 |
self.query = query
|
1130 |
self.history = history
|
1131 |
self.response = ""
|
1132 |
+
self.cache = []
|
1133 |
self.received_inputs = False
|
1134 |
self.queue.put((self.response, history + [(self.query, self.response)]))
|
1135 |
|
|
|
1144 |
self.received_inputs = True
|
1145 |
return
|
1146 |
|
1147 |
+
self.cache.extend(value.tolist())
|
1148 |
+
token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
|
1149 |
+
if "�" in token and len(token) <= 5:
|
1150 |
+
return
|
1151 |
if token.strip() != "<eoa>":
|
1152 |
self.response = self.response + token
|
1153 |
history = self.history + [(self.query, self.response)]
|
1154 |
self.queue.put((self.response, history))
|
1155 |
+
self.cache = []
|
1156 |
+
else:
|
1157 |
+
self.end()
|
1158 |
|
1159 |
def end(self):
|
1160 |
self.queue.put(None)
|
tokenization_internlm.py
CHANGED
@@ -1,10 +1,7 @@
|
|
1 |
# coding=utf-8
|
2 |
-
# Copyright
|
3 |
#
|
4 |
-
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
5 |
-
# and OPT implementations in this library. It has been modified from its
|
6 |
-
# original forms to accommodate minor architectural differences compared
|
7 |
-
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
8 |
#
|
9 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
10 |
# you may not use this file except in compliance with the License.
|
@@ -18,7 +15,7 @@
|
|
18 |
# See the License for the specific language governing permissions and
|
19 |
# limitations under the License.
|
20 |
|
21 |
-
"""Tokenization classes for
|
22 |
import os
|
23 |
from shutil import copyfile
|
24 |
from typing import Any, Dict, List, Optional, Tuple
|
@@ -35,7 +32,7 @@ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
|
|
35 |
|
36 |
PRETRAINED_VOCAB_FILES_MAP = {}
|
37 |
|
38 |
-
|
39 |
class InternLMTokenizer(PreTrainedTokenizer):
|
40 |
"""
|
41 |
Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
|
@@ -81,8 +78,6 @@ class InternLMTokenizer(PreTrainedTokenizer):
|
|
81 |
**kwargs,
|
82 |
)
|
83 |
|
84 |
-
""" Initialization"""
|
85 |
-
|
86 |
@property
|
87 |
def no_prefix_space_tokens(self):
|
88 |
if self._no_prefix_space_tokens is None:
|
|
|
1 |
# coding=utf-8
|
2 |
+
# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
|
3 |
#
|
4 |
+
# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
|
|
|
|
|
|
|
5 |
#
|
6 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
# you may not use this file except in compliance with the License.
|
|
|
15 |
# See the License for the specific language governing permissions and
|
16 |
# limitations under the License.
|
17 |
|
18 |
+
"""Tokenization classes for InternLM."""
|
19 |
import os
|
20 |
from shutil import copyfile
|
21 |
from typing import Any, Dict, List, Optional, Tuple
|
|
|
32 |
|
33 |
PRETRAINED_VOCAB_FILES_MAP = {}
|
34 |
|
35 |
+
# Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer -> InternLMTokenizer
|
36 |
class InternLMTokenizer(PreTrainedTokenizer):
|
37 |
"""
|
38 |
Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
|
|
|
78 |
**kwargs,
|
79 |
)
|
80 |
|
|
|
|
|
81 |
@property
|
82 |
def no_prefix_space_tokens(self):
|
83 |
if self._no_prefix_space_tokens is None:
|