myownskyW7 committed
Commit
bb26c26
1 Parent(s): c150360
Files changed (2)
  1. modeling_InternLM.py +41 -139
  2. modeling_vit.py +1 -1
modeling_InternLM.py CHANGED
@@ -1,10 +1,7 @@
-# This script is based on https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
-""" PyTorch LLaMA model."""
 import math
 from typing import List, Union
 from typing import Optional, Tuple
 
-# from apex.normalization.fused_layer_norm import MixedFusedRMSNorm as LlamaRMSNorm
 import rotary_emb
 import torch
 import torch.utils.checkpoint
@@ -16,18 +13,14 @@ from torch.nn import CrossEntropyLoss
 from transformers.activations import ACT2FN
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.modeling_utils import PreTrainedModel
-from transformers.models.llama.configuration_llama import LlamaConfig
-from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from transformers.utils import logging
 
-from modeling_utils import LoRALinear
-
-# from flash_attn.modules.mha import FlashSelfAttention
+from .modeling_utils import LoRALinear
+from .configuration_InternLM_XComposer import InternLMXComposerConfig
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "LlamaConfig"
-""" PyTorch LLaMA model."""
+_CONFIG_FOR_DOC = "InternLMXComposerConfig"
 
 
 class ApplyRotaryEmbQKV_(torch.autograd.Function):
@@ -77,7 +70,7 @@ class ApplyRotaryEmbQKV_(torch.autograd.Function):
         return dqkv, None, None, None, None
 
 
-class ConvertedLlamaRotaryEmbedding(torch.nn.Module):
+class ConvertedInternLMRotaryEmbedding(torch.nn.Module):
     def __init__(self, dim: int, base=10000, scale_base=0, device=None):
         """ """
         super().__init__()
@@ -168,9 +161,9 @@ apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
 legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
 
 
-class InternConvertedLlamaAttention(nn.Module):
+class InternConvertedInternLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
-    def __init__(self, config: LlamaConfig):
+    def __init__(self, config: InternLMXComposerConfig):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -244,7 +237,7 @@ class InternConvertedLlamaAttention(nn.Module):
             bias=config.kqvo_bias,
         )
 
-        self.rotary_emb = ConvertedLlamaRotaryEmbedding(self.head_dim)
+        self.rotary_emb = ConvertedInternLMRotaryEmbedding(self.head_dim)
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads,
@@ -460,10 +453,10 @@ def _expand_mask(mask: torch.Tensor,
                              torch.finfo(dtype).min)
 
 
-class LlamaRMSNorm(nn.Module):
+class InternLMRMSNorm(nn.Module):
     def __init__(self, hidden_size, eps=1e-6):
         """
-        LlamaRMSNorm is equivalent to T5LayerNorm
+        InternLMRMSNorm is equivalent to T5LayerNorm
         """
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
@@ -482,7 +475,7 @@ class LlamaRMSNorm(nn.Module):
         return self.weight * hidden_states
 
 
-class LlamaRotaryEmbedding(torch.nn.Module):
+class InternLMRotaryEmbedding(torch.nn.Module):
     def __init__(self,
                  dim,
                  max_position_embeddings=2048,
@@ -550,9 +543,9 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     return q_embed, k_embed
 
 
-class LlamaMLP(nn.Module):
+class InternLMMLP(nn.Module):
     def __init__(self, hidden_size: int, intermediate_size: int,
-                 hidden_act: str, config: LlamaConfig):
+                 hidden_act: str, config: InternLMXComposerConfig):
         super().__init__()
         self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
         if config.lora_cfg is not None and 'ffn' in config.lora_cfg[
@@ -579,9 +572,9 @@ class LlamaMLP(nn.Module):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
 
 
-class LlamaAttention(nn.Module):
+class InternLMAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
-    def __init__(self, config: LlamaConfig):
+    def __init__(self, config: InternLMXComposerConfig):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -648,7 +641,7 @@ class LlamaAttention(nn.Module):
                                 self.hidden_size,
                                 bias=False)
 
-        self.rotary_emb = LlamaRotaryEmbedding(
+        self.rotary_emb = InternLMRotaryEmbedding(
             self.head_dim,
             max_position_embeddings=self.max_position_embeddings)
 
@@ -731,25 +724,25 @@ class LlamaAttention(nn.Module):
         return attn_output, attn_weights, past_key_value
 
 
-class LlamaDecoderLayer(nn.Module):
-    def __init__(self, config: LlamaConfig):
+class InternLMDecoderLayer(nn.Module):
+    def __init__(self, config: InternLMXComposerConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
         if hasattr(config,
                    'intern_converted_llm') and config.intern_converted_llm:
-            self.self_attn = InternConvertedLlamaAttention(config=config)
+            self.self_attn = InternConvertedInternLMAttention(config=config)
         else:
-            self.self_attn = LlamaAttention(config=config)
-        self.mlp = LlamaMLP(
+            self.self_attn = InternLMAttention(config=config)
+        self.mlp = InternLMMLP(
             hidden_size=self.hidden_size,
             intermediate_size=config.intermediate_size,
             hidden_act=config.hidden_act,
            config=config,
        )
-        self.input_layernorm = LlamaRMSNorm(config.hidden_size,
-                                            eps=config.rms_norm_eps)
-        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size,
-                                                     eps=config.rms_norm_eps)
+        self.input_layernorm = InternLMRMSNorm(config.hidden_size,
+                                               eps=config.rms_norm_eps)
+        self.post_attention_layernorm = InternLMRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -807,32 +800,11 @@ class LlamaDecoderLayer(nn.Module):
         return outputs
 
 
-LLAMA_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`LlamaConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
-    LLAMA_START_DOCSTRING,
-)
-class LlamaPreTrainedModel(PreTrainedModel):
-    config_class = LlamaConfig
+class InternLMPreTrainedModel(PreTrainedModel):
+    config_class = InternLMXComposerConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["LlamaDecoderLayer"]
+    _no_split_modules = ["InternLMDecoderLayer"]
     _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
 
     def _init_weights(self, module):
@@ -847,86 +819,18 @@ class LlamaPreTrainedModel(PreTrainedModel):
             module.weight.data[module.padding_idx].zero_()
 
     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, LlamaModel):
+        if isinstance(module, InternLMModel):
            module.gradient_checkpointing = value
 
 
-LLAMA_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
-@add_start_docstrings(
-    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
-    LLAMA_START_DOCSTRING,
-)
-class LlamaModel(LlamaPreTrainedModel):
+class InternLMModel(InternLMPreTrainedModel):
     """
-    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLMDecoderLayer`]
 
     Args:
-        config: LlamaConfig
+        config: InternLMXComposerConfig
     """
-    def __init__(self, config: LlamaConfig):
+    def __init__(self, config: InternLMXComposerConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -934,9 +838,11 @@ class LlamaModel(LlamaPreTrainedModel):
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size,
                                          self.padding_idx)
         self.layers = nn.ModuleList([
-            LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)
+            InternLMDecoderLayer(config)
+            for _ in range(config.num_hidden_layers)
         ])
-        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm = InternLMRMSNorm(config.hidden_size,
+                                    eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -975,7 +881,6 @@ class LlamaModel(LlamaPreTrainedModel):
 
         return combined_attention_mask
 
-    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1119,19 +1024,19 @@ class LlamaModel(LlamaPreTrainedModel):
         )
 
 
-class LlamaForCausalLM(LlamaPreTrainedModel):
+class InternLMForCausalLM(InternLMPreTrainedModel):
     lora_cfg = None  # init in MiniGPT4
 
     def __init__(self, config):
         super().__init__(config)
-        # TODO: find a way to explicitly initialize Llama
+        # TODO: find a way to explicitly initialize InternLM
         setattr(config, 'lora_cfg', self.lora_cfg)
 
         if hasattr(config, 'kqvo_bias'):
             setattr(config, 'kqvo_bias', config.kqvo_bias)
         else:
             setattr(config, 'kqvo_bias', False)
-        self.model = LlamaModel(config)
+        self.model = InternLMModel(config)
 
         self.lm_head = nn.Linear(config.hidden_size,
                                  config.vocab_size,
@@ -1185,9 +1090,6 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
     def get_decoder(self):
         return self.model
 
-    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=CausalLMOutputWithPast,
-                               config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -1214,9 +1116,9 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+        >>> from transformers import AutoTokenizer, InternLMForCausalLM
 
-        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> model = InternLMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
         >>> prompt = "Hey, are you consciours? Can you talk to me?"
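Taken together, the modeling_InternLM.py changes rename the Llama-derived classes to their InternLM counterparts, bind them to `InternLMXComposerConfig` instead of `LlamaConfig`, drop the Llama docstring decorators, and make the intra-repo imports package-relative so the files can be fetched and executed through Transformers' remote-code path. A minimal loading sketch under those assumptions (the repo id below is a placeholder, and an `auto_map` entry in the shipped config is assumed; neither appears in this diff):

```python
# Hedged usage sketch, not part of the commit: load the custom modeling code
# via trust_remote_code. "internlm/internlm-xcomposer-7b" is a placeholder
# repo id; any repo shipping these files with an auto_map in config.json that
# points AutoModelForCausalLM at InternLMForCausalLM would load the same way.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "internlm/internlm-xcomposer-7b"  # hypothetical repo id

tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)
# The resulting object is the renamed InternLMForCausalLM, configured by
# InternLMXComposerConfig and resolved through the new relative imports.
```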
modeling_vit.py CHANGED
@@ -7,7 +7,7 @@ import torch.nn.functional as F
 import torch.utils.checkpoint as checkpoint
 from timm.models.layers import drop_path, to_2tuple, trunc_normal_
 
-from modeling_utils import download_cached_file
+from .modeling_utils import download_cached_file
 
 
 def _cfg(url='', **kwargs):
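The modeling_vit.py change mirrors the import fix above: `download_cached_file` is now taken from the sibling module via a package-relative path, which is what a dynamically loaded repo package requires. A toy reproduction of why the dotted form is needed (the stub files and the stub `download_cached_file` are illustrative only, not the repo's real helper):

```python
# Toy sketch: a bare "from modeling_utils import ..." only resolves when that
# directory is on sys.path, while "from .modeling_utils import ..." resolves
# against the package the file belongs to, which is how remotely fetched
# modeling files are executed. Stub contents are placeholders.
import importlib
import pathlib
import sys
import tempfile

root = pathlib.Path(tempfile.mkdtemp())
pkg = root / "demo_pkg"
pkg.mkdir()
(pkg / "__init__.py").write_text("")
(pkg / "modeling_utils.py").write_text("def download_cached_file(url):\n    return url\n")
(pkg / "modeling_vit.py").write_text("from .modeling_utils import download_cached_file\n")

sys.path.insert(0, str(root))
vit = importlib.import_module("demo_pkg.modeling_vit")
print(vit.download_cached_file("https://example.com/ckpt.pth"))  # prints the URL back
```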