XiaoduoAILab committed
Commit fa81332 · verified · 1 Parent(s): 40c6d69

Upload 10 files

ckpt.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4033e3c1c7d359cdedca7f7eda6e2db768a676835adfd8cd190d35c6ec2a531f
+ size 5007345026
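These three lines are only a Git LFS pointer; the actual ckpt.pt (about 5.0 GB, per the size field) lives in LFS storage. A minimal sketch of fetching it with huggingface_hub, where the repo id is a placeholder and not confirmed by this commit:

# Hedged sketch: resolve the LFS pointer to the real ~5 GB file in the local HF cache.
from huggingface_hub import hf_hub_download

ckpt_path = hf_hub_download(repo_id="XiaoduoAILab/Xmodel", filename="ckpt.pt")  # placeholder repo id
print(ckpt_path)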
config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_name_or_path": "out_g_line/xl_g_line_s2_decay_exp10_260k_sft_v2_dedup/iter-0006000",
+   "architectures": [
+     "XmodelForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_xmodel.XmodelConfig",
+     "AutoModelForCausalLM": "modeling_xmodel.XmodelForCausalLM"
+   },
+   "bos_token_id": 1,
+   "dim_model_base": 256,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_act_param": 0.03,
+   "hidden_size": 1536,
+   "initializer_range": 0.1,
+   "intermediate_size": 3840,
+   "max_position_embeddings": 131072,
+   "mlp_bias": false,
+   "model_type": "xmodel",
+   "num_attention_heads": 24,
+   "num_hidden_layers": 48,
+   "num_key_value_heads": 8,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 500000.0,
+   "scale_depth": 1.4,
+   "scale_emb": 12,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.44.2",
+   "use_cache": true,
+   "vocab_size": 65280
+ }
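Because auto_map points AutoConfig/AutoModelForCausalLM at the bundled configuration_xmodel.py and modeling_xmodel.py, loading this checkpoint requires trust_remote_code=True. A hedged loading sketch (the repo id is a placeholder, and it assumes the upload also includes tokenizer files):

# Minimal sketch; torch_dtype matches the "bfloat16" entry in config.json.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "XiaoduoAILab/Xmodel"  # placeholder, replace with the actual repo
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16, trust_remote_code=True)

inputs = tokenizer("Hello, world", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))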
configuration_xmodel.py ADDED
@@ -0,0 +1,187 @@
+ # Copyright (c) 2023 XiaoDuo AI. All rights reserved.
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.modeling_rope_utils import rope_config_validation
+ from transformers.utils import logging
+ from typing_extensions import Self
+
+ logger = logging.get_logger(__name__)
+
+
+ class XmodelConfig(PretrainedConfig):
+     model_type = "xmodel"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         hidden_size=1536,
+         intermediate_size=4096,
+         num_hidden_layers=48,
+         num_attention_heads=24,
+         num_key_value_heads=8,
+         hidden_act="silu",
+         max_position_embeddings=131072,
+         initializer_range=0.1,
+         rms_norm_eps=1e-5,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=1,
+         eos_token_id=2,
+         pretraining_tp=1,
+         tie_word_embeddings=True,
+         rope_theta=500000.0,
+         rope_scaling=None,
+         attention_bias=False,
+         attention_dropout=0.0,
+         mlp_bias=False,
+         hidden_act_param=0.03,
+         scale_emb=12,
+         dim_model_base=256,
+         scale_depth=1.4,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         # self.intermediate_size = intermediate_size
+         if intermediate_size is None:
+             self.intermediate_size = find_multiple(int(8 * hidden_size / 3), 256)
+         else:
+             self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.hidden_act_param = hidden_act_param
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.pretraining_tp = pretraining_tp
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+         self.mlp_bias = mlp_bias
+         self.scale_emb = scale_emb
+         self.dim_model_base = dim_model_base
+         self.scale_depth = scale_depth
+
+         self.auto_map = {
+             "AutoConfig": "configuration_xmodel.XmodelConfig",
+             "AutoModelForCausalLM": "modeling_xmodel.XmodelForCausalLM"
+         }
+
+         # Validate the correctness of rotary position embeddings parameters
+         # BC: if there is a 'type' field, move it to 'rope_type'.
+         if self.rope_scaling is not None and "type" in self.rope_scaling:
+             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+         rope_config_validation(self)
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+     @classmethod
+     def from_name(cls, name: str) -> Self:
+         return cls(**xmodel_configs[name])
+
+
+ xmodel_configs = {
+     "nano": dict(num_hidden_layers=8,
+                  num_attention_heads=4,
+                  num_key_value_heads=1,
+                  hidden_size=256,
+                  tie_word_embeddings=True,
+                  intermediate_size=640),
+
+     "nano_old": dict(num_hidden_layers=6,
+                      num_attention_heads=6,
+                      num_key_value_heads=1,
+                      hidden_size=192,
+                      tie_word_embeddings=False),
+
+     "micro": dict(num_hidden_layers=12,
+                   num_attention_heads=6,
+                   num_key_value_heads=1,
+                   hidden_size=384,
+                   tie_word_embeddings=True,
+                   intermediate_size=960),
+
+     "micro_old": dict(num_hidden_layers=6,
+                       num_attention_heads=6,
+                       num_key_value_heads=1,
+                       hidden_size=384,
+                       tie_word_embeddings=False),
+
+     "tiny": dict(num_hidden_layers=18,
+                  num_attention_heads=8,
+                  num_key_value_heads=4,
+                  hidden_size=512,
+                  tie_word_embeddings=True,
+                  intermediate_size=1280),
+
+     "tiny_old": dict(num_hidden_layers=8,
+                      num_attention_heads=8,
+                      num_key_value_heads=2,
+                      hidden_size=512,
+                      tie_word_embeddings=False),
+
+     # GPT-1 & Bert-Base
+     "small": dict(num_hidden_layers=30,
+                   num_attention_heads=9,
+                   num_key_value_heads=3,
+                   hidden_size=576,
+                   tie_word_embeddings=True,
+                   intermediate_size=1440),
+
+     "small_old": dict(num_hidden_layers=12,
+                       num_attention_heads=12,
+                       num_key_value_heads=3,
+                       hidden_size=768,
+                       tie_word_embeddings=False),
+
+     # Bert-Large
+     "medium": dict(num_hidden_layers=32,
+                    num_attention_heads=15,
+                    num_key_value_heads=5,
+                    hidden_size=960,
+                    tie_word_embeddings=True,
+                    intermediate_size=2400),
+
+     "medium_old": dict(num_hidden_layers=24,
+                        num_attention_heads=16,
+                        num_key_value_heads=4,
+                        hidden_size=1024,
+                        tie_word_embeddings=False),
+
+     # GPT-2
+     "xl": dict(num_hidden_layers=48,
+                num_attention_heads=24,
+                num_key_value_heads=8,
+                hidden_size=1536,
+                tie_word_embeddings=True,
+                intermediate_size=3840),  # GPT-2
+
+     "xl_old": dict(num_hidden_layers=24,
+                    num_attention_heads=32,
+                    num_key_value_heads=4,
+                    hidden_size=2048,
+                    tie_word_embeddings=False),
+
+ }
+
+
+ def find_multiple(n: int, k: int) -> int:
+     if n % k == 0:
+         return n
+     return n + k - (n % k)
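For reference, a small sketch of how the preset table and find_multiple behave; the "xl" preset matches the uploaded config.json (48 layers, 24 heads, hidden_size 1536), and the SwiGLU-width fallback only triggers when intermediate_size=None is passed explicitly:

# Hedged sketch; assumes configuration_xmodel.py is importable from the current directory.
from configuration_xmodel import XmodelConfig, find_multiple

cfg = XmodelConfig.from_name("xl")
print(cfg.num_hidden_layers, cfg.num_attention_heads, cfg.intermediate_size)  # 48 24 3840

# With intermediate_size=None, __init__ rounds 8/3 * hidden_size up to a multiple of 256.
auto = XmodelConfig(hidden_size=1536, intermediate_size=None)
print(find_multiple(int(8 * 1536 / 3), 256), auto.intermediate_size)  # 4096 4096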
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "transformers_version": "4.44.2"
+ }
modeling_xmodel.py ADDED
@@ -0,0 +1,1560 @@
1
+ # Copyright (c) 2024 XiaoDuo AI. All rights reserved.
2
+
3
+ import inspect
4
+ import math
5
+ import sys
6
+ from pathlib import Path
7
+ from typing import List, Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+ from torch import nn
13
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
14
+ from transformers.activations import ACT2FN
15
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
16
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
17
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
18
+ from transformers.modeling_outputs import (
19
+ BaseModelOutputWithPast,
20
+ CausalLMOutputWithPast,
21
+ QuestionAnsweringModelOutput,
22
+ SequenceClassifierOutputWithPast,
23
+ TokenClassifierOutput,
24
+ )
25
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
26
+ from transformers.modeling_utils import PreTrainedModel, GenerationMixin
27
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
28
+ from transformers.utils import (
29
+ add_start_docstrings,
30
+ add_start_docstrings_to_model_forward,
31
+ is_flash_attn_greater_or_equal_2_10,
32
+ logging,
33
+ replace_return_docstrings,
34
+ )
35
+
36
+ # support running without installing as a package
37
+ wd = Path(__file__).parent.parent.resolve()
38
+ sys.path.append(str(wd))
39
+
40
+ from .configuration_xmodel import XmodelConfig
41
+
42
+ logger = logging.get_logger(__name__)
43
+
44
+ _CONFIG_FOR_DOC = "XmodelConfig"
45
+
46
+
47
+ # @torch.jit.script # type: ignore
48
+ def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
49
+ old_dtype = hidden.dtype
50
+ variance = hidden.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
51
+ hidden = (hidden * torch.rsqrt(variance + eps)).to(old_dtype)
52
+ return hidden * weight
53
+
54
+
55
+ class XmodelRMSNorm(nn.Module):
56
+ def __init__(self, hidden_size, eps=1e-6):
57
+ """
58
+ XmodelRMSNorm is equivalent to T5LayerNorm
59
+ """
60
+ super().__init__()
61
+ self.weight = nn.Parameter(torch.ones(hidden_size))
62
+ self.variance_epsilon = eps
63
+
64
+ def forward(self, hidden_states):
65
+ return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
66
+
67
+
68
+ ALL_LAYERNORM_LAYERS.append(XmodelRMSNorm)
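A quick numerical sketch of what rms_layernorm / XmodelRMSNorm compute: normalize by the root mean square over the last dimension (in float32), then scale by the learned weight. Assuming the class defined above is in scope:

# Sanity check: XmodelRMSNorm(x) == x * rsqrt(mean(x**2) + eps) * weight
import torch

x = torch.randn(2, 4, 1536)
norm = XmodelRMSNorm(1536, eps=1e-5)
ref = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-5) * norm.weight
assert torch.allclose(norm(x), ref, atol=1e-6)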
69
+
70
+
71
+ class XmodelRotaryEmbedding(nn.Module):
72
+ def __init__(
73
+ self,
74
+ dim=None,
75
+ max_position_embeddings=2048,
76
+ base=10000,
77
+ device=None,
78
+ scaling_factor=1.0,
79
+ rope_type="default",
80
+ config: Optional[XmodelConfig] = None,
81
+ ):
82
+ super().__init__()
83
+ # TODO (joao): remove the `if` below, only used for BC
84
+ self.rope_kwargs = {}
85
+ if config is None:
86
+ logger.warning_once(
87
+ "`XmodelRotaryEmbedding` can now be fully parameterized by passing the model config through the "
88
+ "`config` argument. All other arguments will be removed in v4.45"
89
+ )
90
+ self.rope_kwargs = {
91
+ "rope_type": rope_type,
92
+ "factor": scaling_factor,
93
+ "dim": dim,
94
+ "base": base,
95
+ "max_position_embeddings": max_position_embeddings,
96
+ }
97
+ self.rope_type = rope_type
98
+ self.max_seq_len_cached = max_position_embeddings
99
+ self.original_max_seq_len = max_position_embeddings
100
+ else:
101
+ # BC: "rope_type" was originally "type"
102
+ if config.rope_scaling is not None:
103
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
104
+ else:
105
+ self.rope_type = "default"
106
+ self.max_seq_len_cached = config.max_position_embeddings
107
+ self.original_max_seq_len = config.max_position_embeddings
108
+
109
+ self.config = config
110
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
111
+
112
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
113
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
114
+ self.original_inv_freq = self.inv_freq
115
+
116
+ def _dynamic_frequency_update(self, position_ids, device):
117
+ """
118
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
119
+ 1 - growing beyond the cached sequence length (allow scaling)
120
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
121
+ """
122
+ seq_len = torch.max(position_ids) + 1
123
+ if seq_len > self.max_seq_len_cached: # growth
124
+ inv_freq, self.attention_scaling = self.rope_init_fn(
125
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
126
+ )
127
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
128
+ self.max_seq_len_cached = seq_len
129
+
130
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
131
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
132
+ self.max_seq_len_cached = self.original_max_seq_len
133
+
134
+ @torch.no_grad()
135
+ def forward(self, x, position_ids):
136
+ if "dynamic" in self.rope_type:
137
+ self._dynamic_frequency_update(position_ids, device=x.device)
138
+
139
+ # Core RoPE block
140
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
141
+ position_ids_expanded = position_ids[:, None, :].float()
142
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
143
+ device_type = x.device.type
144
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
145
+ with torch.autocast(device_type=device_type, enabled=False):
146
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
147
+ emb = torch.cat((freqs, freqs), dim=-1)
148
+ cos = emb.cos()
149
+ sin = emb.sin()
150
+
151
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
152
+ cos = cos * self.attention_scaling
153
+ sin = sin * self.attention_scaling
154
+
155
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
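With the uploaded config, rope_scaling is null, so the rotary embedding uses the standard "default" rope_type; the frequencies then reduce to the usual formula, with head_dim = 1536 // 24 = 64 and rope_theta = 500000. A hedged sketch of what rope_init_fn produces in that case:

# inv_freq[i] = 1 / (rope_theta ** (2i / head_dim)); one frequency per channel pair.
import torch

head_dim, rope_theta = 64, 500000.0
inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
print(inv_freq.shape)  # torch.Size([32]); attention_scaling stays 1.0 for the default type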
156
+
157
+
158
+ class XmodelLinearScalingRotaryEmbedding(XmodelRotaryEmbedding):
159
+ """XmodelRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
160
+
161
+ def __init__(self, *args, **kwargs):
162
+ logger.warning_once(
163
+ "`XmodelLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use "
164
+ "`XmodelRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)."
165
+ )
166
+ kwargs["rope_type"] = "linear"
167
+ super().__init__(*args, **kwargs)
168
+
169
+
170
+ class XmodelDynamicNTKScalingRotaryEmbedding(XmodelRotaryEmbedding):
171
+ """XmodelRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
172
+
173
+ def __init__(self, *args, **kwargs):
174
+ logger.warning_once(
175
+ "`XmodelDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use "
176
+ "`XmodelRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to "
177
+ "__init__)."
178
+ )
179
+ kwargs["rope_type"] = "dynamic"
180
+ super().__init__(*args, **kwargs)
181
+
182
+
183
+ def rotate_half(x):
184
+ """Rotates half the hidden dims of the input."""
185
+ x1 = x[..., : x.shape[-1] // 2]
186
+ x2 = x[..., x.shape[-1] // 2:]
187
+ return torch.cat((-x2, x1), dim=-1)
188
+
189
+
190
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
191
+ """Applies Rotary Position Embedding to the query and key tensors.
192
+
193
+ Args:
194
+ q (`torch.Tensor`): The query tensor.
195
+ k (`torch.Tensor`): The key tensor.
196
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
197
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
198
+ position_ids (`torch.Tensor`, *optional*):
199
+ Deprecated and unused.
200
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
201
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
202
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
203
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
204
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
205
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
206
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
207
+ Returns:
208
+ `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
209
+ """
210
+ cos = cos.unsqueeze(unsqueeze_dim)
211
+ sin = sin.unsqueeze(unsqueeze_dim)
212
+ q_embed = (q * cos) + (rotate_half(q) * sin)
213
+ k_embed = (k * cos) + (rotate_half(k) * sin)
214
+ return q_embed, k_embed
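A small shape sketch for apply_rotary_pos_emb as defined above: q/k come in as [batch, heads, seq, head_dim] while cos/sin are [batch, seq, head_dim], and unsqueeze_dim=1 lets them broadcast over the head axis (the values here are illustrative):

# Assumes apply_rotary_pos_emb from above is in scope.
import torch

q = torch.randn(1, 24, 16, 64)   # 24 query heads
k = torch.randn(1, 8, 16, 64)    # 8 key/value heads
cos = torch.randn(1, 16, 64)
sin = torch.randn(1, 16, 64)
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)
assert q_rot.shape == q.shape and k_rot.shape == k.shape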
215
+
216
+
217
+ class XmodelMLP(nn.Module):
218
+ def __init__(self, config):
219
+ super().__init__()
220
+ self.config = config
221
+ self.hidden_size = config.hidden_size
222
+ self.intermediate_size = config.intermediate_size
223
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
224
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
225
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
226
+ self.act_fn = ACT2FN[config.hidden_act]
227
+
228
+ def forward(self, x):
229
+ if self.config.pretraining_tp > 1:
230
+ slice = self.intermediate_size // self.config.pretraining_tp
231
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
232
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
233
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
234
+
235
+ gate_proj = torch.cat(
236
+ [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
237
+ )
238
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
239
+
240
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
241
+ down_proj = [
242
+ F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
243
+ ]
244
+ down_proj = sum(down_proj)
245
+ else:
246
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
247
+
248
+ return down_proj
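XmodelMLP is a standard SwiGLU block: down_proj(silu(gate_proj(x)) * up_proj(x)), i.e. 1536 -> 3840 -> 1536 with the uploaded config when pretraining_tp == 1. An equivalent stand-alone sketch:

# Same computation as XmodelMLP.forward in the non-tensor-parallel branch.
import torch
import torch.nn.functional as F
from torch import nn

gate = nn.Linear(1536, 3840, bias=False)
up = nn.Linear(1536, 3840, bias=False)
down = nn.Linear(3840, 1536, bias=False)
x = torch.randn(1, 8, 1536)
y = down(F.silu(gate(x)) * up(x))
print(y.shape)  # torch.Size([1, 8, 1536])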
249
+
250
+
251
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
252
+ """
253
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
254
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
255
+ """
256
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
257
+ if n_rep == 1:
258
+ return hidden_states
259
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
260
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
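With 24 query heads and 8 key/value heads, each KV head serves a group of 3 query heads; repeat_kv expands the KV tensors to match before the attention matmul. A tiny sketch, assuming the function above is in scope:

import torch

kv = torch.randn(1, 8, 16, 64)       # [batch, num_key_value_heads, seq, head_dim]
expanded = repeat_kv(kv, 24 // 8)     # n_rep = num_attention_heads // num_key_value_heads
assert expanded.shape == (1, 24, 16, 64)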
261
+
262
+
263
+ class XmodelAttention(nn.Module):
264
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
265
+
266
+ def __init__(self, config: XmodelConfig, layer_idx: Optional[int] = None):
267
+ super().__init__()
268
+ self.config = config
269
+ self.layer_idx = layer_idx
270
+ if layer_idx is None:
271
+ logger.warning_once(
272
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
273
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
274
+ "when creating this class."
275
+ )
276
+
277
+ self.attention_dropout = config.attention_dropout
278
+ self.hidden_size = config.hidden_size
279
+ self.num_heads = config.num_attention_heads
280
+ self.head_dim = self.hidden_size // self.num_heads
281
+ self.num_key_value_heads = config.num_key_value_heads
282
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
283
+ self.max_position_embeddings = config.max_position_embeddings
284
+ self.rope_theta = config.rope_theta
285
+ self.is_causal = True
286
+
287
+ if (self.head_dim * self.num_heads) != self.hidden_size:
288
+ raise ValueError(
289
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
290
+ f" and `num_heads`: {self.num_heads})."
291
+ )
292
+
293
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
294
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
295
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
296
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
297
+
298
+ # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers)
299
+ self.rotary_emb = XmodelRotaryEmbedding(config=self.config)
300
+
301
+ def forward(
302
+ self,
303
+ hidden_states: torch.Tensor,
304
+ attention_mask: Optional[torch.Tensor] = None,
305
+ position_ids: Optional[torch.LongTensor] = None,
306
+ past_key_value: Optional[Cache] = None,
307
+ output_attentions: bool = False,
308
+ use_cache: bool = False,
309
+ cache_position: Optional[torch.LongTensor] = None,
310
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
311
+ **kwargs,
312
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
313
+ bsz, q_len, _ = hidden_states.size()
314
+
315
+ if self.config.pretraining_tp > 1:
316
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
317
+ query_slices = self.q_proj.weight.split(
318
+ (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
319
+ )
320
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
321
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
322
+
323
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
324
+ query_states = torch.cat(query_states, dim=-1)
325
+
326
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
327
+ key_states = torch.cat(key_states, dim=-1)
328
+
329
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
330
+ value_states = torch.cat(value_states, dim=-1)
331
+
332
+ else:
333
+ query_states = self.q_proj(hidden_states)
334
+ key_states = self.k_proj(hidden_states)
335
+ value_states = self.v_proj(hidden_states)
336
+
337
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
338
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
339
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
340
+
341
+ if position_embeddings is None:
342
+ logger.warning_once(
343
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
344
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
345
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
346
+ "removed and `position_embeddings` will be mandatory."
347
+ )
348
+ cos, sin = self.rotary_emb(value_states, position_ids)
349
+ else:
350
+ cos, sin = position_embeddings
351
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
352
+
353
+ if past_key_value is not None:
354
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
355
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
356
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
357
+
358
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
359
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
360
+
361
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
362
+
363
+ if attention_mask is not None: # no matter the length, we just slice it
364
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
365
+ attn_weights = attn_weights + causal_mask
366
+
367
+ # upcast attention to fp32
368
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
369
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
370
+ attn_output = torch.matmul(attn_weights, value_states)
371
+
372
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
373
+ raise ValueError(
374
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
375
+ f" {attn_output.size()}"
376
+ )
377
+
378
+ attn_output = attn_output.transpose(1, 2).contiguous()
379
+
380
+ attn_output = attn_output.reshape(bsz, q_len, -1)
381
+
382
+ if self.config.pretraining_tp > 1:
383
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
384
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
385
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
386
+ else:
387
+ attn_output = self.o_proj(attn_output)
388
+
389
+ if not output_attentions:
390
+ attn_weights = None
391
+
392
+ return attn_output, attn_weights, past_key_value
393
+
394
+
395
+ class XmodelFlashAttention2(XmodelAttention):
396
+ """
397
+ Xmodel flash attention module. This module inherits from `XmodelAttention` as the weights of the module stay
398
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
399
+ flash attention and deal with padding tokens in case the input contains any of them.
400
+ """
401
+
402
+ def __init__(self, *args, **kwargs):
403
+ super().__init__(*args, **kwargs)
404
+
405
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
406
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
407
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
408
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
409
+
410
+ def forward(
411
+ self,
412
+ hidden_states: torch.Tensor,
413
+ attention_mask: Optional[torch.LongTensor] = None,
414
+ position_ids: Optional[torch.LongTensor] = None,
415
+ past_key_value: Optional[Cache] = None,
416
+ output_attentions: bool = False,
417
+ use_cache: bool = False,
418
+ cache_position: Optional[torch.LongTensor] = None,
419
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
420
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
421
+ if isinstance(past_key_value, StaticCache):
422
+ raise ValueError(
423
+ "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
424
+ "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
425
+ )
426
+
427
+ output_attentions = False
428
+
429
+ bsz, q_len, _ = hidden_states.size()
430
+
431
+ query_states = self.q_proj(hidden_states)
432
+ key_states = self.k_proj(hidden_states)
433
+ value_states = self.v_proj(hidden_states)
434
+
435
+ # Flash attention requires the input to have the shape
436
+ # batch_size x seq_length x head_dim x hidden_dim
437
+ # therefore we just need to keep the original shape
438
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
439
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
440
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
441
+
442
+ if position_embeddings is None:
443
+ logger.warning_once(
444
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
445
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
446
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
447
+ "removed and `position_embeddings` will be mandatory."
448
+ )
449
+ cos, sin = self.rotary_emb(value_states, position_ids)
450
+ else:
451
+ cos, sin = position_embeddings
452
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
453
+
454
+ if past_key_value is not None:
455
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
456
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
457
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
458
+
459
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
460
+ # to be able to avoid many of these transpose/reshape/view.
461
+ query_states = query_states.transpose(1, 2)
462
+ key_states = key_states.transpose(1, 2)
463
+ value_states = value_states.transpose(1, 2)
464
+
465
+ dropout_rate = self.attention_dropout if self.training else 0.0
466
+
467
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
468
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
469
+ # cast them back in the correct dtype just to be sure everything works as expected.
470
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
471
+ # in fp32. (XmodelRMSNorm handles it correctly)
472
+
473
+ input_dtype = query_states.dtype
474
+ if input_dtype == torch.float32:
475
+ if torch.is_autocast_enabled():
476
+ target_dtype = torch.get_autocast_gpu_dtype()
477
+ # Handle the case where the model is quantized
478
+ elif hasattr(self.config, "_pre_quantization_dtype"):
479
+ target_dtype = self.config._pre_quantization_dtype
480
+ else:
481
+ target_dtype = self.q_proj.weight.dtype
482
+
483
+ logger.warning_once(
484
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
485
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
486
+ f" {target_dtype}."
487
+ )
488
+
489
+ query_states = query_states.to(target_dtype)
490
+ key_states = key_states.to(target_dtype)
491
+ value_states = value_states.to(target_dtype)
492
+
493
+ attn_output = _flash_attention_forward(
494
+ query_states,
495
+ key_states,
496
+ value_states,
497
+ attention_mask,
498
+ q_len,
499
+ dropout=dropout_rate,
500
+ sliding_window=getattr(self, "sliding_window", None),
501
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
502
+ is_causal=self.is_causal,
503
+ )
504
+
505
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
506
+ attn_output = self.o_proj(attn_output)
507
+
508
+ if not output_attentions:
509
+ attn_weights = None
510
+
511
+ return attn_output, attn_weights, past_key_value
512
+
513
+
514
+ class XmodelSdpaAttention(XmodelAttention):
515
+ """
516
+ Xmodel attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
517
+ `XmodelAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
518
+ SDPA API.
519
+ """
520
+
521
+ # Adapted from XmodelAttention.forward
522
+ def forward(
523
+ self,
524
+ hidden_states: torch.Tensor,
525
+ attention_mask: Optional[torch.Tensor] = None,
526
+ position_ids: Optional[torch.LongTensor] = None,
527
+ past_key_value: Optional[Cache] = None,
528
+ output_attentions: bool = False,
529
+ use_cache: bool = False,
530
+ cache_position: Optional[torch.LongTensor] = None,
531
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
532
+ **kwargs,
533
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
534
+ if output_attentions:
535
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
536
+ logger.warning_once(
537
+ "XmodelModel is using XmodelSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
538
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
539
+ )
540
+ return super().forward(
541
+ hidden_states=hidden_states,
542
+ attention_mask=attention_mask,
543
+ position_ids=position_ids,
544
+ past_key_value=past_key_value,
545
+ output_attentions=output_attentions,
546
+ use_cache=use_cache,
547
+ cache_position=cache_position,
548
+ position_embeddings=position_embeddings,
549
+ )
550
+
551
+ bsz, q_len, _ = hidden_states.size()
552
+
553
+ query_states = self.q_proj(hidden_states)
554
+ key_states = self.k_proj(hidden_states)
555
+ value_states = self.v_proj(hidden_states)
556
+
557
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
558
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
559
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
560
+
561
+ if position_embeddings is None:
562
+ logger.warning_once(
563
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
564
+ "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
565
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
566
+ "removed and `position_embeddings` will be mandatory."
567
+ )
568
+ cos, sin = self.rotary_emb(value_states, position_ids)
569
+ else:
570
+ cos, sin = position_embeddings
571
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
572
+
573
+ if past_key_value is not None:
574
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
575
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
576
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
577
+
578
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
579
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
580
+
581
+ causal_mask = attention_mask
582
+ if attention_mask is not None:
583
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
584
+
585
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
586
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
587
+ if query_states.device.type == "cuda" and causal_mask is not None:
588
+ query_states = query_states.contiguous()
589
+ key_states = key_states.contiguous()
590
+ value_states = value_states.contiguous()
591
+
592
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
593
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
594
+ is_causal = True if causal_mask is None and q_len > 1 else False
595
+
596
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
597
+ query_states,
598
+ key_states,
599
+ value_states,
600
+ attn_mask=causal_mask,
601
+ dropout_p=self.attention_dropout if self.training else 0.0,
602
+ is_causal=is_causal,
603
+ )
604
+
605
+ attn_output = attn_output.transpose(1, 2).contiguous()
606
+ attn_output = attn_output.view(bsz, q_len, -1)
607
+
608
+ attn_output = self.o_proj(attn_output)
609
+
610
+ return attn_output, None, past_key_value
611
+
612
+
613
+ XMODEL_ATTENTION_CLASSES = {
614
+ "eager": XmodelAttention,
615
+ "flash_attention_2": XmodelFlashAttention2,
616
+ "sdpa": XmodelSdpaAttention,
617
+ }
618
+
619
+
620
+ class XmodelDecoderLayer(nn.Module):
621
+ def __init__(self, config: XmodelConfig, layer_idx: int):
622
+ super().__init__()
623
+ self.hidden_size = config.hidden_size
624
+
625
+ self.self_attn = XMODEL_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
626
+
627
+ self.mlp = XmodelMLP(config)
628
+ self.input_layernorm = XmodelRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
629
+ self.post_attention_layernorm = XmodelRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
630
+
631
+ self.scale_depth = config.scale_depth
632
+ self.num_hidden_layers = config.num_hidden_layers
633
+
634
+ def forward(
635
+ self,
636
+ hidden_states: torch.Tensor,
637
+ attention_mask: Optional[torch.Tensor] = None,
638
+ position_ids: Optional[torch.LongTensor] = None,
639
+ past_key_value: Optional[Cache] = None,
640
+ output_attentions: Optional[bool] = False,
641
+ use_cache: Optional[bool] = False,
642
+ cache_position: Optional[torch.LongTensor] = None,
643
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
644
+ **kwargs,
645
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
646
+ """
647
+ Args:
648
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
649
+ attention_mask (`torch.FloatTensor`, *optional*):
650
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
651
+ query_sequence_length, key_sequence_length)` if default attention is used.
652
+ output_attentions (`bool`, *optional*):
653
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
654
+ returned tensors for more detail.
655
+ use_cache (`bool`, *optional*):
656
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
657
+ (see `past_key_values`).
658
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
659
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
660
+ Indices depicting the position of the input sequence tokens in the sequence
661
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
662
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
663
+ with `head_dim` being the embedding dimension of each attention head.
664
+ kwargs (`dict`, *optional*):
665
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
666
+ into the model
667
+ """
668
+ residual = hidden_states
669
+
670
+ hidden_states = self.input_layernorm(hidden_states)
671
+
672
+ # Self Attention
673
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
674
+ hidden_states=hidden_states,
675
+ attention_mask=attention_mask,
676
+ position_ids=position_ids,
677
+ past_key_value=past_key_value,
678
+ output_attentions=output_attentions,
679
+ use_cache=use_cache,
680
+ cache_position=cache_position,
681
+ position_embeddings=position_embeddings,
682
+ **kwargs,
683
+ )
684
+
685
+ hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
686
+
687
+ # Fully Connected
688
+ residual = hidden_states
689
+ hidden_states = self.post_attention_layernorm(hidden_states)
690
+ hidden_states = self.mlp(hidden_states)
691
+ hidden_states = residual + hidden_states * (self.scale_depth / math.sqrt(self.num_hidden_layers))
692
+
693
+ outputs = (hidden_states,)
694
+
695
+ if output_attentions:
696
+ outputs += (self_attn_weights,)
697
+
698
+ if use_cache:
699
+ outputs += (present_key_value,)
700
+
701
+ return outputs
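Unlike a vanilla pre-norm block, both residual branches above are damped by scale_depth / sqrt(num_hidden_layers); with the uploaded config that factor works out to:

import math
print(1.4 / math.sqrt(48))  # ≈ 0.2021, applied to both the attention and MLP outputs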
702
+
703
+
704
+ XMODEL_START_DOCSTRING = r"""
705
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
706
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
707
+ etc.)
708
+
709
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
710
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
711
+ and behavior.
712
+
713
+ Parameters:
714
+ config ([`XmodelConfig`]):
715
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
716
+ load the weights associated with the model, only the configuration. Check out the
717
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
718
+ """
719
+
720
+
721
+ @add_start_docstrings(
722
+ "The bare Xmodel Model outputting raw hidden-states without any specific head on top.",
723
+ XMODEL_START_DOCSTRING,
724
+ )
725
+ class XmodelPreTrainedModel(PreTrainedModel):
726
+ config_class = XmodelConfig
727
+ base_model_prefix = "model"
728
+ supports_gradient_checkpointing = True
729
+ _no_split_modules = ["XmodelDecoderLayer"]
730
+ _skip_keys_device_placement = ["past_key_values"]
731
+ _supports_flash_attn_2 = True
732
+ _supports_sdpa = True
733
+ _supports_cache_class = True
734
+ _supports_quantized_cache = True
735
+ _supports_static_cache = True
736
+
737
+ def _init_weights(self, module):
738
+ std = self.config.initializer_range
739
+ depth_std = std / math.sqrt(self.config.hidden_size / self.config.dim_model_base)
740
+ if isinstance(module, nn.Linear):
741
+ module.weight.data.normal_(mean=0.0, std=depth_std)
742
+ if module.bias is not None:
743
+ module.bias.data.zero_()
744
+ elif isinstance(module, nn.Embedding):
745
+ module.weight.data.normal_(mean=0.0, std=std)
746
+ if module.padding_idx is not None:
747
+ module.weight.data[module.padding_idx].zero_()
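_init_weights shrinks the Linear-layer init by sqrt(hidden_size / dim_model_base) while embeddings keep the raw initializer_range; with the uploaded config:

import math
print(0.1 / math.sqrt(1536 / 256))  # depth_std ≈ 0.0408 for nn.Linear; 0.1 for nn.Embedding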
748
+
749
+
750
+ XMODEL_INPUTS_DOCSTRING = r"""
751
+ Args:
752
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
753
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
754
+ it.
755
+
756
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
757
+ [`PreTrainedTokenizer.__call__`] for details.
758
+
759
+ [What are input IDs?](../glossary#input-ids)
760
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
761
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
762
+
763
+ - 1 for tokens that are **not masked**,
764
+ - 0 for tokens that are **masked**.
765
+
766
+ [What are attention masks?](../glossary#attention-mask)
767
+
768
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
769
+ [`PreTrainedTokenizer.__call__`] for details.
770
+
771
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
772
+ `past_key_values`).
773
+
774
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
775
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
776
+ information on the default strategy.
777
+
778
+ - 1 indicates the head is **not masked**,
779
+ - 0 indicates the head is **masked**.
780
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
781
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
782
+ config.n_positions - 1]`.
783
+
784
+ [What are position IDs?](../glossary#position-ids)
785
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
786
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
787
+ blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
788
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
789
+
790
+ Two formats are allowed:
791
+ - a [`~cache_utils.Cache`] instance;
792
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
793
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
794
+ cache format.
795
+
796
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
797
+ legacy cache format will be returned.
798
+
799
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
800
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
801
+ of shape `(batch_size, sequence_length)`.
802
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
803
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
804
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
805
+ model's internal embedding lookup matrix.
806
+ use_cache (`bool`, *optional*):
807
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
808
+ `past_key_values`).
809
+ output_attentions (`bool`, *optional*):
810
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
811
+ tensors for more detail.
812
+ output_hidden_states (`bool`, *optional*):
813
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
814
+ more detail.
815
+ return_dict (`bool`, *optional*):
816
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
817
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
818
+ Indices depicting the position of the input sequence tokens in the sequence. Unlike `position_ids`,
819
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
820
+ the complete sequence length.
821
+ """
822
+
823
+
824
+ @add_start_docstrings(
825
+ "The bare Xmodel Model outputting raw hidden-states without any specific head on top.",
826
+ XMODEL_START_DOCSTRING,
827
+ )
828
+ class XmodelModel(XmodelPreTrainedModel):
829
+ """
830
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`XmodelDecoderLayer`]
831
+
832
+ Args:
833
+ config: XmodelConfig
834
+ """
835
+
836
+ def __init__(self, config: XmodelConfig):
837
+ super().__init__(config)
838
+ self.padding_idx = config.pad_token_id
839
+ self.vocab_size = config.vocab_size
840
+
841
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
842
+ self.layers = nn.ModuleList(
843
+ [XmodelDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
844
+ )
845
+ self.norm = XmodelRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
846
+ self.rotary_emb = XmodelRotaryEmbedding(config=config)
847
+ self.gradient_checkpointing = False
848
+
849
+ # Initialize weights and apply final processing
850
+ self.post_init()
851
+
852
+ def get_input_embeddings(self):
853
+ return self.embed_tokens
854
+
855
+ def set_input_embeddings(self, value):
856
+ self.embed_tokens = value
857
+
858
+ @add_start_docstrings_to_model_forward(XMODEL_INPUTS_DOCSTRING)
859
+ def forward(
860
+ self,
861
+ input_ids: torch.LongTensor = None,
862
+ attention_mask: Optional[torch.Tensor] = None,
863
+ position_ids: Optional[torch.LongTensor] = None,
864
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
865
+ inputs_embeds: Optional[torch.FloatTensor] = None,
866
+ use_cache: Optional[bool] = None,
867
+ output_attentions: Optional[bool] = None,
868
+ output_hidden_states: Optional[bool] = None,
869
+ return_dict: Optional[bool] = None,
870
+ cache_position: Optional[torch.LongTensor] = None,
871
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
872
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
873
+ output_hidden_states = (
874
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
875
+ )
876
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
877
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
878
+
879
+ if (input_ids is None) ^ (inputs_embeds is not None):
880
+ raise ValueError(
881
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
882
+ )
883
+
884
+ if self.gradient_checkpointing and self.training and use_cache:
885
+ logger.warning_once(
886
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
887
+ )
888
+ use_cache = False
889
+
890
+ if inputs_embeds is None:
891
+ inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb
892
+
893
+ return_legacy_cache = False
894
+ if (
895
+ use_cache and not isinstance(past_key_values, Cache) and not self.training
896
+ ): # kept for BC (non `Cache` `past_key_values` inputs)
897
+ return_legacy_cache = True
898
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
899
+ logger.warning_once(
900
+ "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. "
901
+ "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)"
902
+ )
903
+
904
+ if cache_position is None:
905
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
906
+ cache_position = torch.arange(
907
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
908
+ )
909
+ if position_ids is None:
910
+ position_ids = cache_position.unsqueeze(0)
911
+
912
+ causal_mask = self._update_causal_mask(
913
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
914
+ )
915
+ hidden_states = inputs_embeds
916
+
917
+ # create position embeddings to be shared across the decoder layers
918
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
919
+
920
+ # decoder layers
921
+ all_hidden_states = () if output_hidden_states else None
922
+ all_self_attns = () if output_attentions else None
923
+ next_decoder_cache = None
924
+
925
+ for decoder_layer in self.layers:
926
+ if output_hidden_states:
927
+ all_hidden_states += (hidden_states,)
928
+
929
+ if self.gradient_checkpointing and self.training:
930
+ layer_outputs = self._gradient_checkpointing_func(
931
+ decoder_layer.__call__,
932
+ hidden_states,
933
+ causal_mask,
934
+ position_ids,
935
+ past_key_values,
936
+ output_attentions,
937
+ use_cache,
938
+ cache_position,
939
+ position_embeddings,
940
+ )
941
+ else:
942
+ layer_outputs = decoder_layer(
943
+ hidden_states,
944
+ attention_mask=causal_mask,
945
+ position_ids=position_ids,
946
+ past_key_value=past_key_values,
947
+ output_attentions=output_attentions,
948
+ use_cache=use_cache,
949
+ cache_position=cache_position,
950
+ position_embeddings=position_embeddings,
951
+ )
952
+
953
+ hidden_states = layer_outputs[0]
954
+
955
+ if use_cache:
956
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
957
+
958
+ if output_attentions:
959
+ all_self_attns += (layer_outputs[1],)
960
+
961
+ hidden_states = self.norm(hidden_states)
962
+
963
+ # add hidden states from the last decoder layer
964
+ if output_hidden_states:
965
+ all_hidden_states += (hidden_states,)
966
+
967
+ next_cache = next_decoder_cache if use_cache else None
968
+ if return_legacy_cache:
969
+ next_cache = next_cache.to_legacy_cache()
970
+
971
+ if not return_dict:
972
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
973
+ return BaseModelOutputWithPast(
974
+ last_hidden_state=hidden_states,
975
+ past_key_values=next_cache,
976
+ hidden_states=all_hidden_states,
977
+ attentions=all_self_attns,
978
+ )
979
+
980
+ def _update_causal_mask(
981
+ self,
982
+ attention_mask: torch.Tensor,
983
+ input_tensor: torch.Tensor,
984
+ cache_position: torch.Tensor,
985
+ past_key_values: Cache,
986
+ output_attentions: bool,
987
+ ):
988
+ # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
989
+ # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes.
990
+ # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using
991
+ # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114
992
+
993
+ if self.config._attn_implementation == "flash_attention_2":
994
+ if attention_mask is not None and 0.0 in attention_mask:
995
+ return attention_mask
996
+ return None
997
+
998
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
999
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1000
+ # to infer the attention mask.
1001
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1002
+ using_static_cache = isinstance(past_key_values, StaticCache)
1003
+
1004
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
1005
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
1006
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
1007
+ attention_mask,
1008
+ inputs_embeds=input_tensor,
1009
+ past_key_values_length=past_seen_tokens,
1010
+ is_training=self.training,
1011
+ ):
1012
+ return None
1013
+
1014
+ dtype, device = input_tensor.dtype, input_tensor.device
1015
+ min_dtype = torch.finfo(dtype).min
1016
+ sequence_length = input_tensor.shape[1]
1017
+ if using_static_cache:
1018
+ target_length = past_key_values.get_max_length()
1019
+ else:
1020
+ target_length = (
1021
+ attention_mask.shape[-1]
1022
+ if isinstance(attention_mask, torch.Tensor)
1023
+ else past_seen_tokens + sequence_length + 1
1024
+ )
1025
+
1026
+ if attention_mask is not None and attention_mask.dim() == 4:
1027
+ # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
1028
+ if attention_mask.max() != 0:
1029
+ raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0")
1030
+ causal_mask = attention_mask
1031
+ else:
1032
+ causal_mask = torch.full(
1033
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
1034
+ )
1035
+ if sequence_length != 1:
1036
+ causal_mask = torch.triu(causal_mask, diagonal=1)
1037
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1038
+ causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
1039
+ if attention_mask is not None:
1040
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1041
+ mask_length = attention_mask.shape[-1]
1042
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
1043
+ padding_mask = padding_mask == 0
1044
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1045
+ padding_mask, min_dtype
1046
+ )
1047
+ if (
1048
+ self.config._attn_implementation == "sdpa"
1049
+ and attention_mask is not None
1050
+ and attention_mask.device.type == "cuda"
1051
+ and not output_attentions
1052
+ ):
1053
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1054
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1055
+ # Details: https://github.com/pytorch/pytorch/issues/110213
1056
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
1057
+
1058
+ return causal_mask
1059
+
1060
+
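A toy, self-contained reproduction of the additive mask that `_update_causal_mask` assembles above for a prefill step with no padding (the sizes are assumptions): entries equal to zero may be attended to, entries equal to `min_dtype` are blocked.

```python
import torch

# Toy prefill case: 4 query tokens, 4 key/value positions, no padding mask.
dtype = torch.float32
min_dtype = torch.finfo(dtype).min
sequence_length, target_length = 4, 4
cache_position = torch.arange(sequence_length)

causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
causal_mask = torch.triu(causal_mask, diagonal=1)          # block strictly-future positions
causal_mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)
causal_mask = causal_mask[None, None, :, :]                # (batch, heads, q_len, kv_len)

print((causal_mask == 0).int()[0, 0])
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]], dtype=torch.int32)
```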
1061
+ class XmodelForCausalLM(XmodelPreTrainedModel, GenerationMixin):
1062
+ _tied_weights_keys = ["lm_head.weight"]
1063
+
1064
+ def __init__(self, config):
1065
+ super().__init__(config)
1066
+ self.model = XmodelModel(config)
1067
+ self.vocab_size = config.vocab_size
1068
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1069
+
1070
+ # Initialize weights and apply final processing
1071
+ self.post_init()
1072
+
1073
+ def get_input_embeddings(self):
1074
+ return self.model.embed_tokens
1075
+
1076
+ def set_input_embeddings(self, value):
1077
+ self.model.embed_tokens = value
1078
+
1079
+ def get_output_embeddings(self):
1080
+ return self.lm_head
1081
+
1082
+ def set_output_embeddings(self, new_embeddings):
1083
+ self.lm_head = new_embeddings
1084
+
1085
+ def set_decoder(self, decoder):
1086
+ self.model = decoder
1087
+
1088
+ def get_decoder(self):
1089
+ return self.model
1090
+
1091
+ @add_start_docstrings_to_model_forward(XMODEL_INPUTS_DOCSTRING)
1092
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1093
+ def forward(
1094
+ self,
1095
+ input_ids: torch.LongTensor = None,
1096
+ attention_mask: Optional[torch.Tensor] = None,
1097
+ position_ids: Optional[torch.LongTensor] = None,
1098
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1099
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1100
+ labels: Optional[torch.LongTensor] = None,
1101
+ use_cache: Optional[bool] = None,
1102
+ output_attentions: Optional[bool] = None,
1103
+ output_hidden_states: Optional[bool] = None,
1104
+ return_dict: Optional[bool] = None,
1105
+ cache_position: Optional[torch.LongTensor] = None,
1106
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1107
+ r"""
1108
+ Args:
1109
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1110
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1111
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1112
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1113
+
1114
+ Returns:
1115
+
1116
+ Example:
1117
+
1118
+ ```python
1119
+ >>> from transformers import AutoTokenizer, AutoModelForCausalLM
1120
+
1121
+ >>> model = AutoModelForCausalLM.from_pretrained("XiaoduoAILab/Xmodel_LM")
1122
+ >>> tokenizer = AutoTokenizer.from_pretrained("XiaoduoAILab/Xmodel_LM")
1123
+
1124
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1125
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1126
+
1127
+ >>> # Generate
1128
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1129
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1130
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1131
+ ```"""
1132
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1133
+ output_hidden_states = (
1134
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1135
+ )
1136
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1137
+
1138
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1139
+ outputs = self.model(
1140
+ input_ids=input_ids,
1141
+ attention_mask=attention_mask,
1142
+ position_ids=position_ids,
1143
+ past_key_values=past_key_values,
1144
+ inputs_embeds=inputs_embeds,
1145
+ use_cache=use_cache,
1146
+ output_attentions=output_attentions,
1147
+ output_hidden_states=output_hidden_states,
1148
+ return_dict=return_dict,
1149
+ cache_position=cache_position,
1150
+ )
1151
+
1152
+ hidden_states = outputs[0]
1153
+ if self.config.pretraining_tp > 1:
1154
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
1155
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
1156
+ logits = torch.cat(logits, dim=-1)
1157
+ else:
1158
+ logits = self.lm_head(hidden_states / (self.config.hidden_size / self.config.dim_model_base))
1159
+ logits = logits.float()
1160
+
1161
+ loss = None
1162
+ if labels is not None:
1163
+ # Shift so that tokens < n predict n
1164
+ shift_logits = logits[..., :-1, :].contiguous()
1165
+ shift_labels = labels[..., 1:].contiguous()
1166
+ # Flatten the tokens
1167
+ loss_fct = CrossEntropyLoss()
1168
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1169
+ shift_labels = shift_labels.view(-1)
1170
+ # Enable model parallelism
1171
+ shift_labels = shift_labels.to(shift_logits.device)
1172
+ loss = loss_fct(shift_logits, shift_labels)
1173
+
1174
+ if not return_dict:
1175
+ output = (logits,) + outputs[1:]
1176
+ return (loss,) + output if loss is not None else output
1177
+
1178
+ return CausalLMOutputWithPast(
1179
+ loss=loss,
1180
+ logits=logits,
1181
+ past_key_values=outputs.past_key_values,
1182
+ hidden_states=outputs.hidden_states,
1183
+ attentions=outputs.attentions,
1184
+ )
1185
+
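Two details of the forward pass above are worth noting: the hidden states are scaled by `hidden_size / dim_model_base` before the LM head, and the loss is the usual shift-by-one next-token objective. A minimal standalone sketch of the label shifting, with random logits and made-up labels:

```python
import torch
from torch.nn import CrossEntropyLoss

# Toy shapes: batch=1, seq_len=5, vocab=10 (made-up values).
vocab_size = 10
logits = torch.randn(1, 5, vocab_size)       # what the LM head would produce
labels = torch.tensor([[3, 1, 4, 1, 5]])     # same shape as input_ids

# Token t predicts token t+1, so drop the last logit and the first label.
shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)

loss = CrossEntropyLoss()(shift_logits, shift_labels)
print(loss.item())
```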
1186
+ def prepare_inputs_for_generation(
1187
+ self,
1188
+ input_ids,
1189
+ past_key_values=None,
1190
+ attention_mask=None,
1191
+ inputs_embeds=None,
1192
+ cache_position=None,
1193
+ position_ids=None,
1194
+ use_cache=True,
1195
+ **kwargs,
1196
+ ):
1197
+ # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
1198
+ # Exception 1: when passing input_embeds, input_ids may be missing entries
1199
+ # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
1200
+ if past_key_values is not None:
1201
+ if inputs_embeds is not None: # Exception 1
1202
+ input_ids = input_ids[:, -cache_position.shape[0]:]
1203
+ elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2)
1204
+ input_ids = input_ids[:, cache_position]
1205
+
1206
+ if attention_mask is not None and position_ids is None:
1207
+ # create position_ids on the fly for batch generation
1208
+ position_ids = attention_mask.long().cumsum(-1) - 1
1209
+ position_ids.masked_fill_(attention_mask == 0, 1)
1210
+ if past_key_values:
1211
+ position_ids = position_ids[:, -input_ids.shape[1]:]
1212
+
1213
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1214
+ if inputs_embeds is not None and cache_position[0] == 0:
1215
+ model_inputs = {"inputs_embeds": inputs_embeds}
1216
+ else:
1217
+ model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases
1218
+
1219
+ model_inputs.update(
1220
+ {
1221
+ "position_ids": position_ids,
1222
+ "cache_position": cache_position,
1223
+ "past_key_values": past_key_values,
1224
+ "use_cache": use_cache,
1225
+ "attention_mask": attention_mask,
1226
+ }
1227
+ )
1228
+ return model_inputs
1229
+
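A standalone illustration of the on-the-fly `position_ids` built in `prepare_inputs_for_generation` above for a left-padded batch; the attention mask below is a toy input, with 0 marking padding.

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

# Same recipe as above: cumulative sum over real tokens, padding positions set to 1.
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)

print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```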
1230
+ def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
1231
+ # start with all of the candidate parameters
1232
+ param_dict = {pn: p for pn, p in self.named_parameters()}
1233
+ # filter out those that do not require grad
1234
+ param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
1235
+ # create optim groups. Any parameter that is at least 2D will be weight decayed; all others will not.
1236
+ # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
1237
+ decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
1238
+ nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
1239
+ optim_groups = [
1240
+ {'params': decay_params, 'weight_decay': weight_decay},
1241
+ {'params': nodecay_params, 'weight_decay': 0.0}
1242
+ ]
1243
+ num_decay_params = sum(p.numel() for p in decay_params)
1244
+ num_nodecay_params = sum(p.numel() for p in nodecay_params)
1245
+ # print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
1246
+ # print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
1247
+ # Create AdamW optimizer and use the fused version if it is available
1248
+ fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
1249
+ use_fused = fused_available and device_type == 'cuda'
1250
+ extra_args = dict(fused=True) if use_fused else dict()
1251
+ optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
1252
+ # print(f"using fused AdamW: {use_fused}")
1253
+
1254
+ return optimizer
1255
+
1256
+
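A toy sketch of the parameter grouping used by `configure_optimizers` above, applied to a made-up two-layer module: tensors with two or more dimensions receive weight decay, 1-D tensors (biases, norm scales) do not. The hyperparameter values are placeholders.

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.LayerNorm(8))

param_dict = {n: p for n, p in model.named_parameters() if p.requires_grad}
decay_params = [p for p in param_dict.values() if p.dim() >= 2]    # matmul weights, embeddings
nodecay_params = [p for p in param_dict.values() if p.dim() < 2]   # biases, norm scales

optimizer = torch.optim.AdamW(
    [{"params": decay_params, "weight_decay": 0.1},
     {"params": nodecay_params, "weight_decay": 0.0}],
    lr=1e-4, betas=(0.9, 0.95),
)
print(sum(p.numel() for p in decay_params), sum(p.numel() for p in nodecay_params))
```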
1257
+ @add_start_docstrings(
1258
+ """
1259
+ The Xmodel Model transformer with a sequence classification head on top (linear layer).
1260
+
1261
+ [`XmodelForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1262
+ (e.g. GPT-2) do.
1263
+
1264
+ Since it does classification on the last token, it needs to know the position of the last token. If a
1265
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1266
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1267
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1268
+ each row of the batch).
1269
+ """,
1270
+ XMODEL_START_DOCSTRING,
1271
+ )
1272
+ class XmodelForSequenceClassification(XmodelPreTrainedModel):
1273
+ def __init__(self, config):
1274
+ super().__init__(config)
1275
+ self.num_labels = config.num_labels
1276
+ self.model = XmodelModel(config)
1277
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1278
+
1279
+ # Initialize weights and apply final processing
1280
+ self.post_init()
1281
+
1282
+ def get_input_embeddings(self):
1283
+ return self.model.embed_tokens
1284
+
1285
+ def set_input_embeddings(self, value):
1286
+ self.model.embed_tokens = value
1287
+
1288
+ @add_start_docstrings_to_model_forward(XMODEL_INPUTS_DOCSTRING)
1289
+ def forward(
1290
+ self,
1291
+ input_ids: Optional[torch.LongTensor] = None,
1292
+ attention_mask: Optional[torch.Tensor] = None,
1293
+ position_ids: Optional[torch.LongTensor] = None,
1294
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1295
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1296
+ labels: Optional[torch.LongTensor] = None,
1297
+ use_cache: Optional[bool] = None,
1298
+ output_attentions: Optional[bool] = None,
1299
+ output_hidden_states: Optional[bool] = None,
1300
+ return_dict: Optional[bool] = None,
1301
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1302
+ r"""
1303
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1304
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1305
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1306
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1307
+ """
1308
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1309
+
1310
+ transformer_outputs = self.model(
1311
+ input_ids,
1312
+ attention_mask=attention_mask,
1313
+ position_ids=position_ids,
1314
+ past_key_values=past_key_values,
1315
+ inputs_embeds=inputs_embeds,
1316
+ use_cache=use_cache,
1317
+ output_attentions=output_attentions,
1318
+ output_hidden_states=output_hidden_states,
1319
+ return_dict=return_dict,
1320
+ )
1321
+ hidden_states = transformer_outputs[0]
1322
+ logits = self.score(hidden_states)
1323
+
1324
+ if input_ids is not None:
1325
+ batch_size = input_ids.shape[0]
1326
+ else:
1327
+ batch_size = inputs_embeds.shape[0]
1328
+
1329
+ if self.config.pad_token_id is None and batch_size != 1:
1330
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1331
+ if self.config.pad_token_id is None:
1332
+ sequence_lengths = -1
1333
+ else:
1334
+ if input_ids is not None:
1335
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1336
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1337
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1338
+ sequence_lengths = sequence_lengths.to(logits.device)
1339
+ else:
1340
+ sequence_lengths = -1
1341
+
1342
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1343
+
1344
+ loss = None
1345
+ if labels is not None:
1346
+ labels = labels.to(logits.device)
1347
+ if self.config.problem_type is None:
1348
+ if self.num_labels == 1:
1349
+ self.config.problem_type = "regression"
1350
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1351
+ self.config.problem_type = "single_label_classification"
1352
+ else:
1353
+ self.config.problem_type = "multi_label_classification"
1354
+
1355
+ if self.config.problem_type == "regression":
1356
+ loss_fct = MSELoss()
1357
+ if self.num_labels == 1:
1358
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1359
+ else:
1360
+ loss = loss_fct(pooled_logits, labels)
1361
+ elif self.config.problem_type == "single_label_classification":
1362
+ loss_fct = CrossEntropyLoss()
1363
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1364
+ elif self.config.problem_type == "multi_label_classification":
1365
+ loss_fct = BCEWithLogitsLoss()
1366
+ loss = loss_fct(pooled_logits, labels)
1367
+ if not return_dict:
1368
+ output = (pooled_logits,) + transformer_outputs[1:]
1369
+ return ((loss,) + output) if loss is not None else output
1370
+
1371
+ return SequenceClassifierOutputWithPast(
1372
+ loss=loss,
1373
+ logits=pooled_logits,
1374
+ past_key_values=transformer_outputs.past_key_values,
1375
+ hidden_states=transformer_outputs.hidden_states,
1376
+ attentions=transformer_outputs.attentions,
1377
+ )
1378
+
1379
+
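A standalone sketch of how the sequence-classification head above picks the last non-padding token per row, including the modulo trick for rows that contain no padding. The ids are invented and `pad_token_id` is assumed to be 0 purely for illustration.

```python
import torch

pad_token_id = 0  # assumption for this toy example
input_ids = torch.tensor([[5, 6, 7, 0, 0],
                          [5, 6, 7, 8, 9]])

# Index of the first pad token minus one; rows without padding wrap to the last position.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]

print(sequence_lengths)  # tensor([2, 4]) -> last real token in each row
```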
1380
+ @add_start_docstrings(
1381
+ """
1382
+ The Xmodel Model transformer with a span classification head on top for extractive question-answering tasks like
1383
+ SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1384
+ """,
1385
+ XMODEL_START_DOCSTRING,
1386
+ )
1387
+ class XmodelForQuestionAnswering(XmodelPreTrainedModel):
1388
+ base_model_prefix = "transformer"
1389
+
1390
+ # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Xmodel
1391
+ def __init__(self, config):
1392
+ super().__init__(config)
1393
+ self.transformer = XmodelModel(config)
1394
+ self.qa_outputs = nn.Linear(config.hidden_size, 2)
1395
+
1396
+ # Initialize weights and apply final processing
1397
+ self.post_init()
1398
+
1399
+ def get_input_embeddings(self):
1400
+ return self.transformer.embed_tokens
1401
+
1402
+ def set_input_embeddings(self, value):
1403
+ self.transformer.embed_tokens = value
1404
+
1405
+ @add_start_docstrings_to_model_forward(XMODEL_INPUTS_DOCSTRING)
1406
+ def forward(
1407
+ self,
1408
+ input_ids: Optional[torch.LongTensor] = None,
1409
+ attention_mask: Optional[torch.FloatTensor] = None,
1410
+ position_ids: Optional[torch.LongTensor] = None,
1411
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1412
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1413
+ start_positions: Optional[torch.LongTensor] = None,
1414
+ end_positions: Optional[torch.LongTensor] = None,
1415
+ output_attentions: Optional[bool] = None,
1416
+ output_hidden_states: Optional[bool] = None,
1417
+ return_dict: Optional[bool] = None,
1418
+ ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1419
+ r"""
1420
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1421
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1422
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1423
+ are not taken into account for computing the loss.
1424
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1425
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1426
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1427
+ are not taken into account for computing the loss.
1428
+ """
1429
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1430
+
1431
+ outputs = self.transformer(
1432
+ input_ids,
1433
+ attention_mask=attention_mask,
1434
+ position_ids=position_ids,
1435
+ past_key_values=past_key_values,
1436
+ inputs_embeds=inputs_embeds,
1437
+ output_attentions=output_attentions,
1438
+ output_hidden_states=output_hidden_states,
1439
+ return_dict=return_dict,
1440
+ )
1441
+
1442
+ sequence_output = outputs[0]
1443
+
1444
+ logits = self.qa_outputs(sequence_output)
1445
+ start_logits, end_logits = logits.split(1, dim=-1)
1446
+ start_logits = start_logits.squeeze(-1).contiguous()
1447
+ end_logits = end_logits.squeeze(-1).contiguous()
1448
+
1449
+ total_loss = None
1450
+ if start_positions is not None and end_positions is not None:
1451
+ # On multi-GPU, splitting can add an extra dimension; squeeze it away
1452
+ if len(start_positions.size()) > 1:
1453
+ start_positions = start_positions.squeeze(-1).to(start_logits.device)
1454
+ if len(end_positions.size()) > 1:
1455
+ end_positions = end_positions.squeeze(-1).to(end_logits.device)
1456
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1457
+ ignored_index = start_logits.size(1)
1458
+ start_positions = start_positions.clamp(0, ignored_index)
1459
+ end_positions = end_positions.clamp(0, ignored_index)
1460
+
1461
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1462
+ start_loss = loss_fct(start_logits, start_positions)
1463
+ end_loss = loss_fct(end_logits, end_positions)
1464
+ total_loss = (start_loss + end_loss) / 2
1465
+
1466
+ if not return_dict:
1467
+ output = (start_logits, end_logits) + outputs[2:]
1468
+ return ((total_loss,) + output) if total_loss is not None else output
1469
+
1470
+ return QuestionAnsweringModelOutput(
1471
+ loss=total_loss,
1472
+ start_logits=start_logits,
1473
+ end_logits=end_logits,
1474
+ hidden_states=outputs.hidden_states,
1475
+ attentions=outputs.attentions,
1476
+ )
1477
+
1478
+
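A toy sketch of turning the `start_logits` / `end_logits` produced above into a predicted answer span. The logit values are invented; a real QA pipeline would also restrict the argmax to context tokens and enforce `start <= end`.

```python
import torch

start_logits = torch.tensor([[0.1, 2.5, 0.3, 0.2]])
end_logits = torch.tensor([[0.0, 0.4, 3.1, 0.2]])

start_index = int(start_logits.argmax(-1))
end_index = int(end_logits.argmax(-1))
print(start_index, end_index)  # 1 2 -> the answer covers tokens 1..2 inclusive
```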
1479
+ @add_start_docstrings(
1480
+ """
1481
+ The Xmodel Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1482
+ output) e.g. for Named-Entity-Recognition (NER) tasks.
1483
+ """,
1484
+ XMODEL_START_DOCSTRING,
1485
+ )
1486
+ class XmodelForTokenClassification(XmodelPreTrainedModel):
1487
+ def __init__(self, config):
1488
+ super().__init__(config)
1489
+ self.num_labels = config.num_labels
1490
+ self.model = XmodelModel(config)
1491
+ if getattr(config, "classifier_dropout", None) is not None:
1492
+ classifier_dropout = config.classifier_dropout
1493
+ elif getattr(config, "hidden_dropout", None) is not None:
1494
+ classifier_dropout = config.hidden_dropout
1495
+ else:
1496
+ classifier_dropout = 0.1
1497
+ self.dropout = nn.Dropout(classifier_dropout)
1498
+ self.score = nn.Linear(config.hidden_size, config.num_labels)
1499
+
1500
+ # Initialize weights and apply final processing
1501
+ self.post_init()
1502
+
1503
+ def get_input_embeddings(self):
1504
+ return self.model.embed_tokens
1505
+
1506
+ def set_input_embeddings(self, value):
1507
+ self.model.embed_tokens = value
1508
+
1509
+ @add_start_docstrings_to_model_forward(XMODEL_INPUTS_DOCSTRING)
1510
+ def forward(
1511
+ self,
1512
+ input_ids: Optional[torch.LongTensor] = None,
1513
+ attention_mask: Optional[torch.Tensor] = None,
1514
+ position_ids: Optional[torch.LongTensor] = None,
1515
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1516
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1517
+ labels: Optional[torch.LongTensor] = None,
1518
+ use_cache: Optional[bool] = None,
1519
+ output_attentions: Optional[bool] = None,
1520
+ output_hidden_states: Optional[bool] = None,
1521
+ return_dict: Optional[bool] = None,
1522
+ ) -> Union[Tuple, TokenClassifierOutput]:
1523
+ r"""
1524
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1525
+ Labels for computing the token classification loss. Indices should be in `[0, ...,
1526
+ config.num_labels - 1]`. Tokens with indices set to `-100` are ignored (masked); the loss is
1527
+ only computed for tokens with labels in `[0, ..., config.num_labels - 1]`.
1528
+ """
1529
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1530
+
1531
+ outputs = self.model(
1532
+ input_ids,
1533
+ attention_mask=attention_mask,
1534
+ position_ids=position_ids,
1535
+ past_key_values=past_key_values,
1536
+ inputs_embeds=inputs_embeds,
1537
+ use_cache=use_cache,
1538
+ output_attentions=output_attentions,
1539
+ output_hidden_states=output_hidden_states,
1540
+ return_dict=return_dict,
1541
+ )
1542
+ sequence_output = outputs[0]
1543
+ sequence_output = self.dropout(sequence_output)
1544
+ logits = self.score(sequence_output)
1545
+
1546
+ loss = None
1547
+ if labels is not None:
1548
+ loss_fct = CrossEntropyLoss()
1549
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1550
+
1551
+ if not return_dict:
1552
+ output = (logits,) + outputs[2:]
1553
+ return ((loss,) + output) if loss is not None else output
1554
+
1555
+ return TokenClassifierOutput(
1556
+ loss=loss,
1557
+ logits=logits,
1558
+ hidden_states=outputs.hidden_states,
1559
+ attentions=outputs.attentions,
1560
+ )
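That completes modeling_xmodel.py. Below is a hedged usage sketch (not part of the commit) of loading the checkpoint through the `auto_map` entries in config.json; the repo id is copied from the docstring example above and may need to be replaced with this repository's actual id.

```python
# Assumed usage; trust_remote_code=True lets transformers pick up the custom
# XmodelConfig / XmodelForCausalLM classes registered in config.json's auto_map.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "XiaoduoAILab/Xmodel_LM"  # placeholder taken from the docstring example above

tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```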
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1e2260ee123682efecab8bc6f2e794e7160a1c386d2cb4849b48779b98e5f92
3
+ size 2503658066
tokenization_xmodel.py ADDED
@@ -0,0 +1,249 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import os
22
+ from shutil import copyfile
23
+ from typing import Any, Dict, List, Optional, Tuple
24
+
25
+ import sentencepiece as spm
26
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
27
+ from transformers.utils import logging
28
+
29
+ logger = logging.get_logger(__name__)
30
+
31
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
32
+
33
+ PRETRAINED_VOCAB_FILES_MAP = {
34
+ "vocab_file": {},
35
+ "tokenizer_file": {},
36
+ }
37
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
38
+
39
+
40
+ class XModelTokenizer(PreTrainedTokenizer):
41
+ """
42
+ Construct an XModel tokenizer, backed by a SentencePiece model.
43
+
44
+ Args:
45
+ vocab_file (`str`):
46
+ Path to the vocabulary file.
47
+ """
48
+
49
+ vocab_files_names = VOCAB_FILES_NAMES
50
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
51
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
52
+ model_input_names = ["input_ids", "attention_mask"]
53
+
54
+ def __init__(
55
+ self,
56
+ vocab_file,
57
+ unk_token="<unk>",
58
+ bos_token="<s>",
59
+ eos_token="</s>",
60
+ pad_token=None,
61
+ sp_model_kwargs: Optional[Dict[str, Any]] = None,
62
+ add_bos_token=True,
63
+ add_eos_token=False,
64
+ clean_up_tokenization_spaces=False,
65
+ **kwargs,
66
+ ):
67
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
68
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
69
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
70
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
71
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
72
+ self.vocab_file = vocab_file
73
+ self.add_bos_token = add_bos_token
74
+ self.add_eos_token = add_eos_token
75
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
76
+ self.sp_model.Load(vocab_file)
77
+ super().__init__(
78
+ bos_token=bos_token,
79
+ eos_token=eos_token,
80
+ unk_token=unk_token,
81
+ pad_token=pad_token,
82
+ add_bos_token=add_bos_token,
83
+ add_eos_token=add_eos_token,
84
+ sp_model_kwargs=self.sp_model_kwargs,
85
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
86
+ **kwargs,
87
+ )
88
+
89
+ def __getstate__(self):
90
+ state = self.__dict__.copy()
91
+ state["sp_model"] = None
92
+ return state
93
+
94
+ def __setstate__(self, d):
95
+ self.__dict__ = d
96
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
97
+ self.sp_model.Load(self.vocab_file)
98
+
99
+ @property
100
+ def vocab_size(self):
101
+ """Returns vocab size"""
102
+ return self.sp_model.get_piece_size()
103
+
104
+ def get_vocab(self):
105
+ """Returns vocab as a dict"""
106
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
107
+ vocab.update(self.added_tokens_encoder)
108
+ return vocab
109
+
110
+ def _tokenize(self, text):
111
+ """Returns a tokenized string."""
112
+ return self.sp_model.encode(text, out_type=str)
113
+
114
+ def _convert_token_to_id(self, token):
115
+ """Converts a token (str) to an id using the vocab."""
116
+ return self.sp_model.piece_to_id(token)
117
+
118
+ def _convert_id_to_token(self, index):
119
+ """Converts an index (integer) to a token (str) using the vocab."""
120
+ token = self.sp_model.IdToPiece(index)
121
+ return token
122
+
123
+ def convert_tokens_to_string(self, tokens):
124
+ """Converts a sequence of tokens (strings) to a single string."""
125
+ current_sub_tokens = []
126
+ out_string = ""
127
+ prev_is_special = False
128
+ for i, token in enumerate(tokens):
129
+ # make sure that special tokens are not decoded using sentencepiece model
130
+ if token in self.all_special_tokens:
131
+ if not prev_is_special and i != 0:
132
+ out_string += " "
133
+ out_string += self.sp_model.decode(current_sub_tokens) + token
134
+ prev_is_special = True
135
+ current_sub_tokens = []
136
+ else:
137
+ current_sub_tokens.append(token)
138
+ prev_is_special = False
139
+ out_string += self.sp_model.decode(current_sub_tokens)
140
+ return out_string
141
+
142
+ def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
143
+ """
144
+ Save the vocabulary and special tokens file to a directory.
145
+
146
+ Args:
147
+ save_directory (`str`):
148
+ The directory in which to save the vocabulary.
149
+
150
+ Returns:
151
+ `Tuple(str)`: Paths to the files saved.
152
+ """
153
+ if not os.path.isdir(save_directory):
154
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
155
+ return
156
+ out_vocab_file = os.path.join(
157
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
158
+ )
159
+
160
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
161
+ copyfile(self.vocab_file, out_vocab_file)
162
+ elif not os.path.isfile(self.vocab_file):
163
+ with open(out_vocab_file, "wb") as fi:
164
+ content_spiece_model = self.sp_model.serialized_model_proto()
165
+ fi.write(content_spiece_model)
166
+
167
+ return (out_vocab_file,)
168
+
169
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
170
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
171
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
172
+
173
+ output = bos_token_id + token_ids_0 + eos_token_id
174
+
175
+ if token_ids_1 is not None:
176
+ output = output + bos_token_id + token_ids_1 + eos_token_id
177
+
178
+ return output
179
+
180
+ def get_special_tokens_mask(
181
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
182
+ already_has_special_tokens: bool = False
183
+ ) -> List[int]:
184
+ """
185
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
186
+ special tokens using the tokenizer `prepare_for_model` method.
187
+
188
+ Args:
189
+ token_ids_0 (`List[int]`):
190
+ List of IDs.
191
+ token_ids_1 (`List[int]`, *optional*):
192
+ Optional second list of IDs for sequence pairs.
193
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
194
+ Whether or not the token list is already formatted with special tokens for the model.
195
+
196
+ Returns:
197
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
198
+ """
199
+ if already_has_special_tokens:
200
+ return super().get_special_tokens_mask(
201
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
202
+ )
203
+
204
+ bos_token_id = [1] if self.add_bos_token else []
205
+ eos_token_id = [1] if self.add_eos_token else []
206
+
207
+ if token_ids_1 is None:
208
+ return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
209
+ return (
210
+ bos_token_id
211
+ + ([0] * len(token_ids_0))
212
+ + eos_token_id
213
+ + bos_token_id
214
+ + ([0] * len(token_ids_1))
215
+ + eos_token_id
216
+ )
217
+
218
+ def create_token_type_ids_from_sequences(
219
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
220
+ ) -> List[int]:
221
+ """
222
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
223
+ sequence pair mask has the following format:
224
+
225
+ ```
226
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
227
+ | first sequence | second sequence |
228
+ ```
229
+
230
+ if token_ids_1 is None, only returns the first portion of the mask (0s).
231
+
232
+ Args:
233
+ token_ids_0 (`List[int]`):
234
+ List of ids.
235
+ token_ids_1 (`List[int]`, *optional*):
236
+ Optional second list of IDs for sequence pairs.
237
+
238
+ Returns:
239
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
240
+ """
241
+ bos_token_id = [self.bos_token_id] if self.add_bos_token else []
242
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
243
+
244
+ output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
245
+
246
+ if token_ids_1 is not None:
247
+ output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
248
+
249
+ return output
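A plain-Python illustration of how `build_inputs_with_special_tokens` above frames one or two sequences, using the `bos_token_id=1` / `eos_token_id=2` values from config.json and the tokenizer's defaults (`add_bos_token=True`, `add_eos_token=False`); the content ids are made up.

```python
bos_token_id, eos_token_id = [1], [2]        # per config.json
add_bos_token, add_eos_token = True, False   # defaults in XModelTokenizer.__init__

bos = bos_token_id if add_bos_token else []
eos = eos_token_id if add_eos_token else []

token_ids_0 = [100, 101, 102]  # made-up ids for the first sequence
token_ids_1 = [200, 201]       # made-up ids for the second sequence

single = bos + token_ids_0 + eos
pair = single + bos + token_ids_1 + eos
print(single)  # [1, 100, 101, 102]
print(pair)    # [1, 100, 101, 102, 1, 200, 201]
```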
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3d91965878687648480d3e4dfedb5c66600b1612559e4579cdba76934b7d47e
3
+ size 1091044
tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_xmodel.XModelTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "add_bos_token": false,
9
+ "add_eos_token": false,
10
+ "bos_token": {
11
+ "__type": "AddedToken",
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
19
+ "clean_up_tokenization_spaces": false,
20
+ "eos_token": {
21
+ "__type": "AddedToken",
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "model_max_length": 1000000000000000019884624838656,
29
+ "sp_model_kwargs": {},
30
+ "tokenizer_class": "XModelTokenizer",
31
+ "unk_token": {
32
+ "__type": "AddedToken",
33
+ "content": "<unk>",
34
+ "lstrip": false,
35
+ "normalized": true,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
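The `chat_template` above follows the ChatML convention. A hedged sketch of rendering it with the standard `apply_chat_template` API; the repo id is again taken from the docstring example, and `trust_remote_code=True` is assumed to be needed for the custom tokenizer class.

```python
from transformers import AutoTokenizer

repo_id = "XiaoduoAILab/Xmodel_LM"  # placeholder; substitute the actual repository id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
# Returns the rendered prompt string; pass tokenize=True to get input ids instead.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # <|im_start|>system ... <|im_end|> ... <|im_start|>assistant
```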
xmodel_65280.vocab ADDED
The diff for this file is too large to render. See raw diff