Upload folder using huggingface_hub
- config.json +5 -6
- configuration_qwen.py +2 -4
- gptq_model-4bit-128g.safetensors +2 -2
- modeling_qwen.py +124 -85
- tokenization_qwen.py +16 -46
- tokenizer_config.json +4 -1
config.json
CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "Qwen-7B-Chat-roleplay",
   "architectures": [
     "QWenLMHeadModel"
   ],
@@ -8,9 +8,9 @@
     "AutoConfig": "configuration_qwen.QWenConfig",
     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
   },
-  "bf16":
+  "bf16": false,
   "emb_dropout_prob": 0.0,
-  "fp16":
+  "fp16": true,
   "fp32": false,
   "hidden_size": 4096,
   "initializer_range": 0.02,
@@ -27,16 +27,15 @@
   "rotary_pct": 1.0,
   "scale_attn_weights": true,
   "seq_length": 8192,
-  "softmax_in_fp32": false,
   "tie_word_embeddings": false,
   "tokenizer_class": "QWenTokenizer",
   "torch_dtype": "float16",
-  "transformers_version": "4.
+  "transformers_version": "4.34.0",
   "use_cache": true,
   "use_cache_kernel": false,
   "use_cache_quantization": false,
   "use_dynamic_ntk": true,
-  "use_flash_attn":
+  "use_flash_attn": false,
   "use_logn_attn": true,
   "vocab_size": 151936
 }
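For reference, a minimal loading sketch consistent with the updated config (fp16 weights, flash attention disabled, custom code resolved through auto_map). The path "Qwen-7B-Chat-roleplay" is illustrative only; substitute the actual repo id or local directory.

from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "Qwen-7B-Chat-roleplay"  # illustrative path / repo id
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    device_map="auto",
    trust_remote_code=True,  # required: auto_map points at the custom QWen classes
).eval()
response, history = model.chat(tokenizer, "Hello", history=None)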
configuration_qwen.py
CHANGED
@@ -37,7 +37,6 @@ class QWenConfig(PretrainedConfig):
         tie_word_embeddings=False,
         use_cache_quantization=False,
         use_cache_kernel=False,
-        softmax_in_fp32=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -62,9 +61,8 @@ class QWenConfig(PretrainedConfig):
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
         self.no_bias = no_bias
-        self.use_cache_quantization
-        self.use_cache_kernel
-        self.softmax_in_fp32 = softmax_in_fp32
+        self.use_cache_quantization=use_cache_quantization
+        self.use_cache_kernel=use_cache_kernel
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs
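A quick sanity check that the constructor now carries the KV-cache flags through to config attributes, assuming configuration_qwen.py is importable from the working directory:

from configuration_qwen import QWenConfig  # assumes the file is on the import path

cfg = QWenConfig(use_cache_quantization=True, use_cache_kernel=False)
assert cfg.use_cache_quantization is True
assert cfg.use_cache_kernel is False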
gptq_model-4bit-128g.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d1b8e151b749ee01a4af6085eb9f6e93f746ad2d5fcf77fbdd524ca8a649814b
+size 5860657512
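The LFS pointer above only records the hash and byte count of the new shard; a small sketch to verify a locally downloaded copy against it:

import hashlib

EXPECTED_OID = "d1b8e151b749ee01a4af6085eb9f6e93f746ad2d5fcf77fbdd524ca8a649814b"
EXPECTED_SIZE = 5860657512

h = hashlib.sha256()
size = 0
with open("gptq_model-4bit-128g.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
        size += len(chunk)
print(h.hexdigest() == EXPECTED_OID, size == EXPECTED_SIZE)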
modeling_qwen.py
CHANGED
@@ -3,16 +3,13 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.

-import copy
 import importlib
 import math
-import pathlib
 from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator

 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
-import warnings
 from torch.cuda.amp import autocast

 from torch.nn import CrossEntropyLoss
@@ -35,11 +32,14 @@ except ImportError:
     rearrange = None
 from torch import nn

+try:
+    from kernels.cpp_kernels import cache_autogptq_cuda_256
+except ImportError:
+    cache_autogptq_cuda_256 = None
+
 SUPPORT_CUDA = torch.cuda.is_available()
 SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
 SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
-SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2
-

 from .configuration_qwen import QWenConfig
 from .qwen_generation_utils import (
@@ -180,7 +180,6 @@ class FlashSelfAttention(torch.nn.Module):
         assert all((i.is_cuda for i in (q, k, v)))
         batch_size, seqlen_q = q.shape[0], q.shape[1]
         seqlen_k = k.shape[1]
-        seqlen_out = seqlen_q

         q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
         cu_seqlens_q = torch.arange(
@@ -191,13 +190,12 @@ class FlashSelfAttention(torch.nn.Module):
             device=q.device,
         )

-        if
+        if attention_mask is not None:
             k, indices_k, cu_seqlens_k, seqlen_k = self.unpad_input(k, attention_mask)
-
+            v = v[indices_k]
+            if seqlen_q == seqlen_k:
                 q = q[indices_k]
                 cu_seqlens_q = cu_seqlens_k
-                seqlen_q = seqlen_k
-            v = v[indices_k]
         else:
             cu_seqlens_k = torch.arange(
                 0,
@@ -227,8 +225,8 @@ class FlashSelfAttention(torch.nn.Module):
             softmax_scale=self.softmax_scale,
             causal=is_causal,
         )
-        if
-            output = self.pad_input(output, indices_k, batch_size,
+        if attention_mask is not None and seqlen_q == seqlen_k:
+            output = self.pad_input(output, indices_k, batch_size, seqlen_q)
         else:
             new_shape = (batch_size, output.shape[0] // batch_size) + output.shape[1:]
             output = output.view(new_shape)
@@ -285,7 +283,6 @@ class QWenAttention(nn.Module):
         self.register_buffer("logn_tensor", logn_tensor, persistent=False)

         self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
-        self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else False
         self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
         self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
         cache_dtype = torch.float
@@ -296,29 +293,14 @@ class QWenAttention(nn.Module):
         self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)
         self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)

-        if config.use_cache_quantization and config.use_cache_kernel:
-            # pre check if the support files existing
-            module_root = pathlib.Path(__file__).parent
-            src_files = ("cache_autogptq_cuda_256.cpp", "cache_autogptq_cuda_kernel_256.cu")
-            if any(not (module_root/src).is_file() for src in src_files):
-                warnings.warn("KV cache kernel source files (.cpp and .cu) not found.")
-                self.cache_kernels = None
-            else:
-                try:
-                    from .cpp_kernels import cache_autogptq_cuda_256
-                    self.cache_kernels = cache_autogptq_cuda_256
-                except ImportError:
-                    warnings.warn("Failed to import KV cache kernels.")
-                    self.cache_kernels = None
-
     def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
         device = query.device
         if self.use_cache_quantization:
             qk, qk_scale, qk_zero = key
-            if self.use_cache_kernel and
+            if self.use_cache_kernel and cache_autogptq_cuda_256 is not None:
                 shape = query.shape[:-1] + (qk.shape[-2],)
                 attn_weights = torch.zeros(shape, dtype=torch.float16, device=device)
-
+                cache_autogptq_cuda_256.vecquant8matmul_batched_faster_old(
                     query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(),
                     qk.transpose(-1, -2).contiguous(),
                     attn_weights,
@@ -360,10 +342,7 @@ class QWenAttention(nn.Module):
         if attention_mask is not None:
             attn_weights = attn_weights + attention_mask

-
-            attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
-        else:
-            attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)

         attn_weights = attn_weights.type(query.dtype)
         attn_weights = self.attn_dropout(attn_weights)
@@ -373,10 +352,10 @@ class QWenAttention(nn.Module):

         if self.use_cache_quantization:
             qv, qv_scale, qv_zero = value
-            if self.use_cache_kernel and
+            if self.use_cache_kernel and cache_autogptq_cuda_256 is not None:
                 shape = attn_weights.shape[:-1] + (query.shape[-1],)
                 attn_output = torch.zeros(shape, dtype=torch.float16, device=device)
-
+                cache_autogptq_cuda_256.vecquant8matmul_batched_column_compression_faster_old(
                     attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(),
                     qv.contiguous(),  # dtype: int32
                     attn_output,
@@ -395,6 +374,62 @@ class QWenAttention(nn.Module):

         return attn_output, attn_weights

+    def _upcast_and_reordered_attn(
+        self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
+    ):
+        bsz, num_heads, q_seq_len, dk = query.size()
+        _, _, k_seq_len, _ = key.size()
+
+        attn_weights = torch.empty(
+            bsz * num_heads,
+            q_seq_len,
+            k_seq_len,
+            dtype=torch.float32,
+            device=query.device,
+        )
+
+        scale_factor = 1.0
+        if self.scale_attn_weights:
+            scale_factor /= float(value.size(-1)) ** 0.5
+
+        with autocast(enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
+                -1, dk, k_seq_len
+            )
+            attn_weights = torch.baddbmm(
+                attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
+            )
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+
+        query_length, key_length = query.size(-2), key.size(-2)
+        causal_mask = registered_causal_mask[
+            :, :, key_length - query_length : key_length, :key_length
+        ]
+        mask_value = torch.finfo(attn_weights.dtype).min
+        mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
+            attn_weights.device
+        )
+        attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if attn_weights.dtype != torch.float32:
+            raise RuntimeError(
+                "Error with upcasting, attn_weights does not have dtype torch.float32"
+            )
+        attn_weights = attn_weights.type(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+
+        return attn_output, attn_weights
+
     def _split_heads(self, tensor, num_heads, attn_head_size):
         new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
         tensor = tensor.view(new_shape)
@@ -408,7 +443,8 @@ class QWenAttention(nn.Module):
     def forward(
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
-        rotary_pos_emb_list: Optional[List[
+        rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
+        registered_causal_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
@@ -489,7 +525,7 @@ class QWenAttention(nn.Module):
         else:
             seq_start = key.size(1) - query.size(1)
             seq_end = key.size(1)
-            logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
+            logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
             query = query * logn_tensor.expand_as(query)

         if (
@@ -499,11 +535,12 @@ class QWenAttention(nn.Module):
             and query.is_cuda
         ):
             q, k, v = query, key, value
-
+            context_layer = self.core_attention_flash(q, k, v, attention_mask=attention_mask)
+
+            # b s h d -> b s (h d)
+            context_layer = context_layer.flatten(2,3).contiguous()
+
         else:
-            registered_causal_mask = torch.tril(
-                torch.ones((key.size(1), key.size(1)), dtype=torch.bool, device=key.device)
-            ).view(1, 1, key.size(1), key.size(1))
             query = query.permute(0, 2, 1, 3)
             if not self.use_cache_quantization:
                 key = key.permute(0, 2, 1, 3)
@@ -516,28 +553,12 @@ class QWenAttention(nn.Module):
                 and not query.is_cuda
             ):
                 raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED)
-
-
-
-
-
-
-                attention_mask = attention_mask.expand(
-                    -1, -1, causal_mask.size(2), -1
-                ).masked_fill(~causal_mask, torch.finfo(query.dtype).min)
-            else:
-                attention_mask = causal_mask
-            attn_output = F.scaled_dot_product_attention(
-                query, key, value, attn_mask=attention_mask
-            ).transpose(1, 2)
-            attn_weight = None
-        else:
-            attn_output, attn_weight = self._attn(
-                query, key, value, registered_causal_mask, attention_mask, head_mask
-            )
-        context_layer = self._merge_heads(
-            attn_output, self.num_heads, self.head_dim
-        )
+            attn_output, attn_weight = self._attn(
+                query, key, value, registered_causal_mask, attention_mask, head_mask
+            )
+            context_layer = self._merge_heads(
+                attn_output, self.num_heads, self.head_dim
+            )

         attn_output = self.c_proj(context_layer)

@@ -595,7 +616,8 @@ class QWenBlock(nn.Module):
     def forward(
         self,
         hidden_states: Optional[Tuple[torch.FloatTensor]],
-        rotary_pos_emb_list: Optional[List[
+        rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
+        registered_causal_mask: Optional[torch.Tensor] = None,
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.FloatTensor] = None,
         head_mask: Optional[torch.FloatTensor] = None,
@@ -609,6 +631,7 @@ class QWenBlock(nn.Module):
         attn_outputs = self.attn(
             layernorm_output,
             rotary_pos_emb_list,
+            registered_causal_mask=registered_causal_mask,
             layer_past=layer_past,
             attention_mask=attention_mask,
             head_mask=head_mask,
@@ -708,6 +731,21 @@ class QWenModel(QWenPreTrainedModel):

         self.use_flash_attn = config.use_flash_attn
         self.is_fp32 = not (config.bf16 or config.fp16)
+        if (
+            self.use_flash_attn
+            and flash_attn_unpadded_func is not None
+            and not self.is_fp32
+        ):
+            self.registered_causal_mask = None
+        else:
+            max_positions = config.max_position_embeddings
+            self.register_buffer(
+                "registered_causal_mask",
+                torch.tril(
+                    torch.ones((max_positions, max_positions), dtype=torch.bool)
+                ).view(1, 1, max_positions, max_positions),
+                persistent=False,
+            )

         self.h = nn.ModuleList(
             [
@@ -844,9 +882,11 @@ class QWenModel(QWenPreTrainedModel):
             ntk_alpha = self.get_ntk_alpha(kv_seq_len)
             ntk_alpha_list.append(ntk_alpha)
         self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list
-
-
-
+
+        rotary_pos_emb_list = []
+        for ntk_alpha in ntk_alpha_list:
+            rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
+            rotary_pos_emb_list.append(rotary_pos_emb)

         hidden_states = self.drop(hidden_states)
         output_shape = input_shape + (hidden_states.size(-1),)
@@ -879,6 +919,7 @@ class QWenModel(QWenPreTrainedModel):
                     create_custom_forward(block),
                     hidden_states,
                     rotary_pos_emb_list,
+                    self.registered_causal_mask,
                     None,
                     attention_mask,
                     head_mask[i],
@@ -890,6 +931,7 @@ class QWenModel(QWenPreTrainedModel):
                     hidden_states,
                     layer_past=layer_past,
                     rotary_pos_emb_list=rotary_pos_emb_list,
+                    registered_causal_mask=self.registered_causal_mask,
                     attention_mask=attention_mask,
                     head_mask=head_mask[i],
                     encoder_hidden_states=encoder_hidden_states,
@@ -979,6 +1021,15 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         if config.use_flash_attn:
             _import_flash_attn()

+
+        if hasattr(config, 'use_cache_quantization') and config.use_cache_quantization:
+            config.use_flash_attn = False
+        if hasattr(config, 'use_cache_kernel') and config.use_cache_kernel:
+            try:
+                from kernels.cpp_kernels import cache_autogptq_cuda_256
+            except ImportError:
+                cache_autogptq_cuda_256 = None
+
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

@@ -1115,6 +1166,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         query: str,
         history: Optional[HistoryType],
         system: str = "You are a helpful assistant.",
+        append_history: bool = True,
         stream: Optional[bool] = _SENTINEL,
         stop_words_ids: Optional[List[List[int]]] = None,
         generation_config: Optional[GenerationConfig] = None,
@@ -1126,10 +1178,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
         if history is None:
             history = []
-        else:
-            # make a copy of the user's input such that it is left untouched
-            history = copy.deepcopy(history)
-
         if stop_words_ids is None:
             stop_words_ids = []

@@ -1167,11 +1215,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             errors='replace'
         )

-
-
-        # separating input history and output history also enables the user
-        # to implement more complex history management
-        history.append((query, response))
+        if append_history:
+            history.append((query, response))

         return response, history

@@ -1343,16 +1388,11 @@ def apply_rotary_pos_emb(t, freqs):
         t_ = t.float()
         cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
         sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
+        cos = cos.to(device=t.device)
+        sin = sin.to(device=t.device)
         output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
         return output
     else:
-        # rot_dim = freqs[0].shape[-1]
-        # cos, sin = freqs
-        # t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
-        # t_ = t_.float()
-        # t_pass_ = t_pass_.float()
-        # t_ = (t_ * cos) + (_rotate_half(t_) * sin)
-        # return torch.cat((t_, t_pass_), dim=-1).type_as(t)
         rot_dim = freqs[0].shape[-1]
         cos, sin = freqs
         t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
@@ -1365,7 +1405,6 @@ def apply_rotary_pos_emb(t, freqs):
         t_ = (t_ * cos) + (_rotate_half(t_) * sin)
         return torch.cat((t_, t_pass_), dim=-1).type_as(t)

-
 class RMSNorm(torch.nn.Module):
     def __init__(self, dim: int, eps: float = 1e-6):
         super().__init__()
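The most visible behavioural change above is that QWenModel again builds a lower-triangular boolean mask buffer once at init (registered with persistent=False, so it is not stored with the weights) and threads it through every block as registered_causal_mask, rather than constructing a mask per attention call. A standalone sketch of that buffer and of the slice the attention code takes from it; max_positions is shortened here for illustration, the model uses config.max_position_embeddings:

import torch

max_positions = 16  # illustrative; the model uses config.max_position_embeddings
registered_causal_mask = torch.tril(
    torch.ones((max_positions, max_positions), dtype=torch.bool)
).view(1, 1, max_positions, max_positions)

# Slice used at attention time for a query of length q against k cached+current keys.
q_len, k_len = 3, 10
causal_mask = registered_causal_mask[:, :, k_len - q_len : k_len, :k_len]
print(causal_mask.shape)  # torch.Size([1, 1, 3, 10])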
tokenization_qwen.py
CHANGED
@@ -27,22 +27,11 @@ IMEND = "<|im_end|>"
 # regular texts, the surface forms of special tokens need to be
 # as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
-
-
-
-
-
-        (
-            ENDOFTEXT,
-            IMSTART,
-            IMEND,
-        )
-        + EXTRAS
-    ),
-    start=SPECIAL_START_ID,
-)
-)
-SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
+SPECIAL_TOKENS = (
+    ENDOFTEXT,
+    IMSTART,
+    IMEND,
+) + EXTRAS


 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
@@ -53,7 +42,6 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
         for token, rank in (line.split() for line in contents.splitlines() if line)
     }

-
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""

@@ -63,35 +51,20 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         vocab_file,
         errors="replace",
-        extra_vocab_file=None,
         **kwargs,
     ):
         super().__init__(**kwargs)

-        # how to handle errors in decoding
-        # use ignore if you are in streaming inference
-        self.errors = errors
+        self.errors = errors  # how to handle errors in decoding

-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type:
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
         self.special_tokens = {
             token: index
-            for index, token in
+            for index, token in enumerate(
+                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
+            )
         }

-        # try load extra vocab from file
-        if extra_vocab_file is not None:
-            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
-            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
-            for token, index in extra_mergeable_ranks.items():
-                if token in self.mergeable_ranks:
-                    logger.info(f"extra token {token} exists, skipping")
-                    continue
-                if index in used_ids:
-                    logger.info(f'the index {index} for extra token {token} exists, skipping')
-                    continue
-                self.mergeable_ranks[token] = index
-            # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
-
         enc = tiktoken.Encoding(
             "Qwen",
             pat_str=PAT_STR,
@@ -116,7 +89,7 @@ class QWenTokenizer(PreTrainedTokenizer):
     def __getstate__(self):
         # for pickle lovers
         state = self.__dict__.copy()
-        del state[
+        del state['tokenizer']
         return state

     def __setstate__(self, state):
@@ -130,6 +103,7 @@ class QWenTokenizer(PreTrainedTokenizer):
         )
         self.tokenizer = enc

+
     def __len__(self) -> int:
         return self.tokenizer.n_vocab

@@ -152,17 +126,13 @@ class QWenTokenizer(PreTrainedTokenizer):
             ids.append(self.mergeable_ranks.get(token))
         return ids

-    def _add_tokens(
-        self,
-        new_tokens: Union[List[str], List[AddedToken]],
-        special_tokens: bool = False,
-    ) -> int:
+    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
         if not special_tokens and new_tokens:
-            raise ValueError(
+            raise ValueError('Adding regular tokens is not supported')
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in
-                raise ValueError(
+            if surface_form not in SPECIAL_TOKENS:
+                raise ValueError('Adding unknown special tokens is not supported')
         return 0

     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
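The reverted tokenizer derives special-token ids by enumerating SPECIAL_TOKENS immediately after the BPE ranks. A self-contained illustration of that indexing; the surface forms of ENDOFTEXT/IMSTART and the base vocabulary size of 151643 are assumptions about this checkpoint, not values taken from the diff:

ENDOFTEXT = "<|endoftext|>"   # assumed surface form
IMSTART = "<|im_start|>"      # assumed surface form
IMEND = "<|im_end|>"
EXTRAS = tuple(f"<|extra_{i}|>" for i in range(205))
SPECIAL_TOKENS = (ENDOFTEXT, IMSTART, IMEND) + EXTRAS

num_bpe_ranks = 151643  # assumed len(mergeable_ranks) for the Qwen tiktoken vocabulary
special_tokens = {
    token: index for index, token in enumerate(SPECIAL_TOKENS, start=num_bpe_ranks)
}
print(special_tokens[ENDOFTEXT], special_tokens[IMEND])  # 151643 151645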
tokenizer_config.json
CHANGED
@@ -1,4 +1,6 @@
 {
+  "added_tokens_decoder": {},
+  "additional_special_tokens": [],
   "auto_map": {
     "AutoTokenizer": [
       "tokenization_qwen.QWenTokenizer",
@@ -7,5 +9,6 @@
   },
   "clean_up_tokenization_spaces": true,
   "model_max_length": 8192,
-  "tokenizer_class": "QWenTokenizer"
+  "tokenizer_class": "QWenTokenizer",
+  "tokenizer_file": null
 }
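With these tokenizer_config.json entries in place, the tokenizer resolves through auto_map just like the model; a short usage sketch, again with an illustrative repo id:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen-7B-Chat-roleplay", trust_remote_code=True)  # illustrative id
ids = tok("Hello, Qwen!").input_ids
print(len(tok), len(ids))  # vocab size reported by the tiktoken encoding, token count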