Text Generation
Transformers
qwen
roleplay
self_instruct
custom_code
Minami-su committed
Commit 9430783 · 1 Parent(s): 2e3e244

Upload folder using huggingface_hub

config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "qwen_7b_chat_roleplay",
+  "_name_or_path": "Qwen-7B-Chat-roleplay",
   "architectures": [
     "QWenLMHeadModel"
   ],
@@ -8,9 +8,9 @@
     "AutoConfig": "configuration_qwen.QWenConfig",
     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
   },
-  "bf16": true,
+  "bf16": false,
   "emb_dropout_prob": 0.0,
-  "fp16": false,
+  "fp16": true,
   "fp32": false,
   "hidden_size": 4096,
   "initializer_range": 0.02,
@@ -27,16 +27,15 @@
   "rotary_pct": 1.0,
   "scale_attn_weights": true,
   "seq_length": 8192,
-  "softmax_in_fp32": false,
   "tie_word_embeddings": false,
   "tokenizer_class": "QWenTokenizer",
   "torch_dtype": "float16",
-  "transformers_version": "4.33.0",
+  "transformers_version": "4.34.0",
   "use_cache": true,
   "use_cache_kernel": false,
   "use_cache_quantization": false,
   "use_dynamic_ntk": true,
-  "use_flash_attn": true,
+  "use_flash_attn": false,
   "use_logn_attn": true,
   "vocab_size": 151936
 }
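Note: the net effect of this config change is to switch the checkpoint from bf16 to fp16, record transformers 4.34.0, and turn off flash attention and the softmax_in_fp32 flag. Below is a minimal sketch of loading the updated config through the repo's custom_code path; the local path is a placeholder, not a repo id taken from this commit.

from transformers import AutoConfig

# Placeholder path; replace with the actual location of this checkpoint.
model_path = "path/to/Qwen-7B-Chat-roleplay"

# trust_remote_code=True resolves configuration_qwen.QWenConfig via the auto_map entry above.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config.fp16, config.bf16, config.use_flash_attn)  # expected: True False False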
configuration_qwen.py CHANGED
@@ -37,7 +37,6 @@ class QWenConfig(PretrainedConfig):
         tie_word_embeddings=False,
         use_cache_quantization=False,
         use_cache_kernel=False,
-        softmax_in_fp32=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -62,9 +61,8 @@ class QWenConfig(PretrainedConfig):
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
         self.no_bias = no_bias
-        self.use_cache_quantization = use_cache_quantization
-        self.use_cache_kernel = use_cache_kernel
-        self.softmax_in_fp32 = softmax_in_fp32
+        self.use_cache_quantization=use_cache_quantization
+        self.use_cache_kernel=use_cache_kernel
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs
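After this revert, QWenConfig still records use_cache_quantization and use_cache_kernel but no longer defines softmax_in_fp32. A hedged sketch, assuming configuration_qwen.py is importable from the working directory and that the remaining constructor arguments keep their defaults:

# Assumes this repo's configuration_qwen.py sits next to the script.
from configuration_qwen import QWenConfig

cfg = QWenConfig(use_cache_quantization=False, use_cache_kernel=False)
print(cfg.use_cache_quantization, cfg.use_cache_kernel)  # False False
print(hasattr(cfg, "softmax_in_fp32"))                   # False with this version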
gptq_model-4bit-128g.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:135278bf4be7e2bb9edc864237bea8fc96dbd62906dce0b3e6999c9c4c1fb291
-size 5860657584
+oid sha256:d1b8e151b749ee01a4af6085eb9f6e93f746ad2d5fcf77fbdd524ca8a649814b
+size 5860657512
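Only the LFS pointer for the 4-bit GPTQ shard changes here (new oid and a slightly smaller size). A small sketch for verifying a local download against the updated pointer; the file name comes from this repo, nothing else is assumed:

import hashlib

# Stream the ~5.8 GB shard in chunks so it never has to fit in memory.
h = hashlib.sha256()
with open("gptq_model-4bit-128g.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print(h.hexdigest())
# expected (per the new pointer): d1b8e151b749ee01a4af6085eb9f6e93f746ad2d5fcf77fbdd524ca8a649814b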
modeling_qwen.py CHANGED
@@ -3,16 +3,13 @@
3
  # This source code is licensed under the license found in the
4
  # LICENSE file in the root directory of this source tree.
5
 
6
- import copy
7
  import importlib
8
  import math
9
- import pathlib
10
  from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
11
 
12
  import torch
13
  import torch.nn.functional as F
14
  import torch.utils.checkpoint
15
- import warnings
16
  from torch.cuda.amp import autocast
17
 
18
  from torch.nn import CrossEntropyLoss
@@ -35,11 +32,14 @@ except ImportError:
35
  rearrange = None
36
  from torch import nn
37
 
 
 
 
 
 
38
  SUPPORT_CUDA = torch.cuda.is_available()
39
  SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
40
  SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
41
- SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2
42
-
43
 
44
  from .configuration_qwen import QWenConfig
45
  from .qwen_generation_utils import (
@@ -180,7 +180,6 @@ class FlashSelfAttention(torch.nn.Module):
180
  assert all((i.is_cuda for i in (q, k, v)))
181
  batch_size, seqlen_q = q.shape[0], q.shape[1]
182
  seqlen_k = k.shape[1]
183
- seqlen_out = seqlen_q
184
 
185
  q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
186
  cu_seqlens_q = torch.arange(
@@ -191,13 +190,12 @@ class FlashSelfAttention(torch.nn.Module):
191
  device=q.device,
192
  )
193
 
194
- if batch_size > 1 and attention_mask is not None:
195
  k, indices_k, cu_seqlens_k, seqlen_k = self.unpad_input(k, attention_mask)
196
- if q.size(0) == v.size(0):
 
197
  q = q[indices_k]
198
  cu_seqlens_q = cu_seqlens_k
199
- seqlen_q = seqlen_k
200
- v = v[indices_k]
201
  else:
202
  cu_seqlens_k = torch.arange(
203
  0,
@@ -227,8 +225,8 @@ class FlashSelfAttention(torch.nn.Module):
227
  softmax_scale=self.softmax_scale,
228
  causal=is_causal,
229
  )
230
- if batch_size > 1 and attention_mask is not None and seqlen_q == seqlen_k:
231
- output = self.pad_input(output, indices_k, batch_size, seqlen_out)
232
  else:
233
  new_shape = (batch_size, output.shape[0] // batch_size) + output.shape[1:]
234
  output = output.view(new_shape)
@@ -285,7 +283,6 @@ class QWenAttention(nn.Module):
285
  self.register_buffer("logn_tensor", logn_tensor, persistent=False)
286
 
287
  self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
288
- self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else False
289
  self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
290
  self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
291
  cache_dtype = torch.float
@@ -296,29 +293,14 @@ class QWenAttention(nn.Module):
296
  self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)
297
  self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)
298
 
299
- if config.use_cache_quantization and config.use_cache_kernel:
300
- # pre check if the support files existing
301
- module_root = pathlib.Path(__file__).parent
302
- src_files = ("cache_autogptq_cuda_256.cpp", "cache_autogptq_cuda_kernel_256.cu")
303
- if any(not (module_root/src).is_file() for src in src_files):
304
- warnings.warn("KV cache kernel source files (.cpp and .cu) not found.")
305
- self.cache_kernels = None
306
- else:
307
- try:
308
- from .cpp_kernels import cache_autogptq_cuda_256
309
- self.cache_kernels = cache_autogptq_cuda_256
310
- except ImportError:
311
- warnings.warn("Failed to import KV cache kernels.")
312
- self.cache_kernels = None
313
-
314
  def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
315
  device = query.device
316
  if self.use_cache_quantization:
317
  qk, qk_scale, qk_zero = key
318
- if self.use_cache_kernel and self.cache_kernels is not None:
319
  shape = query.shape[:-1] + (qk.shape[-2],)
320
  attn_weights = torch.zeros(shape, dtype=torch.float16, device=device)
321
- self.cache_kernels.vecquant8matmul_batched_faster_old(
322
  query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(),
323
  qk.transpose(-1, -2).contiguous(),
324
  attn_weights,
@@ -360,10 +342,7 @@ class QWenAttention(nn.Module):
360
  if attention_mask is not None:
361
  attn_weights = attn_weights + attention_mask
362
 
363
- if self.softmax_in_fp32:
364
- attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
365
- else:
366
- attn_weights = nn.functional.softmax(attn_weights, dim=-1)
367
 
368
  attn_weights = attn_weights.type(query.dtype)
369
  attn_weights = self.attn_dropout(attn_weights)
@@ -373,10 +352,10 @@ class QWenAttention(nn.Module):
373
 
374
  if self.use_cache_quantization:
375
  qv, qv_scale, qv_zero = value
376
- if self.use_cache_kernel and self.cache_kernels is not None:
377
  shape = attn_weights.shape[:-1] + (query.shape[-1],)
378
  attn_output = torch.zeros(shape, dtype=torch.float16, device=device)
379
- self.cache_kernels.vecquant8matmul_batched_column_compression_faster_old(
380
  attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(),
381
  qv.contiguous(), # dtype: int32
382
  attn_output,
@@ -395,6 +374,62 @@ class QWenAttention(nn.Module):
395
 
396
  return attn_output, attn_weights
397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  def _split_heads(self, tensor, num_heads, attn_head_size):
399
  new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
400
  tensor = tensor.view(new_shape)
@@ -408,7 +443,8 @@ class QWenAttention(nn.Module):
408
  def forward(
409
  self,
410
  hidden_states: Optional[Tuple[torch.FloatTensor]],
411
- rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
 
412
  layer_past: Optional[Tuple[torch.Tensor]] = None,
413
  attention_mask: Optional[torch.FloatTensor] = None,
414
  head_mask: Optional[torch.FloatTensor] = None,
@@ -489,7 +525,7 @@ class QWenAttention(nn.Module):
489
  else:
490
  seq_start = key.size(1) - query.size(1)
491
  seq_end = key.size(1)
492
- logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query)
493
  query = query * logn_tensor.expand_as(query)
494
 
495
  if (
@@ -499,11 +535,12 @@ class QWenAttention(nn.Module):
499
  and query.is_cuda
500
  ):
501
  q, k, v = query, key, value
502
- attn_output = self.core_attention_flash(q, k, v, attention_mask=attention_mask)
 
 
 
 
503
  else:
504
- registered_causal_mask = torch.tril(
505
- torch.ones((key.size(1), key.size(1)), dtype=torch.bool, device=key.device)
506
- ).view(1, 1, key.size(1), key.size(1))
507
  query = query.permute(0, 2, 1, 3)
508
  if not self.use_cache_quantization:
509
  key = key.permute(0, 2, 1, 3)
@@ -516,28 +553,12 @@ class QWenAttention(nn.Module):
516
  and not query.is_cuda
517
  ):
518
  raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED)
519
-
520
- if not self.use_cache_quantization and SUPPORT_TORCH2:
521
- causal_mask = registered_causal_mask[
522
- :, :, key.size(-2) - query.size(-2): key.size(-2), :key.size(-2)
523
- ]
524
- if attention_mask is not None:
525
- attention_mask = attention_mask.expand(
526
- -1, -1, causal_mask.size(2), -1
527
- ).masked_fill(~causal_mask, torch.finfo(query.dtype).min)
528
- else:
529
- attention_mask = causal_mask
530
- attn_output = F.scaled_dot_product_attention(
531
- query, key, value, attn_mask=attention_mask
532
- ).transpose(1, 2)
533
- attn_weight = None
534
- else:
535
- attn_output, attn_weight = self._attn(
536
- query, key, value, registered_causal_mask, attention_mask, head_mask
537
- )
538
- context_layer = self._merge_heads(
539
- attn_output, self.num_heads, self.head_dim
540
- )
541
 
542
  attn_output = self.c_proj(context_layer)
543
 
@@ -595,7 +616,8 @@ class QWenBlock(nn.Module):
595
  def forward(
596
  self,
597
  hidden_states: Optional[Tuple[torch.FloatTensor]],
598
- rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
 
599
  layer_past: Optional[Tuple[torch.Tensor]] = None,
600
  attention_mask: Optional[torch.FloatTensor] = None,
601
  head_mask: Optional[torch.FloatTensor] = None,
@@ -609,6 +631,7 @@ class QWenBlock(nn.Module):
609
  attn_outputs = self.attn(
610
  layernorm_output,
611
  rotary_pos_emb_list,
 
612
  layer_past=layer_past,
613
  attention_mask=attention_mask,
614
  head_mask=head_mask,
@@ -708,6 +731,21 @@ class QWenModel(QWenPreTrainedModel):
708
 
709
  self.use_flash_attn = config.use_flash_attn
710
  self.is_fp32 = not (config.bf16 or config.fp16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
 
712
  self.h = nn.ModuleList(
713
  [
@@ -844,9 +882,11 @@ class QWenModel(QWenPreTrainedModel):
844
  ntk_alpha = self.get_ntk_alpha(kv_seq_len)
845
  ntk_alpha_list.append(ntk_alpha)
846
  self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list
847
- rotary_pos_emb_list = [
848
- self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha) for ntk_alpha in ntk_alpha_list
849
- ]
 
 
850
 
851
  hidden_states = self.drop(hidden_states)
852
  output_shape = input_shape + (hidden_states.size(-1),)
@@ -879,6 +919,7 @@ class QWenModel(QWenPreTrainedModel):
879
  create_custom_forward(block),
880
  hidden_states,
881
  rotary_pos_emb_list,
 
882
  None,
883
  attention_mask,
884
  head_mask[i],
@@ -890,6 +931,7 @@ class QWenModel(QWenPreTrainedModel):
890
  hidden_states,
891
  layer_past=layer_past,
892
  rotary_pos_emb_list=rotary_pos_emb_list,
 
893
  attention_mask=attention_mask,
894
  head_mask=head_mask[i],
895
  encoder_hidden_states=encoder_hidden_states,
@@ -979,6 +1021,15 @@ class QWenLMHeadModel(QWenPreTrainedModel):
979
  if config.use_flash_attn:
980
  _import_flash_attn()
981
 
 
 
 
 
 
 
 
 
 
982
  self.transformer = QWenModel(config)
983
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
984
 
@@ -1115,6 +1166,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
1115
  query: str,
1116
  history: Optional[HistoryType],
1117
  system: str = "You are a helpful assistant.",
 
1118
  stream: Optional[bool] = _SENTINEL,
1119
  stop_words_ids: Optional[List[List[int]]] = None,
1120
  generation_config: Optional[GenerationConfig] = None,
@@ -1126,10 +1178,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
1126
  assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
1127
  if history is None:
1128
  history = []
1129
- else:
1130
- # make a copy of the user's input such that is is left untouched
1131
- history = copy.deepcopy(history)
1132
-
1133
  if stop_words_ids is None:
1134
  stop_words_ids = []
1135
 
@@ -1167,11 +1215,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
1167
  errors='replace'
1168
  )
1169
 
1170
- # as history is a copy of the user inputs,
1171
- # we can always return the new turn to the user.
1172
- # separating input history and output history also enables the user
1173
- # to implement more complex history management
1174
- history.append((query, response))
1175
 
1176
  return response, history
1177
 
@@ -1343,16 +1388,11 @@ def apply_rotary_pos_emb(t, freqs):
1343
  t_ = t.float()
1344
  cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
1345
  sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
 
 
1346
  output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
1347
  return output
1348
  else:
1349
- # rot_dim = freqs[0].shape[-1]
1350
- # cos, sin = freqs
1351
- # t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
1352
- # t_ = t_.float()
1353
- # t_pass_ = t_pass_.float()
1354
- # t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1355
- # return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1356
  rot_dim = freqs[0].shape[-1]
1357
  cos, sin = freqs
1358
  t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
@@ -1365,7 +1405,6 @@ def apply_rotary_pos_emb(t, freqs):
1365
  t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1366
  return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1367
 
1368
-
1369
  class RMSNorm(torch.nn.Module):
1370
  def __init__(self, dim: int, eps: float = 1e-6):
1371
  super().__init__()
 
3
  # This source code is licensed under the license found in the
4
  # LICENSE file in the root directory of this source tree.
5
 
 
6
  import importlib
7
  import math
 
8
  from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
9
 
10
  import torch
11
  import torch.nn.functional as F
12
  import torch.utils.checkpoint
 
13
  from torch.cuda.amp import autocast
14
 
15
  from torch.nn import CrossEntropyLoss
 
32
  rearrange = None
33
  from torch import nn
34
 
35
+ try:
36
+ from kernels.cpp_kernels import cache_autogptq_cuda_256
37
+ except ImportError:
38
+ cache_autogptq_cuda_256 = None
39
+
40
  SUPPORT_CUDA = torch.cuda.is_available()
41
  SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
42
  SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
 
 
43
 
44
  from .configuration_qwen import QWenConfig
45
  from .qwen_generation_utils import (
 
180
  assert all((i.is_cuda for i in (q, k, v)))
181
  batch_size, seqlen_q = q.shape[0], q.shape[1]
182
  seqlen_k = k.shape[1]
 
183
 
184
  q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
185
  cu_seqlens_q = torch.arange(
 
190
  device=q.device,
191
  )
192
 
193
+ if attention_mask is not None:
194
  k, indices_k, cu_seqlens_k, seqlen_k = self.unpad_input(k, attention_mask)
195
+ v = v[indices_k]
196
+ if seqlen_q == seqlen_k:
197
  q = q[indices_k]
198
  cu_seqlens_q = cu_seqlens_k
 
 
199
  else:
200
  cu_seqlens_k = torch.arange(
201
  0,
 
225
  softmax_scale=self.softmax_scale,
226
  causal=is_causal,
227
  )
228
+ if attention_mask is not None and seqlen_q == seqlen_k:
229
+ output = self.pad_input(output, indices_k, batch_size, seqlen_q)
230
  else:
231
  new_shape = (batch_size, output.shape[0] // batch_size) + output.shape[1:]
232
  output = output.view(new_shape)
 
283
  self.register_buffer("logn_tensor", logn_tensor, persistent=False)
284
 
285
  self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
 
286
  self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
287
  self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
288
  cache_dtype = torch.float
 
293
  self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)
294
  self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
297
  device = query.device
298
  if self.use_cache_quantization:
299
  qk, qk_scale, qk_zero = key
300
+ if self.use_cache_kernel and cache_autogptq_cuda_256 is not None:
301
  shape = query.shape[:-1] + (qk.shape[-2],)
302
  attn_weights = torch.zeros(shape, dtype=torch.float16, device=device)
303
+ cache_autogptq_cuda_256.vecquant8matmul_batched_faster_old(
304
  query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(),
305
  qk.transpose(-1, -2).contiguous(),
306
  attn_weights,
 
342
  if attention_mask is not None:
343
  attn_weights = attn_weights + attention_mask
344
 
345
+ attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
 
 
 
346
 
347
  attn_weights = attn_weights.type(query.dtype)
348
  attn_weights = self.attn_dropout(attn_weights)
 
352
 
353
  if self.use_cache_quantization:
354
  qv, qv_scale, qv_zero = value
355
+ if self.use_cache_kernel and cache_autogptq_cuda_256 is not None:
356
  shape = attn_weights.shape[:-1] + (query.shape[-1],)
357
  attn_output = torch.zeros(shape, dtype=torch.float16, device=device)
358
+ cache_autogptq_cuda_256.vecquant8matmul_batched_column_compression_faster_old(
359
  attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(),
360
  qv.contiguous(), # dtype: int32
361
  attn_output,
 
374
 
375
  return attn_output, attn_weights
376
 
377
+ def _upcast_and_reordered_attn(
378
+ self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
379
+ ):
380
+ bsz, num_heads, q_seq_len, dk = query.size()
381
+ _, _, k_seq_len, _ = key.size()
382
+
383
+ attn_weights = torch.empty(
384
+ bsz * num_heads,
385
+ q_seq_len,
386
+ k_seq_len,
387
+ dtype=torch.float32,
388
+ device=query.device,
389
+ )
390
+
391
+ scale_factor = 1.0
392
+ if self.scale_attn_weights:
393
+ scale_factor /= float(value.size(-1)) ** 0.5
394
+
395
+ with autocast(enabled=False):
396
+ q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
397
+ -1, dk, k_seq_len
398
+ )
399
+ attn_weights = torch.baddbmm(
400
+ attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
401
+ )
402
+ attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
403
+
404
+ query_length, key_length = query.size(-2), key.size(-2)
405
+ causal_mask = registered_causal_mask[
406
+ :, :, key_length - query_length : key_length, :key_length
407
+ ]
408
+ mask_value = torch.finfo(attn_weights.dtype).min
409
+ mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
410
+ attn_weights.device
411
+ )
412
+ attn_weights = torch.where(causal_mask, attn_weights, mask_value)
413
+
414
+ if attention_mask is not None:
415
+ attn_weights = attn_weights + attention_mask
416
+
417
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
418
+
419
+ if attn_weights.dtype != torch.float32:
420
+ raise RuntimeError(
421
+ "Error with upcasting, attn_weights does not have dtype torch.float32"
422
+ )
423
+ attn_weights = attn_weights.type(value.dtype)
424
+ attn_weights = self.attn_dropout(attn_weights)
425
+
426
+ if head_mask is not None:
427
+ attn_weights = attn_weights * head_mask
428
+
429
+ attn_output = torch.matmul(attn_weights, value)
430
+
431
+ return attn_output, attn_weights
432
+
433
  def _split_heads(self, tensor, num_heads, attn_head_size):
434
  new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
435
  tensor = tensor.view(new_shape)
 
443
  def forward(
444
  self,
445
  hidden_states: Optional[Tuple[torch.FloatTensor]],
446
+ rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
447
+ registered_causal_mask: Optional[torch.Tensor] = None,
448
  layer_past: Optional[Tuple[torch.Tensor]] = None,
449
  attention_mask: Optional[torch.FloatTensor] = None,
450
  head_mask: Optional[torch.FloatTensor] = None,
 
525
  else:
526
  seq_start = key.size(1) - query.size(1)
527
  seq_end = key.size(1)
528
+ logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
529
  query = query * logn_tensor.expand_as(query)
530
 
531
  if (
 
535
  and query.is_cuda
536
  ):
537
  q, k, v = query, key, value
538
+ context_layer = self.core_attention_flash(q, k, v, attention_mask=attention_mask)
539
+
540
+ # b s h d -> b s (h d)
541
+ context_layer = context_layer.flatten(2,3).contiguous()
542
+
543
  else:
 
 
 
544
  query = query.permute(0, 2, 1, 3)
545
  if not self.use_cache_quantization:
546
  key = key.permute(0, 2, 1, 3)
 
553
  and not query.is_cuda
554
  ):
555
  raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED)
556
+ attn_output, attn_weight = self._attn(
557
+ query, key, value, registered_causal_mask, attention_mask, head_mask
558
+ )
559
+ context_layer = self._merge_heads(
560
+ attn_output, self.num_heads, self.head_dim
561
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
 
563
  attn_output = self.c_proj(context_layer)
564
 
 
616
  def forward(
617
  self,
618
  hidden_states: Optional[Tuple[torch.FloatTensor]],
619
+ rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
620
+ registered_causal_mask: Optional[torch.Tensor] = None,
621
  layer_past: Optional[Tuple[torch.Tensor]] = None,
622
  attention_mask: Optional[torch.FloatTensor] = None,
623
  head_mask: Optional[torch.FloatTensor] = None,
 
631
  attn_outputs = self.attn(
632
  layernorm_output,
633
  rotary_pos_emb_list,
634
+ registered_causal_mask=registered_causal_mask,
635
  layer_past=layer_past,
636
  attention_mask=attention_mask,
637
  head_mask=head_mask,
 
731
 
732
  self.use_flash_attn = config.use_flash_attn
733
  self.is_fp32 = not (config.bf16 or config.fp16)
734
+ if (
735
+ self.use_flash_attn
736
+ and flash_attn_unpadded_func is not None
737
+ and not self.is_fp32
738
+ ):
739
+ self.registered_causal_mask = None
740
+ else:
741
+ max_positions = config.max_position_embeddings
742
+ self.register_buffer(
743
+ "registered_causal_mask",
744
+ torch.tril(
745
+ torch.ones((max_positions, max_positions), dtype=torch.bool)
746
+ ).view(1, 1, max_positions, max_positions),
747
+ persistent=False,
748
+ )
749
 
750
  self.h = nn.ModuleList(
751
  [
 
882
  ntk_alpha = self.get_ntk_alpha(kv_seq_len)
883
  ntk_alpha_list.append(ntk_alpha)
884
  self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list
885
+
886
+ rotary_pos_emb_list = []
887
+ for ntk_alpha in ntk_alpha_list:
888
+ rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
889
+ rotary_pos_emb_list.append(rotary_pos_emb)
890
 
891
  hidden_states = self.drop(hidden_states)
892
  output_shape = input_shape + (hidden_states.size(-1),)
 
919
  create_custom_forward(block),
920
  hidden_states,
921
  rotary_pos_emb_list,
922
+ self.registered_causal_mask,
923
  None,
924
  attention_mask,
925
  head_mask[i],
 
931
  hidden_states,
932
  layer_past=layer_past,
933
  rotary_pos_emb_list=rotary_pos_emb_list,
934
+ registered_causal_mask=self.registered_causal_mask,
935
  attention_mask=attention_mask,
936
  head_mask=head_mask[i],
937
  encoder_hidden_states=encoder_hidden_states,
 
1021
  if config.use_flash_attn:
1022
  _import_flash_attn()
1023
 
1024
+
1025
+ if hasattr(config, 'use_cache_quantization') and config.use_cache_quantization:
1026
+ config.use_flash_attn = False
1027
+ if hasattr(config, 'use_cache_kernel') and config.use_cache_kernel:
1028
+ try:
1029
+ from kernels.cpp_kernels import cache_autogptq_cuda_256
1030
+ except ImportError:
1031
+ cache_autogptq_cuda_256 = None
1032
+
1033
  self.transformer = QWenModel(config)
1034
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1035
 
 
1166
  query: str,
1167
  history: Optional[HistoryType],
1168
  system: str = "You are a helpful assistant.",
1169
+ append_history: bool = True,
1170
  stream: Optional[bool] = _SENTINEL,
1171
  stop_words_ids: Optional[List[List[int]]] = None,
1172
  generation_config: Optional[GenerationConfig] = None,
 
1178
  assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
1179
  if history is None:
1180
  history = []
 
 
 
 
1181
  if stop_words_ids is None:
1182
  stop_words_ids = []
1183
 
 
1215
  errors='replace'
1216
  )
1217
 
1218
+ if append_history:
1219
+ history.append((query, response))
 
 
 
1220
 
1221
  return response, history
1222
 
 
1388
  t_ = t.float()
1389
  cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
1390
  sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
1391
+ cos = cos.to(device=t.device)
1392
+ sin = sin.to(device=t.device)
1393
  output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
1394
  return output
1395
  else:
 
 
 
 
 
 
 
1396
  rot_dim = freqs[0].shape[-1]
1397
  cos, sin = freqs
1398
  t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
 
1405
  t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1406
  return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1407
 
 
1408
  class RMSNorm(torch.nn.Module):
1409
  def __init__(self, dim: int, eps: float = 1e-6):
1410
  super().__init__()
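This file is essentially a rollback to the earlier Qwen modeling code: it restores the append_history keyword on chat(), drops the copy.deepcopy of the caller's history, removes the torch-2 scaled_dot_product_attention branch in favour of _attn with a registered causal-mask buffer, and moves the KV-cache kernel import back to module level. One practical consequence is that the history list you pass in is now mutated in place when append_history=True. A usage sketch under assumptions: the path is a placeholder, and the environment can actually load this GPTQ checkpoint (e.g. auto-gptq/optimum installed).

from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/Qwen-7B-Chat-roleplay"  # placeholder, not taken from this commit
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,   # pulls in modeling_qwen.QWenLMHeadModel as patched above
    device_map="auto",
).eval()

history = []
# With the deepcopy gone, `history` is appended to in place when append_history=True.
response, history = model.chat(
    tokenizer,
    "Stay in character and introduce yourself.",
    history=history,
    append_history=True,      # keyword restored by this version of chat()
)
print(response)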
tokenization_qwen.py CHANGED
@@ -27,22 +27,11 @@ IMEND = "<|im_end|>"
 # regular texts, the surface forms of special tokens need to be
 # as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
-# changed to use actual index to avoid misconfiguration with vocabulary expansion
-SPECIAL_START_ID = 151643
-SPECIAL_TOKENS = tuple(
-    enumerate(
-        (
-            (
-                ENDOFTEXT,
-                IMSTART,
-                IMEND,
-            )
-            + EXTRAS
-        ),
-        start=SPECIAL_START_ID,
-    )
-)
-SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
+SPECIAL_TOKENS = (
+    ENDOFTEXT,
+    IMSTART,
+    IMEND,
+) + EXTRAS
 
 
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
@@ -53,7 +42,6 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
         for token, rank in (line.split() for line in contents.splitlines() if line)
     }
 
-
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
 
@@ -63,35 +51,20 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         vocab_file,
         errors="replace",
-        extra_vocab_file=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
 
-        # how to handle errors in decoding UTF-8 byte sequences
-        # use ignore if you are in streaming inference
-        self.errors = errors
+        self.errors = errors  # how to handle errors in decoding
 
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
         self.special_tokens = {
             token: index
-            for index, token in SPECIAL_TOKENS
+            for index, token in enumerate(
+                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
+            )
         }
 
-        # try load extra vocab from file
-        if extra_vocab_file is not None:
-            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
-            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
-            for token, index in extra_mergeable_ranks.items():
-                if token in self.mergeable_ranks:
-                    logger.info(f"extra token {token} exists, skipping")
-                    continue
-                if index in used_ids:
-                    logger.info(f'the index {index} for extra token {token} exists, skipping')
-                    continue
-                self.mergeable_ranks[token] = index
-            # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
-
         enc = tiktoken.Encoding(
             "Qwen",
             pat_str=PAT_STR,
@@ -116,7 +89,7 @@ class QWenTokenizer(PreTrainedTokenizer):
     def __getstate__(self):
         # for pickle lovers
         state = self.__dict__.copy()
-        del state["tokenizer"]
+        del state['tokenizer']
         return state
 
     def __setstate__(self, state):
@@ -130,6 +103,7 @@ class QWenTokenizer(PreTrainedTokenizer):
         )
         self.tokenizer = enc
 
+
     def __len__(self) -> int:
         return self.tokenizer.n_vocab
 
@@ -152,17 +126,13 @@ class QWenTokenizer(PreTrainedTokenizer):
             ids.append(self.mergeable_ranks.get(token))
         return ids
 
-    def _add_tokens(
-        self,
-        new_tokens: Union[List[str], List[AddedToken]],
-        special_tokens: bool = False,
-    ) -> int:
+    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
         if not special_tokens and new_tokens:
-            raise ValueError("Adding regular tokens is not supported")
+            raise ValueError('Adding regular tokens is not supported')
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in SPECIAL_TOKENS_SET:
-                raise ValueError("Adding unknown special tokens is not supported")
+            if surface_form not in SPECIAL_TOKENS:
+                raise ValueError('Adding unknown special tokens is not supported')
         return 0
 
     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
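The tokenizer likewise reverts to the older scheme: special-token ids are derived by enumerating SPECIAL_TOKENS from len(self.mergeable_ranks) rather than the hard-coded SPECIAL_START_ID, the extra_vocab_file option is gone, and _add_tokens still rejects anything outside SPECIAL_TOKENS. A quick sketch, again with a placeholder path:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/Qwen-7B-Chat-roleplay", trust_remote_code=True)

ids = tok("You are a helpful assistant.")["input_ids"]
print(len(tok), ids[:8])  # __len__ returns the tiktoken n_vocab

# Regular (non-special) tokens are still refused by _add_tokens:
try:
    tok.add_tokens(["<my_new_token>"])
except ValueError as err:
    print(err)  # "Adding regular tokens is not supported"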
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
 {
+  "added_tokens_decoder": {},
+  "additional_special_tokens": [],
   "auto_map": {
     "AutoTokenizer": [
       "tokenization_qwen.QWenTokenizer",
@@ -7,5 +9,6 @@
   },
   "clean_up_tokenization_spaces": true,
   "model_max_length": 8192,
-  "tokenizer_class": "QWenTokenizer"
+  "tokenizer_class": "QWenTokenizer",
+  "tokenizer_file": null
 }