Text Generation
Transformers
qwen
roleplay
self_instruct
custom_code
Minami-su committed
Commit 9430783 · 1 Parent(s): 2e3e244

Upload folder using huggingface_hub

config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "qwen_7b_chat_roleplay",
+  "_name_or_path": "Qwen-7B-Chat-roleplay",
   "architectures": [
     "QWenLMHeadModel"
   ],
@@ -8,9 +8,9 @@
     "AutoConfig": "configuration_qwen.QWenConfig",
     "AutoModelForCausalLM": "modeling_qwen.QWenLMHeadModel"
   },
-  "bf16": true,
+  "bf16": false,
   "emb_dropout_prob": 0.0,
-  "fp16": false,
+  "fp16": true,
   "fp32": false,
   "hidden_size": 4096,
   "initializer_range": 0.02,
@@ -27,16 +27,15 @@
   "rotary_pct": 1.0,
   "scale_attn_weights": true,
   "seq_length": 8192,
-  "softmax_in_fp32": false,
   "tie_word_embeddings": false,
   "tokenizer_class": "QWenTokenizer",
   "torch_dtype": "float16",
-  "transformers_version": "4.33.0",
+  "transformers_version": "4.34.0",
   "use_cache": true,
   "use_cache_kernel": false,
   "use_cache_quantization": false,
   "use_dynamic_ntk": true,
-  "use_flash_attn": true,
+  "use_flash_attn": false,
   "use_logn_attn": true,
   "vocab_size": 151936
 }
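Note: the net effect of this config change is to switch the checkpoint from bf16 to fp16, record transformers 4.34.0, and turn off flash attention and the softmax_in_fp32 flag. Below is a minimal sketch of loading the updated config through the repo's custom_code path; the local path is a placeholder, not a repo id taken from this commit.

from transformers import AutoConfig

# Placeholder path; replace with the actual location of this checkpoint.
model_path = "path/to/Qwen-7B-Chat-roleplay"

# trust_remote_code=True resolves configuration_qwen.QWenConfig via the auto_map entry above.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
print(config.fp16, config.bf16, config.use_flash_attn)  # expected: True False False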
configuration_qwen.py CHANGED
@@ -37,7 +37,6 @@ class QWenConfig(PretrainedConfig):
         tie_word_embeddings=False,
         use_cache_quantization=False,
         use_cache_kernel=False,
-        softmax_in_fp32=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -62,9 +61,8 @@ class QWenConfig(PretrainedConfig):
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
         self.no_bias = no_bias
-        self.use_cache_quantization = use_cache_quantization
-        self.use_cache_kernel = use_cache_kernel
-        self.softmax_in_fp32 = softmax_in_fp32
+        self.use_cache_quantization=use_cache_quantization
+        self.use_cache_kernel=use_cache_kernel
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs
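After this revert, QWenConfig still records use_cache_quantization and use_cache_kernel but no longer defines softmax_in_fp32. A hedged sketch, assuming configuration_qwen.py is importable from the working directory and that the remaining constructor arguments keep their defaults:

# Assumes this repo's configuration_qwen.py sits next to the script.
from configuration_qwen import QWenConfig

cfg = QWenConfig(use_cache_quantization=False, use_cache_kernel=False)
print(cfg.use_cache_quantization, cfg.use_cache_kernel)  # False False
print(hasattr(cfg, "softmax_in_fp32"))                   # False with this version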
gptq_model-4bit-128g.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:135278bf4be7e2bb9edc864237bea8fc96dbd62906dce0b3e6999c9c4c1fb291
-size 5860657584
+oid sha256:d1b8e151b749ee01a4af6085eb9f6e93f746ad2d5fcf77fbdd524ca8a649814b
+size 5860657512
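Only the LFS pointer for the 4-bit GPTQ shard changes here (new oid and a slightly smaller size). A small sketch for verifying a local download against the updated pointer; the file name comes from this repo, nothing else is assumed:

import hashlib

# Stream the ~5.8 GB shard in chunks so it never has to fit in memory.
h = hashlib.sha256()
with open("gptq_model-4bit-128g.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)

print(h.hexdigest())
# expected (per the new pointer): d1b8e151b749ee01a4af6085eb9f6e93f746ad2d5fcf77fbdd524ca8a649814b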
modeling_qwen.py CHANGED
@@ -3,16 +3,13 @@
3
  # This source code is licensed under the license found in the
4
  # LICENSE file in the root directory of this source tree.
5
 
6
- import copy
7
  import importlib
8
  import math
9
- import pathlib
10
  from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
11
 
12
  import torch
13
  import torch.nn.functional as F
14
  import torch.utils.checkpoint
15
- import warnings
16
  from torch.cuda.amp import autocast
17
 
18
  from torch.nn import CrossEntropyLoss
@@ -35,11 +32,14 @@ except ImportError:
35
  rearrange = None
36
  from torch import nn
37
 
 
 
 
 
 
38
  SUPPORT_CUDA = torch.cuda.is_available()
39
  SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
40
  SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
41
- SUPPORT_TORCH2 = hasattr(torch, '__version__') and int(torch.__version__.split(".")[0]) >= 2
42
-
43
 
44
  from .configuration_qwen import QWenConfig
45
  from .qwen_generation_utils import (
@@ -180,7 +180,6 @@ class FlashSelfAttention(torch.nn.Module):
180
  assert all((i.is_cuda for i in (q, k, v)))
181
  batch_size, seqlen_q = q.shape[0], q.shape[1]
182
  seqlen_k = k.shape[1]
183
- seqlen_out = seqlen_q
184
 
185
  q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
186
  cu_seqlens_q = torch.arange(
@@ -191,13 +190,12 @@ class FlashSelfAttention(torch.nn.Module):
191
  device=q.device,
192
  )
193
 
194
- if batch_size > 1 and attention_mask is not None:
195
  k, indices_k, cu_seqlens_k, seqlen_k = self.unpad_input(k, attention_mask)
196
- if q.size(0) == v.size(0):
 
197
  q = q[indices_k]
198
  cu_seqlens_q = cu_seqlens_k
199
- seqlen_q = seqlen_k
200
- v = v[indices_k]
201
  else:
202
  cu_seqlens_k = torch.arange(
203
  0,
@@ -227,8 +225,8 @@ class FlashSelfAttention(torch.nn.Module):
227
  softmax_scale=self.softmax_scale,
228
  causal=is_causal,
229
  )
230
- if batch_size > 1 and attention_mask is not None and seqlen_q == seqlen_k:
231
- output = self.pad_input(output, indices_k, batch_size, seqlen_out)
232
  else:
233
  new_shape = (batch_size, output.shape[0] // batch_size) + output.shape[1:]
234
  output = output.view(new_shape)
@@ -285,7 +283,6 @@ class QWenAttention(nn.Module):
285
  self.register_buffer("logn_tensor", logn_tensor, persistent=False)
286
 
287
  self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
288
- self.softmax_in_fp32 = config.softmax_in_fp32 if hasattr(config, 'softmax_in_fp32') else False
289
  self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
290
  self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
291
  cache_dtype = torch.float
@@ -296,29 +293,14 @@ class QWenAttention(nn.Module):
296
  self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)
297
  self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)
298
 
299
- if config.use_cache_quantization and config.use_cache_kernel:
300
- # pre check if the support files existing
301
- module_root = pathlib.Path(__file__).parent
302
- src_files = ("cache_autogptq_cuda_256.cpp", "cache_autogptq_cuda_kernel_256.cu")
303
- if any(not (module_root/src).is_file() for src in src_files):
304
- warnings.warn("KV cache kernel source files (.cpp and .cu) not found.")
305
- self.cache_kernels = None
306
- else:
307
- try:
308
- from .cpp_kernels import cache_autogptq_cuda_256
309
- self.cache_kernels = cache_autogptq_cuda_256
310
- except ImportError:
311
- warnings.warn("Failed to import KV cache kernels.")
312
- self.cache_kernels = None
313
-
314
  def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
315
  device = query.device
316
  if self.use_cache_quantization:
317
  qk, qk_scale, qk_zero = key
318
- if self.use_cache_kernel and self.cache_kernels is not None:
319
  shape = query.shape[:-1] + (qk.shape[-2],)
320
  attn_weights = torch.zeros(shape, dtype=torch.float16, device=device)
321
- self.cache_kernels.vecquant8matmul_batched_faster_old(
322
  query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(),
323
  qk.transpose(-1, -2).contiguous(),
324
  attn_weights,
@@ -360,10 +342,7 @@ class QWenAttention(nn.Module):
360
  if attention_mask is not None:
361
  attn_weights = attn_weights + attention_mask
362
 
363
- if self.softmax_in_fp32:
364
- attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
365
- else:
366
- attn_weights = nn.functional.softmax(attn_weights, dim=-1)
367
 
368
  attn_weights = attn_weights.type(query.dtype)
369
  attn_weights = self.attn_dropout(attn_weights)
@@ -373,10 +352,10 @@ class QWenAttention(nn.Module):
373
 
374
  if self.use_cache_quantization:
375
  qv, qv_scale, qv_zero = value
376
- if self.use_cache_kernel and self.cache_kernels is not None:
377
  shape = attn_weights.shape[:-1] + (query.shape[-1],)
378
  attn_output = torch.zeros(shape, dtype=torch.float16, device=device)
379
- self.cache_kernels.vecquant8matmul_batched_column_compression_faster_old(
380
  attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(),
381
  qv.contiguous(), # dtype: int32
382
  attn_output,
@@ -395,6 +374,62 @@ class QWenAttention(nn.Module):
395
 
396
  return attn_output, attn_weights
397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  def _split_heads(self, tensor, num_heads, attn_head_size):
399
  new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
400
  tensor = tensor.view(new_shape)
@@ -408,7 +443,8 @@ class QWenAttention(nn.Module):
408
  def forward(
409
  self,
410
  hidden_states: Optional[Tuple[torch.FloatTensor]],
411
- rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
 
412
  layer_past: Optional[Tuple[torch.Tensor]] = None,
413
  attention_mask: Optional[torch.FloatTensor] = None,
414
  head_mask: Optional[torch.FloatTensor] = None,
@@ -489,7 +525,7 @@ class QWenAttention(nn.Module):
489
  else:
490
  seq_start = key.size(1) - query.size(1)
491
  seq_end = key.size(1)
492
- logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :].type_as(query)
493
  query = query * logn_tensor.expand_as(query)
494
 
495
  if (
@@ -499,11 +535,12 @@ class QWenAttention(nn.Module):
499
  and query.is_cuda
500
  ):
501
  q, k, v = query, key, value
502
- attn_output = self.core_attention_flash(q, k, v, attention_mask=attention_mask)
 
 
 
 
503
  else:
504
- registered_causal_mask = torch.tril(
505
- torch.ones((key.size(1), key.size(1)), dtype=torch.bool, device=key.device)
506
- ).view(1, 1, key.size(1), key.size(1))
507
  query = query.permute(0, 2, 1, 3)
508
  if not self.use_cache_quantization:
509
  key = key.permute(0, 2, 1, 3)
@@ -516,28 +553,12 @@ class QWenAttention(nn.Module):
516
  and not query.is_cuda
517
  ):
518
  raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED)
519
-
520
- if not self.use_cache_quantization and SUPPORT_TORCH2:
521
- causal_mask = registered_causal_mask[
522
- :, :, key.size(-2) - query.size(-2): key.size(-2), :key.size(-2)
523
- ]
524
- if attention_mask is not None:
525
- attention_mask = attention_mask.expand(
526
- -1, -1, causal_mask.size(2), -1
527
- ).masked_fill(~causal_mask, torch.finfo(query.dtype).min)
528
- else:
529
- attention_mask = causal_mask
530
- attn_output = F.scaled_dot_product_attention(
531
- query, key, value, attn_mask=attention_mask
532
- ).transpose(1, 2)
533
- attn_weight = None
534
- else:
535
- attn_output, attn_weight = self._attn(
536
- query, key, value, registered_causal_mask, attention_mask, head_mask
537
- )
538
- context_layer = self._merge_heads(
539
- attn_output, self.num_heads, self.head_dim
540
- )
541
 
542
  attn_output = self.c_proj(context_layer)
543
 
@@ -595,7 +616,8 @@ class QWenBlock(nn.Module):
595
  def forward(
596
  self,
597
  hidden_states: Optional[Tuple[torch.FloatTensor]],
598
- rotary_pos_emb_list: Optional[List[List[torch.Tensor]]] = None,
 
599
  layer_past: Optional[Tuple[torch.Tensor]] = None,
600
  attention_mask: Optional[torch.FloatTensor] = None,
601
  head_mask: Optional[torch.FloatTensor] = None,
@@ -609,6 +631,7 @@ class QWenBlock(nn.Module):
609
  attn_outputs = self.attn(
610
  layernorm_output,
611
  rotary_pos_emb_list,
 
612
  layer_past=layer_past,
613
  attention_mask=attention_mask,
614
  head_mask=head_mask,
@@ -708,6 +731,21 @@ class QWenModel(QWenPreTrainedModel):
708
 
709
  self.use_flash_attn = config.use_flash_attn
710
  self.is_fp32 = not (config.bf16 or config.fp16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
 
712
  self.h = nn.ModuleList(
713
  [
@@ -844,9 +882,11 @@ class QWenModel(QWenPreTrainedModel):
844
  ntk_alpha = self.get_ntk_alpha(kv_seq_len)
845
  ntk_alpha_list.append(ntk_alpha)
846
  self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list
847
- rotary_pos_emb_list = [
848
- self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha) for ntk_alpha in ntk_alpha_list
849
- ]
 
 
850
 
851
  hidden_states = self.drop(hidden_states)
852
  output_shape = input_shape + (hidden_states.size(-1),)
@@ -879,6 +919,7 @@ class QWenModel(QWenPreTrainedModel):
879
  create_custom_forward(block),
880
  hidden_states,
881
  rotary_pos_emb_list,
 
882
  None,
883
  attention_mask,
884
  head_mask[i],
@@ -890,6 +931,7 @@ class QWenModel(QWenPreTrainedModel):
890
  hidden_states,
891
  layer_past=layer_past,
892
  rotary_pos_emb_list=rotary_pos_emb_list,
 
893
  attention_mask=attention_mask,
894
  head_mask=head_mask[i],
895
  encoder_hidden_states=encoder_hidden_states,
@@ -979,6 +1021,15 @@ class QWenLMHeadModel(QWenPreTrainedModel):
979
  if config.use_flash_attn:
980
  _import_flash_attn()
981
 
 
 
 
 
 
 
 
 
 
982
  self.transformer = QWenModel(config)
983
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
984
 
@@ -1115,6 +1166,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
1115
  query: str,
1116
  history: Optional[HistoryType],
1117
  system: str = "You are a helpful assistant.",
 
1118
  stream: Optional[bool] = _SENTINEL,
1119
  stop_words_ids: Optional[List[List[int]]] = None,
1120
  generation_config: Optional[GenerationConfig] = None,
@@ -1126,10 +1178,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
1126
  assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
1127
  if history is None:
1128
  history = []
1129
- else:
1130
- # make a copy of the user's input such that is is left untouched
1131
- history = copy.deepcopy(history)
1132
-
1133
  if stop_words_ids is None:
1134
  stop_words_ids = []
1135
 
@@ -1167,11 +1215,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
1167
  errors='replace'
1168
  )
1169
 
1170
- # as history is a copy of the user inputs,
1171
- # we can always return the new turn to the user.
1172
- # separating input history and output history also enables the user
1173
- # to implement more complex history management
1174
- history.append((query, response))
1175
 
1176
  return response, history
1177
 
@@ -1343,16 +1388,11 @@ def apply_rotary_pos_emb(t, freqs):
1343
  t_ = t.float()
1344
  cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
1345
  sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
 
 
1346
  output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
1347
  return output
1348
  else:
1349
- # rot_dim = freqs[0].shape[-1]
1350
- # cos, sin = freqs
1351
- # t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
1352
- # t_ = t_.float()
1353
- # t_pass_ = t_pass_.float()
1354
- # t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1355
- # return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1356
  rot_dim = freqs[0].shape[-1]
1357
  cos, sin = freqs
1358
  t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
@@ -1365,7 +1405,6 @@ def apply_rotary_pos_emb(t, freqs):
1365
  t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1366
  return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1367
 
1368
-
1369
  class RMSNorm(torch.nn.Module):
1370
  def __init__(self, dim: int, eps: float = 1e-6):
1371
  super().__init__()
 
3
  # This source code is licensed under the license found in the
4
  # LICENSE file in the root directory of this source tree.
5
 
 
6
  import importlib
7
  import math
 
8
  from typing import TYPE_CHECKING, Optional, Tuple, Union, Callable, List, Any, Generator
9
 
10
  import torch
11
  import torch.nn.functional as F
12
  import torch.utils.checkpoint
 
13
  from torch.cuda.amp import autocast
14
 
15
  from torch.nn import CrossEntropyLoss
 
32
  rearrange = None
33
  from torch import nn
34
 
35
+ try:
36
+ from kernels.cpp_kernels import cache_autogptq_cuda_256
37
+ except ImportError:
38
+ cache_autogptq_cuda_256 = None
39
+
40
  SUPPORT_CUDA = torch.cuda.is_available()
41
  SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
42
  SUPPORT_FP16 = SUPPORT_CUDA and torch.cuda.get_device_capability(0)[0] >= 7
 
 
43
 
44
  from .configuration_qwen import QWenConfig
45
  from .qwen_generation_utils import (
 
180
  assert all((i.is_cuda for i in (q, k, v)))
181
  batch_size, seqlen_q = q.shape[0], q.shape[1]
182
  seqlen_k = k.shape[1]
 
183
 
184
  q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]]
185
  cu_seqlens_q = torch.arange(
 
190
  device=q.device,
191
  )
192
 
193
+ if attention_mask is not None:
194
  k, indices_k, cu_seqlens_k, seqlen_k = self.unpad_input(k, attention_mask)
195
+ v = v[indices_k]
196
+ if seqlen_q == seqlen_k:
197
  q = q[indices_k]
198
  cu_seqlens_q = cu_seqlens_k
 
 
199
  else:
200
  cu_seqlens_k = torch.arange(
201
  0,
 
225
  softmax_scale=self.softmax_scale,
226
  causal=is_causal,
227
  )
228
+ if attention_mask is not None and seqlen_q == seqlen_k:
229
+ output = self.pad_input(output, indices_k, batch_size, seqlen_q)
230
  else:
231
  new_shape = (batch_size, output.shape[0] // batch_size) + output.shape[1:]
232
  output = output.view(new_shape)
 
283
  self.register_buffer("logn_tensor", logn_tensor, persistent=False)
284
 
285
  self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
 
286
  self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
287
  self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
288
  cache_dtype = torch.float
 
293
  self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)
294
  self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
297
  device = query.device
298
  if self.use_cache_quantization:
299
  qk, qk_scale, qk_zero = key
300
+ if self.use_cache_kernel and cache_autogptq_cuda_256 is not None:
301
  shape = query.shape[:-1] + (qk.shape[-2],)
302
  attn_weights = torch.zeros(shape, dtype=torch.float16, device=device)
303
+ cache_autogptq_cuda_256.vecquant8matmul_batched_faster_old(
304
  query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(),
305
  qk.transpose(-1, -2).contiguous(),
306
  attn_weights,
 
342
  if attention_mask is not None:
343
  attn_weights = attn_weights + attention_mask
344
 
345
+ attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
 
 
 
346
 
347
  attn_weights = attn_weights.type(query.dtype)
348
  attn_weights = self.attn_dropout(attn_weights)
 
352
 
353
  if self.use_cache_quantization:
354
  qv, qv_scale, qv_zero = value
355
+ if self.use_cache_kernel and cache_autogptq_cuda_256 is not None:
356
  shape = attn_weights.shape[:-1] + (query.shape[-1],)
357
  attn_output = torch.zeros(shape, dtype=torch.float16, device=device)
358
+ cache_autogptq_cuda_256.vecquant8matmul_batched_column_compression_faster_old(
359
  attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(),
360
  qv.contiguous(), # dtype: int32
361
  attn_output,
 
374
 
375
  return attn_output, attn_weights
376
 
377
+ def _upcast_and_reordered_attn(
378
+ self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None
379
+ ):
380
+ bsz, num_heads, q_seq_len, dk = query.size()
381
+ _, _, k_seq_len, _ = key.size()
382
+
383
+ attn_weights = torch.empty(
384
+ bsz * num_heads,
385
+ q_seq_len,
386
+ k_seq_len,
387
+ dtype=torch.float32,
388
+ device=query.device,
389
+ )
390
+
391
+ scale_factor = 1.0
392
+ if self.scale_attn_weights:
393
+ scale_factor /= float(value.size(-1)) ** 0.5
394
+
395
+ with autocast(enabled=False):
396
+ q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(
397
+ -1, dk, k_seq_len
398
+ )
399
+ attn_weights = torch.baddbmm(
400
+ attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor
401
+ )
402
+ attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
403
+
404
+ query_length, key_length = query.size(-2), key.size(-2)
405
+ causal_mask = registered_causal_mask[
406
+ :, :, key_length - query_length : key_length, :key_length
407
+ ]
408
+ mask_value = torch.finfo(attn_weights.dtype).min
409
+ mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype).to(
410
+ attn_weights.device
411
+ )
412
+ attn_weights = torch.where(causal_mask, attn_weights, mask_value)
413
+
414
+ if attention_mask is not None:
415
+ attn_weights = attn_weights + attention_mask
416
+
417
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
418
+
419
+ if attn_weights.dtype != torch.float32:
420
+ raise RuntimeError(
421
+ "Error with upcasting, attn_weights does not have dtype torch.float32"
422
+ )
423
+ attn_weights = attn_weights.type(value.dtype)
424
+ attn_weights = self.attn_dropout(attn_weights)
425
+
426
+ if head_mask is not None:
427
+ attn_weights = attn_weights * head_mask
428
+
429
+ attn_output = torch.matmul(attn_weights, value)
430
+
431
+ return attn_output, attn_weights
432
+
433
  def _split_heads(self, tensor, num_heads, attn_head_size):
434
  new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
435
  tensor = tensor.view(new_shape)
 
443
  def forward(
444
  self,
445
  hidden_states: Optional[Tuple[torch.FloatTensor]],
446
+ rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
447
+ registered_causal_mask: Optional[torch.Tensor] = None,
448
  layer_past: Optional[Tuple[torch.Tensor]] = None,
449
  attention_mask: Optional[torch.FloatTensor] = None,
450
  head_mask: Optional[torch.FloatTensor] = None,
 
525
  else:
526
  seq_start = key.size(1) - query.size(1)
527
  seq_end = key.size(1)
528
+ logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
529
  query = query * logn_tensor.expand_as(query)
530
 
531
  if (
 
535
  and query.is_cuda
536
  ):
537
  q, k, v = query, key, value
538
+ context_layer = self.core_attention_flash(q, k, v, attention_mask=attention_mask)
539
+
540
+ # b s h d -> b s (h d)
541
+ context_layer = context_layer.flatten(2,3).contiguous()
542
+
543
  else:
 
 
 
544
  query = query.permute(0, 2, 1, 3)
545
  if not self.use_cache_quantization:
546
  key = key.permute(0, 2, 1, 3)
 
553
  and not query.is_cuda
554
  ):
555
  raise Exception(_ERROR_INPUT_CPU_QUERY_WITH_FLASH_ATTN_ACTIVATED)
556
+ attn_output, attn_weight = self._attn(
557
+ query, key, value, registered_causal_mask, attention_mask, head_mask
558
+ )
559
+ context_layer = self._merge_heads(
560
+ attn_output, self.num_heads, self.head_dim
561
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
 
563
  attn_output = self.c_proj(context_layer)
564
 
 
616
  def forward(
617
  self,
618
  hidden_states: Optional[Tuple[torch.FloatTensor]],
619
+ rotary_pos_emb_list: Optional[List[torch.Tensor]] = None,
620
+ registered_causal_mask: Optional[torch.Tensor] = None,
621
  layer_past: Optional[Tuple[torch.Tensor]] = None,
622
  attention_mask: Optional[torch.FloatTensor] = None,
623
  head_mask: Optional[torch.FloatTensor] = None,
 
631
  attn_outputs = self.attn(
632
  layernorm_output,
633
  rotary_pos_emb_list,
634
+ registered_causal_mask=registered_causal_mask,
635
  layer_past=layer_past,
636
  attention_mask=attention_mask,
637
  head_mask=head_mask,
 
731
 
732
  self.use_flash_attn = config.use_flash_attn
733
  self.is_fp32 = not (config.bf16 or config.fp16)
734
+ if (
735
+ self.use_flash_attn
736
+ and flash_attn_unpadded_func is not None
737
+ and not self.is_fp32
738
+ ):
739
+ self.registered_causal_mask = None
740
+ else:
741
+ max_positions = config.max_position_embeddings
742
+ self.register_buffer(
743
+ "registered_causal_mask",
744
+ torch.tril(
745
+ torch.ones((max_positions, max_positions), dtype=torch.bool)
746
+ ).view(1, 1, max_positions, max_positions),
747
+ persistent=False,
748
+ )
749
 
750
  self.h = nn.ModuleList(
751
  [
 
882
  ntk_alpha = self.get_ntk_alpha(kv_seq_len)
883
  ntk_alpha_list.append(ntk_alpha)
884
  self.rotary_emb._ntk_alpha_cached_list = ntk_alpha_list
885
+
886
+ rotary_pos_emb_list = []
887
+ for ntk_alpha in ntk_alpha_list:
888
+ rotary_pos_emb = self.rotary_emb(kv_seq_len, ntk_alpha=ntk_alpha)
889
+ rotary_pos_emb_list.append(rotary_pos_emb)
890
 
891
  hidden_states = self.drop(hidden_states)
892
  output_shape = input_shape + (hidden_states.size(-1),)
 
919
  create_custom_forward(block),
920
  hidden_states,
921
  rotary_pos_emb_list,
922
+ self.registered_causal_mask,
923
  None,
924
  attention_mask,
925
  head_mask[i],
 
931
  hidden_states,
932
  layer_past=layer_past,
933
  rotary_pos_emb_list=rotary_pos_emb_list,
934
+ registered_causal_mask=self.registered_causal_mask,
935
  attention_mask=attention_mask,
936
  head_mask=head_mask[i],
937
  encoder_hidden_states=encoder_hidden_states,
 
1021
  if config.use_flash_attn:
1022
  _import_flash_attn()
1023
 
1024
+
1025
+ if hasattr(config, 'use_cache_quantization') and config.use_cache_quantization:
1026
+ config.use_flash_attn = False
1027
+ if hasattr(config, 'use_cache_kernel') and config.use_cache_kernel:
1028
+ try:
1029
+ from kernels.cpp_kernels import cache_autogptq_cuda_256
1030
+ except ImportError:
1031
+ cache_autogptq_cuda_256 = None
1032
+
1033
  self.transformer = QWenModel(config)
1034
  self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1035
 
 
1166
  query: str,
1167
  history: Optional[HistoryType],
1168
  system: str = "You are a helpful assistant.",
1169
+ append_history: bool = True,
1170
  stream: Optional[bool] = _SENTINEL,
1171
  stop_words_ids: Optional[List[List[int]]] = None,
1172
  generation_config: Optional[GenerationConfig] = None,
 
1178
  assert generation_config.chat_format == 'chatml', _ERROR_BAD_CHAT_FORMAT
1179
  if history is None:
1180
  history = []
 
 
 
 
1181
  if stop_words_ids is None:
1182
  stop_words_ids = []
1183
 
 
1215
  errors='replace'
1216
  )
1217
 
1218
+ if append_history:
1219
+ history.append((query, response))
 
 
 
1220
 
1221
  return response, history
1222
 
 
1388
  t_ = t.float()
1389
  cos = cos.squeeze(0).squeeze(1)[:, : cos.shape[-1] // 2]
1390
  sin = sin.squeeze(0).squeeze(1)[:, : sin.shape[-1] // 2]
1391
+ cos = cos.to(device=t.device)
1392
+ sin = sin.to(device=t.device)
1393
  output = apply_rotary_emb_func(t_, cos, sin).type_as(t)
1394
  return output
1395
  else:
 
 
 
 
 
 
 
1396
  rot_dim = freqs[0].shape[-1]
1397
  cos, sin = freqs
1398
  t_, t_pass_ = t[..., :rot_dim], t[..., rot_dim:]
 
1405
  t_ = (t_ * cos) + (_rotate_half(t_) * sin)
1406
  return torch.cat((t_, t_pass_), dim=-1).type_as(t)
1407
 
 
1408
  class RMSNorm(torch.nn.Module):
1409
  def __init__(self, dim: int, eps: float = 1e-6):
1410
  super().__init__()
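This file is essentially a rollback to the earlier Qwen modeling code: it restores the append_history keyword on chat(), drops the copy.deepcopy of the caller's history, removes the torch-2 scaled_dot_product_attention branch in favour of _attn with a registered causal-mask buffer, and moves the KV-cache kernel import back to module level. One practical consequence is that the history list you pass in is now mutated in place when append_history=True. A usage sketch under assumptions: the path is a placeholder, and the environment can actually load this GPTQ checkpoint (e.g. auto-gptq/optimum installed).

from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/Qwen-7B-Chat-roleplay"  # placeholder, not taken from this commit
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,   # pulls in modeling_qwen.QWenLMHeadModel as patched above
    device_map="auto",
).eval()

history = []
# With the deepcopy gone, `history` is appended to in place when append_history=True.
response, history = model.chat(
    tokenizer,
    "Stay in character and introduce yourself.",
    history=history,
    append_history=True,      # keyword restored by this version of chat()
)
print(response)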
tokenization_qwen.py CHANGED
@@ -27,22 +27,11 @@ IMEND = "<|im_end|>"
 # regular texts, the surface forms of special tokens need to be
 # as different as possible to minimize the impact
 EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
-# changed to use actual index to avoid misconfiguration with vocabulary expansion
-SPECIAL_START_ID = 151643
-SPECIAL_TOKENS = tuple(
-    enumerate(
-        (
-            (
-                ENDOFTEXT,
-                IMSTART,
-                IMEND,
-            )
-            + EXTRAS
-        ),
-        start=SPECIAL_START_ID,
-    )
-)
-SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
+SPECIAL_TOKENS = (
+    ENDOFTEXT,
+    IMSTART,
+    IMEND,
+) + EXTRAS
 
 
 def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
@@ -53,7 +42,6 @@ def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
         for token, rank in (line.split() for line in contents.splitlines() if line)
     }
 
-
 class QWenTokenizer(PreTrainedTokenizer):
     """QWen tokenizer."""
 
@@ -63,35 +51,20 @@ class QWenTokenizer(PreTrainedTokenizer):
         self,
         vocab_file,
         errors="replace",
-        extra_vocab_file=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
 
-        # how to handle errors in decoding UTF-8 byte sequences
-        # use ignore if you are in streaming inference
-        self.errors = errors
+        self.errors = errors  # how to handle errors in decoding
 
-        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
         self.special_tokens = {
             token: index
-            for index, token in SPECIAL_TOKENS
+            for index, token in enumerate(
+                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
+            )
         }
 
-        # try load extra vocab from file
-        if extra_vocab_file is not None:
-            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
-            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
-            for token, index in extra_mergeable_ranks.items():
-                if token in self.mergeable_ranks:
-                    logger.info(f"extra token {token} exists, skipping")
-                    continue
-                if index in used_ids:
-                    logger.info(f'the index {index} for extra token {token} exists, skipping')
-                    continue
-                self.mergeable_ranks[token] = index
-            # the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
-
         enc = tiktoken.Encoding(
             "Qwen",
             pat_str=PAT_STR,
@@ -116,7 +89,7 @@ class QWenTokenizer(PreTrainedTokenizer):
     def __getstate__(self):
         # for pickle lovers
         state = self.__dict__.copy()
-        del state["tokenizer"]
+        del state['tokenizer']
         return state
 
     def __setstate__(self, state):
@@ -130,6 +103,7 @@ class QWenTokenizer(PreTrainedTokenizer):
         )
         self.tokenizer = enc
 
+
     def __len__(self) -> int:
         return self.tokenizer.n_vocab
 
@@ -152,17 +126,13 @@ class QWenTokenizer(PreTrainedTokenizer):
             ids.append(self.mergeable_ranks.get(token))
         return ids
 
-    def _add_tokens(
-        self,
-        new_tokens: Union[List[str], List[AddedToken]],
-        special_tokens: bool = False,
-    ) -> int:
+    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
         if not special_tokens and new_tokens:
-            raise ValueError("Adding regular tokens is not supported")
+            raise ValueError('Adding regular tokens is not supported')
         for token in new_tokens:
             surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in SPECIAL_TOKENS_SET:
-                raise ValueError("Adding unknown special tokens is not supported")
+            if surface_form not in SPECIAL_TOKENS:
+                raise ValueError('Adding unknown special tokens is not supported')
         return 0
 
     def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
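The tokenizer likewise reverts to the older scheme: special-token ids are derived by enumerating SPECIAL_TOKENS from len(self.mergeable_ranks) rather than the hard-coded SPECIAL_START_ID, the extra_vocab_file option is gone, and _add_tokens still rejects anything outside SPECIAL_TOKENS. A quick sketch, again with a placeholder path:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/Qwen-7B-Chat-roleplay", trust_remote_code=True)

ids = tok("You are a helpful assistant.")["input_ids"]
print(len(tok), ids[:8])  # __len__ returns the tiktoken n_vocab

# Regular (non-special) tokens are still refused by _add_tokens:
try:
    tok.add_tokens(["<my_new_token>"])
except ValueError as err:
    print(err)  # "Adding regular tokens is not supported"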
tokenizer_config.json CHANGED
@@ -1,4 +1,6 @@
 {
+  "added_tokens_decoder": {},
+  "additional_special_tokens": [],
   "auto_map": {
     "AutoTokenizer": [
       "tokenization_qwen.QWenTokenizer",
@@ -7,5 +9,6 @@
   },
   "clean_up_tokenization_spaces": true,
   "model_max_length": 8192,
-  "tokenizer_class": "QWenTokenizer"
+  "tokenizer_class": "QWenTokenizer",
+  "tokenizer_file": null
 }