update kvcache

Browse files

Files changed (5) hide show

configuration_qwen.py +4 -0
kernels/cache_autogptq_cuda_256.cpp +198 -0
kernels/cache_autogptq_cuda_kernel_256.cu +1708 -0
kernels/cpp_kernels.py +55 -0
modeling_qwen.py +134 -17

configuration_qwen.py CHANGED Viewed

@@ -35,6 +35,8 @@ class QWenConfig(PretrainedConfig):
         intermediate_size=22016,
         no_bias=True,
         tie_word_embeddings=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -59,6 +61,8 @@ class QWenConfig(PretrainedConfig):
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
         self.no_bias = no_bias
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs

         intermediate_size=22016,
         no_bias=True,
         tie_word_embeddings=False,
+        use_cache_quantization=False,
+        use_cache_kernel=False,
         **kwargs,
     ):
         self.vocab_size = vocab_size
         self.use_logn_attn = use_logn_attn
         self.use_flash_attn = use_flash_attn
         self.no_bias = no_bias
+        self.use_cache_quantization=use_cache_quantization
+        self.use_cache_kernel=use_cache_kernel
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
             **kwargs

kernels/cache_autogptq_cuda_256.cpp ADDED Viewed

	@@ -0,0 +1,198 @@

+#include <torch/all.h>
+#include <torch/python.h>
+#include <c10/cuda/CUDAGuard.h>
+// adapted from https://github.com/PanQiWei/AutoGPTQ/blob/main/autogptq_extension/cuda_256/autogptq_cuda_256.cpp
+// CUDA launcher; its definition lives in the companion .cu source.
+void vecquant8matmul_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros,
+  torch::Tensor g_idx
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then hands all six tensors unchanged to the launcher.
+void vecquant8matmul(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros,
+  torch::Tensor g_idx
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_cuda(vec, mat, mul, scales, zeros, g_idx);
+}
+// CUDA launcher; its definition lives in the companion .cu source.
+void vecquant8matmul_batched_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant8matmul_batched(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_batched_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher; its definition lives in the companion .cu source.
+void vecquant8matmul_batched_column_compression_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant8matmul_batched_column_compression(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher; its definition lives in the companion .cu source.
+void vecquant4matmul_batched_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant4matmul_batched(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant4matmul_batched_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher; its definition lives in the companion .cu source.
+void vecquant4matmul_batched_column_compression_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant4matmul_batched_column_compression(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant4matmul_batched_column_compression_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher, declared here and defined in another translation unit.
+void vecquant8matmul_batched_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant8matmul_batched_old(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher, declared here and defined in another translation unit.
+void vecquant4matmul_batched_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant4matmul_batched_old(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant4matmul_batched_old_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher, declared here and defined in another translation unit.
+void vecquant8matmul_batched_column_compression_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant8matmul_batched_column_compression_old(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher, declared here and defined in another translation unit.
+void vecquant4matmul_batched_column_compression_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant4matmul_batched_column_compression_old(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant4matmul_batched_column_compression_old_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher, declared here and defined in another translation unit.
+void vecquant8matmul_batched_faster_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant8matmul_batched_faster(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_batched_faster_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher, declared here and defined in another translation unit.
+void vecquant8matmul_batched_faster_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant8matmul_batched_faster_old(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_batched_faster_old_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher, declared here and defined in another translation unit.
+void vecquant8matmul_batched_column_compression_faster_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant8matmul_batched_column_compression_faster(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_batched_column_compression_faster_cuda(vec, mat, mul, scales, zeros);
+}
+// CUDA launcher, declared here and defined in another translation unit.
+void vecquant8matmul_batched_column_compression_faster_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+);
+// Python-facing entry point: makes the device that owns `vec` current for the
+// duration of the call, then forwards the tensors unchanged to the launcher.
+void vecquant8matmul_batched_column_compression_faster_old(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  const at::cuda::OptionalCUDAGuard cuda_device_scope(device_of(vec));
+  vecquant8matmul_batched_column_compression_faster_old_cuda(vec, mat, mul, scales, zeros);
+}
+// Python bindings: each wrapper above is exposed under its own C++ name.
+// The third argument of m.def is the docstring shown by Python's help().
+// The "faster" variants previously carried descriptions copy-pasted from the
+// "old" variants; they now describe the function they are actually bound to.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("vecquant8matmul", &vecquant8matmul, "Vector 8-bit Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched", &vecquant8matmul_batched, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched_old", &vecquant8matmul_batched_old, "Vector 8-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched_faster", &vecquant8matmul_batched_faster, "Vector 8-bit faster Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched_faster_old", &vecquant8matmul_batched_faster_old, "Vector 8-bit faster old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant4matmul_batched_old", &vecquant4matmul_batched_old, "Vector 4-bit old Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant8matmul_batched_column_compression", &vecquant8matmul_batched_column_compression, "Vector 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant8matmul_batched_column_compression_old", &vecquant8matmul_batched_column_compression_old, "Vector old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant8matmul_batched_column_compression_faster", &vecquant8matmul_batched_column_compression_faster, "Vector faster 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant8matmul_batched_column_compression_faster_old", &vecquant8matmul_batched_column_compression_faster_old, "Vector faster old 8-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant4matmul_batched_column_compression_old", &vecquant4matmul_batched_column_compression_old, "Vector old 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+  m.def("vecquant4matmul_batched", &vecquant4matmul_batched, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) (desc_act)");
+  m.def("vecquant4matmul_batched_column_compression", &vecquant4matmul_batched_column_compression, "Vector 4-bit Batched Quantized Matrix Multiplication (CUDA) with weight's column compressed (desc_act)");
+}

kernels/cache_autogptq_cuda_kernel_256.cu ADDED Viewed

	@@ -0,0 +1,1708 @@

+#define _CRT_SECURE_NO_WARNINGS
+#include <torch/all.h>
+#include <torch/python.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <stdint.h>
+#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700) || defined(USE_ROCM)
+// adapted from https://github.com/PanQiWei/AutoGPTQ/blob/main/autogptq_extension/cuda_256/autogptq_cuda_kernel_256.cu
+// Software atomic add for a 16-bit c10::Half, built on a 32-bit atomicCAS.
+//   address: half-precision element to update (2-byte aligned).
+//   val:     value to add to *address.
+// The enclosing 4-byte aligned word is read, the selected 16-bit lane is
+// updated, and the word is written back with compare-and-swap until no other
+// thread modified it in between.
+__device__ __forceinline__ void atomicAdd(c10::Half* address, c10::Half val) {
+    // Bit 1 of the address tells whether the element is the upper 16-bit lane.
+    const bool upper_lane = (reinterpret_cast<size_t>(address) & 2) != 0;
+    unsigned int *address_as_ui = reinterpret_cast<unsigned int *>(reinterpret_cast<char *>(address) - (reinterpret_cast<size_t>(address) & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+    do {
+        assumed = old;
+        const unsigned short lane_bits = upper_lane ? (old >> 16) : (old & 0xffff);
+        // Reinterpret the raw bits as a half before adding. The previous code
+        // added `val` to the bit pattern treated as an integer, which yields a
+        // numerically wrong sum.
+        c10::Half hsum(lane_bits, c10::Half::from_bits());
+        hsum = hsum + val;
+        old = upper_lane
+                 ? (old & 0xffff) | (static_cast<unsigned int>(hsum.x) << 16)
+                 : (old & 0xffff0000) | hsum.x;
+        old = atomicCAS(address_as_ui, assumed, old);
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+    } while (assumed != old);
+}
+__device__ __forceinline__ void atomicAdd(__half* address, c10::Half val) { /*
+     * Software atomic add of `val` into the 16-bit element at `address`,
+     * implemented as a compare-and-swap loop on the enclosing 32-bit word.
+     */
+    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));  // round down to the 4-byte aligned word
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+    do {
+        assumed = old;
+        __half_raw hsum;
+        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);  // pick the upper or lower 16-bit lane
+        half tmpres = __hadd(hsum, val);
+        hsum = __half_raw(tmpres);
+        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;  // splice the new lane back into the word
+        old = atomicCAS(address_as_ui, assumed, old);  // returns the word found in memory
+    } while (assumed != old);  // retry if another thread changed the word in between
+}
+#endif
+template <typename scalar_t>
+__global__ void VecQuant8MatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    const       int* __restrict__ g_idx,
+    int batch,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulColumnCompressionKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+__global__ void VecQuant8BatchMatMulKernel_faster(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+__global__ void VecQuant8BatchMatMulKernel_faster_old(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width
+);
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+);
+template <typename scalar_t>
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster_old(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+template <typename scalar_t>
+__global__ void VecQuant4BatchMatMulColumnCompressionKernel_old(
+    const  scalar_t* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const  scalar_t* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+__global__ void VecQuant8BatchMatMulKernel_faster( /*
+     * NOTE(review): this file already declares a kernel of this name whose
+     * parameter list ends with an extra `int zero_width`. This prototype omits
+     * it, so it introduces a second overload rather than repeating the first.
+     * Confirm which overload is actually defined and launched.
+     */
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width
+);
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster( /*
+     * NOTE(review): repeats, parameter for parameter, a prototype that appears
+     * earlier in this file; the re-declaration is legal but redundant.
+     */
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+);
+const int BLOCKWIDTH  = 128;  // threads per block in the launchers below; also the tile size along width and the length of the per-thread weight cache
+const int BLOCKHEIGHT8 =  32;  // packed int rows per tile for 8-bit kernels (4 values per int, 32 * 4 = 128)
+const int BLOCKHEIGHT4 =  16;  // packed int rows per tile for 4-bit kernels (8 values per int, 16 * 8 = 128)
+const int BLOCKHEIGHT_OLD4 =  128;  // presumably used by the *_old 4-bit kernels, which are not visible in this part of the file
+//const int BLOCKHEIGHT_OLD8 =  128;
+// Returns `i` as an unsigned int with the same 32-bit pattern, so that a
+// following right shift does not sign-extend.
+__device__ inline unsigned int as_unsigned(int i) {
+  return static_cast<unsigned int>(i);
+}
+// Identity on int: the argument is handed back unchanged.
+__device__ inline int as_int(int i) {
+  return i;
+}
+// Host launcher for VecQuant8BatchMatMulColumnCompressionKernel.
+//   vec:    4-D input; sizes are read as (batch, heads, vec_row, height).
+//   mat:    4-D int tensor; its last dimension times 4 is taken as `width`,
+//           i.e. each int packs four 8-bit values.
+//   mul:    output the kernel accumulates into with atomicAdd.
+//   scales, zeros: dequantization parameters forwarded to the kernel
+//           (zeros is read as int).
+// Fixes versus the previous revision: the dispatch label now names this
+// function (it used to say "vecquant8matmul_batched_cuda", which mislabels
+// dtype-dispatch errors), and the deprecated Tensor::type() / data<T>()
+// accessors are replaced by scalar_type() / data_ptr<T>().
+void vecquant8matmul_batched_column_compression_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3) * 4;
+  // One block per BLOCKWIDTH x BLOCKWIDTH tile of the (height, width) plane.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_batched_column_compression_cuda", ([&] {
+      VecQuant8BatchMatMulColumnCompressionKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(),
+        batch, heads, vec_row, height, width
+      );
+    })
+  );
+}
+template <typename scalar_t> /*
+ * Batched product of `vec` with an 8-bit quantized `mat` whose values are
+ * packed four per int along the width axis. Partial sums are accumulated into
+ * `mul` with atomicAdd.
+ *
+ * Indexing used below:
+ *   vec    : ((b * heads + head) * vec_row + vr) * height + row
+ *   mat    : ((b * heads + head) * height + row) * width / 4 + w / 4
+ *   scales : (b * heads + head) * height + row   (zeros uses the same index)
+ *   mul    : ((b * heads + head) * vec_row + vr) * width + w
+ *
+ * blockIdx.x selects a tile of BLOCKWIDTH rows, and blockIdx.y together with
+ * threadIdx.x selects the output column w. The kernel only adds into `mul`,
+ * so presumably the caller zero-fills it first (confirm at the call site).
+ */
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width / 4;  // number of packed ints in mat
+  int input_total = batch * heads * vec_row * height;  // number of elements in vec
+  int out_total = batch * heads * vec_row * width;  // number of elements in mul
+  int tid = threadIdx.x;
+  // h is the first row of this block's tile; tiles advance in steps of BLOCKWIDTH
+  int h = BLOCKWIDTH * blockIdx.x;
+  // w is the output column handled by this thread
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= height) {  // a thread leaves only when it has neither a column nor a vec element to load
+    return;
+  }
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // tile of vec shared by the block
+  int k;
+  scalar_t w_tmp;
+  float weight[BLOCKWIDTH];  // per-thread dequantized column slice, stored as float
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+        int i_w = (w / 4);  // packed int holding column w
+        int w_bit = (w % 4) * 8;  // bit offset of column w inside that int
+        int w_index = (batch_shift * height + h + k) * width / 4 + i_w;
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range columns contribute nothing
+        } else {
+          scalar_t scale = scales[batch_shift * height + h + k];
+          scalar_t zero = zeros[batch_shift * height + h + k];
+          w_tmp = ((as_unsigned(mat[w_index]) >> w_bit) & 0xFF);
+          weight[k] = scale * (w_tmp - zero);  // dequantize
+        }
+      }
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+          // res is the partial dot product over this tile's rows (at most BLOCKWIDTH of them)
+            res += weight[k] * blockvec[k];
+        }
+        // add res to the output element; blocks with different blockIdx.x add into the same element
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);
+        }
+        __syncthreads();  // all reads of blockvec done before the next vr overwrites it
+      }
+    }
+  }
+}
+// Host launcher for VecQuant8BatchMatMulKernel.
+//   vec:    4-D input; sizes are read as (batch, heads, vec_row, vec_height).
+//   mat:    4-D int tensor; dims 2 and 3 are taken as (height, width), where
+//           height counts packed int rows.
+//   mul:    output the kernel accumulates into with atomicAdd.
+//   scales: dequantization scales forwarded to the kernel.
+//   zeros:  int tensor; its dim 2 is forwarded as zero_width.
+// The deprecated Tensor::type() / data<T>() accessors are replaced by
+// scalar_type() / data_ptr<T>(); behavior is otherwise unchanged.
+void vecquant8matmul_batched_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+  // One block per tile of BLOCKHEIGHT8 packed rows by BLOCKWIDTH columns.
+  dim3 blocks(
+    (height + BLOCKHEIGHT8 - 1) / BLOCKHEIGHT8,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_batched_cuda", ([&] {
+      VecQuant8BatchMatMulKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(),
+        batch, heads, vec_row, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+template <typename scalar_t> /*
+ * Batched product of `vec` with an 8-bit quantized `mat` whose values are
+ * packed four per int along the height axis. Partial sums are accumulated
+ * into `mul` with atomicAdd.
+ *
+ * Indexing used below:
+ *   vec    : ((b * heads + head) * vec_row + vr) * vec_height + row
+ *   mat    : (b * heads + head) * height * width + packed_row * width + w
+ *   scales : (b * heads + head) * width + w
+ *   zeros  : same as scales when zero_width == width; otherwise packed four
+ *            per int at (b * heads + head) * zero_width + w / 4, with 1 added
+ *            after unpacking
+ *   mul    : ((b * heads + head) * vec_row + vr) * width + w
+ *
+ * The kernel only adds into `mul`, so presumably the caller zero-fills it
+ * first (confirm at the call site).
+ */
+__global__ void VecQuant8BatchMatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+) {
+  int weight_total = batch * heads * height * width;  // number of packed ints in mat
+  int input_total = batch * heads * vec_row * vec_height;  // number of elements in vec
+  int out_total = batch * heads * vec_row * width;  // number of elements in mul
+  int tid = threadIdx.x;
+  // h is the first packed row of this block's tile; tiles advance in steps of BLOCKHEIGHT8
+  int h = BLOCKHEIGHT8 * blockIdx.x;
+  // w is the output column handled by this thread
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= vec_height) {  // a thread leaves only when it has neither a column nor a vec element to load
+    return;
+  }
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // tile of vec shared by the block
+  // i is the offset of (first packed row of the tile, column w) within one (batch, head) slice of mat
+  int i = width * h + w;
+  // if (i >= width * height) {
+  //   return;
+  // }
+  int k;
+  scalar_t w_tmp;
+  int z_w = w / 4;  // packed int holding the zero point of column w
+  int z_mod = (w % 4) * 8;  // bit offset of that zero point
+  float weight[BLOCKWIDTH];  // per-thread dequantized column slice, stored as float
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h * 4 + k < vec_height; ++k){  // h * 4 + k is the unpacked row
+        int k_w = (k / 4);  // packed row inside the tile
+        int k_bit = (k % 4) * 8;  // bit offset inside the packed int
+        int w_index = batch_shift * height * width + i + (k_w * width);
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range entries contribute nothing
+        } else {
+          scalar_t scale = scales[batch_shift * width + w];
+          scalar_t zero;
+          if (zero_width == width) {
+            zero = zeros[batch_shift * width + w];  // one int per column
+          } else {
+            zero = scalar_t(((as_unsigned(zeros[batch_shift * zero_width + z_w]) >> z_mod) & 0xFF) + 1);  // packed zero point, stored minus one
+          }
+          w_tmp = ((as_unsigned(mat[w_index]) >> k_bit) & 0xFF);
+          weight[k] = scale * (w_tmp - zero);  // dequantize
+        }
+      }
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h * 4 + k < vec_height; ++k){
+          // res is the partial dot product over this tile's rows (at most BLOCKWIDTH of them)
+            res += weight[k] * blockvec[k];
+        }
+        // add res to the output element; blocks with different blockIdx.x add into the same element
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);
+        }
+        __syncthreads();  // all reads of blockvec done before the next vr overwrites it
+      }
+    }
+  }
+}
+// Host launcher for VecQuant8MatMulKernel (non-batched, with group index).
+//   vec:    2-D input; sizes are read as (batch, vec_height).
+//   mat:    2-D int tensor; sizes are read as (height, width), where height
+//           counts packed int rows.
+//   mul:    output the kernel accumulates into with atomicAdd.
+//   scales: dequantization scales forwarded to the kernel.
+//   zeros:  int tensor; its dim 1 is forwarded as zero_width.
+//   g_idx:  int tensor the kernel uses to select the scale/zero row for each
+//           unpacked row.
+// The deprecated Tensor::type() / data<T>() accessors are replaced by
+// scalar_type() / data_ptr<T>(); behavior is otherwise unchanged.
+void vecquant8matmul_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros,
+  torch::Tensor g_idx
+) {
+  int batch = vec.size(0);
+  int vec_height = vec.size(1);
+  int height = mat.size(0);
+  int width = mat.size(1);
+  int zero_width = zeros.size(1);
+  // One block per tile of BLOCKHEIGHT8 packed rows by BLOCKWIDTH columns.
+  dim3 blocks(
+    (height + BLOCKHEIGHT8 - 1) / BLOCKHEIGHT8,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_cuda", ([&] {
+      VecQuant8MatMulKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(), g_idx.data_ptr<int>(),
+        batch, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+template <typename scalar_t> /*
+ * Product of `vec` (indexed as b * vec_height + row) with an 8-bit quantized
+ * `mat` packed four values per int along the height axis. Results are
+ * accumulated into `mul` (indexed as b * width + w) with atomicAdd.
+ * `g_idx` gives, for each unpacked row, the row of `scales` / `zeros` to use.
+ *
+ * NOTE(review): nothing in this kernel checks w against width or the row
+ * against height / vec_height, so it appears to rely on the caller passing
+ * sizes that are multiples of the tile sizes. Confirm at the call site.
+ */
+__global__ void VecQuant8MatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    const       int* __restrict__ g_idx,
+    int batch,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+) {
+  int h = BLOCKHEIGHT8 * blockIdx.x;  // first packed row of this block's tile
+  int w = BLOCKWIDTH * blockIdx.y + threadIdx.x;  // output column handled by this thread
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // tile of vec shared by the block
+  int i = width * h + w;  // offset of (first packed row, column w) in mat
+  int g_h = h * 4;  // first unpacked row of the tile
+  int k;
+  unsigned int g;
+  scalar_t w_tmp;
+  int z_w = w / 4;  // packed int holding the zero point of column w
+  int z_mod = (w % 4) * 8;  // bit offset of that zero point
+  float weight[BLOCKWIDTH];  // per-thread dequantized column slice, stored as float
+  for (k = 0; k <  BLOCKWIDTH; ++k){
+    int k_w = (k / 4);  // packed row inside the tile
+    int k_bit = (k % 4) * 8;  // bit offset inside the packed int
+      g = as_int(g_idx[g_h + k]);  // group of unpacked row g_h + k
+      scalar_t scale = scales[g * width + w];
+      scalar_t zero = scalar_t(((as_unsigned(zeros[g * zero_width + z_w]) >> z_mod) & 0xFF) + 1);  // packed zero point, stored minus one
+      w_tmp = ((as_unsigned(mat[i + (k_w * width)]) >> k_bit) & 0xFF);
+    weight[k] = scale * (w_tmp - zero);  // dequantize
+  }
+  scalar_t res;
+  for (int b = 0; b < batch; ++b){
+      res = 0;
+    blockvec[threadIdx.x] = vec[b * vec_height + blockIdx.x * BLOCKWIDTH + threadIdx.x];
+    __syncthreads();  // blockvec fully written before any thread reads it
+    for (k = 0; k <  BLOCKWIDTH; ++k){
+      res += weight[k] * blockvec[k];
+    }
+    atomicAdd(&mul[b * width + w], res);  // blocks with different blockIdx.x add into the same element
+    __syncthreads();  // all reads of blockvec done before the next b overwrites it
+  }
+}
+// Host launcher for VecQuant4BatchMatMulKernel.
+//   vec:    4-D input; sizes are read as (batch, heads, vec_row, vec_height).
+//   mat:    4-D int tensor; dims 2 and 3 are taken as (height, width), where
+//           height counts packed int rows.
+//   mul:    output the kernel accumulates into with atomicAdd.
+//   scales: dequantization scales forwarded to the kernel.
+//   zeros:  int tensor; its dim 2 is forwarded as zero_width.
+// The deprecated Tensor::type() / data<T>() accessors are replaced by
+// scalar_type() / data_ptr<T>(); behavior is otherwise unchanged.
+void vecquant4matmul_batched_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+  // One block per tile of BLOCKHEIGHT4 packed rows by BLOCKWIDTH columns.
+  dim3 blocks(
+    (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant4matmul_batched_cuda", ([&] {
+      VecQuant4BatchMatMulKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(),
+        batch, heads, vec_row, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+template <typename scalar_t> /*
+ * Batched product of `vec` with a 4-bit quantized `mat` whose values are
+ * packed eight per int along the height axis. Partial sums are accumulated
+ * into `mul` with atomicAdd.
+ *
+ * Indexing used below:
+ *   vec    : ((b * heads + head) * vec_row + vr) * vec_height + row
+ *   mat    : (b * heads + head) * height * width + packed_row * width + w
+ *   scales : (b * heads + head) * width + w
+ *   zeros  : same as scales when zero_width == width; otherwise packed eight
+ *            per int at (b * heads + head) * zero_width + w / 8
+ *   mul    : ((b * heads + head) * vec_row + vr) * width + w
+ *
+ * The kernel only adds into `mul`, so presumably the caller zero-fills it
+ * first (confirm at the call site).
+ */
+__global__ void VecQuant4BatchMatMulKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+) {
+  int weight_total = batch * heads * height * width;  // number of packed ints in mat
+  int input_total = batch * heads * vec_row * vec_height;  // number of elements in vec
+  int out_total = batch * heads * vec_row * width;  // number of elements in mul
+  int tid = threadIdx.x;
+  // h is the first packed row of this block's tile; tiles advance in steps of BLOCKHEIGHT4
+  int h = BLOCKHEIGHT4 * blockIdx.x;
+  // w is the output column handled by this thread
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= vec_height) {  // a thread leaves only when it has neither a column nor a vec element to load
+    return;
+  }
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // tile of vec shared by the block
+  // i is the offset of (first packed row of the tile, column w) within one (batch, head) slice of mat
+  int i = width * h + w;
+  int k;
+  scalar_t w_tmp;
+  int z_w = w / 8;  // packed int holding the zero point of column w
+  int z_mod = (w % 8) * 4;  // bit offset of that zero point
+  float weight[BLOCKWIDTH];  // per-thread dequantized column slice, stored as float
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h * 8 + k < vec_height; ++k){  // h * 8 + k is the unpacked row
+        int k_w = (k / 8);  // packed row inside the tile
+        int k_bit = (k % 8) * 4;  // bit offset inside the packed int
+        int w_index = batch_shift * height * width + i + (k_w * width);
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range entries contribute nothing
+        } else {
+          scalar_t scale = scales[batch_shift * width + w];
+          scalar_t zero;
+          if (zero_width == width) {
+            zero = zeros[batch_shift * width + w];  // one int per column
+          } else {
+            zero = scalar_t(((as_unsigned(zeros[batch_shift * zero_width + z_w]) >> z_mod) & 0xF));  // packed 4-bit zero point
+          }
+          w_tmp = ((as_unsigned(mat[w_index]) >> k_bit) & 0xF);
+          weight[k] = scale * (w_tmp - zero);  // dequantize
+        }
+      }
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h * 8 + k < vec_height; ++k){
+          // res is the partial dot product over this tile's rows (at most BLOCKWIDTH of them)
+            res += weight[k] * blockvec[k];
+        }
+        // add res to the output element; blocks with different blockIdx.x add into the same element
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);
+        }
+        __syncthreads();  // all reads of blockvec done before the next vr overwrites it
+      }
+    }
+  }
+}
+// Host launcher for VecQuant4BatchMatMulColumnCompressionKernel.
+//   vec:    4-D input; sizes are read as (batch, heads, vec_row, height).
+//   mat:    4-D int tensor; its last dimension times 8 is taken as `width`,
+//           i.e. each int packs eight 4-bit values.
+//   mul:    output the kernel accumulates into with atomicAdd.
+//   scales, zeros: dequantization parameters forwarded to the kernel
+//           (zeros is read as int).
+// Fixes versus the previous revision: the dispatch label now names this
+// function (it used to say "vecquant4matmul_batched_cuda", which mislabels
+// dtype-dispatch errors), and the deprecated Tensor::type() / data<T>()
+// accessors are replaced by scalar_type() / data_ptr<T>().
+void vecquant4matmul_batched_column_compression_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3) * 8;
+  // One block per BLOCKWIDTH x BLOCKWIDTH tile of the (height, width) plane.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant4matmul_batched_column_compression_cuda", ([&] {
+      VecQuant4BatchMatMulColumnCompressionKernel<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<int>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<int>(),
+        batch, heads, vec_row, height, width
+      );
+    })
+  );
+}
+template <typename scalar_t> /*
+ * Batched product of `vec` with a 4-bit quantized `mat` whose values are
+ * packed eight per int along the width axis. Partial sums are accumulated
+ * into `mul` with atomicAdd.
+ *
+ * Indexing used below:
+ *   vec    : ((b * heads + head) * vec_row + vr) * height + row
+ *   mat    : ((b * heads + head) * height + row) * width / 8 + w / 8
+ *   scales : (b * heads + head) * height + row   (zeros uses the same index)
+ *   mul    : ((b * heads + head) * vec_row + vr) * width + w
+ *
+ * blockIdx.x selects a tile of BLOCKWIDTH rows, and blockIdx.y together with
+ * threadIdx.x selects the output column w. The kernel only adds into `mul`,
+ * so presumably the caller zero-fills it first (confirm at the call site).
+ */
+__global__ void VecQuant4BatchMatMulColumnCompressionKernel(
+    const  scalar_t* __restrict__ vec,
+    const       int* __restrict__ mat,
+           scalar_t* __restrict__ mul,
+    const  scalar_t* __restrict__ scales,
+    const       int* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width / 8;  // number of packed ints in mat
+  int input_total = batch * heads * vec_row * height;  // number of elements in vec
+  int out_total = batch * heads * vec_row * width;  // number of elements in mul
+  int tid = threadIdx.x;
+  // h is the first row of this block's tile; tiles advance in steps of BLOCKWIDTH
+  int h = BLOCKWIDTH * blockIdx.x;
+  // w is the output column handled by this thread
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= height) {  // a thread leaves only when it has neither a column nor a vec element to load
+    return;
+  }
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // tile of vec shared by the block
+  int k;
+  scalar_t w_tmp;
+  float weight[BLOCKWIDTH];  // per-thread dequantized column slice, stored as float
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flattened (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+        int i_w = (w / 8);  // packed int holding column w
+        int w_bit = (w % 8) * 4;  // bit offset of column w inside that int
+        int w_index = (batch_shift * height + h + k) * width / 8 + i_w;
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;  // out-of-range columns contribute nothing
+        } else {
+          scalar_t scale = scales[batch_shift * height + h + k];
+          scalar_t zero = zeros[batch_shift * height + h + k];
+          w_tmp = ((as_unsigned(mat[w_index]) >> w_bit) & 0xF);
+          weight[k] = scale * (w_tmp - zero);  // dequantize
+        }
+      }
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+          // res is the partial dot product over this tile's rows (at most BLOCKWIDTH of them)
+            res += weight[k] * blockvec[k];
+        }
+        // add res to the output element; blocks with different blockIdx.x add into the same element
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {
+            atomicAdd(&mul[out_index], res);
+        }
+        __syncthreads();  // all reads of blockvec done before the next vr overwrites it
+      }
+    }
+  }
+}
+// Host-side launcher for VecQuant8BatchMatMulKernel_old.
+//
+// Sizes are taken from the tensors as follows:
+//   vec   : (batch, heads, vec_row, vec_height)
+//   mat   : (.., .., height, width), one 8-bit value per byte
+//   zeros : size(2) is forwarded as zero_width
+// The kernel only ever atomicAdd()s into `mul`, so the caller presumably has
+// to pass it zero-initialised -- TODO confirm against the Python call site.
+void vecquant8matmul_batched_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+  // One block per BLOCKWIDTH x BLOCKWIDTH tile of the (height, width) plane.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>()
+  // accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_batched_old_cuda", ([&] {
+      VecQuant8BatchMatMulKernel_old<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<uint8_t>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<scalar_t>(),
+        batch, heads, vec_row, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+template <typename scalar_t>  // Batched vec x 8-bit-quantised-matrix product with one scale/zero per output column w.
+__global__ void VecQuant8BatchMatMulKernel_old(
+    const  scalar_t* __restrict__ vec,  // indexed as [batch, heads, vec_row, vec_height]
+    const  uint8_t* __restrict__ mat,  // one 8-bit value per byte, indexed as [batch, heads, height, width]
+           scalar_t* __restrict__ mul,  // output, indexed as [batch, heads, vec_row, width]; accumulated with atomicAdd
+    const  scalar_t* __restrict__ scales,  // indexed as [batch, heads, width]
+    const  scalar_t* __restrict__ zeros,  // indexed as [batch, heads, width]
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width  // not read anywhere in this kernel
+) {
+  int weight_total = batch * heads * height * width;  // number of elements in mat
+  int input_total = batch * heads * vec_row * vec_height;  // number of elements in vec
+  int out_total = batch * heads * vec_row * width;  // number of elements in mul
+  int tid = threadIdx.x;
+  // h is the first row of mat handled by this block (step BLOCKWIDTH per blockIdx.x)
+  int h = BLOCKWIDTH * blockIdx.x;
+  // w is the output column handled by this thread (one column per thread)
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= vec_height) {  // NOTE(review): exits only when BOTH hold, and does so before the __syncthreads() calls below -- confirm this is intended
+    return;
+  }
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of the current vec row, one element per thread
+  // i is the offset of element (h, w) inside one (height, width) matrix
+  int i = width * h + w;
+  int k;
+  scalar_t w_tmp;
+  float weight[BLOCKWIDTH];  // dequantised column w for rows h .. h + BLOCKWIDTH - 1
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flat (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h + k < vec_height; ++k){
+        int k_w = k;
+        int w_index = batch_shift * height * width + i + (k_w * width);
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;
+        } else {
+          scalar_t scale = scales[batch_shift * width + w];
+          scalar_t zero = zeros[batch_shift * width + w];
+          w_tmp = as_unsigned(mat[w_index]);
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {  // only a whole-buffer bound: for h + tid >= vec_height this loads an element of a later row, which the bounded loop below never reads
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec must be fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h + k < vec_height; ++k){
+          // res is this block's partial dot product over rows h .. h + BLOCKWIDTH - 1
+            res += weight[k] * blockvec[k];
+        }
+        // add the partial sum to the output element (batch_shift, vr, w)
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {  // for w >= width the weights are all zero, so the value added here is 0
+            atomicAdd(&mul[out_index], res);
+        }
+        __syncthreads();  // keep blockvec intact until every thread has finished reading it
+      }
+    }
+  }
+}
+// Host-side launcher for VecQuant8BatchMatMulKernel_faster (float16 only).
+//
+// Sizes are taken from the tensors as follows:
+//   vec   : (batch, heads, vec_row, vec_height)
+//   mat   : (.., .., height, width), reinterpreted as uint8_t
+//   zeros : size(2) is forwarded as zero_width
+// The kernel only ever atomicAdd()s into `mul`, so the caller presumably has
+// to pass it zero-initialised -- TODO confirm against the Python call site.
+void vecquant8matmul_batched_faster_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  // The buffers below are reinterpreted as half* without any dispatch, so a
+  // float32/bfloat16 tensor would silently be read as garbage. Fail loudly.
+  TORCH_CHECK(
+    vec.scalar_type() == at::ScalarType::Half &&
+    mul.scalar_type() == at::ScalarType::Half &&
+    scales.scalar_type() == at::ScalarType::Half &&
+    zeros.scalar_type() == at::ScalarType::Half,
+    "vecquant8matmul_batched_faster_cuda: vec, mul, scales and zeros must be float16 tensors"
+  );
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+  // One block per BLOCKWIDTH x BLOCKWIDTH tile of the (height, width) plane.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  VecQuant8BatchMatMulKernel_faster<<<blocks, threads>>>(
+    (half*) vec.data_ptr(),
+    (uint8_t*) mat.data_ptr(),
+    (half*) mul.data_ptr(),
+    (half*) scales.data_ptr(),
+    (half*) zeros.data_ptr(),
+    batch, heads, vec_row, vec_height, height, width, zero_width
+  );
+}
+// Batched float16 vec x 8-bit-quantised-matrix product with one scale/zero
+// per output column w. Accumulation is done in float; the block's partial
+// sum is added to `mul` with a half atomicAdd.
+//
+// Indexing used below:
+//   vec    [batch, heads, vec_row, vec_height]
+//   mat    [batch, heads, height, width]
+//   mul    [batch, heads, vec_row, width]
+//   scales [batch, heads, width]
+//   zeros  [batch, heads, width]
+// zero_width is accepted for signature compatibility and is not used.
+//
+// Each block covers rows h .. h + BLOCKWIDTH - 1 of mat, each thread one
+// output column w.
+__global__ void VecQuant8BatchMatMulKernel_faster(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width
+) {
+  int input_total = batch * heads * vec_row * vec_height;
+  int out_total = batch * heads * vec_row * width;
+  int tid = threadIdx.x;
+  int h = BLOCKWIDTH * blockIdx.x;
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  // Threads whose column lies past `width` must neither read mat/scales/zeros
+  // (out of range) nor write mul (row * width + w would alias an element of
+  // another row). They are kept alive, instead of returning early, so that
+  // every thread of the block reaches the __syncthreads() barriers below.
+  const bool col_valid = w < width;
+  // Number of rows of this tile that exist in both vec and mat.
+  int rows = (vec_height < height ? vec_height : height) - h;
+  if (rows > BLOCKWIDTH) {
+    rows = BLOCKWIDTH;
+  }
+  __shared__ float blockvec[BLOCKWIDTH];
+  int i = width * h + w;
+  int k;
+  float weight[BLOCKWIDTH];
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;
+      if (col_valid) {
+        // scale/zero depend only on (batch, head, w): load them once.
+        float scale = __half2float(scales[batch_shift * width + w]);
+        float zero = __half2float(zeros[batch_shift * width + w]);
+        for (k = 0; k < rows; ++k){
+          int w_index = batch_shift * height * width + i + (k * width);
+          float w_tmp = as_unsigned(mat[w_index]);
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+      for (int vr = 0; vr < vec_row; ++vr){
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + h + tid;
+        // Only elements that belong to the current vec row are loaded.
+        if (tid < rows && vec_index < input_total) {
+            blockvec[tid] = __half2float(vec[vec_index]);
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec complete before it is read
+        if (col_valid) {
+          float res = 0;
+          for (k = 0; k < rows; ++k){
+            res += weight[k] * blockvec[k];
+          }
+          int out_index = (batch_shift * vec_row + vr) * width + w;
+          if (out_index < out_total) {
+              atomicAdd(&mul[out_index], __float2half(res));
+          }
+        }
+        __syncthreads();  // all reads done before blockvec is overwritten
+      }
+    }
+  }
+}
+// Host-side launcher for VecQuant8BatchMatMulColumnCompressionKernel_faster
+// (float16 only).
+//
+// Sizes are taken from the tensors as follows:
+//   vec : (batch, heads, vec_row, height)
+//   mat : last dimension is width; reinterpreted as uint8_t
+// The kernel only ever atomicAdd()s into `mul`, so the caller presumably has
+// to pass it zero-initialised -- TODO confirm against the Python call site.
+void vecquant8matmul_batched_column_compression_faster_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  // The buffers below are reinterpreted as half* without any dispatch, so a
+  // float32/bfloat16 tensor would silently be read as garbage. Fail loudly.
+  TORCH_CHECK(
+    vec.scalar_type() == at::ScalarType::Half &&
+    mul.scalar_type() == at::ScalarType::Half &&
+    scales.scalar_type() == at::ScalarType::Half &&
+    zeros.scalar_type() == at::ScalarType::Half,
+    "vecquant8matmul_batched_column_compression_faster_cuda: vec, mul, scales and zeros must be float16 tensors"
+  );
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3);
+  // One block per BLOCKWIDTH x BLOCKWIDTH tile of the (height, width) plane.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  VecQuant8BatchMatMulColumnCompressionKernel_faster<<<blocks, threads>>>(
+    (half*) vec.data_ptr(),
+    (uint8_t*) mat.data_ptr(),
+    (half*) mul.data_ptr(),
+    (half*) scales.data_ptr(),
+    (half*) zeros.data_ptr(),
+    batch, heads, vec_row, height, width
+  );
+}
+// Batched float16 vec x 8-bit-quantised-matrix product in which every row of
+// mat has its own scale/zero. Accumulation is done in float; the block's
+// partial sum is added to `mul` with a half atomicAdd.
+//
+// Indexing used below:
+//   vec    [batch, heads, vec_row, height]
+//   mat    [batch, heads, height, width]
+//   mul    [batch, heads, vec_row, width]
+//   scales [batch, heads, height]
+//   zeros  [batch, heads, height]
+//
+// Each block covers rows h .. h + BLOCKWIDTH - 1 of mat, each thread one
+// output column w.
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  int input_total = batch * heads * vec_row * height;
+  int out_total = batch * heads * vec_row * width;
+  int tid = threadIdx.x;
+  int h = BLOCKWIDTH * blockIdx.x;
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  // Threads whose column lies past `width` must neither read mat (out of
+  // range) nor write mul (row * width + w would alias an element of another
+  // row). They are kept alive, instead of returning early, so that every
+  // thread of the block reaches the __syncthreads() barriers below.
+  const bool col_valid = w < width;
+  // Rows of this tile that really exist. Without this bound the last tile
+  // would read scales/zeros/mat past row `height` and fold the neighbouring
+  // rows of vec into the sum whenever height is not a multiple of BLOCKWIDTH.
+  int rows = height - h;
+  if (rows > BLOCKWIDTH) {
+    rows = BLOCKWIDTH;
+  }
+  __shared__ float blockvec[BLOCKWIDTH];
+  int k;
+  float weight[BLOCKWIDTH];
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;
+      if (col_valid) {
+        for (k = 0; k < rows; ++k){
+          int row = batch_shift * height + h + k;
+          float scale = __half2float(scales[row]);
+          float zero = __half2float(zeros[row]);
+          float w_tmp = mat[row * width + w];
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+      for (int vr = 0; vr < vec_row; ++vr){
+        int vec_index = (batch_shift * vec_row + vr) * height + h + tid;
+        // Only elements that belong to the current vec row are loaded.
+        if (tid < rows && vec_index < input_total) {
+            blockvec[tid] = __half2float(vec[vec_index]);
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec complete before it is read
+        if (col_valid) {
+          float res = 0;
+          for (k = 0; k < rows; ++k){
+            res += weight[k] * blockvec[k];
+          }
+          int out_index = (batch_shift * vec_row + vr) * width + w;
+          if (out_index < out_total) {
+              atomicAdd(&mul[out_index], __float2half(res));
+          }
+        }
+        __syncthreads();  // all reads done before blockvec is overwritten
+      }
+    }
+  }
+}
+// Host-side launcher for VecQuant8BatchMatMulColumnCompressionKernel_old.
+//
+// Sizes are taken from the tensors as follows:
+//   vec : (batch, heads, vec_row, height)
+//   mat : last dimension is width, one 8-bit value per byte
+// The kernel only ever atomicAdd()s into `mul`, so the caller presumably has
+// to pass it zero-initialised -- TODO confirm against the Python call site.
+void vecquant8matmul_batched_column_compression_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3);
+  // One block per BLOCKWIDTH x BLOCKWIDTH tile of the (height, width) plane.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>()
+  // accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant8matmul_batched_column_compression_old_cuda", ([&] {
+      VecQuant8BatchMatMulColumnCompressionKernel_old<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<uint8_t>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<scalar_t>(),
+        batch, heads, vec_row, height, width
+      );
+    })
+  );
+}
+template <typename scalar_t>  // Batched vec x 8-bit-quantised-matrix product; every row (h + k) of mat has its own scale/zero.
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_old(
+    const  scalar_t* __restrict__ vec,  // indexed as [batch, heads, vec_row, height]
+    const  uint8_t* __restrict__ mat,  // one 8-bit value per byte, indexed as [batch, heads, height, width]
+           scalar_t* __restrict__ mul,  // output, indexed as [batch, heads, vec_row, width]; accumulated with atomicAdd
+    const  scalar_t* __restrict__ scales,  // indexed as [batch, heads, height]
+    const  scalar_t* __restrict__ zeros,  // indexed as [batch, heads, height]
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width;  // number of elements in mat
+  int input_total = batch * heads * vec_row * height;  // number of elements in vec
+  int out_total = batch * heads * vec_row * width;  // number of elements in mul
+  int tid = threadIdx.x;
+  // h is the first row of mat handled by this block (step BLOCKWIDTH per blockIdx.x)
+  int h = BLOCKWIDTH * blockIdx.x;
+  // w is the output column handled by this thread (one column per thread)
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= height) {  // NOTE(review): exits only when BOTH hold, and does so before the __syncthreads() calls below -- confirm this is intended
+    return;
+  }
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of the current vec row, one element per thread
+  int k;
+  scalar_t w_tmp;
+  float weight[BLOCKWIDTH];  // dequantised column w for rows h .. h + BLOCKWIDTH - 1
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flat (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+        int w_index = (batch_shift * height + h + k) * width  + w;
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;
+        } else {
+          scalar_t scale = scales[batch_shift * height + h + k];
+          scalar_t zero = zeros[batch_shift * height + h + k];
+          w_tmp = mat[w_index];
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {  // only a whole-buffer bound: for h + tid >= height this loads an element of a later row, which the bounded loop below never reads
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec must be fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h + k < height; ++k){
+          // res is this block's partial dot product over rows h .. h + BLOCKWIDTH - 1
+            res += weight[k] * blockvec[k];
+        }
+        // add the partial sum to the output element (batch_shift, vr, w)
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {  // for w >= width the weights are all zero, so the value added here is 0
+            atomicAdd(&mul[out_index], res);
+        }
+        __syncthreads();  // keep blockvec intact until every thread has finished reading it
+      }
+    }
+  }
+}
+// Host-side launcher for VecQuant4BatchMatMulKernel_old.
+//
+// Sizes are taken from the tensors as follows:
+//   vec   : (batch, heads, vec_row, vec_height)
+//   mat   : (.., .., height, width), accessed as uint8_t
+//   zeros : size(2) is forwarded as zero_width
+// The grid is tiled with BLOCKHEIGHT_OLD4 along height and BLOCKWIDTH along
+// width. The kernel only ever atomicAdd()s into `mul`, so the caller
+// presumably has to pass it zero-initialised -- TODO confirm.
+void vecquant4matmul_batched_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  int zero_width = zeros.size(2);
+  dim3 blocks(
+    (height + BLOCKHEIGHT_OLD4 - 1) / BLOCKHEIGHT_OLD4,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>()
+  // accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant4matmul_batched_old_cuda", ([&] {
+      VecQuant4BatchMatMulKernel_old<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<uint8_t>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<scalar_t>(),
+        batch, heads, vec_row, vec_height, height, width, zero_width
+      );
+    })
+  );
+}
+template <typename scalar_t>  // Batched vec x 4-bit-quantised-matrix product (two 4-bit values per byte along height) with one scale/zero per output column w.
+__global__ void VecQuant4BatchMatMulKernel_old(
+    const  scalar_t* __restrict__ vec,  // indexed as [batch, heads, vec_row, vec_height]
+    const  uint8_t* __restrict__ mat,  // two 4-bit values per byte, indexed as [batch, heads, height, width]
+           scalar_t* __restrict__ mul,  // output, indexed as [batch, heads, vec_row, width]; accumulated with atomicAdd
+    const  scalar_t* __restrict__ scales,  // indexed as [batch, heads, width]
+    const  scalar_t* __restrict__ zeros,  // indexed as [batch, heads, width]
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width,
+    int zero_width  // not read anywhere in this kernel
+) {
+  int weight_total = batch * heads * height * width;  // number of bytes in mat
+  int input_total = batch * heads * vec_row * vec_height;  // number of elements in vec
+  int out_total = batch * heads * vec_row * width;  // number of elements in mul
+  int tid = threadIdx.x;
+  // h is the first packed row of mat handled by this block (step BLOCKHEIGHT_OLD4 per blockIdx.x)
+  int h = BLOCKHEIGHT_OLD4 * blockIdx.x;
+  // w is the output column handled by this thread (one column per thread)
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= vec_height) {  // NOTE(review): exits only when BOTH hold, and does so before the __syncthreads() calls below -- confirm this is intended
+    return;
+  }
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of the current vec row, one element per thread
+  // i is the offset of element (h, w) inside one (height, width) matrix
+  int i = width * h + w;
+  int k;
+  scalar_t w_tmp;
+  float weight[BLOCKWIDTH];  // dequantised column w for the unpacked rows covered by this block
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flat (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h*2 + k < vec_height; ++k){
+        int k_w = (k / 2);  // packed row that holds unpacked row k
+        int k_bit = (k % 2) * 4;  // low nibble for even k, high nibble for odd k
+        int w_index = batch_shift * height * width + i + (k_w * width);
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;
+        } else {
+          scalar_t scale = scales[batch_shift * width + w];
+          scalar_t zero = zeros[batch_shift * width + w];
+          w_tmp = ((as_unsigned(mat[w_index]) >> k_bit) & 0xF);  // extract the 4-bit value
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + blockIdx.x * BLOCKWIDTH + tid;  // matches h*2 only if BLOCKHEIGHT_OLD4 == BLOCKWIDTH / 2 -- TODO confirm
+        if (vec_index < input_total) {  // only a whole-buffer bound; elements past the current row are never read by the bounded loop below
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec must be fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h*2 + k < vec_height; ++k){
+          // res is this block's partial dot product over its unpacked rows
+            res += weight[k] * blockvec[k];
+        }
+        // add the partial sum to the output element (batch_shift, vr, w)
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {  // for w >= width the weights are all zero, so the value added here is 0
+            atomicAdd(&mul[out_index], res);
+        }
+        __syncthreads();  // keep blockvec intact until every thread has finished reading it
+      }
+    }
+  }
+}
+// Host-side launcher for VecQuant4BatchMatMulColumnCompressionKernel_old.
+//
+// Sizes are taken from the tensors as follows:
+//   vec : (batch, heads, vec_row, height)
+//   mat : last dimension is width, accessed as uint8_t
+// The grid is tiled with BLOCKHEIGHT_OLD4 along height and BLOCKWIDTH along
+// width. The kernel only ever atomicAdd()s into `mul`, so the caller
+// presumably has to pass it zero-initialised -- TODO confirm.
+void vecquant4matmul_batched_column_compression_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int height = vec.size(3);
+  int width = mat.size(3);
+  dim3 blocks(
+    (height + BLOCKHEIGHT_OLD4 - 1) / BLOCKHEIGHT_OLD4,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  // scalar_type()/data_ptr<T>() replace the deprecated type()/data<T>()
+  // accessors.
+  AT_DISPATCH_FLOATING_TYPES(
+    vec.scalar_type(), "vecquant4matmul_batched_column_compression_old_cuda", ([&] {
+      VecQuant4BatchMatMulColumnCompressionKernel_old<<<blocks, threads>>>(
+        vec.data_ptr<scalar_t>(), mat.data_ptr<uint8_t>(), mul.data_ptr<scalar_t>(),
+        scales.data_ptr<scalar_t>(), zeros.data_ptr<scalar_t>(),
+        batch, heads, vec_row, height, width
+      );
+    })
+  );
+}
+template <typename scalar_t>  // Batched vec x 4-bit-quantised-matrix product (two 4-bit values per byte); scale/zero are looked up per row (h + k).
+__global__ void VecQuant4BatchMatMulColumnCompressionKernel_old(
+    const  scalar_t* __restrict__ vec,  // indexed as [batch, heads, vec_row, height]
+    const  uint8_t* __restrict__ mat,  // two 4-bit values per byte; see the NOTE(review) on w_index below
+           scalar_t* __restrict__ mul,  // output, indexed as [batch, heads, vec_row, width]; accumulated with atomicAdd
+    const  scalar_t* __restrict__ scales,  // indexed as [batch, heads, height]
+    const  scalar_t* __restrict__ zeros,  // indexed as [batch, heads, height]
+    int batch,
+    int heads,
+    int vec_row,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width;  // upper bound used for w_index
+  int input_total = batch * heads * vec_row * height;  // number of elements in vec
+  int out_total = batch * heads * vec_row * width;  // number of elements in mul
+  int tid = threadIdx.x;
+  // h advances by BLOCKHEIGHT_OLD4 per blockIdx.x
+  int h = BLOCKHEIGHT_OLD4 * blockIdx.x;
+  // w is the output column handled by this thread (one column per thread)
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  if (w >= width && tid >= height) {  // NOTE(review): exits only when BOTH hold, and does so before the __syncthreads() calls below -- confirm this is intended
+    return;
+  }
+  __shared__ scalar_t blockvec[BLOCKWIDTH];  // slice of the current vec row, one element per thread
+  int k;
+  scalar_t w_tmp;
+  float weight[BLOCKWIDTH];  // dequantised weights for the rows covered by this block
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;  // flat (batch, head) index
+      for (k = 0; k <  BLOCKWIDTH && h*2 + k < height; ++k){
+        int k_w = (k / 2);
+        int k_bit = (k % 2) * 4;  // low nibble for even k, high nibble for odd k
+        int w_index = (batch_shift * height + h + k) * width  + k_w;  // NOTE(review): the offset inside the row is k_w, not w, so w_index is the same for every thread of the block -- confirm this indexing is intended
+        if (w_index >= weight_total || w >= width) {
+          weight[k] = 0;
+        } else {
+          scalar_t scale = scales[batch_shift * height + h + k];
+          scalar_t zero = zeros[batch_shift * height + h + k];
+          w_tmp = ((as_unsigned(mat[w_index]) >> k_bit) & 0xF);  // extract the 4-bit value
+          weight[k] = scale * (w_tmp - zero);
+        }
+      }
+      scalar_t res;
+      for (int vr = 0; vr < vec_row; ++vr){
+          res = 0;
+        int vec_index = (batch_shift * vec_row + vr) * height + blockIdx.x * BLOCKWIDTH + tid;
+        if (vec_index < input_total) {  // only a whole-buffer bound; elements past the current row are never read by the bounded loop below
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = 0;
+        }
+        __syncthreads();  // blockvec must be fully written before any thread reads it
+          for (k = 0; k <  BLOCKWIDTH && h*2 + k < height; ++k){
+          // res is this block's partial dot product over its rows
+            res += weight[k] * blockvec[k];
+        }
+        // add the partial sum to the output element (batch_shift, vr, w)
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        if (out_index < out_total) {  // for w >= width the weights are all zero, so the value added here is 0
+            atomicAdd(&mul[out_index], res);
+        }
+        __syncthreads();  // keep blockvec intact until every thread has finished reading it
+      }
+    }
+  }
+}
+// Host-side launcher for VecQuant8BatchMatMulKernel_faster_old (float16 only).
+//
+// Sizes are taken from the tensors as follows:
+//   vec : (batch, heads, vec_row, vec_height)
+//   mat : (.., .., height, width), reinterpreted as uint8_t
+// The kernel only ever atomicAdd()s into `mul`, so the caller presumably has
+// to pass it zero-initialised -- TODO confirm against the Python call site.
+void vecquant8matmul_batched_faster_old_cuda(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor scales,
+  torch::Tensor zeros
+) {
+  // The buffers below are reinterpreted as half* without any dispatch, so a
+  // float32/bfloat16 tensor would silently be read as garbage. Fail loudly.
+  TORCH_CHECK(
+    vec.scalar_type() == at::ScalarType::Half &&
+    mul.scalar_type() == at::ScalarType::Half &&
+    scales.scalar_type() == at::ScalarType::Half &&
+    zeros.scalar_type() == at::ScalarType::Half,
+    "vecquant8matmul_batched_faster_old_cuda: vec, mul, scales and zeros must be float16 tensors"
+  );
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2);
+  int vec_height = vec.size(3);
+  int height = mat.size(2);
+  int width = mat.size(3);
+  // One block per BLOCKWIDTH x BLOCKWIDTH tile of the (height, width) plane.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  VecQuant8BatchMatMulKernel_faster_old<<<blocks, threads>>>(
+    (half*) vec.data_ptr(),
+    (uint8_t*) mat.data_ptr(),
+    (half*) mul.data_ptr(),
+    (half*) scales.data_ptr(),
+    (half*) zeros.data_ptr(),
+    batch, heads, vec_row, vec_height, height, width
+  );
+}
+// Batched float16 vec x 8-bit-quantised-matrix product with one scale/zero
+// per output column w. Two consecutive rows of mat are dequantised into one
+// half2 so the dot product can use paired half arithmetic.
+//
+// Indexing used below:
+//   vec    [batch, heads, vec_row, vec_height]
+//   mat    [batch, heads, height, width]
+//   mul    [batch, heads, vec_row, width]   (accumulated with atomicAdd)
+//   scales [batch, heads, width]
+//   zeros  [batch, heads, width]
+//
+// Each block covers rows h .. h + BLOCKWIDTH - 1 of mat, each thread one
+// output column w. No thread returns early: every thread must reach the
+// __syncthreads() barriers, and columns past `width` are masked instead.
+__global__ void VecQuant8BatchMatMulKernel_faster_old(
+    const  half* __restrict__ vec,
+    const  uint8_t* __restrict__ mat,
+           half* __restrict__ mul,
+    const  half* __restrict__ scales,
+    const  half* __restrict__ zeros,
+    int batch,
+    int heads,
+    int vec_row,
+    int vec_height,
+    int height,
+    int width
+) {
+  int weight_total = batch * heads * height * width;
+  int input_total = batch * heads * vec_row * vec_height;
+  int out_total = batch * heads * vec_row * width;
+  int tid = threadIdx.x;
+  const int BLOCKWIDTH_half = BLOCKWIDTH/2;
+  int h = BLOCKWIDTH * blockIdx.x;
+  int w = BLOCKWIDTH * blockIdx.y + tid;
+  __shared__ half blockvec[BLOCKWIDTH];
+  int i = width * h + w;
+  int k;
+  half2 weight[BLOCKWIDTH_half];
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;
+      for (k = 0; k <  BLOCKWIDTH_half; ++k){
+        int row1 = h + 2 * k;  // row stored in the low half
+        int row2 = row1 + 1;   // row stored in the high half
+        int w_index1 = batch_shift * height * width + i + (2 * k * width);
+        int w_index2 = batch_shift * height * width + i + ((2 * k + 1) * width);
+        int zero_index = batch_shift * width + w;
+        if (w_index1 >= weight_total || w >= width || row1 >= height) {
+          weight[k] = __float2half2_rn(0);
+        } else {
+          float zero_f = __half2float(zeros[zero_index]);
+          float scale_f = __half2float(scales[zero_index]);
+          // The second row must be tested against `height` as well: for an
+          // odd height its index still lies inside the buffer (it is the
+          // first row of the next head) unless this is the very last head.
+          if (w_index2 >= weight_total || row2 >= height) {
+            half w_tmp1 = __float2half((as_unsigned(mat[w_index1]) - zero_f) * scale_f);
+            weight[k] = __halves2half2(w_tmp1, __float2half(0));
+          } else {
+            half w_tmp1 = __int2half_rn(as_unsigned(mat[w_index1]));
+            half w_tmp2 = __int2half_rn(as_unsigned(mat[w_index2]));
+            // (q - zero) * scale, evaluated as q * scale - scale * zero.
+            weight[k] = __hfma2(__halves2half2(w_tmp1,w_tmp2), __float2half2_rn(scale_f), __float2half2_rn(-(scale_f * zero_f)));
+          }
+        }
+      }
+      for (int vr = 0; vr < vec_row; ++vr){
+        float res=0;
+        // vec rows are vec_height elements long, so that is the row stride.
+        int vec_index = (batch_shift * vec_row + vr) * vec_height + h + tid;
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        // Only elements that belong to the current vec row are loaded.
+        if (h + tid < vec_height && vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = __float2half(0);
+        }
+        __syncthreads();  // blockvec complete before it is read
+        // w >= width would make out_index alias an element of another row.
+        if (w < width && out_index < out_total) {
+          for (k = 0; k <  BLOCKWIDTH_half; ++k){
+            half2 res2 = __hmul2(weight[k],__halves2half2(blockvec[2*k],blockvec[2*k+1]));
+            res += __low2float(res2) + __high2float(res2);
+          }
+          atomicAdd(&mul[out_index], __float2half(res));
+        }
+        __syncthreads();  // all reads done before blockvec is overwritten
+      }
+    }
+  }
+}
+// Host-side launcher for
+// VecQuant8BatchMatMulColumnCompressionKernel_faster_old (float16 only).
+//
+// The kernel only ever atomicAdd()s into `mul`, so the caller presumably has
+// to pass it zero-initialised -- TODO confirm against the Python call site.
+void vecquant8matmul_batched_column_compression_faster_old_cuda(
+  torch::Tensor vec,  // [batch,heads, seq_q, seq_v]
+  torch::Tensor mat, // [batch,heads, seq_v, head_dim]
+  torch::Tensor mul,  // [batch,heads, seq_q,head_dim]
+  torch::Tensor scales, // [batch,heads, seq_v] (the kernel indexes it by row of mat)
+  torch::Tensor zeros  // same layout as scales
+) {
+  // The buffers below are reinterpreted as half* without any dispatch, so a
+  // float32/bfloat16 tensor would silently be read as garbage. Fail loudly.
+  TORCH_CHECK(
+    vec.scalar_type() == at::ScalarType::Half &&
+    mul.scalar_type() == at::ScalarType::Half &&
+    scales.scalar_type() == at::ScalarType::Half &&
+    zeros.scalar_type() == at::ScalarType::Half,
+    "vecquant8matmul_batched_column_compression_faster_old_cuda: vec, mul, scales and zeros must be float16 tensors"
+  );
+  int batch = vec.size(0);
+  int heads = vec.size(1);
+  int vec_row = vec.size(2); //ql
+  int height = mat.size(2); //vl
+  int width = mat.size(3); //head_dim
+  // One block per BLOCKWIDTH x BLOCKWIDTH tile of the (height, width) plane.
+  dim3 blocks(
+    (height + BLOCKWIDTH - 1) / BLOCKWIDTH,
+    (width + BLOCKWIDTH - 1) / BLOCKWIDTH
+  );
+  dim3 threads(BLOCKWIDTH);
+  VecQuant8BatchMatMulColumnCompressionKernel_faster_old<<<blocks, threads>>>(
+    (half*) vec.data_ptr(),
+    (uint8_t*) mat.data_ptr(),
+    (half*) mul.data_ptr(),
+    (half*) scales.data_ptr(),
+    (half*) zeros.data_ptr(),
+    batch, heads, vec_row, height, width
+  );
+}
+// Batched float16 vec x 8-bit-quantised-matrix product in which every row of
+// mat has its own scale/zero. Two consecutive rows of mat are dequantised
+// into one half2 so the dot product can use paired half arithmetic.
+//
+// Each block covers rows h .. h + BLOCKWIDTH - 1 of mat, each thread one
+// output column w. No thread returns early: every thread must reach the
+// __syncthreads() barriers, and columns past `width` are masked instead.
+__global__ void VecQuant8BatchMatMulColumnCompressionKernel_faster_old(
+    const  half* __restrict__ vec,  // [batch,heads, seq_q, seq_v]
+    const  uint8_t* __restrict__ mat, // [batch,heads, seq_v, head_dim]
+           half* __restrict__ mul, // [batch,heads, seq_q,head_dim], accumulated with atomicAdd
+    const  half* __restrict__ scales, // [batch,heads, seq_v]
+    const  half* __restrict__ zeros, // [batch,heads, seq_v]
+    int batch,
+    int heads,
+    int vec_row, //seq_q
+    int height, //seq_v
+    int width //head_dim
+) {
+  int weight_total = batch * heads * height * width;
+  int input_total = batch * heads * vec_row * height;
+  int out_total = batch * heads * vec_row * width;
+  int tid = threadIdx.x;
+  int h = BLOCKWIDTH * blockIdx.x; // first row (seq_v index) of this block
+  int w = BLOCKWIDTH * blockIdx.y + tid; // output column (head_dim index) of this thread
+  __shared__ half blockvec[BLOCKWIDTH];
+  int k;
+  int i = width * h + w;
+  const int BLOCKWIDTH_half = BLOCKWIDTH/2;
+  half2 weight[BLOCKWIDTH_half];
+  for (int b = 0; b < batch; ++b){
+    for (int head = 0; head < heads; ++head){
+      int batch_shift = b * heads + head;
+      for (k = 0; k <  BLOCKWIDTH_half; ++k){
+        int row1 = h + 2 * k;  // row stored in the low half
+        int row2 = row1 + 1;   // row stored in the high half
+        int w_index1 = batch_shift * height * width + i + (2 * k) * width;
+        int w_index2 = batch_shift * height * width + i + ((2 * k + 1) * width);
+        int zero_index1 = batch_shift * height + row1;
+        int zero_index2 = batch_shift * height + row2;
+        // w >= width is masked here: such a thread would otherwise read the
+        // elements of a different row/column of mat.
+        if (w_index1 >= weight_total || w >= width || row1 >= height) {
+          weight[k]=__float2half2_rn(0);
+        } else {
+          // The second row must be tested against `height` as well: for an
+          // odd height its index still lies inside the buffers (it is the
+          // first row of the next head) unless this is the very last head.
+          if (w_index2 >= weight_total || row2 >= height) {
+            half w_tmp1 = __float2half((as_unsigned(mat[w_index1]) - __half2float(zeros[zero_index1]))* __half2float(scales[zero_index1]));
+            weight[k] = __halves2half2(w_tmp1, __float2half(0));
+          } else {
+            half w_tmp1 = __int2half_rn(as_unsigned(mat[w_index1]));
+            half w_tmp2 = __int2half_rn(as_unsigned(mat[w_index2]));
+            half zero1=zeros[zero_index1];
+            half zero2=zeros[zero_index2];
+            half scale1=scales[zero_index1];
+            half scale2=scales[zero_index2];
+            // (q - zero) * scale for both rows of the pair at once.
+            weight[k] = __hmul2(__hsub2(__halves2half2(w_tmp1,w_tmp2), __halves2half2(zero1,zero2)),__halves2half2(scale1,scale2));
+          }
+        }
+      }
+      for (int vr = 0; vr < vec_row; ++vr){
+        float res=0;
+        int vec_index = (batch_shift * vec_row + vr) * height + h + tid;
+        int out_index = (batch_shift * vec_row + vr) * width + w;
+        // Only elements that belong to the current vec row are loaded.
+        if (h + tid < height && vec_index < input_total) {
+            blockvec[tid] = vec[vec_index];
+        } else {
+            blockvec[tid] = __float2half(0);
+        }
+        __syncthreads();  // blockvec complete before it is read
+        // w >= width would make out_index alias an element of another row.
+        if (w < width && out_index < out_total) {
+            for (k = 0; k <  BLOCKWIDTH_half; ++k){
+                half2 res2 = __hmul2(weight[k],__halves2half2(blockvec[2*k],blockvec[2*k+1]));
+                res += __low2float(res2) + __high2float(res2);
+            }
+            atomicAdd(&mul[out_index], __float2half(res));
+        }
+        __syncthreads();  // all reads done before blockvec is overwritten
+      }
+    }
+  }
+}

kernels/cpp_kernels.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from torch.utils import cpp_extension
+import pathlib
+import os
+import subprocess
+def _get_cuda_bare_metal_version(cuda_dir):
+    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
+                                         universal_newlines=True)
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    release = output[release_idx].split(".")
+    bare_metal_major = release[0]
+    bare_metal_minor = release[1][0]
+    return raw_output, bare_metal_major, bare_metal_minor
+def _create_build_dir(buildpath):
+    try:
+        os.mkdir(buildpath)
+    except OSError:
+        if not os.path.isdir(buildpath):
+            print(f"Creation of the build directory {buildpath} failed")
+# Check if cuda 11 is installed for compute capability 8.0
+cc_flag = []
+_, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
+if int(bare_metal_major) >= 11:
+    cc_flag.append('-gencode')
+    cc_flag.append('arch=compute_80,code=sm_80')
+    if int(bare_metal_minor) >= 7:
+        cc_flag.append('-gencode')
+        cc_flag.append('arch=compute_90,code=sm_90')
+# Build path
+srcpath = pathlib.Path(__file__).parent.absolute()
+buildpath = srcpath / 'build'
+_create_build_dir(buildpath)
+def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
+    return cpp_extension.load(
+        name=name,
+        sources=sources,
+        build_directory=buildpath,
+        extra_cflags=['-O3', ],
+        extra_cuda_cflags=['-O3',
+                           '-gencode', 'arch=compute_70,code=sm_70',
+                           '--use_fast_math'] + extra_cuda_flags + cc_flag,
+        verbose=1
+    )
+extra_flags = []
+cache_autogptq_cuda_256_sources = ["./kernels/cache_autogptq_cuda_256.cpp",
+           "./kernels/cache_autogptq_cuda_kernel_256.cu"]
+cache_autogptq_cuda_256 = _cpp_extention_load_helper("cache_autogptq_cuda_256", cache_autogptq_cuda_256_sources, extra_flags)

modeling_qwen.py CHANGED Viewed

@@ -31,6 +31,7 @@ try:
 except ImportError:
     rearrange = None
 from torch import nn
 SUPPORT_CUDA = torch.cuda.is_available()
 SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
@@ -75,7 +76,6 @@ apply_rotary_emb_func = None
 rms_norm = None
 flash_attn_unpadded_func = None
 def _import_flash_attn():
     global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func
     try:
@@ -112,6 +112,31 @@ def _import_flash_attn():
             "https://github.com/Dao-AILab/flash-attention"
         )
 class FlashSelfAttention(torch.nn.Module):
     def __init__(
@@ -254,19 +279,51 @@ class QWenAttention(nn.Module):
         self.register_buffer("logn_tensor", logn_tensor, persistent=False)
         self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
     def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
-        attn_weights = torch.matmul(query, key.transpose(-1, -2))
         if self.scale_attn_weights:
             attn_weights = attn_weights / torch.full(
                 [],
-                value.size(-1) ** 0.5,
                 dtype=attn_weights.dtype,
                 device=attn_weights.device,
             )
-        query_length, key_length = query.size(-2), key.size(-2)
         causal_mask = registered_causal_mask[
             :, :, key_length - query_length : key_length, :key_length
         ]
@@ -283,13 +340,32 @@ class QWenAttention(nn.Module):
         attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
-        attn_weights = attn_weights.type(value.dtype)
         attn_weights = self.attn_dropout(attn_weights)
         if head_mask is not None:
             attn_weights = attn_weights * head_mask
-        attn_output = torch.matmul(attn_weights, value)
         attn_output = attn_output.transpose(1, 2)
         return attn_output, attn_weights
@@ -373,7 +449,6 @@ class QWenAttention(nn.Module):
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
     ):
         mixed_x_layer = self.c_attn(hidden_states)
         query, key, value = mixed_x_layer.split(self.split_size, dim=2)
@@ -405,10 +480,34 @@ class QWenAttention(nn.Module):
                 query = torch.cat(query_list, dim=0)
                 key = torch.cat(key_list, dim=0)
         if layer_past is not None:
             past_key, past_value = layer_past[0], layer_past[1]
-            key = torch.cat((past_key, key), dim=1)
-            value = torch.cat((past_value, value), dim=1)
         if use_cache:
             present = (key, value)
@@ -416,8 +515,12 @@ class QWenAttention(nn.Module):
             present = None
         if self.use_logn_attn and not self.training:
-            seq_start = key.size(1) - query.size(1)
-            seq_end = key.size(1)
             logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
             query = query * logn_tensor.expand_as(query)
@@ -435,8 +538,9 @@ class QWenAttention(nn.Module):
         else:
             query = query.permute(0, 2, 1, 3)
-            key = key.permute(0, 2, 1, 3)
-            value = value.permute(0, 2, 1, 3)
             if (
                 registered_causal_mask is None
                 and self.use_flash_attn
@@ -597,6 +701,7 @@ class QWenModel(QWenPreTrainedModel):
         self.vocab_size = config.vocab_size
         self.num_hidden_layers = config.num_hidden_layers
         self.embed_dim = config.hidden_size
         self.gradient_checkpointing = False
         self.use_dynamic_ntk = config.use_dynamic_ntk
@@ -721,8 +826,10 @@ class QWenModel(QWenPreTrainedModel):
             past_length = 0
             past_key_values = tuple([None] * len(self.h))
         else:
-            past_length = past_key_values[0][0].size(-2)
         if position_ids is None:
             position_ids = torch.arange(
                 past_length,
@@ -750,7 +857,10 @@ class QWenModel(QWenPreTrainedModel):
         kv_seq_len = hidden_states.size()[1]
         if past_key_values[0] is not None:
             # past key values[0][0] shape: bs * seq_len * head_num * dim
-            kv_seq_len += past_key_values[0][0].shape[1]
         if self.training or not self.use_dynamic_ntk:
             ntk_alpha_list = [1.0]
@@ -907,6 +1017,12 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         if config.use_flash_attn:
             _import_flash_attn()
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -918,6 +1034,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             self.lm_head.half()
         self.post_init()
     def get_output_embeddings(self):
         return self.lm_head

 except ImportError:
     rearrange = None
 from torch import nn
+from kernels.cpp_kernels import cache_autogptq_cuda_256
 SUPPORT_CUDA = torch.cuda.is_available()
 SUPPORT_BF16 = SUPPORT_CUDA and torch.cuda.is_bf16_supported()
 rms_norm = None
 flash_attn_unpadded_func = None
 def _import_flash_attn():
     global apply_rotary_emb_func, rms_norm, flash_attn_unpadded_func
     try:
             "https://github.com/Dao-AILab/flash-attention"
         )
def quantize_cache_v(fdata, bits, qmax, qmin):
    """Asymmetrically quantize a floating-point cache tensor to ``uint8``.

    One scale / zero-point pair is computed per index of the two leading
    dimensions: the min and max are taken over everything from dimension 2
    onwards. Callers in this file pass the tensor already permuted with
    ``permute(0, 2, 1, 3)``.

    Args:
        fdata: Floating-point tensor with at least three dimensions.
        bits: Accepted for interface compatibility; not used, the output is
            always ``torch.uint8``.
        qmax: Scalar tensor holding the largest quantized value.
        qmin: Scalar tensor holding the smallest quantized value.

    Returns:
        A tuple ``(qdata, scale, zero)`` where ``qdata`` is the contiguous
        ``uint8`` tensor shaped like ``fdata``, and ``scale`` / ``zero`` are
        repeated along dimension 2 so that ``scale * (qdata - zero)``
        approximately reconstructs ``fdata``.
    """
    device = fdata.device
    shape = fdata.shape
    fdata_cal = torch.flatten(fdata, 2)
    fmax = torch.amax(fdata_cal, dim=-1, keepdim=True)
    fmin = torch.amin(fdata_cal, dim=-1, keepdim=True)
    # Compute params
    if qmax.device != fmax.device:
        qmax = qmax.to(device)
        qmin = qmin.to(device)
    scale = (fmax - fmin) / (qmax - qmin)
    # A constant slice (or a range that underflows) gives scale == 0, which
    # would turn the divisions below into inf/NaN. A unit scale keeps the
    # round trip exact for that case: every value maps to qmin and
    # dequantizes back to fmin.
    scale = torch.where(scale == 0, torch.ones_like(scale), scale)
    zero = qmin - fmin / scale
    scale = scale.unsqueeze(-1).repeat(1, 1, shape[2], 1).contiguous()
    zero = zero.unsqueeze(-1).repeat(1, 1, shape[2], 1).contiguous()
    # Quantize
    res_data = fdata / scale + zero
    qdata = torch.clamp(res_data, qmin, qmax).to(torch.uint8)
    return qdata.contiguous(), scale, zero
def dequantize_cache_torch(qdata, scale, zero):
    """Reconstruct floating-point values as ``scale * (qdata - zero)``.

    Args:
        qdata: Quantized tensor.
        scale: Scale tensor, broadcastable against ``qdata``.
        zero: Zero-point tensor, broadcastable against ``qdata``.

    Returns:
        The dequantized tensor.
    """
    shifted = qdata - zero
    return scale * shifted
 class FlashSelfAttention(torch.nn.Module):
     def __init__(
         self.register_buffer("logn_tensor", logn_tensor, persistent=False)
         self.attn_dropout = nn.Dropout(config.attn_dropout_prob)
+        self.use_cache_quantization = config.use_cache_quantization if hasattr(config, 'use_cache_quantization') else False
+        self.use_cache_kernel = config.use_cache_kernel if hasattr(config,'use_cache_kernel') else False
+        cache_dtype = torch.float
+        if self.bf16:
+            cache_dtype=torch.bfloat16
+        elif config.fp16:
+            cache_dtype = torch.float16
+        self.cache_qmax = torch.tensor(torch.iinfo(torch.uint8).max, dtype=cache_dtype)
+        self.cache_qmin = torch.tensor(torch.iinfo(torch.uint8).min, dtype=cache_dtype)
     def _attn(self, query, key, value, registered_causal_mask, attention_mask=None, head_mask=None):
+        device = query.device
+        if self.use_cache_quantization:
+            qk, qk_scale, qk_zero = key
+            if self.use_cache_kernel:
+                shape = query.shape[:-1] + (qk.shape[-2],)
+                attn_weights = torch.zeros(shape, dtype=torch.float16, device=device)
+                cache_autogptq_cuda_256.vecquant8matmul_batched_faster_old(
+                    query.contiguous() if query.dtype == torch.float16 else query.to(torch.float16).contiguous(),
+                    qk.transpose(-1, -2).contiguous(),
+                    attn_weights,
+                    qk_scale.contiguous() if qk_scale.dtype == torch.float16 else qk_scale.to(torch.float16).contiguous(),
+                    qk_zero.contiguous()if qk_zero.dtype == torch.float16 else qk_zero.to(torch.float16).contiguous())
+                # attn_weights = attn_weights.to(query.dtype).contiguous()
+            else:
+                key = dequantize_cache_torch(qk, qk_scale, qk_zero)
+                attn_weights = torch.matmul(query, key.transpose(-1, -2))
+        else:
+            attn_weights = torch.matmul(query, key.transpose(-1, -2))
         if self.scale_attn_weights:
+            if self.use_cache_quantization:
+                size_temp = value[0].size(-1)
+            else:
+                size_temp = value.size(-1)
             attn_weights = attn_weights / torch.full(
                 [],
+                size_temp ** 0.5,
                 dtype=attn_weights.dtype,
                 device=attn_weights.device,
             )
+        if self.use_cache_quantization:
+            query_length, key_length = query.size(-2), key[0].size(-2)
+        else:
+            query_length, key_length = query.size(-2), key.size(-2)
         causal_mask = registered_causal_mask[
             :, :, key_length - query_length : key_length, :key_length
         ]
         attn_weights = nn.functional.softmax(attn_weights.float(), dim=-1)
+        attn_weights = attn_weights.type(query.dtype)
         attn_weights = self.attn_dropout(attn_weights)
         if head_mask is not None:
             attn_weights = attn_weights * head_mask
+        if self.use_cache_quantization:
+            qv, qv_scale, qv_zero = value
+            if self.use_cache_kernel:
+                shape = attn_weights.shape[:-1] + (query.shape[-1],)
+                attn_output = torch.zeros(shape, dtype=torch.float16, device=device)
+                cache_autogptq_cuda_256.vecquant8matmul_batched_column_compression_faster_old(
+                    attn_weights.contiguous() if attn_weights.dtype == torch.float16 else attn_weights.to(torch.float16).contiguous(),
+                    qv.contiguous(),  # dtype: int32
+                    attn_output,
+                    qv_scale.contiguous() if qv_scale.dtype == torch.float16 else qv_scale.to(torch.float16).contiguous(),
+                    qv_zero.contiguous() if qv_zero.dtype == torch.float16 else qv_zero.to(torch.float16).contiguous())
+                if attn_output.dtype != query.dtype:
+                    attn_output = attn_output.to(query.dtype)
+                    attn_weights = attn_weights.to(query.dtype)
+            else:
+                value = dequantize_cache_torch(qv, qv_scale, qv_zero)
+                attn_output = torch.matmul(attn_weights, value)
+        else:
+            attn_output = torch.matmul(attn_weights, value)
         attn_output = attn_output.transpose(1, 2)
         return attn_output, attn_weights
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
     ):
         mixed_x_layer = self.c_attn(hidden_states)
         query, key, value = mixed_x_layer.split(self.split_size, dim=2)
                 query = torch.cat(query_list, dim=0)
                 key = torch.cat(key_list, dim=0)
+        if self.use_cache_quantization:
+            key = quantize_cache_v(key.permute(0, 2, 1, 3),
+                                       bits=8,
+                                       qmin=self.cache_qmin,
+                                       qmax=self.cache_qmax)
+            value = quantize_cache_v(value.permute(0, 2, 1, 3),
+                                         bits=8,
+                                         qmin=self.cache_qmin,
+                                         qmax=self.cache_qmax)
         if layer_past is not None:
             past_key, past_value = layer_past[0], layer_past[1]
+            if self.use_cache_quantization:
+                # use_cache_quantization:
+                # present=((q_key,key_scale,key_zero_point),
+                #          (q_value,value_scale,value_zero_point))
+                key = (torch.cat((past_key[0], key[0]), dim=2),
+                       torch.cat((past_key[1], key[1]), dim=2),
+                       torch.cat((past_key[2], key[2]), dim=2))
+                value = (torch.cat((past_value[0], value[0]), dim=2),
+                         torch.cat((past_value[1], value[1]), dim=2),
+                         torch.cat((past_value[2], value[2]), dim=2))
+            else:
+                # not use_cache_quantization:
+                # present=(key,value)
+                key = torch.cat((past_key, key), dim=1)
+                value = torch.cat((past_value, value), dim=1)
         if use_cache:
             present = (key, value)
             present = None
         if self.use_logn_attn and not self.training:
+            if self.use_cache_quantization:
+                seq_start = key[0].size(2) - query.size(1)
+                seq_end = key[0].size(2)
+            else:
+                seq_start = key.size(1) - query.size(1)
+                seq_end = key.size(1)
             logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :]
             query = query * logn_tensor.expand_as(query)
         else:
             query = query.permute(0, 2, 1, 3)
+            if not self.use_cache_quantization:
+                key = key.permute(0, 2, 1, 3)
+                value = value.permute(0, 2, 1, 3)
             if (
                 registered_causal_mask is None
                 and self.use_flash_attn
         self.vocab_size = config.vocab_size
         self.num_hidden_layers = config.num_hidden_layers
         self.embed_dim = config.hidden_size
+        self.use_cache_quantization = self.config.use_cache_quantization if hasattr(self.config, 'use_cache_quantization') else False
         self.gradient_checkpointing = False
         self.use_dynamic_ntk = config.use_dynamic_ntk
             past_length = 0
             past_key_values = tuple([None] * len(self.h))
         else:
+            if self.use_cache_quantization:
+                past_length = past_key_values[0][0][0].size(2)
+            else:
+                past_length = past_key_values[0][0].size(-2)
         if position_ids is None:
             position_ids = torch.arange(
                 past_length,
         kv_seq_len = hidden_states.size()[1]
         if past_key_values[0] is not None:
             # past key values[0][0] shape: bs * seq_len * head_num * dim
+            if self.use_cache_quantization:
+                kv_seq_len += past_key_values[0][0][0].shape[2]
+            else:
+                kv_seq_len += past_key_values[0][0].shape[1]
         if self.training or not self.use_dynamic_ntk:
             ntk_alpha_list = [1.0]
         if config.use_flash_attn:
             _import_flash_attn()
+        if hasattr(config, 'use_cache_quantization') and config.use_cache_quantization:
+            config.use_flash_attn = False
+            if hasattr(config, 'use_cache_kernel') and config.use_cache_kernel:
+                from kernels.cpp_kernels import cache_autogptq_cuda_256
         self.transformer = QWenModel(config)
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
             self.lm_head.half()
         self.post_init()
     def get_output_embeddings(self):
         return self.lm_head