Files changed (1)
  1. README.md +5 -3
README.md CHANGED
@@ -24,7 +24,8 @@ python3 quantize_quark.py \
24
  --quant_scheme w_fp8_a_fp8 \
25
  --kv_cache_dtype fp8 \
26
  --num_calib_data 128 \
27
- --model_export quark_safetensors
 
28
  # If model size is too large for single GPU, please use multi GPU instead.
29
  python3 quantize_quark.py \
30
  --model_dir $MODEL_DIR \
@@ -33,6 +34,7 @@ python3 quantize_quark.py \
33
  --kv_cache_dtype fp8 \
34
  --num_calib_data 128 \
35
  --model_export quark_safetensors \
 
36
  --multi_gpu
37
  ```
38
  ## Deployment
@@ -53,9 +55,9 @@ The quantization evaluation results are conducted in pseudo-quantization mode, w
53
  <tr>
54
  <td>Perplexity-wikitext2
55
  </td>
56
- <td>5.3164
57
  </td>
58
- <td>5.4323
59
  </td>
60
  </tr>
61
  </table>
 
24
  --quant_scheme w_fp8_a_fp8 \
25
  --kv_cache_dtype fp8 \
26
  --num_calib_data 128 \
27
+ --model_export quark_safetensors \
28
+ --no_weight_matrix_merge
29
  # If model size is too large for single GPU, please use multi GPU instead.
30
  python3 quantize_quark.py \
31
  --model_dir $MODEL_DIR \
 
34
  --kv_cache_dtype fp8 \
35
  --num_calib_data 128 \
36
  --model_export quark_safetensors \
37
+ --no_weight_matrix_merge \
38
  --multi_gpu
39
  ```
40
  ## Deployment
 
55
  <tr>
56
  <td>Perplexity-wikitext2
57
  </td>
58
+ <td>3.7797
59
  </td>
60
+ <td>3.8561
61
  </td>
62
  </tr>
63
  </table>