---
# Recipe with a single stage (`test_stage`) whose `obcq_modifiers` group
# configures three modifiers: SmoothQuantModifier, QuantizationModifier and
# SparseGPTModifier.
#
# NOTE(review): the nesting below was reconstructed from key order and the
# sequence markers in the flattened original; confirm it against the schema of
# the tool that consumes this recipe.
test_stage:
  obcq_modifiers:
    SmoothQuantModifier:
      smoothing_strength: 0.8
      # Each entry is a two-item pair: [list of patterns, single pattern].
      # Presumably the 're:' prefix marks a regular expression over module
      # names -- verify against the consumer.
      mappings:
        - - ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj']
          - 're:.*input_layernorm'
        - - ['re:.*gate_proj', 're:.*up_proj']
          - 're:.*post_attention_layernorm'
        - - ['re:.*down_proj']
          - 're:.*up_proj'

    QuantizationModifier:
      # Mix of class-like names and fully qualified layer names; order is
      # preserved from the original list.
      ignore:
        - LlamaRotaryEmbedding
        - LlamaRMSNorm
        - SiLUActivation
        - model.layers.30.mlp.down_proj
        - model.layers.1.mlp.down_proj
        - model.layers.0.mlp.down_proj
        - model.layers.4.mlp.down_proj
        - model.layers.8.mlp.down_proj
        - MatMulOutput_QK
        - MatMulOutput_PV
        - MatMulLeftInput_QK
        - MatMulLeftInput_PV
        - MatMulRightInput_QK
        - MatMulRightInput_PV
        - QuantizableMatMul
      post_oneshot_calibration: true
      scheme_overrides:
        Linear:
          weights:
            num_bits: 8
            symmetric: true
            strategy: channel
        Embedding:
          # Explicit null (not an omitted key), as in the original.
          input_activations: null
          weights:
            num_bits: 8
            symmetric: false

    SparseGPTModifier:
      sparsity: 0.0
      block_size: 128
      sequential_update: false
      quantize: true
      percdamp: 0.01
      prunen: 0
      prunem: 0
      # model.layers.0 through model.layers.31, followed by lm_head.
      targets:
        - model.layers.0
        - model.layers.1
        - model.layers.2
        - model.layers.3
        - model.layers.4
        - model.layers.5
        - model.layers.6
        - model.layers.7
        - model.layers.8
        - model.layers.9
        - model.layers.10
        - model.layers.11
        - model.layers.12
        - model.layers.13
        - model.layers.14
        - model.layers.15
        - model.layers.16
        - model.layers.17
        - model.layers.18
        - model.layers.19
        - model.layers.20
        - model.layers.21
        - model.layers.22
        - model.layers.23
        - model.layers.24
        - model.layers.25
        - model.layers.26
        - model.layers.27
        - model.layers.28
        - model.layers.29
        - model.layers.30
        - model.layers.31
        - lm_head
      target_ids: [attention_mask, position_ids]
|