# BagelMix-8x7B-2b-HQQ / HQQbagelmix_def.py
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
model_path = "Undi95/BagelMix-8x7B"  # I actually used a local directory here, not the HF identifier, but in principle you can use the hub ID if your connection is fast and you trust the cache.
model = HQQModelForCausalLM.from_pretrained(model_path)
# Quantization parameters
from hqq.core.quantize import BaseQuantizeConfig
attn_params = BaseQuantizeConfig(nbits=4, group_size=64, quant_zero=True, quant_scale=True)  # 4-bit, group size 64 ("4g64")
attn_params['scale_quant_params']['group_size'] = 256  # coarser grouping for the quantized scales
experts_params = BaseQuantizeConfig(nbits=2, group_size=16, quant_zero=True, quant_scale=True)  # 2-bit, group size 16 ("2g16")
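# Note (added): BaseQuantizeConfig returns a plain nested dict of sub-configs,
# which is why the scale group_size can be tweaked above by indexing into
# 'scale_quant_params'; print(attn_params) shows the full structure.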
quant_config = {}
# Attention projections: 4-bit
quant_config['self_attn.q_proj'] = attn_params
quant_config['self_attn.k_proj'] = attn_params
quant_config['self_attn.v_proj'] = attn_params
quant_config['self_attn.o_proj'] = attn_params
# Expert (MoE) layers: 2-bit
quant_config['block_sparse_moe.experts.w1'] = experts_params
quant_config['block_sparse_moe.experts.w2'] = experts_params
quant_config['block_sparse_moe.experts.w3'] = experts_params
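# Note (added, as I understand the hqq engine): these keys are per-architecture
# linear-layer tags for Mixtral-style models, so each entry applies its config to
# that projection in every transformer layer (and, for w1/w2/w3, in every expert).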
# Quantize and save
save_path = "models/BagelMix-8x7B-2g16-4g64-HQQ/"
model.quantize_model(quant_config=quant_config)
model.save_quantized(save_path)
# Brought to you by python -i and a 200 GB swap device; code adapted from the last upload.
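
# Minimal loading sketch (my addition, not from the original upload): a hedged
# example of reloading the result for inference with the usual hqq API. I assume
# here that the tokenizer should come from the original model path rather than
# the quantized directory.
tokenizer = AutoTokenizer.from_pretrained(model_path)
qmodel = HQQModelForCausalLM.from_quantized(save_path)
inputs = tokenizer("Hello, world!", return_tensors="pt").to(qmodel.device)
print(tokenizer.decode(qmodel.generate(**inputs, max_new_tokens=32)[0]))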