Tags: Text Generation, Transformers, mixtral, Not-For-All-Audiences, nsfw, mergekit, Merge, HQQ, 2bit, conversational, Inference Endpoints

from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer

# A local directory was used here rather than the Hub identifier, but the Hub ID
# works just as well if your connection is fast enough and you trust the cache.
model_path = "Undi95/BagelMix-8x7B"
model = HQQModelForCausalLM.from_pretrained(model_path)

# Quantization parameters
from hqq.core.quantize import BaseQuantizeConfig
attn_params = BaseQuantizeConfig(nbits=4, group_size=64, quant_zero=True, quant_scale=True)      # 4-bit, group size 64
attn_params['scale_quant_params']['group_size'] = 256
experts_params = BaseQuantizeConfig(nbits=2, group_size=16, quant_zero=True, quant_scale=True)   # 2-bit, group size 16

quant_config = {}
# Attention projections: 4-bit
quant_config['self_attn.q_proj'] = attn_params
quant_config['self_attn.k_proj'] = attn_params
quant_config['self_attn.v_proj'] = attn_params
quant_config['self_attn.o_proj'] = attn_params
# MoE expert weights: 2-bit
quant_config['block_sparse_moe.experts.w1'] = experts_params
quant_config['block_sparse_moe.experts.w2'] = experts_params
quant_config['block_sparse_moe.experts.w3'] = experts_params

# Quantize and save
save_path = "models/BagelMix-8x7B-2g16-4g64-HQQ/"
model.quantize_model(quant_config=quant_config)
model.save_quantized(save_path)

# Run interactively (python -i) with a large swap device (~200 GB); code adapted from the previous upload.
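For reference, here is a minimal loading/inference sketch. It assumes the same hqq engine API used above, where from_quantized is the loading counterpart to save_quantized; the tokenizer source, prompt, and generation settings are illustrative only.

# Minimal sketch: load the HQQ-quantized weights back and generate (assumptions noted above).
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer

save_path = "models/BagelMix-8x7B-2g16-4g64-HQQ/"
model = HQQModelForCausalLM.from_quantized(save_path)                # load quantized model
tokenizer = AutoTokenizer.from_pretrained("Undi95/BagelMix-8x7B")    # tokenizer from the base model

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))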