Tags: Text Generation, Transformers, mixtral, Not-For-All-Audiences, nsfw, mergekit, Merge, HQQ, 2bit, conversational, Inference Endpoints

from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer

# A local directory was used here rather than the Hub identifier, but the Hub ID
# works just as well if your connection is fast enough and you trust the cache.
model_path = "Undi95/BagelMix-8x7B"
model = HQQModelForCausalLM.from_pretrained(model_path)

# Quantization parameters
from hqq.core.quantize import BaseQuantizeConfig
attn_params = BaseQuantizeConfig(nbits=4, group_size=64, quant_zero=True, quant_scale=True)      # 4-bit, group size 64
attn_params['scale_quant_params']['group_size'] = 256
experts_params = BaseQuantizeConfig(nbits=2, group_size=16, quant_zero=True, quant_scale=True)   # 2-bit, group size 16

quant_config = {}
# Attention projections: 4-bit
quant_config['self_attn.q_proj'] = attn_params
quant_config['self_attn.k_proj'] = attn_params
quant_config['self_attn.v_proj'] = attn_params
quant_config['self_attn.o_proj'] = attn_params
# MoE expert weights: 2-bit
quant_config['block_sparse_moe.experts.w1'] = experts_params
quant_config['block_sparse_moe.experts.w2'] = experts_params
quant_config['block_sparse_moe.experts.w3'] = experts_params

# Quantize and save
save_path = "models/BagelMix-8x7B-2g16-4g64-HQQ/"
model.quantize_model(quant_config=quant_config)
model.save_quantized(save_path)

# Run interactively (python -i) with a large swap device (~200 GB); code adapted from the previous upload.
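For reference, here is a minimal loading/inference sketch. It assumes the same hqq engine API used above, where from_quantized is the loading counterpart to save_quantized; the tokenizer source, prompt, and generation settings are illustrative only.

# Minimal sketch: load the HQQ-quantized weights back and generate (assumptions noted above).
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer

save_path = "models/BagelMix-8x7B-2g16-4g64-HQQ/"
model = HQQModelForCausalLM.from_quantized(save_path)                # load quantized model
tokenizer = AutoTokenizer.from_pretrained("Undi95/BagelMix-8x7B")    # tokenizer from the base model

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))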