saved model reload error 'LlamaAttention' object has no attribute 'qkv_proj'

#3
by lilsyoss - opened

import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Build the CUDA device string, e.g. "cuda:0".
DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE
device = torch.device(CUDA_DEVICE)

# Load the GPTQ-quantized model and its tokenizer from the Hub.
model_path = 'FlagAlpha/Llama2-Chinese-13b-Chat-4bit'
model = AutoGPTQForCausalLM.from_quantized(model_path, device=CUDA_DEVICE)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

# Save the quantized weights to a local directory for later reuse.
save_path = "/root/autodl-tmp/hf_repo/llama2_chinese"
model.save_quantized(save_path)

Then, when I reload the saved model, the following error occurs.
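
(The reload step, reconstructed from the traceback further down — paths and variable names are taken from that trace:)

model_path = "/root/autodl-tmp/hf_repo/llama2_chinese"
model = AutoGPTQForCausalLM.from_quantized(model_path, device=CUDA_DEVICE)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)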

LlamaFamily org

Can you send me the full error logs, along with your transformers and auto_gptq versions?

transformers 4.31.0
auto-gptq 0.4.1+cu118

I installed auto-gptq with the command below:
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
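
To double-check which wheel actually got installed, something like this works (nothing here is specific to this issue):

!pip show transformers auto-gptq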

Full error log:

AttributeError Traceback (most recent call last)
Cell In[1], line 15
9 model_path = "/root/autodl-tmp/hf_repo/llama2_chinese"
12 # gptq_config = GPTQConfig(bits=4, disable_exllama=False)
---> 15 model = AutoGPTQForCausalLM.from_quantized(model_path,device=CUDA_DEVICE)
16 model = model.eval()
17 tokenizer = AutoTokenizer.from_pretrained(model_path,use_fast=False)

File ~/miniconda3/lib/python3.8/site-packages/auto_gptq/modeling/auto.py:108, in AutoGPTQForCausalLM.from_quantized(cls, model_name_or_path, device_map, max_memory, device, low_cpu_mem_usage, use_triton, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, disable_exllama, **kwargs)
102 # TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
103 keywords = {
104 key: kwargs[key]
105 for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
106 if key in kwargs
107 }
--> 108 return quant_func(
109 model_name_or_path=model_name_or_path,
110 device_map=device_map,
111 max_memory=max_memory,
112 device=device,
113 low_cpu_mem_usage=low_cpu_mem_usage,
114 use_triton=use_triton,
115 inject_fused_attention=inject_fused_attention,
116 inject_fused_mlp=inject_fused_mlp,
117 use_cuda_fp16=use_cuda_fp16,
118 quantize_config=quantize_config,
119 model_basename=model_basename,
120 use_safetensors=use_safetensors,
121 trust_remote_code=trust_remote_code,
122 warmup_triton=warmup_triton,
123 trainable=trainable,
124 disable_exllama=disable_exllama,
125 **keywords
126 )

File ~/miniconda3/lib/python3.8/site-packages/auto_gptq/modeling/_base.py:875, in BaseGPTQForCausalLM.from_quantized(cls, model_name_or_path, device_map, max_memory, device, low_cpu_mem_usage, use_triton, torch_dtype, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, disable_exllama, **kwargs)
872 if low_cpu_mem_usage:
873 make_sure_no_tensor_in_meta_device(model, use_triton, quantize_config.desc_act, quantize_config.group_size, bits=quantize_config.bits)
--> 875 accelerate.utils.modeling.load_checkpoint_in_model(
876 model,
877 checkpoint=model_save_name,
878 device_map=device_map,
879 offload_state_dict=True,
880 offload_buffers=True
881 )
882 model = simple_dispatch_model(model, device_map)
884 # == step4: set seqlen == #

File ~/miniconda3/lib/python3.8/site-packages/accelerate/utils/modeling.py:1336, in load_checkpoint_in_model(model, checkpoint, device_map, offload_folder, dtype, offload_state_dict, offload_buffers, keep_in_fp32_modules, offload_8bit_bnb)
1334 offload_weight(param, param_name, state_dict_folder, index=state_dict_index)
1335 else:
-> 1336 set_module_tensor_to_device(
1337 model,
1338 param_name,
1339 param_device,
1340 value=param,
1341 dtype=new_dtype,
1342 fp16_statistics=fp16_statistics,
1343 )
1345 # Force Python to clean up.
1346 del checkpoint

File ~/miniconda3/lib/python3.8/site-packages/accelerate/utils/modeling.py:255, in set_module_tensor_to_device(module, tensor_name, device, value, dtype, fp16_statistics)
253 splits = tensor_name.split(".")
254 for split in splits[:-1]:
--> 255 new_module = getattr(module, split)
256 if new_module is None:
257 raise ValueError(f"{module} has no attribute {split}.")

File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1614, in Module.__getattr__(self, name)
1612 if name in modules:
1613 return modules[name]
-> 1614 raise AttributeError("'{}' object has no attribute '{}'".format(
1615 type(self).__name__, name))

AttributeError: 'LlamaAttention' object has no attribute 'qkv_proj'
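
A likely cause, judging from the trace: from_quantized() injects fused attention by default (inject_fused_attention=True in the signature above), which merges LlamaAttention's separate q_proj/k_proj/v_proj into a single qkv_proj module. If the model is saved in that fused state, the checkpoint carries qkv_proj keys that no longer match the plain LlamaAttention rebuilt on reload. A minimal sketch of a possible workaround, assuming that is the cause, is to disable the fusion before saving:

# Load without fusing q/k/v into a single qkv_proj module, so that
# save_quantized() writes a state dict in the standard LlamaAttention layout.
model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    device=CUDA_DEVICE,
    inject_fused_attention=False,  # assumption: the fused qkv_proj is what ends up in the checkpoint
)
model.save_quantized(save_path)

# The re-saved checkpoint should then reload without the qkv_proj mismatch.
model = AutoGPTQForCausalLM.from_quantized(save_path, device=CUDA_DEVICE)

The fused kernels are an inference-speed optimization, so loading with inject_fused_attention=False may cost some throughput; since the injection happens after the checkpoint is loaded, it can be left enabled when reloading the re-saved (unfused) checkpoint for inference.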
