Error running the code (#29)
Opened by Andyrasika
```python
# https://huggingface.co/TheBloke/stable-vicuna-13B-GPTQ/discussions/19
import gc

import torch
from transformers import AutoTokenizer, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model

quantized_model_dir = "/workspace/models/TheBloke_stable-vicuna-13B-GPTQ"
model_basename = "stable-vicuna-13B-GPTQ-4bit.compat.no-act-order"
use_strict = False
use_triton = False

tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)

quantize_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False,
)

model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    use_safetensors=True,
    strict=use_strict,
    model_basename=model_basename,
    device="cuda:0",
    use_triton=use_triton,
    quantize_config=quantize_config,
)

# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text="Human Assistant chat",
    tokenizer_name_or_path=MODEL_NAME,  # MODEL_NAME is defined earlier in the notebook
)

model = get_peft_model(model, peft_config)
print_trainable_parameters(model)  # helper defined earlier in the notebook
gc.collect()
torch.cuda.empty_cache()
gc.collect()
```

Running this gave the following error:
```

OutOfMemoryError Traceback (most recent call last)
Cell In[18], line 25
17 tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)
19 quantize_config = BaseQuantizeConfig(
20 bits=4,
21 group_size=128,
22 desc_act=False
23 )
---> 25 model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
26 use_safetensors=True,
27 strict=use_strict,
28 device="cuda:0",
29 model_basename=model_basename,
30 use_triton=use_triton,
31 quantize_config=quantize_config)
33 # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
34 logging.set_verbosity(logging.CRITICAL)

File /kaggle/working/AutoGPTQ/auto_gptq/modeling/auto.py:108, in AutoGPTQForCausalLM.from_quantized(cls, model_name_or_path, device_map, max_memory, device, low_cpu_mem_usage, use_triton, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, disable_exllama, **kwargs)
102 # TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
103 keywords = {
104 key: kwargs[key]
105 for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
106 if key in kwargs
107 }
--> 108 return quant_func(
109 model_name_or_path=model_name_or_path,
110 device_map=device_map,
111 max_memory=max_memory,
112 device=device,
113 low_cpu_mem_usage=low_cpu_mem_usage,
114 use_triton=use_triton,
115 inject_fused_attention=inject_fused_attention,
116 inject_fused_mlp=inject_fused_mlp,
117 use_cuda_fp16=use_cuda_fp16,
118 quantize_config=quantize_config,
119 model_basename=model_basename,
120 use_safetensors=use_safetensors,
121 trust_remote_code=trust_remote_code,
122 warmup_triton=warmup_triton,
123 trainable=trainable,
124 disable_exllama=disable_exllama,
125 **keywords
126 )

File /kaggle/working/AutoGPTQ/auto_gptq/modeling/_base.py:875, in BaseGPTQForCausalLM.from_quantized(cls, model_name_or_path, device_map, max_memory, device, low_cpu_mem_usage, use_triton, torch_dtype, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, disable_exllama, **kwargs)
872 if low_cpu_mem_usage:
873 make_sure_no_tensor_in_meta_device(model, use_triton, quantize_config.desc_act, quantize_config.group_size, bits=quantize_config.bits)
--> 875 accelerate.utils.modeling.load_checkpoint_in_model(
876 model,
877 checkpoint=model_save_name,
878 device_map=device_map,
879 offload_state_dict=True,
880 offload_buffers=True
881 )
882 model = simple_dispatch_model(model, device_map)
884 # == step4: set seqlen == #

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/modeling.py:1279, in load_checkpoint_in_model(model, checkpoint, device_map, offload_folder, dtype, offload_state_dict, offload_buffers, keep_in_fp32_modules, offload_8bit_bnb)
1277 buffer_names = [name for name, _ in model.named_buffers()]
1278 for checkpoint_file in checkpoint_files:
-> 1279 checkpoint = load_state_dict(checkpoint_file, device_map=device_map)
1280 if device_map is None:
1281 model.load_state_dict(checkpoint, strict=False)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/modeling.py:1111, in load_state_dict(checkpoint_file, device_map)
1108 else:
1109 # if we only have one device we can load everything directly
1110 if len(set(device_map.values())) == 1:
-> 1111 return safe_load_file(checkpoint_file, device=list(device_map.values())[0])
1113 devices = list(set(device_map.values()) - {"disk"})
1114 # cpu device should always exist as fallback option

File /opt/conda/lib/python3.10/site-packages/safetensors/torch.py:261, in load_file(filename, device)
259 with safe_open(filename, framework="pt", device=device) as f:
260 for k in f.keys():
--> 261 result[k] = f.get_tensor(k)
262 return result

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB (GPU 0; 15.90 GiB total capacity; 14.90 GiB already allocated; 13.75 MiB free; 15.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```

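For reference, the `max_split_size_mb` hint in that message is configured through the `PYTORCH_CUDA_ALLOC_CONF` environment variable. A minimal sketch of how one might set it (the 128 MiB split size is only an illustrative value, not taken from the error):

```python
import os

# Reduce fragmentation as the OOM message suggests. This must take effect before
# the CUDA caching allocator is initialized, so set it before importing torch
# (i.e. at the very top of the notebook).
# 128 MiB is an illustrative choice, not a recommended value.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch

print(torch.cuda.memory_summary())  # compare allocated vs. reserved memory
```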

I used to get this error when trying to load a large model with limited resources (e.g. a T4 on Colab). In that case, try loading a smaller model such as TheBloke/vicuna-7B-1.1-GPTQ, which usually fits on a free Colab instance.
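A minimal sketch of that suggestion, following the same `from_quantized` pattern as in the question; the `model_basename` below is an assumption and should be checked against the actual `.safetensors` file name in that repo:

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

model_id = "TheBloke/vicuna-7B-1.1-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# model_basename must match the weight file name in the repo (placeholder here);
# quantize_config is picked up from the repo's quantize_config.json when present.
model = AutoGPTQForCausalLM.from_quantized(
    model_id,
    use_safetensors=True,
    model_basename="vicuna-7B-1.1-GPTQ-4bit-128g",  # adjust to the actual file name
    device="cuda:0",
    use_triton=False,
)
```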
