How to finetune the model
How can I finetune the model further? Inference works without problems.
I can do fine-tuning with the LoraConfig set to
config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["lm_head"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
but trying any other target_modules (or removing that line) leads to the error
RuntimeError: self and mat2 must have the same dtype
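As far as I can tell, lm_head is the only module that is still a plain nn.Linear in the quantized checkpoint, while the attention/MLP projections are quantized QuantLinear layers without a float weight, which would explain the dtype mismatch. A purely illustrative way to check the module types (the module names below assume a Llama-style model):
# Illustrative only: show which layers are still nn.Linear (targetable by stock peft)
# and which are quantized QuantLinear layers.
for name, module in model.named_modules():
    if name.endswith(("lm_head", "q_proj", "down_proj")):
        print(name, type(module).__name__)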
The full script that I am using is
from transformers import AutoTokenizer, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import argparse
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import transformers
parser = argparse.ArgumentParser(description='Simple AutoGPTQ example')
parser.add_argument('model_name_or_path', type=str, help='Model folder or repo')
parser.add_argument('--model_basename', type=str, help='Model file basename if model is not named gptq_model-Xb-Ygr')
parser.add_argument('--use_slow', action="store_true", help='Use slow tokenizer')
parser.add_argument('--use_safetensors', action="store_true", help='Load the model from a .safetensors file')
parser.add_argument('--use_triton', action="store_true", help='Use Triton for inference?')
parser.add_argument('--bits', type=int, default=4, help='Specify GPTQ bits. Only needed if no quantize_config.json is provided')
parser.add_argument('--group_size', type=int, default=128, help='Specify GPTQ group_size. Only needed if no quantize_config.json is provided')
parser.add_argument('--desc_act', action="store_true", help='Specify GPTQ desc_act. Only needed if no quantize_config.json is provided')
args = parser.parse_args()
quantized_model_dir = args.model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=not args.use_slow,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>")
quantize_config = BaseQuantizeConfig.from_pretrained(quantized_model_dir)
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
    use_safetensors=args.use_safetensors,
    model_basename=args.model_basename,
    device="cuda:0",
    use_triton=args.use_triton,
    quantize_config=quantize_config)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
data = data['train'].train_test_split(train_size=0.9, test_size=0.1)
tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    eval_dataset=data['test'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=3,
        learning_rate=2e-2,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        evaluation_strategy='steps',
        eval_steps=1
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()
I don't know, I've not tried fine tuning yet.
However you could try updating AutoGPTQ to the latest development version (git clone it and build from source), as version 0.3.0 has built-in PEFT support.
I think this will be the intended way to do LoRA training on quantised GPTQ models.
I've not tried it myself yet but I believe it does work.
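If you want to check whether the build you have installed already includes the PEFT integration, an import test like this (using the names from the newer AutoGPTQ examples) should tell you:
# These names only exist in AutoGPTQ builds that ship the PEFT integration.
from auto_gptq import get_gptq_peft_model
from auto_gptq.utils.peft_utils import GPTQLoraConfig
print("AutoGPTQ PEFT support is available")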
Thanks, that helped; I got it working now. It also required adapting the example from examples/peft/peft_lora_clm_instruction_tuning.py, the essential difference being the use of their GPTQLoraConfig.
Great, glad it worked! Could you share the updated code here, so others could use it as well?
@TheBloke Sure, please see below:
# run with
# python simple_autogptq.py ./text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/ --model_basename Guanaco-33B-GPTQ-4bit.act-order --use_safetensors --use_triton
import os
import argparse
from peft import prepare_model_for_kbit_training, TaskType
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, get_gptq_peft_model, BaseQuantizeConfig
from auto_gptq.utils.peft_utils import GPTQLoraConfig
parser = argparse.ArgumentParser(description='Simple AutoGPTQ example')
parser.add_argument('model_name_or_path', type=str, help='Model folder or repo')
parser.add_argument('--model_basename', type=str, help='Model file basename if model is not named gptq_model-Xb-Ygr')
parser.add_argument('--use_slow', action="store_true", help='Use slow tokenizer')
parser.add_argument('--use_safetensors', action="store_true", help='Load the model from a .safetensors file')
parser.add_argument('--use_triton', action="store_true", help='Use Triton for inference?')
parser.add_argument('--bits', type=int, default=4, help='Specify GPTQ bits. Only needed if no quantize_config.json is provided')
parser.add_argument('--group_size', type=int, default=128, help='Specify GPTQ group_size. Only needed if no quantize_config.json is provided')
parser.add_argument('--desc_act', action="store_true", help='Specify GPTQ desc_act. Only needed if no quantize_config.json is provided')
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = args.model_name_or_path
model_basename = args.model_basename
tokenizer_name_or_path = model_name_or_path
peft_config = GPTQLoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
    use_fast=not args.use_slow,
    unk_token="<unk>",
    bos_token="<s>",
    eos_token="</s>")
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id
quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=args.use_safetensors,
    use_triton=args.use_triton,
    device="cuda:0",
    trainable=True,
    inject_fused_attention=True,
    inject_fused_mlp=False,
    quantize_config=quantize_config
)
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_gptq_peft_model(model, peft_config=peft_config, auto_find_all_linears=True, train_mode=True)
model.print_trainable_parameters()
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
data = data['train'].train_test_split(train_size=0.9, test_size=0.1)
tokenizer.pad_token = tokenizer.eos_token
trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    eval_dataset=data['test'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=3,
        learning_rate=2e-2,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        evaluation_strategy='steps',
        eval_steps=1
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()
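One note: to reuse the adapter for inference later, it should be possible to save it after training with PEFT's save_pretrained; the output directory below is just an example:
# Saves only the LoRA adapter weights and config, not the quantized base model.
model.save_pretrained("outputs/gptq_lora_adapter")
tokenizer.save_pretrained("outputs/gptq_lora_adapter")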
Thank you!
Thank you @clause-crahm! If it's possible, could you please share the full code on your GitHub (and post its link)? I understand if you don't want to share.
Hi @ibibek, the code above is really all there is to it. Just make sure to have up-to-date versions of the packages.
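If in doubt, something like this prints the installed versions (assuming the usual PyPI package names):
# Print installed versions of the relevant packages; names assume a pip install.
from importlib.metadata import version, PackageNotFoundError
for pkg in ("auto-gptq", "peft", "transformers", "accelerate"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed")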
@clause-crahm and @TheBloke, I fine-tuned the model using the code you provided above with the "Abirate/english_quotes" dataset, but the loss looks problematic. When running inference with the adapter, the results are wrong; it seems the adapter does not work and its parameters were not trained at all.
CUDA_VISIBLE_DEVICES=1 python guanaco_finetuning.py
[2023-07-23 20:48:23,070] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)
The model weights are not tied. Please use the tie_weights method before using the infer_auto_device function.
The safetensors archive passed at guanaco-33B-GPTQ/guanaco-33b-GPTQ-4bit--1g.act.order.safetensors does not contain metadata. Make sure to save your model with the save_pretrained method. Defaulting to 'pt' metadata.
trainable params: 109,117,440 || all params: 4,552,823,296 || trainable%: 2.396698332128724
0%| | 0/3000 [00:00<?, ?it/s]
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the __call__ method is faster than using a method to encode the text followed by a call to the pad method to get a padded encoding.
{'loss': 0.7326, 'learning_rate': 0.01, 'epoch': 0.0}
{'loss': 0.2599, 'learning_rate': 0.02, 'epoch': 0.0}
{'loss': 1.123, 'learning_rate': 0.01999332888592395, 'epoch': 0.0}
{'loss': 9.3881, 'learning_rate': 0.0199866577718479, 'epoch': 0.0}
{'loss': 16.0878, 'learning_rate': 0.01997998665777185, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.0199733155436958, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019966644429619748, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019959973315543694, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019953302201467644, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019946631087391597, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019939959973315543, 'epoch': 0.0}
{'loss': 0.0, 'learning_rate': 0.019933288859239492, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019926617745163442, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01991994663108739, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01991327551701134, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01990660440293529, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01989993328885924, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01989326217478319, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01988659106070714, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01987991994663109, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01987324883255504, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019866577718478988, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019859906604402934, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019853235490326884, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019846564376250837, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019839893262174783, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019833222148098732, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019826551034022682, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01981987991994663, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01981320880587058, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01980653769179453, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01979986657771848, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.01979319546364243, 'epoch': 0.01}
{'loss': 0.0, 'learning_rate': 0.019786524349566376, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.01977985323549033, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.019773182121414278, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.019766511007338224, 'epoch': 0.02}
{'loss': 0.0, 'learning_rate': 0.019759839893262174, 'epoch': 0.02}
@clause-crahm and @TheBloke, below is my inference script with the adapter, but the results seem wrong; it looks like the adapter does not work and its parameters were not trained at all. Could you help me check it?
import sys
from transformers import AutoTokenizer, pipeline, logging
from peft import PeftModel, prepare_model_for_kbit_training, TaskType
from auto_gptq import AutoGPTQForCausalLM,get_gptq_peft_model, BaseQuantizeConfig
from auto_gptq.utils.peft_utils import GPTQLoraConfig
model_name_or_path = "guanaco-33B-GPTQ"
quantized_model_path="guanaco-33B-GPTQ"
model_basename = "guanaco-33b-GPTQ-4bit--1g.act.order"
checkpoint_path='guanaco-33B-GPTQ/gptq_LORA_adapter'
peft_config = GPTQLoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
if not tokenizer.pad_token_id:
    tokenizer.pad_token_id = tokenizer.eos_token_id
quantize_config = BaseQuantizeConfig.from_pretrained(model_name_or_path)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename=model_basename,
    use_safetensors=True,
    use_triton=False,
    device="cuda:0",
    trainable=False,
    inject_fused_attention=True,
    inject_fused_mlp=False,
    quantize_config=quantize_config
)
model.gradient_checkpointing_enable()
model = get_gptq_peft_model(model, peft_config=peft_config, model_id=checkpoint_path, auto_find_all_linears=True, train_mode=False)
model = PeftModel.from_pretrained(model, model_id=checkpoint_path, adapter_name="adapter_model.bin", is_trainable=False)
prompt = "How can we reduce air pollution?"
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=1, max_new_tokens=512)
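For completeness, the generated ids still need to be decoded to see the text, e.g.:
# Decode the generated token ids back into text.
print(tokenizer.decode(output[0], skip_special_tokens=True))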