---
license: unlicense
---

Running opt-6.7b with added LoRAs locally on Windows!

# bitsandbytes

I needed to get bitsandbytes working in my venv:

I replaced the main.py at C:\Users\user\Desktop\test\peft\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py with the one here!

I also added a .dll file here: C:\Users\user\Desktop\test\peft\venv\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda116.dll

# Training Script

It's from the PEFT notebook: https://github.com/huggingface/peft/commit/df0e1fb59266c9903ddd6dbfe7339bcd2068d150

```
# load
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b",
    load_in_8bit=True,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b")

# post-processing
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

# apply lora
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# apply lora 2
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

# training
import transformers
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=200,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
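
# Optional, not in the original notebook: seed everything before training if you
# want retrains to be closer to repeatable. Even with a fixed seed, a retrained
# LoRA can still come out slightly different (see the note after the inference
# examples below), but seeding removes one source of randomness.
# transformers.set_seed(42)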
trainer.train()

# push to huggingface txtloras
model.push_to_hub("Yoshiii/opt-6.7b-lora", use_auth_token=True)

# inference
batch = tokenizer("Two things are infinite: ", return_tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))
```

# Inference (loading this repo lora from hf)

```
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "Yoshiii/opt-6.7b-lora"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

batch = tokenizer("Two things are infinite: ", return_tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))
```

Example output:

Two things are infinite: the universe and human stupidity; and I'm not sure about the universe. -Albert Einstein I'm not sure about the universe either.

This output closely resembles the training data. If you run the model without applying the LoRA, the output will usually look worse. If you retrain the LoRA, know that your new LoRA will not produce the same results, even if you use the same settings. Inference itself, however, should usually be deterministic when using the same LoRA (or no LoRA at all).

Also, if you want to download the LoRA and use it from a visible local folder, here's the inference script:

```
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "./loramodel"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

batch = tokenizer("Two things are infinite: ", return_tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))
```

Add your adapter_config.json and adapter_model.bin to a folder in your current directory named `loramodel`, or whatever name you choose (just point `peft_model_id` at it).
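
If you'd rather pull those two files straight from the Hub instead of copying them by hand, something like the sketch below should work. This is a convenience snippet, not part of the original scripts: it assumes the adapter was pushed to `Yoshiii/opt-6.7b-lora` as in the training script above, and it uses `huggingface_hub` (already installed as a dependency of transformers/peft) to fill the `loramodel` folder.

```
import os
import shutil
from huggingface_hub import hf_hub_download

# Download both adapter files into the HF cache, then copy them into ./loramodel
os.makedirs("loramodel", exist_ok=True)
for fname in ["adapter_config.json", "adapter_model.bin"]:
    cached_path = hf_hub_download(repo_id="Yoshiii/opt-6.7b-lora", filename=fname)
    shutil.copy(cached_path, os.path.join("loramodel", fname))
```

After that, the local-folder inference script above should pick the adapter up from `./loramodel`.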