---
library_name: peft
license: apache-2.0
pipeline_tag: text-generation
inference: false
tags:
- mistralai/Mistral-7B-Instruct-v0.1
- finetuned
---

Install the required libraries:

```bash
pip install -q accelerate bitsandbytes trl datasets
pip install git+https://github.com/huggingface/transformers
pip install git+https://github.com/huggingface/peft.git
```

Load the 4-bit base model with the QLoRA configuration, attach the `Ashishkr/mistral-medical-consultation` LoRA adapter, and merge the adapter weights into the base model:

```python
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

base_model_name = "mistralai/Mistral-7B-Instruct-v0.1"

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# Load the entire model on GPU 0
device_map = {"": 0}

# Build the QLoRA quantization configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load the 4-bit quantized base model, attach the LoRA adapter,
# and merge the adapter weights into the base model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, "Ashishkr/mistral-medical-consultation")
model = model.merge_and_unload()

# Reload the tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
```

Run text generation with the merged model:

```python
# Run a text generation pipeline with the merged model
prompt = "I have had neck pain for the last 2 days."

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    temperature=0.9,
    max_length=200,
)
response = pipe(f"[INST] {prompt} [/INST]")
print(response[0]["generated_text"])
```
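If you plan to reuse the merged model, it can be saved to disk once and reloaded later without PEFT. The snippet below is a minimal sketch, not part of the original card: the output directory name is an arbitrary placeholder, and it assumes a recent `peft` release in which `merge_and_unload()` dequantizes the 4-bit layers into standard fp16 weights.

```python
# Minimal sketch (assumption, not from the original card): persist the merged model.
# "mistral-medical-consultation-merged" is an arbitrary placeholder directory name.
output_dir = "mistral-medical-consultation-merged"

model.save_pretrained(output_dir)      # writes config and merged weights
tokenizer.save_pretrained(output_dir)  # writes tokenizer files alongside the model

# In a later session the merged checkpoint can be loaded directly with transformers,
# with no adapter-loading step required:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# model = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.float16, device_map={"": 0})
# tokenizer = AutoTokenizer.from_pretrained(output_dir)
```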