import os
import pandas as pd
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from llm_toolkit.translation_utils import *
from llamafactory.chat import ChatModel

print(f"loading {__file__}")


def get_model_names(
    model_name, save_method="merged_4bit_forced", quantization_method="q5_k_m"
):
    """Derive the local and Hugging Face Hub names for saved/quantized model variants."""
    hub_model = model_name.split("/")[-1] + "-MAC-"
    local_model = "models/" + hub_model
    return {
        "local": local_model + save_method,
        "local-gguf": local_model + quantization_method,
        "hub": hub_model + save_method,
        "hub-gguf": hub_model + "gguf-" + quantization_method,
    }


def load_model(
    model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=False,
    template="chatml",
    adapter_name_or_path=None,
):
    """Load a base model with Unsloth, or a LoRA-adapted model via LLaMA-Factory."""
    print(f"loading model: {model_name}")

    if adapter_name_or_path:
        args = dict(
            model_name_or_path=model_name,
            adapter_name_or_path=adapter_name_or_path,  # load the saved LoRA adapters
            template=template,  # same template as used in training
            finetuning_type="lora",  # same finetuning type as used in training
            quantization_bit=4,  # load the 4-bit quantized model
        )
        chat_model = ChatModel(args)
        return chat_model.engine.model, chat_model.engine.tokenizer

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,  # the model you used for training
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        trust_remote_code=True,
    )
    FastLanguageModel.for_inference(model)

    return model, tokenizer


def test_model(model, tokenizer, prompt):
    """Generate up to 128 new tokens for a single prompt, streaming output to stdout."""
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
    ).to("cuda")

    text_streamer = TextStreamer(tokenizer)

    _ = model.generate(
        **inputs, max_new_tokens=128, streamer=text_streamer, use_cache=True
    )


def load_trainer(
    model,
    tokenizer,
    dataset,
    num_train_epochs,
    max_seq_length=2048,
    fp16=False,
    bf16=False,
    output_dir="./outputs",
):
    """Attach LoRA adapters to the model and build an SFTTrainer over the dataset."""
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,  # choose any number > 0; suggested: 8, 16, 32, 64, 128
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0,  # supports any value, but 0 is optimized
        bias="none",  # supports any value, but "none" is optimized
        # "unsloth" gradient checkpointing uses ~30% less VRAM and fits 2x larger batch sizes
        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
        random_state=3407,
        use_rslora=False,  # rank-stabilized LoRA is supported
        loftq_config=None,  # LoftQ is supported
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,  # packing can make training 5x faster for short sequences
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_steps=5,
            num_train_epochs=num_train_epochs,
            learning_rate=2e-4,
            # mixed precision is picked automatically; the fp16/bf16 parameters above are unused
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=100,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3407,
            output_dir=output_dir,
        ),
    )

    return trainer
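

# --- Usage sketch (illustrative only, not part of the project scripts) -----------
# A minimal example of how the helpers above fit together: load a base model,
# smoke-test generation, then fine-tune with LoRA. The checkpoint name, prompt,
# and tiny in-memory dataset below are placeholders, not values taken from this
# project; real runs prepare their dataset with llm_toolkit.translation_utils.
if __name__ == "__main__":
    from datasets import Dataset

    # Placeholder checkpoint; any Unsloth-compatible causal LM would work here.
    model_name = os.getenv("MODEL_NAME", "unsloth/llama-3-8b-bnb-4bit")
    model, tokenizer = load_model(model_name, load_in_4bit=True)

    # Quick smoke test: stream a short completion for a single prompt.
    test_model(model, tokenizer, "Translate to English: 你好，世界。")

    # Toy dataset with the "text" column that SFTTrainer expects; replace with
    # the real translation dataset in practice.
    dataset = Dataset.from_dict(
        {"text": ["### Instruction: ...\n### Response: ..." for _ in range(8)]}
    )

    trainer = load_trainer(model, tokenizer, dataset, num_train_epochs=1)
    trainer_stats = trainer.train()
    print(trainer_stats)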