dh-mc's picture
T4 original prompt + bf16: best results
c8f289a
raw
history blame
5.45 kB
import os
import re
import sys
import torch
from llamafactory.chat import ChatModel
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TextStreamer,
)
def load_model(
model_name,
max_seq_length=2048,
dtype=torch.bfloat16,
load_in_4bit=False,
adapter_name_or_path=None,
):
print(f"loading model: {model_name}")
if adapter_name_or_path:
template = "llama3" if "llama-3" in model_name.lower() else "chatml"
args = dict(
model_name_or_path=model_name,
adapter_name_or_path=adapter_name_or_path, # load the saved LoRA adapters
template=template, # same to the one in training
finetuning_type="lora", # same to the one in training
quantization_bit=4 if load_in_4bit else None, # load 4-bit quantized model
)
chat_model = ChatModel(args)
return chat_model.engine.model, chat_model.engine.tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
bnb_config = BitsAndBytesConfig(
load_in_4bit=load_in_4bit,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=False,
bnb_4bit_compute_dtype=dtype,
)
model = (
AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
torch_dtype=dtype,
trust_remote_code=True,
device_map="auto",
)
if load_in_4bit
else AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=dtype,
trust_remote_code=True,
device_map="auto",
)
)
return model, tokenizer
def test_model(model, tokenizer, prompt):
inputs = tokenizer(
[prompt],
return_tensors="pt",
).to("cuda")
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
**inputs, max_new_tokens=2048, streamer=text_streamer, use_cache=True
)
def extract_answer(text, debug=False):
if text:
# Remove the begin and end tokens
text = re.sub(
r".*?(assistant|\[/INST\]).+?\b",
"",
text,
flags=re.DOTALL | re.MULTILINE,
)
if debug:
print("--------\nstep 1:", text)
text = re.sub(r"<.+?>.*", "", text, flags=re.DOTALL | re.MULTILINE)
if debug:
print("--------\nstep 2:", text)
text = re.sub(
r".*?end_header_id\|>\n\n", "", text, flags=re.DOTALL | re.MULTILINE
)
if debug:
print("--------\nstep 3:", text)
text = text.split(".")[0].strip()
if debug:
print("--------\nstep 4:", text)
text = re.sub(
r"^Response:.+?\b",
"",
text,
flags=re.DOTALL | re.MULTILINE,
)
if debug:
print("--------\nstep 5:", text)
return text
def eval_model(model, tokenizer, eval_dataset):
total = len(eval_dataset)
predictions = []
for i in tqdm(range(total)):
inputs = tokenizer(
eval_dataset["prompt"][i : i + 1],
return_tensors="pt",
).to("cuda")
outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
decoded_output = tokenizer.batch_decode(outputs)
debug = i == 0
decoded_output = [
extract_answer(output, debug=debug) for output in decoded_output
]
predictions.extend(decoded_output)
return predictions
def save_model(
model,
tokenizer,
include_gguf=True,
include_merged=True,
publish=True,
):
try:
token = os.getenv("HF_TOKEN") or None
model_name = os.getenv("MODEL_NAME")
save_method = "lora"
quantization_method = "q5_k_m"
model_names = get_model_names(
model_name, save_method=save_method, quantization_method=quantization_method
)
model.save_pretrained(model_names["local"])
tokenizer.save_pretrained(model_names["local"])
if publish:
model.push_to_hub(
model_names["hub"],
token=token,
)
tokenizer.push_to_hub(
model_names["hub"],
token=token,
)
if include_merged:
model.save_pretrained_merged(
model_names["local"] + "-merged", tokenizer, save_method=save_method
)
if publish:
model.push_to_hub_merged(
model_names["hub"] + "-merged",
tokenizer,
save_method="lora",
token="",
)
if include_gguf:
model.save_pretrained_gguf(
model_names["local-gguf"],
tokenizer,
quantization_method=quantization_method,
)
if publish:
model.push_to_hub_gguf(
model_names["hub-gguf"],
tokenizer,
quantization_method=quantization_method,
token=token,
)
except Exception as e:
print(e)
def print_row_details(df, indices=[0]):
for index in indices:
for col in df.columns:
print("-" * 50)
print(f"{col}: {df[col].iloc[index]}")