# llm_toolkit/translation_engine.py
import os
import pandas as pd
from datasets import load_dataset
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from tqdm import tqdm
from llm_toolkit.translation_utils import extract_answer

print(f"loading {__file__}")

def get_model_names(
model_name, save_method="merged_4bit_forced", quantization_method="q5_k_m"
):
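    """Derive local paths and Hub repo names for the model's saved variants."""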
hub_model = model_name.split("/")[-1] + "-MAC-"
local_model = "models/" + hub_model
return {
"local": local_model + save_method,
"local-gguf": local_model + quantization_method,
"hub": hub_model + save_method,
"hub-gguf": hub_model + "gguf-" + quantization_method,
    }


def load_model(
model_name,
max_seq_length=2048,
dtype=None,
load_in_4bit=False,
):
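    """Load `model_name` and its tokenizer via Unsloth, ready for inference."""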
print(f"loading model: {model_name}")
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,  # Hub repo id or local path
max_seq_length=max_seq_length,
dtype=dtype,
load_in_4bit=load_in_4bit,
trust_remote_code=True,
)
FastLanguageModel.for_inference(model)
    return model, tokenizer


def test_model(model, tokenizer, prompt):
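    """Run a short streamed generation on `prompt` to sanity-check the model."""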
inputs = tokenizer(
[prompt],
return_tensors="pt",
).to("cuda")
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
**inputs, max_new_tokens=128, streamer=text_streamer, use_cache=True
    )


def load_trainer(
model,
tokenizer,
dataset,
num_train_epochs,
max_seq_length=2048,
fp16=False,
bf16=False,
output_dir="./outputs",
):
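    """Attach a LoRA adapter to `model` and build an SFTTrainer over `dataset`."""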
model = FastLanguageModel.get_peft_model(
model,
        r=16,  # LoRA rank; common choices are 8, 16, 32, 64, 128
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
lora_alpha=16,
lora_dropout=0, # Supports any, but = 0 is optimized
bias="none", # Supports any, but = "none" is optimized
        # "unsloth" checkpointing saves VRAM, allowing larger batches/contexts.
        use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
random_state=3407,
        use_rslora=False,  # rank-stabilized LoRA, disabled here
        loftq_config=None,  # no LoftQ quantization-aware initialization
)
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset,
dataset_text_field="text",
max_seq_length=max_seq_length,
dataset_num_proc=2,
        packing=False,  # packing=True can speed up training on short sequences
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_steps=5,
num_train_epochs=num_train_epochs,
learning_rate=2e-4,
            # Honor the caller's precision flags; otherwise auto-detect bf16 support.
            fp16=fp16 or (not bf16 and not is_bfloat16_supported()),
            bf16=bf16 or (not fp16 and is_bfloat16_supported()),
logging_steps=100,
optim="adamw_8bit",
weight_decay=0.01,
lr_scheduler_type="linear",
seed=3407,
output_dir=output_dir,
),
)
    return trainer


def load_translation_dataset(data_path, tokenizer=None):
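    """Load the Chinese-English TSV corpus, splitting 80/20 on first use.

    When `tokenizer` is given, each row is rendered through its chat template
    into a `prompt` column (for inference) and a `text` column (prompt plus
    reference translation plus EOS, for training).
    """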
train_data_file = data_path.replace(".tsv", "-train.tsv")
test_data_file = data_path.replace(".tsv", "-test.tsv")
if not os.path.exists(train_data_file):
print("generating train/test data files")
dataset = load_dataset(
"csv", data_files=data_path, delimiter="\t", split="train"
)
        print(f"loaded {len(dataset)} rows")
        # Drop rows with an empty source or target sentence.
        dataset = dataset.filter(lambda x: x["chinese"] and x["english"])
        datasets = dataset.train_test_split(test_size=0.2)
        print(f"{len(dataset)} rows kept after filtering empty pairs")
# Convert to pandas DataFrame
train_df = pd.DataFrame(datasets["train"])
test_df = pd.DataFrame(datasets["test"])
# Save to TSV
train_df.to_csv(train_data_file, sep="\t", index=False)
test_df.to_csv(test_data_file, sep="\t", index=False)
print("loading train/test data files")
datasets = load_dataset(
"csv",
data_files={"train": train_data_file, "test": test_data_file},
delimiter="\t",
)
if tokenizer:
translation_prompt = "Please translate the following Chinese text into English and provide only the translated content, nothing else.\n{}"
def formatting_prompts_func(examples):
inputs = examples["chinese"]
outputs = examples["english"]
messages = [
{
"role": "system",
"content": "You are an expert in translating Chinese to English.",
},
                None,  # placeholder, replaced with the user message per example
]
            model_name = os.getenv("MODEL_NAME", "")
            if "mistral" in model_name.lower():
                # Mistral chat templates do not accept a system role, so drop it.
                messages = messages[1:]
texts = []
prompts = []
            for source, target in zip(inputs, outputs):
                prompt = translation_prompt.format(source)
                messages[-1] = {"role": "user", "content": prompt}
                prompt = tokenizer.apply_chat_template(
                    messages, tokenize=False, add_generation_prompt=True
                )
                prompts.append(prompt)
                texts.append(prompt + target + tokenizer.eos_token)
return {"text": texts, "prompt": prompts}
datasets = datasets.map(
formatting_prompts_func,
batched=True,
)
print(datasets)
    return datasets


def eval_model(model, tokenizer, eval_dataset):
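    """Generate a translation for every prompt in `eval_dataset` and return
    the answers extracted from the decoded outputs."""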
total = len(eval_dataset)
predictions = []
for i in tqdm(range(total)):
inputs = tokenizer(
eval_dataset["prompt"][i : i + 1],
return_tensors="pt",
).to("cuda")
outputs = model.generate(**inputs, max_new_tokens=4096, use_cache=False)
decoded_output = tokenizer.batch_decode(outputs)
debug = i == 0
decoded_output = [
extract_answer(output, debug=debug) for output in decoded_output
]
predictions.extend(decoded_output)
    return predictions


def save_model(
model,
tokenizer,
include_gguf=True,
include_merged=True,
publish=True,
):
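    """Save the LoRA adapter locally, optionally with merged and GGUF variants,
    and optionally push each artifact to the Hugging Face Hub."""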
try:
token = os.getenv("HF_TOKEN") or None
model_name = os.getenv("MODEL_NAME")
save_method = "lora"
quantization_method = "q5_k_m"
model_names = get_model_names(
model_name, save_method=save_method, quantization_method=quantization_method
)
model.save_pretrained(model_names["local"])
tokenizer.save_pretrained(model_names["local"])
if publish:
model.push_to_hub(
model_names["hub"],
token=token,
)
tokenizer.push_to_hub(
model_names["hub"],
token=token,
)
if include_merged:
model.save_pretrained_merged(
model_names["local"] + "-merged", tokenizer, save_method=save_method
)
if publish:
model.push_to_hub_merged(
model_names["hub"] + "-merged",
tokenizer,
                save_method=save_method,
                token=token,
)
if include_gguf:
model.save_pretrained_gguf(
model_names["local-gguf"],
tokenizer,
quantization_method=quantization_method,
)
if publish:
model.push_to_hub_gguf(
model_names["hub-gguf"],
tokenizer,
quantization_method=quantization_method,
token=token,
)
    except Exception as e:
        print(f"failed to save model: {e}")
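

# A minimal end-to-end sketch of how these helpers compose. The model id,
# dataset path, and epoch count are illustrative placeholders, not part of
# the original pipeline.
if __name__ == "__main__":
    model_name = os.getenv("MODEL_NAME", "unsloth/llama-3-8b-bnb-4bit")
    os.environ["MODEL_NAME"] = model_name  # save_model reads this env var
    model, tokenizer = load_model(model_name)
    datasets = load_translation_dataset("data/mac.tsv", tokenizer)  # placeholder path
    # load_model switches to inference mode; switch back before fine-tuning.
    FastLanguageModel.for_training(model)
    trainer = load_trainer(model, tokenizer, datasets["train"], num_train_epochs=1)
    trainer.train()
    predictions = eval_model(model, tokenizer, datasets["test"])
    save_model(model, tokenizer, include_gguf=False, publish=False)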