# Spaces:
# Sleeping
# Sleeping
# import torch
# from trl import SFTTrainer
# from transformers import TrainingArguments, TextStreamer
# from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template
def load_data(dataset, tokenizer, samples=None):
    """Load a ShareGPT-style dataset and render each conversation to text.

    Args:
        dataset: Dataset name/path accepted by ``datasets.load_dataset``.
        tokenizer: Tokenizer the ChatML chat template is attached to.
        samples: Optional cap on the number of rows; when given, only the
            first ``int(samples)`` examples of the train split are loaded.

    Returns:
        The train split with an added ``"text"`` column holding each
        conversation rendered through the ChatML chat template.
    """
    print("Loading finetuning dataset.")
    # Base models don't have chat templates so we can choose any - ChatML is popular
    tokenizer = get_chat_template(
        tokenizer,
        # Map ShareGPT field/role names onto the template's expected ones.
        mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},
        chat_template="chatml",
    )

    def apply_template(examples):
        # Batched map callback: render each ShareGPT conversation to one string.
        messages = examples["conversations"]
        text = [
            tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
            for message in messages
        ]
        return {"text": text}

    if samples is not None:
        # Reducing the training load by only training on a subset
        dataset = load_dataset(dataset, split=f"train[:{int(samples)}]")
    else:
        dataset = load_dataset(dataset, split="train")
    return dataset.map(apply_template, batched=True)