"""Implementation derived from https://github.com/tloen/alpaca-lora""" | |
import sys | |
from pathlib import Path | |
# support running without installing as a package | |
wd = Path(__file__).parent.parent.resolve() | |
sys.path.append(str(wd)) | |
import torch | |
import requests | |
import json | |
from torch.utils.data import random_split | |
from lit_llama.tokenizer import Tokenizer | |
from tqdm import tqdm | |
DATA_FILE = "https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json" | |
DATA_FILE_NAME = "alpaca_data_cleaned_archive.json" | |
IGNORE_INDEX = -1 | |


def prepare(
    destination_path: Path = Path("data/alpaca"),
    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
    test_split_size: int = 2000,
    max_seq_length: int = 256,
    seed: int = 42,
    mask_inputs: bool = False,  # as in alpaca-lora
    data_file_name: str = DATA_FILE_NAME,
) -> None:
    """Prepare the Alpaca dataset for instruction tuning.

    The output is a training and test dataset saved as `train.pt` and `test.pt`,
    which store the preprocessed and tokenized prompts and labels.
    """
    destination_path.mkdir(parents=True, exist_ok=True)
    file_path = destination_path / data_file_name
    download(file_path)

    # TODO: If we don't have the Meta weights, where do we get the tokenizer from?
    tokenizer = Tokenizer(tokenizer_path)

    with open(file_path, "r") as file:
        data = json.load(file)

    # Partition the dataset into train and test
    train_split_size = len(data) - test_split_size
    train_set, test_set = random_split(
        data,
        lengths=(train_split_size, test_split_size),
        generator=torch.Generator().manual_seed(seed),
    )
    train_set, test_set = list(train_set), list(test_set)

    print(f"train has {len(train_set):,} samples")
    print(f"test has {len(test_set):,} samples")

    print("Processing train split ...")
    train_set = [prepare_sample(sample, tokenizer, max_seq_length, mask_inputs) for sample in tqdm(train_set)]
    torch.save(train_set, file_path.parent / "train.pt")

    print("Processing test split ...")
    test_set = [prepare_sample(sample, tokenizer, max_seq_length, mask_inputs) for sample in tqdm(test_set)]
    torch.save(test_set, file_path.parent / "test.pt")
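
# Each saved .pt file holds a list of dicts, one per sample. A minimal loading
# sketch (assuming the default destination_path above):
#
#   data = torch.load(Path("data/alpaca") / "train.pt")
#   sample = data[0]
#   sample["input_ids"]  # tokenized prompt + response
#   sample["labels"]     # same tensor, with prompt tokens set to IGNORE_INDEX if mask_inputs=True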


def download(file_path: Path):
    """Downloads the raw json data file and saves it in the given destination."""
    if file_path.exists():
        return
    response = requests.get(DATA_FILE)
    response.raise_for_status()  # fail loudly instead of writing an error page to disk
    with open(file_path, "w") as f:
        f.write(response.text)
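
# For much larger downloads one could stream to disk instead of buffering the
# whole body in memory; a sketch using the standard requests streaming API:
#
#   with requests.get(DATA_FILE, stream=True) as r, open(file_path, "wb") as f:
#       for chunk in r.iter_content(chunk_size=1 << 20):
#           f.write(chunk)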


def prepare_sample(example: dict, tokenizer: Tokenizer, max_length: int, mask_inputs: bool = True):
    """Processes a single sample.

    Each sample in the dataset consists of:
    - instruction: A string describing the task
    - input: A string holding a special input value for the instruction.
      This only applies to some samples, and in others this is empty.
    - output: The response string

    This function processes this data to produce a prompt text and a label for
    supervised training. The prompt text is formed as a single message including
    the instruction, the optional input, and the response.
    The label/target is the same message, but it can optionally have the
    instruction + input text masked out (mask_inputs=True).

    Finally, both the prompt and the label get tokenized. If desired, all tokens
    in the label that correspond to the original prompt get masked out (the
    default for this function).
    """
    full_prompt = generate_prompt(example)
    full_prompt_and_response = full_prompt + example["output"]
    encoded_full_prompt = tokenize(tokenizer, full_prompt, max_length=max_length, eos=False)
    encoded_full_prompt_and_response = tokenize(tokenizer, full_prompt_and_response, eos=True, max_length=max_length)

    # The labels are the full prompt with response, but with the prompt masked out
    labels = encoded_full_prompt_and_response.clone()
    if mask_inputs:
        labels[:len(encoded_full_prompt)] = IGNORE_INDEX

    return {
        **example,
        "input_ids": encoded_full_prompt_and_response,
        "input_ids_no_response": encoded_full_prompt,
        "labels": labels,
    }
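
# Masking illustration with hypothetical token ids: if the encoded prompt has
# 5 tokens and the full prompt-plus-response has 8, then with mask_inputs=True
#
#   input_ids: [ 1, 306, 310, 322, 350, 415, 523,  2]
#   labels:    [-1,  -1,  -1,  -1,  -1, 415, 523,  2]
#
# so the training loss is computed only on the response (and EOS) tokens.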


def tokenize(tokenizer: Tokenizer, string: str, max_length: int, eos: bool = True) -> torch.Tensor:
    return tokenizer.encode(string, bos=True, eos=eos, max_length=max_length)


def generate_prompt(example: dict) -> str:
    """Generates a standardized message to prompt the model with an instruction, an
    optional input, and an empty 'Response' section for the model to complete."""
    if example["input"]:
        return (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"
        )
    return (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{example['instruction']}\n\n### Response:"
    )
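
# For a hypothetical sample {"instruction": "Name the capital of France.", "input": ""},
# the prompt renders as:
#
#   Below is an instruction that describes a task. Write a response that appropriately completes the request.
#
#   ### Instruction:
#   Name the capital of France.
#
#   ### Response: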


if __name__ == "__main__":
    from jsonargparse import CLI

    CLI(prepare)
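
# jsonargparse turns the prepare() signature into CLI flags; a usage sketch
# (assuming this file lives at scripts/prepare_alpaca.py in the repository):
#
#   python scripts/prepare_alpaca.py \
#       --destination_path data/alpaca \
#       --tokenizer_path checkpoints/lit-llama/tokenizer.model \
#       --max_seq_length 256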