# Page header captured with this file (not part of the program):
# pythia410m-sft-tldr / code / rl_training.py
# mnoukhov's picture
# Training in progress, step 500
# 1904ee8 verified
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import bitsandbytes as bnb
import torch
from accelerate import Accelerator, DistributedDataParallelKwargs
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
PreTrainedTokenizerBase,
pipeline,
)
import wandb
# from transformers.trainer_utils import get_last_checkpoint
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from trl.models.modeling_value_adapter import AutoModelForCausalLMWithValueAdapter
# import copy
# from torch_ema import ExponentialMovingAverage
# from transformers import pipeline
# Enable tqdm's pandas integration. Nothing in the visible code uses pandas,
# so this call looks vestigial -- TODO confirm before removing.
tqdm.pandas()
@dataclass
class ScriptArguments:
    """
    Command-line arguments for fine-tuning a causal LM with PPO.

    The dataclass is parsed with `HfArgumentParser`; every field becomes a
    `--flag` whose description is the `help` entry of its metadata.
    """

    model_name: Optional[str] = field(default="", metadata={"help": "the model name"})
    reward_adapter_name: Optional[str] = field(default="", metadata={"help": "the reward adapter name"})
    # tokenizer_name: Optional[str] = field(default=None, metadata={"help": "the tokenizer name"})
    dataset_name: Optional[str] = field(
        default="CarperAI/openai_summarize_tldr", metadata={"help": "the dataset name"}
    )
    train_split: Optional[str] = field(default="train", metadata={"help": "the dataset split to train on"})
    eval_split: Optional[str] = field(default="train", metadata={"help": "the dataset split to evaluate on"})
    log_with: Optional[str] = field(default="wandb", metadata={"help": "use 'wandb' to log with wandb"})
    learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"})
    mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"})
    batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
    ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"})
    gradient_accumulation_steps: Optional[int] = field(
        default=4, metadata={"help": "the number of gradient accumulation steps"}
    )
    adafactor: Optional[bool] = field(default=False, metadata={"help": "whether to use the adafactor optimizer"})
    early_stopping: Optional[bool] = field(default=False, metadata={"help": "whether to early stop"})
    target_kl: Optional[float] = field(default=0.1, metadata={"help": "kl target for early stopping"})
    reward_baseline: Optional[float] = field(
        default=0.0,
        metadata={"help": "a baseline value that is subtracted from the reward"},
    )
    batched_gen: Optional[bool] = field(default=False, metadata={"help": "whether to use the batched text gen"})
    save_steps: Optional[int] = field(default=1000, metadata={"help": "the number of steps to save at"})
    save_strategy: Optional[str] = field(
        default="steps", metadata={"help": "checkpoint saving strategy; 'no' disables saving"}
    )
    output_dir: Optional[str] = field(default="runs/", metadata={"help": "the directory to save checkpoints to"})
    seed: Optional[int] = field(default=0, metadata={"help": "the seed"})
    steps: Optional[int] = field(default=20000, metadata={"help": "number of training steps"})
    init_kl_coef: Optional[float] = field(
        default=0.2,
        metadata={"help": "Initial KL penalty coefficient (used for adaptive and linear control)"},
    )
    adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"})
    value_adapter: Optional[bool] = field(
        default=False, metadata={"help": "use the value-adapter model class instead of the value-head one"}
    )
    separate_reward_model: Optional[str] = field(default=None, metadata={"help": "the reward model name"})

    # Generation
    output_min_length: Optional[int] = field(
        default=24, metadata={"help": "minimum length of the generated response"}
    )
    output_max_length: Optional[int] = field(
        default=48, metadata={"help": "maximum length of the generated response"}
    )
    input_max_length: Optional[int] = field(default=512, metadata={"help": "maximum prompt length in tokens"})

    # Quantization
    load_in_8bit: Optional[bool] = field(default=False, metadata={"help": "load the model in 8 bits precision"})
    load_in_4bit: Optional[bool] = field(default=False, metadata={"help": "load the model in 4 bits precision"})
    bf16: Optional[bool] = field(
        default=False,
        metadata={
            "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU."
        },
    )
    fp16: Optional[bool] = field(
        default=False,
        metadata={
            "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU."
        },
    )

    # LoRA
    use_lora: Optional[bool] = field(
        default=True,
    )
    lora_alpha: Optional[float] = field(default=32, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})
    lora_all_linear: Optional[bool] = field(default=False, metadata={"help": "lora adapter on all linear layers"})

    # Gold Model
    eval_steps: Optional[int] = field(default=None)
    gold_model_name: Optional[str] = field(default=None, metadata={"help": "the gold reward model name"})
    gold_in_8bit: Optional[bool] = field(
        default=False, metadata={"help": "load the gold model in 8 bits precision"}
    )
    gold_in_4bit: Optional[bool] = field(
        default=False, metadata={"help": "load the gold model in 4 bits precision"}
    )
    gold_bf16: Optional[bool] = field(
        default=False,
    )
    gold_fp16: Optional[bool] = field(
        default=False,
    )
    gold_eval_greedy: Optional[bool] = field(default=True)

    # # EMA stuff
    # ema_decay: Optional[float] = field(default=0.995, metadata={"help": "the ema decay rate"})
    # reset_freq: Optional[int] = field(default=None, metadata={"help": "reset every n epochs"})

    input_ids_input: Optional[bool] = field(
        default=False,
    )
    strip_prompt: Optional[bool] = field(
        default=False,
    )
    just_eval: Optional[bool] = field(default=False)

    # Read by the model-preparation code when a quantized model is loaded.
    # Declared last so the positional order of the existing fields is unchanged.
    gradient_checkpointing: Optional[bool] = field(
        default=False, metadata={"help": "whether to use gradient checkpointing"}
    )
@dataclass
class PromptCollator:
    """
    Collate raw examples into a left-padded, tokenized prompt batch.

    Calling the collator on a list of feature dicts returns the tokenizer
    output for the prompts, with the untokenized prompts added under the
    "prompt" key.
    """

    tokenizer: PreTrainedTokenizerBase
    # Forwarded to the tokenizer as its `padding` argument.
    padding: Union[bool, str] = True
    # Forwarded to the tokenizer as `max_length` (used together with truncation).
    max_prompt_length: Optional[int] = None
    # Key under which each feature dict stores its prompt text.
    prompt_field: str = "prompt"
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        prompts = [feat[self.prompt_field] for feat in features]

        # Prompts are padded on the left; the tokenizer's own setting is
        # restored afterwards, even if tokenization raises.
        original_side = self.tokenizer.padding_side
        self.tokenizer.padding_side = "left"
        try:
            tokenized_batch = self.tokenizer(
                prompts,
                truncation=True,
                padding=self.padding,
                max_length=self.max_prompt_length,
                return_tensors=self.return_tensors,
            )
        finally:
            self.tokenizer.padding_side = original_side

        tokenized_batch["prompt"] = prompts
        return tokenized_batch
def find_all_linear_names(args, model):
    """
    Collect the leaf names of every linear layer in `model`.

    The layer class searched for depends on the quantization flags:
    `bnb.nn.Linear4bit` when `args.load_in_4bit`, `bnb.nn.Linear8bitLt` when
    `args.load_in_8bit`, and `torch.nn.Linear` otherwise. The output heads
    "lm_head" and "score" are excluded.

    Returns:
        A sorted list of unique module names (last component of the dotted
        path). Sorting keeps the result independent of set iteration order.
    """
    if args.load_in_4bit:
        linear_cls = bnb.nn.Linear4bit
    elif args.load_in_8bit:
        linear_cls = bnb.nn.Linear8bitLt
    else:
        linear_cls = torch.nn.Linear

    # The last dotted component is the layer's own name; for an undotted name
    # it is the name itself.
    lora_module_names = {
        name.split(".")[-1] for name, module in model.named_modules() if isinstance(module, linear_cls)
    }

    # needed for 16-bit
    lora_module_names.discard("lm_head")
    lora_module_names.discard("score")
    return sorted(lora_module_names)
def create_and_prepare_model(args):
    """
    Load the policy model (with value head or value adapter) and its tokenizer.

    Args:
        args: parsed script arguments. Only `args` is read; the function does
            not depend on a module-level `script_args`.

    Returns:
        A `(model, tokenizer)` tuple. The model is returned in eval mode and
        both tokenizer and model config fall back to EOS as the pad token
        when none is set.

    Raises:
        ValueError: if both `load_in_8bit` and `load_in_4bit` are set.
    """
    if args.load_in_8bit and args.load_in_4bit:
        raise ValueError("You can't load the model in 8 bits and 4 bits at the same time")
    elif args.load_in_8bit or args.load_in_4bit:
        quantization_config = BitsAndBytesConfig(load_in_8bit=args.load_in_8bit, load_in_4bit=args.load_in_4bit)
        # Quantized weights are placed on this process's own device.
        device_map = {"": Accelerator().local_process_index}
    else:
        device_map = None
        quantization_config = None

    # NOTE(review): fp16 is not honoured here; the original branch for it was
    # commented out, so `args.fp16` falls through to float32.
    if args.bf16:
        torch_dtype = torch.bfloat16
    else:
        torch_dtype = torch.float32

    if args.value_adapter:
        model_cls = AutoModelForCausalLMWithValueAdapter
    else:
        model_cls = AutoModelForCausalLMWithValueHead

    if args.use_lora:
        if args.lora_all_linear:
            # Hard-coded for Pythia; `find_all_linear_names` cannot be used
            # here because the model has not been instantiated yet.
            target_modules = ["dense_h_to_4h", "dense_4h_to_h", "query_key_value", "dense"]
        else:
            target_modules = None

        peft_config = LoraConfig(
            r=args.lora_r,
            lora_alpha=args.lora_alpha,
            lora_dropout=args.lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=target_modules,
            # `score` is listed so the score head is saved with the adapter.
            modules_to_save=["score"],
        )
        # TODO check: the original carried commented-out code here that froze
        # the `score` original module and re-cast LoRA / norm layers for bf16.
    else:
        peft_config = None

    model = model_cls.from_pretrained(
        args.model_name,
        quantization_config=quantization_config,
        device_map=device_map,
        torch_dtype=torch_dtype,
        peft_config=peft_config,
        reward_adapter=args.reward_adapter_name,
    )

    if quantization_config is not None:
        # `gradient_checkpointing` may be absent from the argument object;
        # treat a missing attribute as "disabled" instead of raising.
        use_gradient_checkpointing = getattr(args, "gradient_checkpointing", False)
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=use_gradient_checkpointing)
        args.gradient_checkpointing = False

    model.config.torch_dtype = torch_dtype
    # model.config.use_cache = not args.gradient_checkpointing

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    if getattr(model.config, "pad_token_id", None) is None:
        model.config.pad_token_id = model.config.eos_token_id

    model.eval()

    return model, tokenizer
def create_and_prepare_dataset(args, tokenizer, split, num_proc=2):
    """
    Load one split of `args.dataset_name` and tokenize its prompts.

    The "prompt" column is renamed to "query" and tokenized with truncation
    at `args.input_max_length`; every other original column is dropped. When
    `args.strip_prompt` is set, prompts are whitespace-stripped first. The
    returned dataset is set to the "torch" format.
    """
    raw = load_dataset(args.dataset_name, split=split)

    if args.strip_prompt:

        def strip_prompt(examples):
            examples["prompt"] = [prompt.strip() for prompt in examples["prompt"]]
            return examples

        raw = raw.map(strip_prompt, batched=True)

    queries = raw.rename_column("prompt", "query")

    # Everything except the query text is removed once tokenization has run.
    columns_to_drop = [column for column in queries.column_names if column != "query"]
    tokenizer_kwargs = {"truncation": True, "max_length": args.input_max_length}

    tokenized = queries.map(
        tokenizer,
        batched=True,
        num_proc=num_proc,
        input_columns="query",
        remove_columns=columns_to_drop,
        fn_kwargs=tokenizer_kwargs,
    )
    tokenized.set_format("torch")
    return tokenized
def collator(data):
    """
    Turn a list of example dicts into a dict of lists.

    The keys are taken from the first example; each value is the list of
    that key's entries across all examples, in order. An empty batch yields
    an empty dict.
    """
    if not data:
        return {}
    return {key: [example[key] for example in data] for key in data[0]}
def decode_and_encode(output_token_ids: List[torch.Tensor], tokenizer, max_length, de_and_retokenize=True):
    """
    Build a padded encoding from generated token id sequences.

    Args:
        output_token_ids: one token id tensor per sequence.
        tokenizer: tokenizer used for decoding, re-encoding and padding.
        max_length: maximum length passed to the tokenizer.
        de_and_retokenize: when True, decode the ids to text (skipping special
            tokens) and tokenize that text again with padding and truncation;
            when False, left-pad the given ids directly with an all-ones
            attention mask.

    Returns:
        The tokenizer's output encoding. In the retokenize path it is moved to
        the device of the first input tensor.
    """
    if de_and_retokenize:
        # Work from the ids that were passed in rather than from module-level
        # state, so the function can be called outside the training loop.
        texts = tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)
        output_encoding = tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
            max_length=max_length,
        )
        if len(output_token_ids) > 0:
            output_encoding = output_encoding.to(output_token_ids[0].device)
        return output_encoding

    default_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        full_response_mask = [torch.ones_like(element) for element in output_token_ids]
        full_response_encoding = {"input_ids": output_token_ids, "attention_mask": full_response_mask}
        output_encoding = tokenizer.pad(
            full_response_encoding,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
    finally:
        # Restore the caller's padding side even if padding fails.
        tokenizer.padding_side = default_padding_side

    return output_encoding
def create_and_prepare_gold_model(script_args, accelerator):
    """
    Load the gold reward model as a sequence classifier.

    Args:
        script_args: parsed script arguments; the `gold_*` fields select the
            checkpoint, quantization and dtype.
        accelerator: accelerator used to pick the local device for quantized
            loading and to prepare the model.

    Returns:
        The prepared gold model in eval mode, with EOS used as the pad token
        id when the config has none.

    Raises:
        ValueError: if both `gold_in_8bit` and `gold_in_4bit` are set.
    """
    if script_args.gold_in_8bit and script_args.gold_in_4bit:
        # Same guard as the policy-model loader applies to its own flags.
        raise ValueError("You can't load the gold model in 8 bits and 4 bits at the same time")

    if script_args.gold_in_8bit or script_args.gold_in_4bit:
        gold_quantization_config = BitsAndBytesConfig(
            load_in_8bit=script_args.gold_in_8bit, load_in_4bit=script_args.gold_in_4bit
        )
        gold_device_map = {"": accelerator.local_process_index}
    else:
        gold_device_map = None
        gold_quantization_config = None

    if script_args.gold_bf16:
        torch_dtype = torch.bfloat16
    elif script_args.gold_fp16:
        torch_dtype = torch.float16
    else:
        torch_dtype = torch.float32

    gold_model = AutoModelForSequenceClassification.from_pretrained(
        script_args.gold_model_name,
        quantization_config=gold_quantization_config,
        torch_dtype=torch_dtype,
        device_map=gold_device_map,
    )

    if getattr(gold_model.config, "pad_token_id", None) is None:
        gold_model.config.pad_token_id = gold_model.config.eos_token_id

    gold_model = accelerator.prepare(gold_model)
    gold_model.eval()

    return gold_model
def create_and_prepare_eval(args, tokenizer, accelerator):
    """
    Build the evaluation dataloader over `args.eval_split`.

    The split is loaded untokenized (optionally with prompts stripped of
    surrounding whitespace), batched with `args.batch_size`, and passed
    through `accelerator.prepare`. `tokenizer` is accepted but not used here.
    """
    eval_dataset = load_dataset(args.dataset_name, split=args.eval_split)

    if args.strip_prompt:

        def strip_prompt(examples):
            examples["prompt"] = [prompt.strip() for prompt in examples["prompt"]]
            return examples

        eval_dataset = eval_dataset.map(strip_prompt, batched=True)

    # No custom collate function is supplied; a PromptCollator-based one was
    # left commented out in the original.
    eval_loader = DataLoader(eval_dataset, batch_size=args.batch_size)
    prepared_loader = accelerator.prepare(eval_loader)
    return prepared_loader
def _pad_to_length(tensor, length, pad_value):
    """Right-pad the last dimension of `tensor` with `pad_value` up to `length`."""
    missing = length - tensor.size(-1)
    if missing <= 0:
        return tensor
    filler = tensor.new_full((*tensor.shape[:-1], missing), pad_value)
    return torch.cat([tensor, filler], dim=-1)


def get_batch_samples(
    accelerator,
    model,
    tokenizer,
    input_ids,
    attention_mask,
    return_ids=False,
    generation_config=None,
    max_length=None,
):
    """
    Generate from the policy and from the adapter-disabled reference.

    The reference generation runs the same model inside the
    `disable_adapter()` context of the unwrapped model.

    Args:
        accelerator: used to unwrap `model` before disabling its adapter.
        model: model exposing `generate`.
        tokenizer: used for `pad_token_id` and for decoding.
        input_ids, attention_mask: prompt batch passed to `generate`.
        return_ids: also return the (possibly padded) policy output ids.
        generation_config: forwarded to `generate`.
        max_length: when given, both outputs are right-padded with the pad
            token to this length; when None, outputs are left as generated.

    Returns:
        `(policy_texts, reference_texts)`, plus the policy output ids as a
        third element when `return_ids` is True.
    """
    policy_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=generation_config,
    )

    with accelerator.unwrap_model(model).disable_adapter():
        reference_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
        )

    if max_length is not None:
        policy_output = _pad_to_length(policy_output, max_length, tokenizer.pad_token_id)
        reference_output = _pad_to_length(reference_output, max_length, tokenizer.pad_token_id)

    policy_output_decoded = tokenizer.batch_decode(policy_output, skip_special_tokens=True)
    reference_output_decoded = tokenizer.batch_decode(reference_output, skip_special_tokens=True)

    if return_ids:
        return policy_output_decoded, reference_output_decoded, policy_output
    return policy_output_decoded, reference_output_decoded
def gold_eval(dataloader, model, gold_model, accelerator, epoch, log_n_samples_during_eval=0):
    """
    Score greedy policy generations with the gold reward model.

    For every batch the prompts are tokenized, completed greedily through the
    PPO trainer, and the prompt + response text is scored by `gold_model`.

    NOTE: relies on the module-level `tokenizer`, `script_args` and
    `ppo_trainer` created in the `__main__` section; `model` is accepted for
    interface compatibility but generation goes through `ppo_trainer`.

    Args:
        dataloader: yields batches with a "prompt" entry (list of strings).
        model: unused, see note above.
        gold_model: sequence classifier whose first output is the reward.
        accelerator: used for device placement, gathering and logging.
        epoch: value logged under "epoch".
        log_n_samples_during_eval: maximum number of (prompt, response) pairs
            to collect for logging.

    Returns:
        `(mean_gold_reward, samples_to_log)`. The mean is NaN when no sample
        was scored.
    """
    samples_to_log = []
    gold_reward_sum = 0.0
    total_samples = 0

    greedy_generation_kwargs = {
        "min_length": -1,
        "top_p": 1.0,
        "do_sample": False,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "max_new_tokens": script_args.output_max_length,
    }

    for batch in tqdm(
        dataloader,
        disable=not accelerator.is_local_main_process,
        desc="Gold Eval",
    ):
        prompts = batch["prompt"]

        # The eval dataloader carries raw text, so prompts are tokenized here,
        # one unpadded tensor per prompt, mirroring the training loop's input.
        question_tensors = [
            tokenizer(
                prompt,
                truncation=True,
                max_length=script_args.input_max_length,
                return_tensors="pt",
            )["input_ids"][0].to(accelerator.device)
            for prompt in prompts
        ]

        full_response_tensors = ppo_trainer.generate(
            question_tensors,
            return_prompt=True,
            **greedy_generation_kwargs,
        )

        # Drop the prompt tokens to keep only the generated continuation.
        response_tensors = [
            full_response[len(question) :]
            for question, full_response in zip(question_tensors, full_response_tensors)
        ]
        responses = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)
        batch["response"] = responses

        texts = [q + r for q, r in zip(prompts, responses)]
        policy_output = tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False
        ).to(accelerator.device)

        # gold reward
        with torch.no_grad():
            gold_rewards = gold_model(
                input_ids=policy_output["input_ids"], attention_mask=policy_output["attention_mask"]
            )[0]

        gold_rewards = accelerator.gather_for_metrics(gold_rewards)

        # Accumulated on every process so each one can return the mean.
        gold_reward_sum += gold_rewards.sum().item()
        total_samples += gold_rewards.size(0)

        if accelerator.is_local_main_process:
            for prompt, resp in zip(prompts, responses):
                if len(samples_to_log) >= log_n_samples_during_eval:
                    break
                samples_to_log.append([prompt, resp])

    mean_gold_reward = gold_reward_sum / total_samples if total_samples else float("nan")

    if accelerator.is_local_main_process:
        print(f"gold reward mean {mean_gold_reward}")
        gold_log = {
            "eval/gold_rewards_mean": mean_gold_reward,
            "epoch": epoch,
        }
        if samples_to_log:
            # Each logged row holds exactly a prompt and a policy response.
            gold_log["game_log"] = wandb.Table(
                columns=["Prompt", "Policy"],
                data=samples_to_log,
            )
        accelerator.log(gold_log)

    return mean_gold_reward, samples_to_log
if __name__ == "__main__":
    # Script entry point: parse arguments, build the PPO trainer, optionally
    # run a gold-model evaluation only, otherwise run the PPO training loop.
    import os

    parser = HfArgumentParser(ScriptArguments)
    script_args: ScriptArguments = parser.parse_args_into_dataclasses()[0]

    if script_args.eval_steps is not None and script_args.gold_model_name is None:
        # Periodic evaluation scores generations with the gold model.
        raise ValueError("`eval_steps` requires `gold_model_name` to be set")

    config = PPOConfig(
        steps=script_args.steps,
        model_name=script_args.model_name,
        learning_rate=script_args.learning_rate,
        log_with=script_args.log_with,
        batch_size=script_args.batch_size,
        mini_batch_size=script_args.mini_batch_size,
        gradient_accumulation_steps=script_args.gradient_accumulation_steps,
        optimize_cuda_cache=True,
        early_stopping=script_args.early_stopping,
        target_kl=script_args.target_kl,
        ppo_epochs=script_args.ppo_epochs,
        seed=script_args.seed,
        init_kl_coef=script_args.init_kl_coef,
        adap_kl_ctrl=script_args.adap_kl_ctrl,
        accelerator_kwargs={"kwargs_handlers": [DistributedDataParallelKwargs(find_unused_parameters=False)]},
    )

    # set seed before initializing value head for deterministic eval
    set_seed(config.seed)

    model, tokenizer = create_and_prepare_model(script_args)
    train_dataset = create_and_prepare_dataset(script_args, tokenizer, script_args.train_split)

    # We then build the PPOTrainer, passing the model, the reference model, the tokenizer
    ppo_trainer = PPOTrainer(
        config,
        model,
        ref_model=None,
        tokenizer=tokenizer,
        dataset=train_dataset,
        data_collator=collator,
    )
    device = ppo_trainer.accelerator.device

    # Gold Model Eval
    gold_model = None
    if script_args.gold_model_name is not None:
        gold_model = create_and_prepare_gold_model(script_args, ppo_trainer.accelerator)
        eval_dataloader = create_and_prepare_eval(script_args, tokenizer, ppo_trainer.accelerator)

        if script_args.just_eval:
            gold_eval(
                eval_dataloader,
                ppo_trainer.model,
                gold_model,
                ppo_trainer.accelerator,
                epoch=0,
                log_n_samples_during_eval=0,
            )
            raise SystemExit(0)

    if script_args.separate_reward_model:
        # NOTE(review): the pipeline is built but its use in the loop below
        # was commented out in the original; rewards come from the trainer's
        # reward adapter instead.
        sentiment_pipe = pipeline(
            "sentiment-analysis",
            model=script_args.separate_reward_model,
            device_map={"": ppo_trainer.accelerator.local_process_index},
            model_kwargs={"load_in_8bit": True},
            tokenizer=tokenizer,
            return_token_type_ids=False,
        )
        sent_kwargs = {
            "return_all_scores": True,
            "function_to_apply": "none",
            "batch_size": 16,
            "truncation": True,
        }

    # We then define the arguments to pass to the `generate` function. These arguments
    # are passed to the `generate` function of the PPOTrainer, which is a wrapper around
    # the `generate` function of the trained model.
    generation_kwargs = {
        "min_length": -1,
        "top_k": 0.0,
        "top_p": 1.0,
        "do_sample": True,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    # Greedy settings used for the periodic gold evaluation; built once
    # because they do not change between steps.
    greedy_generation_kwargs = {
        "min_length": -1,
        "top_p": 1.0,
        "do_sample": False,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "max_new_tokens": script_args.output_max_length,
    }
    output_length_sampler = LengthSampler(script_args.output_min_length, script_args.output_max_length)

    for epoch, batch in tqdm(
        enumerate(ppo_trainer.dataloader),
        total=config.total_ppo_epochs,
        disable=not ppo_trainer.accelerator.is_local_main_process,
    ):
        if epoch >= config.total_ppo_epochs:
            break

        question_tensors = batch["input_ids"]

        full_response_tensors = ppo_trainer.generate(
            question_tensors,
            return_prompt=True,
            length_sampler=output_length_sampler,
            **generation_kwargs,
        )

        # Keep only the generated continuation of each prompt.
        response_tensors = [
            full_response[len(question) :]
            for question, full_response in zip(question_tensors, full_response_tensors)
        ]
        batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

        # The reward model scores the re-tokenized prompt + response text.
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        policy_output = tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False
        ).to(device)

        raw_rewards = ppo_trainer.compute_reward_model_score(**policy_output)
        rewards = [raw_reward - script_args.reward_baseline for raw_reward in raw_rewards]

        # Run PPO step
        if not script_args.just_eval:
            stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
        else:
            stats = {}

        gold_rewards = None
        if script_args.eval_steps is not None and epoch % script_args.eval_steps == 0:
            if script_args.gold_eval_greedy:
                greedy_output = ppo_trainer.generate(
                    question_tensors,
                    return_prompt=True,
                    **greedy_generation_kwargs,
                )
                max_length = script_args.input_max_length + script_args.output_max_length
                greedy_texts = tokenizer.batch_decode(greedy_output, skip_special_tokens=True)
                # The gold model needs tensors, so the decoded text is
                # tokenized again before scoring.
                policy_output = tokenizer(
                    greedy_texts,
                    padding=True,
                    truncation=True,
                    max_length=max_length,
                    return_tensors="pt",
                    return_token_type_ids=False,
                ).to(device)

            # Without greedy eval, the sampled `policy_output` above is scored.
            with torch.no_grad():
                gold_rewards = gold_model(**policy_output)[0]

        stats["epoch"] = epoch
        ppo_trainer.log_stats(stats, batch, rewards, gold_rewards)

        if script_args.save_strategy != "no" and epoch > 0 and epoch % script_args.save_steps == 0:
            # Join path components so `output_dir` works with or without a
            # trailing separator.
            ppo_trainer.save_pretrained(os.path.join(script_args.output_dir, f"step_{epoch}"))