import os

import datasets
import torch
import transformers
from tokenizers.processors import TemplateProcessing
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    GPT2TokenizerFast,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
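# Train a causal language model from scratch with the Hugging Face Trainer:
# load a custom GPT-2-style BPE tokenizer, build the model from
# config_large_bpe.json, and train on a dataset that was already tokenized
# and saved to disk with `datasets`, resuming from the last checkpoint in the
# output directory if one exists.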
def main():
    # Placeholder paths; point these at the actual config, tokenizer,
    # dataset, and output locations.
    config_name = "config_large_bpe.json"
    tokenizer_files = "/path/to/tokenizer/files"
    input_dir = "/data/dir"
    output_dir = "/out/dir"

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=2.067e-5,
        lr_scheduler_type="linear",
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        gradient_accumulation_steps=32,  # effective batch: 4 x 32 = 128 sequences per device per step
        num_train_epochs=6.7,
        save_total_limit=2,
        dataloader_num_workers=10,
        save_steps=100,
        warmup_steps=1000,
        do_eval=True,
        eval_steps=1000,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=100,
        bf16=True,  # bf16/tf32 assume an Ampere-or-newer NVIDIA GPU
        tf32=True,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        bf16_full_eval=True,
    )

print("setting up tokenizer...")
|
|
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_files)
|
|
|
|
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
|
|
from tokenizers.processors import TemplateProcessing
|
|
tokenizer._tokenizer.post_processor = TemplateProcessing(
|
|
single="$0 "+tokenizer.eos_token,
|
|
pair="$A "+tokenizer.eos_token+" $B:1 "+tokenizer.eos_token,
|
|
special_tokens=[(tokenizer.eos_token, 0)],
|
|
)
|
|
|
|
print("loading model...")
|
|
config = AutoConfig.from_pretrained(config_name)
|
|
model = AutoModelForCausalLM.from_config(config)
|
|
|
|
model.gradient_checkpointing_enable()
|
|
print("loading data...")
|
|
dataset = datasets.load_from_disk(input_dir)
|
|
|
|
print("starting training...")
|
|
trainer = Trainer(
|
|
model=model,
|
|
args=training_args,
|
|
train_dataset=dataset["train"],
|
|
data_collator=default_data_collator,
|
|
eval_dataset=dataset["test"].select(range(10000)),
|
|
tokenizer=tokenizer
|
|
)
|
|
    # Resume from the most recent checkpoint in output_dir if one exists
    # (get_last_checkpoint fails on a missing directory, so guard the first run).
    checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
    print("checkpoint:", checkpoint)
    trainer.train(resume_from_checkpoint=checkpoint)

if __name__ == "__main__":
    main()
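# Typical launch, single node with N GPUs (script name and GPU count are
# placeholders, adjust to your setup):
#   torchrun --nproc_per_node=N train_lm.py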