gpt-fi/data/train_tokenizer.py
import datasets
from transformers import AutoTokenizer

# Start from the pretrained GPT-2 tokenizer; its fast (Rust-backed) implementation
# provides train_new_from_iterator for retraining the BPE vocabulary on a new corpus.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Load the preprocessed dataset previously saved to disk.
input_dir = "/dataset/location"
dataset = datasets.load_from_disk(input_dir)
def get_training_corpus():
    # Stream the corpus in slices of 10,000 examples so the whole dataset
    # never has to be materialized in memory at once.
    for start_idx in range(0, len(dataset), 10000):
        samples = dataset[start_idx : start_idx + 10000]
        yield samples["text"]
print("start")
# Retrain the GPT-2 BPE tokenizer on the new corpus with a 50,000-token vocabulary.
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=50000)
print("end")
# Write the trained vocab.json and merges.txt; save_pretrained would additionally
# store the tokenizer configuration alongside them.
tokenizer.save_vocabulary("/tokenizer_location")
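# Usage sketch (an assumption, not part of the original script): the vocab.json and
# merges.txt written above can typically be loaded back with the GPT-2 fast tokenizer
# class. The path mirrors the placeholder used above.
#
#     from transformers import GPT2TokenizerFast
#     new_tokenizer = GPT2TokenizerFast.from_pretrained("/tokenizer_location")
#     print(new_tokenizer.tokenize("Tämä on esimerkkilause."))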