import datasets
from transformers import AutoTokenizer

# Start from the pretrained GPT-2 tokenizer and retrain its byte-level BPE
# vocabulary on our own corpus.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load the preprocessed dataset that was previously saved to disk.
input_dir = "/dataset/location"
dataset = datasets.load_from_disk(input_dir)


def get_training_corpus():
    """Yield the corpus in batches of 10,000 texts to keep memory usage low."""
    for start_idx in range(0, len(dataset), 10000):
        samples = dataset[start_idx : start_idx + 10000]
        yield samples["text"]


print("start")
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=50000)
print("end")

# Write the new vocabulary files (vocab.json and merges.txt for GPT-2-style BPE).
tokenizer.save_vocabulary("/tokenizer_location")
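
# A minimal sanity check (sketch, not part of the original script): compare how
# the old and new tokenizers split the same text. The sample sentence below is
# an illustrative assumption; a tokenizer retrained on in-domain text should
# generally produce fewer subword pieces for such text.
sample = "An example sentence from the training domain."
print("old:", old_tokenizer.tokenize(sample))
print("new:", tokenizer.tokenize(sample))

# Note: save_vocabulary writes only the vocabulary files; if the tokenizer should
# later be reloaded with AutoTokenizer.from_pretrained, saving the full tokenizer
# with tokenizer.save_pretrained("/tokenizer_location") is the more complete option.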