import datasets
from transformers import AutoTokenizer

# Load the existing GPT-2 tokenizer; its algorithm (byte-level BPE), normalization and
# special tokens are reused as the template for training on the new corpus.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Load a dataset previously written to disk with save_to_disk(); the script below
# only requires it to expose a "text" column.
input_dir = "/dataset/location"
dataset = datasets.load_from_disk(input_dir)
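# For reference, a hedged sketch (not part of the original script) of how such an on-disk
# dataset could be produced beforehand; "some_corpus" is a hypothetical dataset name.
# raw = datasets.load_dataset("some_corpus", split="train")
# raw.save_to_disk(input_dir)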
# Stream the corpus to the trainer in batches of 10,000 examples so the whole
# dataset never has to be held in memory at once.
def get_training_corpus():
    for start_idx in range(0, len(dataset), 10000):
        samples = dataset[start_idx : start_idx + 10000]
        yield samples["text"]
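# Optional sanity check, a minimal sketch not in the original script: peek at the first
# batch to confirm the iterator yields lists of raw strings, which is what
# train_new_from_iterator() expects.
first_batch = next(get_training_corpus())
print(type(first_batch), len(first_batch))  # e.g. <class 'list'> 10000
print(first_batch[0][:80])                  # start of the first document in the batch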
print("start") | |
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=50000) | |
print("end") | |
# save_pretrained() writes the vocabulary, merges and tokenizer config together, so the
# result can be reloaded with AutoTokenizer.from_pretrained(); save_vocabulary() would
# only write the raw vocabulary files.
tokenizer.save_pretrained("/tokenizer_location")
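# A minimal usage sketch, assuming the directory saved above: reload the tokenizer through
# AutoTokenizer and compare its output with the original GPT-2 tokenizer on a sample
# sentence to confirm the new vocabulary is actually being used.
new_tokenizer = AutoTokenizer.from_pretrained("/tokenizer_location")
sample = "This is a quick check of the retrained tokenizer."
print(old_tokenizer.tokenize(sample))
print(new_tokenizer.tokenize(sample))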