"""Train a byte-level BPE tokenizer on the Norwegian OSCAR corpus and save it to disk."""

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
from .utils import model_dir

dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train") |
|
|
|
|
|
# Instantiate an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()


def batch_iterator(batch_size=1000):
    """Yield the dataset's text column in batches so the corpus is processed incrementally."""
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]


# Train with a RoBERTa-style vocabulary size and special tokens.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Save the trained tokenizer to disk.
tokenizer_path = model_dir / "tokenizer.json"
tokenizer.save(str(tokenizer_path))
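
# Optional sanity check: reload the file we just wrote and encode a short sample.
# This is only an illustrative sketch; the Norwegian sentence below is an arbitrary
# example, and the check can be removed without affecting training.
from tokenizers import Tokenizer

reloaded_tokenizer = Tokenizer.from_file(str(tokenizer_path))
print(reloaded_tokenizer.encode("Dette er en test.").tokens)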