from datasets import load_dataset

# Local helper module (t5_tokenizer_model.py) defining a SentencePiece
# Unigram tokenizer compatible with T5.
from t5_tokenizer_model import SentencePieceUnigramTokenizer

# Tokenizer training configuration
vocab_size = 50_000
input_sentence_size = None  # None -> train on the full dataset
model_dir = "./"  # directory where tokenizer.json is written

# Load the training corpus: the Norwegian Colossal Corpus validation split plus
# a small file of special characters, both as JSON lines.
dataset = load_dataset(
    "json",
    data_files=[
        "/mnt/disks/flaxdisk/corpus/norwegian_colossal_corpus_validation.json",
        "/mnt/disks/flaxdisk/corpus/special_chars.json",
    ],
    split="train",
)

tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")


# Build an iterator that yields the "text" column in batches of 100 examples.
def batch_iterator(input_sentence_size=None):
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset[i : i + batch_length]["text"]
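

# Optional sanity check (not part of the original script): peek at the first
# batch to confirm the iterator yields lists of raw text strings.
first_batch = next(batch_iterator())
print(f"First batch: {len(first_batch)} texts, e.g. {first_batch[0][:80]!r}")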


tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

tokenizer.save(f"{model_dir}/tokenizer.json")
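
# Hedged usage sketch (not part of the original script): reload the saved
# tokenizer.json with the `tokenizers` library and encode a sample Norwegian
# sentence to verify the file round-trips.
from tokenizers import Tokenizer

loaded_tokenizer = Tokenizer.from_file(f"{model_dir}/tokenizer.json")
encoding = loaded_tokenizer.encode("Dette er en test.")
print(encoding.tokens)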