Text Generation
Transformers
PyTorch
Safetensors
Finnish
llama
finnish
text-generation-inference
File size: 619 Bytes
a971b09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import datasets
from transformers import AutoTokenizer

# Load the previously saved dataset from local disk.
dataset = datasets.load_from_disk("/researchdisk/lm_training_dataset_v2_filtered")

# Keep a 2% sample of the "train" split for tokenizer training; the sample is
# then available as dataset["train"]. The fixed seed makes the sampled subset
# (and therefore the vocabulary learned from it) the same on every run
# instead of changing with each invocation.
dataset = dataset["train"].train_test_split(train_size=0.02, seed=42)

# Existing tokenizer used as the starting point for training the new one.
old_tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b_700bt_preview")

def get_training_corpus(batch_size=1000, split=None):
    """Return a lazy iterator over batches of training texts.

    Each item produced is the ``"text"`` field of ``batch_size`` consecutive
    rows of ``split`` (the final batch may be shorter). Batches are sliced
    on demand, one at a time, as the iterator is consumed.

    Args:
        batch_size: Number of rows per batch. Defaults to 1000.
        split: Object supporting ``len()`` and slice indexing, where a slice
            can be indexed with ``"text"``. Defaults to the module-level
            ``dataset["train"]``.

    Returns:
        A single-use generator of text batches.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Fail early: a zero step would raise a less helpful error from range(),
    # and a negative step would silently produce no batches at all.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    if split is None:
        split = dataset["train"]

    return (
        split[start : start + batch_size]["text"]
        for start in range(0, len(split), batch_size)
    )


# Lazy stream of text batches that the tokenizer trainer will consume.
training_corpus = get_training_corpus()

# Options for the new tokenizer. vocab_size is the target vocabulary size;
# min_frequency is passed through as an extra keyword argument (presumably
# consumed by the underlying trainer -- confirm against the transformers docs).
training_options = {
    "vocab_size": 64256,
    "min_frequency": 2,
}

# Train a new tokenizer from old_tokenizer on the streamed corpus.
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, **training_options)

# Save the trained tokenizer into the current working directory.
tokenizer.save_pretrained("./")