# bertbuildtokenizer.py
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
from tokenizers.processors import TemplateProcessing
import os
import json
def build_tokenizer(files):
    assert isinstance(files, list) and len(files) > 0
    # Build a word-level tokenizer, i.e. tokenize sentences by whitespace.
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    trainer = WordLevelTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train(files, trainer)
    return tokenizer
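# Usage sketch (hedged): "toy_corpus.txt" is a hypothetical whitespace-separated
# text file, not one of the corpora used further down in this script.
#
#   toy_tok = build_tokenizer(["toy_corpus.txt"])
#   enc = toy_tok.encode("TP53 KRAS EGFR")
#   print(enc.tokens, enc.ids)  # whole-word tokens; anything unseen maps to "[UNK]"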
def tokenizer_from_file(tokenizer_file):
    tokenizer = Tokenizer.from_file(tokenizer_file)
    #sentinel_tokens = [(f"<extra_id_{i}>", tokenizer.token_to_id(f"<extra_id_{i}>")) for i in range(100)]
    # For BERT, we want the tokenizer to automatically add special tokens such as "[CLS]" and "[SEP]".
    # GPT does not require [CLS] and [SEP] at pretraining, while BERT does.
    # https://swethatanamala.github.io/2018/12/24/summary-of-bert-paper/
    # GPT converges faster with [BOS] and [EOS] added than without them.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",  # BERT
        ##single="[BOS] $A [EOS]",  # GPT
        ##single="$A </s>",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[PAD]", tokenizer.token_to_id("[PAD]")),
            ("[UNK]", tokenizer.token_to_id("[UNK]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[MASK]", tokenizer.token_to_id("[MASK]")),
        ],
    )
    # Instantiate with a tokenizer object
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer, model_max_length=512,
        pad_token='[PAD]', unk_token='[UNK]', cls_token='[CLS]',
        sep_token='[SEP]', mask_token='[MASK]')
    return tokenizer
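# Behaviour sketch (hedged), runnable once "bert.json" has been written further
# down: the TemplateProcessing step wraps every encoded sequence in the BERT
# template, so the ids start with [CLS] and end with [SEP].
#
#   tk = tokenizer_from_file("bert.json")
#   ids = tk("TP53 KRAS")["input_ids"]
#   tk.convert_ids_to_tokens(ids)  # roughly ['[CLS]', 'TP53', 'KRAS', '[SEP]']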
if not os.path.exists("tmp.json"):
    tokenizer = build_tokenizer(files=["gene_rank_merge_2021Aug25.txt", "../t5/t5finetune_data_flat.csv"])
    tokenizer.save("tmp.json")
d = json.load(open("tmp.json"))
#for i in range(7, 107):
#    d['added_tokens'].append({'id':i, 'special': True, 'content': f"<extra_id_{i-7}>",'single_word': False,'lstrip': False,'rstrip': False,'normalized': False})
# Vocabulary ids are contiguous from 0 to vmax (checked by the assert below);
# append 100 reserved "unused" slots immediately after the current maximum id.
vmax = 0
for k, v in d['model']['vocab'].items():
    if v > vmax:
        vmax = v
assert vmax + 1 == len(d['model']['vocab'])
for i in range(0, 100):
    ##d['model']['vocab'][f"extra_id_{i}"] = vmax + 1 + i
    d['model']['vocab'][f"unused{i}"] = vmax + 1 + i
with open('bert.json', 'w') as f:
    json.dump(d, f)
tk = tokenizer_from_file("bert.json")
tk.save_pretrained("berttokenizer")
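# Sanity-check sketch (hedged): reload the directory saved above and confirm the
# BERT template is applied. The gene symbols below are illustrative only; symbols
# absent from the training corpus show up as '[UNK]'.
reloaded = PreTrainedTokenizerFast.from_pretrained("berttokenizer")
enc = reloaded("TP53 KRAS", "EGFR BRAF")  # single and paired inputs both get the template
print(reloaded.convert_ids_to_tokens(enc["input_ids"]))
# Expected along the lines of ['[CLS]', 'TP53', 'KRAS', '[SEP]', 'EGFR', 'BRAF', '[SEP]'].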