# bertbuildtokenizer.py
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast
from tokenizers.processors import TemplateProcessing
import os
import json
def build_tokenizer(files):
    assert isinstance(files, list) and len(files) > 0
    # Build a word-level tokenizer, i.e. tokenize sentences by whitespace.
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    trainer = WordLevelTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train(files, trainer)
    return tokenizer
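# Usage sketch (hedged): "toy_corpus.txt" is a hypothetical whitespace-separated
# text file, not one of the corpora used further down in this script.
#
#   toy_tok = build_tokenizer(["toy_corpus.txt"])
#   enc = toy_tok.encode("TP53 KRAS EGFR")
#   print(enc.tokens, enc.ids)  # whole-word tokens; anything unseen maps to "[UNK]"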
def tokenizer_from_file(tokenizer_file):
    tokenizer = Tokenizer.from_file(tokenizer_file)
    #sentinel_tokens = [(f"<extra_id_{i}>", tokenizer.token_to_id(f"<extra_id_{i}>")) for i in range(100)]
    # For BERT, we want the tokenizer to automatically add special tokens such as "[CLS]" and "[SEP]".
    # GPT does not require [CLS] and [SEP] at pretraining, while BERT does.
    # https://swethatanamala.github.io/2018/12/24/summary-of-bert-paper/
    # GPT converges faster with [BOS] and [EOS] added than without them.
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",  # BERT
        ##single="[BOS] $A [EOS]",  # GPT
        ##single="$A </s>",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[PAD]", tokenizer.token_to_id("[PAD]")),
            ("[UNK]", tokenizer.token_to_id("[UNK]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[MASK]", tokenizer.token_to_id("[MASK]")),
        ],
    )
    # Instantiate with a tokenizer object
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer, model_max_length=512,
        pad_token='[PAD]', unk_token='[UNK]', cls_token='[CLS]',
        sep_token='[SEP]', mask_token='[MASK]')
    return tokenizer
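# Behaviour sketch (hedged), runnable once "bert.json" has been written further
# down: the TemplateProcessing step wraps every encoded sequence in the BERT
# template, so the ids start with [CLS] and end with [SEP].
#
#   tk = tokenizer_from_file("bert.json")
#   ids = tk("TP53 KRAS")["input_ids"]
#   tk.convert_ids_to_tokens(ids)  # roughly ['[CLS]', 'TP53', 'KRAS', '[SEP]']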
if not os.path.exists("tmp.json"):
    tokenizer = build_tokenizer(files=["gene_rank_merge_2021Aug25.txt", "../t5/t5finetune_data_flat.csv"])
    tokenizer.save("tmp.json")
d = json.load(open("tmp.json"))
#for i in range(7, 107):
#    d['added_tokens'].append({'id':i, 'special': True, 'content': f"<extra_id_{i-7}>",'single_word': False,'lstrip': False,'rstrip': False,'normalized': False})
# Vocabulary ids are contiguous from 0 to vmax (checked by the assert below);
# append 100 reserved "unused" slots immediately after the current maximum id.
vmax = 0
for k, v in d['model']['vocab'].items():
    if v > vmax:
        vmax = v
assert vmax + 1 == len(d['model']['vocab'])
for i in range(0, 100):
    ##d['model']['vocab'][f"extra_id_{i}"] = vmax + 1 + i
    d['model']['vocab'][f"unused{i}"] = vmax + 1 + i
with open('bert.json', 'w') as f:
    json.dump(d, f)
tk = tokenizer_from_file("bert.json")
tk.save_pretrained("berttokenizer")
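# Sanity-check sketch (hedged): reload the directory saved above and confirm the
# BERT template is applied. The gene symbols below are illustrative only; symbols
# absent from the training corpus show up as '[UNK]'.
reloaded = PreTrainedTokenizerFast.from_pretrained("berttokenizer")
enc = reloaded("TP53 KRAS", "EGFR BRAF")  # single and paired inputs both get the template
print(reloaded.convert_ids_to_tokens(enc["input_ids"]))
# Expected along the lines of ['[CLS]', 'TP53', 'KRAS', '[SEP]', 'EGFR', 'BRAF', '[SEP]'].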