Spaces:

yhavinga
/

dutch-tokenizer-arena

Running

update

9495a4f about 1 year ago

1.18 kB


	"""

	## Instruction special tokens

	{"token_id": 29961, "decode_str": "[", "token": "["}
	{"token_id": 25580, "decode_str": "INST", "token": "INST"}
	{"token_id": 29962, "decode_str": "]", "token": "]"}

	{"token_id": 3532, "decode_str": "<<", "token": "▁<<"}
	{"token_id": 14816, "decode_str": "SY", "token": "SY"}
	{"token_id": 29903, "decode_str": "S", "token": "S"}
	{"token_id": 6778, "decode_str": ">>", "token": ">>"}

	{"token_id": 13, "decode_str": "\n", "token": "<0x0A>"}

	Question: why aren't <<SYS>>, <</SYS>>, [INST] and [/INST] each mapped to a single token id?
	"""

	import os
	from transformers import LlamaTokenizer
	from vocab import TokenizerType, TokenizerImpl

	# Absolute directory containing this module; the tokenizer files are read
	# from its "tokenizer" sub-directory.
	CURRENT_DIR = os.path.split(os.path.abspath(__file__))[0]
	TOKENIZER_DIR = os.path.join(CURRENT_DIR, "tokenizer")

	# Load the tokenizer from the local directory resolved above.
	tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_DIR)

	# Attach descriptive metadata attributes to the loaded tokenizer object.
	tokenizer.parent = ""
	tokenizer.type = TokenizerType.ByteBPE.name
	# See https://github.com/facebookresearch/llama/blob/main/llama/tokenizer.py
	tokenizer.implementation = TokenizerImpl.SentencePiece.name
	tokenizer.comments = (
		"split all numbers into individual digits, "
		"and fallback to bytes to decompose unknown UTF-8 characters"
	)