Spaces:

yhavinga
/

dutch-tokenizer-arena

Running

fix chatglm; new feature about add_special_tokens;

d27a756 9 months ago

1.09 kB

	"""
	https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

	https://github.com/openai/tiktoken

	Vocabulary path: https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py

	"""

	import json
	import tiktoken


	# Load the encoding that tiktoken associates with the gpt-3.5-turbo model.
	tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

	# Sample input (Chinese). An alternative mixed ASCII/Chinese sample:
	#   "a bcjik今天天气颗粒剂范大将军发卡卡萨"
	text = "你好，请告诉我聚乙烯是什么"

	# Token ids for the sample, then the byte pieces obtained from those ids.
	encoding = tokenizer.encode(text)
	decoding_bytes = tokenizer.decode_tokens_bytes(encoding)

	# Ids first, byte pieces second, each on its own line.
	print(encoding, decoding_bytes, sep="\n")

	# for token in encoding:
	#     token_str = tokenizer.decode([token])
	#     print(token, token_str, json.dumps(token_str))


	# Scratch calls: exercise three byte-level decode methods on token id 10.
	# The return values are discarded, so these lines only matter if a call raises.
	# NOTE(review): presumably left over from interactive exploration — confirm
	# before removing.
	tokenizer.decode_tokens_bytes([10])
	tokenizer.decode_single_token_bytes(10)
	tokenizer.decode_bytes([10])


	# Dump the vocabulary to vocab.jsonl: one JSON object per line for every id
	# in range(tokenizer.n_vocab).
	# The context manager closes (and therefore flushes) the file even when a
	# write fails; the original handle was never closed.
	with open("vocab.jsonl", "w", encoding="utf-8") as f_out:
		for i in range(tokenizer.n_vocab):
			try:
				token_str = tokenizer.decode([i])
			except (KeyboardInterrupt, SystemExit):
				# Let the user (or the interpreter) abort a long-running dump.
				raise
			except BaseException:
				# Best effort: an id that cannot be decoded is recorded as None.
				# NOTE(review): kept deliberately broad. Depending on the
				# tiktoken release, an unused id may surface as KeyError or as a
				# pyo3 PanicException (a BaseException subclass) — confirm
				# against the installed version before narrowing.
				token_str = None
			# "token" is JSON-encoded on its own before the record is
			# serialized, so None is stored as the string "null". Kept as-is so
			# existing consumers of vocab.jsonl see the same format.
			f_out.write(json.dumps({"id": i, "token": json.dumps(token_str)}) + "\n")