"""
Explore the tiktoken tokenizer for gpt-3.5-turbo and dump its vocabulary.

References:
https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
https://github.com/openai/tiktoken
Vocabulary definitions: https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
"""
import json

import tiktoken
# Load the encoding that tiktoken associates with the gpt-3.5-turbo model.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Sample Chinese prompt ("Hello, please tell me what polyethylene is").
# Alternative mixed Latin/Chinese sample:
#   "a bcjik今天天气颗粒剂范大将军发卡卡萨"
text = "你好,请告诉我聚乙烯是什么"

# Encode the sample into token ids, then ask the tokenizer for the bytes
# behind those ids, and show both results.
encoding = tokenizer.encode(text)
decoding_bytes = tokenizer.decode_tokens_bytes(encoding)
print(encoding, decoding_bytes, sep="\n")

# To inspect tokens one at a time, decode each id separately, e.g.
# tokenizer.decode([token_id]) for every token_id in `encoding`.

# Exercise the three byte-level decode entry points on token id 10.
# The return values are intentionally discarded; these calls only matter
# when stepping through the script interactively.
tokenizer.decode_tokens_bytes([10])
tokenizer.decode_single_token_bytes(10)
tokenizer.decode_bytes([10])
# Dump the whole vocabulary to "vocab.jsonl", one JSON object per line:
#   {"id": <token id>, "token": <JSON-encoded decoded string, or "null">}
#
# NOTE: the "token" value is itself passed through json.dumps before the
# outer json.dumps, so it is JSON-encoded twice; readers must call
# json.loads on that field a second time to recover the text. This matches
# the original output format and is kept for compatibility.
#
# NOTE(review): the original carried the comment "100255" here - presumably
# the highest regular (non-special) token id of this encoding; confirm.
#
# The `with` block guarantees the file is flushed and closed even if the
# loop fails part-way through (the original never closed the handle).
with open("vocab.jsonl", "w", encoding="utf-8") as f_out:
    for i in range(tokenizer.n_vocab):
        # Alternatives for raw bytes instead of text:
        # tokenizer.decode_bytes / tokenizer.decode_single_token_bytes.
        try:
            token_str = tokenizer.decode([i])
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user interrupt or interpreter shutdown.
            raise
        except BaseException:
            # Best effort: ids that cannot be decoded are recorded as null.
            # NOTE(review): kept this broad on purpose - some tiktoken
            # releases reportedly signal unknown ids with a pyo3
            # PanicException, which derives from BaseException; verify
            # before narrowing to `except Exception` / `except KeyError`.
            token_str = None
        f_out.write(json.dumps({"id": i, "token": json.dumps(token_str)}) + "\n")