|
""" |
|
https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

https://github.com/openai/tiktoken

Vocabulary path: https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
|
|
|
""" |
|
|
|
import json |
|
import tiktoken |
|
|
|
|
|
# Look up the encoding tiktoken associates with the chat model. `tokenizer`
# is reused by the statements further down in this script.
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

# Round-trip a mixed ASCII/Chinese sample: text -> token ids -> the bytes
# that tiktoken reports for those ids.
sample_text = "a bcjik今天天气颗粒剂范大将军发卡卡萨"
encoding = tokenizer.encode(sample_text)
decoding_bytes = tokenizer.decode_tokens_bytes(encoding)

# Emit the ids first and the bytes second, one value per line.
print(encoding, decoding_bytes, sep="\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
# Call three byte-level decoding entry points with token id 10.
# The return values are discarded, so nothing is printed when this file is
# run as a script; the calls only confirm that each method accepts the id.
probe_id = 10
tokenizer.decode_tokens_bytes([probe_id])
tokenizer.decode_single_token_bytes(probe_id)
tokenizer.decode_bytes([probe_id])
|
|
|
# Dump the vocabulary as JSON Lines: one {"id": ..., "token": ...} record for
# every id in range(tokenizer.n_vocab), written to "vocab.jsonl" in the
# current working directory.
#
# The context manager guarantees the file is flushed and closed even when the
# loop stops early; the original handle was never closed.
with open("vocab.jsonl", "w", encoding="utf-8") as f_out:
    for i in range(tokenizer.n_vocab):
        try:
            token_str = tokenizer.decode([i])
        except (KeyboardInterrupt, SystemExit):
            # Never swallow a user interrupt or an interpreter shutdown.
            raise
        except BaseException:
            # Best effort: an id that cannot be decoded is still recorded,
            # with a null token.
            # NOTE(review): kept deliberately broad because the exception
            # type decode() raises for such ids is not visible here; narrow
            # it once that is confirmed for the tiktoken version in use.
            token_str = None
        # The token is JSON-encoded twice on purpose to keep the existing
        # file format: the "token" field holds a JSON string (or the text
        # "null"), so readers must json.loads() that field a second time.
        f_out.write(json.dumps({"id": i, "token": json.dumps(token_str)}) + "\n")
|
|
|
|