|
""" |
|
gpt_35_turbo decode UnicodeDecodeError 99413 b' \xe6\xb5' |
|
gpt_35_turbo decode UnicodeDecodeError 99742 b'\x8c\xa8' |
|
gpt_35_turbo decode UnicodeDecodeError 99834 b'\xad\x90' |
|
gpt_35_turbo decode UnicodeDecodeError 100112 b'\xe0\xae\xbf\xe0\xae' |
|
gpt_35_turbo decode KeyError 100256 |
|
gpt_35_turbo decode KeyError 100261 |
|
gpt_35_turbo decode KeyError 100262 |
|
gpt_35_turbo decode KeyError 100263 |
|
""" |
|
|
|
|
|
|
|
import json |
|
import tiktoken |
|
|
|
|
|
# Reproduce the tiktoken decode failures logged in the module docstring:
# KeyError for ids outside the gpt-3.5-turbo vocabulary, and
# UnicodeDecodeError for ids whose bytes are not valid UTF-8 on their own.
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')

# 100263 is not in the vocabulary (raises KeyError); 99834 maps to a byte
# fragment that cannot be decoded as UTF-8 in isolation.
tokens = [100263, 99834]

# Encoding.decode() uses errors="replace" for UTF-8 problems, but an
# unknown token id still raises KeyError — guard it so the script can
# continue past the failure instead of aborting here.
try:
    print(tokenizer.decode(tokens))
except (KeyError, UnicodeDecodeError) as exc:
    print(f"decode failed: {type(exc).__name__}: {exc}")

# Use the public decode_bytes() API rather than the private
# tokenizer._core_bpe attribute; it raises KeyError for unknown ids too.
try:
    print(tokenizer.decode_bytes(tokens).decode("utf-8", errors="replace"))
except KeyError as exc:
    print(f"decode_bytes failed: KeyError: {exc}")

# Probe each problematic id against the three byte-level decode APIs and
# report exactly which ones fail and how, instead of silently swallowing
# every exception with bare except clauses.
for token_id in [100263, 99834]:
    for api_name in ("decode_tokens_bytes", "decode_single_token_bytes", "decode_bytes"):
        method = getattr(tokenizer, api_name)
        # decode_single_token_bytes takes a bare id; the others take a list.
        arg = token_id if api_name == "decode_single_token_bytes" else [token_id]
        try:
            result = method(arg)
            print(f"{api_name}({token_id}) -> {result!r}")
        except (KeyError, UnicodeDecodeError) as exc:
            print(f"{api_name}({token_id}) failed: {type(exc).__name__}: {exc}")
|
|
|
|
|
|
|
|