"""
The simplest tokenizer demo.
"""

import json

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("20B_tokenizer_chinese.v2.json")

print("vocab_size with added_tokens:", tokenizer.get_vocab_size(with_added_tokens=True))
print("vocab_size without added_tokens:", tokenizer.get_vocab_size(with_added_tokens=False))


def test_token():
    """
    TODO: encoding of special symbols such as 〚 and < is broken
    :return:
    """
    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里疗疗 一 个刹车卉〚卟<‛⦆"

    # Encode/decode the whole sentence first, then go character by character to
    # see how each character maps to token ids, decoded strings, and raw tokens.
    encoding = tokenizer.encode(text)
    decoding = tokenizer.decode(encoding.ids)
    print(decoding)
    for word in text:
        encoding = tokenizer.encode(word)
        for token_id in encoding.ids:
            decode_str = tokenizer.decode([token_id])
            token = tokenizer.id_to_token(token_id)
            print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
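

def test_roundtrip():
    """
    Sketch (not in the original script): narrow down the TODO above by listing the
    characters that do not survive an encode -> decode round trip.
    """
    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里疗疗 一 个刹车卉〚卟<‛⦆"
    for char in text:
        decoded = tokenizer.decode(tokenizer.encode(char).ids)
        if decoded != char:
            print("round-trip mismatch:", json.dumps(char), "->", json.dumps(decoded))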


def test_encode():
    # Same per-token dump as test_token, but on a sentence containing the
    # <|endoftext|> special token.
    text = "中国解决方法黑白侗鸩,。!?;一个人去哪里疗疗<|endoftext|>一 个刹车卉〚卟<‛⦆"
    encoding = tokenizer.encode(text)
    print(tokenizer.decode(encoding.ids))
    for token_id in encoding.ids:
        decode_str = tokenizer.decode([token_id])
        token = tokenizer.id_to_token(token_id)
        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))
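

def test_special_token():
    """
    Sketch (not in the original script): check whether <|endoftext|> is kept as a
    single added-token id when it appears in the middle of the encoded text.
    """
    eot_id = tokenizer.token_to_id("<|endoftext|>")
    print("<|endoftext|> id:", eot_id)
    ids = tokenizer.encode("一个人<|endoftext|>去哪里").ids
    print("encoded ids:", ids, "contains <|endoftext|> id:", eot_id in ids)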


def test_decode():
    # Decoding an empty id list should simply return an empty string.
    encoding = []

    decode_str = tokenizer.decode(encoding)
    print(decode_str)


test_encode()