""" |
|
|
|
plots |
|
|
|
table |
|
|
|
## Related demos
|
- [text-processing.com](http://text-processing.com/demo/tokenize/)
|
- [gpt-tokenizer](https://gpt-tokenizer.dev/) |
|
- [llama-tokenizer-js](https://belladoreai.github.io/llama-tokenizer-js/example-demo/build/) |
|
- [the-tokenizer-playground](https://huggingface.co/spaces/Xenova/the-tokenizer-playground)
|
|
|
## Visualization
|
|
|
[ The, 2, QUICK, Brown, Foxes, jumped, over, the, lazy, dog's, bone ] |
|
""" |
|
|
|
import pandas as pd
import gradio as gr
|
|
|
from vocab import all_tokenizers, load_tokener |
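

# Minimal per-process cache for tokenizer instances, filling in the
# "TODO: cache tokenizer" left in the original tokenize() docstring.
# This assumes load_tokener(tokenizer_type) is deterministic and that a
# tokenizer instance is safe to reuse across requests.
from functools import lru_cache


@lru_cache(maxsize=None)
def load_tokenizer_cached(tokenizer_type):
    return load_tokener(tokenizer_type)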
|
|
|
|
|
|
|
css = """ |
|
.space-show {white-space: pre-wrap;} |
|
.cell-wrap {white-space: pre-wrap;} |
|
.category-legend {display: none !important} |
|
.statistics textarea {min-width: min(50px,100%) !important; font-size: 20px !important; font-weight: 600 !important; text-align: center !important; border: none !important;} |
|
.statistics label {text-align: center !important;} |
|
""" |
|
|
|
example_text = """Replace this text in the input field to see how tokenization works |
|
华为智能音箱发布:华为Sound X""" |
|
|
|
|
|
examples = [ |
|
["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], |
|
["标点测试:,。!?;", "baichuan_7b", "llama"], |
|
["符号测试:🦙", "baichuan_7b", "llama"], |
|
["中文测试:🦙", "baichuan_7b", "llama"], |
|
["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"], |
|
] |
|
|
|
|
|
|
|
def tokenize(text, tokenizer_type, color_num=5):
    """Tokenize `text` and build the highlighted-token list and per-token table."""
    tokenizer = load_tokenizer_cached(tokenizer_type)
    encoding = tokenizer.encode(text)

    pos_tokens = []  # (decoded_text, color_class) pairs for the HighlightedText widget
    table = []       # one row per token for the Dataframe widget

    for idx, token_id in enumerate(encoding):
        decode_text = tokenizer.decode([token_id])
        pos_tokens.append((decode_text, str(idx % color_num)))

        token = tokenizer.convert_ids_to_tokens([token_id])[0]
        if isinstance(token, bytes):
            # A byte-level token need not be valid UTF-8 on its own,
            # e.g. when a multi-byte character is split across tokens.
            try:
                token_str = token.decode("utf-8")
            except UnicodeDecodeError:
                token_str = token.decode("utf-8", errors="ignore")
            token_bytes = token
        elif isinstance(token, str):
            token_str = token
            token_bytes = token_str.encode("utf-8")
        else:
            raise TypeError(f"unexpected token type: {type(token)}")

        table.append(
            {
                "TokenID": token_id,
                "⭐Token": token_str,
                "Text": decode_text,
                "Bytes": str(token_bytes),
            }
        )

    table_df = pd.DataFrame(table)
    return pos_tokens, table_df, len(encoding)
|
|
|
|
|
def tokenize_pair(text, tokenizer_type_1, tokenizer_type_2): |
|
pos_tokens_1, table_df_1, token_size_1 = tokenize(text, tokenizer_type_1) |
|
pos_tokens_2, table_df_2, token_size_2 = tokenize(text, tokenizer_type_2) |
|
return pos_tokens_1, table_df_1, token_size_1, pos_tokens_2, table_df_2, token_size_2 |
|
|
|
|
|
def get_vocab_size(tokenizer_type):
    tokenizer = load_tokenizer_cached(tokenizer_type)
    return tokenizer.vocab_size
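

# Hypothetical helper for the "Compress Rate" stat boxes defined in the UI
# below, which the original script never actually computes or wires up.
# One plausible definition (an assumption, not the author's stated metric):
# UTF-8 bytes per token, where a higher value means the tokenizer packs more
# text into each token.
def compress_rate(text, token_count):
    if token_count == 0:
        return ""
    return round(len(text.encode("utf-8")) / token_count, 2)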
|
|
|
def test_coding():
    # b'\xe4\xb8\xad' is the UTF-8 encoding of the Chinese character "中".
    bytes1 = b'\xe4\xb8\xad'
    print(bytes1)
|
|
|
|
|
with gr.Blocks(css=css) as demo: |
|
gr.HTML("""<h1 align="center">Tokenizer Arena ⚔️</h1>""") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gr.Markdown("## Input Text") |
|
user_input = gr.Textbox( |
|
value=example_text, |
|
label="Input Text", |
|
lines=5, |
|
show_label=False, |
|
) |
|
|
|
|
|
|
|
gr.Markdown("## Tokenization") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=6): |
|
with gr.Group(): |
|
tokenizer_type_1 = gr.Dropdown( |
|
all_tokenizers, |
|
value="llama", |
|
label="Tokenizer 1", |
|
) |
|
with gr.Group(): |
|
""" |
|
<div class="stat"><div class="stat-value">69</div><div class="stat-label">Characters</div></div> |
|
""" |
|
with gr.Row(): |
|
stats_vocab_size_1 = gr.TextArea( |
|
label="VocabSize", |
|
lines=1, |
|
elem_classes="statistics" |
|
) |
|
stats_token_size_1 = gr.TextArea( |
|
label="Tokens", |
|
lines=1, |
|
elem_classes="statistics" |
|
) |
|
                    stats_compress_rate_1 = gr.TextArea(
|
label="Compress Rate", |
|
lines=1, |
|
elem_classes="statistics" |
|
) |
|
|
|
gr.Image("images/VS.svg", scale=1, show_label=False, show_download_button=False, container=False) |
|
with gr.Column(scale=6): |
|
with gr.Group(): |
|
tokenizer_type_2 = gr.Dropdown( |
|
all_tokenizers, |
|
value="baichuan_7b", |
|
label="Tokenizer 2", |
|
) |
|
with gr.Group(): |
|
with gr.Row(): |
|
stats_vocab_size_2 = gr.TextArea( |
|
label="VocabSize", |
|
lines=1, |
|
elem_classes="statistics" |
|
) |
|
stats_token_size_2 = gr.TextArea( |
|
label="Tokens", |
|
lines=1, |
|
elem_classes="statistics" |
|
) |
|
                    stats_compress_rate_2 = gr.TextArea(
|
label="Compress Rate", |
|
lines=1, |
|
elem_classes="statistics" |
|
) |
|
|
|
|
|
|
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
            output_text_1 = gr.HighlightedText(
|
label="Tokens 1", |
|
show_legend=True, |
|
elem_classes="space-show" |
|
) |
|
with gr.Column(): |
|
            output_text_2 = gr.HighlightedText(
|
label="Tokens 2", |
|
show_legend=True, |
|
elem_classes="space-show" |
|
) |
|
|
|
    with gr.Row():
        # Headers/dtypes match the columns produced by tokenize().
        output_table_1 = gr.Dataframe(
            headers=["TokenID", "⭐Token", "Text", "Bytes"],
            datatype=["str", "str", "str", "str"],
        )
        output_table_2 = gr.Dataframe(
            headers=["TokenID", "⭐Token", "Text", "Bytes"],
            datatype=["str", "str", "str", "str"],
        )
|
|
|
tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1]) |
|
tokenizer_type_1.change(get_vocab_size, [tokenizer_type_1], [stats_vocab_size_1]) |
|
|
|
user_input.change(tokenize_pair, |
|
[user_input, tokenizer_type_1, tokenizer_type_2], |
|
[output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2]) |
|
|
|
tokenizer_type_2.change(tokenize, [user_input, tokenizer_type_2], [output_text_2, output_table_2, stats_token_size_2]) |
|
tokenizer_type_2.change(get_vocab_size, [tokenizer_type_2], [stats_vocab_size_2]) |
|
|
|
gr.Examples( |
|
examples, |
|
[user_input, tokenizer_type_1, tokenizer_type_2], |
|
[output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2], |
|
tokenize_pair, |
|
cache_examples=True, |
|
) |
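
    # Populate the token views and stat boxes once on page load (Blocks.load
    # is a standard Gradio event); without this they stay empty until a
    # dropdown value changes. Wiring it this way is an addition to the
    # original script.
    demo.load(tokenize_pair,
              [user_input, tokenizer_type_1, tokenizer_type_2],
              [output_text_1, output_table_1, stats_token_size_1, output_text_2, output_table_2, stats_token_size_2])
    demo.load(get_vocab_size, [tokenizer_type_1], [stats_vocab_size_1])
    demo.load(get_vocab_size, [tokenizer_type_2], [stats_vocab_size_2])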
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
demo.launch() |
|
|