Spaces:
Running
Running
update compress rate
Browse files- app.py +19 -12
- utils/compress_rate_util.py +6 -4
app.py
CHANGED
@@ -39,6 +39,7 @@ import gradio as gr
|
|
39 |
from vocab import all_tokenizers
|
40 |
from util import *
|
41 |
from examples import example_fn, example_types
|
|
|
42 |
|
43 |
get_window_url_params = """
|
44 |
function(url_params) {
|
@@ -75,16 +76,17 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
|
|
75 |
|
76 |
# compress rate setting
|
77 |
with gr.Accordion("Compress Rate Setting", open=True):
|
78 |
-
gr.Markdown(
|
|
|
79 |
with gr.Row():
|
80 |
compress_rate_corpus = gr.CheckboxGroup(
|
81 |
-
|
82 |
value=["cc100-en", "cc100-zh-Hans"],
|
83 |
label="corpus",
|
84 |
# info=""
|
85 |
)
|
86 |
compress_rate_unit = gr.Radio(
|
87 |
-
|
88 |
value="b_tokens/g_bytes",
|
89 |
label="unit",
|
90 |
)
|
@@ -194,12 +196,10 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
|
|
194 |
output_table_1 = gr.Dataframe()
|
195 |
output_table_2 = gr.Dataframe()
|
196 |
|
197 |
-
|
198 |
# setting
|
199 |
# compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
|
200 |
# [stats_compress_rate_1, stats_compress_rate_2])
|
201 |
|
202 |
-
|
203 |
tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
|
204 |
[output_text_1, output_table_1])
|
205 |
tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
|
@@ -218,15 +218,22 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
|
|
218 |
tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
|
219 |
tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
|
220 |
[stats_overlap_token_size_1, stats_overlap_token_size_2])
|
221 |
-
tokenizer_type_2.change(get_compress_rate,
|
222 |
-
[
|
223 |
-
|
224 |
-
|
225 |
-
compress_rate_unit.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
|
226 |
-
[stats_compress_rate_1])
|
227 |
-
compress_rate_unit.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
|
228 |
[stats_compress_rate_2])
|
229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
dropdown_examples.change(
|
232 |
example_fn,
|
|
|
39 |
from vocab import all_tokenizers
|
40 |
from util import *
|
41 |
from examples import example_fn, example_types
|
42 |
+
from utils.compress_rate_util import common_units, common_corpuses
|
43 |
|
44 |
get_window_url_params = """
|
45 |
function(url_params) {
|
|
|
76 |
|
77 |
# compress rate setting
|
78 |
with gr.Accordion("Compress Rate Setting", open=True):
|
79 |
+
gr.Markdown(
|
80 |
+
"Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
|
81 |
with gr.Row():
|
82 |
compress_rate_corpus = gr.CheckboxGroup(
|
83 |
+
common_corpuses, # , "code"
|
84 |
value=["cc100-en", "cc100-zh-Hans"],
|
85 |
label="corpus",
|
86 |
# info=""
|
87 |
)
|
88 |
compress_rate_unit = gr.Radio(
|
89 |
+
common_units,
|
90 |
value="b_tokens/g_bytes",
|
91 |
label="unit",
|
92 |
)
|
|
|
196 |
output_table_1 = gr.Dataframe()
|
197 |
output_table_2 = gr.Dataframe()
|
198 |
|
|
|
199 |
# setting
|
200 |
# compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
|
201 |
# [stats_compress_rate_1, stats_compress_rate_2])
|
202 |
|
|
|
203 |
tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
|
204 |
[output_text_1, output_table_1])
|
205 |
tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
|
|
|
218 |
tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
|
219 |
tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
|
220 |
[stats_overlap_token_size_1, stats_overlap_token_size_2])
|
221 |
+
tokenizer_type_2.change(get_compress_rate,
|
222 |
+
[tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
|
|
|
|
|
|
|
|
|
|
|
223 |
[stats_compress_rate_2])
|
224 |
|
225 |
+
compress_rate_unit.change(get_compress_rate,
|
226 |
+
[tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
|
227 |
+
[stats_compress_rate_1])
|
228 |
+
compress_rate_unit.change(get_compress_rate,
|
229 |
+
[tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
|
230 |
+
[stats_compress_rate_2])
|
231 |
+
compress_rate_corpus.change(get_compress_rate,
|
232 |
+
[tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
|
233 |
+
[stats_compress_rate_1])
|
234 |
+
compress_rate_corpus.change(get_compress_rate,
|
235 |
+
[tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
|
236 |
+
[stats_compress_rate_2])
|
237 |
|
238 |
dropdown_examples.change(
|
239 |
example_fn,
|
utils/compress_rate_util.py
CHANGED
@@ -18,6 +18,10 @@ from typing import List
|
|
18 |
|
19 |
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
20 |
|
|
|
|
|
|
|
|
|
21 |
|
22 |
def get_n_bytes_of_string(string_text):
|
23 |
n_bytes = len(string_text.encode("utf-8"))
|
@@ -55,14 +59,12 @@ def unit_convertor(stat, unit):
|
|
55 |
return round(value, 2)
|
56 |
|
57 |
|
58 |
-
all_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
59 |
-
|
60 |
|
61 |
def pprint(stats):
|
62 |
table = []
|
63 |
for tokenizer_name, stat in stats.items():
|
64 |
columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
|
65 |
-
for unit in
|
66 |
if unit not in stat:
|
67 |
columns[unit] = unit_convertor(stat, unit)
|
68 |
else:
|
@@ -146,7 +148,7 @@ def main():
|
|
146 |
corpuses = [sys.argv[2]]
|
147 |
else:
|
148 |
tokenizers = all_tokenizers
|
149 |
-
corpuses =
|
150 |
|
151 |
stats = {}
|
152 |
for lang in corpuses:
|
|
|
18 |
|
19 |
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
|
20 |
|
21 |
+
common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
|
22 |
+
common_corpuses = ["cc100-en", "cc100-zh-Hans", "cc100-es"]
|
23 |
+
# code: https://huggingface.co/datasets/codeparrot/github-code-clean python java c sql html
|
24 |
+
# math:
|
25 |
|
26 |
def get_n_bytes_of_string(string_text):
|
27 |
n_bytes = len(string_text.encode("utf-8"))
|
|
|
59 |
return round(value, 2)
|
60 |
|
61 |
|
|
|
|
|
62 |
|
63 |
def pprint(stats):
|
64 |
table = []
|
65 |
for tokenizer_name, stat in stats.items():
|
66 |
columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
|
67 |
+
for unit in common_units:
|
68 |
if unit not in stat:
|
69 |
columns[unit] = unit_convertor(stat, unit)
|
70 |
else:
|
|
|
148 |
corpuses = [sys.argv[2]]
|
149 |
else:
|
150 |
tokenizers = all_tokenizers
|
151 |
+
corpuses = common_corpuses
|
152 |
|
153 |
stats = {}
|
154 |
for lang in corpuses:
|