xu-song committed on
Commit
367a536
·
1 Parent(s): 988921c

update compress rate

Browse files
Files changed (2) hide show
  1. app.py +19 -12
  2. utils/compress_rate_util.py +6 -4
app.py CHANGED
@@ -39,6 +39,7 @@ import gradio as gr
39
  from vocab import all_tokenizers
40
  from util import *
41
  from examples import example_fn, example_types
 
42
 
43
  get_window_url_params = """
44
  function(url_params) {
@@ -75,16 +76,17 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
75
 
76
  # compress rate setting
77
  with gr.Accordion("Compress Rate Setting", open=True):
78
- gr.Markdown("Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
 
79
  with gr.Row():
80
  compress_rate_corpus = gr.CheckboxGroup(
81
- ["cc100-en", "cc100-zh-Hans", "cc100-es"], # , "code"
82
  value=["cc100-en", "cc100-zh-Hans"],
83
  label="corpus",
84
  # info=""
85
  )
86
  compress_rate_unit = gr.Radio(
87
- ["b_tokens/g_bytes", "g_bytes/b_tokens", "t_tokens/t_bytes", "t_bytes/t_tokens", "n_chars/n_tokens"],
88
  value="b_tokens/g_bytes",
89
  label="unit",
90
  )
@@ -194,12 +196,10 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
194
  output_table_1 = gr.Dataframe()
195
  output_table_2 = gr.Dataframe()
196
 
197
-
198
  # setting
199
  # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
200
  # [stats_compress_rate_1, stats_compress_rate_2])
201
 
202
-
203
  tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
204
  [output_text_1, output_table_1])
205
  tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
@@ -218,15 +218,22 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo:
218
  tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
219
  tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
220
  [stats_overlap_token_size_1, stats_overlap_token_size_2])
221
- tokenizer_type_2.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
222
- [stats_compress_rate_2])
223
-
224
-
225
- compress_rate_unit.change(get_compress_rate, [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
226
- [stats_compress_rate_1])
227
- compress_rate_unit.change(get_compress_rate, [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
228
  [stats_compress_rate_2])
229
 
 
 
 
 
 
 
 
 
 
 
 
 
230
 
231
  dropdown_examples.change(
232
  example_fn,
 
39
  from vocab import all_tokenizers
40
  from util import *
41
  from examples import example_fn, example_types
42
+ from utils.compress_rate_util import common_units, common_corpuses
43
 
44
  get_window_url_params = """
45
  function(url_params) {
 
76
 
77
  # compress rate setting
78
  with gr.Accordion("Compress Rate Setting", open=True):
79
+ gr.Markdown(
80
+ "Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ")
81
  with gr.Row():
82
  compress_rate_corpus = gr.CheckboxGroup(
83
+ common_corpuses, # , "code"
84
  value=["cc100-en", "cc100-zh-Hans"],
85
  label="corpus",
86
  # info=""
87
  )
88
  compress_rate_unit = gr.Radio(
89
+ common_units,
90
  value="b_tokens/g_bytes",
91
  label="unit",
92
  )
 
196
  output_table_1 = gr.Dataframe()
197
  output_table_2 = gr.Dataframe()
198
 
 
199
  # setting
200
  # compress_rate_unit.change(compress_rate_unit_change, [compress_rate_unit],
201
  # [stats_compress_rate_1, stats_compress_rate_2])
202
 
 
203
  tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1],
204
  [output_text_1, output_table_1])
205
  tokenizer_type_1.change(basic_count, [tokenizer_type_1], [stats_vocab_size_1, stats_zh_token_size_1])
 
218
  tokenizer_type_2.change(basic_count, [tokenizer_type_2], [stats_vocab_size_2, stats_zh_token_size_2])
219
  tokenizer_type_2.change(get_overlap_token_size, [tokenizer_type_1, tokenizer_type_2],
220
  [stats_overlap_token_size_1, stats_overlap_token_size_2])
221
+ tokenizer_type_2.change(get_compress_rate,
222
+ [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
 
 
 
 
 
223
  [stats_compress_rate_2])
224
 
225
+ compress_rate_unit.change(get_compress_rate,
226
+ [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
227
+ [stats_compress_rate_1])
228
+ compress_rate_unit.change(get_compress_rate,
229
+ [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
230
+ [stats_compress_rate_2])
231
+ compress_rate_corpus.change(get_compress_rate,
232
+ [tokenizer_type_1, compress_rate_corpus, compress_rate_unit],
233
+ [stats_compress_rate_1])
234
+ compress_rate_corpus.change(get_compress_rate,
235
+ [tokenizer_type_2, compress_rate_corpus, compress_rate_unit],
236
+ [stats_compress_rate_2])
237
 
238
  dropdown_examples.change(
239
  example_fn,
utils/compress_rate_util.py CHANGED
@@ -18,6 +18,10 @@ from typing import List
18
 
19
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
20
 
 
 
 
 
21
 
22
  def get_n_bytes_of_string(string_text):
23
  n_bytes = len(string_text.encode("utf-8"))
@@ -55,14 +59,12 @@ def unit_convertor(stat, unit):
55
  return round(value, 2)
56
 
57
 
58
- all_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
59
-
60
 
61
  def pprint(stats):
62
  table = []
63
  for tokenizer_name, stat in stats.items():
64
  columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
65
- for unit in all_units:
66
  if unit not in stat:
67
  columns[unit] = unit_convertor(stat, unit)
68
  else:
@@ -146,7 +148,7 @@ def main():
146
  corpuses = [sys.argv[2]]
147
  else:
148
  tokenizers = all_tokenizers
149
- corpuses = ["en", "zh-Hans"]
150
 
151
  stats = {}
152
  for lang in corpuses:
 
18
 
19
  CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
20
 
21
+ common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
22
+ common_corpuses = ["cc100-en", "cc100-zh-Hans", "cc100-es"]
23
+ # code: https://huggingface.co/datasets/codeparrot/github-code-clean (languages: python, java, c, sql, html)
24
+ # math:
25
 
26
  def get_n_bytes_of_string(string_text):
27
  n_bytes = len(string_text.encode("utf-8"))
 
59
  return round(value, 2)
60
 
61
 
 
 
62
 
63
  def pprint(stats):
64
  table = []
65
  for tokenizer_name, stat in stats.items():
66
  columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
67
+ for unit in common_units:
68
  if unit not in stat:
69
  columns[unit] = unit_convertor(stat, unit)
70
  else:
 
148
  corpuses = [sys.argv[2]]
149
  else:
150
  tokenizers = all_tokenizers
151
+ corpuses = common_corpuses
152
 
153
  stats = {}
154
  for lang in corpuses: