fix typo
compression_app.py  +2 -2
CHANGED
@@ -28,7 +28,7 @@ from compression_util import get_compression_leaderboard, common_corpuses
 docs = """## 📖 What is a good tokenizer?
 
 From a compression perspective, a good tokenizer should be lossless,
-and keep high compression rate (
+and keep high compression rate (fewer tokens for given text).
 The encoding and decoding process can be formulated as
 ```python
 token_ids = tokenizer.encode(input_text) # compressed tokens
@@ -144,7 +144,7 @@ with gr.Blocks(theme=theme) as demo:
 
     gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
                 "This leaderboard aims to evaluate tokenizer performance on different languages.\n"
-                "Lower `oov_ratio` refers to
+                "Lower `oov_ratio` refers to fewer out-of-vocabulary tokens.\n"
                 "Lower `char/token` means more words might be segmented into subwords."
                 )
     search_bar = gr.Textbox(
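For context, the two leaderboard metrics touched by this change (`char/token` and `oov_ratio`) can be sketched as below. This is only an illustration, not code from the Space: `compression_metrics` is a hypothetical helper, and only the standard `transformers.AutoTokenizer` API is assumed.

```python
# Minimal sketch of the metrics described in the docstring above
# (assumption: the Space computes something along these lines).
from transformers import AutoTokenizer

def compression_metrics(model_name: str, text: str) -> dict:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    token_ids = tokenizer.encode(text, add_special_tokens=False)  # compressed tokens
    decoded = tokenizer.decode(token_ids)                         # decompressed text
    oov = (token_ids.count(tokenizer.unk_token_id)
           if tokenizer.unk_token_id is not None else 0)
    return {
        # Higher char/token: each token covers more characters (better compression).
        "char/token": len(text) / max(len(token_ids), 1),
        # Lower oov_ratio: fewer out-of-vocabulary (unknown) tokens.
        "oov_ratio": oov / max(len(token_ids), 1),
        # A lossless tokenizer reconstructs the input exactly after encode/decode.
        "lossless": decoded == text,
    }

print(compression_metrics("gpt2", "From a compression perspective, a good tokenizer should be lossless."))
```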