from pathlib import Path
import random

import gradio as gr
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer

# Tokenizers benchmarked when the app first starts.
initial_list_of_models = [
    "Xenova/gpt-4o",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
]

# The benchmark corpus: a list of Arabic strings from the rasaif-translations dataset.
dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]

# Load the persisted leaderboard if it exists; otherwise start with an empty table.
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(
        columns=[
            "📛 Models",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )

for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    # tokenizer(dataset) batch-encodes the list of strings; input_ids is a list of
    # id lists, so the corpus-wide token count is the sum of their lengths.
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    # DataFrame._append is a private pandas API (the public .append was removed in
    # pandas 2.0); build a one-row frame and concatenate instead.
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "📛 Models": model_name,
                        "➕ Total Number of Tokens": number_of_tokens,
                        "📘 Vocab Size": vocab_size,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )

# Rank tokenizers by how few tokens they need for the same corpus, then persist.
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
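# Each line of the persisted JSONL file holds one record, roughly like this
# (the numbers below are illustrative placeholders, not real measurements):
# {"📛 Models": "Xenova/gpt-4o", "➕ Total Number of Tokens": 123456,
#  "📘 Vocab Size": 200000, "Tokenizer Class": "GPT2TokenizerFast"}
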
def submit(model_name):
    """Benchmark a newly submitted tokenizer and refresh the leaderboard UI."""
    global df
    # Ignore models that are already on the leaderboard.
    if model_name in df["📛 Models"].values:
        return (
            gr.Dataframe(df),
            gr.BarPlot(df),
            gr.Dropdown(choices=df["📛 Models"].tolist()),
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "📛 Models": model_name,
                        "➕ Total Number of Tokens": number_of_tokens,
                        "📘 Vocab Size": vocab_size,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )

def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")
    if n <= 0:
        return []

    # Quantize each RGB channel to a grid just coarse enough to offer about n
    # distinct colors, so sampled colors stay spread out instead of clustering.
    spacing = max(1, int(256 / n ** (1 / 3)))

    used_colors = set()
    result = []
    attempts = 0
    while len(result) < n:
        # Draw a grid index per channel and scale it back to the 0-255 range.
        r = min(255, random.randint(0, 255 // spacing) * spacing)
        g = min(255, random.randint(0, 255 // spacing) * spacing)
        b = min(255, random.randint(0, 255 // spacing) * spacing)
        color = f"#{r:02X}{g:02X}{b:02X}"
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Too many collisions: refine the grid and keep sampling.
                spacing = max(1, spacing - 1)
                attempts = 0
    return result
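# Light startup sanity check (added guard, not part of the scoring logic):
# the generator must return exactly n unique colors.
_sample_colors = generate_distinct_colors(8)
assert len(set(_sample_colors)) == 8
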
def decode_bpe_tokens(tokens):
    """Make byte-level BPE tokens readable.

    GPT-2-style tokenizers mark a leading space with 'Ġ'; swap that marker back
    to a real space so tokens can be matched against the source text.
    """
    fixed_tokens = []
    for token in tokens:
        if token.startswith("Ġ"):
            fixed_tokens.append(" " + token[1:])
        else:
            fixed_tokens.append(token)
    return fixed_tokens
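# Example of the 'Ġ' handling above, checked once at startup:
assert decode_bpe_tokens(["Ġhello", "world"]) == [" hello", "world"]
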
def decode_arabic_tokens(tokens):
    """Turn literal '\\uXXXX' escape sequences inside tokens into real characters."""
    decoded_tokens = []
    for token in tokens:
        decoded_token = token.encode("latin-1", "backslashreplace").decode("unicode-escape")
        decoded_tokens.append(decoded_token)
    return decoded_tokens
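# Example: a token that arrives as a literal escape sequence becomes readable Arabic.
assert decode_arabic_tokens(["\\u0633\\u0644"]) == ["سل"]
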
def tokenize_text(text, chosen_model):
    """Colorize how the chosen model's tokenizer splits the given text."""
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))

    # Re-align every token with the exact span of the original text it covers:
    # grow a prefix character by character until its first token matches the
    # current target token, then consume that prefix and continue.
    final_tokenized_text = []
    for token in tokenized_text:
        correct_tokenized_text = ""
        for char in text:
            correct_tokenized_text += char
            current_token = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
            if current_token[0] == token:
                final_tokenized_text.append(correct_tokenized_text)
                text = text[len(correct_tokenized_text):]
                break

    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        output.append((token, str(idx)))
        # The color-map key must match the label attached to the span above.
        color_map[str(idx)] = random_colors[idx % len(random_colors)]

    return gr.HighlightedText(output, color_map=color_map)
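# HighlightedText takes a list of (span, label) pairs plus a color map from label
# to hex color, e.g. [("السلام", "0"), (" عليكم", "1")] with {"0": "#1A2B3C", ...}.
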
leaderboard_description = """The leaderboard ranks tokenizers by the total number of tokens they produce for the Arabic
dataset [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations):
fewer tokens for the same text means a more compact encoding of Arabic.
"""

with gr.Blocks() as demo:
    gr.HTML("<center><h1>Arabic Tokenizers Leaderboard</h1></center>")
    gr.Markdown("## What is the best tokenizer for Arabic?")
    gr.Markdown(leaderboard_description)
    with gr.Tab(label="Leaderboard"):
        dataframe = gr.Dataframe(df)
        with gr.Accordion("Barplot", open=False):
            barplot = gr.BarPlot(
                df,
                x="📛 Models",
                y="➕ Total Number of Tokens",
                x_title=" ",
                y_title=" ",
                width=1000,
                height=400,
                tooltip=["📘 Vocab Size", "➕ Total Number of Tokens"],
                vertical=False,
                x_label_angle=30,
            )
        model_name = gr.Textbox(
            label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
        )
        submit_new_model_btn = gr.Button(value="Submit", variant="primary")
    with gr.Tab(label="Try tokenizers"):
        text = gr.Textbox(
            label="Enter a text",
            lines=5,
            value="السلام عليكم ورحمة الله",
            rtl=True,
            text_align="right",
        )
        dropdown = gr.Dropdown(
            label="Select a model",
            choices=df["📛 Models"].tolist(),
            value=df["📛 Models"].tolist()[0],
        )
        submit_text_btn = gr.Button(value="Submit", variant="primary")
        tokenized_textbox = gr.HighlightedText(label="Tokenized text")

    # Adding a model refreshes the table, bar plot, and model dropdown;
    # submitting text renders the colorized tokenization.
    submit_new_model_btn.click(submit, inputs=model_name, outputs=[dataframe, barplot, dropdown])
    submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])

demo.launch()
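# When running locally, demo.launch(share=True) would also expose a temporary public URL.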