from transformers import AutoTokenizer from tqdm import tqdm import gradio as gr import pandas as pd from datasets import load_dataset import random from pathlib import Path initial_list_of_models = [ "riotu-lab/Aranizer-PBE-86k", "riotu-lab/Aranizer-PBE-64k", "riotu-lab/Aranizer-PBE-32k", "riotu-lab/Aranizer-SP-86k", "riotu-lab/Aranizer-SP-64k", "riotu-lab/Aranizer-SP-32k", "asafaya/bert-base-arabic", "inceptionai/jais-family-30b-16k", "Xenova/gpt-4o", "FreedomIntelligence/AceGPT-v1.5-13B-Chat", "FreedomIntelligence/AceGPT-13B", "Qwen/Qwen2.5-72B-Instruct", "microsoft/Phi-3-mini-128k-instruct", "unsloth/gemma-2b-bnb-4bit", "unsloth/Llama-3.3-70B-Instruct", "CohereForAI/c4ai-command-r-v01", "CohereForAI/c4ai-command-r-plus", "CohereForAI/aya-101", ] dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl" if dataframe_path.exists(): df = pd.read_json(dataframe_path, lines=True) else: df = pd.DataFrame( columns=[ "👳 Tokenize Tashkeel", "📛 Models", "🪺 Fertility Score", "➕ Total Number of Tokens", "📘 Vocab Size", "Tokenizer Class", ] ) # Datasets used for calculating the number of tokens arabic_dataset1 = load_dataset("ImruQays/Rasaif-Classical-Arabic-English-Parallel-texts", split="train")["ar"] arabic_dataset2 = load_dataset("HeshamHaroon/arabic-quotes", split="train")["quote"] arabic_dataset3 = load_dataset("SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots", split="train")["text"] all_data = arabic_dataset1 + arabic_dataset2 + arabic_dataset3 print(f"Total number of samples: {len(all_data)}") all_text = " ".join(all_data) all_words = all_text.split() def benchmark_tokenizer(model_name) -> float: # Initialize the tokenizer tokenizer = AutoTokenizer.from_pretrained( model_name, use_fast=True, trust_remote_code=True ) vocab_size = tokenizer.vocab_size total_number_of_tokens = len(tokenizer.tokenize(all_text)) # Check if the tokenizer maintains the tashkeel dummy_text = "السَّلَامُ عَلَيْكُمْ وَرَحْمَةُ اللَّهِ وَبَرَكَاتُهُ" tokenized_text = tokenizer.decode(tokenizer.encode(dummy_text), skip_special_tokens=True) tashkeel_maintainer = "✅" if tokenized_text == dummy_text else "❌" return { "👳 Tokenize Tashkeel": tashkeel_maintainer, "📛 Models": model_name, "🪺 Fertility Score": round(total_number_of_tokens / len(all_words), 3), "📘 Vocab Size": vocab_size, "➕ Total Number of Tokens": total_number_of_tokens, "Tokenizer Class": tokenizer.__class__.__name__, } for model_name in tqdm(initial_list_of_models): if model_name in df["📛 Models"].values: continue benchmark_data = benchmark_tokenizer(model_name) df = df._append(benchmark_data, ignore_index=True) # Sort the dataframe by the number of tokens df = df.sort_values(by="➕ Total Number of Tokens", ascending=True) # Save the dataframe to a csv file df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False) def submit(model_name): global df if model_name in df["📛 Models"].values: return ( gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist()), ) benchmark_data = benchmark_tokenizer(model_name) df = df._append(benchmark_data, ignore_index=True) df = df.sort_values(by="➕ Total Number of Tokens", ascending=True) df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False) return ( gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist()), ) def generate_distinct_colors(n): """Generate n visually distinct colors in hexadecimal format.""" if n > 256**3: raise ValueError("Cannot generate more than 16,777,216 unique colors.") # To ensure colors are distinct, calculate an appropriate distance between colors # The cube root of 256**3 (total colors) divided by n gives a crude initial spacing estimate spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3)) max_val = 256 - spacing # Set to keep track of used colors used_colors = set() # List to store the result colors result = [] attempts = 0 while len(result) < n: # Generate a color with a random start and controlled spacing r = random.randint(0, max_val) g = random.randint(0, max_val) b = random.randint(0, max_val) # Scale up by spacing to ensure minimum distance between colors r = min(255, r * spacing) g = min(255, g * spacing) b = min(255, b * spacing) # Format the color in hexadecimal color = f"#{r:02X}{g:02X}{b:02X}" # Ensure this color hasn't been used if color not in used_colors: used_colors.add(color) result.append(color) else: attempts += 1 if attempts > 50: # Dynamically adjust spacing if stuck spacing = max(1, spacing - 1) max_val = 256 - spacing attempts = 0 return result def decode_bpe_tokens(tokens): fixed_tokens = [] for token in tokens: # Check if the token starts with the special BPE space character 'Ġ' if token.startswith("Ġ"): # Process the rest of the token try: # Decode the rest of the token from UTF-8 bytes understood as Latin-1 characters fixed_token = " " + token[1:].encode("utf-8").decode("utf-8") except UnicodeDecodeError: fixed_token = token # Use the original token if decoding fails else: try: # Directly encode and decode without misinterpretation steps fixed_token = token.encode("utf-8").decode("utf-8") except UnicodeDecodeError: fixed_token = token # Use the original token if decoding fails fixed_tokens.append(fixed_token) return fixed_tokens def tokenize_text(text, chosen_model, better_tokenization=False): tokenizer = AutoTokenizer.from_pretrained(chosen_model) tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text)) random_colors = generate_distinct_colors(len(tokenized_text)) if better_tokenization: final_tokenized_text = [] for token in tokenized_text: correct_tokenized_text = "" for char in text: correct_tokenized_text += char current_token = decode_bpe_tokens( tokenizer.tokenize(correct_tokenized_text) ) if current_token[0] == token: final_tokenized_text.append(correct_tokenized_text) text = text[len(correct_tokenized_text) :] break else: final_tokenized_text = tokenized_text print(final_tokenized_text) output = [] color_map = {} for idx, token in enumerate(final_tokenized_text): output.append((token, str(idx))) color_map[str(idx + 1)] = random_colors[idx % len(random_colors)] return gr.HighlightedText(output, color_map) def refresh(): global df df = pd.read_json(dataframe_path, lines=True) return ( gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist()), ) leaderboard_description = """The `Total Number of Tokens` in this leaderboard is based on the total number of tokens got from the Arabic section of [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations) dataset (This dataset was chosen because it represents Arabic Fusha text in a small and concentrated manner). **A tokenizer that scores high in this leaderboard should be efficient in parsing Arabic in its different dialects and forms.** ## Updates/Notes: 1. New datasets is added for the evaluation (e.g. [arabic-quotes](https://huggingface.co/datasets/HeshamHaroon/arabic-quotes), [Moroccan_Arabic_Wikipedia_20230101_nobots](https://huggingface.co/datasets/SaiedAlshahrani/Moroccan_Arabic_Wikipedia_20230101_nobots)). 1. `Fertility Score` is calculated by dividing the total number of tokens by the total number of words in the dataset (Lower is better). 1. `Tokenize Tashkeel` is an indicator of whether the tokenizer maintains the tashkeel when tokenizing or not (`✅` for yes, `❌` for no). 1. `Vocab Size` is the total number of tokens in the tokenizer's vocabulary (e.g. `10000` tokens). 1. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`) 1. `Total Number of Tokens` is the total number of tokens in the dataset after tokenization (Lower is better). **Note**: Press `Refresh` to get the latest data available in the leaderboard (The initial state may be deceiving). """ with gr.Blocks() as demo: gr.HTML("