from transformers import AutoTokenizer
from tqdm import tqdm
import gradio as gr
import pandas as pd
from datasets import load_dataset
import random
from pathlib import Path

initial_list_of_models = [
    "Xenova/gpt-4o",
    "NousResearch/Meta-Llama-3-8B",
    "CohereForAI/c4ai-command-r-v01",
    "CohereForAI/c4ai-command-r-plus",
    "core42/jais-13b",
]

dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arabic"]
dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"

if dataframe_path.exists():
    df = pd.read_json(dataframe_path, lines=True)
else:
    df = pd.DataFrame(
        columns=[
            "📛 Models",
            "➕ Total Number of Tokens",
            "📘 Vocab Size",
            "Tokenizer Class",
        ]
    )

for model_name in tqdm(initial_list_of_models):
    if model_name in df["📛 Models"].values:
        continue
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "📛 Models": model_name,
                        "📘 Vocab Size": vocab_size,
                        "➕ Total Number of Tokens": number_of_tokens,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )

# Sort the dataframe by the total number of tokens (fewer tokens ranks higher)
df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)

# Save the dataframe to a JSONL file
df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)


def submit(model_name):
    global df
    if model_name in df["📛 Models"].values:
        return (
            gr.Dataframe(df),
            gr.BarPlot(df),
            gr.Dropdown(choices=df["📛 Models"].tolist()),
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, trust_remote_code=True
    )
    vocab_size = tokenizer.vocab_size
    number_of_tokens = sum(len(x) for x in tokenizer(dataset).input_ids)
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                [
                    {
                        "📛 Models": model_name,
                        "➕ Total Number of Tokens": number_of_tokens,
                        "📘 Vocab Size": vocab_size,
                        "Tokenizer Class": tokenizer.__class__.__name__,
                    }
                ]
            ),
        ],
        ignore_index=True,
    )
    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
    df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
    return (
        gr.Dataframe(df),
        gr.BarPlot(df),
        gr.Dropdown(choices=df["📛 Models"].tolist()),
    )


def generate_distinct_colors(n):
    """Generate n visually distinct colors in hexadecimal format."""
    if n > 256**3:
        raise ValueError("Cannot generate more than 16,777,216 unique colors.")

    # To ensure colors are distinct, calculate an appropriate distance between colors.
    # The cube root of 256**3 (total colors) divided by the cube root of n gives a
    # crude initial spacing estimate.
    spacing = int((256 * 256 * 256) ** (1 / 3) / n ** (1 / 3))
    max_val = 256 - spacing

    # Set to keep track of used colors
    used_colors = set()
    # List to store the result colors
    result = []
    attempts = 0

    while len(result) < n:
        # Generate a color with a random start and controlled spacing
        r = random.randint(0, max_val)
        g = random.randint(0, max_val)
        b = random.randint(0, max_val)

        # Scale up by spacing to ensure minimum distance between colors
        r = min(255, r * spacing)
        g = min(255, g * spacing)
        b = min(255, b * spacing)

        # Format the color in hexadecimal
        color = f"#{r:02X}{g:02X}{b:02X}"

        # Ensure this color hasn't been used
        if color not in used_colors:
            used_colors.add(color)
            result.append(color)
        else:
            attempts += 1
            if attempts > 50:
                # Dynamically adjust spacing if stuck
                spacing = max(1, spacing - 1)
                max_val = 256 - spacing
                attempts = 0

    return result
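
# Byte-level BPE tokenizers (GPT-2 style) mark a leading space with the "Ġ" (U+0120)
# character and encode raw bytes as single Unicode characters, so one multi-byte
# UTF-8 character (such as an Arabic letter) can be split across several tokens.
# The helpers below try to turn such tokens back into readable text for display.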
def decode_bpe_tokens(tokens):
    fixed_tokens = []
    for token in tokens:
        # Check if the token starts with the special BPE space marker 'Ġ'
        if token.startswith('Ġ'):
            # Process the rest of the token
            try:
                # Its characters stand for UTF-8 bytes read as Latin-1, so reverse that mapping
                fixed_token = ' ' + token[1:].encode('latin-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                fixed_token = token  # Use the original token if decoding fails
        else:
            try:
                # Apply the same Latin-1 -> UTF-8 re-decoding to tokens without the marker
                fixed_token = token.encode('latin-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                fixed_token = token  # Use the original token if decoding fails
        fixed_tokens.append(fixed_token)
    return fixed_tokens


def decode_arabic_tokens(tokens):
    decoded_tokens = []
    for token in tokens:
        decoded_token = token.encode('latin-1', 'backslashreplace').decode('unicode-escape')
        decoded_tokens.append(decoded_token)
    return decoded_tokens


def tokenize_text(text, chosen_model):
    tokenizer = AutoTokenizer.from_pretrained(chosen_model)
    tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
    random_colors = generate_distinct_colors(len(tokenized_text))

    # Recover each token's span from the original text by growing a prefix
    # character by character until it tokenizes to the current token
    final_tokenized_text = []
    for token in tokenized_text:
        correct_tokenized_text = ""
        for char in text:
            correct_tokenized_text += char
            current_token = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
            if current_token[0] == token:
                final_tokenized_text.append(correct_tokenized_text)
                text = text[len(correct_tokenized_text):]
                break
    print(final_tokenized_text)

    output = []
    color_map = {}
    for idx, token in enumerate(final_tokenized_text):
        output.append((token, str(idx)))
        color_map[str(idx)] = random_colors[idx % len(random_colors)]

    return gr.HighlightedText(output, color_map=color_map)


leaderboard_description = """The numbers in this leaderboard are based on the total number of tokens in the Arabic dataset [rasaif-translations](https://huggingface.co/datasets/MohamedRashad/rasaif-translations).
"""

with gr.Blocks() as demo:
    gr.HTML("