import logging

import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st

from utils import get_tokenizer, get_tokenization, get_vocab_size, check_latin_support

# Configure logging
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")

dataframe_path = "darija_tokenizers_leaderboard.jsonl"


def calculate_metrics(tokenizer_id, text):
    """Benchmark one tokenizer on `text` and return a leaderboard row, or None on failure."""
    logging.debug(f"Calculating metrics for tokenizer: {tokenizer_id}")
    try:
        tokenizer = get_tokenizer(tokenizer_id)
        tokens = get_tokenization(tokenizer, text)
        vocab_size = get_vocab_size(tokenizer)
        tokens_count = len(tokens)
        # Guard against division by zero on empty input
        tokens_ratio = tokens_count / len(text) if len(text) > 0 else 0
        latin_support = check_latin_support(tokenizer)
        tokenizer_class = tokenizer.__class__.__name__
        logging.debug(
            f"Metrics calculated: vocab_size={vocab_size}, tokens_count={tokens_count}, "
            f"tokens_ratio={tokens_ratio}, latin_support={latin_support}, "
            f"tokenizer_class={tokenizer_class}"
        )
        return {
            "Tokenizer": tokenizer_id,
            "Vocabulary Size": vocab_size,
            "Token Count": tokens_count,
            "Tokens/Character Ratio": tokens_ratio,
            "Latin Support": latin_support,
            "Tokenizer Class": tokenizer_class,
        }
    except Exception as e:
        logging.error(f"Error processing {tokenizer_id}: {e}")
        st.error(f"Error processing {tokenizer_id}: {e}")
        return None


leaderboard_description = """
The `Tokens/Character Ratio` in this leaderboard is the number of tokens generated by the tokenizer divided by the number of characters in the input text.
**A tokenizer that scores low on this leaderboard is considered more efficient at parsing Darija text.**

## Notes:
1. `Vocabulary Size` is the total number of tokens in the tokenizer's vocabulary.
2. `Token Count` is the total number of tokens generated by the tokenizer for the input text.
3. `Tokens/Character Ratio` is the ratio of the number of tokens to the number of characters in the input text.
4. `Latin Support` indicates whether the tokenizer can handle Latin characters in addition to Arabic characters (`✅` for yes, `❌` for no).
5. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
"""


def leaderboard_tab(df):
    st.header("Darija Tokenizers Leaderboard 🇲🇦")
    st.markdown("A comparison of different tokenizers for the Moroccan Darija dialect.")
    st.markdown(leaderboard_description)

    # Table first, sorted so the most efficient tokenizers appear on top
    df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
    st.dataframe(df, hide_index=True)

    model_name = st.text_input("Enter a tokenizer name from Hugging Face (e.g. google/gemma-2-27b-it)")
    col1, col2 = st.columns([3, 1])
    with col1:
        submit_new_model_btn = st.button("Submit New Model", type="primary")
    with col2:
        refresh_btn = st.button("Refresh", type="secondary")

    if submit_new_model_btn:
        if not model_name:
            st.warning("Please enter a tokenizer name first.")
        elif "Tokenizer" in df.columns and model_name in df["Tokenizer"].values:
            st.warning("This model is already in the leaderboard.")
        else:
            with st.spinner(f"Benchmarking {model_name}..."):
                input_text = "الللي اكتشفوا الساحل الغربي لأمريكا الشمالية"
                try:
                    tokenizer = get_tokenizer(model_name)
                    tokens = tokenizer.tokenize(input_text)
                    # If every token is the unknown token, the tokenizer cannot handle Arabic
                    if all(token == tokenizer.unk_token for token in tokens):
                        st.error(f"Tokenizer {model_name} does not support Arabic characters.")
                        return
                    benchmark_data = calculate_metrics(model_name, input_text)
                    if benchmark_data:
                        # DataFrame.append was removed in pandas 2.x; use pd.concat instead
                        df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
                        df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
                        df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
                        st.success(f"Model {model_name} added to the leaderboard.")
                        st.rerun()  # Refresh the page
                    else:
                        st.error(f"Failed to benchmark {model_name}")
                except Exception as e:
                    st.error(f"Error loading tokenizer {model_name}: {e}")

    if refresh_btn:
        try:
            df = pd.read_json(dataframe_path, lines=True)
            st.success("Leaderboard refreshed.")
        except Exception as e:
            st.error(f"Failed to refresh leaderboard: {e}")

    # Barplot last
    with st.expander("Tokens/Character Ratio Barplot (Lower is Better)", expanded=False):
        fig = plt.figure(figsize=(10, 6))
        # Sort by ratio ascending since lower is better
        sorted_df = df.sort_values("Tokens/Character Ratio", ascending=True)
        plt.bar(sorted_df["Tokenizer"], sorted_df["Tokens/Character Ratio"])
        plt.xticks(rotation=45, ha="right")
        plt.ylabel("Tokens/Character Ratio")
        plt.tight_layout()
        st.pyplot(fig)
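
# ---------------------------------------------------------------------------
# Illustrative only: the `utils` module imported above is not shown in this
# file. The sketches below use hypothetical `_example_` names (so they do not
# shadow the real imports) and assume Hugging Face `transformers` with
# AutoTokenizer-compatible checkpoints; the real helpers may differ.
# ---------------------------------------------------------------------------

def _example_get_tokenizer(tokenizer_id):
    # Hypothetical stand-in for utils.get_tokenizer: load from the Hub by repo id
    from transformers import AutoTokenizer

    return AutoTokenizer.from_pretrained(tokenizer_id)


def _example_get_tokenization(tokenizer, text):
    # Hypothetical stand-in for utils.get_tokenization
    return tokenizer.tokenize(text)


def _example_get_vocab_size(tokenizer):
    # Hypothetical stand-in for utils.get_vocab_size; len() includes added tokens
    return len(tokenizer)


def _example_check_latin_support(tokenizer):
    # Hypothetical stand-in for utils.check_latin_support: tokenize a Latin-script
    # (Arabizi) sample and report "❌" if every piece maps to the unknown token,
    # matching the ✅/❌ convention described in the leaderboard notes
    tokens = tokenizer.tokenize("salam, labas?")
    if tokens and all(t == tokenizer.unk_token for t in tokens):
        return "❌"
    return "✅"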
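
# A minimal entry-point sketch (an assumption; the app's real entry point is
# not shown here): load the persisted leaderboard if it exists, otherwise start
# from an empty frame with the columns produced by calculate_metrics, then
# render the tab. Run with: streamlit run <this_file>.py
if __name__ == "__main__":
    import os

    if os.path.exists(dataframe_path):
        leaderboard_df = pd.read_json(dataframe_path, lines=True)
    else:
        leaderboard_df = pd.DataFrame(
            columns=[
                "Tokenizer",
                "Vocabulary Size",
                "Token Count",
                "Tokens/Character Ratio",
                "Latin Support",
                "Tokenizer Class",
            ]
        )
    leaderboard_tab(leaderboard_df)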