abdeljalilELmajjodi's picture
Update tabs/leaderboard.py
9f0a734 verified
import streamlit as st
import pandas as pd
from utils import get_tokenizer, get_tokenization, get_vocab_size, check_latin_support
import logging
import matplotlib.pyplot as plt
# Configure logging
# DEBUG level at module import time; applies process-wide via the root logger.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# Path to the persisted leaderboard: one JSON record per line (JSON Lines).
dataframe_path = "darija_tokenizers_leaderboard.jsonl"
def calculate_metrics(tokenizer_id, text):
    """Benchmark a single tokenizer on *text* and return one leaderboard row.

    Args:
        tokenizer_id: Hugging Face model id of the tokenizer to load.
        text: Sample text to tokenize.

    Returns:
        dict keyed by the leaderboard column names, or None when loading or
        tokenizing fails (the error is logged and shown in the Streamlit UI).
    """
    # Lazy %-style args so the message is only formatted when DEBUG is enabled.
    logging.debug("Calculating metrics for tokenizer: %s", tokenizer_id)
    try:
        tokenizer = get_tokenizer(tokenizer_id)
        tokens = get_tokenization(tokenizer, text)
        vocab_size = get_vocab_size(tokenizer)
        tokens_count = len(tokens)
        # Guard against empty input text to avoid ZeroDivisionError.
        tokens_ratio = tokens_count / len(text) if text else 0
        latin_support = check_latin_support(tokenizer)
        tokenizer_class = tokenizer.__class__.__name__
        logging.debug(
            "Metrics calculated: vocab_size=%s, tokens_count=%s, tokens_ratio=%s, "
            "latin_support=%s, tokenizer_class=%s",
            vocab_size, tokens_count, tokens_ratio, latin_support, tokenizer_class,
        )
        return {
            "Tokenizer": tokenizer_id,
            "Vocabulary Size": vocab_size,
            "Token Count": tokens_count,
            "Tokens/Character Ratio": tokens_ratio,
            "Latin Support": latin_support,
            "Tokenizer Class": tokenizer_class,
        }
    except Exception as e:
        # Boundary handler: surface the failure in the UI instead of crashing
        # the whole Streamlit script run.
        logging.error("Error processing %s: %s", tokenizer_id, e)
        st.error(f"Error processing {tokenizer_id}: {e}")
        return None
# Markdown shown above the leaderboard table. The yes/no markers were
# mojibake (UTF-8 emoji mis-decoded as a Thai codepage) and are restored here.
leaderboard_description = """
The `Tokens/Character Ratio` in this leaderboard is based on the number of tokens generated by the tokenizer divided by the number of characters in the input text.
**A tokenizer that scores low in this leaderboard is considered more efficient in parsing Darija text.**
## Notes:
1. `Vocabulary Size` is the total number of tokens in the tokenizer's vocabulary.
2. `Token Count` is the total number of tokens generated by the tokenizer for the input text.
3. `Tokens/Character Ratio` is the ratio of the number of tokens to the number of characters in the input text.
4. `Latin Support` indicates whether the tokenizer can handle Latin characters in addition to Arabic characters (`✅` for yes, `❌` for no).
5. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
"""
def leaderboard_tab(df):
    """Render the leaderboard tab: ranking table, new-model submission, barplot.

    Args:
        df: pandas DataFrame with one row per benchmarked tokenizer, using the
            column names produced by calculate_metrics.
    """
    # NOTE: the flag emoji and the Arabic sample sentence below were mojibake
    # in the previous revision; they are restored to proper UTF-8 here.
    st.header("Darija Tokenizers Leaderboard 🇲🇦")
    st.markdown("A comparison of different tokenizers for the Moroccan Darija dialect.")
    st.markdown(leaderboard_description)

    # A lower Tokens/Character Ratio ranks higher (more efficient tokenizer).
    df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
    st.dataframe(df, hide_index=True)  # Table first

    model_name = st.text_input("Enter a tokenizer name from Hugging Face (e.g. google/gemma-2-27b-it)")
    col1, col2 = st.columns([3, 1])
    with col1:
        submit_new_model_btn = st.button("Submit New Model", type="primary")
    with col2:
        refresh_btn = st.button("Refresh", type="secondary")

    if submit_new_model_btn:
        if not model_name:
            # Previously an empty id fell through to a confusing load error.
            st.warning("Please enter a tokenizer name first.")
        elif "Tokenizer" in df.columns and model_name in df["Tokenizer"].values:
            st.warning("This model is already in the leaderboard.")
        else:
            with st.spinner(f"Benchmarking {model_name}..."):
                input_text = "الللي اكتشفوا الساحل الغربي لأمريكا الشمالية"
                try:
                    tokenizer = get_tokenizer(model_name)
                    tokens = tokenizer.tokenize(input_text)
                    # If every token is <unk>, the vocabulary has no Arabic
                    # coverage. Require a non-empty token list so an empty
                    # result doesn't trip all()'s vacuous truth.
                    if tokens and all(token == tokenizer.unk_token for token in tokens):
                        st.error(f"Tokenizer {model_name} does not support Arabic characters.")
                        return
                    benchmark_data = calculate_metrics(model_name, input_text)
                    if benchmark_data:
                        # DataFrame._append is a private API removed in pandas
                        # 2.x; pd.concat is the supported replacement.
                        df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
                        df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
                        df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
                        st.success(f"Model {model_name} added to the leaderboard.")
                        st.rerun()  # Refresh the page so the new row shows up
                    else:
                        st.error(f"Failed to benchmark {model_name}")
                except Exception as e:
                    st.error(f"Error loading tokenizer {model_name}: {e}")

    if refresh_btn:
        try:
            df = pd.read_json(dataframe_path, lines=True)
            st.success("Leaderboard refreshed.")
        except (ValueError, OSError) as e:
            # Narrowed from a bare except:, which also swallowed SystemExit /
            # KeyboardInterrupt; log the cause instead of hiding it.
            logging.error("Failed to refresh leaderboard: %s", e)
            st.error("Failed to refresh leaderboard.")

    with st.expander("Tokens/Character Ratio Barplot (Lower is Better)", expanded=False):  # Barplot last
        fig = plt.figure(figsize=(10, 6))
        # Sort by ratio ascending since lower is better
        sorted_df = df.sort_values("Tokens/Character Ratio", ascending=True)
        plt.bar(sorted_df["Tokenizer"], sorted_df["Tokens/Character Ratio"])
        plt.xticks(rotation=45, ha='right')
        plt.ylabel("Tokens/Character Ratio")
        plt.tight_layout()
        st.pyplot(fig)