|
import streamlit as st |
|
import pandas as pd |
|
from utils import get_tokenizer, get_tokenization, get_vocab_size, check_latin_support |
|
import logging |
|
import matplotlib.pyplot as plt |
|
|
|
|
|
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') |
|
|
|
dataframe_path = "darija_tokenizers_leaderboard.jsonl" |
|
|
|
|
|
def calculate_metrics(tokenizer_id, text):
    """Benchmark one tokenizer on *text* and build a leaderboard row.

    Parameters
    ----------
    tokenizer_id : str
        Hugging Face model id; the tokenizer is loaded via ``get_tokenizer``.
    text : str
        Sample text to tokenize.

    Returns
    -------
    dict | None
        A dict of leaderboard columns, or ``None`` when loading or
        tokenizing fails (the error is logged and shown in the UI).
    """
    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logging.debug("Calculating metrics for tokenizer: %s", tokenizer_id)
    try:
        tokenizer = get_tokenizer(tokenizer_id)
        tokens = get_tokenization(tokenizer, text)
        vocab_size = get_vocab_size(tokenizer)
        tokens_count = len(tokens)
        # Guard against empty input to avoid ZeroDivisionError.
        tokens_ratio = tokens_count / len(text) if len(text) > 0 else 0
        latin_support = check_latin_support(tokenizer)
        tokenizer_class = tokenizer.__class__.__name__
        logging.debug(
            "Metrics calculated: vocab_size=%s, tokens_count=%s, tokens_ratio=%s, "
            "latin_support=%s, tokenizer_class=%s",
            vocab_size, tokens_count, tokens_ratio, latin_support, tokenizer_class,
        )
        return {
            "Tokenizer": tokenizer_id,
            "Vocabulary Size": vocab_size,
            "Token Count": tokens_count,
            "Tokens/Character Ratio": tokens_ratio,
            "Latin Support": latin_support,
            "Tokenizer Class": tokenizer_class
        }
    except Exception as e:
        # Best-effort: report to both the log and the Streamlit UI, and
        # signal failure with None instead of crashing the app.
        logging.error("Error processing %s: %s", tokenizer_id, e)
        st.error(f"Error processing {tokenizer_id}: {e}")
        return None
|
|
|
# Markdown shown above the leaderboard table, explaining each column.
# Fixed: the yes/no markers in note 4 were mojibake'd (broken ✅/❌ emoji).
leaderboard_description = """
The `Tokens/Character Ratio` in this leaderboard is based on the number of tokens generated by the tokenizer divided by the number of characters in the input text.

**A tokenizer that scores low in this leaderboard is considered more efficient in parsing Darija text.**

## Notes:
1. `Vocabulary Size` is the total number of tokens in the tokenizer's vocabulary.
2. `Token Count` is the total number of tokens generated by the tokenizer for the input text.
3. `Tokens/Character Ratio` is the ratio of the number of tokens to the number of characters in the input text.
4. `Latin Support` indicates whether the tokenizer can handle Latin characters in addition to Arabic characters (`✅` for yes, `❌` for no).
5. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
"""
|
|
|
def leaderboard_tab(df):
    """Render the leaderboard tab: table, model-submission form, and barplot.

    Parameters
    ----------
    df : pandas.DataFrame
        Current leaderboard rows (one row per tokenizer), as loaded from
        ``dataframe_path``.
    """
    st.header("Darija Tokenizers Leaderboard 🇲🇦")
    st.markdown("A comparison of different tokenizers for the Moroccan Darija dialect.")
    st.markdown(leaderboard_description)

    # Lower ratio means a more efficient tokenizer, so sort ascending.
    df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
    st.dataframe(df, hide_index=True)

    model_name = st.text_input("Enter a tokenizer name from Hugging Face (e.g. google/gemma-2-27b-it)")
    col1, col2 = st.columns([3, 1])
    with col1:
        submit_new_model_btn = st.button("Submit New Model", type="primary")
    with col2:
        refresh_btn = st.button("Refresh", type="secondary")

    if submit_new_model_btn:
        if not model_name:
            # Don't attempt to load a tokenizer for an empty model id.
            st.warning("Please enter a tokenizer name first.")
        elif "Tokenizer" in df.columns and model_name in df["Tokenizer"].values:
            st.warning("This model is already in the leaderboard.")
        else:
            with st.spinner(f"Benchmarking {model_name}..."):
                # NOTE(review): this Darija sample sentence appears
                # mis-encoded in the source view; confirm/restore the
                # original Arabic text before relying on the exact ratios.
                input_text = "ุงูููู ุงูุชุดููุง ุงูุณุงุญู ุงูุบุฑุจู ูุฃูุฑููุง ุงูุดูุงููุฉ"
                try:
                    tokenizer = get_tokenizer(model_name)
                    tokens = tokenizer.tokenize(input_text)
                    # If every token is <unk>, the vocab has no Arabic coverage.
                    if all(token == tokenizer.unk_token for token in tokens):
                        st.error(f"Tokenizer {model_name} does not support Arabic characters.")
                        return
                    benchmark_data = calculate_metrics(model_name, input_text)
                    if benchmark_data:
                        # pd.concat replaces the private DataFrame._append API
                        # (public .append was removed in pandas 2.0).
                        df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
                        df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
                        df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
                        st.success(f"Model {model_name} added to the leaderboard.")
                        st.rerun()
                    else:
                        st.error(f"Failed to benchmark {model_name}")
                except Exception as e:
                    st.error(f"Error loading tokenizer {model_name}: {e}")

    if refresh_btn:
        try:
            df = pd.read_json(dataframe_path, lines=True)
            st.success("Leaderboard refreshed.")
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; keep the traceback in the log for debugging.
            logging.exception("Failed to refresh leaderboard from %s", dataframe_path)
            st.error("Failed to refresh leaderboard.")

    with st.expander("Tokens/Character Ratio Barplot (Lower is Better)", expanded=False):
        fig = plt.figure(figsize=(10, 6))
        sorted_df = df.sort_values("Tokens/Character Ratio", ascending=True)
        plt.bar(sorted_df["Tokenizer"], sorted_df["Tokens/Character Ratio"])
        plt.xticks(rotation=45, ha='right')
        plt.ylabel("Tokens/Character Ratio")
        plt.tight_layout()
        st.pyplot(fig)