|
import streamlit as st |
|
import pandas as pd |
|
from utils import get_tokenizer, get_tokenization, get_vocab_size, check_latin_support |
|
import logging |
|
import matplotlib.pyplot as plt |
|
|
|
|
|
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') |
|
|
|
dataframe_path = "darija_tokenizers_leaderboard.jsonl" |
|
|
|
|
|
def calculate_metrics(tokenizer_id, text):
    """Benchmark one tokenizer on *text* and build a leaderboard row.

    Parameters
    ----------
    tokenizer_id : str
        Hugging Face model id; the tokenizer is loaded via ``get_tokenizer``.
    text : str
        Sample text to tokenize.

    Returns
    -------
    dict | None
        A dict of leaderboard columns, or ``None`` when loading or
        tokenizing fails (the error is logged and shown in the UI).
    """
    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logging.debug("Calculating metrics for tokenizer: %s", tokenizer_id)
    try:
        tokenizer = get_tokenizer(tokenizer_id)
        tokens = get_tokenization(tokenizer, text)
        vocab_size = get_vocab_size(tokenizer)
        tokens_count = len(tokens)
        # Guard against empty input to avoid ZeroDivisionError.
        tokens_ratio = tokens_count / len(text) if len(text) > 0 else 0
        latin_support = check_latin_support(tokenizer)
        tokenizer_class = tokenizer.__class__.__name__
        logging.debug(
            "Metrics calculated: vocab_size=%s, tokens_count=%s, tokens_ratio=%s, "
            "latin_support=%s, tokenizer_class=%s",
            vocab_size, tokens_count, tokens_ratio, latin_support, tokenizer_class,
        )
        return {
            "Tokenizer": tokenizer_id,
            "Vocabulary Size": vocab_size,
            "Token Count": tokens_count,
            "Tokens/Character Ratio": tokens_ratio,
            "Latin Support": latin_support,
            "Tokenizer Class": tokenizer_class
        }
    except Exception as e:
        # Best-effort: report to both the log and the Streamlit UI, and
        # signal failure with None instead of crashing the app.
        logging.error("Error processing %s: %s", tokenizer_id, e)
        st.error(f"Error processing {tokenizer_id}: {e}")
        return None
|
|
|
# Markdown shown above the leaderboard table, explaining each column.
# Fixed: the yes/no markers in note 4 were mojibake'd (broken ✅/❌ emoji).
leaderboard_description = """
The `Tokens/Character Ratio` in this leaderboard is based on the number of tokens generated by the tokenizer divided by the number of characters in the input text.

**A tokenizer that scores low in this leaderboard is considered more efficient in parsing Darija text.**

## Notes:
1. `Vocabulary Size` is the total number of tokens in the tokenizer's vocabulary.
2. `Token Count` is the total number of tokens generated by the tokenizer for the input text.
3. `Tokens/Character Ratio` is the ratio of the number of tokens to the number of characters in the input text.
4. `Latin Support` indicates whether the tokenizer can handle Latin characters in addition to Arabic characters (`✅` for yes, `❌` for no).
5. `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
"""
|
|
|
def leaderboard_tab(df):
    """Render the leaderboard tab: table, model-submission form, and barplot.

    Parameters
    ----------
    df : pandas.DataFrame
        Current leaderboard rows (one row per tokenizer), as loaded from
        ``dataframe_path``.
    """
    st.header("Darija Tokenizers Leaderboard 🇲🇦")
    st.markdown("A comparison of different tokenizers for the Moroccan Darija dialect.")
    st.markdown(leaderboard_description)

    # Lower ratio means a more efficient tokenizer, so sort ascending.
    df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
    st.dataframe(df, hide_index=True)

    model_name = st.text_input("Enter a tokenizer name from Hugging Face (e.g. google/gemma-2-27b-it)")
    col1, col2 = st.columns([3, 1])
    with col1:
        submit_new_model_btn = st.button("Submit New Model", type="primary")
    with col2:
        refresh_btn = st.button("Refresh", type="secondary")

    if submit_new_model_btn:
        if not model_name:
            # Don't attempt to load a tokenizer for an empty model id.
            st.warning("Please enter a tokenizer name first.")
        elif "Tokenizer" in df.columns and model_name in df["Tokenizer"].values:
            st.warning("This model is already in the leaderboard.")
        else:
            with st.spinner(f"Benchmarking {model_name}..."):
                # NOTE(review): this Darija sample sentence appears
                # mis-encoded in the source view; confirm/restore the
                # original Arabic text before relying on the exact ratios.
                input_text = "ุงูููู ุงูุชุดููุง ุงูุณุงุญู ุงูุบุฑุจู ูุฃูุฑููุง ุงูุดูุงููุฉ"
                try:
                    tokenizer = get_tokenizer(model_name)
                    tokens = tokenizer.tokenize(input_text)
                    # If every token is <unk>, the vocab has no Arabic coverage.
                    if all(token == tokenizer.unk_token for token in tokens):
                        st.error(f"Tokenizer {model_name} does not support Arabic characters.")
                        return
                    benchmark_data = calculate_metrics(model_name, input_text)
                    if benchmark_data:
                        # pd.concat replaces the private DataFrame._append API
                        # (public .append was removed in pandas 2.0).
                        df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
                        df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
                        df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
                        st.success(f"Model {model_name} added to the leaderboard.")
                        st.rerun()
                    else:
                        st.error(f"Failed to benchmark {model_name}")
                except Exception as e:
                    st.error(f"Error loading tokenizer {model_name}: {e}")

    if refresh_btn:
        try:
            df = pd.read_json(dataframe_path, lines=True)
            st.success("Leaderboard refreshed.")
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate; keep the traceback in the log for debugging.
            logging.exception("Failed to refresh leaderboard from %s", dataframe_path)
            st.error("Failed to refresh leaderboard.")

    with st.expander("Tokens/Character Ratio Barplot (Lower is Better)", expanded=False):
        fig = plt.figure(figsize=(10, 6))
        sorted_df = df.sort_values("Tokens/Character Ratio", ascending=True)
        plt.bar(sorted_df["Tokenizer"], sorted_df["Tokens/Character Ratio"])
        plt.xticks(rotation=45, ha='right')
        plt.ylabel("Tokens/Character Ratio")
        plt.tight_layout()
        st.pyplot(fig)