import html
import logging

import pandas as pd
import streamlit as st

from utils import get_tokenizer, get_tokenization, get_vocab_size

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

dataframe_path = "darija_tokenizers_leaderboard.jsonl"


def calculate_metrics(tokenizer_id, text):
    """Compute basic tokenization statistics for one tokenizer on *text*.

    Loads the tokenizer via ``get_tokenizer``, tokenizes *text* with
    ``get_tokenization`` and reads the vocabulary size with
    ``get_vocab_size``.

    Args:
        tokenizer_id: Identifier passed to ``get_tokenizer``.
        text: The text to tokenize.

    Returns:
        A dict with the keys ``"Tokenizer"``, ``"Vocabulary Size"``,
        ``"Token Count"`` and ``"Tokens/Character Ratio"``, or ``None`` if
        anything failed (the error is logged and shown with ``st.error``).
    """
    logging.debug("Calculating metrics for tokenizer: %s", tokenizer_id)
    try:
        tokenizer = get_tokenizer(tokenizer_id)
        tokens = get_tokenization(tokenizer, text)
        vocab_size = get_vocab_size(tokenizer)
        tokens_count = len(tokens)
        # Guard against division by zero on empty input.
        tokens_ratio = tokens_count / len(text) if text else 0
        logging.debug(
            "Metrics calculated: vocab_size=%s, tokens_count=%s, tokens_ratio=%s",
            vocab_size,
            tokens_count,
            tokens_ratio,
        )
        return {
            "Tokenizer": tokenizer_id,
            "Vocabulary Size": vocab_size,
            "Token Count": tokens_count,
            "Tokens/Character Ratio": tokens_ratio,
        }
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # app, but keep the traceback in the log for debugging.
        logging.exception("Error processing %s: %s", tokenizer_id, e)
        st.error(f"Error processing {tokenizer_id}: {e}")
        return None


def _render_tokenizer_column(column, index, df, input_text):
    """Render the picker, metrics and token list for one comparison column.

    Args:
        column: The Streamlit column container to draw into.
        index: 1-based column number; it is embedded in every widget label
            and key so the two columns keep distinct widget state.
        df: Leaderboard frame; its ``"Tokenizer"`` column feeds the selectbox.
        input_text: Text to tokenize.
    """
    with column:
        st.subheader(f"Tokenizer {index}")
        source = st.radio(
            f"Select Tokenizer {index} Source",
            ["From Leaderboard", "Enter New Model"],
            key=f"tokenizer_{index}_source",
        )
        if source == "From Leaderboard":
            model = st.selectbox(
                f"Select Tokenizer {index}",
                df["Tokenizer"].tolist(),
                key=f"model_{index}",
            )
        else:
            model = st.text_input(
                f"Enter Tokenizer {index} Name",
                key=f"model_{index}_input",
            )

        # Nothing to compute until both a text and a model are provided.
        if not (input_text and model):
            return

        with st.spinner(f"Tokenizing with {model}..."):
            metrics = calculate_metrics(model, input_text)
            if not metrics:
                # calculate_metrics already reported the error to the user.
                return

            st.write(f"**Vocabulary Size:** {metrics['Vocabulary Size']}")
            st.write(f"**Token Count:** {metrics['Token Count']}")
            st.write(f"**Tokens/Character Ratio:** {metrics['Tokens/Character Ratio']:.4f}")

            tokenizer = get_tokenizer(model)
            tokens = tokenizer.tokenize(input_text)
            # The output is rendered with unsafe_allow_html=True, so every
            # token is HTML-escaped: otherwise tokens such as "<s>" or
            # "<unk>" would be parsed as tags, and user text could inject
            # arbitrary markup.
            tokens_html = ' '.join(html.escape(str(token)) for token in tokens)
            st.markdown(f"\n{tokens_html}\n", unsafe_allow_html=True)


def comparison_tab(df):
    """Draw the "Tokenizer Comparison" tab: two tokenizers side by side.

    Args:
        df: Leaderboard frame with a ``"Tokenizer"`` column listing the
            tokenizers that can be picked from the leaderboard.
    """
    st.header("Tokenizer Comparison")
    st.markdown("Compare two tokenizers side by side.")

    input_text = st.text_area("Enter text to compare:", "هذا مثال لنص بالدارجة المغربية")

    col1, col2 = st.columns(2)
    # Both columns share the same layout; only the number in labels/keys differs.
    _render_tokenizer_column(col1, 1, df, input_text)
    _render_tokenizer_column(col2, 2, df, input_text)