Spaces:
Runtime error
Runtime error
File size: 5,432 Bytes
daa27ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import streamlit as st
from collections import defaultdict
import tqdm
import transformers
from transformers import AutoTokenizer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import random, glob
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read the validation CSV into a DataFrame.

    The function is wrapped in ``st.cache_data``, so Streamlit reuses the
    parsed frame across script reruns instead of re-reading the file.
    """
    csv_path = "MassiveDatasetValidationData.csv"
    frame = pd.read_csv(csv_path)
    return frame
def reload_example_text_data(language):
    """Pick a random sentence id and store its example row in session state.

    Reads the module-level ``val_data`` and ``selected_tokenizers``. The rows
    for one randomly chosen ``id`` are narrowed to those whose ``iso`` value
    equals *language*, indexed by ``iso``, relabelled for display, and
    written to ``st.session_state.examplesdf``.

    NOTE(review): the filter compares the ``iso`` column with *language*;
    confirm callers pass values from that same column, otherwise the
    resulting table is empty.
    """
    chosen_id = random.choice(val_data["id"])

    # Keep only the columns shown in the example table.
    wanted_columns = ["iso", "text"]
    wanted_columns.extend(selected_tokenizers)

    rows = val_data.loc[val_data["id"] == chosen_id, wanted_columns]
    rows = rows.loc[rows["iso"] == language].set_index("iso")

    # "iso" is now the index, so the remaining columns are the text followed
    # by one token-count column per selected tokenizer.
    rows.columns = [
        "Text",
        *(f"Num Tokens ({name})" for name in selected_tokenizers),
    ]
    st.session_state.examplesdf = rows
# Tokenizer identifiers offered in the sidebar multiselect. The same strings
# are used as column names when building the example table and are passed to
# AutoTokenizer.from_pretrained() when counting tokens, so they must not be
# altered.
tokenizer_names_to_test = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
]
# Sidebar: project blurb plus the controls that drive the rest of the page
# (tokenizer selection, dataset load, language and figure type).
# NOTE(review): the original indentation was lost; the extent of each `with`
# body below is reconstructed from the statements themselves - confirm.
with st.sidebar:
    st.header("Comparing Tokenizers")
    link = "This project compares the tokenization length for different tokenizers. Some tokenizers may result in significantly more tokens than others for the same text."
    st.markdown(link)

    st.header("Data Visualization")

    # --- Tokenizers -------------------------------------------------------
    st.subheader("Tokenizers")
    selected_tokenizers = st.multiselect(
        label="Select tokenizers",
        options=tokenizer_names_to_test,
        default=["openai/gpt4", "Xenova/gpt-4o", "Xenova/claude-tokenizer"],
        max_selections=6,
        label_visibility="collapsed",
    )

    # --- Data ---------------------------------------------------------------
    st.subheader("Data")
    with st.spinner("Loading dataset..."):
        val_data = load_data()
    st.success(f"Data loaded: {len(val_data)}")
    with st.expander("Data Source"):
        st.write(
            "The data in this figure is the validation set of the [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) dataset, which consists of 2033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)"
        )

    # --- Language -----------------------------------------------------------
    st.subheader("Language")
    language_options = sorted(val_data["lang"].unique())
    # Preselect "English" when the dataset offers it; otherwise fall back to
    # the first (alphabetically smallest) option.
    try:
        default_language_index = language_options.index("English")
    except ValueError:
        default_language_index = 0
    selected_language = st.selectbox(
        label="Select language",
        options=language_options,
        index=default_language_index,
        label_visibility="collapsed",
    )

    # --- Figure -------------------------------------------------------------
    st.subheader("Figure")
    selected_figure = st.radio(
        label="Select figure type",
        options=["Boxplot", "Histogram", "Scatterplot"],
        index=0,
        label_visibility="collapsed",
    )
# Example text: show one randomly chosen sentence for the selected language
# together with its token counts, plus a button that draws another sentence.
# NOTE(review): assumed to render on the main page rather than inside the
# sidebar; the original indentation was lost - confirm.
st.header("Example Text")
with st.spinner("Loading example text..."):
    reload_example_text_data(selected_language)
st.table(st.session_state.examplesdf)
st.button(
    label="Reload",
    on_click=reload_example_text_data,
    args=(selected_language,),
)
# Count tokens per validation sentence for every selected tokenizer.
#
# Each tokenizer is loaded exactly once, outside the per-sentence loop.
# Loading it inside the row loop repeated AutoTokenizer.from_pretrained()
# for every row of val_data, although the result depends only on the
# tokenizer name.
#
# The mapping keeps the same shape as before: tokenizer name -> list of token
# counts in val_data row order. A selected tokenizer now always gets an entry,
# even when val_data has no rows.
#
# NOTE(review): every selected name must be loadable through
# AutoTokenizer.from_pretrained(); "openai/gpt4" does not look like a Hub
# repository id - confirm it resolves.
tokenizer_to_num_tokens = defaultdict(list)
texts = val_data["text"]
for tokenizer_name in selected_tokenizers:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    tokenizer_to_num_tokens[tokenizer_name] = [
        len(tokenizer(text)["input_ids"])
        for text in tqdm.tqdm(texts, total=len(texts), desc=tokenizer_name)
    ]
# Render the figure type chosen in the sidebar from the per-tokenizer token
# counts collected in tokenizer_to_num_tokens.
if not selected_tokenizers:
    # Nothing is selected, so there is nothing to plot. The histogram branch
    # would also fail outright, because make_subplots() rejects rows=0.
    st.info("Select at least one tokenizer to display a figure.")
elif selected_figure == "Boxplot":
    # One box per tokenizer, all sharing a single y axis.
    fig = go.Figure(
        data=[
            go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
            for tokenizer_name in selected_tokenizers
        ]
    )
    fig.update_layout(
        title="Distribution of Number of Tokens for Selected Tokenizers",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
    )
    st.plotly_chart(fig)
elif selected_figure == "Histogram":
    # One stacked subplot row per tokenizer; subplot rows are 1-based.
    fig = make_subplots(
        rows=len(selected_tokenizers),
        cols=1,
        subplot_titles=selected_tokenizers,
    )
    for row_number, tokenizer_name in enumerate(selected_tokenizers, start=1):
        fig.add_trace(
            go.Histogram(
                x=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name
            ),
            row=row_number,
            col=1,
        )
    fig.update_layout(
        height=200 * len(selected_tokenizers),
        title_text="Histogram of Number of Tokens",
    )
    st.plotly_chart(fig)
elif selected_figure == "Scatterplot":
    # Pairwise comparison: one DataFrame column per tokenizer.
    df = pd.DataFrame(tokenizer_to_num_tokens)
    fig = px.scatter_matrix(
        df,
        dimensions=selected_tokenizers,
        color_discrete_sequence=px.colors.qualitative.Plotly,
    )
    fig.update_layout(
        title="Scatterplot Matrix of Number of Tokens for Selected Tokenizers",
        width=800,
        height=800,
    )
    st.plotly_chart(fig)
|