5w4n committed
Commit daa27ce
1 Parent(s): 238ae95
Files changed (1)
  1. app.py +151 -0
app.py ADDED
@@ -0,0 +1,151 @@
+ import streamlit as st
+ from collections import defaultdict
+ import tqdm
+ from transformers import AutoTokenizer
+ import pandas as pd
+ import plotly.express as px
+ from plotly.subplots import make_subplots
+ import plotly.graph_objects as go
+ import random
+
+
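+ # Cache the validation data so the CSV is read only once across Streamlit reruns.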
+ @st.cache_data
+ def load_data():
+     return pd.read_csv("MassiveDatasetValidationData.csv")
+
+
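+ # Show the text of one random utterance in the selected language, alongside the
+ # per-tokenizer token counts stored in the CSV. Uses the module-level `val_data`
+ # and `selected_tokenizers` that Streamlit re-creates on every rerun.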
+ def reload_example_text_data(language):
+     random_id = random.choice(val_data["id"])
+     tempdf = val_data[val_data["id"] == random_id]
+     tempdf = tempdf[["iso", "text", *selected_tokenizers]]
+     tempdf = tempdf[tempdf["iso"] == language]
+     tempdf.set_index("iso", inplace=True)
+     tempdf.columns = ["Text"] + [f"Num Tokens ({t})" for t in selected_tokenizers]
+     st.session_state.examplesdf = tempdf
+
+
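+ # Tokenizers available for comparison, referenced by Hugging Face Hub repo id.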
+ tokenizer_names_to_test = [
+     "openai/gpt4",
+     "Xenova/gpt-4o",
+     "Xenova/claude-tokenizer",
+     "CohereForAI/aya-101",
+     "meta-llama/Meta-Llama-3-70B",
+     "mistralai/Mixtral-8x22B-v0.1",
+     "google/gemma-7b",
+     "facebook/nllb-200-distilled-600M",
+     "xlm-roberta-base",
+     "bert-base-uncased",
+     "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+     "bigscience/bloom",
+     "StabilityAI/stablelm-base-alpha-7b",
+     "google/flan-t5-base",
+     "facebook/mbart-large-50",
+     "EleutherAI/gpt-neox-20b",
+ ]
+
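+ # The sidebar holds every control: tokenizer choices, dataset info, language, and figure type.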
+ with st.sidebar:
+     st.header("Comparing Tokenizers")
+     description = (
+         "This project compares the tokenization length for different tokenizers. "
+         "Some tokenizers may result in significantly more tokens than others for "
+         "the same text."
+     )
+     st.markdown(description)
+
+     st.header("Data Visualization")
+     st.subheader("Tokenizers")
+     selected_tokenizers = st.multiselect(
+         "Select tokenizers",
+         options=tokenizer_names_to_test,
+         default=["openai/gpt4", "Xenova/gpt-4o", "Xenova/claude-tokenizer"],
+         max_selections=6,
+         label_visibility="collapsed",
+     )
+
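+     # Load the dataset and report how many rows came back.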
+     st.subheader("Data")
+     with st.spinner("Loading dataset..."):
+         val_data = load_data()
+     st.success(f"Data loaded: {len(val_data)} rows")
+
+     with st.expander("Data Source"):
+         st.write(
+             "The data in this figure is the validation set of the "
+             "[Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) "
+             "dataset, which consists of 2033 short sentences and phrases "
+             "translated into 51 different languages. Learn more about the dataset "
+             "from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)."
+         )
+
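+     # Language picker; default to English whenever it appears in the data.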
+     st.subheader("Language")
+     language_options = sorted(val_data.lang.unique())
+     default_language_index = (
+         language_options.index("English") if "English" in language_options else 0
+     )
+     selected_language = st.selectbox(
+         "Select language",
+         options=language_options,
+         index=default_language_index,
+         label_visibility="collapsed",
+     )
+
+     st.subheader("Figure")
+     selected_figure = st.radio(
+         "Select figure type",
+         options=["Boxplot", "Histogram", "Scatterplot"],
+         index=0,
+         label_visibility="collapsed",
+     )
+
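+     # A random example sentence with its token counts; "Reload" draws a new one.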
+     st.header("Example Text")
+     with st.spinner("Loading example text..."):
+         reload_example_text_data(selected_language)
+     st.table(st.session_state.examplesdf)
+     st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
+
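+ # Count the tokens produced for every validation utterance by each selected tokenizer.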
+ tokenizer_to_num_tokens = defaultdict(list)
+ for tokenizer_name in selected_tokenizers:
+     # Load each tokenizer once, instead of re-loading it for every row.
+     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+     for text in tqdm.tqdm(val_data["text"], total=len(val_data)):
+         tokenizer_to_num_tokens[tokenizer_name].append(
+             len(tokenizer(text)["input_ids"])
+         )
+
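+ # Render the figure chosen in the sidebar from the collected token counts.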
+ if selected_figure == "Boxplot":
+     fig = go.Figure()
+     for tokenizer_name in selected_tokenizers:
+         fig.add_trace(
+             go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
+         )
+     fig.update_layout(
+         title="Distribution of Number of Tokens for Selected Tokenizers",
+         xaxis_title="Tokenizer",
+         yaxis_title="Number of Tokens",
+     )
+     st.plotly_chart(fig)
+ elif selected_figure == "Histogram":
+     fig = make_subplots(
+         rows=len(selected_tokenizers), cols=1, subplot_titles=selected_tokenizers
+     )
+     for i, tokenizer_name in enumerate(selected_tokenizers):
+         fig.add_trace(
+             go.Histogram(
+                 x=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name
+             ),
+             row=i + 1,
+             col=1,
+         )
+     fig.update_layout(
+         height=200 * len(selected_tokenizers),
+         title_text="Histogram of Number of Tokens",
+     )
+     st.plotly_chart(fig)
+ elif selected_figure == "Scatterplot":
+     df = pd.DataFrame(tokenizer_to_num_tokens)
+     fig = px.scatter_matrix(
+         df,
+         dimensions=selected_tokenizers,
+         color_discrete_sequence=px.colors.qualitative.Plotly,
+     )
+     fig.update_layout(
+         title="Scatterplot Matrix of Number of Tokens for Selected Tokenizers",
+         width=800,
+         height=800,
+     )
+     st.plotly_chart(fig)