Hynek Kydlíček commited on
Commit
c923467
1 Parent(s): 28a3b2a
app.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
2
+
3
+ import ast
4
+ import argparse
5
+ import glob
6
+ import pickle
7
+
8
+ import gradio as gr
9
+ import numpy as np
10
+ import pandas as pd
11
+ import plotly.graph_objects as go
12
+ import pandas as pd
13
+
14
+
15
def make_default_md():
    """Return the static Markdown header shown at the top of the leaderboard page.

    The text introduces CZ-EVAL, lists the datasets it is evaluated on and
    links to the evaluation code. It takes no input and always returns the
    same string.
    """
    # One entry per line of Markdown; empty entries are the blank lines that
    # separate paragraphs (and the leading/trailing newline of the block).
    header_lines = (
        "",
        "# 🏆 CZ-EVAL Leaderboard",
        "[Developer](https://me.hynky.name/) | [Twitter](https://twitter.com/HKydlicek)",
        "",
        "CZ-EVAL is a evaluation leadboard of Tasks in Czech for LLMs.",
        "",
        "It's evaluated on following datasets:",
        "",
        "- Math Problems Understanding [Klokan-QA](https://huggingface.co/datasets/hynky/klokan-qa)",
        "- Reasoning and General Knowledge [TSP-QA](https://huggingface.co/datasets/hynky/tsp-qa)",
        "",
        "💻 Code: The evaluation code can be found at [hynky1999/LLM-Eval](https://github.com/hynky1999/LLM-Eval). Model inference is done using [Open-Router](https://openrouter.ai/) or on cloud using [Modal Labs](https://modal.com/).",
        "",
    )
    return "\n".join(header_lines)
31
+
32
+
33
def make_arena_leaderboard_md(arena_df):
    """Return the one-line Markdown summary shown above the leaderboard table.

    Args:
        arena_df: Any sized object (the caller passes the leaderboard
            DataFrame); its ``len()`` is reported as the number of models.

    Returns:
        A Markdown string with the model count and a fixed "last updated" date.
    """
    model_count = len(arena_df)
    return f"\nTotal #models: **{model_count}**. Last updated: Feb 15, 2024.\n"
40
+
41
+
42
def make_full_leaderboard_md(elo_results):
    """Return Markdown describing the benchmarks shown on the leaderboard.

    The previous text advertised three benchmarks ("Arena Elo", "MT-Bench",
    "MMLU") that this leaderboard does not contain and ended the TSP entry
    mid-sentence; the text now matches the two datasets actually listed.

    Args:
        elo_results: Not used by this function; the parameter is kept so that
            existing callers keep working.

    Returns:
        A static Markdown string.
    """
    return """
Two benchmarks are displayed: **Klokan-QA** and **TSP**.
- [Klokan-QA](https://huggingface.co/datasets/hynky/klokan-qa) - Mathematical competitions dataset
- [TSP](https://huggingface.co/datasets/hynky/TSP) - Comprehensive dataset of reasoning and general knowledge questions

"""
50
+
51
+
52
+ # Combine all category accuracies into a single DataFrame
53
+
54
+
55
def plot_spider(df, title):
    """Build a radar ("spider") chart with one closed trace per row of ``df``.

    Args:
        df: DataFrame whose first column holds the trace name and whose
            remaining columns hold one value per category. The radial axis
            is fixed to 0-100 and labelled as percentages.
        title: Text displayed centered above the chart.

    Returns:
        A ``plotly.graph_objects.Figure`` containing one ``Scatterpolar``
        trace for every row of ``df``.
    """
    categories = df.columns.tolist()[1:]
    # Repeat the first category at the end so every polygon closes. Slicing
    # (instead of indexing) keeps this safe when there are no category columns.
    categories = categories + categories[:1]

    colors = [
        "#1f77b4",  # muted blue
        "#ff7f0e",  # safety orange
        "#2ca02c",  # cooked asparagus green
        "#d62728",  # brick red
        "#9467bd",  # muted purple
        "#8c564b",  # chestnut brown
        "#e377c2",  # raspberry yogurt pink
        "#7f7f7f",  # middle gray
        "#bcbd22",  # curry yellow-green
        "#17becf",  # blue-teal
    ]

    fig = go.Figure()

    for i, (_, row) in enumerate(df.iterrows()):
        # Convert once to a plain list and split it positionally. Indexing the
        # Series with ``row[0]`` relies on the positional fallback for
        # label-indexed Series, which is deprecated in pandas 2.x.
        values = row.tolist()
        name, scores = values[0], values[1:]
        fig.add_trace(
            go.Scatterpolar(
                r=scores + scores[:1],  # close the polygon, as for categories
                theta=categories,
                opacity=0.4,
                name=name,
                line=dict(
                    # Cycle through the palette so more rows than colors
                    # does not raise IndexError.
                    color=colors[i % len(colors)],
                    width=4,  # thick line for better visibility
                ),
            )
        )

    fig.update_layout(
        width=600,
        height=628,
        polar=dict(
            angularaxis=dict(
                gridwidth=2,  # thicker grid lines for better visibility
                rotation=90,
                direction="clockwise",
            ),
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                angle=45,
                tickangle=45,
                tickvals=[0, 25, 50, 75, 100],
                ticktext=["0%", "25%", "50%", "75%", "100%"],
            ),
        ),
        title_text=title,
        title_x=0.5,
        title_y=0.97,
        title_xanchor="center",
        title_yanchor="top",
        title_font_size=24,
        title_font_color="#333333",
        font=dict(family="Arial", size=16, color="#333333"),
        legend=dict(
            orientation="h", yanchor="bottom", y=-0.45, xanchor="center", x=0.5
        ),
    )
    return fig
127
+
128
+
129
def openrouter_hyperlink(model_name):
    """Return an HTML anchor linking ``model_name`` to its OpenRouter model page.

    The name is inserted verbatim (no escaping) both as the URL path suffix
    and as the visible link text; the link opens in a new tab.
    """
    url = f"https://openrouter.ai/models/{model_name}"
    style = (
        "color: var(--link-text-color); "
        "text-decoration: underline;"
        "text-decoration-style: dotted;"
    )
    return '<a target="_blank" href="{}" style="{}">{}</a>'.format(
        url, style, model_name
    )
131
+
132
+
133
def get_full_table(model_table_df):
    """Turn the raw per-model score table into the display-ready leaderboard.

    The function adds an ``average`` of the five score columns, scales all
    scores by 100 and rounds them to two decimals, sorts by average
    (best first), prepends a 1-based ``rank`` column, turns model names into
    OpenRouter links and renames the columns to their display labels.

    Unlike the previous version, the input frame is no longer modified in
    place, so the function can be called repeatedly on the same frame
    (previously a second call failed when inserting the existing ``rank``
    column).

    Args:
        model_table_df: DataFrame with a ``model_name`` column and the score
            columns ``klokan``, ``culture``, ``analytical``, ``critical`` and
            ``verbal``.

    Returns:
        A new DataFrame with the ranked, formatted leaderboard.
    """
    num_cols = ["klokan", "culture", "analytical", "critical", "verbal"]
    score_cols = num_cols + ["average"]

    # Work on a copy so the caller's DataFrame is left untouched.
    table = model_table_df.copy()

    # Average first, then scale everything by 100 and round to 2 decimals.
    table["average"] = table[num_cols].mean(axis=1)
    table[score_cols] = (table[score_cols] * 100).round(2)

    # Best average first; rank is simply the position after sorting.
    table = table.sort_values(by="average", ascending=False)
    table.insert(0, "rank", np.arange(1, len(table) + 1))

    # Replace the plain model name by an HTML link.
    table["model_name"] = table["model_name"].apply(openrouter_hyperlink)

    return table.rename(
        columns={
            "model_name": "🤖 Model",
            "klokan": "🧮 Klokan-QA",
            "culture": "🌍 TSP-Culture",
            "analytical": "🔍 TSP-Analytical",
            "critical": "💡 TSP-Critical",
            "verbal": "📖 TSP-Verbal",
            "average": "📊 Average",
        }
    )
165
+
166
+
167
def build_leaderboard_tab(leaderboard_table_file, klokan_table_file, tsp_table_file):
    """Create the leaderboard UI components.

    Must be called while a ``gr.Blocks`` context is open, because the
    components are created for their side effect of being added to the page.

    Args:
        leaderboard_table_file: Path of the CSV with per-model scores; it is
            read with ``pd.read_csv`` and formatted by ``get_full_table``.
        klokan_table_file: Path of the CSV plotted as the Klokan-QA radar chart.
        tsp_table_file: Path of the CSV plotted as the TSP radar chart.

    Returns:
        ``[header_markdown, klokan_plot, tsp_plot]`` - the header Markdown
        component and the two plot components.
    """
    results = get_full_table(pd.read_csv(leaderboard_table_file))

    md_1 = gr.Markdown(make_default_md(), elem_id="leaderboard_markdown")
    with gr.Tabs():
        with gr.Tab("CZ-EVAL Leaderboard", id=0):
            gr.Markdown(
                make_arena_leaderboard_md(results), elem_id="leaderboard_markdown"
            )
            # NOTE(review): get_full_table appears to yield 8 columns for the
            # bundled table, while both lists below have 11 entries; the
            # trailing three look like leftovers - confirm and trim.
            gr.Dataframe(
                datatype=[
                    "str",
                    "markdown",
                    "number",
                    "number",
                    "number",
                    "number",
                    "number",
                    "number",
                    "str",
                    "str",
                    "str",
                ],
                value=results,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[
                    50,
                    200,
                    120,
                    100,
                    100,
                    150,
                    150,
                    100,
                    150,
                    150,
                    150,
                ],
                wrap=True,
            )

    # NOTE(review): the statistics section is placed below the tabs; confirm
    # this nesting matches the intended layout.
    # Chart title typo fixed: "Acurracy" -> "Accuracy".
    p1 = plot_spider(pd.read_csv(klokan_table_file), "Klokan-QA - Accuracy")
    p2 = plot_spider(pd.read_csv(tsp_table_file), "TSP - Accuracy")

    gr.Markdown(
        "## More Statistics for CZ-EVAL\n\nBelow are figures for more statistics.\n",
        elem_id="leaderboard_markdown",
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                "#### Figure 1: Performance of models on Klokan-QA per difficulty"
            )
            plot_1 = gr.Plot(p1, show_label=False)
        with gr.Column():
            gr.Markdown("#### Figure 2: Performance of models on TSP dataset")
            plot_2 = gr.Plot(p2, show_label=False)

    return [md_1, plot_1, plot_2]
233
+
234
+
235
# Custom CSS passed to ``gr.Blocks(css=...)`` in ``build_demo``.
# Rules are keyed on component ``elem_id`` values; the ``footer`` rule hides
# the page footer.
# NOTE(review): within this file no component uses the ids ``notice_markdown``
# or ``leaderboard_dataframe`` (the table's id is
# ``arena_leaderboard_dataframe``), so those rules presumably match nothing -
# confirm before relying on them.
block_css = """
#notice_markdown {
    font-size: 104%
}
#notice_markdown th {
    display: none;
}
#notice_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_markdown {
    font-size: 104%
}
#leaderboard_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_dataframe td {
    line-height: 0.1em;
}
footer {
    display:none !important
}
.image-container {
    display: flex;
    align-items: center;
    padding: 1px;
}
.image-container img {
    margin: 0 30px;
    height: 20px;
    max-height: 100%;
    width: auto;
    max-width: 20%;
}
"""
272
+
273
+
274
def build_demo(leadboard_table, klokan_table, tsp_table):
    """Assemble the complete Gradio app for the CZ-EVAL leaderboard.

    Args:
        leadboard_table: Path of the CSV with the per-model score table.
        klokan_table: Path of the CSV used for the Klokan-QA radar chart.
        tsp_table: Path of the CSV used for the TSP radar chart.

    Returns:
        The ``gr.Blocks`` app, themed with large text and the module's
        ``block_css``, containing the leaderboard tab.
    """
    theme = gr.themes.Base(text_size=gr.themes.sizes.text_lg)

    with gr.Blocks(title="CZ-EVAL Leaderboard", theme=theme, css=block_css) as demo:
        # Components register themselves with the open Blocks context, so the
        # returned component list is not needed here.
        build_leaderboard_tab(leadboard_table, klokan_table, tsp_table)

    return demo
286
+
287
+
288
# Build the app at import time so it is available as the module-level name
# ``demo``. The CSV paths are relative, so they are resolved against the
# current working directory.
demo = build_demo(
    leadboard_table="./leaderboard/table.csv",
    klokan_table="./leaderboard/klokan.csv",
    tsp_table="./leaderboard/tsp.csv",
)

# Start the Gradio server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
leaderboard/klokan.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ,Elementary 2-3,Elementary 4-5,Elementary 6-7,Elementary 8-9,High School 1-2,High School 3-4
2
+ anthropic/claude-2.1,43.96551724137931,50.35971223021583,39.87730061349693,39.75155279503105,33.33333333333333,14.772727272727273
3
+ google/gemini-pro,25.0,28.05755395683453,22.699386503067483,20.496894409937887,24.691358024691358,19.318181818181817
4
+ mistralai/mixtral-8x7b-instruct,34.48275862068966,25.899280575539567,25.766871165644172,25.465838509316768,20.98765432098765,19.318181818181817
5
+ openai/gpt-3.5-turbo,37.06896551724138,41.007194244604314,33.74233128834356,29.81366459627329,26.543209876543212,17.045454545454543
6
+ openai/gpt-4-1106-preview,66.37931034482759,62.589928057553955,50.306748466257666,40.993788819875775,32.71604938271605,36.36363636363637
leaderboard/table.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ model_name,analytical,critical,culture,verbal,klokan
2
+ anthropic/claude-2.1,0.3804034582132565,0.6449912126537786,0.7981770833333334,0.6336336336336337,0.3823884197828709
3
+ google/gemini-pro,0.2680115273775216,0.5992970123022847,0.7825520833333334,0.5765765765765766,0.23522316043425814
4
+ mistralai/mixtral-8x7b-instruct,0.24495677233429394,0.4833040421792619,0.6432291666666666,0.36936936936936937,0.25331724969843183
5
+ openai/gpt-3.5-turbo,0.27761767531219983,0.46572934973637964,0.6822916666666666,0.4084084084084084,0.3148371531966224
6
+ openai/gpt-4-1106-preview,0.4793467819404419,0.7662565905096661,0.9166666666666666,0.7207207207207207,0.47889022919179736
leaderboard/tsp.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ ,Analytical,Critical,Cultural,Verbal
2
+ anthropic/claude-2.1,38.04034582132565,64.49912126537785,79.81770833333334,63.36336336336337
3
+ google/gemini-pro,26.801152737752158,59.929701230228474,78.25520833333334,57.65765765765766
4
+ mistralai/mixtral-8x7b-instruct,24.495677233429394,48.33040421792619,64.32291666666666,36.93693693693694
5
+ openai/gpt-3.5-turbo,27.761767531219984,46.57293497363796,68.22916666666666,40.84084084084084
6
+ openai/gpt-4-1106-preview,47.93467819404419,76.6256590509666,91.66666666666666,72.07207207207207
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio==4.19.1
2
+ pandas==2.2.0
3
+ plotly==5.19.0