Yotam-Perlitz committed on
Commit
a3b611d
•
1 Parent(s): 9e72aa4

improve logic


Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

Files changed (1)
  1. app.py +169 -67
app.py CHANGED
@@ -8,6 +8,26 @@ from bat import Benchmark, Config, Reporter, Tester
8
  from datetime import datetime
9
 
10
11
  holistic_scenarios = [
12
  "Helm Lite",
13
  "HF OpenLLM v2",
@@ -21,14 +41,38 @@ holistic_scenarios = [
21
 
22
 
23
  st.markdown(
24
- """<h1 style='text-align: center; color: black;'>๐Ÿ‹๏ธโ€โ™‚๏ธ BenchBench Leaderboard ๐Ÿ‹๏ธโ€โ™‚๏ธ</h1>""",
 
 
25
  unsafe_allow_html=True,
26
  )
27
 
28
  st.markdown(
29
  """
30
- This leaderboard, featured in our work -- [Benchmark Agreement Testing Done Right: A Guide for LLM Benchmark Evaluation](https://arxiv.org/abs/2407.13696),
31
- serves as a meta-benchmark. It ranks individual benchmarks based on their agreement with an aggregated reference benchmark, which harnesses insights from numerous diverse benchmarks.
32
  """
33
  )
34
 
@@ -38,26 +82,19 @@ all_scenarios_for_aggragate = (
38
  all_scenarios_for_aggragate.df["scenario"].unique().tolist()
39
  )
40
 
41
- st.subheader("The Leaderboard", divider=True)
42
- # st.subheader("๐Ÿ‹๏ธโ€โ™‚๏ธ BenchBench Leaderboard ๐Ÿ‹", divider=True)
43
 
44
-
45
- with st.form("my_form_0"):
46
- # leftcol, rightcol = st.columns([5, 1])
47
- # with leftcol:
48
- aggragate_scenarios = st.multiselect(
49
- "Scenarios in Aggregate (defualts are the 'Holistic' benchmarks)",
50
- all_scenarios_for_aggragate,
51
- holistic_scenarios,
52
- )
53
- # with rightcol:
54
- # st.markdown("###")
55
- submitted = st.form_submit_button(label="\n\nRun BAT\n\n")
56
-
57
- with st.expander("Leaderboard configurations (defaults are great BTW)", icon="โš™๏ธ"):
58
  with st.form("my_form_1"):
59
  corr_type = st.selectbox(
60
- label="Select Correlation type", options=["kendall", "pearson"], index=0
61
  )
62
 
63
  aggregate_scenario_whitelist = aggragate_scenarios
@@ -68,13 +105,13 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
68
  # ]
69
 
70
  model_select_strategy = st.selectbox(
71
- label="Select strategy",
72
  options=["random", "top_aggregate", "somewhere_aggregate"],
73
  index=0,
74
  )
75
 
76
  n_models_taken_list = st.slider(
77
- label="Select number of models to use",
78
  min_value=3,
79
  max_value=15,
80
  value=8,
@@ -82,46 +119,67 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
82
 
83
  n_models_taken_list = [n_models_taken_list]
84
 
85
- n_exps = 10
86
 
87
  submitted = st.form_submit_button(label="Run BAT")
88
 
 
89
  with st.expander("Add your benchmarks here!", icon="๐Ÿ”ฅ"):
90
- uploaded_file = st.file_uploader("Add your benchmark as a CSV")
91
- st.download_button(
92
- label="Download example CSV",
93
- data=pd.read_csv("assets/mybench_240901.csv")
94
- .to_csv(index=False)
95
- .encode("utf-8"),
96
- file_name="mybench_240901.csv",
97
- mime="text/csv",
98
  )
99
 
100
  my_benchmark = Benchmark()
101
  if uploaded_file is not None:
102
  df = pd.read_csv(uploaded_file)
103
 
104
  my_benchmark.assign_df(
105
  df,
106
  data_source=f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}.csv",
107
- )
108
-
109
- allbench = Benchmark()
110
- allbench.load_local_catalog()
111
-
112
- allbench.add_aggregate(
113
- new_col_name="aggregate",
114
- agg_source_name="aggregate",
115
- scenario_whitelist=aggregate_scenario_whitelist,
116
- min_scenario_for_models_to_appear_in_agg=1
117
- if len(aggregate_scenario_whitelist) == 1
118
- else 3,
119
  )
120
 
121
  uploaded_models = my_benchmark.df[
122
  my_benchmark.df["source"].str.contains("uploaded")
123
  ]["model"].unique()
124
- aggregate_models = allbench.df[allbench.df["source"].str.contains("aggregate")][
125
  "model"
126
  ].unique()
127
 
@@ -180,8 +238,12 @@ def run_load(
180
  aggregate_scores = pd.read_csv(
181
  cache_path.replace("agreement", "aggregate_scores")
182
  )
183
 
184
- return agreements, aggregate_scores
185
 
186
  else:
187
  print("Cached results not found, calculating")
@@ -245,11 +307,12 @@ def run_load(
245
  aggragate_scores.to_csv(
246
  cache_path.replace("agreement", "aggregate_scores"), index=False
247
  )
 
248
 
249
- return agreements, aggragate_scores
250
 
251
 
252
- agreements, aggragare_score_df = run_load(
253
  aggregate_scenario_whitelist=aggregate_scenario_whitelist,
254
  n_models_taken_list=n_models_taken_list,
255
  model_select_strategy_list=[model_select_strategy],
@@ -275,17 +338,15 @@ z_scores["date"] = z_scores["source"].apply(
275
  else x.split(".csv")[0].split("_")[-2]
276
  )
277
 
 
278
 
279
- # print(z_scores["scenario"].unique().tolist())
280
 
281
- # z_scores["scenario"] = z_scores["scenario"].apply(lambda x: get_nice_benchmark_name(x))
282
- z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
283
- # , format="%y%m%d"
284
  data = (
285
  z_scores.rename(
286
  columns={
287
  "scenario": "Benchmark",
288
- "z_score": "Z Score",
289
  "corr_with_agg": corr_name,
290
  "p_value_of_corr_with_agg": "p-value of Corr.",
291
  # "n_models_of_corr_with_agg": "# Models Used",
@@ -293,7 +354,7 @@ data = (
293
  "date": "Snapshot Date",
294
  }
295
  )
296
- .sort_values("Z Score", ascending=False)
297
  .reset_index(drop=True)
298
  )
299
 
@@ -308,10 +369,10 @@ def highlight_uploaded_benchmark(row):
308
 
309
  styled_data = (
310
  data.style.background_gradient(
311
- subset=["Z Score"],
312
  cmap="RdYlGn",
313
- vmin=-data["Z Score"].abs().max(),
314
- vmax=data["Z Score"].abs().max(),
315
  )
316
  .apply(highlight_uploaded_benchmark, axis=1)
317
  .background_gradient(
@@ -320,17 +381,19 @@ styled_data = (
320
  vmin=0.1,
321
  vmax=1,
322
  )
323
- .format(subset=["Z Score", corr_name, "p-value of Corr."], formatter="{:.2}")
324
  .set_properties(**{"text-align": "center"})
325
  )
326
 
327
  cols_used = [
328
  "Benchmark",
329
- "Z Score",
330
  corr_name,
331
  "p-value of Corr.",
332
  "Snapshot Date",
333
  ]
 
 
334
  st.dataframe(
335
  data=styled_data,
336
  column_order=cols_used,
@@ -348,7 +411,8 @@ aggragare_score_df.rename(
348
  },
349
  inplace=True,
350
  )
351
- with st.expander(label="Model scored by the aggragate"):
 
352
  st.dataframe(
353
  data=aggragare_score_df,
354
  hide_index=True,
@@ -632,6 +696,52 @@ with st.expander(label="Citations"):
632
  """
633
  )
634
 
635
  st.markdown(
636
  "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
637
  "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
@@ -648,14 +758,6 @@ st.write(r"""
648
  """)
649
 
650
 
651
- benchmarks = data["Benchmark"].unique().tolist()
652
- plotted_scenario = st.selectbox(
653
- "Choose Benchmark to plot",
654
- benchmarks,
655
- index=benchmarks.index("LMSys Arena"),
656
- )
657
-
658
-
659
  fig = px.histogram(
660
  data.query("Benchmark!=@plotted_scenario"), x=corr_name, nbins=len(data) - 1
661
  )
 
8
  from datetime import datetime
9
 
10
 
11
+ st.set_page_config(
12
+ page_title="BenchBench",
13
+ page_icon="🏋️‍♂️",
14
+ layout="wide",
15
+ initial_sidebar_state="auto",
16
+ menu_items=None,
17
+ )
18
+
19
+ # # Inject custom CSS to set the width of the sidebar
20
+ # st.markdown(
21
+ # """
22
+ # <style>
23
+ # section[data-testid="stSidebar"] {
24
+ # width: 200px !important; # Set the width to your desired value
25
+ # }
26
+ # </style>
27
+ # """,
28
+ # unsafe_allow_html=True,
29
+ # )
30
+
31
  holistic_scenarios = [
32
  "Helm Lite",
33
  "HF OpenLLM v2",
 
41
 
42
 
43
  st.markdown(
44
+ """
45
+ <h1 style='text-align: center; color: black;'>🏋️‍♂️ BenchBench Leaderboard 🏋️‍♂️</h1>
46
+ """,
47
  unsafe_allow_html=True,
48
  )
49
 
50
+ st.divider()
51
+
52
+ st.markdown(
53
+ """
54
+ The BenchBench leaderboard ranks benchmarks based on their agreement with the *Aggregate Benchmark* – a comprehensive, combined measure of existing benchmark results.
55
+ \n
56
+ To achieve this, we scraped results from multiple benchmarks (citations below), allowing benchmark agreement to be measured against a wide range of benchmarks using a large set of models.
57
+ \n
58
+ BenchBench is for you if:
59
+ """
60
+ )
61
+
62
  st.markdown(
63
  """
64
+ - **You have a new benchmark**: Show that it agrees/disagrees with known benchmarks.
65
+ - **You are looking for a benchmark to run/trust**: Find an efficient/private/preferable alternative.
66
+ """
67
+ )
68
+
69
+ st.markdown(
70
+ """
71
+ In our work -- [Benchmark Agreement Testing Done Right](https://arxiv.org/abs/2407.13696),
72
+ we standardize BAT and show the importance of its configurations, notably,
73
+ the benchmarks we compare to and the models we use to compare with; check them out in the sidebar.
74
+ \n
75
+ We show that agreement is best represented by the Z Score: the relative agreement of each benchmark with the Aggregate benchmark, as presented below.
76
  """
77
  )
78
 
 
82
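The Z Score mentioned above is each benchmark's correlation with the aggregate, standardized against the correlations of all the other benchmarks. A minimal sketch of that idea, assuming a wide models-by-benchmarks score table; this is illustrative and not necessarily how the bat Tester computes it:

    import pandas as pd
    from scipy.stats import kendalltau

    def agreement_z_scores(scores: pd.DataFrame) -> pd.Series:
        """`scores`: one row per model, one column per benchmark, plus an 'aggregate' column."""
        # Kendall-tau agreement of each benchmark's model ranking with the aggregate ranking.
        corrs = pd.Series(
            {
                bench: kendalltau(scores[bench], scores["aggregate"])[0]
                for bench in scores.columns
                if bench != "aggregate"
            },
            name="corr_with_agg",
        )
        # Standardize: how far each benchmark's agreement sits from the mean agreement.
        return (corrs - corrs.mean()) / corrs.std()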
  all_scenarios_for_aggragate.df["scenario"].unique().tolist()
83
  )
84
 
85
+ with st.sidebar:
86
+ st.markdown("""# Configurations""")
87
 
88
+ # with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
89
  with st.form("my_form_1"):
90
+ aggragate_scenarios = st.multiselect(
91
+ "Aggregate Benchmark",
92
+ all_scenarios_for_aggragate,
93
+ holistic_scenarios,
94
+ )
95
+
96
  corr_type = st.selectbox(
97
+ label="Correlation type", options=["kendall", "pearson"], index=0
98
  )
99
 
100
  aggregate_scenario_whitelist = aggragate_scenarios
 
105
  # ]
106
 
107
  model_select_strategy = st.selectbox(
108
+ label="Model Select strategy",
109
  options=["random", "top_aggregate", "somewhere_aggregate"],
110
  index=0,
111
  )
112
 
113
  n_models_taken_list = st.slider(
114
+ label="Minimal number of models to use",
115
  min_value=3,
116
  max_value=15,
117
  value=8,
 
119
 
120
  n_models_taken_list = [n_models_taken_list]
121
 
122
+ n_exps = 5
123
 
124
  submitted = st.form_submit_button(label="Run BAT")
125
 
126
+
127
  with st.expander("Add your benchmarks here!", icon="๐Ÿ”ฅ"):
128
+ aggbench = Benchmark()
129
+ aggbench.load_local_catalog()
130
+
131
+ aggbench.add_aggregate(
132
+ new_col_name="aggregate",
133
+ agg_source_name="aggregate",
134
+ scenario_whitelist=aggregate_scenario_whitelist,
135
+ min_scenario_for_models_to_appear_in_agg=1
136
+ if len(aggregate_scenario_whitelist) == 1
137
+ else 3,
138
+ )
139
+
140
+ agg_models = (
141
+ aggbench.df.query('scenario=="aggregate"').sample(n=10)["model"].tolist()
142
+ )
143
+
144
+ st.markdown(
145
+ "Adding your benchmark is as simple as uploading a csv with the following format, one column indicates the model and the other the benchmark scores."
146
+ )
147
+
148
+ st.dataframe(
149
+ pd.read_csv("assets/mybench_240901.csv"),
150
+ use_container_width=True,
151
+ hide_index=True,
152
+ height=200,
153
+ )
154
+
155
+ st.markdown(
156
+ "Not sure, what models you should run your benchmark on?" "\ntry these:"
157
  )
158
 
159
+ st.code(agg_models)
160
+
161
+ st.markdown("Got the data? Upload it here ๐Ÿ‘‡:")
162
+
163
+ uploaded_file = st.file_uploader("Add your benchmark as a CSV")
164
+
165
  my_benchmark = Benchmark()
166
  if uploaded_file is not None:
167
+ st.markdown(
168
+ "Your benchmark has been uploaded, BAT results will soon be caluclated... check out its results here: [Benchmark BAT Report Card](#benchmark-report-card)"
169
+ )
170
+
171
  df = pd.read_csv(uploaded_file)
172
 
173
  my_benchmark.assign_df(
174
  df,
175
  data_source=f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}.csv",
176
+ normalized_names=False,
177
  )
178
 
179
  uploaded_models = my_benchmark.df[
180
  my_benchmark.df["source"].str.contains("uploaded")
181
  ]["model"].unique()
182
+ aggregate_models = aggbench.df[aggbench.df["source"].str.contains("aggregate")][
183
  "model"
184
  ].unique()
185
 
 
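For reference, I have not inspected assets/mybench_240901.csv, but given the description above (one column for the model, one for the benchmark score), an upload in roughly this shape should match; the column names here are an assumption:

    import pandas as pd

    # Hypothetical benchmark results in the two-column layout described above.
    my_results = pd.DataFrame(
        {
            "model": ["model-a", "model-b", "model-c"],
            "score": [71.3, 64.8, 58.2],
        }
    )
    my_results.to_csv("mybench.csv", index=False)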
238
  aggregate_scores = pd.read_csv(
239
  cache_path.replace("agreement", "aggregate_scores")
240
  )
241
+ allbench = Benchmark(
242
+ pd.read_csv(cache_path.replace("agreement", "allbench")),
243
+ normalized_names=True,
244
+ )
245
 
246
+ return agreements, aggregate_scores, allbench
247
 
248
  else:
249
  print("Cached results not found, calculating")
 
307
  aggragate_scores.to_csv(
308
  cache_path.replace("agreement", "aggregate_scores"), index=False
309
  )
310
+ allbench.df.to_csv(cache_path.replace("agreement", "allbench"), index=False)
311
 
312
+ return agreements, aggragate_scores, allbench
313
 
314
 
315
+ agreements, aggragare_score_df, allbench = run_load(
316
  aggregate_scenario_whitelist=aggregate_scenario_whitelist,
317
  n_models_taken_list=n_models_taken_list,
318
  model_select_strategy_list=[model_select_strategy],
 
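run_load follows a plain file-cache pattern: reuse the CSVs next to cache_path when they exist, otherwise compute and persist them (the aggregate_scores and allbench files are derived from the agreement path, as in the diff). A stripped-down sketch; the function signature is illustrative, not the app's exact one:

    import os
    import pandas as pd

    def run_load_sketch(cache_path: str, compute):
        scores_path = cache_path.replace("agreement", "aggregate_scores")
        allbench_path = cache_path.replace("agreement", "allbench")
        if os.path.exists(cache_path):
            # Cache hit: read the three frames that were persisted side by side.
            return (
                pd.read_csv(cache_path),
                pd.read_csv(scores_path),
                pd.read_csv(allbench_path),
            )
        # Cache miss: compute once, then persist for the next run with the same configuration.
        agreements, aggregate_scores, allbench_df = compute()
        agreements.to_csv(cache_path, index=False)
        aggregate_scores.to_csv(scores_path, index=False)
        allbench_df.to_csv(allbench_path, index=False)
        return agreements, aggregate_scores, allbench_df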
338
  else x.split(".csv")[0].split("_")[-2]
339
  )
340
 
341
+ z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
342
 
343
+ z_score_name = "Relative agreement (Z Score)"
344
 
 
 
 
345
  data = (
346
  z_scores.rename(
347
  columns={
348
  "scenario": "Benchmark",
349
+ "z_score": z_score_name,
350
  "corr_with_agg": corr_name,
351
  "p_value_of_corr_with_agg": "p-value of Corr.",
352
  # "n_models_of_corr_with_agg": "# Models Used",
 
354
  "date": "Snapshot Date",
355
  }
356
  )
357
+ .sort_values(z_score_name, ascending=False)
358
  .reset_index(drop=True)
359
  )
360
 
 
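The Snapshot Date is recovered from each source file name: an underscore-separated %y%m%d token (e.g. 240901) is given a "20" century prefix and parsed with pandas (the exact token index differs between uploaded and catalog sources in the code above). A small worked example with a hypothetical file name:

    import pandas as pd

    source = "mybench_240901.csv"                   # hypothetical file name
    stamp = source.split(".csv")[0].split("_")[-1]  # -> "240901" (yymmdd)
    snapshot = pd.to_datetime("20" + stamp).date()  # -> datetime.date(2024, 9, 1)
    print(snapshot)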
369
 
370
  styled_data = (
371
  data.style.background_gradient(
372
+ subset=[z_score_name],
373
  cmap="RdYlGn",
374
+ vmin=-data[z_score_name].abs().max(),
375
+ vmax=data[z_score_name].abs().max(),
376
  )
377
  .apply(highlight_uploaded_benchmark, axis=1)
378
  .background_gradient(
 
381
  vmin=0.1,
382
  vmax=1,
383
  )
384
+ .format(subset=[z_score_name, corr_name, "p-value of Corr."], formatter="{:.2}")
385
  .set_properties(**{"text-align": "center"})
386
  )
387
 
388
  cols_used = [
389
  "Benchmark",
390
+ z_score_name,
391
  corr_name,
392
  "p-value of Corr.",
393
  "Snapshot Date",
394
  ]
395
+
396
+
397
  st.dataframe(
398
  data=styled_data,
399
  column_order=cols_used,
 
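The styling above centers the Z Score colormap at zero by passing a symmetric vmin/vmax. The same pattern on a toy frame (pandas Styler, which needs matplotlib for background_gradient; the app's highlight_uploaded_benchmark helper is omitted here):

    import pandas as pd

    toy = pd.DataFrame({"Benchmark": ["A", "B", "C"], "Z Score": [1.2, -0.4, -0.8]})
    bound = toy["Z Score"].abs().max()
    styled_toy = (
        toy.style.background_gradient(
            subset=["Z Score"], cmap="RdYlGn", vmin=-bound, vmax=bound  # symmetric around 0
        )
        .format(subset=["Z Score"], formatter="{:.2}")
        .set_properties(**{"text-align": "center"})
    )
    # `styled_toy` can be passed straight to st.dataframe, as done above.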
411
  },
412
  inplace=True,
413
  )
414
+
415
+ with st.expander(label="Aggragate Benchmark scores"):
416
  st.dataframe(
417
  data=aggragare_score_df,
418
  hide_index=True,
 
696
  """
697
  )
698
 
699
+
700
+ st.subheader("Benchmark Report Card")
701
+
702
+
703
+ benchmarks = allbench.df["scenario"].unique().tolist()
704
+ index_to_use = 0
705
+ if not my_benchmark.is_empty:
706
+ index_to_use = benchmarks.index(my_benchmark.df["scenario"].unique()[0])
707
+
708
+ plotted_scenario = st.selectbox(
709
+ "Choose Benchmark to plot",
710
+ benchmarks,
711
+ index=index_to_use,
712
+ )
713
+
714
+ col1, col2, col3 = st.columns(3)
715
+ cur_data = data.query(f"Benchmark=='{plotted_scenario}'")
716
+ col1.metric("Relative agreement", cur_data["Relative agreement (Z Score)"])
717
+ col2.metric("Kendall Tau Corr.", cur_data["Kendall Tau Corr."])
718
+ col3.metric("p-value of Corr.", cur_data["p-value of Corr."])
719
+
720
+ cur_df = allbench.df.query(f'scenario=="aggregate" or scenario=="{plotted_scenario}"')
721
+
722
+ # Filter models that are present in both scenarios
723
+ models_in_both = cur_df.groupby("model")["scenario"].nunique().loc[lambda n: n.eq(2)].index
724
+
725
+ # Pivot the DataFrame to have scenarios as columns
726
+ df_pivot = cur_df[cur_df["model"].isin(models_in_both)].pivot(
727
+ index="model", columns="scenario", values="score"
728
+ )
729
+
730
+ # Create the scatter plot using Plotly Express
731
+ fig = px.scatter(
732
+ df_pivot,
733
+ x=df_pivot.columns[0],
734
+ y=df_pivot.columns[1],
735
+ trendline="ols",
736
+ labels={
737
+ df_pivot.columns[0]: df_pivot.columns[0],
738
+ df_pivot.columns[1]: df_pivot.columns[1],
739
+ },
740
+ hover_name=df_pivot.index,
741
+ title="Model Scores Comparison between Scenarios",
742
+ )
743
+ st.plotly_chart(fig, use_container_width=True)
744
+
745
  st.markdown(
746
  "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
747
  "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
 
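The report-card plot pivots the long-format scores so the aggregate and the chosen benchmark become two columns, then draws one point per model with an OLS trendline (which requires the statsmodels package). A self-contained toy version of the same pattern:

    import pandas as pd
    import plotly.express as px

    long_df = pd.DataFrame(
        {
            "model": ["m1", "m1", "m2", "m2", "m3", "m3"],
            "scenario": ["aggregate", "mybench"] * 3,
            "score": [0.71, 0.68, 0.55, 0.59, 0.40, 0.37],
        }
    )
    pivot = long_df.pivot(index="model", columns="scenario", values="score")
    fig = px.scatter(
        pivot,
        x="aggregate",
        y="mybench",
        trendline="ols",  # needs statsmodels installed
        hover_name=pivot.index,
    )
    fig.show()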
758
  """)
759
 
760
 
761
  fig = px.histogram(
762
  data.query("Benchmark!=@plotted_scenario"), x=corr_name, nbins=len(data) - 1
763
  )