Yotam-Perlitz committed on
Commit
e2be414
·
1 Parent(s): 1035432

add upload benchmark option

Browse files

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

Files changed (1) hide show
  1. app.py +153 -80
app.py CHANGED
@@ -7,28 +7,36 @@ import streamlit as st
7
  from bat import Benchmark, Config, Reporter, Tester
8
  from bat.utils import get_holistic_benchmark
9
 
10
- benchmarks_dict = {
11
- "arena_elo": "LMSys Arena",
12
- "mt_bench": "MT Bench",
13
- "mixeval": "Mix Eval",
14
- "alpacav2": "AlpacaEval V2",
15
- "arena_hard": "Arena Hard",
16
- "arc_c": "ARC-C",
17
- "eq_benchv2": "EQ Bench V2",
18
- "agieval": "AGIEval",
19
- "llmonitor": "LLMonitor",
20
- "bbh": "BBH",
21
- "mmlu": "MMLU",
22
- "alpacav1": "AlpacaEval V1",
23
- "magi": "MAGI",
24
- "alpacaeval2_lc": "AlpacaEval V2 Length Adjusted",
25
- "gpt4all": "GPT-4-All",
26
- "humaneval": "HumanEval",
27
- "mbpp": "MBPP",
28
- "hellaswag": "HellaSwag",
29
- "hugging_6": "HF OpenLLM V1",
30
- "winogrande": "Winogrande",
31
- }
 
 
 
 
 
 
 
 
32
 
33
  st.markdown(
34
  """<h1 style='text-align: center; color: black;'>πŸ‹οΈβ€β™‚οΈ BenchBench Leaderboard πŸ‹οΈβ€β™‚οΈ</h1>""",
@@ -47,46 +55,57 @@ st.subheader("The Leaderboard", divider=True)
47
  # st.subheader("πŸ‹οΈβ€β™‚οΈ BenchBench Leaderboard πŸ‹", divider=True)
48
 
49
  leftcol, rightcol = st.columns([2, 1])
50
- with leftcol:
51
- with st.expander("Leaderboard configurations (defaults are great BTW)", icon="βš™οΈ"):
52
- with st.form("my_form"):
53
- all_scenarios_for_aggragate_with_all = all_scenarios_for_aggragate.tolist()
54
- all_scenarios_for_aggragate_with_all.append("All Holistic")
55
-
56
- aggragate_scenarios = st.multiselect(
57
- "Scenarios in Aggregate",
58
- all_scenarios_for_aggragate_with_all,
59
- ["All Holistic"],
60
- # all_scenarios_for_aggragate,
61
- )
62
-
63
- corr_type = st.selectbox(
64
- label="Select Correlation type", options=["kendall", "pearson"], index=0
65
- )
66
-
67
- aggragate_scenario_blacklist = (
68
- [
69
- scen
70
- for scen in all_scenarios_for_aggragate
71
- if scen not in aggragate_scenarios
72
- ]
73
- if "All Holistic" not in aggragate_scenarios
74
- else []
75
- )
76
-
77
- model_select_strategy = st.selectbox(
78
- label="Select strategy",
79
- options=["random", "top_aggregate", "somewhere_aggregate"],
80
- index=0,
81
- )
82
-
83
- n_models_taken_list = [5]
84
- n_exps = 10
85
-
86
- submitted = st.form_submit_button(label="Run BAT")
87
-
88
- with rightcol:
89
- st.button("βž• Add your benchmark here!")
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
  def run_load(
@@ -95,6 +114,8 @@ def run_load(
95
  model_select_strategy_list=["random"],
96
  corr_types=["kendall"],
97
  n_exps=10,
 
 
98
  ):
99
  # Create a hash of the inputs to generate a unique cache file for each set of inputs
100
  input_str = (
@@ -104,6 +125,14 @@ def run_load(
104
  + str(corr_types)
105
  + str(n_exps)
106
  )
 
 
 
 
 
 
 
 
107
  input_hash = hashlib.md5(input_str.encode()).hexdigest()
108
  cache_file = f"agreements_cache_{input_hash}.csv"
109
 
@@ -112,7 +141,7 @@ def run_load(
112
  cache_path = os.path.join(cache_dir, cache_file)
113
 
114
  # Check if the cache file exists
115
- if os.path.exists(cache_path):
116
  print("Loading cached results...")
117
  agreements = pd.read_csv(cache_path)
118
  return agreements
@@ -126,11 +155,33 @@ def run_load(
126
  model_select_strategy_list=model_select_strategy_list,
127
  corr_types=corr_types,
128
  n_exps=n_exps if n_models_taken_list != [0] else 1,
129
- # reference_data_path="data/combined_holistic.csv",
130
  )
131
 
132
- holistic = get_holistic_benchmark()
133
- holistic_scenarios = holistic.get_scenarios()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  holistic.clear_repeated_scenarios()
135
  holistic.add_aggragete(
136
  new_col_name="aggregate",
@@ -139,16 +190,18 @@ def run_load(
139
  min_scenario_for_models_to_appear_in_agg=5,
140
  )
141
 
142
- allbench = Benchmark(
143
- pd.read_csv("assets/combined_20240704.csv"),
144
- # data_source=newbench_name,
145
- )
 
 
146
  allbench.df = allbench.df.drop(columns=["tag"])
147
  allbench.clear_repeated_scenarios()
148
  allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
149
 
150
- allbench.df = allbench.df[~allbench.df["scenario"].str.contains("_mixed")]
151
- allbench.df = allbench.df[~allbench.df["scenario"].str.contains("agentbench")]
152
 
153
  # st.dataframe(holistic.df.query('scenario=="aggregate"'))
154
 
@@ -158,6 +211,10 @@ def run_load(
158
 
159
  # len(allbench.get_scenario_appearences_count().keys())
160
 
 
 
 
 
161
  agreements = tester.all_vs_all_agreement_testing(
162
  allbench, single_source_scenario="aggregate"
163
  )
@@ -173,8 +230,12 @@ agreements = run_load(
173
  model_select_strategy_list=[model_select_strategy],
174
  corr_types=[corr_type],
175
  n_exps=n_exps,
 
176
  )
177
 
 
 
 
178
  reporter = Reporter()
179
  z_scores = reporter.get_all_z_scores(agreements=agreements, aggragate_name="aggregate")
180
 
@@ -201,17 +262,29 @@ data = (
201
 
202
  data = data[~data["Source"].str.contains("livebench")]
203
  data = data[~data["Source"].str.contains("biggen")]
204
- data.drop(columns=["Source"], inplace=True)
205
- data["Benchmark"] = data["Benchmark"].apply(lambda x: benchmarks_dict[x])
 
206
 
207
  # Apply coloring based on 'Z Score' values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
- styled_data = data.style.background_gradient(
210
- subset=["Z Score"],
211
- cmap="RdYlGn",
212
- vmin=-data["Z Score"].abs().max(),
213
- vmax=data["Z Score"].abs().max(),
214
- ).format(subset=["Z Score", corr_name, "p value of Corr."], formatter="{:.2}")
215
 
216
  st.dataframe(
217
  data=styled_data,
 
7
  from bat import Benchmark, Config, Reporter, Tester
8
  from bat.utils import get_holistic_benchmark
9
 
10
+
11
def get_nice_benchmark_name(bench_name):
    """Return the human-readable display name for a benchmark identifier.

    Args:
        bench_name: Internal benchmark/scenario identifier (e.g. ``"mmlu"``).

    Returns:
        The display name when ``bench_name`` is one of the known
        identifiers below; otherwise ``bench_name`` itself, unchanged, so
        benchmarks without a registered display name (such as an uploaded
        one) are still shown under their own name.
    """
    benchmarks_dict = {
        "arena_elo": "LMSys Arena",
        "mt_bench": "MT Bench",
        "mixeval": "Mix Eval",
        "alpacav2": "AlpacaEval V2",
        "arena_hard": "Arena Hard",
        "arc_c": "ARC-C",
        "eq_benchv2": "EQ Bench V2",
        "agieval": "AGIEval",
        "llmonitor": "LLMonitor",
        "bbh": "BBH",
        "mmlu": "MMLU",
        "alpacav1": "AlpacaEval V1",
        "magi": "MAGI",
        "alpacaeval2_lc": "AlpacaEval V2 Length Adjusted",
        "gpt4all": "GPT-4-All",
        "humaneval": "HumanEval",
        "mbpp": "MBPP",
        "hellaswag": "HellaSwag",
        "hugging_6": "HF OpenLLM V1",
        "winogrande": "Winogrande",
    }

    # Single lookup with the identifier itself as the fallback, instead of
    # a membership test followed by a second indexing operation.
    return benchmarks_dict.get(bench_name, bench_name)
39
+
40
 
41
  st.markdown(
42
  """<h1 style='text-align: center; color: black;'>πŸ‹οΈβ€β™‚οΈ BenchBench Leaderboard πŸ‹οΈβ€β™‚οΈ</h1>""",
 
55
  # st.subheader("πŸ‹οΈβ€β™‚οΈ BenchBench Leaderboard πŸ‹", divider=True)
56
 
57
  leftcol, rightcol = st.columns([2, 1])
58
+
59
+ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="βš™οΈ"):
60
+ with st.form("my_form"):
61
+ all_scenarios_for_aggragate_with_all = all_scenarios_for_aggragate.tolist()
62
+ all_scenarios_for_aggragate_with_all.append("All Holistic")
63
+
64
+ aggragate_scenarios = st.multiselect(
65
+ "Scenarios in Aggregate",
66
+ all_scenarios_for_aggragate_with_all,
67
+ ["All Holistic"],
68
+ # all_scenarios_for_aggragate,
69
+ )
70
+
71
+ corr_type = st.selectbox(
72
+ label="Select Correlation type", options=["kendall", "pearson"], index=0
73
+ )
74
+
75
+ aggragate_scenario_blacklist = (
76
+ [
77
+ scen
78
+ for scen in all_scenarios_for_aggragate
79
+ if scen not in aggragate_scenarios
80
+ ]
81
+ if "All Holistic" not in aggragate_scenarios
82
+ else []
83
+ )
84
+
85
+ model_select_strategy = st.selectbox(
86
+ label="Select strategy",
87
+ options=["random", "top_aggregate", "somewhere_aggregate"],
88
+ index=0,
89
+ )
90
+
91
+ n_models_taken_list = [5]
92
+ n_exps = 10
93
+
94
+ submitted = st.form_submit_button(label="Run BAT")
95
+
96
+
97
+ uploaded_file = st.file_uploader("add your benchmark as a CSV")
98
+ st.download_button(
99
+ label="Download example CSV",
100
+ data=pd.read_csv("assets/mybench.csv").to_csv().encode("utf-8"),
101
+ file_name="mybench.csv",
102
+ mime="text/csv",
103
+ )
104
+
105
+ my_benchmark = Benchmark()
106
+ if uploaded_file is not None:
107
+ df = pd.read_csv(uploaded_file)
108
+ my_benchmark.assign_df(df, data_source="Uploaded Benchmark")
109
 
110
 
111
  def run_load(
 
114
  model_select_strategy_list=["random"],
115
  corr_types=["kendall"],
116
  n_exps=10,
117
+ my_benchmark=Benchmark(),
118
+ use_caching=False,
119
  ):
120
  # Create a hash of the inputs to generate a unique cache file for each set of inputs
121
  input_str = (
 
125
  + str(corr_types)
126
  + str(n_exps)
127
  )
128
+
129
+ if not my_benchmark.is_empty:
130
+ input_str += str(
131
+ hashlib.sha256(
132
+ my_benchmark.df.to_csv(index=False).encode("utf-8")
133
+ ).hexdigest()
134
+ )
135
+
136
  input_hash = hashlib.md5(input_str.encode()).hexdigest()
137
  cache_file = f"agreements_cache_{input_hash}.csv"
138
 
 
141
  cache_path = os.path.join(cache_dir, cache_file)
142
 
143
  # Check if the cache file exists
144
+ if os.path.exists(cache_path) and use_caching:
145
  print("Loading cached results...")
146
  agreements = pd.read_csv(cache_path)
147
  return agreements
 
155
  model_select_strategy_list=model_select_strategy_list,
156
  corr_types=corr_types,
157
  n_exps=n_exps if n_models_taken_list != [0] else 1,
 
158
  )
159
 
160
+ holistic_scenarios = [
161
+ "arena_hard",
162
+ "mixeval",
163
+ "agieval",
164
+ "arc_c",
165
+ "alpacav1",
166
+ "alpacav2",
167
+ "alpacaeval2_lc",
168
+ "arena_elo",
169
+ "bbh",
170
+ "eq_benchv2",
171
+ "gpt4all",
172
+ "hugging_6",
173
+ "llmonitor",
174
+ "magi",
175
+ "mmlu",
176
+ "mt_bench",
177
+ "biggen_mwr",
178
+ "olmes_average",
179
+ "mmlu_pro",
180
+ ]
181
+ holistic = Benchmark()
182
+ holistic.load_local_catalog()
183
+ holistic.df = holistic.df.query("scenario in @holistic_scenarios")
184
+
185
  holistic.clear_repeated_scenarios()
186
  holistic.add_aggragete(
187
  new_col_name="aggregate",
 
190
  min_scenario_for_models_to_appear_in_agg=5,
191
  )
192
 
193
+ allbench = Benchmark()
194
+ allbench.load_local_catalog()
195
+
196
+ # allbench.df = allbench.df[~allbench.df["source"].str.contains("livebench")]
197
+
198
+ allbench.extend(my_benchmark)
199
  allbench.df = allbench.df.drop(columns=["tag"])
200
  allbench.clear_repeated_scenarios()
201
  allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
202
 
203
+ # allbench.df = allbench.df[~allbench.df["scenario"].str.contains("_mixed")]
204
+ # allbench.df = allbench.df[~allbench.df["scenario"].str.contains("agentbench")]
205
 
206
  # st.dataframe(holistic.df.query('scenario=="aggregate"'))
207
 
 
211
 
212
  # len(allbench.get_scenario_appearences_count().keys())
213
 
214
+ allbench.df.query('source=="BlueBench"').model.unique()
215
+
216
+ allbench.df.query('scenario=="aggregate"').model.unique()
217
+
218
  agreements = tester.all_vs_all_agreement_testing(
219
  allbench, single_source_scenario="aggregate"
220
  )
 
230
  model_select_strategy_list=[model_select_strategy],
231
  corr_types=[corr_type],
232
  n_exps=n_exps,
233
+ my_benchmark=my_benchmark,
234
  )
235
 
236
+ if not my_benchmark.is_empty:
237
+ print()
238
+
239
  reporter = Reporter()
240
  z_scores = reporter.get_all_z_scores(agreements=agreements, aggragate_name="aggregate")
241
 
 
262
 
263
  data = data[~data["Source"].str.contains("livebench")]
264
  data = data[~data["Source"].str.contains("biggen")]
265
+ # data.drop(columns=["Source"], inplace=True)
266
+ data["Benchmark"] = data["Benchmark"].apply(lambda x: get_nice_benchmark_name(x))
267
+
268
 
269
  # Apply coloring based on 'Z Score' values
270
def highlight_uploaded_benchmark(row):
    """Return per-cell CSS that shades the row of a user-uploaded benchmark.

    Intended as the callable for a row-wise ``Styler.apply(..., axis=1)``.

    Args:
        row: One table row, addressable by column label and supporting
            ``.get()`` and ``len()`` (e.g. a pandas ``Series`` or a dict).

    Returns:
        A list with one CSS string per cell in ``row``: a light grey
        background for every cell when the row's ``"Source"`` value is
        ``"Uploaded Benchmark"``, and empty strings (no styling) otherwise.
    """
    # .get() rather than row["Source"]: if the "Source" column is absent the
    # row is simply left unstyled instead of raising KeyError mid-render.
    is_uploaded = row.get("Source") == "Uploaded Benchmark"
    cell_style = "background-color: rgba(100,100,100,0.1)" if is_uploaded else ""
    return [cell_style] * len(row)
275
+
276
+
277
+ styled_data = (
278
+ data.style.background_gradient(
279
+ subset=["Z Score"],
280
+ cmap="RdYlGn",
281
+ vmin=-data["Z Score"].abs().max(),
282
+ vmax=data["Z Score"].abs().max(),
283
+ )
284
+ .format(subset=["Z Score", corr_name, "p value of Corr."], formatter="{:.2}")
285
+ .apply(highlight_uploaded_benchmark, axis=1)
286
+ )
287
 
 
 
 
 
 
 
288
 
289
  st.dataframe(
290
  data=styled_data,