Yotam-Perlitz committed
Commit baec6d9 • 1 Parent(s): dcfe1ca

fix csv saving

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>
app.py CHANGED
@@ -293,7 +293,7 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
     uploaded_file = st.file_uploader("add your benchmark as a CSV")
     st.download_button(
         label="Download example CSV",
-        data=pd.read_csv("assets/mybench.csv").to_csv().encode("utf-8"),
+        data=pd.read_csv("assets/mybench.csv").to_csv(index=False).encode("utf-8"),
         file_name="mybench.csv",
         mime="text/csv",
     )
@@ -341,7 +341,11 @@ def run_load(
     if os.path.exists(cache_path) and use_caching:
         print("Loading cached results...")
         agreements = pd.read_csv(cache_path)
-
+        aggregate_scores = pd.read_csv(
+            cache_path.replace("agreement", "aggregate_scores")
+        )
+
+        return agreements, aggregate_scores

     else:
         print("Cached results not found, calculating")
@@ -366,6 +370,10 @@ def run_load(
             min_scenario_for_models_to_appear_in_agg=5,
         )

+        aggragate_scores = holistic.df.query('scenario=="aggregate"')[
+            ["model", "score"]
+        ].sort_values(by="score", ascending=False)
+
         allbench = Benchmark()
         allbench.load_local_catalog()

@@ -387,11 +395,14 @@ def run_load(
         )

         agreements.to_csv(cache_path, index=False)
+        aggragate_scores.to_csv(
+            cache_path.replace("agreement", "aggregate_scores"), index=False
+        )

-    return agreements
+    return agreements, aggragate_scores


-agreements = run_load(
+agreements, aggragare_score_df = run_load(
     aggragate_scenario_blacklist=aggragate_scenario_blacklist,
     n_models_taken_list=n_models_taken_list,
     model_select_strategy_list=[model_select_strategy],
@@ -467,6 +478,22 @@ st.dataframe(
     height=500,
 )

+aggragare_score_df.rename(
+    columns={
+        "model": "Model",
+        "score": "Mean Win Rate over Selected Scenarios for Aggragate",
+    },
+    inplace=True,
+)
+with st.expander(label="Model scored by the aggragate"):
+    st.dataframe(
+        data=aggragare_score_df,
+        hide_index=True,
+        height=500,
+        use_container_width=True,
+    )
+
+
 st.markdown(
     "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
     "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
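
Context on the fix: pandas' to_csv writes the DataFrame index by default, so a CSV written without index=False and later read back picks up a spurious "Unnamed: 0" column; the commit also caches the aggregate scores in a second CSV next to the agreements file and returns both frames. Below is a minimal sketch of that save/load pattern, assuming hypothetical paths and helper names that are not part of app.py:

import os
import pandas as pd

# Hypothetical cache locations, mirroring the naming scheme in the diff
# (the aggregate-scores file sits alongside the agreements file).
cache_path = "cache/agreement_results.csv"
scores_path = cache_path.replace("agreement", "aggregate_scores")


def save_cache(agreements: pd.DataFrame, aggregate_scores: pd.DataFrame) -> None:
    # index=False keeps the round trip clean: otherwise pandas writes the
    # RangeIndex as a leading column that read_csv returns as "Unnamed: 0".
    agreements.to_csv(cache_path, index=False)
    aggregate_scores.to_csv(scores_path, index=False)


def load_cache():
    # Both files are written together, so require both before trusting the cache.
    if os.path.exists(cache_path) and os.path.exists(scores_path):
        return pd.read_csv(cache_path), pd.read_csv(scores_path)
    return None

On the cache-hit path run_load now returns both DataFrames, which is why its call site unpacks two values.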