Spaces:
Paused
Paused
Nathan Habib
committed on
Commit
•
e3aaf53
1
Parent(s):
26286b2
add new evals to the leaderboard
Browse files- app.py +22 -19
- src/assets/hardcoded_evals.py +3 -0
- src/assets/text_content.py +53 -1
- src/get_model_info/utils.py +3 -0
- src/plots/read_results.py +6 -3
app.py
CHANGED
@@ -88,6 +88,9 @@ BENCHMARK_COLS = [
|
|
88 |
AutoEvalColumn.hellaswag,
|
89 |
AutoEvalColumn.mmlu,
|
90 |
AutoEvalColumn.truthfulqa,
|
|
|
|
|
|
|
91 |
]
|
92 |
]
|
93 |
|
@@ -107,7 +110,7 @@ update_collections(original_df.copy())
|
|
107 |
leaderboard_df = original_df.copy()
|
108 |
|
109 |
models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
|
110 |
-
plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
|
111 |
to_be_dumped = f"models = {repr(models)}\n"
|
112 |
|
113 |
(
|
@@ -516,24 +519,24 @@ with demo:
|
|
516 |
queue=True,
|
517 |
)
|
518 |
|
519 |
-
with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
538 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
539 |
|
|
|
88 |
AutoEvalColumn.hellaswag,
|
89 |
AutoEvalColumn.mmlu,
|
90 |
AutoEvalColumn.truthfulqa,
|
91 |
+
AutoEvalColumn.winogrande,
|
92 |
+
AutoEvalColumn.gsm8k,
|
93 |
+
AutoEvalColumn.drop
|
94 |
]
|
95 |
]
|
96 |
|
|
|
110 |
leaderboard_df = original_df.copy()
|
111 |
|
112 |
models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
|
113 |
+
#plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
|
114 |
to_be_dumped = f"models = {repr(models)}\n"
|
115 |
|
116 |
(
|
|
|
519 |
queue=True,
|
520 |
)
|
521 |
|
522 |
+
# with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
|
523 |
+
# with gr.Row():
|
524 |
+
# with gr.Column():
|
525 |
+
# chart = create_metric_plot_obj(
|
526 |
+
# plot_df,
|
527 |
+
# ["Average ⬆️"],
|
528 |
+
# HUMAN_BASELINES,
|
529 |
+
# title="Average of Top Scores and Human Baseline Over Time",
|
530 |
+
# )
|
531 |
+
# gr.Plot(value=chart, interactive=False, width=500, height=500)
|
532 |
+
# with gr.Column():
|
533 |
+
# chart = create_metric_plot_obj(
|
534 |
+
# plot_df,
|
535 |
+
# ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K", "DROP"],
|
536 |
+
# HUMAN_BASELINES,
|
537 |
+
# title="Top Scores and Human Baseline Over Time",
|
538 |
+
# )
|
539 |
+
# gr.Plot(value=chart, interactive=False, width=500, height=500)
|
540 |
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
|
541 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
542 |
|
src/assets/hardcoded_evals.py
CHANGED
@@ -35,6 +35,9 @@ baseline = {
|
|
35 |
AutoEvalColumn.hellaswag.name: 25.0,
|
36 |
AutoEvalColumn.mmlu.name: 25.0,
|
37 |
AutoEvalColumn.truthfulqa.name: 25.0,
|
|
|
|
|
|
|
38 |
AutoEvalColumn.dummy.name: "baseline",
|
39 |
AutoEvalColumn.model_type.name: "",
|
40 |
}
|
|
|
35 |
AutoEvalColumn.hellaswag.name: 25.0,
|
36 |
AutoEvalColumn.mmlu.name: 25.0,
|
37 |
AutoEvalColumn.truthfulqa.name: 25.0,
|
38 |
+
AutoEvalColumn.winogrande.name: 50.0,
|
39 |
+
AutoEvalColumn.gsm8k.name: 0.21,
|
40 |
+
AutoEvalColumn.drop.name: 0.47,
|
41 |
AutoEvalColumn.dummy.name: "baseline",
|
42 |
AutoEvalColumn.model_type.name: "",
|
43 |
}
|
src/assets/text_content.py
CHANGED
@@ -31,7 +31,10 @@ If there is no icon, we have not uploaded the information on the model yet, feel
|
|
31 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
|
32 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
|
33 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
|
34 |
-
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model
|
|
|
|
|
|
|
35 |
|
36 |
For all these evaluations, a higher score is a better score.
|
37 |
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
|
@@ -55,6 +58,14 @@ The tasks and few shots parameters are:
|
|
55 |
- HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
|
56 |
- TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
|
57 |
- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
## Quantization
|
60 |
To get more information about quantization, see:
|
@@ -166,4 +177,45 @@ CITATION_BUTTON_TEXT = r"""
|
|
166 |
eprint={2109.07958},
|
167 |
archivePrefix={arXiv},
|
168 |
primaryClass={cs.CL}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
}"""
|
|
|
31 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
|
32 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
|
33 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
|
34 |
+
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually at minimum a 6-shot task, as 6 examples are systematically prepended, even when launched using 0 for the number of few-shot examples.
|
35 |
+
- <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
|
36 |
+
- <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
|
37 |
+
- <a href="https://arxiv.org/abs/1903.00161" target="_blank"> DROP </a> (3-shot) - English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.
|
38 |
|
39 |
For all these evaluations, a higher score is a better score.
|
40 |
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
|
|
|
58 |
- HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
|
59 |
- TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
|
60 |
- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
|
61 |
+
- Winogrande: 5-shot, *winogrande* (`acc`)
|
62 |
+
- GSM8k: 5-shot, *gsm8k* (`acc`)
|
63 |
+
- DROP: 3-shot, *drop* (`f1`)
|
64 |
+
|
65 |
+
Side note on the baseline scores:
|
66 |
+
- for log-likelihood evaluation, we select the random baseline
|
67 |
+
- for DROP, we select the best submission score according to [their leaderboard](https://leaderboard.allenai.org/drop/submissions/public) when the paper came out (NAQANet score)
|
68 |
+
- for GSM8K, we select the score obtained in the paper after fine-tuning a 6B model on the full GSM8K training set for 50 epochs
|
69 |
|
70 |
## Quantization
|
71 |
To get more information about quantization, see:
|
|
|
177 |
eprint={2109.07958},
|
178 |
archivePrefix={arXiv},
|
179 |
primaryClass={cs.CL}
|
180 |
+
}
|
181 |
+
@misc{DBLP:journals/corr/abs-1907-10641,
|
182 |
+
title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
|
183 |
+
author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
|
184 |
+
year={2019},
|
185 |
+
eprint={1907.10641},
|
186 |
+
archivePrefix={arXiv},
|
187 |
+
primaryClass={cs.CL}
|
188 |
+
}
|
189 |
+
@misc{DBLP:journals/corr/abs-2110-14168,
|
190 |
+
title={Training Verifiers to Solve Math Word Problems},
|
191 |
+
author={Karl Cobbe and
|
192 |
+
Vineet Kosaraju and
|
193 |
+
Mohammad Bavarian and
|
194 |
+
Mark Chen and
|
195 |
+
Heewoo Jun and
|
196 |
+
Lukasz Kaiser and
|
197 |
+
Matthias Plappert and
|
198 |
+
Jerry Tworek and
|
199 |
+
Jacob Hilton and
|
200 |
+
Reiichiro Nakano and
|
201 |
+
Christopher Hesse and
|
202 |
+
John Schulman},
|
203 |
+
year={2021},
|
204 |
+
eprint={2110.14168},
|
205 |
+
archivePrefix={arXiv},
|
206 |
+
primaryClass={cs.CL}
|
207 |
+
}
|
208 |
+
@misc{DBLP:journals/corr/abs-1903-00161,
|
209 |
+
title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
|
210 |
+
Over Paragraphs},
|
211 |
+
author={Dheeru Dua and
|
212 |
+
Yizhong Wang and
|
213 |
+
Pradeep Dasigi and
|
214 |
+
Gabriel Stanovsky and
|
215 |
+
Sameer Singh and
|
216 |
+
Matt Gardner},
|
217 |
+
year={2019},
|
218 |
+
eprinttype={arXiv},
|
219 |
+
eprint={1903.00161},
|
220 |
+
primaryClass={cs.CL}
|
221 |
}"""
|
src/get_model_info/utils.py
CHANGED
@@ -29,6 +29,9 @@ class AutoEvalColumn: # Auto evals column
|
|
29 |
hellaswag = ColumnContent("HellaSwag", "number", True)
|
30 |
mmlu = ColumnContent("MMLU", "number", True)
|
31 |
truthfulqa = ColumnContent("TruthfulQA", "number", True)
|
|
|
|
|
|
|
32 |
model_type = ColumnContent("Type", "str", False)
|
33 |
precision = ColumnContent("Precision", "str", False) # , True)
|
34 |
license = ColumnContent("Hub License", "str", False)
|
|
|
29 |
hellaswag = ColumnContent("HellaSwag", "number", True)
|
30 |
mmlu = ColumnContent("MMLU", "number", True)
|
31 |
truthfulqa = ColumnContent("TruthfulQA", "number", True)
|
32 |
+
winogrande = ColumnContent("Winogrande", "number", True)
|
33 |
+
gsm8k = ColumnContent("GSM8K", "number", True)
|
34 |
+
drop = ColumnContent("DROP", "number", True)
|
35 |
model_type = ColumnContent("Type", "str", False)
|
36 |
precision = ColumnContent("Precision", "str", False) # , True)
|
37 |
license = ColumnContent("Hub License", "str", False)
|
src/plots/read_results.py
CHANGED
@@ -8,13 +8,16 @@ import numpy as np
|
|
8 |
|
9 |
from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
|
10 |
|
11 |
-
METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
|
12 |
-
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
|
13 |
BENCH_TO_NAME = {
|
14 |
"arc:challenge": AutoEvalColumn.arc.name,
|
15 |
"hellaswag": AutoEvalColumn.hellaswag.name,
|
16 |
"hendrycksTest": AutoEvalColumn.mmlu.name,
|
17 |
"truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
|
|
|
|
|
|
|
18 |
}
|
19 |
|
20 |
|
@@ -46,7 +49,7 @@ class EvalResult:
|
|
46 |
data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
|
47 |
data_dict[AutoEvalColumn.dummy.name] = base_model
|
48 |
data_dict[AutoEvalColumn.revision.name] = self.revision
|
49 |
-
data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) /
|
50 |
data_dict[AutoEvalColumn.still_on_hub.name] = (
|
51 |
is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
|
52 |
)
|
|
|
8 |
|
9 |
from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
|
10 |
|
11 |
+
METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc", "acc", "f1"]
|
12 |
+
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc", "winogrande", "gsm8k", "drop"]
|
13 |
BENCH_TO_NAME = {
|
14 |
"arc:challenge": AutoEvalColumn.arc.name,
|
15 |
"hellaswag": AutoEvalColumn.hellaswag.name,
|
16 |
"hendrycksTest": AutoEvalColumn.mmlu.name,
|
17 |
"truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
|
18 |
+
"winogrande": AutoEvalColumn.winogrande.name,
|
19 |
+
"gsm8k": AutoEvalColumn.gsm8k.name,
|
20 |
+
"drop": AutoEvalColumn.drop.name,
|
21 |
}
|
22 |
|
23 |
|
|
|
49 |
data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
|
50 |
data_dict[AutoEvalColumn.dummy.name] = base_model
|
51 |
data_dict[AutoEvalColumn.revision.name] = self.revision
|
52 |
+
data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 7.0
|
53 |
data_dict[AutoEvalColumn.still_on_hub.name] = (
|
54 |
is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
|
55 |
)
|