Nathan Habib committed
Commit e3aaf53 • 1 Parent(s): 26286b2

add new evals to the leaderboard

app.py CHANGED
@@ -88,6 +88,9 @@ BENCHMARK_COLS = [
         AutoEvalColumn.hellaswag,
         AutoEvalColumn.mmlu,
         AutoEvalColumn.truthfulqa,
+        AutoEvalColumn.winogrande,
+        AutoEvalColumn.gsm8k,
+        AutoEvalColumn.drop
     ]
 ]
 
@@ -107,7 +110,7 @@ update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
 models = original_df["model_name_for_query"].tolist() # needed for model backlinks in their to the leaderboard
-plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
+#plot_df = create_plot_df(create_scores_df(join_model_info_with_results(original_df)))
 to_be_dumped = f"models = {repr(models)}\n"
 
 (
@@ -516,24 +519,24 @@ with demo:
                 queue=True,
             )
 
-        with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
-            with gr.Row():
-                with gr.Column():
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        ["Average ⬆️"],
-                        HUMAN_BASELINES,
-                        title="Average of Top Scores and Human Baseline Over Time",
-                    )
-                    gr.Plot(value=chart, interactive=False, width=500, height=500)
-                with gr.Column():
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        ["ARC", "HellaSwag", "MMLU", "TruthfulQA"],
-                        HUMAN_BASELINES,
-                        title="Top Scores and Human Baseline Over Time",
-                    )
-                    gr.Plot(value=chart, interactive=False, width=500, height=500)
+        # with gr.TabItem("📈 Metrics evolution through time", elem_id="llm-benchmark-tab-table", id=4):
+        #     with gr.Row():
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 ["Average ⬆️"],
+        #                 HUMAN_BASELINES,
+        #                 title="Average of Top Scores and Human Baseline Over Time",
+        #             )
+        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 ["ARC", "HellaSwag", "MMLU", "TruthfulQA", "Winogrande", "GSM8K", "DROP"],
+        #                 HUMAN_BASELINES,
+        #                 title="Top Scores and Human Baseline Over Time",
+        #             )
+        #             gr.Plot(value=chart, interactive=False, width=500, height=500)
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
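Note on the disabled tab: the "Metrics evolution through time" plots depend on plot_df and HUMAN_BASELINES, which have not yet been rebuilt to cover the three new benchmarks, so the whole tab is commented out rather than shipped half-updated. The snippet below is a minimal, self-contained sketch of the kind of figure that tab renders, one line per benchmark over time, built with plotly and shown through gr.Plot. The dataframe columns and scores are illustrative placeholders, and the repository's create_metric_plot_obj is deliberately not used here.

import gradio as gr
import pandas as pd
import plotly.express as px

# Illustrative data only: a few top scores per benchmark over time.
plot_df = pd.DataFrame(
    {
        "date": ["2023-07-01", "2023-08-01", "2023-09-01"] * 2,
        "benchmark": ["MMLU"] * 3 + ["GSM8K"] * 3,
        "score": [60.0, 63.5, 66.0, 30.0, 38.0, 45.0],
    }
)

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("📈 Metrics evolution through time"):
            # One line per benchmark, analogous to the disabled leaderboard tab.
            fig = px.line(
                plot_df, x="date", y="score", color="benchmark",
                title="Top Scores Over Time (illustrative data)",
            )
            gr.Plot(value=fig)

# demo.launch()
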
src/assets/hardcoded_evals.py CHANGED
@@ -35,6 +35,9 @@ baseline = {
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
+    AutoEvalColumn.winogrande.name: 50.0,
+    AutoEvalColumn.gsm8k.name: 0.21,
+    AutoEvalColumn.drop.name: 0.47,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
 }
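
A quick arithmetic check on the baseline row (plain Python, not repository code): assuming the ARC entry of 25.0 from earlier in the same dict (not shown in this hunk), and averaging the seven benchmark scores the way src/plots/read_results.py now does, the baseline's displayed average would come out around 21.5.

# Baseline reference scores: random chance for the log-likelihood tasks, the NAQANet
# DROP score and the finetuned-6B GSM8K score described in src/assets/text_content.py.
baseline_scores = {
    "ARC": 25.0,          # assumed from earlier in the same dict (not shown in this hunk)
    "HellaSwag": 25.0,
    "MMLU": 25.0,
    "TruthfulQA": 25.0,
    "Winogrande": 50.0,
    "GSM8K": 0.21,
    "DROP": 0.47,
}

average = sum(baseline_scores.values()) / len(baseline_scores)
print(f"{average:.2f}")  # 21.53
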
src/assets/text_content.py CHANGED
@@ -31,7 +31,10 @@ If there is no icon, we have not uploaded the information on the model yet, feel
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model’s propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually at minimum a 6-shot task, since 6 examples are systematically prepended even when it is launched with 0 few-shot examples.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a test to measure a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA in the Harness is actually at minimum a 6-shot task, since 6 examples are systematically prepended even when it is launched with 0 few-shot examples.
+- <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande </a> (5-shot) - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
+- <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k </a> (5-shot) - diverse grade school math word problems to measure a model's ability to solve multi-step mathematical reasoning problems.
+- <a href="https://arxiv.org/abs/1903.00161" target="_blank"> DROP </a> (3-shot) - English reading comprehension benchmark requiring Discrete Reasoning Over the content of Paragraphs.
 
 For all these evaluations, a higher score is a better score.
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
@@ -55,6 +58,14 @@ The tasks and few shots parameters are:
 - HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
 - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (average of all the results `acc`)
+- Winogrande: 5-shot, *winogrande* (`acc`)
+- GSM8k: 5-shot, *gsm8k* (`acc`)
+- DROP: 3-shot, *drop* (`f1`)
+
+Side note on the baseline scores:
+- for log-likelihood evaluation, we select the random baseline
+- for DROP, we select the best submission score according to [their leaderboard](https://leaderboard.allenai.org/drop/submissions/public) when the paper came out (NAQANet score)
+- for GSM8K, we select the score obtained in the paper after finetuning a 6B model on the full GSM8K training set for 50 epochs
 
 ## Quantization
 To get more information about quantization, see:
@@ -166,4 +177,45 @@ CITATION_BUTTON_TEXT = r"""
     eprint={2109.07958},
     archivePrefix={arXiv},
     primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1907-10641,
+    title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
+    author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
+    year={2019},
+    eprint={1907.10641},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-2110-14168,
+    title={Training Verifiers to Solve Math Word Problems},
+    author={Karl Cobbe and
+            Vineet Kosaraju and
+            Mohammad Bavarian and
+            Mark Chen and
+            Heewoo Jun and
+            Lukasz Kaiser and
+            Matthias Plappert and
+            Jerry Tworek and
+            Jacob Hilton and
+            Reiichiro Nakano and
+            Christopher Hesse and
+            John Schulman},
+    year={2021},
+    eprint={2110.14168},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+@misc{DBLP:journals/corr/abs-1903-00161,
+    title={{DROP:} {A} Reading Comprehension Benchmark Requiring Discrete Reasoning
+           Over Paragraphs},
+    author={Dheeru Dua and
+            Yizhong Wang and
+            Pradeep Dasigi and
+            Gabriel Stanovsky and
+            Sameer Singh and
+            Matt Gardner},
+    year={2019},
+    eprinttype={arXiv},
+    eprint={1903.00161},
+    primaryClass={cs.CL}
 }"""
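
The documentation above pairs each benchmark with a few-shot count and a reported metric. As a compact cross-reference, here is that configuration expressed as data; the TASK_CONFIG name and tuple layout are hypothetical conveniences, not something defined in the repository, and the task keys follow the result-file names used in src/plots/read_results.py (with the 57 hendrycksTest subtasks collapsed into one entry).

# benchmark -> (result-file task name, few-shot examples, reported metric)
TASK_CONFIG = {
    "ARC": ("arc:challenge", 25, "acc_norm"),
    "HellaSwag": ("hellaswag", 10, "acc_norm"),
    "MMLU": ("hendrycksTest", 5, "acc"),        # averaged over the 57 subtasks
    "TruthfulQA": ("truthfulqa:mc", 0, "mc2"),
    "Winogrande": ("winogrande", 5, "acc"),
    "GSM8K": ("gsm8k", 5, "acc"),
    "DROP": ("drop", 3, "f1"),
}

for benchmark, (task, shots, metric) in TASK_CONFIG.items():
    print(f"{benchmark}: {shots}-shot on {task}, reporting {metric}")
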
src/get_model_info/utils.py CHANGED
@@ -29,6 +29,9 @@ class AutoEvalColumn: # Auto evals column
     hellaswag = ColumnContent("HellaSwag", "number", True)
     mmlu = ColumnContent("MMLU", "number", True)
     truthfulqa = ColumnContent("TruthfulQA", "number", True)
+    winogrande = ColumnContent("Winogrande", "number", True)
+    gsm8k = ColumnContent("GSM8K", "number", True)
+    drop = ColumnContent("DROP", "number", True)
     model_type = ColumnContent("Type", "str", False)
     precision = ColumnContent("Precision", "str", False) # , True)
     license = ColumnContent("Hub License", "str", False)
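
From the call sites above, ColumnContent appears to bundle a display name, a table column type, and a default-visibility flag, and the three new benchmark columns reuse that pattern unchanged. The sketch below is a self-contained approximation of that structure; the field names are assumptions inferred from the arguments, not the repository's actual definition.

from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                    # header shown in the leaderboard table
    type: str                    # column type, e.g. "number" or "str"
    displayed_by_default: bool   # whether the column starts out visible

class AutoEvalColumn:
    # Benchmark score columns, visible by default.
    hellaswag = ColumnContent("HellaSwag", "number", True)
    mmlu = ColumnContent("MMLU", "number", True)
    truthfulqa = ColumnContent("TruthfulQA", "number", True)
    winogrande = ColumnContent("Winogrande", "number", True)
    gsm8k = ColumnContent("GSM8K", "number", True)
    drop = ColumnContent("DROP", "number", True)
    # Metadata columns, hidden by default.
    model_type = ColumnContent("Type", "str", False)

# Columns a fresh page load would display under this assumption:
default_columns = [
    col.name
    for col in vars(AutoEvalColumn).values()
    if isinstance(col, ColumnContent) and col.displayed_by_default
]
print(default_columns)
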
src/plots/read_results.py CHANGED
@@ -8,13 +8,16 @@ import numpy as np
 
 from src.get_model_info.utils import AutoEvalColumn, make_clickable_model
 
-METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
-BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
+METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc", "acc", "f1"]
+BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc", "winogrande", "gsm8k", "drop"]
 BENCH_TO_NAME = {
     "arc:challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
     "hendrycksTest": AutoEvalColumn.mmlu.name,
     "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
+    "winogrande": AutoEvalColumn.winogrande.name,
+    "gsm8k": AutoEvalColumn.gsm8k.name,
+    "drop": AutoEvalColumn.drop.name,
 }
 
 
@@ -46,7 +49,7 @@ class EvalResult:
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
-        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
+        data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 7.0
         data_dict[AutoEvalColumn.still_on_hub.name] = (
             is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
         )
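
METRICS, BENCHMARKS, BENCH_TO_NAME and the hard-coded 7.0 divisor all have to move together whenever a benchmark is added, which is easy to miss. The snippet below is a hedged sketch of a small consistency check, not part of the repository; plain display strings stand in for AutoEvalColumn.<benchmark>.name so it runs on its own.

# Parallel structures mirrored from src/plots/read_results.py.
METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc", "acc", "f1"]
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc", "winogrande", "gsm8k", "drop"]
BENCH_TO_NAME = {
    "arc:challenge": "ARC",
    "hellaswag": "HellaSwag",
    "hendrycksTest": "MMLU",
    "truthfulqa:mc": "TruthfulQA",
    "winogrande": "Winogrande",
    "gsm8k": "GSM8K",
    "drop": "DROP",
}
AVERAGE_DIVISOR = 7.0  # must match the divisor used in EvalResult.to_dict()

def check_benchmark_config() -> None:
    # One metric per benchmark, one display column per benchmark,
    # and an average divisor equal to the number of benchmarks.
    assert len(METRICS) == len(BENCHMARKS)
    assert set(BENCH_TO_NAME) == set(BENCHMARKS)
    assert AVERAGE_DIVISOR == len(BENCHMARKS)

check_benchmark_config()

An alternative to the hard-coded divisor would be dividing by len(self.results), but that silently changes behaviour for models with missing scores (they would no longer be pulled toward zero), so keeping the explicit constant plus a check like this is the more conservative option.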