Sean Cho committed on
Commit
9afcf15
1 Parent(s): a86ccea

update evaluation columns

Browse files
app.py CHANGED
@@ -258,8 +258,8 @@ NUMERIC_INTERVALS = {
258
  "~3B": (1.5, 5),
259
  "~7B": (6, 11),
260
  "~13B": (12, 15),
261
- "~35B": (16, 55),
262
- "60B+": (55, 10000),
263
  }
264
 
265
  def filter_models(
 
258
  "~3B": (1.5, 5),
259
  "~7B": (6, 11),
260
  "~13B": (12, 15),
261
+ # "~35B": (16, 55),
262
+ # "60B+": (55, 10000),
263
  }
264
 
265
  def filter_models(
src/display_models/read_results.py CHANGED
@@ -9,12 +9,14 @@ import numpy as np
9
  from src.display_models.utils import AutoEvalColumn, make_clickable_model
10
 
11
  METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
12
- BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
13
  BENCH_TO_NAME = {
14
  "arc:challenge": AutoEvalColumn.arc.name,
15
  "hellaswag": AutoEvalColumn.hellaswag.name,
16
  "hendrycksTest": AutoEvalColumn.mmlu.name,
17
  "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
 
 
18
  }
19
 
20
 
 
9
  from src.display_models.utils import AutoEvalColumn, make_clickable_model
10
 
11
  METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
12
+ BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc", "commongen", "ethicalverification"]
13
  BENCH_TO_NAME = {
14
  "arc:challenge": AutoEvalColumn.arc.name,
15
  "hellaswag": AutoEvalColumn.hellaswag.name,
16
  "hendrycksTest": AutoEvalColumn.mmlu.name,
17
  "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
18
+ "commongen": AutoEvalColumn.commongen.name,
19
+ "ethicalverification": AutoEvalColumn.ethicalverification.name,
20
  }
21
 
22
 
src/display_models/utils.py CHANGED
@@ -25,10 +25,12 @@ class AutoEvalColumn: # Auto evals column
25
  model_type_symbol = ColumnContent("T", "str", True)
26
  model = ColumnContent("Model", "markdown", True)
27
  average = ColumnContent("Average ⬆️", "number", True)
28
- arc = ColumnContent("ARC", "number", True)
29
- hellaswag = ColumnContent("HellaSwag", "number", True)
30
- mmlu = ColumnContent("MMLU", "number", True)
31
- truthfulqa = ColumnContent("TruthfulQA", "number", True)
 
 
32
  model_type = ColumnContent("Type", "str", False)
33
  precision = ColumnContent("Precision", "str", False) # , True)
34
  license = ColumnContent("Hub License", "str", False)
 
25
  model_type_symbol = ColumnContent("T", "str", True)
26
  model = ColumnContent("Model", "markdown", True)
27
  average = ColumnContent("Average ⬆️", "number", True)
28
+ arc = ColumnContent("Ko-ARC", "number", True)
29
+ hellaswag = ColumnContent("Ko-HellaSwag", "number", True)
30
+ mmlu = ColumnContent("Ko-MMLU", "number", True)
31
+ truthfulqa = ColumnContent("Ko-TruthfulQA", "number", True)
32
+ commongen = ColumnContent("Ko-CommonGen", "number", True)
33
+ ethicalverification = ColumnContent("EthicalVerification", "number", True)
34
  model_type = ColumnContent("Type", "str", False)
35
  precision = ColumnContent("Precision", "str", False) # , True)
36
  license = ColumnContent("Hub License", "str", False)