Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
9134169
·
1 Parent(s): 8b7a945

feat: separate the qa and longdoc tasks

Browse files
src/benchmarks.py CHANGED
@@ -110,7 +110,8 @@ class Benchmark:
110
  metric: str # ndcg_at_1 ,metric_key in the json file
111
  col_name: str # [domain]_[language], name to display in the leaderboard
112
 
113
- benchmark_dict = {}
 
114
  for task, domain_dict in dataset_dict.items():
115
  for domain, lang_dict in domain_dict.items():
116
  for lang, dataset_list in lang_dict.items():
@@ -119,13 +120,14 @@ for task, domain_dict in dataset_dict.items():
119
  benchmark_name = get_safe_name(benchmark_name)
120
  col_name = f"{domain}_{lang}"
121
  for metric in dataset_list:
122
- benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
123
  elif task == "long_doc":
124
  for dataset in dataset_list:
125
  col_name = f"{domain}_{lang}_{dataset}"
126
  for metric in metric_list:
127
  benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
128
  benchmark_name = get_safe_name(benchmark_name)
129
- benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
130
 
131
- Benchmarks = Enum('Benchmarks', benchmark_dict)
 
 
110
  metric: str # ndcg_at_1 ,metric_key in the json file
111
  col_name: str # [domain]_[language], name to display in the leaderboard
112
 
113
+ qa_benchmark_dict = {}
114
+ long_doc_benchmark_dict = {}
115
  for task, domain_dict in dataset_dict.items():
116
  for domain, lang_dict in domain_dict.items():
117
  for lang, dataset_list in lang_dict.items():
 
120
  benchmark_name = get_safe_name(benchmark_name)
121
  col_name = f"{domain}_{lang}"
122
  for metric in dataset_list:
123
+ qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
124
  elif task == "long_doc":
125
  for dataset in dataset_list:
126
  col_name = f"{domain}_{lang}_{dataset}"
127
  for metric in metric_list:
128
  benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
129
  benchmark_name = get_safe_name(benchmark_name)
130
+ long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
131
 
132
+ BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
133
+ BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)
src/display/utils.py CHANGED
@@ -1,6 +1,6 @@
1
  from dataclasses import dataclass, make_dataclass
2
 
3
- from src.benchmarks import Benchmarks
4
 
5
 
6
  def fields(raw_class):
@@ -19,25 +19,32 @@ class ColumnContent:
19
  never_hidden: bool = False
20
 
21
 
22
- ## Leaderboard columns
23
- auto_eval_column_dict = []
24
- # Init
25
- auto_eval_column_dict.append(
26
- ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)]
27
- )
28
- auto_eval_column_dict.append(
29
- ["reranking_model", ColumnContent, ColumnContent("Reranking Model", "markdown", True, never_hidden=True)]
30
- )
31
- auto_eval_column_dict.append(
32
- ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
33
- )
34
- for benchmark in Benchmarks:
35
  auto_eval_column_dict.append(
36
- [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
37
  )
 
 
 
 
 
 
 
 
 
 
38
 
39
- # We use make dataclass to dynamically fill the scores from Tasks
40
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
 
 
 
 
41
 
42
 
43
  ## For the queue columns in the submission tab
@@ -48,10 +55,12 @@ class EvalQueueColumn: # Queue column
48
 
49
 
50
  # Column selection
51
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
52
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
53
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
54
 
55
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
56
 
57
- BENCHMARK_COLS = [t.value.col_name for t in Benchmarks]
 
 
 
1
  from dataclasses import dataclass, make_dataclass
2
 
3
+ from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
4
 
5
 
6
  def fields(raw_class):
 
19
  never_hidden: bool = False
20
 
21
 
22
+ def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
23
+ ## Leaderboard columns
24
+ auto_eval_column_dict = []
25
+ # Init
 
 
 
 
 
 
 
 
 
26
  auto_eval_column_dict.append(
27
+ ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)]
28
  )
29
+ auto_eval_column_dict.append(
30
+ ["reranking_model", ColumnContent, ColumnContent("Reranking Model", "markdown", True, never_hidden=True)]
31
+ )
32
+ auto_eval_column_dict.append(
33
+ ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
34
+ )
35
+ for benchmark in benchmarks:
36
+ auto_eval_column_dict.append(
37
+ [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
38
+ )
39
 
40
+ # We use make dataclass to dynamically fill the scores from Tasks
41
+ return make_dataclass(cls_name, auto_eval_column_dict, frozen=True)
42
+
43
+
44
+ AutoEvalColumnQA = make_autoevalcolumn(
45
+ "AutoEvalColumnQA", BenchmarksQA)
46
+ AutoEvalColumnLongDoc = make_autoevalcolumn(
47
+ "AutoEvalColumnLongDoc", BenchmarksLongDoc)
48
 
49
 
50
  ## For the queue columns in the submission tab
 
55
 
56
 
57
  # Column selection
58
+ COLS = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
59
+ TYPES = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
60
+ COLS_LITE = [c.name for c in fields(AutoEvalColumnQA) if c.displayed_by_default and not c.hidden]
61
 
62
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
63
 
64
+ QA_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksQA]
65
+
66
+ LONG_DOC_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksLongDoc]
tests/src/display/test_utils.py CHANGED
@@ -1,10 +1,11 @@
1
  import pytest
2
- from src.display.utils import fields, AutoEvalColumn, COLS, COLS_LITE, TYPES, EVAL_COLS, BENCHMARK_COLS
3
 
4
 
 
5
  def test_fields():
6
- for c in fields(AutoEvalColumn):
7
- print(c.name)
8
 
9
 
10
  def test_macro_variables():
@@ -12,4 +13,4 @@ def test_macro_variables():
12
  print(f'COLS_LITE: {COLS_LITE}')
13
  print(f'TYPES: {TYPES}')
14
  print(f'EVAL_COLS: {EVAL_COLS}')
15
- print(f'BENCHMARK_COLS: {BENCHMARK_COLS}')
 
1
  import pytest
2
+ from src.display.utils import fields, AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS, COLS_LITE, TYPES, EVAL_COLS, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
3
 
4
 
5
+ @pytest.mark.parametrize('auto_eval_column')
6
  def test_fields():
7
+ for c in fields(AutoEvalColumnQA):
8
+ print(c)
9
 
10
 
11
  def test_macro_variables():
 
13
  print(f'COLS_LITE: {COLS_LITE}')
14
  print(f'TYPES: {TYPES}')
15
  print(f'EVAL_COLS: {EVAL_COLS}')
16
+ print(f'BENCHMARK_COLS: {QA_BENCHMARK_COLS}')