feat: separate the qa and longdoc tasks
- src/benchmarks.py +6 -4
- src/display/utils.py +30 -21
- tests/src/display/test_utils.py +5 -4
src/benchmarks.py
CHANGED
@@ -110,7 +110,8 @@ class Benchmark:
     metric: str  # ndcg_at_1 ,metric_key in the json file
     col_name: str  # [domain]_[language], name to display in the leaderboard
 
-benchmark_dict = {}
+qa_benchmark_dict = {}
+long_doc_benchmark_dict = {}
 for task, domain_dict in dataset_dict.items():
     for domain, lang_dict in domain_dict.items():
         for lang, dataset_list in lang_dict.items():
@@ -119,13 +120,14 @@ for task, domain_dict in dataset_dict.items():
                 benchmark_name = get_safe_name(benchmark_name)
                 col_name = f"{domain}_{lang}"
                 for metric in dataset_list:
-                    benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
+                    qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
             elif task == "long_doc":
                 for dataset in dataset_list:
                     col_name = f"{domain}_{lang}_{dataset}"
                     for metric in metric_list:
                         benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
                         benchmark_name = get_safe_name(benchmark_name)
-                        benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
+                        long_doc_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name)
 
-Benchmarks = Enum('Benchmarks', benchmark_dict)
+BenchmarksQA = Enum('BenchmarksQA', qa_benchmark_dict)
+BenchmarksLongDoc = Enum('BenchmarksLongDoc', long_doc_benchmark_dict)
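The refactor keeps the registration loop intact but routes each Benchmark into a task-specific dict and wraps each dict in its own Enum, so QA and long-doc scores can be handled independently downstream. Below is a minimal, runnable sketch of that Enum-from-dict pattern; the Benchmark field names and the example dataset/metric strings are illustrative assumptions, not values taken from the repo.

# sketch_benchmark_enums.py -- illustrative only; field, dataset, and metric names are assumed
from dataclasses import dataclass
from enum import Enum


@dataclass(frozen=True)
class Benchmark:
    name: str      # safe benchmark name, e.g. "qa_wiki_en_ndcg_at_10"
    metric: str    # metric key in the result json, e.g. "ndcg_at_10"
    col_name: str  # column shown in the leaderboard, e.g. "wiki_en"


qa_benchmark_dict = {
    "qa_wiki_en_ndcg_at_10": Benchmark("qa_wiki_en_ndcg_at_10", "ndcg_at_10", "wiki_en"),
}
long_doc_benchmark_dict = {
    "long_doc_law_en_lex_files_ndcg_at_10": Benchmark(
        "long_doc_law_en_lex_files_ndcg_at_10", "ndcg_at_10", "law_en_lex_files"
    ),
}

# The functional Enum API accepts a {member_name: value} mapping, so each member's
# .value is the Benchmark instance registered above.
BenchmarksQA = Enum("BenchmarksQA", qa_benchmark_dict)
BenchmarksLongDoc = Enum("BenchmarksLongDoc", long_doc_benchmark_dict)

for b in BenchmarksQA:
    print(b.name, b.value.col_name, b.value.metric)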
src/display/utils.py
CHANGED
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, make_dataclass
 
-from src.benchmarks import Benchmarks
+from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
 
 
 def fields(raw_class):
@@ -19,25 +19,32 @@ class ColumnContent:
     never_hidden: bool = False
 
 
-## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(
-    ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)]
-)
-auto_eval_column_dict.append(
-    ["reranking_model", ColumnContent, ColumnContent("Reranking Model", "markdown", True, never_hidden=True)]
-)
-auto_eval_column_dict.append(
-    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
-)
-for benchmark in Benchmarks:
-    auto_eval_column_dict.append(
-        [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
-    )
+def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
+    ## Leaderboard columns
+    auto_eval_column_dict = []
+    # Init
+    auto_eval_column_dict.append(
+        ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)]
+    )
+    auto_eval_column_dict.append(
+        ["reranking_model", ColumnContent, ColumnContent("Reranking Model", "markdown", True, never_hidden=True)]
+    )
+    auto_eval_column_dict.append(
+        ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
+    )
+    for benchmark in benchmarks:
+        auto_eval_column_dict.append(
+            [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
+        )
 
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+    # We use make dataclass to dynamically fill the scores from Tasks
+    return make_dataclass(cls_name, auto_eval_column_dict, frozen=True)
+
+
+AutoEvalColumnQA = make_autoevalcolumn(
+    "AutoEvalColumnQA", BenchmarksQA)
+AutoEvalColumnLongDoc = make_autoevalcolumn(
+    "AutoEvalColumnLongDoc", BenchmarksLongDoc)
 
 
 ## For the queue columns in the submission tab
@@ -48,10 +55,12 @@ class EvalQueueColumn: # Queue column
 
 
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
-TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+COLS = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
+COLS_LITE = [c.name for c in fields(AutoEvalColumnQA) if c.displayed_by_default and not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Benchmarks]
+QA_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksQA]
+
+LONG_DOC_BENCHMARK_COLS = [t.value.col_name for t in BenchmarksLongDoc]
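make_autoevalcolumn builds the column schema by handing make_dataclass a list of [field_name, type, default] triples whose defaults are ColumnContent instances; the module's fields() helper then exposes those defaults so COLS, TYPES, and COLS_LITE can filter on hidden and displayed_by_default. The sketch below is a simplified stand-in, assuming ColumnContent is a frozen dataclass with roughly these attributes (only never_hidden is visible in the diff context) and using dataclasses.fields() in place of the repo's own helper.

# sketch_make_dataclass.py -- assumes ColumnContent looks roughly like this
import dataclasses
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


auto_eval_column_dict = [
    ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)],
    ["wiki_en", ColumnContent, ColumnContent("wiki_en", "number", True)],
]

# Each [name, type, default] triple becomes a dataclass field whose default
# is the ColumnContent instance.
DemoColumn = make_dataclass("DemoColumn", auto_eval_column_dict, frozen=True)

# The repo's fields() helper yields these ColumnContent defaults;
# dataclasses.fields() surfaces the same information here.
for f in dataclasses.fields(DemoColumn):
    col = f.default
    print(f.name, col.name, col.type, col.displayed_by_default, col.hidden)

Because each call creates a fresh auto_eval_column_dict, AutoEvalColumnQA and AutoEvalColumnLongDoc end up with independent per-benchmark score columns while sharing the retrieval model, reranking model, and average columns.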
tests/src/display/test_utils.py
CHANGED
@@ -1,10 +1,11 @@
 import pytest
-from src.display.utils import fields, AutoEvalColumn, COLS, COLS_LITE, TYPES, EVAL_COLS, BENCHMARK_COLS
+from src.display.utils import fields, AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS, COLS_LITE, TYPES, EVAL_COLS, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
 
 
+@pytest.mark.parametrize('auto_eval_column')
 def test_fields():
-    for c in fields(AutoEvalColumn):
-        print(c)
+    for c in fields(AutoEvalColumnQA):
+        print(c)
 
 
 def test_macro_variables():
@@ -12,4 +13,4 @@ def test_macro_variables():
     print(f'COLS_LITE: {COLS_LITE}')
     print(f'TYPES: {TYPES}')
    print(f'EVAL_COLS: {EVAL_COLS}')
-    print(f'BENCHMARK_COLS: {BENCHMARK_COLS}')
+    print(f'BENCHMARK_COLS: {QA_BENCHMARK_COLS}')