Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
7845083
·
1 Parent(s): 32ee53f

feat: implement the version selector for qa

Browse files
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  from apscheduler.schedulers.background import BackgroundScheduler
3
 
4
  from src.about import (
@@ -13,7 +14,7 @@ from src.display.css_html_js import custom_css
13
  from src.envs import (
14
  API,
15
  EVAL_RESULTS_PATH,
16
- REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST
17
  )
18
  from src.loaders import (
19
  load_eval_results
@@ -48,9 +49,10 @@ def restart_space():
48
  # print(f'failed to download')
49
  # restart_space()
50
 
51
-
52
  data = load_eval_results(EVAL_RESULTS_PATH)
53
-
 
54
 
55
  def update_metric_qa(
56
  metric: str,
@@ -60,28 +62,36 @@ def update_metric_qa(
60
  query: str,
61
  show_anonymous: bool,
62
  show_revision_and_timestamp: bool,
63
- selected_version: str,
64
- ):
65
- return update_metric(data[selected_version].raw_data, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
66
-
67
-
68
- def update_metric_long_doc(
69
- metric: str,
70
- domains: list,
71
- langs: list,
72
- reranking_model: list,
73
- query: str,
74
- show_anonymous: bool,
75
- show_revision_and_timestamp,
76
  ):
77
- return update_metric(data["AIR-Bench_24.04"].raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
78
-
79
-
80
- DOMAIN_COLS_QA = list(frozenset([c.value.domain for c in list(QABenchmarks)]))
81
- LANG_COLS_QA = list(frozenset([c.value.lang for c in list(QABenchmarks)]))
82
-
83
- DOMAIN_COLS_LONG_DOC = list(frozenset([c.value.domain for c in list(LongDocBenchmarks)]))
84
- LANG_COLS_LONG_DOC = list(frozenset([c.value.lang for c in list(LongDocBenchmarks)]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  demo = gr.Blocks(css=custom_css)
87
 
@@ -99,11 +109,12 @@ with demo:
99
  with gr.Column(min_width=320):
100
  # select domain
101
  with gr.Row():
102
- selected_domains = get_domain_dropdown(DOMAIN_COLS_QA, DOMAIN_COLS_QA)
 
103
  # select language
104
  with gr.Row():
105
- selected_langs = get_language_dropdown(LANG_COLS_QA, LANG_COLS_QA)
106
-
107
  with gr.Column():
108
  # select the metric
109
  selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
@@ -119,16 +130,25 @@ with demo:
119
  search_bar = get_search_bar()
120
  # select reranking models
121
  with gr.Column():
122
- selected_rerankings = get_reranking_dropdown(data["AIR-Bench_24.04"].reranking_models)
123
- leaderboard_table = get_leaderboard_table(data["AIR-Bench_24.04"].leaderboard_df_qa, data["AIR-Bench_24.04"].types_qa)
 
 
124
  # Dummy leaderboard for handling the case when the user uses backspace key
125
- hidden_leaderboard_table_for_search = get_leaderboard_table(data["AIR-Bench_24.04"].raw_df_qa, data["AIR-Bench_24.04"].types_qa, visible=False)
 
126
 
 
 
 
 
 
127
  set_listeners(
128
  "qa",
129
  leaderboard_table,
130
  hidden_leaderboard_table_for_search,
131
  search_bar,
 
132
  selected_domains,
133
  selected_langs,
134
  selected_rerankings,
@@ -147,7 +167,6 @@ with demo:
147
  search_bar,
148
  show_anonymous,
149
  show_revision_and_timestamp,
150
- selected_version,
151
  ],
152
  leaderboard_table,
153
  queue=True
@@ -468,3 +487,5 @@ if __name__ == "__main__":
468
  scheduler.start()
469
  demo.queue(default_concurrency_limit=40)
470
  demo.launch()
 
 
 
1
  import gradio as gr
2
+ import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
 
5
  from src.about import (
 
14
  from src.envs import (
15
  API,
16
  EVAL_RESULTS_PATH,
17
+ REPO_ID, DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, METRIC_LIST, LATEST_BENCHMARK_VERSION
18
  )
19
  from src.loaders import (
20
  load_eval_results
 
49
  # print(f'failed to download')
50
  # restart_space()
51
 
52
# Every benchmark version, loaded once at import time: version name -> datastore.
# (No `global` statement is needed here: names assigned at module level are
# already module globals.)
data = load_eval_results(EVAL_RESULTS_PATH)
# Datastore the UI is currently showing. Starts at the latest version and is
# rebound by update_datastore() when the selected version changes.
datastore = data[LATEST_BENCHMARK_VERSION]
56
 
57
  def update_metric_qa(
58
  metric: str,
 
62
  query: str,
63
  show_anonymous: bool,
64
  show_revision_and_timestamp: bool,
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  ):
66
+ return update_metric(datastore, 'qa', metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
67
+
68
+
69
+ # def update_metric_long_doc(
70
+ # metric: str,
71
+ # domains: list,
72
+ # langs: list,
73
+ # reranking_model: list,
74
+ # query: str,
75
+ # show_anonymous: bool,
76
+ # show_revision_and_timestamp,
77
+ # ):
78
+ # return update_metric(datastore.raw_data, "long-doc", metric, domains, langs, reranking_model, query, show_anonymous, show_revision_and_timestamp)
79
+
80
+
81
def update_datastore(version):
    """Switch the QA tab to another benchmark version.

    Rebinds the module-level ``datastore`` to ``data[version]`` and rebuilds,
    through the ``get_*`` helpers, every component whose content depends on
    the version.

    Args:
        version: benchmark version name; must be a key of ``data``.

    Returns:
        A 5-tuple ``(domains, languages, rerankings, leaderboard table,
        hidden leaderboard table)``, in the same order as the outputs list
        passed to ``selected_version.change``.

    Raises:
        KeyError: if ``version`` is not a key of ``data``.
    """
    global datastore  # rebinding a module global; `data` is only read
    datastore = data[version]

    benchmarks = QABenchmarks[datastore.slug]
    selected_domains = get_domain_dropdown(benchmarks)
    selected_langs = get_language_dropdown(benchmarks)
    selected_rerankings = get_reranking_dropdown(datastore.reranking_models)

    # The visible table shows the display frame, exactly as in the initial
    # layout; only the hidden search table carries the raw frame.
    leaderboard_table = get_leaderboard_table(
        datastore.leaderboard_df_qa, datastore.types_qa)
    hidden_leaderboard_table_for_search = get_leaderboard_table(
        datastore.raw_df_qa, datastore.types_qa, visible=False)

    return (
        selected_domains,
        selected_langs,
        selected_rerankings,
        leaderboard_table,
        hidden_leaderboard_table_for_search,
    )
93
+ # DOMAIN_COLS_LONG_DOC = list(frozenset([c.value.domain for c in list(LongDocBenchmarks)]))
94
+ # LANG_COLS_LONG_DOC = list(frozenset([c.value.lang for c in list(LongDocBenchmarks)]))
95
 
96
  demo = gr.Blocks(css=custom_css)
97
 
 
109
  with gr.Column(min_width=320):
110
  # select domain
111
  with gr.Row():
112
+ selected_domains = get_domain_dropdown(QABenchmarks[datastore.slug])
113
+ # selected_domains = get_domain_dropdown(QABenchmarks["2404"])
114
  # select language
115
  with gr.Row():
116
+ selected_langs = get_language_dropdown(QABenchmarks[datastore.slug])
117
+ # selected_langs = get_language_dropdown(QABenchmarks["2404"])
118
  with gr.Column():
119
  # select the metric
120
  selected_metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
 
130
  search_bar = get_search_bar()
131
  # select reranking models
132
  with gr.Column():
133
+ selected_rerankings = get_reranking_dropdown(datastore.reranking_models)
134
+ # shown_table
135
+ leaderboard_table = get_leaderboard_table(
136
+ datastore.leaderboard_df_qa, datastore.types_qa)
137
  # Dummy leaderboard for handling the case when the user uses backspace key
138
+ hidden_leaderboard_table_for_search = get_leaderboard_table(
139
+ datastore.raw_df_qa, datastore.types_qa, visible=False)
140
 
141
+ selected_version.change(
142
+ update_datastore,
143
+ [selected_version,],
144
+ [selected_domains, selected_langs, selected_rerankings, leaderboard_table, hidden_leaderboard_table_for_search]
145
+ )
146
  set_listeners(
147
  "qa",
148
  leaderboard_table,
149
  hidden_leaderboard_table_for_search,
150
  search_bar,
151
+ selected_version,
152
  selected_domains,
153
  selected_langs,
154
  selected_rerankings,
 
167
  search_bar,
168
  show_anonymous,
169
  show_revision_and_timestamp,
 
170
  ],
171
  leaderboard_table,
172
  queue=True
 
487
  scheduler.start()
488
  demo.queue(default_concurrency_limit=40)
489
  demo.launch()
490
+
491
+
src/benchmarks.py CHANGED
@@ -26,33 +26,50 @@ class Benchmark:
26
 
27
 
28
  # create a function that returns the benchmark dicts used to build the enum classes below
29
- def get_benchmarks_enum(benchmark_version):
30
- qa_benchmark_dict = {}
31
- long_doc_benchmark_dict = {}
32
- for task, domain_dict in BenchmarkTable[benchmark_version].items():
33
- for domain, lang_dict in domain_dict.items():
34
- for lang, dataset_list in lang_dict.items():
35
- if task == "qa":
36
- benchmark_name = f"{domain}_{lang}"
37
- benchmark_name = get_safe_name(benchmark_name)
38
  col_name = benchmark_name
39
  for metric in dataset_list:
40
- qa_benchmark_dict[benchmark_name] = \
41
- Benchmark(
42
- benchmark_name, metric, col_name, domain, lang, task)
43
- elif task == "long-doc":
 
 
 
 
 
 
44
  for dataset in dataset_list:
45
  benchmark_name = f"{domain}_{lang}_{dataset}"
46
  benchmark_name = get_safe_name(benchmark_name)
47
  col_name = benchmark_name
48
  for metric in METRIC_LIST:
49
- long_doc_benchmark_dict[benchmark_name] = \
50
- Benchmark(
51
- benchmark_name, metric, col_name, domain,
52
- lang, task)
53
- return qa_benchmark_dict, long_doc_benchmark_dict
54
 
55
- _qa_benchmark_dict, _long_doc_benchmark_dict = get_benchmarks_enum('AIR-Bench_24.04')
56
 
57
- QABenchmarks = Enum('QABenchmarks', _qa_benchmark_dict)
58
- LongDocBenchmarks = Enum('LongDocBenchmarks', _long_doc_benchmark_dict)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  # create a function that returns a dict of all the benchmarks, used to build the enum classes below
29
def get_benchmarks_enum(benchmark_version, task_type):
    """Collect the benchmarks of one task for one benchmark version.

    Walks ``BenchmarkTable[benchmark_version]`` (task -> domain -> language ->
    datasets) and keeps only the entries whose task equals ``task_type``.

    * ``"qa"``: one entry per (domain, language), named
      ``get_safe_name(f"{domain}_{lang}")``. Only datasets that list a
      ``"test"`` split contribute; if several do, the last one wins.
    * ``"long-doc"``: one entry per (domain, language, dataset), named
      ``get_safe_name(f"{domain}_{lang}_{dataset}")``. The entry is assigned
      once per item of ``METRIC_LIST``, so the last metric wins.

    Any other ``task_type`` yields an empty dict without touching the table.

    Returns:
        dict mapping benchmark name -> ``Benchmark``.
    """
    benchmark_dict = {}
    if task_type not in ("qa", "long-doc"):
        return benchmark_dict

    for task, domain_dict in BenchmarkTable[benchmark_version].items():
        if task != task_type:
            continue
        for domain, lang_dict in domain_dict.items():
            for lang, dataset_list in lang_dict.items():
                if task_type == "qa":
                    name = get_safe_name(f"{domain}_{lang}")
                    for dataset in dataset_list:
                        if "test" in dataset_list[dataset]["splits"]:
                            benchmark_dict[name] = Benchmark(
                                name, dataset, name, domain, lang, task)
                else:
                    for dataset in dataset_list:
                        name = get_safe_name(f"{domain}_{lang}_{dataset}")
                        for metric in METRIC_LIST:
                            benchmark_dict[name] = Benchmark(
                                name, metric, name, domain, lang, task)
    return benchmark_dict
 
 
58
 
 
59
 
60
versions = ("AIR-Bench_24.04", "AIR-Bench_24.05")

# Per-version enums, keyed by the last four characters of the safe version
# name. Each value is itself an Enum of that version's benchmarks.
qa_benchmark_dict = {}
long_doc_benchmark_dict = {}
for version in versions:
    safe_version_name = get_safe_name(version)[-4:]
    qa_benchmark_dict[safe_version_name] = Enum(
        f"QABenchmarks_{safe_version_name}",
        get_benchmarks_enum(version, "qa"),
    )
    long_doc_benchmark_dict[safe_version_name] = Enum(
        f"LongDocBenchmarks_{safe_version_name}",
        get_benchmarks_enum(version, "long-doc"),
    )

# Outer enums: member name is the version slug, member value is the
# per-version benchmark enum built above.
QABenchmarks = Enum('QABenchmarks', qa_benchmark_dict)
LongDocBenchmarks = Enum('LongDocBenchmarks', long_doc_benchmark_dict)
src/display/columns.py CHANGED
@@ -1,16 +1,8 @@
1
  from dataclasses import dataclass, make_dataclass
2
 
3
  from src.benchmarks import QABenchmarks, LongDocBenchmarks
4
-
5
- COL_NAME_AVG = "Average ⬆️"
6
- COL_NAME_RETRIEVAL_MODEL = "Retrieval Method"
7
- COL_NAME_RERANKING_MODEL = "Reranking Model"
8
- COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
9
- COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
10
- COL_NAME_RANK = "Rank 🏆"
11
- COL_NAME_REVISION = "Revision"
12
- COL_NAME_TIMESTAMP = "Submission Date"
13
- COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
14
 
15
 
16
  def fields(raw_class):
@@ -69,7 +61,7 @@ def get_default_auto_eval_column_dict():
69
  def make_autoevalcolumn(cls_name, benchmarks):
70
  auto_eval_column_dict = get_default_auto_eval_column_dict()
71
  # Leaderboard columns
72
- for benchmark in benchmarks:
73
  auto_eval_column_dict.append(
74
  [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
75
  )
@@ -78,16 +70,28 @@ def make_autoevalcolumn(cls_name, benchmarks):
78
  return make_dataclass(cls_name, auto_eval_column_dict, frozen=True)
79
 
80
 
81
- AutoEvalColumnQA = make_autoevalcolumn("AutoEvalColumnQA", QABenchmarks)
82
- AutoEvalColumnLongDoc = make_autoevalcolumn("AutoEvalColumnLongDoc", LongDocBenchmarks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- fixed_cols = get_default_auto_eval_column_dict()[:-3]
85
 
86
- FIXED_COLS = [c.name for _, _, c in fixed_cols]
87
- FIXED_COLS_TYPES = [c.type for _, _, c in fixed_cols]
 
88
 
89
  # Column selection
90
- COLS_QA = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
91
- COLS_LONG_DOC = [c.name for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
92
- TYPES_QA = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
93
- TYPES_LONG_DOC = [c.type for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
 
1
  from dataclasses import dataclass, make_dataclass
2
 
3
  from src.benchmarks import QABenchmarks, LongDocBenchmarks
4
+ from src.envs import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
5
+ COL_NAME_RERANKING_MODEL_LINK, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 
 
 
 
 
 
 
 
6
 
7
 
8
  def fields(raw_class):
 
61
  def make_autoevalcolumn(cls_name, benchmarks):
62
  auto_eval_column_dict = get_default_auto_eval_column_dict()
63
  # Leaderboard columns
64
+ for benchmark in list(benchmarks.value):
65
  auto_eval_column_dict.append(
66
  [benchmark.name, ColumnContent, ColumnContent(benchmark.value.col_name, "number", True)]
67
  )
 
70
  return make_dataclass(cls_name, auto_eval_column_dict, frozen=True)
71
 
72
 
73
def get_default_col_names_and_types(benchmarks):
    """Return the names and types of the non-hidden leaderboard columns.

    Builds the column dataclass for ``benchmarks`` with
    ``make_autoevalcolumn`` and reads its columns through ``fields``.

    Returns:
        ``(col_names, col_types)``: two parallel lists covering every column
        whose ``hidden`` flag is false.
    """
    column_cls = make_autoevalcolumn("AutoEvalColumn", benchmarks)
    visible_columns = [column for column in fields(column_cls) if not column.hidden]
    col_names = [column.name for column in visible_columns]
    col_types = [column.type for column in visible_columns]
    return col_names, col_types
78
+
79
+ # AutoEvalColumnQA = make_autoevalcolumn("AutoEvalColumnQA", QABenchmarks)
80
+ # COLS_QA = [c.name for c in fields(AutoEvalColumnQA) if not c.hidden]
81
+ # TYPES_QA = [c.type for c in fields(AutoEvalColumnQA) if not c.hidden]
82
+
83
+
84
def get_fixed_col_names_and_types():
    """Return the names and types of the fixed leaderboard columns.

    The fixed columns are all entries of
    ``get_default_auto_eval_column_dict()`` except the last three.

    Returns:
        ``(names, types)``: two parallel lists.
    """
    names, types = [], []
    for _, _, column in get_default_auto_eval_column_dict()[:-3]:
        names.append(column.name)
        types.append(column.type)
    return names, types
87
+
88
+ # fixed_cols = get_default_auto_eval_column_dict()[:-3]
89
+ # FIXED_COLS = [c.name for _, _, c in fixed_cols]
90
+ # FIXED_COLS_TYPES = [c.type for _, _, c in fixed_cols]
91
 
 
92
 
93
+ # AutoEvalColumnLongDoc = make_autoevalcolumn("AutoEvalColumnLongDoc", LongDocBenchmarks)
94
+ # COLS_LONG_DOC = [c.name for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
95
+ # TYPES_LONG_DOC = [c.type for c in fields(AutoEvalColumnLongDoc) if not c.hidden]
96
 
97
  # Column selection
 
 
 
 
src/display/gradio_formatting.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  from src.envs import BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION
 
3
 
4
  def get_version_dropdown():
5
  return gr.Dropdown(
@@ -52,7 +53,10 @@ def get_metric_dropdown(metric_list, default_metrics):
52
  )
53
 
54
 
55
- def get_domain_dropdown(domain_list, default_domains):
 
 
 
56
  return gr.CheckboxGroup(
57
  choices=domain_list,
58
  value=default_domains,
@@ -61,7 +65,10 @@ def get_domain_dropdown(domain_list, default_domains):
61
  )
62
 
63
 
64
- def get_language_dropdown(language_list, default_languages):
 
 
 
65
  return gr.Dropdown(
66
  choices=language_list,
67
  value=default_languages,
 
1
  import gradio as gr
2
  from src.envs import BENCHMARK_VERSION_LIST, LATEST_BENCHMARK_VERSION
3
+ from src.benchmarks import QABenchmarks
4
 
5
  def get_version_dropdown():
6
  return gr.Dropdown(
 
53
  )
54
 
55
 
56
+ def get_domain_dropdown(benchmarks, default_domains=None):
57
+ domain_list = list(frozenset([c.value.domain for c in list(benchmarks.value)]))
58
+ if default_domains is None:
59
+ default_domains = domain_list
60
  return gr.CheckboxGroup(
61
  choices=domain_list,
62
  value=default_domains,
 
65
  )
66
 
67
 
68
+ def get_language_dropdown(benchmarks, default_languages=None):
69
+ language_list = list(frozenset([c.value.lang for c in list(benchmarks.value)]))
70
+ if default_languages is None:
71
+ default_languages = language_list
72
  return gr.Dropdown(
73
  choices=language_list,
74
  value=default_languages,
src/envs.py CHANGED
@@ -27,7 +27,7 @@ BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")
27
 
28
  BENCHMARK_VERSION_LIST = [
29
  "AIR-Bench_24.04",
30
- # "AIR-Bench_24.05",
31
  ]
32
 
33
  LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
@@ -65,3 +65,12 @@ METRIC_LIST = [
65
  "mrr_at_100",
66
  "mrr_at_1000"
67
  ]
 
 
 
 
 
 
 
 
 
 
27
 
28
  BENCHMARK_VERSION_LIST = [
29
  "AIR-Bench_24.04",
30
+ "AIR-Bench_24.05",
31
  ]
32
 
33
  LATEST_BENCHMARK_VERSION = BENCHMARK_VERSION_LIST[-1]
 
65
  "mrr_at_100",
66
  "mrr_at_1000"
67
  ]
68
+ COL_NAME_AVG = "Average ⬆️"
69
+ COL_NAME_RETRIEVAL_MODEL = "Retrieval Method"
70
+ COL_NAME_RERANKING_MODEL = "Reranking Model"
71
+ COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
72
+ COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
73
+ COL_NAME_RANK = "Rank 🏆"
74
+ COL_NAME_REVISION = "Revision"
75
+ COL_NAME_TIMESTAMP = "Submission Date"
76
+ COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
src/loaders.py CHANGED
@@ -3,8 +3,8 @@ from typing import List
3
 
4
  import pandas as pd
5
 
6
- from src.envs import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
7
- from src.display.columns import COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
8
 
9
  from src.models import FullEvalResult, LeaderboardDataStore
10
  from src.utils import get_default_cols, get_leaderboard_df
@@ -50,34 +50,43 @@ def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
50
  continue
51
  return results
52
 
53
-
54
- def load_leaderboard_datastore(file_path) -> LeaderboardDataStore:
55
- lb_data_store = LeaderboardDataStore(None, None, None, None, None, None, None, None)
 
 
 
 
 
 
 
 
56
  lb_data_store.raw_data = load_raw_eval_results(file_path)
57
  print(f'raw data: {len(lb_data_store.raw_data)}')
58
 
59
  lb_data_store.raw_df_qa = get_leaderboard_df(
60
- lb_data_store.raw_data, task='qa', metric=DEFAULT_METRIC_QA)
61
  print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
62
  lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
63
- shown_columns_qa, types_qa = get_default_cols(
64
- 'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
 
65
  lb_data_store.types_qa = types_qa
66
  lb_data_store.leaderboard_df_qa = \
67
  lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
68
  lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
69
 
70
- lb_data_store.raw_df_long_doc = get_leaderboard_df(
71
- lb_data_store.raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
72
- print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
73
- lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
74
- shown_columns_long_doc, types_long_doc = get_default_cols(
75
- 'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
76
- lb_data_store.types_long_doc = types_long_doc
77
- lb_data_store.leaderboard_df_long_doc = \
78
- lb_data_store.leaderboard_df_long_doc[
79
- ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
80
- lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
81
 
82
  lb_data_store.reranking_models = sorted(
83
  list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
@@ -86,8 +95,8 @@ def load_leaderboard_datastore(file_path) -> LeaderboardDataStore:
86
 
87
  def load_eval_results(file_path: str):
88
  output = {}
89
- versions = ("AIR-Bench_24.04",)
90
- for version in versions:
91
  fn = f"{file_path}/{version}"
92
- output[version] = load_leaderboard_datastore(fn)
93
  return output
 
3
 
4
  import pandas as pd
5
 
6
+ from src.envs import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC, COL_NAME_REVISION, COL_NAME_TIMESTAMP, \
7
+ COL_NAME_IS_ANONYMOUS, BENCHMARK_VERSION_LIST
8
 
9
  from src.models import FullEvalResult, LeaderboardDataStore
10
  from src.utils import get_default_cols, get_leaderboard_df
 
50
  continue
51
  return results
52
 
53
def get_safe_name(name: str) -> str:
    """Return a lowercase, identifier-style slug of *name*.

    Hyphens are turned into underscores; every remaining character that is
    neither alphanumeric nor an underscore is dropped, and what is left is
    lowercased. Because the result can contain underscores it is NOT a valid
    RFC 1123 label, so do not use it where a DNS-style name is required.

    >>> get_safe_name("AIR-Bench_24.04")
    'air_bench_2404'
    """
    return ''.join(
        character.lower()
        for character in name.replace('-', '_')
        if character.isalnum() or character == '_'
    )
60
+
61
+ def load_leaderboard_datastore(file_path, version) -> LeaderboardDataStore:
62
+ slug = get_safe_name(version)[-4:]
63
+ lb_data_store = LeaderboardDataStore(version, slug, None, None, None, None, None, None, None, None)
64
  lb_data_store.raw_data = load_raw_eval_results(file_path)
65
  print(f'raw data: {len(lb_data_store.raw_data)}')
66
 
67
  lb_data_store.raw_df_qa = get_leaderboard_df(
68
+ lb_data_store, task='qa', metric=DEFAULT_METRIC_QA)
69
  print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
70
  lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
71
+ shown_columns_qa, types_qa = get_default_cols('qa', lb_data_store.slug, add_fix_cols=True)
72
+ # shown_columns_qa, types_qa = get_default_cols(
73
+ # 'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
74
  lb_data_store.types_qa = types_qa
75
  lb_data_store.leaderboard_df_qa = \
76
  lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
77
  lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
78
 
79
+ # lb_data_store.raw_df_long_doc = get_leaderboard_df(
80
+ # lb_data_store, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
81
+ # print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
82
+ # lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
83
+ # shown_columns_long_doc, types_long_doc = get_default_cols(
84
+ # 'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
85
+ # lb_data_store.types_long_doc = types_long_doc
86
+ # lb_data_store.leaderboard_df_long_doc = \
87
+ # lb_data_store.leaderboard_df_long_doc[
88
+ # ~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][shown_columns_long_doc]
89
+ # lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
90
 
91
  lb_data_store.reranking_models = sorted(
92
  list(frozenset([eval_result.reranking_model for eval_result in lb_data_store.raw_data])))
 
95
 
96
  def load_eval_results(file_path: str):
97
  output = {}
98
+ # versions = BENCHMARK_VERSION_LIST
99
+ for version in BENCHMARK_VERSION_LIST:
100
  fn = f"{file_path}/{version}"
101
+ output[version] = load_leaderboard_datastore(fn, version)
102
  return output
src/models.py CHANGED
@@ -6,7 +6,7 @@ from typing import List, Optional
6
  import pandas as pd
7
 
8
  from src.benchmarks import get_safe_name
9
- from src.display.columns import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
10
  COL_NAME_RERANKING_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
11
  from src.display.formatting import make_clickable_model
12
 
@@ -128,6 +128,8 @@ class FullEvalResult:
128
 
129
  @dataclass
130
  class LeaderboardDataStore:
 
 
131
  raw_data: Optional[list]
132
  raw_df_qa: Optional[pd.DataFrame]
133
  raw_df_long_doc: Optional[pd.DataFrame]
 
6
  import pandas as pd
7
 
8
  from src.benchmarks import get_safe_name
9
+ from src.envs import COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL_LINK, \
10
  COL_NAME_RERANKING_MODEL_LINK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
11
  from src.display.formatting import make_clickable_model
12
 
 
128
 
129
  @dataclass
130
  class LeaderboardDataStore:
131
+ version: str
132
+ slug: str
133
  raw_data: Optional[list]
134
  raw_df_qa: Optional[pd.DataFrame]
135
  raw_df_long_doc: Optional[pd.DataFrame]
src/utils.py CHANGED
@@ -2,17 +2,14 @@ import json
2
  import hashlib
3
  from datetime import datetime, timezone
4
  from pathlib import Path
5
- from typing import List
6
 
7
  import pandas as pd
8
 
9
  from src.benchmarks import QABenchmarks, LongDocBenchmarks
10
  from src.display.formatting import styled_message, styled_error
11
- from src.display.columns import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, \
12
- COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS, COLS_QA, TYPES_QA, COLS_LONG_DOC, TYPES_LONG_DOC, \
13
- FIXED_COLS, FIXED_COLS_TYPES
14
- from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION
15
- from src.models import FullEvalResult
16
 
17
  import re
18
 
@@ -62,61 +59,95 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
62
  return df[(df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False))]
63
 
64
 
65
- def get_default_cols(task: str, columns: list=[], add_fix_cols: bool=True) -> list:
66
  cols = []
67
  types = []
68
  if task == "qa":
69
- cols_list = COLS_QA
70
- types_list = TYPES_QA
71
- benchmark_list = [c.value.col_name for c in list(QABenchmarks)]
72
- elif task == "long-doc":
73
- cols_list = COLS_LONG_DOC
74
- types_list = TYPES_LONG_DOC
75
- benchmark_list = [c.value.col_name for c in list(LongDocBenchmarks)]
76
  else:
77
  raise NotImplemented
78
  for col_name, col_type in zip(cols_list, types_list):
79
  if col_name not in benchmark_list:
80
  continue
81
- if len(columns) > 0 and col_name not in columns:
82
- continue
83
  cols.append(col_name)
84
  types.append(col_type)
85
 
86
  if add_fix_cols:
87
  _cols = []
88
  _types = []
 
89
  for col_name, col_type in zip(cols, types):
90
- if col_name in FIXED_COLS:
91
  continue
92
  _cols.append(col_name)
93
  _types.append(col_type)
94
- cols = FIXED_COLS + _cols
95
- types = FIXED_COLS_TYPES + _types
96
  return cols, types
97
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  def select_columns(
100
  df: pd.DataFrame,
101
  domain_query: list,
102
  language_query: list,
103
  task: str = "qa",
104
- reset_ranking: bool = True
 
105
  ) -> pd.DataFrame:
106
- cols, _ = get_default_cols(task=task, columns=df.columns, add_fix_cols=False)
107
  selected_cols = []
108
  for c in cols:
109
  if task == "qa":
110
- eval_col = QABenchmarks[c].value
111
  elif task == "long-doc":
112
- eval_col = LongDocBenchmarks[c].value
113
  if eval_col.domain not in domain_query:
114
  continue
115
  if eval_col.lang not in language_query:
116
  continue
117
  selected_cols.append(c)
118
  # We use COLS to maintain sorting
119
- filtered_df = df[FIXED_COLS + selected_cols]
 
 
120
  if reset_ranking:
121
  filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
122
  filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
@@ -125,9 +156,17 @@ def select_columns(
125
 
126
  return filtered_df
127
 
 
 
 
 
 
 
 
128
 
129
  def _update_table(
130
  task: str,
 
131
  hidden_df: pd.DataFrame,
132
  domains: list,
133
  langs: list,
@@ -137,33 +176,24 @@ def _update_table(
137
  reset_ranking: bool = True,
138
  show_revision_and_timestamp: bool = False
139
  ):
 
 
 
 
 
140
  filtered_df = hidden_df.copy()
141
  if not show_anonymous:
142
  filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
143
  filtered_df = filter_models(filtered_df, reranking_query)
144
  filtered_df = filter_queries(query, filtered_df)
145
- filtered_df = select_columns(filtered_df, domains, langs, task, reset_ranking)
146
  if not show_revision_and_timestamp:
147
  filtered_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
148
  return filtered_df
149
 
150
 
151
- def update_table(
152
- hidden_df: pd.DataFrame,
153
- domains: list,
154
- langs: list,
155
- reranking_query: list,
156
- query: str,
157
- show_anonymous: bool,
158
- show_revision_and_timestamp: bool = False,
159
- reset_ranking: bool = True
160
- ):
161
- return _update_table(
162
- "qa",
163
- hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
164
-
165
-
166
  def update_table_long_doc(
 
167
  hidden_df: pd.DataFrame,
168
  domains: list,
169
  langs: list,
@@ -176,11 +206,12 @@ def update_table_long_doc(
176
  ):
177
  return _update_table(
178
  "long-doc",
 
179
  hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
180
 
181
 
182
  def update_metric(
183
- raw_data: List[FullEvalResult],
184
  task: str,
185
  metric: str,
186
  domains: list,
@@ -190,9 +221,12 @@ def update_metric(
190
  show_anonymous: bool = False,
191
  show_revision_and_timestamp: bool = False,
192
  ) -> pd.DataFrame:
 
193
  if task == 'qa':
194
- leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
 
195
  return update_table(
 
196
  leaderboard_df,
197
  domains,
198
  langs,
@@ -202,8 +236,10 @@ def update_metric(
202
  show_revision_and_timestamp
203
  )
204
  elif task == "long-doc":
205
- leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
 
206
  return update_table_long_doc(
 
207
  leaderboard_df,
208
  domains,
209
  langs,
@@ -321,17 +357,20 @@ def reset_rank(df):
321
  return df
322
 
323
 
324
- def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
325
  """
326
  Creates a dataframe from all the individual experiment results
327
  """
 
328
  cols = [COL_NAME_IS_ANONYMOUS, ]
329
  if task == "qa":
330
- cols += COLS_QA
331
- benchmark_cols = [t.value.col_name for t in QABenchmarks]
332
- elif task == "long-doc":
333
- cols += COLS_LONG_DOC
334
- benchmark_cols = [t.value.col_name for t in LongDocBenchmarks]
 
 
335
  else:
336
  raise NotImplemented
337
  all_data_json = []
@@ -366,6 +405,7 @@ def set_listeners(
366
  target_df,
367
  source_df,
368
  search_bar,
 
369
  selected_domains,
370
  selected_langs,
371
  selected_rerankings,
@@ -385,11 +425,27 @@ def set_listeners(
385
  search_bar,
386
  show_anonymous
387
  ]
388
- search_bar_args = [source_df,] + selector_list
389
- selector_args = search_bar_args + [show_revision_and_timestamp,]
390
  # Set search_bar listener
391
  search_bar.submit(update_table_func, search_bar_args, target_df)
392
 
393
  # Set column-wise listener
394
  for selector in selector_list:
395
  selector.change(update_table_func, selector_args, target_df, queue=True,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import hashlib
3
  from datetime import datetime, timezone
4
  from pathlib import Path
 
5
 
6
  import pandas as pd
7
 
8
  from src.benchmarks import QABenchmarks, LongDocBenchmarks
9
  from src.display.formatting import styled_message, styled_error
10
+ from src.display.columns import get_default_col_names_and_types, get_fixed_col_names_and_types
11
+ from src.envs import API, SEARCH_RESULTS_REPO, LATEST_BENCHMARK_VERSION, COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, \
12
+ COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 
 
13
 
14
  import re
15
 
 
59
  return df[(df[COL_NAME_RETRIEVAL_MODEL].str.contains(query, case=False))]
60
 
61
 
62
+ def get_default_cols(task: str, version_slug, add_fix_cols: bool=True) -> tuple:
63
  cols = []
64
  types = []
65
  if task == "qa":
66
+ benchmarks = QABenchmarks[version_slug]
67
+ cols_list, types_list = get_default_col_names_and_types(benchmarks)
68
+ # cols_list = COLS_QA
69
+ # types_list = TYPES_QA
70
+ benchmark_list = [c.value.col_name for c in list(benchmarks.value)]
 
 
71
  else:
72
  raise NotImplemented
73
  for col_name, col_type in zip(cols_list, types_list):
74
  if col_name not in benchmark_list:
75
  continue
 
 
76
  cols.append(col_name)
77
  types.append(col_type)
78
 
79
  if add_fix_cols:
80
  _cols = []
81
  _types = []
82
+ fixed_cols, fixed_cols_types = get_fixed_col_names_and_types()
83
  for col_name, col_type in zip(cols, types):
84
+ if col_name in fixed_cols:
85
  continue
86
  _cols.append(col_name)
87
  _types.append(col_type)
88
+ cols = fixed_cols + _cols
89
+ types = fixed_cols_types + _types
90
  return cols, types
91
 
92
 
93
+ # def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
94
+ # cols = []
95
+ # types = []
96
+ # if task == "qa":
97
+ # cols_list = COLS_QA
98
+ # types_list = TYPES_QA
99
+ # benchmark_list = [c.value.col_name for c in list(QABenchmarks)]
100
+ # elif task == "long-doc":
101
+ # cols_list = COLS_LONG_DOC
102
+ # types_list = TYPES_LONG_DOC
103
+ # benchmark_list = [c.value.col_name for c in list(LongDocBenchmarks)]
104
+ # else:
105
+ # raise NotImplemented
106
+ # for col_name, col_type in zip(cols_list, types_list):
107
+ # if col_name not in benchmark_list:
108
+ # continue
109
+ # if len(columns) > 0 and col_name not in columns:
110
+ # continue
111
+ # cols.append(col_name)
112
+ # types.append(col_type)
113
+ #
114
+ # if add_fix_cols:
115
+ # _cols = []
116
+ # _types = []
117
+ # for col_name, col_type in zip(cols, types):
118
+ # if col_name in FIXED_COLS:
119
+ # continue
120
+ # _cols.append(col_name)
121
+ # _types.append(col_type)
122
+ # cols = FIXED_COLS + _cols
123
+ # types = FIXED_COLS_TYPES + _types
124
+ # return cols, types
125
+
126
+
127
  def select_columns(
128
  df: pd.DataFrame,
129
  domain_query: list,
130
  language_query: list,
131
  task: str = "qa",
132
+ reset_ranking: bool = True,
133
+ version_slug: str = None
134
  ) -> pd.DataFrame:
135
+ cols, _ = get_default_cols(task=task, version_slug=version_slug, add_fix_cols=False)
136
  selected_cols = []
137
  for c in cols:
138
  if task == "qa":
139
+ eval_col = QABenchmarks[version_slug].value[c].value
140
  elif task == "long-doc":
141
+ eval_col = LongDocBenchmarks[version_slug].value[c].value
142
  if eval_col.domain not in domain_query:
143
  continue
144
  if eval_col.lang not in language_query:
145
  continue
146
  selected_cols.append(c)
147
  # We use COLS to maintain sorting
148
+ fixed_cols, _ = get_fixed_col_names_and_types()
149
+ filtered_df = df[fixed_cols + selected_cols]
150
+ filtered_df.replace({"": pd.NA}, inplace=True)
151
  if reset_ranking:
152
  filtered_df[COL_NAME_AVG] = filtered_df[selected_cols].apply(calculate_mean, axis=1).round(decimals=2)
153
  filtered_df.sort_values(by=[COL_NAME_AVG], ascending=False, inplace=True)
 
156
 
157
  return filtered_df
158
 
159
+ def get_safe_name(name: str):
160
+ """Get RFC 1123 compatible safe name"""
161
+ name = name.replace('-', '_')
162
+ return ''.join(
163
+ character.lower()
164
+ for character in name
165
+ if (character.isalnum() or character == '_'))
166
 
167
  def _update_table(
168
  task: str,
169
+ version: str,
170
  hidden_df: pd.DataFrame,
171
  domains: list,
172
  langs: list,
 
176
  reset_ranking: bool = True,
177
  show_revision_and_timestamp: bool = False
178
  ):
179
+ version_slug = get_safe_name(version)[-4:]
180
+ if isinstance(hidden_df, str):
181
+ print(f"task: {task}")
182
+ print(f"version: {version}")
183
+ print(f"hidden_df is a string: {hidden_df}")
184
  filtered_df = hidden_df.copy()
185
  if not show_anonymous:
186
  filtered_df = filtered_df[~filtered_df[COL_NAME_IS_ANONYMOUS]]
187
  filtered_df = filter_models(filtered_df, reranking_query)
188
  filtered_df = filter_queries(query, filtered_df)
189
+ filtered_df = select_columns(filtered_df, domains, langs, task, reset_ranking, version_slug)
190
  if not show_revision_and_timestamp:
191
  filtered_df.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)
192
  return filtered_df
193
 
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  def update_table_long_doc(
196
+ version: str,
197
  hidden_df: pd.DataFrame,
198
  domains: list,
199
  langs: list,
 
206
  ):
207
  return _update_table(
208
  "long-doc",
209
+ version,
210
  hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
211
 
212
 
213
  def update_metric(
214
+ datastore,
215
  task: str,
216
  metric: str,
217
  domains: list,
 
221
  show_anonymous: bool = False,
222
  show_revision_and_timestamp: bool = False,
223
  ) -> pd.DataFrame:
224
+ # raw_data = datastore.raw_data
225
  if task == 'qa':
226
+ leaderboard_df = get_leaderboard_df(datastore, task=task, metric=metric)
227
+ version = datastore.version
228
  return update_table(
229
+ version,
230
  leaderboard_df,
231
  domains,
232
  langs,
 
236
  show_revision_and_timestamp
237
  )
238
  elif task == "long-doc":
239
+ leaderboard_df = get_leaderboard_df(datastore, task=task, metric=metric)
240
+ version = datastore.version
241
  return update_table_long_doc(
242
+ version,
243
  leaderboard_df,
244
  domains,
245
  langs,
 
357
  return df
358
 
359
 
360
+ def get_leaderboard_df(datastore, task: str, metric: str) -> pd.DataFrame:
361
  """
362
  Creates a dataframe from all the individual experiment results
363
  """
364
+ raw_data = datastore.raw_data
365
  cols = [COL_NAME_IS_ANONYMOUS, ]
366
  if task == "qa":
367
+ benchmarks = QABenchmarks[datastore.slug]
368
+ cols_qa, _ = get_default_col_names_and_types(benchmarks)
369
+ cols += cols_qa
370
+ benchmark_cols = [t.value.col_name for t in list(benchmarks.value)]
371
+ # elif task == "long-doc":
372
+ # cols += COLS_LONG_DOC
373
+ # benchmark_cols = [t.value.col_name for t in LongDocBenchmarks]
374
  else:
375
  raise NotImplemented
376
  all_data_json = []
 
405
  target_df,
406
  source_df,
407
  search_bar,
408
+ version,
409
  selected_domains,
410
  selected_langs,
411
  selected_rerankings,
 
425
  search_bar,
426
  show_anonymous
427
  ]
428
+ search_bar_args = [source_df, version,] + selector_list
429
+ selector_args = [version, source_df] + selector_list + [show_revision_and_timestamp,]
430
  # Set search_bar listener
431
  search_bar.submit(update_table_func, search_bar_args, target_df)
432
 
433
  # Set column-wise listener
434
  for selector in selector_list:
435
  selector.change(update_table_func, selector_args, target_df, queue=True,)
436
+
437
+ def update_table(
438
+ version: str,
439
+ hidden_df: pd.DataFrame,
440
+ domains: list,
441
+ langs: list,
442
+ reranking_query: list,
443
+ query: str,
444
+ show_anonymous: bool,
445
+ show_revision_and_timestamp: bool = False,
446
+ reset_ranking: bool = True,
447
+ ):
448
+ return _update_table(
449
+ "qa",
450
+ version,
451
+ hidden_df, domains, langs, reranking_query, query, show_anonymous, reset_ranking, show_revision_and_timestamp)
tests/src/test_benchmarks.py CHANGED
@@ -2,9 +2,14 @@ from src.benchmarks import QABenchmarks, LongDocBenchmarks
2
 
3
 
4
  def test_qabenchmarks():
5
- print(list(QABenchmarks))
6
- for benchmark in list(QABenchmarks):
7
- print(benchmark.name, benchmark.metric, benchmark.col_name, benchmark.domain, benchmark.lang, benchmark.task)
 
 
 
 
 
8
 
9
 
10
  def test_longdocbenchmarks():
 
2
 
3
 
4
  def test_qabenchmarks():
5
+ for benchmark_list in list(QABenchmarks):
6
+ print(benchmark_list.name)
7
+ for b in list(benchmark_list.value):
8
+ print(b)
9
+ qa_benchmarks = QABenchmarks["2404"]
10
+ l = list(frozenset([c.value.domain for c in list(qa_benchmarks.value)]))
11
+ print(l)
12
+
13
 
14
 
15
  def test_longdocbenchmarks():
tests/test_utils.py CHANGED
@@ -1,9 +1,10 @@
1
  import pandas as pd
2
  import pytest
3
 
4
- from src.utils import filter_models, search_table, filter_queries, select_columns, update_table_long_doc, get_iso_format_timestamp, get_default_cols, update_table
5
- from src.display.columns import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, \
6
- COL_NAME_REVISION, COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
 
7
 
8
 
9
  @pytest.fixture
 
1
  import pandas as pd
2
  import pytest
3
 
4
+ from src.utils import filter_models, search_table, filter_queries, select_columns, update_table_long_doc, get_iso_format_timestamp, get_default_cols
5
+ from app import update_table
6
+ from src.envs import COL_NAME_AVG, COL_NAME_RETRIEVAL_MODEL, COL_NAME_RERANKING_MODEL, COL_NAME_RANK, COL_NAME_REVISION, \
7
+ COL_NAME_TIMESTAMP, COL_NAME_IS_ANONYMOUS
8
 
9
 
10
  @pytest.fixture