Space: AIR-Bench

Commit 1a2dba5 (1 parent: e22a0ca), committed by nan

feat: adapt to the latest data format

app.py CHANGED
@@ -27,12 +27,12 @@ try:
 except Exception:
     restart_space()
 
-raw_data = get_raw_eval_results(EVAL_RESULTS_PATH)
+raw_data = get_raw_eval_results(f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04")
 
 original_df_qa = get_leaderboard_df(
     raw_data, task='qa', metric='ndcg_at_3')
 original_df_long_doc = get_leaderboard_df(
-    raw_data, task='long_doc', metric='ndcg_at_3')
+    raw_data, task='long-doc', metric='ndcg_at_3')
 print(f'raw data: {len(raw_data)}')
 print(f'QA data loaded: {original_df_qa.shape}')
 print(f'Long-Doc data loaded: {len(original_df_long_doc)}')
@@ -42,7 +42,7 @@ shown_columns_qa = get_default_cols('qa', leaderboard_df_qa.columns, add_fix_col
 leaderboard_df_qa = leaderboard_df_qa[shown_columns_qa]
 
 leaderboard_df_long_doc = original_df_long_doc.copy()
-shown_columns_long_doc = get_default_cols('long_doc', leaderboard_df_long_doc.columns, add_fix_cols=True)
+shown_columns_long_doc = get_default_cols('long-doc', leaderboard_df_long_doc.columns, add_fix_cols=True)
 leaderboard_df_long_doc = leaderboard_df_long_doc[shown_columns_long_doc]
 
 
@@ -62,7 +62,7 @@ def update_metric_long_doc(
     reranking_model: list,
     query: str,
 ):
-    return update_metric(raw_data, 'long_doc', metric, domains, langs, reranking_model, query)
+    return update_metric(raw_data, "long-doc", metric, domains, langs, reranking_model, query)
 
 
 demo = gr.Blocks(css=custom_css)
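The first app.py change scopes result loading to a versioned sub-directory. Below is a minimal sketch of the layout this implies; only the "AIR-Bench_24.04" folder name and the results.json filename come from this commit, while the local EVAL_RESULTS_PATH value and the per-model sub-folders are assumptions:

import os

EVAL_RESULTS_PATH = "eval-results"  # hypothetical local checkout of the results repo
version_dir = f"{EVAL_RESULTS_PATH}/AIR-Bench_24.04"

# Assumed shape: <version_dir>/<retrieval_model>/<reranking_model>/results.json
for root, _, files in os.walk(version_dir):
    if "results.json" in files:
        print(os.path.join(root, "results.json"))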
src/benchmarks.py CHANGED
@@ -40,7 +40,7 @@ dataset_dict = {
         "arxiv": {
             "en": ["Arxiv", ]},
     },
-    "long_doc": {
+    "long-doc": {
         "arxiv": {
             "en": ["gpt-3", "llama2", "llm-survey", "gemini"],
         },
@@ -125,7 +125,7 @@ for task, domain_dict in dataset_dict.items():
                 col_name = benchmark_name
                 for metric in dataset_list:
                     qa_benchmark_dict[benchmark_name] = Benchmark(benchmark_name, metric, col_name, domain, lang, task)
-            elif task == "long_doc":
+            elif task == "long-doc":
                 for dataset in dataset_list:
                     benchmark_name = f"{domain}_{lang}_{dataset}"
                     benchmark_name = get_safe_name(benchmark_name)
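Under the renamed "long-doc" key, the loop above still builds one column per (domain, lang, dataset) triple. A small sketch of the resulting column name, using a stand-in get_safe_name (an assumption; the real helper lives in the repo and may normalise differently):

def get_safe_name(name: str) -> str:
    # stand-in: replace anything non-alphanumeric with "_"
    return "".join(ch if ch.isalnum() else "_" for ch in name)

domain, lang, dataset = "arxiv", "en", "gpt-3"
print(get_safe_name(f"{domain}_{lang}_{dataset}"))  # -> arxiv_en_gpt_3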
src/display/utils.py CHANGED
@@ -22,6 +22,8 @@ class ColumnContent:
 COL_NAME_AVG = "Average ⬆️"
 COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
 COL_NAME_RERANKING_MODEL = "Reranking Model"
+COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
+COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
 COL_NAME_RANK = "Rank 🏆"
 
 def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
@@ -34,6 +36,12 @@ def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
     auto_eval_column_dict.append(
         ["reranking_model", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True)]
     )
+    auto_eval_column_dict.append(
+        ["retrieval_model_link", ColumnContent, ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", False, hidden=True, never_hidden=False)]
+    )
+    auto_eval_column_dict.append(
+        ["reranking_model_link", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", False, hidden=True, never_hidden=False)]
+    )
     auto_eval_column_dict.append(
         ["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
     )
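The two new appends register hidden, non-default columns so the model links can travel with the dataframe without being rendered in the table. A ColumnContent-like dataclass consistent with the constructor calls above (an assumption; the real definition sits earlier in src/display/utils.py):

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# illustrative column name; the diff above reuses COL_NAME_RETRIEVAL_MODEL as the header
link_col = ColumnContent("Retrieval Model LINK", "markdown", False, hidden=True, never_hidden=False)
print(link_col.hidden)  # True: kept in the data, not shown by default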
src/leaderboard/read_evals.py CHANGED
@@ -12,6 +12,8 @@ from src.display.formatting import has_no_nan_values
 from src.display.utils import (
     COL_NAME_RERANKING_MODEL,
     COL_NAME_RETRIEVAL_MODEL,
+    COL_NAME_RERANKING_MODEL_LINK,
+    COL_NAME_RETRIEVAL_MODEL_LINK,
     COLS_QA,
     QA_BENCHMARK_COLS,
     COLS_LONG_DOC,
@@ -44,6 +46,8 @@ class FullEvalResult:
     eval_name: str  # name of the evaluation, [retrieval_model]_[reranking_model]
     retrieval_model: str
     reranking_model: str
+    retrieval_model_link: str
+    reranking_model_link: str
     results: List[EvalResult]  # results on all the EvalResults over different tasks and metrics.
     date: str = ""
 
@@ -58,10 +62,15 @@ class FullEvalResult:
 
         # store all the results for different metrics and tasks
         result_list = []
+        retrieval_model_link = ""
+        reranking_model_link = ""
         for item in model_data:
             config = item.get("config", {})
             # eval results for different metrics
             results = item.get("results", [])
+            retrieval_model_link=config["retreival_model_link"]
+            if config["reranking_model_link"] is not None:
+                reranking_model_link=""
             eval_result = EvalResult(
                 eval_name=f"{config['retrieval_model']}_{config['reranking_model']}_{config['metric']}",
                 retrieval_model=config["retrieval_model"],
@@ -75,6 +84,8 @@ class FullEvalResult:
             eval_name=f"{result_list[0].retrieval_model}_{result_list[0].reranking_model}",
             retrieval_model=result_list[0].retrieval_model,
             reranking_model=result_list[0].reranking_model,
+            retrieval_model_link=retrieval_model_link,
+            reranking_model_link=reranking_model_link,
             results=result_list
         )
 
@@ -91,6 +102,8 @@ class FullEvalResult:
             results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
             results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = self.retrieval_model
             results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = self.reranking_model
+            results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL_LINK] = self.retrieval_model_link
+            results[eval_result.eval_name][COL_NAME_RERANKING_MODEL_LINK] = self.reranking_model_link
 
             # print(f'result loaded: {eval_result.eval_name}')
             for result in eval_result.results:
@@ -99,9 +112,9 @@ class FullEvalResult:
                 lang = result["lang"]
                 dataset = result["dataset"]
                 value = result["value"]
-                if task == 'qa':
+                if dataset == 'default':
                     benchmark_name = f"{domain}_{lang}"
-                elif task == 'long_doc':
+                else:
                     benchmark_name = f"{domain}_{lang}_{dataset}"
                 results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
         return [v for v in results.values()]
@@ -115,13 +128,12 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
     for root, dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue
-       try:
-           files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
-       except dateutil.parser._parser.ParserError:
-           files = [files[-1]]
 
        # select the latest results
        for file in files:
+           if file != "results.json":
+               print(f'skip {file}')
+               continue
            model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
@@ -154,7 +166,7 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -
     if task == "qa":
         cols = COLS_QA
         benchmark_cols = QA_BENCHMARK_COLS
-    elif task == "long_doc":
+    elif task == "long-doc":
         cols = COLS_LONG_DOC
         benchmark_cols = LONG_DOC_BENCHMARK_COLS
     else:
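With this change the loader reads exactly one file per model pair (results.json) and chooses between a QA-style column and a Long-Doc column from the dataset field instead of a task key. A hedged sketch of one entry in such a file, reverse-engineered from FullEvalResult.from_json above; the values are made up and the keys are spelled exactly as the loader reads them:

import json

# results.json is assumed to hold a list of entries like this, one per metric
sample_entry = {
    "config": {
        "retrieval_model": "bge-base-en-v1.5",
        "reranking_model": "NoReranker",
        "retreival_model_link": "https://huggingface.co/BAAI/bge-base-en-v1.5",  # key as read by from_json
        "reranking_model_link": None,
        "metric": "ndcg_at_3",
    },
    "results": [
        # dataset == "default" -> QA column "<domain>_<lang>"
        {"domain": "wiki", "lang": "en", "dataset": "default", "value": 0.7},
        # anything else -> Long-Doc column "<domain>_<lang>_<dataset>"
        {"domain": "arxiv", "lang": "en", "dataset": "gpt-3", "value": 0.6},
    ],
}
print(json.dumps([sample_entry], indent=2))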
tests/src/leaderboard/test_read_evals.py CHANGED
@@ -28,35 +28,35 @@ def test_to_dict():
 
 
 def test_get_raw_eval_results():
-    results_path = cur_fp.parents[2] / "toydata" / "test_results" / "bge-m3"
+    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
     results = get_raw_eval_results(results_path)
     # only load the latest results
-    assert len(results) == 2
-    assert results[0].eval_name == "bge-m3_NoReranker"
-    assert len(results[0].results) == 6
-    assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
-    assert len(results[1].results) == 6
+    assert len(results) == 4
+    assert results[0].eval_name == "bge-base-en-v1.5_NoReranker"
+    assert len(results[0].results) == 70
+    assert results[0].eval_name == "bge-base-en-v1.5_bge-reranker-v2-m3"
+    assert len(results[1].results) == 70
 
 
 def test_get_leaderboard_df():
-    results_path = cur_fp.parents[2] / "toydata" / "test_results"
+    results_path = cur_fp.parents[2] / "toydata" / "eval_results" / "AIR-Bench_24.04"
     raw_data = get_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_1')
-    assert df.shape[0] == 2
+    df = get_leaderboard_df(raw_data, 'qa', 'ndcg_at_3')
+    assert df.shape[0] == 4
     # the results contain only one embedding model
-    for i in range(2):
-        assert df["Retrieval Model"][i] == "bge-m3"
-    # the results contain only two reranking model
-    assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
-    assert df["Reranking Model"][1] == "NoReranker"
-    assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
-    assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
+    # for i in range(4):
+    #     assert df["Retrieval Model"][i] == "bge-m3"
+    # # the results contain only two reranking model
+    # assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
+    # assert df["Reranking Model"][1] == "NoReranker"
+    # assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
+    # assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh', ]].isnull().values.any()
 
 
 def test_get_leaderboard_df_long_doc():
     results_path = cur_fp.parents[2] / "toydata" / "test_results"
     raw_data = get_raw_eval_results(results_path)
-    df = get_leaderboard_df(raw_data, 'long_doc', 'ndcg_at_1')
+    df = get_leaderboard_df(raw_data, 'long-doc', 'ndcg_at_1')
     assert df.shape[0] == 2
     # the results contain only one embedding model
     for i in range(2):
utils.py CHANGED
@@ -47,7 +47,7 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
 def get_default_cols(task: str, columns: list, add_fix_cols: bool=True) -> list:
     if task == "qa":
         cols = list(frozenset(COLS_QA).intersection(frozenset(BENCHMARK_COLS_QA)).intersection(frozenset(columns)))
-    elif task == "long_doc":
+    elif task == "long-doc":
         cols = list(frozenset(COLS_LONG_DOC).intersection(frozenset(BENCHMARK_COLS_LONG_DOC)).intersection(frozenset(columns)))
     else:
         raise NotImplemented
@@ -68,7 +68,7 @@ def select_columns(df: pd.DataFrame, domain_query: list, language_query: list, t
     for c in cols:
         if task == "qa":
             eval_col = BenchmarksQA[c].value
-        elif task == "long_doc":
+        elif task == "long-doc":
             eval_col = BenchmarksLongDoc[c].value
         if eval_col.domain not in domain_query:
             continue
@@ -127,7 +127,7 @@ def update_metric(
             reranking_model,
             query
         )
-    elif task == 'long_doc':
+    elif task == "long-doc":
         leaderboard_df = get_leaderboard_df(raw_data, task=task, metric=metric)
         return update_table_long_doc(
             leaderboard_df,
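Taken together, these hunks rename the task identifier everywhere it is dispatched on, so app.py, src/benchmarks.py, src/leaderboard/read_evals.py and utils.py must all agree on the hyphenated spelling. A minimal, self-contained sketch of that dispatch pattern (not the repo's code; names are illustrative):

def pick_cols(task: str) -> str:
    # mirrors the if/elif branches in get_default_cols and update_metric
    if task == "qa":
        return "QA columns"
    elif task == "long-doc":
        return "Long-Doc columns"
    else:
        raise NotImplementedError(f"unknown task: {task}")

print(pick_cols("long-doc"))   # ok after this commit
# pick_cols("long_doc")        # the old spelling no longer matches any branch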