Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
3d59d51
·
1 Parent(s): ea6034c

feat: fix the to_dict function

Browse files
src/leaderboard/read_evals.py CHANGED
@@ -1,4 +1,5 @@
1
  import glob
 
2
  import json
3
  import os.path
4
  from dataclasses import dataclass
@@ -6,7 +7,7 @@ from typing import List
6
 
7
  import dateutil.parser._parser
8
 
9
- from src.display.utils import AutoEvalColumn
10
  from src.benchmarks import get_safe_name
11
 
12
 
@@ -61,20 +62,19 @@ class FullEvalResult:
61
  results=result_list
62
  )
63
 
64
- def to_dict(self, task='qa', metric='ndcg_at_1'):
65
  """Convert FullEvalResult to a list of dict compatible with our dataframe UI
66
  """
67
- results = []
68
  for eval_result in self.results:
69
  if eval_result.metric != metric:
70
  continue
71
  if eval_result.task != task:
72
  continue
73
- data_dict = {
74
- "eval_name": eval_result.eval_name,
75
- AutoEvalColumn.retrieval_model.name: self.retrieval_model,
76
- AutoEvalColumn.reranking_model.name: self.reranking_model,
77
- }
78
  for result in eval_result.results:
79
  # add result for each domain, language, and dataset
80
  domain = result["domain"]
@@ -82,12 +82,11 @@ class FullEvalResult:
82
  dataset = result["dataset"]
83
  value = result["value"]
84
  if task == 'qa':
85
- benchmark_name = f"{task}_{domain}_{lang}"
86
  elif task == 'long_doc':
87
- benchmark_name = f"{task}_{domain}_{lang}_{dataset}_{metric}"
88
- data_dict[get_safe_name(benchmark_name)] = value
89
- results.append(data_dict)
90
- return results
91
 
92
  def update_with_request_file(self, request_path):
93
  """
@@ -148,7 +147,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> List[FullEval
148
  eval_results = {}
149
  for model_result_filepath in model_result_filepaths:
150
  # create evaluation results
151
- # TODO: fix the bug here, the running results should not be loaded
152
  eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
153
  # get the latest result that is finished
154
  eval_result.update_with_request_file(requests_path)
 
1
  import glob
2
+ from collections import defaultdict
3
  import json
4
  import os.path
5
  from dataclasses import dataclass
 
7
 
8
  import dateutil.parser._parser
9
 
10
+ from src.display.utils import AutoEvalColumnQA
11
  from src.benchmarks import get_safe_name
12
 
13
 
 
62
  results=result_list
63
  )
64
 
65
+ def to_dict(self, task='qa', metric='ndcg_at_1') -> List:
66
  """Convert FullEvalResult to a list of dict compatible with our dataframe UI
67
  """
68
+ results = defaultdict(dict)
69
  for eval_result in self.results:
70
  if eval_result.metric != metric:
71
  continue
72
  if eval_result.task != task:
73
  continue
74
+ results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
75
+ results[eval_result.eval_name][AutoEvalColumnQA.retrieval_model.name] = self.retrieval_model
76
+ results[eval_result.eval_name][AutoEvalColumnQA.reranking_model.name] = self.reranking_model
77
+
 
78
  for result in eval_result.results:
79
  # add result for each domain, language, and dataset
80
  domain = result["domain"]
 
82
  dataset = result["dataset"]
83
  value = result["value"]
84
  if task == 'qa':
85
+ benchmark_name = f"{domain}_{lang}"
86
  elif task == 'long_doc':
87
+ benchmark_name = f"{domain}_{lang}_{dataset}_{metric}"
88
+ results[eval_result.eval_name][get_safe_name(benchmark_name)] = value
89
+ return [v for v in results.values()]
 
90
 
91
  def update_with_request_file(self, request_path):
92
  """
 
147
  eval_results = {}
148
  for model_result_filepath in model_result_filepaths:
149
  # create evaluation results
 
150
  eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
151
  # get the latest result that is finished
152
  eval_result.update_with_request_file(requests_path)
src/populate.py CHANGED
@@ -4,7 +4,7 @@ import os
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
9
  from typing import Tuple
10
 
@@ -12,10 +12,13 @@ from typing import Tuple
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[list[EvalResult], pd.DataFrame]:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path)
15
- all_data_json = [v.to_dict() for v in raw_data]
 
 
16
 
17
  df = pd.DataFrame.from_records(all_data_json)
18
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
19
  df = df[cols].round(decimals=2)
20
 
21
  # filter out if any of the benchmarks have not been produced
 
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumnQA, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
9
  from typing import Tuple
10
 
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[list[EvalResult], pd.DataFrame]:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path)
15
+ all_data_json = []
16
+ for v in raw_data:
17
+ all_data_json += v.to_dict()
18
 
19
  df = pd.DataFrame.from_records(all_data_json)
20
+ df["Average ⬆️"] = df[benchmark_cols].mean(axis=1)
21
+ # df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
22
  df = df[cols].round(decimals=2)
23
 
24
  # filter out if any of the benchmarks have not been produced
tests/src/display/test_utils.py CHANGED
@@ -2,7 +2,6 @@ import pytest
2
  from src.display.utils import fields, AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS, COLS_LITE, TYPES, EVAL_COLS, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
3
 
4
 
5
- @pytest.mark.parametrize('auto_eval_column')
6
  def test_fields():
7
  for c in fields(AutoEvalColumnQA):
8
  print(c)
 
2
  from src.display.utils import fields, AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS, COLS_LITE, TYPES, EVAL_COLS, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
3
 
4
 
 
5
  def test_fields():
6
  for c in fields(AutoEvalColumnQA):
7
  print(c)
tests/src/leaderboard/test_read_evals.py CHANGED
@@ -14,8 +14,13 @@ def test_init_from_json_file():
14
  def test_to_dict():
15
  json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
16
  full_eval_result = FullEvalResult.init_from_json_file(json_fp)
17
- result_dict = full_eval_result.to_dict(task='qa', metric='ndcg_at_1')
18
- assert len(result_dict) == 2
 
 
 
 
 
19
 
20
 
21
  def test_get_request_file_for_model():
 
14
  def test_to_dict():
15
  json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
16
  full_eval_result = FullEvalResult.init_from_json_file(json_fp)
17
+ result_list = full_eval_result.to_dict(task='qa', metric='ndcg_at_1')
18
+ assert len(result_list) == 1
19
+ result_dict = result_list[0]
20
+ assert result_dict["Retrieval Model"] == "bge-m3"
21
+ assert result_dict["Reranking Model"] == "bge-reranker-v2-m3"
22
+ assert result_dict["qa_wiki_en"] is not None
23
+ assert result_dict["qa_wiki_zh"] is not None
24
 
25
 
26
  def test_get_request_file_for_model():
tests/src/test_populate.py CHANGED
@@ -3,10 +3,18 @@ from pathlib import Path
3
 
4
  cur_fp = Path(__file__)
5
 
 
6
  def test_get_leaderboard_df():
7
- requests_path = cur_fp.parents[2] / "toydata" / "test_requests"
8
- results_path = cur_fp.parents[2] / "toydata" / "test_results"
9
- cols = []
10
- benchmark_cols = []
11
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
12
- get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
 
 
 
 
 
 
 
 
3
 
4
  cur_fp = Path(__file__)
5
 
6
+
7
  def test_get_leaderboard_df():
8
+ requests_path = cur_fp.parents[1] / "toydata" / "test_requests"
9
+ results_path = cur_fp.parents[1] / "toydata" / "test_results"
10
+ cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
11
+ benchmark_cols = ['wiki_en', 'wiki_zh',]
12
+ raw_data, df = get_leaderboard_df(results_path, requests_path, cols, benchmark_cols)
13
+ assert df.shape[0] == 2
14
+ assert df["Retrieval Model"][0] == "bge-m3"
15
+ assert df["Retrieval Model"][1] == "bge-m3"
16
+ assert df["Reranking Model"][0] == "NoReranker"
17
+ assert df["Reranking Model"][1] == "bge-reranker-v2-m3"
18
+ assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
19
+
20
+