Spaces:
AIR-Bench
/
Running on CPU Upgrade

nan committed on
Commit
3b83af7
·
1 Parent(s): e5c7cad

chore: clean up the requests related codes

Browse files
src/display/utils.py CHANGED
@@ -19,18 +19,22 @@ class ColumnContent:
19
  never_hidden: bool = False
20
 
21
 
 
 
 
 
22
  def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
25
  # Init
26
  auto_eval_column_dict.append(
27
- ["retrieval_model", ColumnContent, ColumnContent("Retrieval Model", "markdown", True, never_hidden=True)]
28
  )
29
  auto_eval_column_dict.append(
30
- ["reranking_model", ColumnContent, ColumnContent("Reranking Model", "markdown", True, never_hidden=True)]
31
  )
32
  auto_eval_column_dict.append(
33
- ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
34
  )
35
  for benchmark in benchmarks:
36
  auto_eval_column_dict.append(
 
19
  never_hidden: bool = False
20
 
21
 
22
+ COL_NAME_AVG = "Average ⬆️"
23
+ COL_NAME_RETRIEVAL_MODEL = "Retrieval Model"
24
+ COL_NAME_RERANKING_MODEL = "Reranking Model"
25
+
26
  def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
27
  ## Leaderboard columns
28
  auto_eval_column_dict = []
29
  # Init
30
  auto_eval_column_dict.append(
31
+ ["retrieval_model", ColumnContent, ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, never_hidden=True)]
32
  )
33
  auto_eval_column_dict.append(
34
+ ["reranking_model", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, never_hidden=True)]
35
  )
36
  auto_eval_column_dict.append(
37
+ ["average", ColumnContent, ColumnContent(COL_NAME_AVG, "number", True)]
38
  )
39
  for benchmark in benchmarks:
40
  auto_eval_column_dict.append(
src/leaderboard/read_evals.py CHANGED
@@ -1,24 +1,28 @@
1
- import glob
2
- from collections import defaultdict
3
  import json
4
  import os.path
 
5
  from dataclasses import dataclass
6
  from typing import List
7
 
8
  import dateutil.parser._parser
 
9
 
10
- from src.display.utils import AutoEvalColumnQA
11
  from src.benchmarks import get_safe_name
 
 
 
12
 
13
 
14
  @dataclass
15
  class EvalResult:
16
- """Full evaluation result of a single embedding model
 
 
17
  """
18
  eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
19
  retrieval_model: str
20
  reranking_model: str
21
- results: list # results on all the benchmarks over different domains, languages, and datasets. Use benchmark.name as the key
22
  task: str
23
  metric: str
24
  timestamp: str = "" # submission timestamp
@@ -26,6 +30,9 @@ class EvalResult:
26
 
27
  @dataclass
28
  class FullEvalResult:
 
 
 
29
  eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]
30
  retrieval_model: str
31
  reranking_model: str
@@ -34,7 +41,8 @@ class FullEvalResult:
34
 
35
  @classmethod
36
  def init_from_json_file(cls, json_filepath):
37
- """Initiate from the result json file for a single model.
 
38
  The json file will be written only when the status is FINISHED.
39
  """
40
  with open(json_filepath) as fp:
@@ -63,19 +71,18 @@ class FullEvalResult:
63
  )
64
 
65
  def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
66
- """Convert FullEvalResult to a list of dict compatible with our dataframe UI
 
67
  """
68
  results = defaultdict(dict)
69
  for eval_result in self.results:
70
  if eval_result.metric != metric:
71
- # print(f'result skipped: {metric} != {eval_result.metric}')
72
  continue
73
  if eval_result.task != task:
74
- # print(f'result skipped: {task} != {eval_result.task}')
75
  continue
76
  results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
77
- results[eval_result.eval_name][AutoEvalColumnQA.retrieval_model.name] = self.retrieval_model
78
- results[eval_result.eval_name][AutoEvalColumnQA.reranking_model.name] = self.reranking_model
79
 
80
  print(f'result loaded: {eval_result.eval_name}')
81
  for result in eval_result.results:
@@ -92,43 +99,20 @@ class FullEvalResult:
92
  return [v for v in results.values()]
93
 
94
 
95
- def get_request_file_for_model(requests_path, retrieval_model_name, reranking_model_name):
96
- """
97
- Load the request status from a json file
98
- """
99
- request_files = os.path.join(
100
- requests_path,
101
- f"{retrieval_model_name}",
102
- f"{reranking_model_name}",
103
- "eval_request_*.json",
104
- )
105
- request_files = glob.glob(request_files)
106
-
107
- request_file = ""
108
- request_files = sorted(request_files, reverse=True)
109
- for tmp_request_file in request_files:
110
- with open(tmp_request_file, "r") as f:
111
- req_content = json.load(f)
112
- if req_content["status"] in ["FINISHED"]:
113
- request_file = tmp_request_file
114
- break
115
- return request_file
116
-
117
-
118
  def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
119
  """
120
  Load the evaluation results from a json file
121
  """
122
  model_result_filepaths = []
123
  for root, dirs, files in os.walk(results_path):
124
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
125
  continue
126
  try:
127
  files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
128
  except dateutil.parser._parser.ParserError:
129
  files = [files[-1]]
130
 
131
- # select the latest and finished results
132
  for file in files:
133
  model_result_filepaths.append(os.path.join(root, file))
134
 
@@ -136,7 +120,6 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
136
  for model_result_filepath in model_result_filepaths:
137
  # create evaluation results
138
  eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
139
- model_result_date_str = model_result_filepath.split('/')[-1].removeprefix("results_").removesuffix(".json")
140
  print(f'file loaded: {model_result_filepath}')
141
  eval_name = eval_result.eval_name
142
  eval_results[eval_name] = eval_result
@@ -150,3 +133,35 @@ def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
150
  print(f"loading failed: {k}")
151
  continue
152
  return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import os.path
3
+ from collections import defaultdict
4
  from dataclasses import dataclass
5
  from typing import List
6
 
7
  import dateutil.parser._parser
8
+ import pandas as pd
9
 
 
10
  from src.benchmarks import get_safe_name
11
+ from src.display.formatting import has_no_nan_values
12
+ from src.display.utils import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL, COLS_QA, QA_BENCHMARK_COLS, \
13
+ COLS_LONG_DOC, LONG_DOC_BENCHMARK_COLS, COL_NAME_AVG
14
 
15
 
16
  @dataclass
17
  class EvalResult:
18
+ """
19
+ Evaluation result of a single embedding model with a specific reranking model on benchmarks over different
20
+ domains, languages, and datasets
21
  """
22
  eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]_[metric]
23
  retrieval_model: str
24
  reranking_model: str
25
+ results: list # results on all the benchmarks stored as dict
26
  task: str
27
  metric: str
28
  timestamp: str = "" # submission timestamp
 
30
 
31
  @dataclass
32
  class FullEvalResult:
33
+ """
34
+ Evaluation result of a single embedding model with a specific reranking model on benchmarks over different tasks
35
+ """
36
  eval_name: str # name of the evaluation, [retrieval_model]_[reranking_model]
37
  retrieval_model: str
38
  reranking_model: str
 
41
 
42
  @classmethod
43
  def init_from_json_file(cls, json_filepath):
44
+ """
45
+ Initiate from the result json file for a single model.
46
  The json file will be written only when the status is FINISHED.
47
  """
48
  with open(json_filepath) as fp:
 
71
  )
72
 
73
  def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
74
+ """
75
+ Convert the results in all the EvalResults over different tasks and metrics. The output is a list of dict compatible with the dataframe UI
76
  """
77
  results = defaultdict(dict)
78
  for eval_result in self.results:
79
  if eval_result.metric != metric:
 
80
  continue
81
  if eval_result.task != task:
 
82
  continue
83
  results[eval_result.eval_name]["eval_name"] = eval_result.eval_name
84
+ results[eval_result.eval_name][COL_NAME_RETRIEVAL_MODEL] = self.retrieval_model
85
+ results[eval_result.eval_name][COL_NAME_RERANKING_MODEL] = self.reranking_model
86
 
87
  print(f'result loaded: {eval_result.eval_name}')
88
  for result in eval_result.results:
 
99
  return [v for v in results.values()]
100
 
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  def get_raw_eval_results(results_path: str) -> List[FullEvalResult]:
103
  """
104
  Load the evaluation results from a json file
105
  """
106
  model_result_filepaths = []
107
  for root, dirs, files in os.walk(results_path):
108
+ if len(files) == 0:
109
  continue
110
  try:
111
  files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7], reverse=True)
112
  except dateutil.parser._parser.ParserError:
113
  files = [files[-1]]
114
 
115
+ # select the latest results
116
  for file in files:
117
  model_result_filepaths.append(os.path.join(root, file))
118
 
 
120
  for model_result_filepath in model_result_filepaths:
121
  # create evaluation results
122
  eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
 
123
  print(f'file loaded: {model_result_filepath}')
124
  eval_name = eval_result.eval_name
125
  eval_results[eval_name] = eval_result
 
133
  print(f"loading failed: {k}")
134
  continue
135
  return results
136
+
137
+
138
def get_leaderboard_df(raw_data: List[FullEvalResult], task: str, metric: str) -> pd.DataFrame:
    """
    Creates a dataframe from all the individual experiment results

    :param raw_data: evaluation results; each item's ``to_dict(task=..., metric=...)``
        is called and the returned records are concatenated.
    :param task: ``"qa"`` or ``"long_doc"``; selects which display columns and
        benchmark columns are used.
    :param metric: metric name forwarded to ``to_dict``.
    :return: a dataframe restricted to the configured display columns, sorted by
        the average column in descending order, with rows dropped where
        ``has_no_nan_values`` reports a missing benchmark value.
    :raises NotImplementedError: if ``task`` is neither ``"qa"`` nor ``"long_doc"``.
    """
    if task == "qa":
        cols = COLS_QA
        benchmark_cols = QA_BENCHMARK_COLS
    elif task == "long_doc":
        cols = COLS_LONG_DOC
        benchmark_cols = LONG_DOC_BENCHMARK_COLS
    else:
        # `NotImplemented` is a comparison sentinel, not an exception; raising it
        # fails with a TypeError instead of signalling the unsupported task.
        raise NotImplementedError(f"unsupported task: {task!r}")

    all_data_json = []
    for v in raw_data:
        all_data_json += v.to_dict(task=task, metric=metric)
    df = pd.DataFrame.from_records(all_data_json)
    print(f'dataframe created: {df.shape}')

    # calculate the average score for selected benchmarks
    # Ordered, de-duplicated lists are used instead of frozenset intersections so
    # that the column order follows the configuration and is stable across runs.
    available = set(df.columns.to_list())
    _benchmark_cols = list(dict.fromkeys(c for c in benchmark_cols if c in available))
    df[COL_NAME_AVG] = df[_benchmark_cols].mean(axis=1).round(decimals=2)
    df = df.sort_values(by=[COL_NAME_AVG], ascending=False)
    df.reset_index(inplace=True)

    # Recompute the available columns: the average and the former index were added above.
    available = set(df.columns.to_list())
    _cols = list(dict.fromkeys(c for c in cols if c in available))
    df = df[_cols].round(decimals=2)

    # filter out if any of the benchmarks have not been produced
    df = df[has_no_nan_values(df, _benchmark_cols)]
    return df
src/populate.py DELETED
@@ -1,94 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, FullEvalResult
9
- from typing import Tuple, List
10
-
11
-
12
- def get_leaderboard_df(raw_data: List[FullEvalResult], cols: list, benchmark_cols: list, task: str, metric: str) -> pd.DataFrame:
13
- """Creates a dataframe from all the individual experiment results"""
14
- all_data_json = []
15
- for v in raw_data:
16
- all_data_json += v.to_dict(task=task, metric=metric)
17
- df = pd.DataFrame.from_records(all_data_json)
18
- print(f'dataframe created: {df.shape}')
19
-
20
- # calculate the average score for selected benchmarks
21
- _benchmark_cols = frozenset(benchmark_cols).intersection(frozenset(df.columns.to_list()))
22
- if task == 'qa':
23
- df[AutoEvalColumnQA.average.name] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
24
- df = df.sort_values(by=[AutoEvalColumnQA.average.name], ascending=False)
25
- elif task == "long_doc":
26
- df[AutoEvalColumnLongDoc.average.name] = df[list(_benchmark_cols)].mean(axis=1).round(decimals=2)
27
- df = df.sort_values(by=[AutoEvalColumnLongDoc.average.name], ascending=False)
28
-
29
- df.reset_index(inplace=True)
30
-
31
- _cols = frozenset(cols).intersection(frozenset(df.columns.to_list()))
32
- df = df[_cols].round(decimals=2)
33
-
34
- # filter out if any of the benchmarks have not been produced
35
- df = df[has_no_nan_values(df, _benchmark_cols)]
36
- return df
37
-
38
-
39
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
40
- """Creates the different dataframes for the evaluation queues requests"""
41
- # entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
42
- # all_evals = []
43
- #
44
- # for entry in entries:
45
- # if ".json" in entry:
46
- # file_path = os.path.join(save_path, entry)
47
- # with open(file_path) as fp:
48
- # data = json.load(fp)
49
- #
50
- # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
51
- # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
52
- #
53
- # all_evals.append(data)
54
- # elif ".md" not in entry:
55
- # # this is a folder
56
- # sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
57
- # for sub_entry in sub_entries:
58
- # file_path = os.path.join(save_path, entry, sub_entry)
59
- # with open(file_path) as fp:
60
- # data = json.load(fp)
61
- #
62
- # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
63
- # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
64
- # all_evals.append(data)
65
- #
66
- # pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
67
- # running_list = [e for e in all_evals if e["status"] == "RUNNING"]
68
- # finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
69
- # df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
70
- # df_running = pd.DataFrame.from_records(running_list, columns=cols)
71
- # df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
72
- cols = ["Retrieval Model", "Submitted Time", "Status"]
73
- df_finished = pd.DataFrame(
74
- {
75
- "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
76
- "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
77
- "Status": ["FINISHED", "FINISHED"]
78
- }
79
- )
80
- df_running = pd.DataFrame(
81
- {
82
- "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
83
- "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
84
- "Status": ["RUNNING", "RUNNING"]
85
- }
86
- )
87
- df_pending = pd.DataFrame(
88
- {
89
- "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
90
- "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
91
- "Status": ["PENDING", "PENDING"]
92
- }
93
- )
94
- return df_finished, df_running, df_pending
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/src/leaderboard/test_read_evals.py CHANGED
@@ -1,6 +1,6 @@
1
  from pathlib import Path
2
 
3
- from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results
4
 
5
  cur_fp = Path(__file__)
6
 
@@ -8,7 +8,11 @@ cur_fp = Path(__file__)
8
  def test_init_from_json_file():
9
  json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
10
  full_eval_result = FullEvalResult.init_from_json_file(json_fp)
11
- assert len(full_eval_result.results) == 6
 
 
 
 
12
 
13
 
14
  def test_to_dict():
@@ -32,3 +36,32 @@ def test_get_raw_eval_results():
32
  assert len(results[0].results) == 6
33
  assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
34
  assert len(results[1].results) == 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from pathlib import Path
2
 
3
+ from src.leaderboard.read_evals import FullEvalResult, get_raw_eval_results, get_leaderboard_df
4
 
5
  cur_fp = Path(__file__)
6
 
 
8
  def test_init_from_json_file():
9
  json_fp = cur_fp.parents[2] / "toydata" / "test_data.json"
10
  full_eval_result = FullEvalResult.init_from_json_file(json_fp)
11
+ num_different_task_domain_lang_metric_dataset_combination = 6
12
+ assert len(full_eval_result.results) == \
13
+ num_different_task_domain_lang_metric_dataset_combination
14
+ assert full_eval_result.retrieval_model == "bge-m3"
15
+ assert full_eval_result.reranking_model == "bge-reranker-v2-m3"
16
 
17
 
18
  def test_to_dict():
 
36
  assert len(results[0].results) == 6
37
  assert results[1].eval_name == "bge-m3_bge-reranker-v2-m3"
38
  assert len(results[1].results) == 6
39
+
40
def test_get_leaderboard_df():
    """Build the QA leaderboard from the toy results and check its rows and columns."""
    toy_results_dir = cur_fp.parents[2] / "toydata" / "test_results"
    leaderboard = get_leaderboard_df(get_raw_eval_results(toy_results_dir), 'qa', 'ndcg_at_1')
    assert leaderboard.shape[0] == 2
    # both rows belong to the same retrieval model
    for row in range(2):
        assert leaderboard["Retrieval Model"][row] == "bge-m3"
    # the two rows differ only in the reranking model
    assert leaderboard["Reranking Model"][0] == "bge-reranker-v2-m3"
    assert leaderboard["Reranking Model"][1] == "NoReranker"
    # the first row must have the higher average
    assert leaderboard["Average ⬆️"][0] > leaderboard["Average ⬆️"][1]
    assert not leaderboard[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
53
+
54
+
55
def test_get_leaderboard_df_long_doc():
    """Build the long-doc leaderboard from the toy results and check its rows and columns."""
    toy_results_dir = cur_fp.parents[2] / "toydata" / "test_results"
    leaderboard = get_leaderboard_df(get_raw_eval_results(toy_results_dir), 'long_doc', 'ndcg_at_1')
    assert leaderboard.shape[0] == 2
    # both rows belong to the same retrieval model
    for row in range(2):
        assert leaderboard["Retrieval Model"][row] == "bge-m3"
    # the two rows differ only in the reranking model
    assert leaderboard["Reranking Model"][0] == "bge-reranker-v2-m3"
    assert leaderboard["Reranking Model"][1] == "NoReranker"
    # the first row must have the higher average
    assert leaderboard["Average ⬆️"][0] > leaderboard["Average ⬆️"][1]
    assert not leaderboard[['Average ⬆️', 'law_en_lex_files_500k_600k',]].isnull().values.any()
tests/src/test_populate.py DELETED
@@ -1,41 +0,0 @@
1
- from src.populate import get_leaderboard_df
2
- from src.leaderboard.read_evals import get_raw_eval_results
3
- from pathlib import Path
4
-
5
- cur_fp = Path(__file__)
6
-
7
-
8
- def test_get_leaderboard_df():
9
- requests_path = cur_fp.parents[1] / "toydata" / "test_requests"
10
- results_path = cur_fp.parents[1] / "toydata" / "test_results"
11
- cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'wiki_en', 'wiki_zh',]
12
- benchmark_cols = ['wiki_en', 'wiki_zh',]
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- df = get_leaderboard_df(raw_data, cols, benchmark_cols, 'qa', 'ndcg_at_1')
15
- assert df.shape[0] == 2
16
- # the results contain only one embedding model
17
- for i in range(2):
18
- assert df["Retrieval Model"][i] == "bge-m3"
19
- # the results contains only two reranking model
20
- assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
21
- assert df["Reranking Model"][1] == "NoReranker"
22
- assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
23
- assert not df[['Average ⬆️', 'wiki_en', 'wiki_zh',]].isnull().values.any()
24
-
25
-
26
- def test_get_leaderboard_df_long_doc():
27
- requests_path = cur_fp.parents[1] / "toydata" / "test_requests"
28
- results_path = cur_fp.parents[1] / "toydata" / "test_results"
29
- cols = ['Retrieval Model', 'Reranking Model', 'Average ⬆️', 'law_en_lex_files_500k_600k',]
30
- benchmark_cols = ['law_en_lex_files_500k_600k',]
31
- raw_data = get_raw_eval_results(results_path, requests_path)
32
- df = get_leaderboard_df(raw_data, cols, benchmark_cols, 'long_doc', 'ndcg_at_1')
33
- assert df.shape[0] == 2
34
- # the results contain only one embedding model
35
- for i in range(2):
36
- assert df["Retrieval Model"][i] == "bge-m3"
37
- # the results contains only two reranking model
38
- assert df["Reranking Model"][0] == "bge-reranker-v2-m3"
39
- assert df["Reranking Model"][1] == "NoReranker"
40
- assert df["Average ⬆️"][0] > df["Average ⬆️"][1]
41
- assert not df[['Average ⬆️', 'law_en_lex_files_500k_600k',]].isnull().values.any()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tests/toydata/test_results/bge-m3/NoReranker/results_2023-11-21T18-10-08.json CHANGED
@@ -11,7 +11,7 @@
11
  "domain": "law",
12
  "lang": "en",
13
  "dataset": "lex_files_500K-600K",
14
- "value": 0.75723
15
  }
16
  ]
17
  },
@@ -27,7 +27,7 @@
27
  "domain": "law",
28
  "lang": "en",
29
  "dataset": "lex_files_500K-600K",
30
- "value": 0.69909
31
  }
32
  ]
33
  },
@@ -43,7 +43,7 @@
43
  "domain": "wiki",
44
  "lang": "en",
45
  "dataset": "unknown",
46
- "value": 0.69083
47
  }
48
  ]
49
  },
@@ -59,7 +59,7 @@
59
  "domain": "wiki",
60
  "lang": "en",
61
  "dataset": "unknown",
62
- "value": 0.73359
63
  }
64
  ]
65
  },
 
11
  "domain": "law",
12
  "lang": "en",
13
  "dataset": "lex_files_500K-600K",
14
+ "value": 0.45723
15
  }
16
  ]
17
  },
 
27
  "domain": "law",
28
  "lang": "en",
29
  "dataset": "lex_files_500K-600K",
30
+ "value": 0.49909
31
  }
32
  ]
33
  },
 
43
  "domain": "wiki",
44
  "lang": "en",
45
  "dataset": "unknown",
46
+ "value": 0.49083
47
  }
48
  ]
49
  },
 
59
  "domain": "wiki",
60
  "lang": "en",
61
  "dataset": "unknown",
62
+ "value": 0.43359
63
  }
64
  ]
65
  },
utils.py CHANGED
@@ -7,9 +7,8 @@ from huggingface_hub import HfApi
7
 
8
  from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
9
  from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
10
- from src.leaderboard.read_evals import FullEvalResult
11
  from typing import List
12
- from src.populate import get_leaderboard_df
13
 
14
 
15
  def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame:
 
7
 
8
  from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
9
  from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
10
+ from src.leaderboard.read_evals import FullEvalResult, get_leaderboard_df
11
  from typing import List
 
12
 
13
 
14
  def filter_models(df: pd.DataFrame, reranking_query: list) -> pd.DataFrame: