Update src/leaderboard/read_evals.py

#6
by jcole1 - opened
app.py CHANGED
@@ -1,5 +1,5 @@
1
- import subprocess
2
  import gradio as gr
 
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
@@ -18,8 +18,6 @@ from src.display.utils import (
18
  COLS,
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
- NUMERIC_INTERVALS,
22
- TYPES,
23
  AutoEvalColumn,
24
  ModelType,
25
  fields,
@@ -34,6 +32,7 @@ from src.submission.submit import add_new_eval
34
  def restart_space():
35
  API.restart_space(repo_id=REPO_ID)
36
 
 
37
  try:
38
  print(EVAL_REQUESTS_PATH)
39
  snapshot_download(
@@ -50,8 +49,7 @@ except Exception:
50
  restart_space()
51
 
52
 
53
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
54
- leaderboard_df = original_df.copy()
55
 
56
  (
57
  finished_eval_queue_df,
@@ -59,77 +57,36 @@ leaderboard_df = original_df.copy()
59
  pending_eval_queue_df,
60
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
61
 
62
-
63
- # Searching and filtering
64
- def update_table(
65
- hidden_df: pd.DataFrame,
66
- columns: list,
67
- type_query: list,
68
- precision_query: str,
69
- size_query: list,
70
- show_deleted: bool,
71
- query: str,
72
- ):
73
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
74
- filtered_df = filter_queries(query, filtered_df)
75
- df = select_columns(filtered_df, columns)
76
- return df
77
-
78
-
79
- def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
80
- return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
81
-
82
-
83
- def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
84
- always_here_cols = [
85
- AutoEvalColumn.model_type_symbol.name,
86
- AutoEvalColumn.model.name,
87
- ]
88
- # We use COLS to maintain sorting
89
- filtered_df = df[
90
- always_here_cols + [c for c in COLS if c in df.columns and c in columns]
91
- ]
92
- return filtered_df
93
-
94
-
95
- def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
96
- final_df = []
97
- if query != "":
98
- queries = [q.strip() for q in query.split(";")]
99
- for _q in queries:
100
- _q = _q.strip()
101
- if _q != "":
102
- temp_filtered_df = search_table(filtered_df, _q)
103
- if len(temp_filtered_df) > 0:
104
- final_df.append(temp_filtered_df)
105
- if len(final_df) > 0:
106
- filtered_df = pd.concat(final_df)
107
- filtered_df = filtered_df.drop_duplicates(
108
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
109
- )
110
-
111
- return filtered_df
112
-
113
-
114
- def filter_models(
115
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
116
- ) -> pd.DataFrame:
117
- # Show all models
118
- if show_deleted:
119
- filtered_df = df
120
- else: # Show only still on the hub models
121
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
122
-
123
- type_emoji = [t[0] for t in type_query]
124
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
125
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
126
-
127
- numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
128
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
129
- mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
130
- filtered_df = filtered_df.loc[mask]
131
-
132
- return filtered_df
133
 
134
 
135
  demo = gr.Blocks(css=custom_css)
@@ -139,105 +96,7 @@ with demo:
139
 
140
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
141
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
142
- with gr.Row():
143
- with gr.Column():
144
- with gr.Row():
145
- search_bar = gr.Textbox(
146
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
147
- show_label=False,
148
- elem_id="search-bar",
149
- )
150
- with gr.Row():
151
- shown_columns = gr.CheckboxGroup(
152
- choices=[
153
- c.name
154
- for c in fields(AutoEvalColumn)
155
- if not c.hidden and not c.never_hidden
156
- ],
157
- value=[
158
- c.name
159
- for c in fields(AutoEvalColumn)
160
- if c.displayed_by_default and not c.hidden and not c.never_hidden
161
- ],
162
- label="Select columns to show",
163
- elem_id="column-select",
164
- interactive=True,
165
- )
166
- with gr.Row():
167
- deleted_models_visibility = gr.Checkbox(
168
- value=False, label="Show gated/private/deleted models", interactive=True
169
- )
170
- with gr.Column(min_width=320):
171
- #with gr.Box(elem_id="box-filter"):
172
- filter_columns_type = gr.CheckboxGroup(
173
- label="Model types",
174
- choices=[t.to_str() for t in ModelType],
175
- value=[t.to_str() for t in ModelType],
176
- interactive=True,
177
- elem_id="filter-columns-type",
178
- )
179
- filter_columns_precision = gr.CheckboxGroup(
180
- label="Precision",
181
- choices=[i.value.name for i in Precision],
182
- value=[i.value.name for i in Precision],
183
- interactive=True,
184
- elem_id="filter-columns-precision",
185
- )
186
- filter_columns_size = gr.CheckboxGroup(
187
- label="Model sizes (in billions of parameters)",
188
- choices=list(NUMERIC_INTERVALS.keys()),
189
- value=list(NUMERIC_INTERVALS.keys()),
190
- interactive=True,
191
- elem_id="filter-columns-size",
192
- )
193
-
194
- leaderboard_table = gr.components.Dataframe(
195
- value=leaderboard_df[
196
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
197
- + shown_columns.value
198
- ],
199
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
200
- datatype=TYPES,
201
- elem_id="leaderboard-table",
202
- interactive=False,
203
- visible=True,
204
- )
205
-
206
- # Dummy leaderboard for handling the case when the user uses backspace key
207
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
208
- value=original_df[COLS],
209
- headers=COLS,
210
- datatype=TYPES,
211
- visible=False,
212
- )
213
- search_bar.submit(
214
- update_table,
215
- [
216
- hidden_leaderboard_table_for_search,
217
- shown_columns,
218
- filter_columns_type,
219
- filter_columns_precision,
220
- filter_columns_size,
221
- deleted_models_visibility,
222
- search_bar,
223
- ],
224
- leaderboard_table,
225
- )
226
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
227
- selector.change(
228
- update_table,
229
- [
230
- hidden_leaderboard_table_for_search,
231
- shown_columns,
232
- filter_columns_type,
233
- filter_columns_precision,
234
- filter_columns_size,
235
- deleted_models_visibility,
236
- search_bar,
237
- ],
238
- leaderboard_table,
239
- queue=True,
240
- )
241
 
242
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
243
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
1
  import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
 
18
  COLS,
19
  EVAL_COLS,
20
  EVAL_TYPES,
 
 
21
  AutoEvalColumn,
22
  ModelType,
23
  fields,
 
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
35
+ ### Space initialisation
36
  try:
37
  print(EVAL_REQUESTS_PATH)
38
  snapshot_download(
 
49
  restart_space()
50
 
51
 
52
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
53
 
54
  (
55
  finished_eval_queue_df,
 
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
60
def init_leaderboard(dataframe):
    """Build the ``Leaderboard`` component shown in the benchmark tab.

    Args:
        dataframe: The leaderboard data to display.

    Returns:
        A ``Leaderboard`` configured with column selection, search and
        filter widgets derived from ``AutoEvalColumn``.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or has no rows.
    """
    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Materialise the column descriptors once and derive every per-column
    # setting from the same snapshot.
    column_specs = tuple(fields(AutoEvalColumn))

    column_types = [spec.type for spec in column_specs]
    shown_by_default = [spec.name for spec in column_specs if spec.displayed_by_default]
    always_shown = [spec.name for spec in column_specs if spec.never_hidden]
    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=always_shown,
        label="Select Columns to Display:",
    )

    searchable = [AutoEvalColumn.model.name, AutoEvalColumn.license.name]
    hidden = [spec.name for spec in column_specs if spec.hidden]

    # A boolean filter on AutoEvalColumn.still_on_hub is currently disabled.
    model_type_filter = ColumnFilter(
        AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"
    )
    precision_filter = ColumnFilter(
        AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"
    )
    params_filter = ColumnFilter(
        AutoEvalColumn.params.name,
        type="slider",
        min=0,
        max=2000,
        label="Select the number of parameters (M)",
    )

    return Leaderboard(
        value=dataframe,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=searchable,
        hide_columns=hidden,
        filter_columns=[model_type_filter, precision_filter, params_filter],
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
  demo = gr.Blocks(css=custom_css)
 
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
requirements.txt CHANGED
@@ -15,4 +15,5 @@ transformers==4.35.2
15
  tokenizers>=0.15.0
16
  git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
  accelerate==0.24.1
18
- sentencepiece
 
 
15
  tokenizers>=0.15.0
16
  git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
  accelerate==0.24.1
18
+ sentencepiece
19
+ gradio_leaderboard
src/display/utils.py CHANGED
@@ -26,7 +26,7 @@ auto_eval_column_dict = []
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
@@ -91,10 +91,6 @@ class WeightType(Enum):
91
  class Precision(Enum):
92
  float16 = ModelDetails("float16")
93
  bfloat16 = ModelDetails("bfloat16")
94
- float32 = ModelDetails("float32")
95
- #qt_8bit = ModelDetails("8bit")
96
- #qt_4bit = ModelDetails("4bit")
97
- #qt_GPTQ = ModelDetails("GPTQ")
98
  Unknown = ModelDetails("?")
99
 
100
  def from_str(precision):
@@ -102,34 +98,13 @@ class Precision(Enum):
102
  return Precision.float16
103
  if precision in ["torch.bfloat16", "bfloat16"]:
104
  return Precision.bfloat16
105
- if precision in ["float32"]:
106
- return Precision.float32
107
- #if precision in ["8bit"]:
108
- # return Precision.qt_8bit
109
- #if precision in ["4bit"]:
110
- # return Precision.qt_4bit
111
- #if precision in ["GPTQ", "None"]:
112
- # return Precision.qt_GPTQ
113
  return Precision.Unknown
114
 
115
  # Column selection
116
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
117
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
118
- COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
119
- TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
120
 
121
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
122
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
123
 
124
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
125
 
126
- NUMERIC_INTERVALS = {
127
- "?": pd.Interval(-1, 0, closed="right"),
128
- "~1.5": pd.Interval(0, 2, closed="right"),
129
- "~3": pd.Interval(2, 4, closed="right"),
130
- "~7": pd.Interval(4, 9, closed="right"),
131
- "~13": pd.Interval(9, 20, closed="right"),
132
- "~35": pd.Interval(20, 45, closed="right"),
133
- "~60": pd.Interval(45, 70, closed="right"),
134
- "70+": pd.Interval(70, 10000, closed="right"),
135
- }
 
26
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
  #Scores
29
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
30
  for task in Tasks:
31
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
 
91
  class Precision(Enum):
92
  float16 = ModelDetails("float16")
93
  bfloat16 = ModelDetails("bfloat16")
 
 
 
 
94
  Unknown = ModelDetails("?")
95
 
96
  def from_str(precision):
 
98
  return Precision.float16
99
  if precision in ["torch.bfloat16", "bfloat16"]:
100
  return Precision.bfloat16
 
 
 
 
 
 
 
 
101
  return Precision.Unknown
102
 
103
  # Column selection
104
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 
 
105
 
106
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
  BENCHMARK_COLS = [t.value.col_name for t in Tasks]
110
 
 
 
 
 
 
 
 
 
 
 
src/envs.py CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "yangheng" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
src/leaderboard/read_evals.py CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
60
  still_on_hub, _, model_config = is_model_on_hub(
61
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
  )
 
63
  architecture = "?"
64
  if model_config is not None:
65
  architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
70
  results = {}
71
  for task in Tasks:
72
  task = task.value
73
-
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
-
79
- mean_acc = np.mean(accs) * 100.0
 
 
 
80
  results[task.benchmark] = mean_acc
81
 
82
  return self(
@@ -93,8 +96,8 @@ class EvalResult:
93
 
94
  def update_with_request_file(self, requests_path):
95
  """Finds the relevant request file for the current model and updates info with it"""
 
96
  request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
-
98
  try:
99
  with open(request_file, "r") as f:
100
  request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
107
  except Exception:
108
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
 
110
- def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
 
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
  AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
138
  requests_path,
139
  f"{model_name}_eval_request_*.json",
140
  )
 
141
  request_files = glob.glob(request_files)
142
 
143
  # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
146
  for tmp_request_file in request_files:
147
  with open(tmp_request_file, "r") as f:
148
  req_content = json.load(f)
 
 
149
  if (
150
  req_content["status"] in ["FINISHED"]
151
  and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
186
  eval_results[eval_name] = eval_result
187
 
188
  results = []
189
- for v in eval_results.values():
 
 
 
 
190
  try:
191
- v.to_dict() # we test if the dict version is complete
192
  results.append(v)
193
  except KeyError: # not all eval values present
194
  continue
 
60
  still_on_hub, _, model_config = is_model_on_hub(
61
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
  )
63
+ print("Is model on hub? \n", _)
64
  architecture = "?"
65
  if model_config is not None:
66
  architectures = getattr(model_config, "architectures", None)
 
71
  results = {}
72
  for task in Tasks:
73
  task = task.value
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
+ if task.benchmark == "mRNA":
79
+ # Keep RMSE at original value
80
+ mean_acc = np.mean(accs)
81
+ else:
82
+ mean_acc = np.mean(accs) * 100.0
83
  results[task.benchmark] = mean_acc
84
 
85
  return self(
 
96
 
97
  def update_with_request_file(self, requests_path):
98
  """Finds the relevant request file for the current model and updates info with it"""
99
+ # print("Requests Path: ", requests_path)
100
  request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
101
  try:
102
  with open(request_file, "r") as f:
103
  request = json.load(f)
 
110
  except Exception:
111
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
112
 
113
+ def to_dict(self, rank):
114
  """Converts the Eval Result to a dict compatible with our dataframe display"""
115
+ average = rank
116
+ # average = sorted(average, reverse=True)
117
+ # rank = [rank+1 for rank, value in enumerate(average)]
118
  data_dict = {
119
  "eval_name": self.eval_name, # not a column, just a save name,
120
  AutoEvalColumn.precision.name: self.precision.value.name,
 
143
  requests_path,
144
  f"{model_name}_eval_request_*.json",
145
  )
146
+ # print("Request Files: ", request_files)
147
  request_files = glob.glob(request_files)
148
 
149
  # Select correct request file (precision)
 
152
  for tmp_request_file in request_files:
153
  with open(tmp_request_file, "r") as f:
154
  req_content = json.load(f)
155
+ # print("Request File: ", tmp_request_file)
156
+ # print("Req Content: ", req_content)
157
  if (
158
  req_content["status"] in ["FINISHED"]
159
  and req_content["precision"] == precision.split(".")[-1]
 
194
  eval_results[eval_name] = eval_result
195
 
196
  results = []
197
+ for result in eval_results.values():
198
+ result.average = np.mean(list(result.results.values()))
199
+ sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
200
+ print(f"SORTED RESULTS HERE: \n{sorted_results}")
201
+ for i,v in enumerate(sorted_results):
202
  try:
203
+ v.to_dict(i) # we test if the dict version is complete
204
  results.append(v)
205
  except KeyError: # not all eval values present
206
  continue
src/populate.py CHANGED
@@ -1,8 +1,9 @@
1
  import json
2
  import os
3
-
4
  import pandas as pd
5
 
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
@@ -11,15 +12,22 @@ from src.leaderboard.read_evals import get_raw_eval_results
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
 
 
 
 
 
 
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
  df = df[cols].round(decimals=2)
19
 
20
  # filter out if any of the benchmarks have not been produced
21
  df = df[has_no_nan_values(df, benchmark_cols)]
22
- return raw_data, df
 
23
 
24
 
25
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
@@ -55,4 +63,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
55
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
- return df_finished[cols], df_running[cols], df_pending[cols]
 
1
  import json
2
  import os
3
+ import numpy as np
4
  import pandas as pd
5
 
6
+
7
  from src.display.formatting import has_no_nan_values, make_clickable_model
8
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
9
  from src.leaderboard.read_evals import get_raw_eval_results
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path)
15
+ for result in raw_data:
16
+ result.average = np.mean(list(result.results.values()))
17
+ sorted_results = sorted(raw_data, key=lambda r: r.average, reverse=True)
18
+ print(sorted_results)
19
+ # ranks = [rank+1 for rank, value in enumerate(sorted_results)]
20
+ # rank = [rank+1 for rank, value in enumerate(average)]
21
+ all_data_json = [v.to_dict(i+1) for i, v in enumerate(raw_data)]
22
 
23
  df = pd.DataFrame.from_records(all_data_json)
24
+ # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
25
  df = df[cols].round(decimals=2)
26
 
27
  # filter out if any of the benchmarks have not been produced
28
  df = df[has_no_nan_values(df, benchmark_cols)]
29
+ print(df)
30
+ return df
31
 
32
 
33
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
63
  df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
64
  df_running = pd.DataFrame.from_records(running_list, columns=cols)
65
  df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
66
+ return df_finished[cols], df_running[cols], df_pending[cols]