idolezal committed on
Commit
219886f
•
1 Parent(s): 6c4fc74

Code formatting

Browse files
Files changed (3) hide show
  1. app.py +113 -40
  2. compare_significance.py +50 -18
  3. server.py +30 -11
app.py CHANGED
@@ -5,9 +5,15 @@ import pandas as pd
5
  from gradio.themes.utils.sizes import text_md
6
  from gradio_modal import Modal
7
 
8
- from content import (HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN,
9
- MODAL_SUBMIT_MARKDOWN,
10
- SUBMISSION_DETAILS_MARKDOWN, RANKING_AFTER_SUBMISSION_MARKDOWN, MORE_DETAILS_MARKDOWN)
 
 
 
 
 
 
11
  from server import LeaderboardServer
12
 
13
  leaderboard_server = LeaderboardServer()
@@ -38,37 +44,63 @@ def process_submission(team_name, submission_id, description, link_to_model, sub
38
  leaderboard_server.prepare_model_for_submission(submission_file, metadata)
39
  except ValueError as err:
40
  gr.Warning(str(err))
41
- return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
42
- visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
43
- visible=False), gr.update(visible=False)
44
- return gr.update(visible=False), gr.update(visible=True), gr.update(interactive=True,
45
- visible=True), gr.update(
46
- interactive=True, visible=True), gr.update(visible=True), gr.update(
47
- value=leaderboard_server.get_leaderboard(leaderboard_server.pre_submit[0]), visible=True)
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
 
50
  def submit_results():
51
  leaderboard_server.save_pre_submit()
52
  leaderboard_server.update_leaderboard()
53
  gr.Info('Submission successful!')
54
- return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
55
- visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
56
- visible=False), gr.update(visible=False), gr.DataFrame(
57
- value=leaderboard_server.get_leaderboard(), visible=True), gr.update(visible=False), gr.update(
58
- choices=leaderboard_server.submission_ids)
 
 
 
 
 
 
59
 
60
 
61
  def erase_pre_submit():
62
  leaderboard_server.pre_submit = None
63
- return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
64
- visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
65
- visible=False), gr.update(visible=False), gr.update(visible=False)
 
 
 
 
 
 
66
 
67
 
68
  def fetch_model_detail(submission_id):
69
  metadata = leaderboard_server.get_model_detail(submission_id)
70
- return gr.update(value=metadata['description'], visible=True), gr.update(value=metadata['link_to_model'],
71
- visible=True)
 
 
72
 
73
 
74
  def show_modal():
@@ -82,9 +114,10 @@ def hide_modal():
82
  def on_application_load():
83
  leaderboard_server.save_pre_submit()
84
  leaderboard_server.update_leaderboard()
85
- return gr.DataFrame(
86
- value=leaderboard_server.get_leaderboard(), visible=True), gr.update(
87
- choices=leaderboard_server.submission_ids)
 
88
 
89
 
90
  custom_css = """
@@ -143,7 +176,7 @@ footer {visibility: hidden}
143
 
144
  """
145
 
146
- with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main):
147
  with gr.Row():
148
  with gr.Row():
149
  gr.Markdown(HEADER_MARKDOWN)
@@ -155,17 +188,30 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
155
 
156
  with gr.Row():
157
  with gr.Tab("Overall"):
158
- results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None,
159
- visible=True, elem_classes="leaderboard-table")
 
 
 
 
 
160
  for c in leaderboard_server.tasks_categories:
161
  with gr.Tab(c):
162
- results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None,
163
- visible=True, elem_classes="leaderboard-table")
 
 
 
 
 
164
 
165
  with gr.Tab('Model details'):
166
  gr.Markdown(MORE_DETAILS_MARKDOWN)
167
- detail_dropdown = gr.Dropdown(choices=leaderboard_server.submission_ids, label="Select model",
168
- interactive=True)
 
 
 
169
 
170
  with gr.Row():
171
  model_description = gr.Text(value='', label='Model description', visible=False, interactive=False)
@@ -174,7 +220,8 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
174
  detail_dropdown.change(
175
  fn=fetch_model_detail,
176
  inputs=[detail_dropdown],
177
- outputs=[model_description, model_url])
 
178
 
179
  with gr.Tab('Submission'):
180
  with gr.Column():
@@ -213,13 +260,24 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
213
  pre_submission_btn.click(
214
  fn=on_submit_pressed,
215
  concurrency_limit=1,
216
- outputs=[pre_submission_btn]
217
  ).then(
218
  fn=process_submission,
219
- inputs=[submission_team_name_tb, submission_id_tb, description_tb, link_to_model_tb,
220
- submission_file_path],
221
- outputs=[pre_submission_btn, submit_prompt, submission_btn_yes, submission_btn_no, pre_submit_info,
222
- pre_submit_table]
 
 
 
 
 
 
 
 
 
 
 
223
  )
224
 
225
  submission_btn_yes.click(
@@ -229,8 +287,17 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
229
 
230
  modal_submit_yes.click(
231
  fn=submit_results,
232
- outputs=[pre_submission_btn, submission_btn_yes, submission_btn_no, submit_prompt, pre_submit_info,
233
- pre_submit_table, results_table, modal_submit, detail_dropdown]
 
 
 
 
 
 
 
 
 
234
  )
235
 
236
  modal_submit_no.click(
@@ -240,8 +307,14 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
240
 
241
  submission_btn_no.click(
242
  fn=erase_pre_submit,
243
- outputs=[pre_submission_btn, submission_btn_yes, submission_btn_no, submit_prompt, pre_submit_info,
244
- pre_submit_table]
 
 
 
 
 
 
245
  )
246
  main.load(on_application_load, inputs=None, outputs=[results_table, detail_dropdown])
247
 
 
5
  from gradio.themes.utils.sizes import text_md
6
  from gradio_modal import Modal
7
 
8
+ from content import (
9
+ HEADER_MARKDOWN,
10
+ LEADERBOARD_TAB_TITLE_MARKDOWN,
11
+ SUBMISSION_TAB_TITLE_MARKDOWN,
12
+ MODAL_SUBMIT_MARKDOWN,
13
+ SUBMISSION_DETAILS_MARKDOWN,
14
+ RANKING_AFTER_SUBMISSION_MARKDOWN,
15
+ MORE_DETAILS_MARKDOWN,
16
+ )
17
  from server import LeaderboardServer
18
 
19
  leaderboard_server = LeaderboardServer()
 
44
  leaderboard_server.prepare_model_for_submission(submission_file, metadata)
45
  except ValueError as err:
46
  gr.Warning(str(err))
47
+ return (
48
+ gr.update(value='Pre-submit model', visible=True, interactive=True),
49
+ gr.update(visible=False),
50
+ gr.update(visible=False),
51
+ gr.update(visible=False),
52
+ gr.update(visible=False),
53
+ gr.update(visible=False),
54
+ )
55
+ return (
56
+ gr.update(visible=False),
57
+ gr.update(visible=True),
58
+ gr.update(interactive=True, visible=True),
59
+ gr.update(interactive=True, visible=True),
60
+ gr.update(visible=True),
61
+ gr.update(
62
+ value=leaderboard_server.get_leaderboard(leaderboard_server.pre_submit[0]),
63
+ visible=True,
64
+ ),
65
+ )
66
 
67
 
68
  def submit_results():
69
  leaderboard_server.save_pre_submit()
70
  leaderboard_server.update_leaderboard()
71
  gr.Info('Submission successful!')
72
+ return (
73
+ gr.update(value='Pre-submit model', visible=True, interactive=True),
74
+ gr.update(visible=False),
75
+ gr.update(visible=False),
76
+ gr.update(visible=False),
77
+ gr.update(visible=False),
78
+ gr.update(visible=False),
79
+ gr.DataFrame(value=leaderboard_server.get_leaderboard(), visible=True),
80
+ gr.update(visible=False),
81
+ gr.update(choices=leaderboard_server.submission_ids),
82
+ )
83
 
84
 
85
  def erase_pre_submit():
86
  leaderboard_server.pre_submit = None
87
+ return (
88
+ gr.update(value='Pre-submit model', visible=True, interactive=True),
89
+ gr.update(visible=False),
90
+ gr.update(visible=False),
91
+ gr.update(visible=False),
92
+ gr.update(visible=False),
93
+ gr.update(visible=False),
94
+ gr.update(visible=False),
95
+ )
96
 
97
 
98
  def fetch_model_detail(submission_id):
99
  metadata = leaderboard_server.get_model_detail(submission_id)
100
+ return (
101
+ gr.update(value=metadata['description'], visible=True),
102
+ gr.update(value=metadata['link_to_model'], visible=True)
103
+ )
104
 
105
 
106
  def show_modal():
 
114
  def on_application_load():
115
  leaderboard_server.save_pre_submit()
116
  leaderboard_server.update_leaderboard()
117
+ return (
118
+ gr.DataFrame(value=leaderboard_server.get_leaderboard(), visible=True),
119
+ gr.update(choices=leaderboard_server.submission_ids)
120
+ )
121
 
122
 
123
  custom_css = """
 
176
 
177
  """
178
 
179
+ with gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main:
180
  with gr.Row():
181
  with gr.Row():
182
  gr.Markdown(HEADER_MARKDOWN)
 
188
 
189
  with gr.Row():
190
  with gr.Tab("Overall"):
191
+ results_table = gr.DataFrame(
192
+ leaderboard_server.get_leaderboard(),
193
+ interactive=False,
194
+ label=None,
195
+ visible=True,
196
+ elem_classes="leaderboard-table",
197
+ )
198
  for c in leaderboard_server.tasks_categories:
199
  with gr.Tab(c):
200
+ results_table = gr.DataFrame(
201
+ leaderboard_server.get_leaderboard(),
202
+ interactive=False,
203
+ label=None,
204
+ visible=True,
205
+ elem_classes="leaderboard-table",
206
+ )
207
 
208
  with gr.Tab('Model details'):
209
  gr.Markdown(MORE_DETAILS_MARKDOWN)
210
+ detail_dropdown = gr.Dropdown(
211
+ choices=leaderboard_server.submission_ids,
212
+ label="Select model",
213
+ interactive=True,
214
+ )
215
 
216
  with gr.Row():
217
  model_description = gr.Text(value='', label='Model description', visible=False, interactive=False)
 
220
  detail_dropdown.change(
221
  fn=fetch_model_detail,
222
  inputs=[detail_dropdown],
223
+ outputs=[model_description, model_url],
224
+ )
225
 
226
  with gr.Tab('Submission'):
227
  with gr.Column():
 
260
  pre_submission_btn.click(
261
  fn=on_submit_pressed,
262
  concurrency_limit=1,
263
+ outputs=[pre_submission_btn],
264
  ).then(
265
  fn=process_submission,
266
+ inputs=[
267
+ submission_team_name_tb,
268
+ submission_id_tb,
269
+ description_tb,
270
+ link_to_model_tb,
271
+ submission_file_path,
272
+ ],
273
+ outputs=[
274
+ pre_submission_btn,
275
+ submit_prompt,
276
+ submission_btn_yes,
277
+ submission_btn_no,
278
+ pre_submit_info,
279
+ pre_submit_table,
280
+ ],
281
  )
282
 
283
  submission_btn_yes.click(
 
287
 
288
  modal_submit_yes.click(
289
  fn=submit_results,
290
+ outputs=[
291
+ pre_submission_btn,
292
+ submission_btn_yes,
293
+ submission_btn_no,
294
+ submit_prompt,
295
+ pre_submit_info,
296
+ pre_submit_table,
297
+ results_table,
298
+ modal_submit,
299
+ detail_dropdown,
300
+ ],
301
  )
302
 
303
  modal_submit_no.click(
 
307
 
308
  submission_btn_no.click(
309
  fn=erase_pre_submit,
310
+ outputs=[
311
+ pre_submission_btn,
312
+ submission_btn_yes,
313
+ submission_btn_no,
314
+ submit_prompt,
315
+ pre_submit_info,
316
+ pre_submit_table,
317
+ ],
318
  )
319
  main.load(on_application_load, inputs=None, outputs=[results_table, detail_dropdown])
320
 
compare_significance.py CHANGED
@@ -36,7 +36,14 @@ def _get_CMs(i, probabilities, references, thresholds):
36
  FN += 1
37
  else:
38
  TN += 1
39
- cm = {"TP": TP, "FP": FP, "TN": TN, "FN": FN, "threshold": threshold, "class": i}
 
 
 
 
 
 
 
40
  confusion_matrices.append(cm)
41
 
42
  return confusion_matrices
@@ -73,16 +80,20 @@ def compute_significance_bootstrap(scores_A, scores_B):
73
  return pval, delta_orig
74
 
75
 
76
- def compute_significance_avg_mcauroc(probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
77
- probsB: Sequence[Sequence[float]], referencesB: Sequence[int]):
 
 
78
  # compute MC-AUC for model A
79
  model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
80
  model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
81
  delta = np.mean(model_A_scores) - np.mean(model_B_scores)
82
 
83
  # one-tailed test
84
- p_value = ((model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
85
- / (len(model_A_scores) * len(model_B_scores)))
 
 
86
 
87
  return p_value, delta
88
 
@@ -114,8 +125,10 @@ def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
114
  auc_scores_per_class = []
115
  for i in range(len(n_classes)):
116
  # for i-th class vs all others
117
- fpr[i], _, thresholds[i] = roc_curve(y_true=[1 if x == n_classes[i] else 0 for x in references],
118
- y_score=[prob[i] for prob in probs])
 
 
119
 
120
  confusion_matrices = _get_CMs(i, probs, references, thresholds)
121
  tp, fn = convert_confusion_matrices(confusion_matrices)
@@ -194,13 +207,20 @@ def process_task(task, dataA, dataB, significance_level):
194
  assert len(dataA[task]) == len(dataB[task])
195
 
196
  if metricA == "avg_mcauroc":
197
- p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
198
- probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
 
 
199
  elif metricA in ["acc", "exact_match"]:
200
- p_value, delta = compute_significance_ttest(scores_A=dataA[task][0], scores_B=dataB[task][0])
 
 
 
201
  elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
202
- p_value, delta = compute_significance_bootstrap(scores_A=np.array(dataA[task][0]),
203
- scores_B=np.array(dataB[task][0]))
 
 
204
  else:
205
  raise ValueError(f"Unsupported metric {metricA}")
206
 
@@ -228,14 +248,21 @@ def check_significance(fileA, fileB, significance_level=0.05):
228
  assert len(dataA[task]) == len(dataB[task])
229
 
230
  if metricA == "avg_mcauroc":
231
- p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
232
- probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
 
 
233
 
234
  elif metricA in ["acc", "exact_match"]:
235
- p_value, delta = compute_significance_ttest(scores_A=dataA[task][0], scores_B=dataB[task][0])
 
 
 
236
  elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
237
- p_value, delta = compute_significance_bootstrap(scores_A=np.array(dataA[task][0]),
238
- scores_B=np.array(dataB[task][0]))
 
 
239
  else:
240
  raise ValueError(f"Unsupported metric {metricA}")
241
  if delta <= 0:
@@ -253,7 +280,12 @@ def main():
253
  parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
254
  parser.add_argument("--modelA", help="ModelA JSON file from lm harness.")
255
  parser.add_argument("--modelB", help="ModelB JSON file from lm harness.")
256
- parser.add_argument("--significance_level", type=float, default=0.05, help="Significance level (e.g., 0.05)")
 
 
 
 
 
257
  args = parser.parse_args()
258
 
259
  result = check_significance(args.modelA, args.modelB, args.significance_level)
 
36
  FN += 1
37
  else:
38
  TN += 1
39
+ cm = {
40
+ "TP": TP,
41
+ "FP": FP,
42
+ "TN": TN,
43
+ "FN": FN,
44
+ "threshold": threshold,
45
+ "class": i,
46
+ }
47
  confusion_matrices.append(cm)
48
 
49
  return confusion_matrices
 
80
  return pval, delta_orig
81
 
82
 
83
+ def compute_significance_avg_mcauroc(
84
+ probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
85
+ probsB: Sequence[Sequence[float]], referencesB: Sequence[int],
86
+ ):
87
  # compute MC-AUC for model A
88
  model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
89
  model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
90
  delta = np.mean(model_A_scores) - np.mean(model_B_scores)
91
 
92
  # one-tailed test
93
+ p_value = (
94
+ (model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
95
+ / (len(model_A_scores) * len(model_B_scores))
96
+ )
97
 
98
  return p_value, delta
99
 
 
125
  auc_scores_per_class = []
126
  for i in range(len(n_classes)):
127
  # for i-th class vs all others
128
+ fpr[i], _, thresholds[i] = roc_curve(
129
+ y_true=[1 if x == n_classes[i] else 0 for x in references],
130
+ y_score=[prob[i] for prob in probs],
131
+ )
132
 
133
  confusion_matrices = _get_CMs(i, probs, references, thresholds)
134
  tp, fn = convert_confusion_matrices(confusion_matrices)
 
207
  assert len(dataA[task]) == len(dataB[task])
208
 
209
  if metricA == "avg_mcauroc":
210
+ p_value, delta = compute_significance_avg_mcauroc(
211
+ probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
212
+ probsB=dataB[task][0][1], referencesB=dataB[task][0][0],
213
+ )
214
  elif metricA in ["acc", "exact_match"]:
215
+ p_value, delta = compute_significance_ttest(
216
+ scores_A=dataA[task][0],
217
+ scores_B=dataB[task][0]
218
+ )
219
  elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
220
+ p_value, delta = compute_significance_bootstrap(
221
+ scores_A=np.array(dataA[task][0]),
222
+ scores_B=np.array(dataB[task][0])
223
+ )
224
  else:
225
  raise ValueError(f"Unsupported metric {metricA}")
226
 
 
248
  assert len(dataA[task]) == len(dataB[task])
249
 
250
  if metricA == "avg_mcauroc":
251
+ p_value, delta = compute_significance_avg_mcauroc(
252
+ probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
253
+ probsB=dataB[task][0][1], referencesB=dataB[task][0][0],
254
+ )
255
 
256
  elif metricA in ["acc", "exact_match"]:
257
+ p_value, delta = compute_significance_ttest(
258
+ scores_A=dataA[task][0],
259
+ scores_B=dataB[task][0]
260
+ )
261
  elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
262
+ p_value, delta = compute_significance_bootstrap(
263
+ scores_A=np.array(dataA[task][0]),
264
+ scores_B=np.array(dataB[task][0])
265
+ )
266
  else:
267
  raise ValueError(f"Unsupported metric {metricA}")
268
  if delta <= 0:
 
280
  parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
281
  parser.add_argument("--modelA", help="ModelA JSON file from lm harness.")
282
  parser.add_argument("--modelB", help="ModelB JSON file from lm harness.")
283
+ parser.add_argument(
284
+ "--significance_level",
285
+ type=float,
286
+ default=0.05,
287
+ help="Significance level (e.g., 0.05)",
288
+ )
289
  args = parser.parse_args()
290
 
291
  result = check_significance(args.modelA, args.modelB, args.significance_level)
server.py CHANGED
@@ -18,12 +18,17 @@ REPO = f"{ORG}/LLM_benchmark_data"
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
19
  TASKS_METADATA_PATH = "./tasks_metadata.json"
20
 
 
21
  class LeaderboardServer:
22
  def __init__(self):
23
  self.server_address = REPO
24
  self.repo_type = "dataset"
25
- self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,
26
- local_dir="./")
 
 
 
 
27
  self.submisssion_id_to_file = {} # Map submission ids to file paths
28
  self.tasks_metadata = json.load(open(TASKS_METADATA_PATH))
29
  self.tasks_categories = {self.tasks_metadata[task]["category"] for task in self.tasks_metadata}
@@ -33,8 +38,12 @@ class LeaderboardServer:
33
  self.pre_submit = None
34
 
35
  def update_leaderboard(self):
36
- self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,
37
- local_dir="./")
 
 
 
 
38
  self.fetch_existing_models()
39
  self.tournament_results = self.load_tournament_results()
40
 
@@ -96,17 +105,27 @@ class LeaderboardServer:
96
  else:
97
  processed_results.append(local_results)
98
  dataframe = pd.DataFrame.from_records(processed_results)
99
- df_order = (["submission_id"] + list(self.tasks_metadata.keys()) +
100
- [col for col in dataframe.columns if
101
- col != "submission_id" and col not in self.tasks_metadata.keys()])
 
 
 
 
 
 
102
  dataframe = dataframe[df_order]
103
- dataframe = dataframe.rename(columns={key: value["name"] for key, value in self.tasks_metadata.items()})
 
 
104
  return dataframe
105
 
106
  def start_tournament(self, new_model_id, new_model_file):
107
  new_tournament = copy.deepcopy(self.tournament_results)
108
  new_tournament[new_model_id] = {}
109
- new_tournament[new_model_id][new_model_id] = {task: False for task in self.tasks_metadata.keys()}
 
 
110
 
111
  for model in self.submission_ids:
112
  res = check_significance(new_model_file, self.submisssion_id_to_file[model])
@@ -124,7 +143,7 @@ class LeaderboardServer:
124
  data = json.load(f)
125
  data["metadata"] = metadata
126
  with open(file, "w") as f:
127
- json.dump(data, f, separators=(',', ':')) # compact JSON
128
 
129
  model_id = metadata["team_name"] + "_" + metadata["submission_id"]
130
  tournament_results = self.start_tournament(model_id, file)
@@ -145,7 +164,7 @@ class LeaderboardServer:
145
  # Temporary save tournament results
146
  tournament_results_path = os.path.join(self.local_leaderboard, "tournament.json")
147
  with open(tournament_results_path, "w") as f:
148
- json.dump(tournament_results, f, sort_keys=True, indent=2) # readable JSON
149
 
150
  api.upload_file(
151
  path_or_fileobj=tournament_results_path,
 
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
19
  TASKS_METADATA_PATH = "./tasks_metadata.json"
20
 
21
+
22
  class LeaderboardServer:
23
  def __init__(self):
24
  self.server_address = REPO
25
  self.repo_type = "dataset"
26
+ self.local_leaderboard = snapshot_download(
27
+ self.server_address,
28
+ repo_type=self.repo_type,
29
+ token=HF_TOKEN,
30
+ local_dir="./",
31
+ )
32
  self.submisssion_id_to_file = {} # Map submission ids to file paths
33
  self.tasks_metadata = json.load(open(TASKS_METADATA_PATH))
34
  self.tasks_categories = {self.tasks_metadata[task]["category"] for task in self.tasks_metadata}
 
38
  self.pre_submit = None
39
 
40
  def update_leaderboard(self):
41
+ self.local_leaderboard = snapshot_download(
42
+ self.server_address,
43
+ repo_type=self.repo_type,
44
+ token=HF_TOKEN,
45
+ local_dir="./",
46
+ )
47
  self.fetch_existing_models()
48
  self.tournament_results = self.load_tournament_results()
49
 
 
105
  else:
106
  processed_results.append(local_results)
107
  dataframe = pd.DataFrame.from_records(processed_results)
108
+ df_order = (
109
+ ["submission_id"]
110
+ + list(self.tasks_metadata.keys())
111
+ + [
112
+ col
113
+ for col in dataframe.columns
114
+ if col != "submission_id" and col not in self.tasks_metadata.keys()
115
+ ]
116
+ )
117
  dataframe = dataframe[df_order]
118
+ dataframe = dataframe.rename(
119
+ columns={key: value["name"] for key, value in self.tasks_metadata.items()}
120
+ )
121
  return dataframe
122
 
123
  def start_tournament(self, new_model_id, new_model_file):
124
  new_tournament = copy.deepcopy(self.tournament_results)
125
  new_tournament[new_model_id] = {}
126
+ new_tournament[new_model_id][new_model_id] = {
127
+ task: False for task in self.tasks_metadata.keys()
128
+ }
129
 
130
  for model in self.submission_ids:
131
  res = check_significance(new_model_file, self.submisssion_id_to_file[model])
 
143
  data = json.load(f)
144
  data["metadata"] = metadata
145
  with open(file, "w") as f:
146
+ json.dump(data, f, separators=(',', ':')) # compact JSON
147
 
148
  model_id = metadata["team_name"] + "_" + metadata["submission_id"]
149
  tournament_results = self.start_tournament(model_id, file)
 
164
  # Temporary save tournament results
165
  tournament_results_path = os.path.join(self.local_leaderboard, "tournament.json")
166
  with open(tournament_results_path, "w") as f:
167
+ json.dump(tournament_results, f, sort_keys=True, indent=2) # readable JSON
168
 
169
  api.upload_file(
170
  path_or_fileobj=tournament_results_path,