Spaces:
Running
Running
Code formatting
Browse files- app.py +113 -40
- compare_significance.py +50 -18
- server.py +30 -11
app.py
CHANGED
@@ -5,9 +5,15 @@ import pandas as pd
|
|
5 |
from gradio.themes.utils.sizes import text_md
|
6 |
from gradio_modal import Modal
|
7 |
|
8 |
-
from content import (
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
from server import LeaderboardServer
|
12 |
|
13 |
leaderboard_server = LeaderboardServer()
|
@@ -38,37 +44,63 @@ def process_submission(team_name, submission_id, description, link_to_model, sub
|
|
38 |
leaderboard_server.prepare_model_for_submission(submission_file, metadata)
|
39 |
except ValueError as err:
|
40 |
gr.Warning(str(err))
|
41 |
-
return
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
|
50 |
def submit_results():
|
51 |
leaderboard_server.save_pre_submit()
|
52 |
leaderboard_server.update_leaderboard()
|
53 |
gr.Info('Submission successful!')
|
54 |
-
return
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
|
61 |
def erase_pre_submit():
|
62 |
leaderboard_server.pre_submit = None
|
63 |
-
return
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
|
68 |
def fetch_model_detail(submission_id):
|
69 |
metadata = leaderboard_server.get_model_detail(submission_id)
|
70 |
-
return
|
71 |
-
|
|
|
|
|
72 |
|
73 |
|
74 |
def show_modal():
|
@@ -82,9 +114,10 @@ def hide_modal():
|
|
82 |
def on_application_load():
|
83 |
leaderboard_server.save_pre_submit()
|
84 |
leaderboard_server.update_leaderboard()
|
85 |
-
return
|
86 |
-
value=leaderboard_server.get_leaderboard(), visible=True),
|
87 |
-
choices=leaderboard_server.submission_ids)
|
|
|
88 |
|
89 |
|
90 |
custom_css = """
|
@@ -143,7 +176,7 @@ footer {visibility: hidden}
|
|
143 |
|
144 |
"""
|
145 |
|
146 |
-
with
|
147 |
with gr.Row():
|
148 |
with gr.Row():
|
149 |
gr.Markdown(HEADER_MARKDOWN)
|
@@ -155,17 +188,30 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
|
|
155 |
|
156 |
with gr.Row():
|
157 |
with gr.Tab("Overall"):
|
158 |
-
results_table = gr.DataFrame(
|
159 |
-
|
|
|
|
|
|
|
|
|
|
|
160 |
for c in leaderboard_server.tasks_categories:
|
161 |
with gr.Tab(c):
|
162 |
-
results_table = gr.DataFrame(
|
163 |
-
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
with gr.Tab('Model details'):
|
166 |
gr.Markdown(MORE_DETAILS_MARKDOWN)
|
167 |
-
detail_dropdown = gr.Dropdown(
|
168 |
-
|
|
|
|
|
|
|
169 |
|
170 |
with gr.Row():
|
171 |
model_description = gr.Text(value='', label='Model description', visible=False, interactive=False)
|
@@ -174,7 +220,8 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
|
|
174 |
detail_dropdown.change(
|
175 |
fn=fetch_model_detail,
|
176 |
inputs=[detail_dropdown],
|
177 |
-
outputs=[model_description, model_url]
|
|
|
178 |
|
179 |
with gr.Tab('Submission'):
|
180 |
with gr.Column():
|
@@ -213,13 +260,24 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
|
|
213 |
pre_submission_btn.click(
|
214 |
fn=on_submit_pressed,
|
215 |
concurrency_limit=1,
|
216 |
-
outputs=[pre_submission_btn]
|
217 |
).then(
|
218 |
fn=process_submission,
|
219 |
-
inputs=[
|
220 |
-
|
221 |
-
|
222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
)
|
224 |
|
225 |
submission_btn_yes.click(
|
@@ -229,8 +287,17 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
|
|
229 |
|
230 |
modal_submit_yes.click(
|
231 |
fn=submit_results,
|
232 |
-
outputs=[
|
233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
)
|
235 |
|
236 |
modal_submit_no.click(
|
@@ -240,8 +307,14 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main
|
|
240 |
|
241 |
submission_btn_no.click(
|
242 |
fn=erase_pre_submit,
|
243 |
-
outputs=[
|
244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
245 |
)
|
246 |
main.load(on_application_load, inputs=None, outputs=[results_table, detail_dropdown])
|
247 |
|
|
|
5 |
from gradio.themes.utils.sizes import text_md
|
6 |
from gradio_modal import Modal
|
7 |
|
8 |
+
from content import (
|
9 |
+
HEADER_MARKDOWN,
|
10 |
+
LEADERBOARD_TAB_TITLE_MARKDOWN,
|
11 |
+
SUBMISSION_TAB_TITLE_MARKDOWN,
|
12 |
+
MODAL_SUBMIT_MARKDOWN,
|
13 |
+
SUBMISSION_DETAILS_MARKDOWN,
|
14 |
+
RANKING_AFTER_SUBMISSION_MARKDOWN,
|
15 |
+
MORE_DETAILS_MARKDOWN,
|
16 |
+
)
|
17 |
from server import LeaderboardServer
|
18 |
|
19 |
leaderboard_server = LeaderboardServer()
|
|
|
44 |
leaderboard_server.prepare_model_for_submission(submission_file, metadata)
|
45 |
except ValueError as err:
|
46 |
gr.Warning(str(err))
|
47 |
+
return (
|
48 |
+
gr.update(value='Pre-submit model', visible=True, interactive=True),
|
49 |
+
gr.update(visible=False),
|
50 |
+
gr.update(visible=False),
|
51 |
+
gr.update(visible=False),
|
52 |
+
gr.update(visible=False),
|
53 |
+
gr.update(visible=False),
|
54 |
+
)
|
55 |
+
return (
|
56 |
+
gr.update(visible=False),
|
57 |
+
gr.update(visible=True),
|
58 |
+
gr.update(interactive=True, visible=True),
|
59 |
+
gr.update(interactive=True, visible=True),
|
60 |
+
gr.update(visible=True),
|
61 |
+
gr.update(
|
62 |
+
value=leaderboard_server.get_leaderboard(leaderboard_server.pre_submit[0]),
|
63 |
+
visible=True,
|
64 |
+
),
|
65 |
+
)
|
66 |
|
67 |
|
68 |
def submit_results():
|
69 |
leaderboard_server.save_pre_submit()
|
70 |
leaderboard_server.update_leaderboard()
|
71 |
gr.Info('Submission successful!')
|
72 |
+
return (
|
73 |
+
gr.update(value='Pre-submit model', visible=True, interactive=True),
|
74 |
+
gr.update(visible=False),
|
75 |
+
gr.update(visible=False),
|
76 |
+
gr.update(visible=False),
|
77 |
+
gr.update(visible=False),
|
78 |
+
gr.update(visible=False),
|
79 |
+
gr.DataFrame(value=leaderboard_server.get_leaderboard(), visible=True),
|
80 |
+
gr.update(visible=False),
|
81 |
+
gr.update(choices=leaderboard_server.submission_ids),
|
82 |
+
)
|
83 |
|
84 |
|
85 |
def erase_pre_submit():
|
86 |
leaderboard_server.pre_submit = None
|
87 |
+
return (
|
88 |
+
gr.update(value='Pre-submit model', visible=True, interactive=True),
|
89 |
+
gr.update(visible=False),
|
90 |
+
gr.update(visible=False),
|
91 |
+
gr.update(visible=False),
|
92 |
+
gr.update(visible=False),
|
93 |
+
gr.update(visible=False),
|
94 |
+
gr.update(visible=False),
|
95 |
+
)
|
96 |
|
97 |
|
98 |
def fetch_model_detail(submission_id):
|
99 |
metadata = leaderboard_server.get_model_detail(submission_id)
|
100 |
+
return (
|
101 |
+
gr.update(value=metadata['description'], visible=True),
|
102 |
+
gr.update(value=metadata['link_to_model'], visible=True)
|
103 |
+
)
|
104 |
|
105 |
|
106 |
def show_modal():
|
|
|
114 |
def on_application_load():
|
115 |
leaderboard_server.save_pre_submit()
|
116 |
leaderboard_server.update_leaderboard()
|
117 |
+
return (
|
118 |
+
gr.DataFrame(value=leaderboard_server.get_leaderboard(), visible=True),
|
119 |
+
gr.update(choices=leaderboard_server.submission_ids)
|
120 |
+
)
|
121 |
|
122 |
|
123 |
custom_css = """
|
|
|
176 |
|
177 |
"""
|
178 |
|
179 |
+
with gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css=custom_css) as main:
|
180 |
with gr.Row():
|
181 |
with gr.Row():
|
182 |
gr.Markdown(HEADER_MARKDOWN)
|
|
|
188 |
|
189 |
with gr.Row():
|
190 |
with gr.Tab("Overall"):
|
191 |
+
results_table = gr.DataFrame(
|
192 |
+
leaderboard_server.get_leaderboard(),
|
193 |
+
interactive=False,
|
194 |
+
label=None,
|
195 |
+
visible=True,
|
196 |
+
elem_classes="leaderboard-table",
|
197 |
+
)
|
198 |
for c in leaderboard_server.tasks_categories:
|
199 |
with gr.Tab(c):
|
200 |
+
results_table = gr.DataFrame(
|
201 |
+
leaderboard_server.get_leaderboard(),
|
202 |
+
interactive=False,
|
203 |
+
label=None,
|
204 |
+
visible=True,
|
205 |
+
elem_classes="leaderboard-table",
|
206 |
+
)
|
207 |
|
208 |
with gr.Tab('Model details'):
|
209 |
gr.Markdown(MORE_DETAILS_MARKDOWN)
|
210 |
+
detail_dropdown = gr.Dropdown(
|
211 |
+
choices=leaderboard_server.submission_ids,
|
212 |
+
label="Select model",
|
213 |
+
interactive=True,
|
214 |
+
)
|
215 |
|
216 |
with gr.Row():
|
217 |
model_description = gr.Text(value='', label='Model description', visible=False, interactive=False)
|
|
|
220 |
detail_dropdown.change(
|
221 |
fn=fetch_model_detail,
|
222 |
inputs=[detail_dropdown],
|
223 |
+
outputs=[model_description, model_url],
|
224 |
+
)
|
225 |
|
226 |
with gr.Tab('Submission'):
|
227 |
with gr.Column():
|
|
|
260 |
pre_submission_btn.click(
|
261 |
fn=on_submit_pressed,
|
262 |
concurrency_limit=1,
|
263 |
+
outputs=[pre_submission_btn],
|
264 |
).then(
|
265 |
fn=process_submission,
|
266 |
+
inputs=[
|
267 |
+
submission_team_name_tb,
|
268 |
+
submission_id_tb,
|
269 |
+
description_tb,
|
270 |
+
link_to_model_tb,
|
271 |
+
submission_file_path,
|
272 |
+
],
|
273 |
+
outputs=[
|
274 |
+
pre_submission_btn,
|
275 |
+
submit_prompt,
|
276 |
+
submission_btn_yes,
|
277 |
+
submission_btn_no,
|
278 |
+
pre_submit_info,
|
279 |
+
pre_submit_table,
|
280 |
+
],
|
281 |
)
|
282 |
|
283 |
submission_btn_yes.click(
|
|
|
287 |
|
288 |
modal_submit_yes.click(
|
289 |
fn=submit_results,
|
290 |
+
outputs=[
|
291 |
+
pre_submission_btn,
|
292 |
+
submission_btn_yes,
|
293 |
+
submission_btn_no,
|
294 |
+
submit_prompt,
|
295 |
+
pre_submit_info,
|
296 |
+
pre_submit_table,
|
297 |
+
results_table,
|
298 |
+
modal_submit,
|
299 |
+
detail_dropdown,
|
300 |
+
],
|
301 |
)
|
302 |
|
303 |
modal_submit_no.click(
|
|
|
307 |
|
308 |
submission_btn_no.click(
|
309 |
fn=erase_pre_submit,
|
310 |
+
outputs=[
|
311 |
+
pre_submission_btn,
|
312 |
+
submission_btn_yes,
|
313 |
+
submission_btn_no,
|
314 |
+
submit_prompt,
|
315 |
+
pre_submit_info,
|
316 |
+
pre_submit_table,
|
317 |
+
],
|
318 |
)
|
319 |
main.load(on_application_load, inputs=None, outputs=[results_table, detail_dropdown])
|
320 |
|
compare_significance.py
CHANGED
@@ -36,7 +36,14 @@ def _get_CMs(i, probabilities, references, thresholds):
|
|
36 |
FN += 1
|
37 |
else:
|
38 |
TN += 1
|
39 |
-
cm = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
confusion_matrices.append(cm)
|
41 |
|
42 |
return confusion_matrices
|
@@ -73,16 +80,20 @@ def compute_significance_bootstrap(scores_A, scores_B):
|
|
73 |
return pval, delta_orig
|
74 |
|
75 |
|
76 |
-
def compute_significance_avg_mcauroc(
|
77 |
-
|
|
|
|
|
78 |
# compute MC-AUC for model A
|
79 |
model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
|
80 |
model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
|
81 |
delta = np.mean(model_A_scores) - np.mean(model_B_scores)
|
82 |
|
83 |
# one-tailed test
|
84 |
-
p_value = (
|
85 |
-
|
|
|
|
|
86 |
|
87 |
return p_value, delta
|
88 |
|
@@ -114,8 +125,10 @@ def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
|
|
114 |
auc_scores_per_class = []
|
115 |
for i in range(len(n_classes)):
|
116 |
# for i-th class vs all others
|
117 |
-
fpr[i], _, thresholds[i] = roc_curve(
|
118 |
-
|
|
|
|
|
119 |
|
120 |
confusion_matrices = _get_CMs(i, probs, references, thresholds)
|
121 |
tp, fn = convert_confusion_matrices(confusion_matrices)
|
@@ -194,13 +207,20 @@ def process_task(task, dataA, dataB, significance_level):
|
|
194 |
assert len(dataA[task]) == len(dataB[task])
|
195 |
|
196 |
if metricA == "avg_mcauroc":
|
197 |
-
p_value, delta = compute_significance_avg_mcauroc(
|
198 |
-
|
|
|
|
|
199 |
elif metricA in ["acc", "exact_match"]:
|
200 |
-
p_value, delta = compute_significance_ttest(
|
|
|
|
|
|
|
201 |
elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
|
202 |
-
p_value, delta = compute_significance_bootstrap(
|
203 |
-
|
|
|
|
|
204 |
else:
|
205 |
raise ValueError(f"Unsupported metric {metricA}")
|
206 |
|
@@ -228,14 +248,21 @@ def check_significance(fileA, fileB, significance_level=0.05):
|
|
228 |
assert len(dataA[task]) == len(dataB[task])
|
229 |
|
230 |
if metricA == "avg_mcauroc":
|
231 |
-
p_value, delta = compute_significance_avg_mcauroc(
|
232 |
-
|
|
|
|
|
233 |
|
234 |
elif metricA in ["acc", "exact_match"]:
|
235 |
-
p_value, delta = compute_significance_ttest(
|
|
|
|
|
|
|
236 |
elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
|
237 |
-
p_value, delta = compute_significance_bootstrap(
|
238 |
-
|
|
|
|
|
239 |
else:
|
240 |
raise ValueError(f"Unsupported metric {metricA}")
|
241 |
if delta <= 0:
|
@@ -253,7 +280,12 @@ def main():
|
|
253 |
parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
|
254 |
parser.add_argument("--modelA", help="ModelA JSON file from lm harness.")
|
255 |
parser.add_argument("--modelB", help="ModelB JSON file from lm harness.")
|
256 |
-
parser.add_argument(
|
|
|
|
|
|
|
|
|
|
|
257 |
args = parser.parse_args()
|
258 |
|
259 |
result = check_significance(args.modelA, args.modelB, args.significance_level)
|
|
|
36 |
FN += 1
|
37 |
else:
|
38 |
TN += 1
|
39 |
+
cm = {
|
40 |
+
"TP": TP,
|
41 |
+
"FP": FP,
|
42 |
+
"TN": TN,
|
43 |
+
"FN": FN,
|
44 |
+
"threshold": threshold,
|
45 |
+
"class": i,
|
46 |
+
}
|
47 |
confusion_matrices.append(cm)
|
48 |
|
49 |
return confusion_matrices
|
|
|
80 |
return pval, delta_orig
|
81 |
|
82 |
|
83 |
+
def compute_significance_avg_mcauroc(
|
84 |
+
probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
|
85 |
+
probsB: Sequence[Sequence[float]], referencesB: Sequence[int],
|
86 |
+
):
|
87 |
# compute MC-AUC for model A
|
88 |
model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
|
89 |
model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
|
90 |
delta = np.mean(model_A_scores) - np.mean(model_B_scores)
|
91 |
|
92 |
# one-tailed test
|
93 |
+
p_value = (
|
94 |
+
(model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
|
95 |
+
/ (len(model_A_scores) * len(model_B_scores))
|
96 |
+
)
|
97 |
|
98 |
return p_value, delta
|
99 |
|
|
|
125 |
auc_scores_per_class = []
|
126 |
for i in range(len(n_classes)):
|
127 |
# for i-th class vs all others
|
128 |
+
fpr[i], _, thresholds[i] = roc_curve(
|
129 |
+
y_true=[1 if x == n_classes[i] else 0 for x in references],
|
130 |
+
y_score=[prob[i] for prob in probs],
|
131 |
+
)
|
132 |
|
133 |
confusion_matrices = _get_CMs(i, probs, references, thresholds)
|
134 |
tp, fn = convert_confusion_matrices(confusion_matrices)
|
|
|
207 |
assert len(dataA[task]) == len(dataB[task])
|
208 |
|
209 |
if metricA == "avg_mcauroc":
|
210 |
+
p_value, delta = compute_significance_avg_mcauroc(
|
211 |
+
probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
|
212 |
+
probsB=dataB[task][0][1], referencesB=dataB[task][0][0],
|
213 |
+
)
|
214 |
elif metricA in ["acc", "exact_match"]:
|
215 |
+
p_value, delta = compute_significance_ttest(
|
216 |
+
scores_A=dataA[task][0],
|
217 |
+
scores_B=dataB[task][0]
|
218 |
+
)
|
219 |
elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
|
220 |
+
p_value, delta = compute_significance_bootstrap(
|
221 |
+
scores_A=np.array(dataA[task][0]),
|
222 |
+
scores_B=np.array(dataB[task][0])
|
223 |
+
)
|
224 |
else:
|
225 |
raise ValueError(f"Unsupported metric {metricA}")
|
226 |
|
|
|
248 |
assert len(dataA[task]) == len(dataB[task])
|
249 |
|
250 |
if metricA == "avg_mcauroc":
|
251 |
+
p_value, delta = compute_significance_avg_mcauroc(
|
252 |
+
probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
|
253 |
+
probsB=dataB[task][0][1], referencesB=dataB[task][0][0],
|
254 |
+
)
|
255 |
|
256 |
elif metricA in ["acc", "exact_match"]:
|
257 |
+
p_value, delta = compute_significance_ttest(
|
258 |
+
scores_A=dataA[task][0],
|
259 |
+
scores_B=dataB[task][0]
|
260 |
+
)
|
261 |
elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
|
262 |
+
p_value, delta = compute_significance_bootstrap(
|
263 |
+
scores_A=np.array(dataA[task][0]),
|
264 |
+
scores_B=np.array(dataB[task][0])
|
265 |
+
)
|
266 |
else:
|
267 |
raise ValueError(f"Unsupported metric {metricA}")
|
268 |
if delta <= 0:
|
|
|
280 |
parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
|
281 |
parser.add_argument("--modelA", help="ModelA JSON file from lm harness.")
|
282 |
parser.add_argument("--modelB", help="ModelB JSON file from lm harness.")
|
283 |
+
parser.add_argument(
|
284 |
+
"--significance_level",
|
285 |
+
type=float,
|
286 |
+
default=0.05,
|
287 |
+
help="Significance level (e.g., 0.05)",
|
288 |
+
)
|
289 |
args = parser.parse_args()
|
290 |
|
291 |
result = check_significance(args.modelA, args.modelB, args.significance_level)
|
server.py
CHANGED
@@ -18,12 +18,17 @@ REPO = f"{ORG}/LLM_benchmark_data"
|
|
18 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
19 |
TASKS_METADATA_PATH = "./tasks_metadata.json"
|
20 |
|
|
|
21 |
class LeaderboardServer:
|
22 |
def __init__(self):
|
23 |
self.server_address = REPO
|
24 |
self.repo_type = "dataset"
|
25 |
-
self.local_leaderboard = snapshot_download(
|
26 |
-
|
|
|
|
|
|
|
|
|
27 |
self.submisssion_id_to_file = {} # Map submission ids to file paths
|
28 |
self.tasks_metadata = json.load(open(TASKS_METADATA_PATH))
|
29 |
self.tasks_categories = {self.tasks_metadata[task]["category"] for task in self.tasks_metadata}
|
@@ -33,8 +38,12 @@ class LeaderboardServer:
|
|
33 |
self.pre_submit = None
|
34 |
|
35 |
def update_leaderboard(self):
|
36 |
-
self.local_leaderboard = snapshot_download(
|
37 |
-
|
|
|
|
|
|
|
|
|
38 |
self.fetch_existing_models()
|
39 |
self.tournament_results = self.load_tournament_results()
|
40 |
|
@@ -96,17 +105,27 @@ class LeaderboardServer:
|
|
96 |
else:
|
97 |
processed_results.append(local_results)
|
98 |
dataframe = pd.DataFrame.from_records(processed_results)
|
99 |
-
df_order = (
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
dataframe = dataframe[df_order]
|
103 |
-
dataframe = dataframe.rename(
|
|
|
|
|
104 |
return dataframe
|
105 |
|
106 |
def start_tournament(self, new_model_id, new_model_file):
|
107 |
new_tournament = copy.deepcopy(self.tournament_results)
|
108 |
new_tournament[new_model_id] = {}
|
109 |
-
new_tournament[new_model_id][new_model_id] = {
|
|
|
|
|
110 |
|
111 |
for model in self.submission_ids:
|
112 |
res = check_significance(new_model_file, self.submisssion_id_to_file[model])
|
@@ -124,7 +143,7 @@ class LeaderboardServer:
|
|
124 |
data = json.load(f)
|
125 |
data["metadata"] = metadata
|
126 |
with open(file, "w") as f:
|
127 |
-
json.dump(data, f, separators=(',', ':'))
|
128 |
|
129 |
model_id = metadata["team_name"] + "_" + metadata["submission_id"]
|
130 |
tournament_results = self.start_tournament(model_id, file)
|
@@ -145,7 +164,7 @@ class LeaderboardServer:
|
|
145 |
# Temporary save tournament results
|
146 |
tournament_results_path = os.path.join(self.local_leaderboard, "tournament.json")
|
147 |
with open(tournament_results_path, "w") as f:
|
148 |
-
json.dump(tournament_results, f, sort_keys=True, indent=2)
|
149 |
|
150 |
api.upload_file(
|
151 |
path_or_fileobj=tournament_results_path,
|
|
|
18 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
19 |
TASKS_METADATA_PATH = "./tasks_metadata.json"
|
20 |
|
21 |
+
|
22 |
class LeaderboardServer:
|
23 |
def __init__(self):
|
24 |
self.server_address = REPO
|
25 |
self.repo_type = "dataset"
|
26 |
+
self.local_leaderboard = snapshot_download(
|
27 |
+
self.server_address,
|
28 |
+
repo_type=self.repo_type,
|
29 |
+
token=HF_TOKEN,
|
30 |
+
local_dir="./",
|
31 |
+
)
|
32 |
self.submisssion_id_to_file = {} # Map submission ids to file paths
|
33 |
self.tasks_metadata = json.load(open(TASKS_METADATA_PATH))
|
34 |
self.tasks_categories = {self.tasks_metadata[task]["category"] for task in self.tasks_metadata}
|
|
|
38 |
self.pre_submit = None
|
39 |
|
40 |
def update_leaderboard(self):
|
41 |
+
self.local_leaderboard = snapshot_download(
|
42 |
+
self.server_address,
|
43 |
+
repo_type=self.repo_type,
|
44 |
+
token=HF_TOKEN,
|
45 |
+
local_dir="./",
|
46 |
+
)
|
47 |
self.fetch_existing_models()
|
48 |
self.tournament_results = self.load_tournament_results()
|
49 |
|
|
|
105 |
else:
|
106 |
processed_results.append(local_results)
|
107 |
dataframe = pd.DataFrame.from_records(processed_results)
|
108 |
+
df_order = (
|
109 |
+
["submission_id"]
|
110 |
+
+ list(self.tasks_metadata.keys())
|
111 |
+
+ [
|
112 |
+
col
|
113 |
+
for col in dataframe.columns
|
114 |
+
if col != "submission_id" and col not in self.tasks_metadata.keys()
|
115 |
+
]
|
116 |
+
)
|
117 |
dataframe = dataframe[df_order]
|
118 |
+
dataframe = dataframe.rename(
|
119 |
+
columns={key: value["name"] for key, value in self.tasks_metadata.items()}
|
120 |
+
)
|
121 |
return dataframe
|
122 |
|
123 |
def start_tournament(self, new_model_id, new_model_file):
|
124 |
new_tournament = copy.deepcopy(self.tournament_results)
|
125 |
new_tournament[new_model_id] = {}
|
126 |
+
new_tournament[new_model_id][new_model_id] = {
|
127 |
+
task: False for task in self.tasks_metadata.keys()
|
128 |
+
}
|
129 |
|
130 |
for model in self.submission_ids:
|
131 |
res = check_significance(new_model_file, self.submisssion_id_to_file[model])
|
|
|
143 |
data = json.load(f)
|
144 |
data["metadata"] = metadata
|
145 |
with open(file, "w") as f:
|
146 |
+
json.dump(data, f, separators=(',', ':')) # compact JSON
|
147 |
|
148 |
model_id = metadata["team_name"] + "_" + metadata["submission_id"]
|
149 |
tournament_results = self.start_tournament(model_id, file)
|
|
|
164 |
# Temporary save tournament results
|
165 |
tournament_results_path = os.path.join(self.local_leaderboard, "tournament.json")
|
166 |
with open(tournament_results_path, "w") as f:
|
167 |
+
json.dump(tournament_results, f, sort_keys=True, indent=2) # readable JSON
|
168 |
|
169 |
api.upload_file(
|
170 |
path_or_fileobj=tournament_results_path,
|