do not display incomplete models for now
app.py CHANGED
@@ -93,6 +93,21 @@ if not IS_PUBLIC:
 EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
 EVAL_TYPES = ["markdown", "str", "bool", "bool", "bool", "str"]
 
+BENCHMARK_COLS = [
+    "ARC (25-shot) ⬆️",
+    "HellaSwag (10-shot) ⬆️",
+    "MMLU (5-shot) ⬆️",
+    "TruthfulQA (0-shot) ⬆️",
+]
+
+
+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)
+
+
+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
+
 
 def get_leaderboard():
     if repo:
@@ -125,11 +140,22 @@ def get_leaderboard():
     }
     all_data.append(gpt35_values)
 
-
-
-
-
-
+    df = pd.DataFrame.from_records(all_data)
+    df = df.sort_values(by=["Average ⬆️"], ascending=False)
+    df = df[COLS]
+
+    # get incomplete models
+    incomplete_models = df[has_nan_values(df, BENCHMARK_COLS)]["Model"].tolist()
+    print(
+        [
+            model.split(" style")[0].split("https://huggingface.co/")[1]
+            for model in incomplete_models
+        ]
+    )
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, BENCHMARK_COLS)]
+    return df
 
 
 def get_eval_table():
@@ -144,7 +170,7 @@ def get_eval_table():
     all_evals = []
 
     for entry in entries:
-        print(entry)
+        # print(entry)
         if ".json" in entry:
             file_path = os.path.join("evals/eval_requests", entry)
             with open(file_path) as fp:
@@ -171,12 +197,17 @@ def get_eval_table():
             data["model"] = make_clickable_model(data["model"])
             all_evals.append(data)
 
-
-
+    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
+    df_pending = pd.DataFrame.from_records(pending_list)
+    df_running = pd.DataFrame.from_records(running_list)
+    df_finished = pd.DataFrame.from_records(finished_list)
+    return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
 
 
 leaderboard = get_leaderboard()
-
+finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
 
 
 def is_model_on_hub(model_name, revision) -> bool:
@@ -237,7 +268,7 @@ def add_new_eval(
     if out_path.lower() in requested_models:
         duplicate_request_message = "This model has been already submitted."
         return f"<p style='color: orange; font-size: 20px; text-align: center;'>{duplicate_request_message}</p>"
-
+
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
     LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
@@ -256,7 +287,10 @@
 
 
 def refresh():
-
+    leaderboard = get_leaderboard()
+    finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
+    get_leaderboard(), get_eval_table()
+    return leaderboard, finished_eval_queue, running_eval_queue, pending_eval_queue
 
 
 block = gr.Blocks()
@@ -289,16 +323,43 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
 
 """
     )
-    with gr.Accordion("
+    with gr.Accordion("Finished Evaluations", open=False):
+        with gr.Row():
+            finished_eval_table = gr.components.Dataframe(
+                value=finished_eval_queue,
+                headers=EVAL_COLS,
+                datatype=EVAL_TYPES,
+                max_rows=5,
+            )
+    with gr.Accordion("Running Evaluation Queue", open=False):
         with gr.Row():
-
-                value=
+            running_eval_table = gr.components.Dataframe(
+                value=running_eval_queue,
+                headers=EVAL_COLS,
+                datatype=EVAL_TYPES,
+                max_rows=5,
+            )
+
+    with gr.Accordion("Running & Pending Evaluation Queue", open=False):
+        with gr.Row():
+            pending_eval_table = gr.components.Dataframe(
+                value=pending_eval_queue,
+                headers=EVAL_COLS,
+                datatype=EVAL_TYPES,
+                max_rows=5,
            )
 
     with gr.Row():
         refresh_button = gr.Button("Refresh")
         refresh_button.click(
-            refresh,
+            refresh,
+            inputs=[],
+            outputs=[
+                leaderboard_table,
+                finished_eval_table,
+                running_eval_table,
+                pending_eval_table,
+            ],
         )
 
     with gr.Accordion("Submit a new model for evaluation"):
@@ -332,5 +393,14 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
             submission_result,
         )
 
-    block.load(
+    block.load(
+        refresh,
+        inputs=[],
+        outputs=[
+            leaderboard_table,
+            finished_eval_table,
+            running_eval_table,
+            pending_eval_table,
+        ],
+    )
 block.launch()
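For reference, a minimal sketch (not the Space's code) of the filtering behaviour this commit introduces: has_no_nan_values and has_nan_values split the leaderboard DataFrame on missing benchmark scores, so a model with any missing result is logged but no longer displayed. BENCHMARK_COLS and the "Model" column name are taken from the diff above; the sample rows below are invented for illustration.

import numpy as np
import pandas as pd

BENCHMARK_COLS = [
    "ARC (25-shot) ⬆️",
    "HellaSwag (10-shot) ⬆️",
    "MMLU (5-shot) ⬆️",
    "TruthfulQA (0-shot) ⬆️",
]


def has_no_nan_values(df, columns):
    # True for rows where every benchmark column has a value
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    # True for rows where at least one benchmark column is missing
    return df[columns].isna().any(axis=1)


# Invented example data: model-b is still missing its HellaSwag score.
df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "ARC (25-shot) ⬆️": [61.0, 55.2],
        "HellaSwag (10-shot) ⬆️": [84.2, np.nan],
        "MMLU (5-shot) ⬆️": [58.9, 47.3],
        "TruthfulQA (0-shot) ⬆️": [41.7, 38.0],
    }
)

incomplete_models = df[has_nan_values(df, BENCHMARK_COLS)]["Model"].tolist()
print(incomplete_models)  # ['model-b'] is logged, but hidden from the leaderboard

df = df[has_no_nan_values(df, BENCHMARK_COLS)]
print(df["Model"].tolist())  # ['model-a'], only fully evaluated models remain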
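The refresh wiring follows a common Gradio Blocks pattern: a single callback returns one value per output component, and the same callback is bound both to block.load (populate the tables when the page opens) and to the Refresh button. Below is a self-contained sketch of that pattern, assuming the Gradio 3.x API; the placeholder DataFrames stand in for the Space's real get_leaderboard() and get_eval_table() results.

import gradio as gr
import pandas as pd


def refresh():
    # Placeholder data; the Space would rebuild these from its evals repos.
    leaderboard = pd.DataFrame({"model": ["model-a"], "Average ⬆️": [61.4]})
    finished = pd.DataFrame({"model": ["model-a"], "status": ["FINISHED"]})
    running = pd.DataFrame({"model": ["model-b"], "status": ["RUNNING"]})
    pending = pd.DataFrame({"model": ["model-c"], "status": ["PENDING"]})
    return leaderboard, finished, running, pending


block = gr.Blocks()
with block:
    leaderboard_table = gr.Dataframe()
    finished_eval_table = gr.Dataframe()
    running_eval_table = gr.Dataframe()
    pending_eval_table = gr.Dataframe()

    refresh_button = gr.Button("Refresh")
    outputs = [leaderboard_table, finished_eval_table, running_eval_table, pending_eval_table]
    # Same callback on click and on page load, so the four tables stay in sync.
    refresh_button.click(refresh, inputs=[], outputs=outputs)
    block.load(refresh, inputs=[], outputs=outputs)

block.launch()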