Spaces:
Running
Running
add new models
Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json
CHANGED
@@ -21,6 +21,28 @@
|
|
21 |
"Total Puzzles": 1000,
|
22 |
"Reason Lens": "1153.83"
|
23 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
{
|
25 |
"Model": "gpt-4o-2024-05-13",
|
26 |
"Mode": "sampling",
|
@@ -230,6 +252,17 @@
|
|
230 |
"Total Puzzles": 1000,
|
231 |
"Reason Lens": "809.95"
|
232 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
233 |
{
|
234 |
"Model": "gemma-2-27b-it@nvidia",
|
235 |
"Mode": "greedy",
|
@@ -252,6 +285,17 @@
|
|
252 |
"Total Puzzles": 1000,
|
253 |
"Reason Lens": "1015.06"
|
254 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
255 |
{
|
256 |
"Model": "reka-core-20240501",
|
257 |
"Mode": "greedy",
|
@@ -263,17 +307,6 @@
|
|
263 |
"Total Puzzles": 1000,
|
264 |
"Reason Lens": "1078.29"
|
265 |
},
|
266 |
-
{
|
267 |
-
"Model": "gemma-2-9b-it",
|
268 |
-
"Mode": "greedy",
|
269 |
-
"Puzzle Acc": "12.90",
|
270 |
-
"Cell Acc": "37.07",
|
271 |
-
"No answer": "0.50",
|
272 |
-
"Easy Puzzle Acc": "42.14",
|
273 |
-
"Hard Puzzle Acc": "1.53",
|
274 |
-
"Total Puzzles": 1000,
|
275 |
-
"Reason Lens": "859.14"
|
276 |
-
},
|
277 |
{
|
278 |
"Model": "gemma-2-9b-it@nvidia",
|
279 |
"Mode": "greedy",
|
@@ -296,6 +329,28 @@
|
|
296 |
"Total Puzzles": 1000,
|
297 |
"Reason Lens": "1216.40"
|
298 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
{
|
300 |
"Model": "Yi-1.5-34B-Chat",
|
301 |
"Mode": "greedy",
|
@@ -329,6 +384,17 @@
|
|
329 |
"Total Puzzles": 1000,
|
330 |
"Reason Lens": "820.66"
|
331 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
{
|
333 |
"Model": "reka-flash-20240226",
|
334 |
"Mode": "greedy",
|
@@ -351,6 +417,17 @@
|
|
351 |
"Total Puzzles": 1000,
|
352 |
"Reason Lens": "1148.16"
|
353 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
354 |
{
|
355 |
"Model": "Qwen2-7B-Instruct",
|
356 |
"Mode": "greedy",
|
|
|
21 |
"Total Puzzles": 1000,
|
22 |
"Reason Lens": "1153.83"
|
23 |
},
|
24 |
+
{
|
25 |
+
"Model": "Llama-3.1-405B-Instruct-Turbo",
|
26 |
+
"Mode": "greedy",
|
27 |
+
"Puzzle Acc": "32.60",
|
28 |
+
"Cell Acc": "45.80",
|
29 |
+
"No answer": "12.50",
|
30 |
+
"Easy Puzzle Acc": "87.14",
|
31 |
+
"Hard Puzzle Acc": "11.39",
|
32 |
+
"Total Puzzles": 1000,
|
33 |
+
"Reason Lens": "314.66"
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"Model": "Llama-3.1-405B-Instruct-Turbo",
|
37 |
+
"Mode": "sampling",
|
38 |
+
"Puzzle Acc": "32.60",
|
39 |
+
"Cell Acc": "47.04",
|
40 |
+
"No answer": "10.80",
|
41 |
+
"Easy Puzzle Acc": "86.07",
|
42 |
+
"Hard Puzzle Acc": "11.81",
|
43 |
+
"Total Puzzles": 1000,
|
44 |
+
"Reason Lens": "439.96"
|
45 |
+
},
|
46 |
{
|
47 |
"Model": "gpt-4o-2024-05-13",
|
48 |
"Mode": "sampling",
|
|
|
252 |
"Total Puzzles": 1000,
|
253 |
"Reason Lens": "809.95"
|
254 |
},
|
255 |
+
{
|
256 |
+
"Model": "Athene-70B",
|
257 |
+
"Mode": "greedy",
|
258 |
+
"Puzzle Acc": "16.70",
|
259 |
+
"Cell Acc": "32.98",
|
260 |
+
"No answer": "21.10",
|
261 |
+
"Easy Puzzle Acc": "52.50",
|
262 |
+
"Hard Puzzle Acc": "2.78",
|
263 |
+
"Total Puzzles": 1000,
|
264 |
+
"Reason Lens": "391.19"
|
265 |
+
},
|
266 |
{
|
267 |
"Model": "gemma-2-27b-it@nvidia",
|
268 |
"Mode": "greedy",
|
|
|
285 |
"Total Puzzles": 1000,
|
286 |
"Reason Lens": "1015.06"
|
287 |
},
|
288 |
+
{
|
289 |
+
"Model": "command-r-plus",
|
290 |
+
"Mode": "greedy",
|
291 |
+
"Puzzle Acc": "13.90",
|
292 |
+
"Cell Acc": "39.01",
|
293 |
+
"No answer": "0.20",
|
294 |
+
"Easy Puzzle Acc": "44.64",
|
295 |
+
"Hard Puzzle Acc": "1.94",
|
296 |
+
"Total Puzzles": 1000,
|
297 |
+
"Reason Lens": "810.53"
|
298 |
+
},
|
299 |
{
|
300 |
"Model": "reka-core-20240501",
|
301 |
"Mode": "greedy",
|
|
|
307 |
"Total Puzzles": 1000,
|
308 |
"Reason Lens": "1078.29"
|
309 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
{
|
311 |
"Model": "gemma-2-9b-it@nvidia",
|
312 |
"Mode": "greedy",
|
|
|
329 |
"Total Puzzles": 1000,
|
330 |
"Reason Lens": "1216.40"
|
331 |
},
|
332 |
+
{
|
333 |
+
"Model": "Mistral-Nemo-Instruct-2407",
|
334 |
+
"Mode": "greedy",
|
335 |
+
"Puzzle Acc": "11.80",
|
336 |
+
"Cell Acc": "34.93",
|
337 |
+
"No answer": "1.60",
|
338 |
+
"Easy Puzzle Acc": "38.93",
|
339 |
+
"Hard Puzzle Acc": "1.25",
|
340 |
+
"Total Puzzles": 1000,
|
341 |
+
"Reason Lens": "925.88"
|
342 |
+
},
|
343 |
+
{
|
344 |
+
"Model": "Phi-3-mini-4k-instruct",
|
345 |
+
"Mode": "greedy",
|
346 |
+
"Puzzle Acc": "11.60",
|
347 |
+
"Cell Acc": "13.50",
|
348 |
+
"No answer": "59.00",
|
349 |
+
"Easy Puzzle Acc": "38.21",
|
350 |
+
"Hard Puzzle Acc": "1.25",
|
351 |
+
"Total Puzzles": 1000,
|
352 |
+
"Reason Lens": "790.29"
|
353 |
+
},
|
354 |
{
|
355 |
"Model": "Yi-1.5-34B-Chat",
|
356 |
"Mode": "greedy",
|
|
|
384 |
"Total Puzzles": 1000,
|
385 |
"Reason Lens": "820.66"
|
386 |
},
|
387 |
+
{
|
388 |
+
"Model": "command-r",
|
389 |
+
"Mode": "greedy",
|
390 |
+
"Puzzle Acc": "9.90",
|
391 |
+
"Cell Acc": "32.66",
|
392 |
+
"No answer": "1.50",
|
393 |
+
"Easy Puzzle Acc": "32.14",
|
394 |
+
"Hard Puzzle Acc": "1.25",
|
395 |
+
"Total Puzzles": 1000,
|
396 |
+
"Reason Lens": "1005.17"
|
397 |
+
},
|
398 |
{
|
399 |
"Model": "reka-flash-20240226",
|
400 |
"Mode": "greedy",
|
|
|
417 |
"Total Puzzles": 1000,
|
418 |
"Reason Lens": "1148.16"
|
419 |
},
|
420 |
+
{
|
421 |
+
"Model": "Mixtral-8x7B-Instruct-v0.1",
|
422 |
+
"Mode": "greedy",
|
423 |
+
"Puzzle Acc": "8.70",
|
424 |
+
"Cell Acc": "26.47",
|
425 |
+
"No answer": "20.30",
|
426 |
+
"Easy Puzzle Acc": "28.93",
|
427 |
+
"Hard Puzzle Acc": "0.83",
|
428 |
+
"Total Puzzles": 1000,
|
429 |
+
"Reason Lens": "1177.21"
|
430 |
+
},
|
431 |
{
|
432 |
"Model": "Qwen2-7B-Instruct",
|
433 |
"Mode": "greedy",
|
model_info.json
CHANGED
@@ -64,5 +64,9 @@
|
|
64 |
"SELM-Llama-3-8B-Instruct-iter-3": {"pretty_name": "SELM (Llama3-8B-Inst-iter3)", "hf_model_id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3"},
|
65 |
"nemotron-4-340b-instruct": {"pretty_name": "Nemotron-4-340B-Instruct", "hf_model_id": "nvidia/Nemotron-4-340B-Instruct"},
|
66 |
"Llama-3-8B-Magpie-Align-v0.1": {"pretty_name": "Magpie-8B-Align-v0.1", "hf_model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1"},
|
67 |
-
"mathstral-7B-v0.1":{"pretty_name": "mathstral-7B-v0.1", "hf_model_id": "mistralai/mathstral-7B-v0.1"}
|
|
|
|
|
|
|
|
|
68 |
}
|
|
|
64 |
"SELM-Llama-3-8B-Instruct-iter-3": {"pretty_name": "SELM (Llama3-8B-Inst-iter3)", "hf_model_id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3"},
|
65 |
"nemotron-4-340b-instruct": {"pretty_name": "Nemotron-4-340B-Instruct", "hf_model_id": "nvidia/Nemotron-4-340B-Instruct"},
|
66 |
"Llama-3-8B-Magpie-Align-v0.1": {"pretty_name": "Magpie-8B-Align-v0.1", "hf_model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1"},
|
67 |
+
"mathstral-7B-v0.1":{"pretty_name": "mathstral-7B-v0.1 🚨", "hf_model_id": "mistralai/mathstral-7B-v0.1"},
|
68 |
+
"Llama-3.1-405B-Instruct-Turbo": {"pretty_name": "Llama-3.1-405B-Instruct-Turbo 🚨", "hf_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct"},
|
69 |
+
"Mistral-Nemo-Instruct-2407": {"pretty_name": "Mistral-Nemo-Inst (12B) 🚨", "hf_model_id": "Mistral-Nemo-Instruct-2407"},
|
70 |
+
"Phi-3-mini-4k-instruct": {"pretty_name": "Phi-3-mini-4k-instruct 🚨", "hf_model_id": "microsoft/Phi-3-mini-4k-instruct"},
|
71 |
+
"Athene-70B": {"pretty_name": "Athene-70B 🚨", "hf_model_id": "Nexusflow/Athene-70B"}
|
72 |
}
|