yuchenlin commited on
Commit
29abfee
·
1 Parent(s): b2c3610

using new columns

Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json CHANGED
@@ -1,4 +1,38 @@
1
  [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  {
3
  "Model": "o1-preview-2024-09-12",
4
  "Mode": "greedy",
@@ -7,8 +41,65 @@
7
  "No answer": "0.30",
8
  "Easy Puzzle Acc": "98.57",
9
  "Hard Puzzle Acc": "60.83",
 
 
 
 
10
  "Total Puzzles": 1000,
11
- "Reason Lens": "1565.88"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  },
13
  {
14
  "Model": "o1-mini-2024-09-12",
@@ -18,30 +109,65 @@
18
  "No answer": "0.80",
19
  "Easy Puzzle Acc": "87.14",
20
  "Hard Puzzle Acc": "39.17",
 
 
 
 
21
  "Total Puzzles": 1000,
22
- "Reason Lens": "993.28"
 
 
23
  },
24
  {
25
- "Model": "claude-3-5-sonnet-20240620",
26
  "Mode": "greedy",
27
- "Puzzle Acc": "33.40",
28
- "Cell Acc": "54.34",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  "No answer": "0.00",
30
- "Easy Puzzle Acc": "87.50",
31
- "Hard Puzzle Acc": "12.36",
 
 
 
 
32
  "Total Puzzles": 1000,
33
- "Reason Lens": "1141.94"
 
 
34
  },
35
  {
36
  "Model": "claude-3-5-sonnet-20240620",
37
- "Mode": "sampling",
38
  "Puzzle Acc": "33.40",
39
- "Cell Acc": "53.01",
40
- "No answer": "0.10",
41
- "Easy Puzzle Acc": "88.21",
42
- "Hard Puzzle Acc": "12.08",
 
 
 
 
43
  "Total Puzzles": 1000,
44
- "Reason Lens": "1153.83"
 
 
45
  },
46
  {
47
  "Model": "Llama-3.1-405B-Inst-fp8@together",
@@ -51,19 +177,14 @@
51
  "No answer": "12.50",
52
  "Easy Puzzle Acc": "87.14",
53
  "Hard Puzzle Acc": "11.39",
 
 
 
 
54
  "Total Puzzles": 1000,
55
- "Reason Lens": "314.66"
56
- },
57
- {
58
- "Model": "Llama-3.1-405B-Inst-fp8@together",
59
- "Mode": "sampling",
60
- "Puzzle Acc": "32.60",
61
- "Cell Acc": "47.04",
62
- "No answer": "10.80",
63
- "Easy Puzzle Acc": "86.07",
64
- "Hard Puzzle Acc": "11.81",
65
- "Total Puzzles": 1000,
66
- "Reason Lens": "439.96"
67
  },
68
  {
69
  "Model": "gpt-4o-2024-08-06",
@@ -73,19 +194,14 @@
73
  "No answer": "3.60",
74
  "Easy Puzzle Acc": "84.64",
75
  "Hard Puzzle Acc": "11.11",
 
 
 
 
76
  "Total Puzzles": 1000,
77
- "Reason Lens": "1106.51"
78
- },
79
- {
80
- "Model": "gpt-4o-2024-05-13",
81
- "Mode": "sampling",
82
- "Puzzle Acc": "30.80",
83
- "Cell Acc": "46.19",
84
- "No answer": "6.60",
85
- "Easy Puzzle Acc": "81.07",
86
- "Hard Puzzle Acc": "11.25",
87
- "Total Puzzles": 1000,
88
- "Reason Lens": "1549.74"
89
  },
90
  {
91
  "Model": "gemini-1.5-pro-exp-0827",
@@ -95,8 +211,14 @@
95
  "No answer": "0.80",
96
  "Easy Puzzle Acc": "79.64",
97
  "Hard Puzzle Acc": "11.39",
 
 
 
 
98
  "Total Puzzles": 1000,
99
- "Reason Lens": "1594.47"
 
 
100
  },
101
  {
102
  "Model": "Llama-3.1-405B-Inst@sambanova",
@@ -106,8 +228,14 @@
106
  "No answer": "24.70",
107
  "Easy Puzzle Acc": "84.64",
108
  "Hard Puzzle Acc": "8.89",
 
 
 
 
109
  "Total Puzzles": 1000,
110
- "Reason Lens": "2001.12"
 
 
111
  },
112
  {
113
  "Model": "chatgpt-4o-latest-24-09-07",
@@ -117,8 +245,14 @@
117
  "No answer": "4.20",
118
  "Easy Puzzle Acc": "81.43",
119
  "Hard Puzzle Acc": "9.86",
 
 
 
 
120
  "Total Puzzles": 1000,
121
- "Reason Lens": "1539.99"
 
 
122
  },
123
  {
124
  "Model": "Mistral-Large-2",
@@ -128,8 +262,14 @@
128
  "No answer": "1.70",
129
  "Easy Puzzle Acc": "80.36",
130
  "Hard Puzzle Acc": "9.03",
 
 
 
 
131
  "Total Puzzles": 1000,
132
- "Reason Lens": "1592.39"
 
 
133
  },
134
  {
135
  "Model": "gpt-4-turbo-2024-04-09",
@@ -139,8 +279,14 @@
139
  "No answer": "0.10",
140
  "Easy Puzzle Acc": "80.71",
141
  "Hard Puzzle Acc": "8.06",
 
 
 
 
142
  "Total Puzzles": 1000,
143
- "Reason Lens": "1148.46"
 
 
144
  },
145
  {
146
  "Model": "gpt-4o-2024-05-13",
@@ -150,8 +296,31 @@
150
  "No answer": "19.30",
151
  "Easy Puzzle Acc": "77.86",
152
  "Hard Puzzle Acc": "8.89",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  "Total Puzzles": 1000,
154
- "Reason Lens": "1643.51"
 
 
155
  },
156
  {
157
  "Model": "gpt-4-0314",
@@ -161,8 +330,14 @@
161
  "No answer": "0.20",
162
  "Easy Puzzle Acc": "77.14",
163
  "Hard Puzzle Acc": "7.64",
 
 
 
 
164
  "Total Puzzles": 1000,
165
- "Reason Lens": "1203.17"
 
 
166
  },
167
  {
168
  "Model": "claude-3-opus-20240229",
@@ -172,8 +347,14 @@
172
  "No answer": "0.00",
173
  "Easy Puzzle Acc": "78.21",
174
  "Hard Puzzle Acc": "7.08",
 
 
 
 
175
  "Total Puzzles": 1000,
176
- "Reason Lens": "855.72"
 
 
177
  },
178
  {
179
  "Model": "Qwen2.5-72B-Instruct",
@@ -183,19 +364,14 @@
183
  "No answer": "11.90",
184
  "Easy Puzzle Acc": "76.43",
185
  "Hard Puzzle Acc": "7.22",
 
 
 
 
186
  "Total Puzzles": 1000,
187
- "Reason Lens": "1795.90"
188
- },
189
- {
190
- "Model": "gpt-4-turbo-2024-04-09",
191
- "Mode": "sampling",
192
- "Puzzle Acc": "26.40",
193
- "Cell Acc": "47.93",
194
- "No answer": "0.00",
195
- "Easy Puzzle Acc": "74.29",
196
- "Hard Puzzle Acc": "7.78",
197
- "Total Puzzles": 1000,
198
- "Reason Lens": "1165.90"
199
  },
200
  {
201
  "Model": "Qwen2.5-32B-Instruct",
@@ -205,8 +381,14 @@
205
  "No answer": "6.30",
206
  "Easy Puzzle Acc": "77.50",
207
  "Hard Puzzle Acc": "6.11",
 
 
 
 
208
  "Total Puzzles": 1000,
209
- "Reason Lens": "1333.07"
 
 
210
  },
211
  {
212
  "Model": "gemini-1.5-pro-exp-0801",
@@ -216,8 +398,14 @@
216
  "No answer": "0.00",
217
  "Easy Puzzle Acc": "72.50",
218
  "Hard Puzzle Acc": "6.81",
 
 
 
 
219
  "Total Puzzles": 1000,
220
- "Reason Lens": "1389.75"
 
 
221
  },
222
  {
223
  "Model": "Llama-3.1-405B-Inst@hyperbolic",
@@ -227,8 +415,14 @@
227
  "No answer": "6.25",
228
  "Easy Puzzle Acc": "66.67",
229
  "Hard Puzzle Acc": "15.38",
 
 
 
 
230
  "Total Puzzles": 16,
231
- "Reason Lens": "1517.13"
 
 
232
  },
233
  {
234
  "Model": "gemini-1.5-flash-exp-0827",
@@ -238,8 +432,14 @@
238
  "No answer": "8.50",
239
  "Easy Puzzle Acc": "70.71",
240
  "Hard Puzzle Acc": "7.22",
 
 
 
 
241
  "Total Puzzles": 1000,
242
- "Reason Lens": "1705.11"
 
 
243
  },
244
  {
245
  "Model": "Meta-Llama-3.1-70B-Instruct",
@@ -249,8 +449,14 @@
249
  "No answer": "43.00",
250
  "Easy Puzzle Acc": "73.57",
251
  "Hard Puzzle Acc": "5.97",
 
 
 
 
252
  "Total Puzzles": 1000,
253
- "Reason Lens": "1483.68"
 
 
254
  },
255
  {
256
  "Model": "deepseek-v2-chat-0628",
@@ -260,8 +466,14 @@
260
  "No answer": "5.20",
261
  "Easy Puzzle Acc": "68.57",
262
  "Hard Puzzle Acc": "4.86",
 
 
 
 
263
  "Total Puzzles": 1000,
264
- "Reason Lens": "1260.23"
 
 
265
  },
266
  {
267
  "Model": "deepseek-v2.5-0908",
@@ -271,8 +483,14 @@
271
  "No answer": "12.70",
272
  "Easy Puzzle Acc": "68.21",
273
  "Hard Puzzle Acc": "4.17",
 
 
 
 
274
  "Total Puzzles": 1000,
275
- "Reason Lens": "1294.46"
 
 
276
  },
277
  {
278
  "Model": "Qwen2-72B-Instruct",
@@ -282,8 +500,14 @@
282
  "No answer": "10.20",
283
  "Easy Puzzle Acc": "63.93",
284
  "Hard Puzzle Acc": "4.86",
 
 
 
 
285
  "Total Puzzles": 1000,
286
- "Reason Lens": "1813.82"
 
 
287
  },
288
  {
289
  "Model": "deepseek-v2-coder-0614",
@@ -293,8 +517,14 @@
293
  "No answer": "4.90",
294
  "Easy Puzzle Acc": "64.64",
295
  "Hard Puzzle Acc": "4.17",
 
 
 
 
296
  "Total Puzzles": 1000,
297
- "Reason Lens": "1324.55"
 
 
298
  },
299
  {
300
  "Model": "deepseek-v2-coder-0724",
@@ -304,8 +534,14 @@
304
  "No answer": "3.40",
305
  "Easy Puzzle Acc": "61.79",
306
  "Hard Puzzle Acc": "4.44",
 
 
 
 
307
  "Total Puzzles": 1000,
308
- "Reason Lens": "1230.63"
 
 
309
  },
310
  {
311
  "Model": "gpt-4o-mini-2024-07-18",
@@ -315,19 +551,14 @@
315
  "No answer": "0.10",
316
  "Easy Puzzle Acc": "62.50",
317
  "Hard Puzzle Acc": "3.61",
 
 
 
 
318
  "Total Puzzles": 1000,
319
- "Reason Lens": "943.52"
320
- },
321
- {
322
- "Model": "gemini-1.5-pro",
323
- "Mode": "sampling",
324
- "Puzzle Acc": "19.70",
325
- "Cell Acc": "45.24",
326
- "No answer": "0.40",
327
- "Easy Puzzle Acc": "60.00",
328
- "Hard Puzzle Acc": "4.03",
329
- "Total Puzzles": 1000,
330
- "Reason Lens": "1356.77"
331
  },
332
  {
333
  "Model": "gemini-1.5-flash",
@@ -337,8 +568,14 @@
337
  "No answer": "22.70",
338
  "Easy Puzzle Acc": "59.29",
339
  "Hard Puzzle Acc": "3.89",
 
 
 
 
340
  "Total Puzzles": 1000,
341
- "Reason Lens": "1538.18"
 
 
342
  },
343
  {
344
  "Model": "gemini-1.5-pro",
@@ -348,8 +585,14 @@
348
  "No answer": "0.80",
349
  "Easy Puzzle Acc": "55.71",
350
  "Hard Puzzle Acc": "5.28",
 
 
 
 
351
  "Total Puzzles": 1000,
352
- "Reason Lens": "1336.17"
 
 
353
  },
354
  {
355
  "Model": "yi-large-preview",
@@ -359,8 +602,14 @@
359
  "No answer": "1.40",
360
  "Easy Puzzle Acc": "58.93",
361
  "Hard Puzzle Acc": "3.33",
 
 
 
 
362
  "Total Puzzles": 1000,
363
- "Reason Lens": "833.36"
 
 
364
  },
365
  {
366
  "Model": "yi-large",
@@ -370,41 +619,48 @@
370
  "No answer": "1.80",
371
  "Easy Puzzle Acc": "58.21",
372
  "Hard Puzzle Acc": "3.47",
 
 
 
 
373
  "Total Puzzles": 1000,
374
- "Reason Lens": "757.01"
 
 
375
  },
376
  {
377
- "Model": "claude-3-sonnet-20240229",
378
  "Mode": "greedy",
379
  "Puzzle Acc": "18.70",
380
- "Cell Acc": "43.66",
381
- "No answer": "0.00",
382
- "Easy Puzzle Acc": "58.93",
383
- "Hard Puzzle Acc": "3.06",
 
 
 
 
384
  "Total Puzzles": 1000,
385
- "Reason Lens": "1095.37"
 
 
386
  },
387
  {
388
- "Model": "Qwen2-72B-Instruct",
389
- "Mode": "sampling",
390
  "Puzzle Acc": "18.70",
391
- "Cell Acc": "40.57",
392
- "No answer": "3.20",
393
- "Easy Puzzle Acc": "57.50",
394
- "Hard Puzzle Acc": "3.61",
395
- "Total Puzzles": 1000,
396
- "Reason Lens": "1894.72"
397
- },
398
- {
399
- "Model": "gemini-1.5-flash",
400
- "Mode": "sampling",
401
- "Puzzle Acc": "18.40",
402
- "Cell Acc": "36.03",
403
- "No answer": "12.80",
404
- "Easy Puzzle Acc": "57.86",
405
  "Hard Puzzle Acc": "3.06",
 
 
 
 
406
  "Total Puzzles": 1000,
407
- "Reason Lens": "1713.03"
 
 
408
  },
409
  {
410
  "Model": "Meta-Llama-3-70B-Instruct",
@@ -414,8 +670,14 @@
414
  "No answer": "0.20",
415
  "Easy Puzzle Acc": "52.86",
416
  "Hard Puzzle Acc": "2.78",
 
 
 
 
417
  "Total Puzzles": 1000,
418
- "Reason Lens": "809.95"
 
 
419
  },
420
  {
421
  "Model": "Athene-70B",
@@ -425,8 +687,14 @@
425
  "No answer": "21.10",
426
  "Easy Puzzle Acc": "52.50",
427
  "Hard Puzzle Acc": "2.78",
 
 
 
 
428
  "Total Puzzles": 1000,
429
- "Reason Lens": "391.19"
 
 
430
  },
431
  {
432
  "Model": "gemma-2-27b-it",
@@ -436,8 +704,14 @@
436
  "No answer": "1.10",
437
  "Easy Puzzle Acc": "50.71",
438
  "Hard Puzzle Acc": "2.92",
 
 
 
 
439
  "Total Puzzles": 1000,
440
- "Reason Lens": "1014.56"
 
 
441
  },
442
  {
443
  "Model": "claude-3-haiku-20240307",
@@ -447,8 +721,14 @@
447
  "No answer": "0.10",
448
  "Easy Puzzle Acc": "47.86",
449
  "Hard Puzzle Acc": "1.25",
 
 
 
 
450
  "Total Puzzles": 1000,
451
- "Reason Lens": "1015.06"
 
 
452
  },
453
  {
454
  "Model": "command-r-plus",
@@ -458,8 +738,14 @@
458
  "No answer": "0.20",
459
  "Easy Puzzle Acc": "44.64",
460
  "Hard Puzzle Acc": "1.94",
 
 
 
 
461
  "Total Puzzles": 1000,
462
- "Reason Lens": "810.53"
 
 
463
  },
464
  {
465
  "Model": "reka-core-20240501",
@@ -469,19 +755,14 @@
469
  "No answer": "4.00",
470
  "Easy Puzzle Acc": "43.21",
471
  "Hard Puzzle Acc": "1.25",
 
 
 
 
472
  "Total Puzzles": 1000,
473
- "Reason Lens": "1078.29"
474
- },
475
- {
476
- "Model": "Meta-Llama-3.1-8B-Instruct",
477
- "Mode": "greedy",
478
- "Puzzle Acc": "12.80",
479
- "Cell Acc": "13.68",
480
- "No answer": "61.50",
481
- "Easy Puzzle Acc": "43.57",
482
- "Hard Puzzle Acc": "0.83",
483
- "Total Puzzles": 1000,
484
- "Reason Lens": "1043.90"
485
  },
486
  {
487
  "Model": "gemma-2-9b-it",
@@ -491,8 +772,31 @@
491
  "No answer": "0.00",
492
  "Easy Puzzle Acc": "41.79",
493
  "Hard Puzzle Acc": "1.53",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
494
  "Total Puzzles": 1000,
495
- "Reason Lens": "849.84"
 
 
496
  },
497
  {
498
  "Model": "Qwen2.5-7B-Instruct",
@@ -502,8 +806,14 @@
502
  "No answer": "9.50",
503
  "Easy Puzzle Acc": "38.93",
504
  "Hard Puzzle Acc": "1.53",
 
 
 
 
505
  "Total Puzzles": 1000,
506
- "Reason Lens": "850.93"
 
 
507
  },
508
  {
509
  "Model": "Meta-Llama-3-8B-Instruct",
@@ -513,8 +823,14 @@
513
  "No answer": "29.20",
514
  "Easy Puzzle Acc": "40.71",
515
  "Hard Puzzle Acc": "0.69",
 
 
 
 
516
  "Total Puzzles": 1000,
517
- "Reason Lens": "1216.40"
 
 
518
  },
519
  {
520
  "Model": "Mistral-Nemo-Instruct-2407",
@@ -524,8 +840,14 @@
524
  "No answer": "1.60",
525
  "Easy Puzzle Acc": "38.93",
526
  "Hard Puzzle Acc": "1.25",
 
 
 
 
527
  "Total Puzzles": 1000,
528
- "Reason Lens": "925.88"
 
 
529
  },
530
  {
531
  "Model": "Phi-3-mini-4k-instruct",
@@ -535,8 +857,14 @@
535
  "No answer": "59.00",
536
  "Easy Puzzle Acc": "38.21",
537
  "Hard Puzzle Acc": "1.25",
 
 
 
 
538
  "Total Puzzles": 1000,
539
- "Reason Lens": "790.29"
 
 
540
  },
541
  {
542
  "Model": "Yi-1.5-34B-Chat",
@@ -546,19 +874,14 @@
546
  "No answer": "4.40",
547
  "Easy Puzzle Acc": "37.50",
548
  "Hard Puzzle Acc": "1.39",
 
 
 
 
549
  "Total Puzzles": 1000,
550
- "Reason Lens": "869.65"
551
- },
552
- {
553
- "Model": "Meta-Llama-3-8B-Instruct",
554
- "Mode": "sampling",
555
- "Puzzle Acc": "11.00",
556
- "Cell Acc": "26.11",
557
- "No answer": "22.30",
558
- "Easy Puzzle Acc": "36.79",
559
- "Hard Puzzle Acc": "0.97",
560
- "Total Puzzles": 1000,
561
- "Reason Lens": "1282.40"
562
  },
563
  {
564
  "Model": "gpt-3.5-turbo-0125",
@@ -568,8 +891,14 @@
568
  "No answer": "0.10",
569
  "Easy Puzzle Acc": "33.57",
570
  "Hard Puzzle Acc": "0.97",
 
 
 
 
571
  "Total Puzzles": 1000,
572
- "Reason Lens": "820.66"
 
 
573
  },
574
  {
575
  "Model": "command-r",
@@ -579,8 +908,14 @@
579
  "No answer": "1.50",
580
  "Easy Puzzle Acc": "32.14",
581
  "Hard Puzzle Acc": "1.25",
 
 
 
 
582
  "Total Puzzles": 1000,
583
- "Reason Lens": "1005.17"
 
 
584
  },
585
  {
586
  "Model": "reka-flash-20240226",
@@ -590,8 +925,14 @@
590
  "No answer": "18.70",
591
  "Easy Puzzle Acc": "30.71",
592
  "Hard Puzzle Acc": "0.97",
 
 
 
 
593
  "Total Puzzles": 1000,
594
- "Reason Lens": "1074.80"
 
 
595
  },
596
  {
597
  "Model": "mathstral-7B-v0.1",
@@ -601,8 +942,14 @@
601
  "No answer": "36.00",
602
  "Easy Puzzle Acc": "30.00",
603
  "Hard Puzzle Acc": "0.83",
 
 
 
 
604
  "Total Puzzles": 1000,
605
- "Reason Lens": "1148.16"
 
 
606
  },
607
  {
608
  "Model": "Mixtral-8x7B-Instruct-v0.1",
@@ -612,8 +959,14 @@
612
  "No answer": "20.30",
613
  "Easy Puzzle Acc": "28.93",
614
  "Hard Puzzle Acc": "0.83",
 
 
 
 
615
  "Total Puzzles": 1000,
616
- "Reason Lens": "1177.21"
 
 
617
  },
618
  {
619
  "Model": "Qwen2-7B-Instruct",
@@ -623,8 +976,31 @@
623
  "No answer": "24.40",
624
  "Easy Puzzle Acc": "29.29",
625
  "Hard Puzzle Acc": "0.28",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
626
  "Total Puzzles": 1000,
627
- "Reason Lens": "1473.23"
 
 
628
  },
629
  {
630
  "Model": "Phi-3.5-mini-instruct",
@@ -634,8 +1010,14 @@
634
  "No answer": "80.60",
635
  "Easy Puzzle Acc": "21.79",
636
  "Hard Puzzle Acc": "0.42",
 
 
 
 
637
  "Total Puzzles": 1000,
638
- "Reason Lens": "718.43"
 
 
639
  },
640
  {
641
  "Model": "Qwen2.5-3B-Instruct",
@@ -645,8 +1027,14 @@
645
  "No answer": "56.70",
646
  "Easy Puzzle Acc": "17.14",
647
  "Hard Puzzle Acc": "0.00",
 
 
 
 
648
  "Total Puzzles": 1000,
649
- "Reason Lens": "906.58"
 
 
650
  },
651
  {
652
  "Model": "gemma-2-2b-it",
@@ -656,8 +1044,14 @@
656
  "No answer": "57.20",
657
  "Easy Puzzle Acc": "14.29",
658
  "Hard Puzzle Acc": "0.28",
 
 
 
 
659
  "Total Puzzles": 1000,
660
- "Reason Lens": "1032.89"
 
 
661
  },
662
  {
663
  "Model": "Yi-1.5-9B-Chat",
@@ -667,7 +1061,13 @@
667
  "No answer": "11.30",
668
  "Easy Puzzle Acc": "8.21",
669
  "Hard Puzzle Acc": "0.00",
 
 
 
 
670
  "Total Puzzles": 1000,
671
- "Reason Lens": "1592.60"
 
 
672
  }
673
  ]
 
1
  [
2
+ {
3
+ "Model": "o1-2024-12-17",
4
+ "Mode": "greedy",
5
+ "Puzzle Acc": "81.00",
6
+ "Cell Acc": "78.74",
7
+ "No answer": "0.20",
8
+ "Easy Puzzle Acc": "98.21",
9
+ "Hard Puzzle Acc": "74.31",
10
+ "Small Puzzle Acc": "97.19",
11
+ "Medium Puzzle Acc": "92.14",
12
+ "Large Puzzle Acc": "78.00",
13
+ "XL Puzzle Acc": "42.50",
14
+ "Total Puzzles": 1000,
15
+ "Reason Lens": "1197.51",
16
+ "N_Mode": "single",
17
+ "N_Size": 1
18
+ },
19
+ {
20
+ "Model": "deepseek-R1",
21
+ "Mode": "greedy",
22
+ "Puzzle Acc": "78.70",
23
+ "Cell Acc": "80.54",
24
+ "No answer": "0.00",
25
+ "Easy Puzzle Acc": "98.57",
26
+ "Hard Puzzle Acc": "70.97",
27
+ "Small Puzzle Acc": "98.44",
28
+ "Medium Puzzle Acc": "95.71",
29
+ "Large Puzzle Acc": "73.50",
30
+ "XL Puzzle Acc": "28.50",
31
+ "Total Puzzles": 1000,
32
+ "Reason Lens": "586.33",
33
+ "N_Mode": "single",
34
+ "N_Size": 1
35
+ },
36
  {
37
  "Model": "o1-preview-2024-09-12",
38
  "Mode": "greedy",
 
41
  "No answer": "0.30",
42
  "Easy Puzzle Acc": "98.57",
43
  "Hard Puzzle Acc": "60.83",
44
+ "Small Puzzle Acc": "98.12",
45
+ "Medium Puzzle Acc": "88.21",
46
+ "Large Puzzle Acc": "59.50",
47
+ "XL Puzzle Acc": "17.00",
48
  "Total Puzzles": 1000,
49
+ "Reason Lens": "1565.88",
50
+ "N_Mode": "single",
51
+ "N_Size": 1
52
+ },
53
+ {
54
+ "Model": "o1-preview-2024-09-12-v2",
55
+ "Mode": "greedy",
56
+ "Puzzle Acc": "70.40",
57
+ "Cell Acc": "74.18",
58
+ "No answer": "0.40",
59
+ "Easy Puzzle Acc": "98.21",
60
+ "Hard Puzzle Acc": "59.58",
61
+ "Small Puzzle Acc": "97.81",
62
+ "Medium Puzzle Acc": "88.57",
63
+ "Large Puzzle Acc": "55.50",
64
+ "XL Puzzle Acc": "16.00",
65
+ "Total Puzzles": 1000,
66
+ "Reason Lens": "1559.71",
67
+ "N_Mode": "single",
68
+ "N_Size": 1
69
+ },
70
+ {
71
+ "Model": "o1-mini-2024-09-12-v3",
72
+ "Mode": "greedy",
73
+ "Puzzle Acc": "59.70",
74
+ "Cell Acc": "70.32",
75
+ "No answer": "1.00",
76
+ "Easy Puzzle Acc": "86.07",
77
+ "Hard Puzzle Acc": "49.44",
78
+ "Small Puzzle Acc": "87.50",
79
+ "Medium Puzzle Acc": "76.79",
80
+ "Large Puzzle Acc": "39.00",
81
+ "XL Puzzle Acc": "12.00",
82
+ "Total Puzzles": 1000,
83
+ "Reason Lens": "1166.38",
84
+ "N_Mode": "single",
85
+ "N_Size": 1
86
+ },
87
+ {
88
+ "Model": "o1-mini-2024-09-12-v2",
89
+ "Mode": "greedy",
90
+ "Puzzle Acc": "56.80",
91
+ "Cell Acc": "69.87",
92
+ "No answer": "1.30",
93
+ "Easy Puzzle Acc": "82.86",
94
+ "Hard Puzzle Acc": "46.67",
95
+ "Small Puzzle Acc": "83.44",
96
+ "Medium Puzzle Acc": "76.43",
97
+ "Large Puzzle Acc": "36.00",
98
+ "XL Puzzle Acc": "7.50",
99
+ "Total Puzzles": 1000,
100
+ "Reason Lens": "1164.95",
101
+ "N_Mode": "single",
102
+ "N_Size": 1
103
  },
104
  {
105
  "Model": "o1-mini-2024-09-12",
 
109
  "No answer": "0.80",
110
  "Easy Puzzle Acc": "87.14",
111
  "Hard Puzzle Acc": "39.17",
112
+ "Small Puzzle Acc": "87.81",
113
+ "Medium Puzzle Acc": "67.50",
114
+ "Large Puzzle Acc": "24.50",
115
+ "XL Puzzle Acc": "3.50",
116
  "Total Puzzles": 1000,
117
+ "Reason Lens": "993.28",
118
+ "N_Mode": "single",
119
+ "N_Size": 1
120
  },
121
  {
122
+ "Model": "deepseek-v3",
123
  "Mode": "greedy",
124
+ "Puzzle Acc": "42.10",
125
+ "Cell Acc": "42.04",
126
+ "No answer": "27.90",
127
+ "Easy Puzzle Acc": "90.00",
128
+ "Hard Puzzle Acc": "23.47",
129
+ "Small Puzzle Acc": "85.62",
130
+ "Medium Puzzle Acc": "44.64",
131
+ "Large Puzzle Acc": "10.00",
132
+ "XL Puzzle Acc": "1.00",
133
+ "Total Puzzles": 1000,
134
+ "Reason Lens": "2158.00",
135
+ "N_Mode": "single",
136
+ "N_Size": 1
137
+ },
138
+ {
139
+ "Model": "claude-3-5-sonnet-20241022",
140
+ "Mode": "greedy",
141
+ "Puzzle Acc": "36.20",
142
+ "Cell Acc": "54.27",
143
  "No answer": "0.00",
144
+ "Easy Puzzle Acc": "91.07",
145
+ "Hard Puzzle Acc": "14.86",
146
+ "Small Puzzle Acc": "84.69",
147
+ "Medium Puzzle Acc": "28.93",
148
+ "Large Puzzle Acc": "4.00",
149
+ "XL Puzzle Acc": "1.00",
150
  "Total Puzzles": 1000,
151
+ "Reason Lens": "861.18",
152
+ "N_Mode": "single",
153
+ "N_Size": 1
154
  },
155
  {
156
  "Model": "claude-3-5-sonnet-20240620",
157
+ "Mode": "greedy",
158
  "Puzzle Acc": "33.40",
159
+ "Cell Acc": "54.34",
160
+ "No answer": "0.00",
161
+ "Easy Puzzle Acc": "87.50",
162
+ "Hard Puzzle Acc": "12.36",
163
+ "Small Puzzle Acc": "83.44",
164
+ "Medium Puzzle Acc": "21.79",
165
+ "Large Puzzle Acc": "3.00",
166
+ "XL Puzzle Acc": "0.00",
167
  "Total Puzzles": 1000,
168
+ "Reason Lens": "1141.94",
169
+ "N_Mode": "single",
170
+ "N_Size": 1
171
  },
172
  {
173
  "Model": "Llama-3.1-405B-Inst-fp8@together",
 
177
  "No answer": "12.50",
178
  "Easy Puzzle Acc": "87.14",
179
  "Hard Puzzle Acc": "11.39",
180
+ "Small Puzzle Acc": "81.25",
181
+ "Medium Puzzle Acc": "22.50",
182
+ "Large Puzzle Acc": "1.50",
183
+ "XL Puzzle Acc": "0.00",
184
  "Total Puzzles": 1000,
185
+ "Reason Lens": "314.66",
186
+ "N_Mode": "single",
187
+ "N_Size": 1
 
 
 
 
 
 
 
 
 
188
  },
189
  {
190
  "Model": "gpt-4o-2024-08-06",
 
194
  "No answer": "3.60",
195
  "Easy Puzzle Acc": "84.64",
196
  "Hard Puzzle Acc": "11.11",
197
+ "Small Puzzle Acc": "80.00",
198
+ "Medium Puzzle Acc": "19.64",
199
+ "Large Puzzle Acc": "2.50",
200
+ "XL Puzzle Acc": "0.50",
201
  "Total Puzzles": 1000,
202
+ "Reason Lens": "1106.51",
203
+ "N_Mode": "single",
204
+ "N_Size": 1
 
 
 
 
 
 
 
 
 
205
  },
206
  {
207
  "Model": "gemini-1.5-pro-exp-0827",
 
211
  "No answer": "0.80",
212
  "Easy Puzzle Acc": "79.64",
213
  "Hard Puzzle Acc": "11.39",
214
+ "Small Puzzle Acc": "75.31",
215
+ "Medium Puzzle Acc": "20.71",
216
+ "Large Puzzle Acc": "3.00",
217
+ "XL Puzzle Acc": "0.00",
218
  "Total Puzzles": 1000,
219
+ "Reason Lens": "1594.47",
220
+ "N_Mode": "single",
221
+ "N_Size": 1
222
  },
223
  {
224
  "Model": "Llama-3.1-405B-Inst@sambanova",
 
228
  "No answer": "24.70",
229
  "Easy Puzzle Acc": "84.64",
230
  "Hard Puzzle Acc": "8.89",
231
+ "Small Puzzle Acc": "79.06",
232
+ "Medium Puzzle Acc": "16.43",
233
+ "Large Puzzle Acc": "0.50",
234
+ "XL Puzzle Acc": "0.50",
235
  "Total Puzzles": 1000,
236
+ "Reason Lens": "2001.12",
237
+ "N_Mode": "single",
238
+ "N_Size": 1
239
  },
240
  {
241
  "Model": "chatgpt-4o-latest-24-09-07",
 
245
  "No answer": "4.20",
246
  "Easy Puzzle Acc": "81.43",
247
  "Hard Puzzle Acc": "9.86",
248
+ "Small Puzzle Acc": "76.88",
249
+ "Medium Puzzle Acc": "17.86",
250
+ "Large Puzzle Acc": "1.50",
251
+ "XL Puzzle Acc": "0.00",
252
  "Total Puzzles": 1000,
253
+ "Reason Lens": "1539.99",
254
+ "N_Mode": "single",
255
+ "N_Size": 1
256
  },
257
  {
258
  "Model": "Mistral-Large-2",
 
262
  "No answer": "1.70",
263
  "Easy Puzzle Acc": "80.36",
264
  "Hard Puzzle Acc": "9.03",
265
+ "Small Puzzle Acc": "75.94",
266
+ "Medium Puzzle Acc": "15.00",
267
+ "Large Puzzle Acc": "2.50",
268
+ "XL Puzzle Acc": "0.00",
269
  "Total Puzzles": 1000,
270
+ "Reason Lens": "1592.39",
271
+ "N_Mode": "single",
272
+ "N_Size": 1
273
  },
274
  {
275
  "Model": "gpt-4-turbo-2024-04-09",
 
279
  "No answer": "0.10",
280
  "Easy Puzzle Acc": "80.71",
281
  "Hard Puzzle Acc": "8.06",
282
+ "Small Puzzle Acc": "75.31",
283
+ "Medium Puzzle Acc": "15.00",
284
+ "Large Puzzle Acc": "0.50",
285
+ "XL Puzzle Acc": "0.00",
286
  "Total Puzzles": 1000,
287
+ "Reason Lens": "1148.46",
288
+ "N_Mode": "single",
289
+ "N_Size": 1
290
  },
291
  {
292
  "Model": "gpt-4o-2024-05-13",
 
296
  "No answer": "19.30",
297
  "Easy Puzzle Acc": "77.86",
298
  "Hard Puzzle Acc": "8.89",
299
+ "Small Puzzle Acc": "73.75",
300
+ "Medium Puzzle Acc": "16.43",
301
+ "Large Puzzle Acc": "0.00",
302
+ "XL Puzzle Acc": "0.00",
303
+ "Total Puzzles": 1000,
304
+ "Reason Lens": "1643.51",
305
+ "N_Mode": "single",
306
+ "N_Size": 1
307
+ },
308
+ {
309
+ "Model": "grok-2-1212",
310
+ "Mode": "greedy",
311
+ "Puzzle Acc": "27.70",
312
+ "Cell Acc": "48.16",
313
+ "No answer": "3.50",
314
+ "Easy Puzzle Acc": "76.43",
315
+ "Hard Puzzle Acc": "8.75",
316
+ "Small Puzzle Acc": "71.88",
317
+ "Medium Puzzle Acc": "13.93",
318
+ "Large Puzzle Acc": "4.00",
319
+ "XL Puzzle Acc": "0.00",
320
  "Total Puzzles": 1000,
321
+ "Reason Lens": "2551.39",
322
+ "N_Mode": "single",
323
+ "N_Size": 1
324
  },
325
  {
326
  "Model": "gpt-4-0314",
 
330
  "No answer": "0.20",
331
  "Easy Puzzle Acc": "77.14",
332
  "Hard Puzzle Acc": "7.64",
333
+ "Small Puzzle Acc": "71.25",
334
+ "Medium Puzzle Acc": "13.57",
335
+ "Large Puzzle Acc": "2.50",
336
+ "XL Puzzle Acc": "0.00",
337
  "Total Puzzles": 1000,
338
+ "Reason Lens": "1203.17",
339
+ "N_Mode": "single",
340
+ "N_Size": 1
341
  },
342
  {
343
  "Model": "claude-3-opus-20240229",
 
347
  "No answer": "0.00",
348
  "Easy Puzzle Acc": "78.21",
349
  "Hard Puzzle Acc": "7.08",
350
+ "Small Puzzle Acc": "73.44",
351
+ "Medium Puzzle Acc": "12.14",
352
+ "Large Puzzle Acc": "0.50",
353
+ "XL Puzzle Acc": "0.00",
354
  "Total Puzzles": 1000,
355
+ "Reason Lens": "855.72",
356
+ "N_Mode": "single",
357
+ "N_Size": 1
358
  },
359
  {
360
  "Model": "Qwen2.5-72B-Instruct",
 
364
  "No answer": "11.90",
365
  "Easy Puzzle Acc": "76.43",
366
  "Hard Puzzle Acc": "7.22",
367
+ "Small Puzzle Acc": "72.50",
368
+ "Medium Puzzle Acc": "12.14",
369
+ "Large Puzzle Acc": "0.00",
370
+ "XL Puzzle Acc": "0.00",
371
  "Total Puzzles": 1000,
372
+ "Reason Lens": "1795.90",
373
+ "N_Mode": "single",
374
+ "N_Size": 1
 
 
 
 
 
 
 
 
 
375
  },
376
  {
377
  "Model": "Qwen2.5-32B-Instruct",
 
381
  "No answer": "6.30",
382
  "Easy Puzzle Acc": "77.50",
383
  "Hard Puzzle Acc": "6.11",
384
+ "Small Puzzle Acc": "72.19",
385
+ "Medium Puzzle Acc": "10.36",
386
+ "Large Puzzle Acc": "0.50",
387
+ "XL Puzzle Acc": "0.00",
388
  "Total Puzzles": 1000,
389
+ "Reason Lens": "1333.07",
390
+ "N_Mode": "single",
391
+ "N_Size": 1
392
  },
393
  {
394
  "Model": "gemini-1.5-pro-exp-0801",
 
398
  "No answer": "0.00",
399
  "Easy Puzzle Acc": "72.50",
400
  "Hard Puzzle Acc": "6.81",
401
+ "Small Puzzle Acc": "66.56",
402
+ "Medium Puzzle Acc": "13.93",
403
+ "Large Puzzle Acc": "0.00",
404
+ "XL Puzzle Acc": "0.00",
405
  "Total Puzzles": 1000,
406
+ "Reason Lens": "1389.75",
407
+ "N_Mode": "single",
408
+ "N_Size": 1
409
  },
410
  {
411
  "Model": "Llama-3.1-405B-Inst@hyperbolic",
 
415
  "No answer": "6.25",
416
  "Easy Puzzle Acc": "66.67",
417
  "Hard Puzzle Acc": "15.38",
418
+ "Small Puzzle Acc": "50.00",
419
+ "Medium Puzzle Acc": "33.33",
420
+ "Large Puzzle Acc": "0.00",
421
+ "XL Puzzle Acc": "0.00",
422
  "Total Puzzles": 16,
423
+ "Reason Lens": "1517.13",
424
+ "N_Mode": "single",
425
+ "N_Size": 1
426
  },
427
  {
428
  "Model": "gemini-1.5-flash-exp-0827",
 
432
  "No answer": "8.50",
433
  "Easy Puzzle Acc": "70.71",
434
  "Hard Puzzle Acc": "7.22",
435
+ "Small Puzzle Acc": "65.00",
436
+ "Medium Puzzle Acc": "13.57",
437
+ "Large Puzzle Acc": "2.00",
438
+ "XL Puzzle Acc": "0.00",
439
  "Total Puzzles": 1000,
440
+ "Reason Lens": "1705.11",
441
+ "N_Mode": "single",
442
+ "N_Size": 1
443
  },
444
  {
445
  "Model": "Meta-Llama-3.1-70B-Instruct",
 
449
  "No answer": "43.00",
450
  "Easy Puzzle Acc": "73.57",
451
  "Hard Puzzle Acc": "5.97",
452
+ "Small Puzzle Acc": "67.81",
453
+ "Medium Puzzle Acc": "10.36",
454
+ "Large Puzzle Acc": "1.50",
455
+ "XL Puzzle Acc": "0.00",
456
  "Total Puzzles": 1000,
457
+ "Reason Lens": "1483.68",
458
+ "N_Mode": "single",
459
+ "N_Size": 1
460
  },
461
  {
462
  "Model": "deepseek-v2-chat-0628",
 
466
  "No answer": "5.20",
467
  "Easy Puzzle Acc": "68.57",
468
  "Hard Puzzle Acc": "4.86",
469
+ "Small Puzzle Acc": "63.44",
470
+ "Medium Puzzle Acc": "8.57",
471
+ "Large Puzzle Acc": "0.00",
472
+ "XL Puzzle Acc": "0.00",
473
  "Total Puzzles": 1000,
474
+ "Reason Lens": "1260.23",
475
+ "N_Mode": "single",
476
+ "N_Size": 1
477
  },
478
  {
479
  "Model": "deepseek-v2.5-0908",
 
483
  "No answer": "12.70",
484
  "Easy Puzzle Acc": "68.21",
485
  "Hard Puzzle Acc": "4.17",
486
+ "Small Puzzle Acc": "62.19",
487
+ "Medium Puzzle Acc": "7.86",
488
+ "Large Puzzle Acc": "0.00",
489
+ "XL Puzzle Acc": "0.00",
490
  "Total Puzzles": 1000,
491
+ "Reason Lens": "1294.46",
492
+ "N_Mode": "single",
493
+ "N_Size": 1
494
  },
495
  {
496
  "Model": "Qwen2-72B-Instruct",
 
500
  "No answer": "10.20",
501
  "Easy Puzzle Acc": "63.93",
502
  "Hard Puzzle Acc": "4.86",
503
+ "Small Puzzle Acc": "60.94",
504
+ "Medium Puzzle Acc": "6.79",
505
+ "Large Puzzle Acc": "0.00",
506
+ "XL Puzzle Acc": "0.00",
507
  "Total Puzzles": 1000,
508
+ "Reason Lens": "1813.82",
509
+ "N_Mode": "single",
510
+ "N_Size": 1
511
  },
512
  {
513
  "Model": "deepseek-v2-coder-0614",
 
517
  "No answer": "4.90",
518
  "Easy Puzzle Acc": "64.64",
519
  "Hard Puzzle Acc": "4.17",
520
+ "Small Puzzle Acc": "59.69",
521
+ "Medium Puzzle Acc": "7.14",
522
+ "Large Puzzle Acc": "0.00",
523
+ "XL Puzzle Acc": "0.00",
524
  "Total Puzzles": 1000,
525
+ "Reason Lens": "1324.55",
526
+ "N_Mode": "single",
527
+ "N_Size": 1
528
  },
529
  {
530
  "Model": "deepseek-v2-coder-0724",
 
534
  "No answer": "3.40",
535
  "Easy Puzzle Acc": "61.79",
536
  "Hard Puzzle Acc": "4.44",
537
+ "Small Puzzle Acc": "57.50",
538
+ "Medium Puzzle Acc": "7.14",
539
+ "Large Puzzle Acc": "0.50",
540
+ "XL Puzzle Acc": "0.00",
541
  "Total Puzzles": 1000,
542
+ "Reason Lens": "1230.63",
543
+ "N_Mode": "single",
544
+ "N_Size": 1
545
  },
546
  {
547
  "Model": "gpt-4o-mini-2024-07-18",
 
551
  "No answer": "0.10",
552
  "Easy Puzzle Acc": "62.50",
553
  "Hard Puzzle Acc": "3.61",
554
+ "Small Puzzle Acc": "58.75",
555
+ "Medium Puzzle Acc": "4.64",
556
+ "Large Puzzle Acc": "0.00",
557
+ "XL Puzzle Acc": "0.00",
558
  "Total Puzzles": 1000,
559
+ "Reason Lens": "943.52",
560
+ "N_Mode": "single",
561
+ "N_Size": 1
 
 
 
 
 
 
 
 
 
562
  },
563
  {
564
  "Model": "gemini-1.5-flash",
 
568
  "No answer": "22.70",
569
  "Easy Puzzle Acc": "59.29",
570
  "Hard Puzzle Acc": "3.89",
571
+ "Small Puzzle Acc": "55.00",
572
+ "Medium Puzzle Acc": "6.43",
573
+ "Large Puzzle Acc": "0.00",
574
+ "XL Puzzle Acc": "0.00",
575
  "Total Puzzles": 1000,
576
+ "Reason Lens": "1538.18",
577
+ "N_Mode": "single",
578
+ "N_Size": 1
579
  },
580
  {
581
  "Model": "gemini-1.5-pro",
 
585
  "No answer": "0.80",
586
  "Easy Puzzle Acc": "55.71",
587
  "Hard Puzzle Acc": "5.28",
588
+ "Small Puzzle Acc": "52.19",
589
+ "Medium Puzzle Acc": "9.64",
590
+ "Large Puzzle Acc": "0.00",
591
+ "XL Puzzle Acc": "0.00",
592
  "Total Puzzles": 1000,
593
+ "Reason Lens": "1336.17",
594
+ "N_Mode": "single",
595
+ "N_Size": 1
596
  },
597
  {
598
  "Model": "yi-large-preview",
 
602
  "No answer": "1.40",
603
  "Easy Puzzle Acc": "58.93",
604
  "Hard Puzzle Acc": "3.33",
605
+ "Small Puzzle Acc": "53.75",
606
+ "Medium Puzzle Acc": "6.07",
607
+ "Large Puzzle Acc": "0.00",
608
+ "XL Puzzle Acc": "0.00",
609
  "Total Puzzles": 1000,
610
+ "Reason Lens": "833.36",
611
+ "N_Mode": "single",
612
+ "N_Size": 1
613
  },
614
  {
615
  "Model": "yi-large",
 
619
  "No answer": "1.80",
620
  "Easy Puzzle Acc": "58.21",
621
  "Hard Puzzle Acc": "3.47",
622
+ "Small Puzzle Acc": "54.37",
623
+ "Medium Puzzle Acc": "5.00",
624
+ "Large Puzzle Acc": "0.00",
625
+ "XL Puzzle Acc": "0.00",
626
  "Total Puzzles": 1000,
627
+ "Reason Lens": "757.01",
628
+ "N_Mode": "single",
629
+ "N_Size": 1
630
  },
631
  {
632
+ "Model": "claude-3-5-haiku-20241022",
633
  "Mode": "greedy",
634
  "Puzzle Acc": "18.70",
635
+ "Cell Acc": "43.22",
636
+ "No answer": "0.10",
637
+ "Easy Puzzle Acc": "57.86",
638
+ "Hard Puzzle Acc": "3.47",
639
+ "Small Puzzle Acc": "53.12",
640
+ "Medium Puzzle Acc": "6.07",
641
+ "Large Puzzle Acc": "0.00",
642
+ "XL Puzzle Acc": "0.00",
643
  "Total Puzzles": 1000,
644
+ "Reason Lens": "660.91",
645
+ "N_Mode": "single",
646
+ "N_Size": 1
647
  },
648
  {
649
+ "Model": "claude-3-sonnet-20240229",
650
+ "Mode": "greedy",
651
  "Puzzle Acc": "18.70",
652
+ "Cell Acc": "43.66",
653
+ "No answer": "0.00",
654
+ "Easy Puzzle Acc": "58.93",
 
 
 
 
 
 
 
 
 
 
 
655
  "Hard Puzzle Acc": "3.06",
656
+ "Small Puzzle Acc": "54.06",
657
+ "Medium Puzzle Acc": "4.29",
658
+ "Large Puzzle Acc": "1.00",
659
+ "XL Puzzle Acc": "0.00",
660
  "Total Puzzles": 1000,
661
+ "Reason Lens": "1095.37",
662
+ "N_Mode": "single",
663
+ "N_Size": 1
664
  },
665
  {
666
  "Model": "Meta-Llama-3-70B-Instruct",
 
670
  "No answer": "0.20",
671
  "Easy Puzzle Acc": "52.86",
672
  "Hard Puzzle Acc": "2.78",
673
+ "Small Puzzle Acc": "48.44",
674
+ "Medium Puzzle Acc": "4.64",
675
+ "Large Puzzle Acc": "0.00",
676
+ "XL Puzzle Acc": "0.00",
677
  "Total Puzzles": 1000,
678
+ "Reason Lens": "809.95",
679
+ "N_Mode": "single",
680
+ "N_Size": 1
681
  },
682
  {
683
  "Model": "Athene-70B",
 
687
  "No answer": "21.10",
688
  "Easy Puzzle Acc": "52.50",
689
  "Hard Puzzle Acc": "2.78",
690
+ "Small Puzzle Acc": "48.75",
691
+ "Medium Puzzle Acc": "3.93",
692
+ "Large Puzzle Acc": "0.00",
693
+ "XL Puzzle Acc": "0.00",
694
  "Total Puzzles": 1000,
695
+ "Reason Lens": "391.19",
696
+ "N_Mode": "single",
697
+ "N_Size": 1
698
  },
699
  {
700
  "Model": "gemma-2-27b-it",
 
704
  "No answer": "1.10",
705
  "Easy Puzzle Acc": "50.71",
706
  "Hard Puzzle Acc": "2.92",
707
+ "Small Puzzle Acc": "46.56",
708
+ "Medium Puzzle Acc": "5.00",
709
+ "Large Puzzle Acc": "0.00",
710
+ "XL Puzzle Acc": "0.00",
711
  "Total Puzzles": 1000,
712
+ "Reason Lens": "1014.56",
713
+ "N_Mode": "single",
714
+ "N_Size": 1
715
  },
716
  {
717
  "Model": "claude-3-haiku-20240307",
 
721
  "No answer": "0.10",
722
  "Easy Puzzle Acc": "47.86",
723
  "Hard Puzzle Acc": "1.25",
724
+ "Small Puzzle Acc": "43.75",
725
+ "Medium Puzzle Acc": "1.07",
726
+ "Large Puzzle Acc": "0.00",
727
+ "XL Puzzle Acc": "0.00",
728
  "Total Puzzles": 1000,
729
+ "Reason Lens": "1015.06",
730
+ "N_Mode": "single",
731
+ "N_Size": 1
732
  },
733
  {
734
  "Model": "command-r-plus",
 
738
  "No answer": "0.20",
739
  "Easy Puzzle Acc": "44.64",
740
  "Hard Puzzle Acc": "1.94",
741
+ "Small Puzzle Acc": "40.94",
742
+ "Medium Puzzle Acc": "2.86",
743
+ "Large Puzzle Acc": "0.00",
744
+ "XL Puzzle Acc": "0.00",
745
  "Total Puzzles": 1000,
746
+ "Reason Lens": "810.53",
747
+ "N_Mode": "single",
748
+ "N_Size": 1
749
  },
750
  {
751
  "Model": "reka-core-20240501",
 
755
  "No answer": "4.00",
756
  "Easy Puzzle Acc": "43.21",
757
  "Hard Puzzle Acc": "1.25",
758
+ "Small Puzzle Acc": "39.38",
759
+ "Medium Puzzle Acc": "1.43",
760
+ "Large Puzzle Acc": "0.00",
761
+ "XL Puzzle Acc": "0.00",
762
  "Total Puzzles": 1000,
763
+ "Reason Lens": "1078.29",
764
+ "N_Mode": "single",
765
+ "N_Size": 1
 
 
 
 
 
 
 
 
 
766
  },
767
  {
768
  "Model": "gemma-2-9b-it",
 
772
  "No answer": "0.00",
773
  "Easy Puzzle Acc": "41.79",
774
  "Hard Puzzle Acc": "1.53",
775
+ "Small Puzzle Acc": "37.81",
776
+ "Medium Puzzle Acc": "2.50",
777
+ "Large Puzzle Acc": "0.00",
778
+ "XL Puzzle Acc": "0.00",
779
+ "Total Puzzles": 1000,
780
+ "Reason Lens": "849.84",
781
+ "N_Mode": "single",
782
+ "N_Size": 1
783
+ },
784
+ {
785
+ "Model": "Meta-Llama-3.1-8B-Instruct",
786
+ "Mode": "greedy",
787
+ "Puzzle Acc": "12.80",
788
+ "Cell Acc": "13.68",
789
+ "No answer": "61.50",
790
+ "Easy Puzzle Acc": "43.57",
791
+ "Hard Puzzle Acc": "0.83",
792
+ "Small Puzzle Acc": "39.38",
793
+ "Medium Puzzle Acc": "0.71",
794
+ "Large Puzzle Acc": "0.00",
795
+ "XL Puzzle Acc": "0.00",
796
  "Total Puzzles": 1000,
797
+ "Reason Lens": "1043.90",
798
+ "N_Mode": "single",
799
+ "N_Size": 1
800
  },
801
  {
802
  "Model": "Qwen2.5-7B-Instruct",
 
806
  "No answer": "9.50",
807
  "Easy Puzzle Acc": "38.93",
808
  "Hard Puzzle Acc": "1.53",
809
+ "Small Puzzle Acc": "36.25",
810
+ "Medium Puzzle Acc": "1.43",
811
+ "Large Puzzle Acc": "0.00",
812
+ "XL Puzzle Acc": "0.00",
813
  "Total Puzzles": 1000,
814
+ "Reason Lens": "850.93",
815
+ "N_Mode": "single",
816
+ "N_Size": 1
817
  },
818
  {
819
  "Model": "Meta-Llama-3-8B-Instruct",
 
823
  "No answer": "29.20",
824
  "Easy Puzzle Acc": "40.71",
825
  "Hard Puzzle Acc": "0.69",
826
+ "Small Puzzle Acc": "36.88",
827
+ "Medium Puzzle Acc": "0.36",
828
+ "Large Puzzle Acc": "0.00",
829
+ "XL Puzzle Acc": "0.00",
830
  "Total Puzzles": 1000,
831
+ "Reason Lens": "1216.40",
832
+ "N_Mode": "single",
833
+ "N_Size": 1
834
  },
835
  {
836
  "Model": "Mistral-Nemo-Instruct-2407",
 
840
  "No answer": "1.60",
841
  "Easy Puzzle Acc": "38.93",
842
  "Hard Puzzle Acc": "1.25",
843
+ "Small Puzzle Acc": "35.31",
844
+ "Medium Puzzle Acc": "1.79",
845
+ "Large Puzzle Acc": "0.00",
846
+ "XL Puzzle Acc": "0.00",
847
  "Total Puzzles": 1000,
848
+ "Reason Lens": "925.88",
849
+ "N_Mode": "single",
850
+ "N_Size": 1
851
  },
852
  {
853
  "Model": "Phi-3-mini-4k-instruct",
 
857
  "No answer": "59.00",
858
  "Easy Puzzle Acc": "38.21",
859
  "Hard Puzzle Acc": "1.25",
860
+ "Small Puzzle Acc": "35.94",
861
+ "Medium Puzzle Acc": "0.36",
862
+ "Large Puzzle Acc": "0.00",
863
+ "XL Puzzle Acc": "0.00",
864
  "Total Puzzles": 1000,
865
+ "Reason Lens": "790.29",
866
+ "N_Mode": "single",
867
+ "N_Size": 1
868
  },
869
  {
870
  "Model": "Yi-1.5-34B-Chat",
 
874
  "No answer": "4.40",
875
  "Easy Puzzle Acc": "37.50",
876
  "Hard Puzzle Acc": "1.39",
877
+ "Small Puzzle Acc": "35.00",
878
+ "Medium Puzzle Acc": "1.07",
879
+ "Large Puzzle Acc": "0.00",
880
+ "XL Puzzle Acc": "0.00",
881
  "Total Puzzles": 1000,
882
+ "Reason Lens": "869.65",
883
+ "N_Mode": "single",
884
+ "N_Size": 1
 
 
 
 
 
 
 
 
 
885
  },
886
  {
887
  "Model": "gpt-3.5-turbo-0125",
 
891
  "No answer": "0.10",
892
  "Easy Puzzle Acc": "33.57",
893
  "Hard Puzzle Acc": "0.97",
894
+ "Small Puzzle Acc": "30.31",
895
+ "Medium Puzzle Acc": "1.07",
896
+ "Large Puzzle Acc": "0.50",
897
+ "XL Puzzle Acc": "0.00",
898
  "Total Puzzles": 1000,
899
+ "Reason Lens": "820.66",
900
+ "N_Mode": "single",
901
+ "N_Size": 1
902
  },
903
  {
904
  "Model": "command-r",
 
908
  "No answer": "1.50",
909
  "Easy Puzzle Acc": "32.14",
910
  "Hard Puzzle Acc": "1.25",
911
+ "Small Puzzle Acc": "30.31",
912
+ "Medium Puzzle Acc": "0.71",
913
+ "Large Puzzle Acc": "0.00",
914
+ "XL Puzzle Acc": "0.00",
915
  "Total Puzzles": 1000,
916
+ "Reason Lens": "1005.17",
917
+ "N_Mode": "single",
918
+ "N_Size": 1
919
  },
920
  {
921
  "Model": "reka-flash-20240226",
 
925
  "No answer": "18.70",
926
  "Easy Puzzle Acc": "30.71",
927
  "Hard Puzzle Acc": "0.97",
928
+ "Small Puzzle Acc": "28.44",
929
+ "Medium Puzzle Acc": "0.71",
930
+ "Large Puzzle Acc": "0.00",
931
+ "XL Puzzle Acc": "0.00",
932
  "Total Puzzles": 1000,
933
+ "Reason Lens": "1074.80",
934
+ "N_Mode": "single",
935
+ "N_Size": 1
936
  },
937
  {
938
  "Model": "mathstral-7B-v0.1",
 
942
  "No answer": "36.00",
943
  "Easy Puzzle Acc": "30.00",
944
  "Hard Puzzle Acc": "0.83",
945
+ "Small Puzzle Acc": "27.19",
946
+ "Medium Puzzle Acc": "1.07",
947
+ "Large Puzzle Acc": "0.00",
948
+ "XL Puzzle Acc": "0.00",
949
  "Total Puzzles": 1000,
950
+ "Reason Lens": "1148.16",
951
+ "N_Mode": "single",
952
+ "N_Size": 1
953
  },
954
  {
955
  "Model": "Mixtral-8x7B-Instruct-v0.1",
 
959
  "No answer": "20.30",
960
  "Easy Puzzle Acc": "28.93",
961
  "Hard Puzzle Acc": "0.83",
962
+ "Small Puzzle Acc": "26.25",
963
+ "Medium Puzzle Acc": "1.07",
964
+ "Large Puzzle Acc": "0.00",
965
+ "XL Puzzle Acc": "0.00",
966
  "Total Puzzles": 1000,
967
+ "Reason Lens": "1177.21",
968
+ "N_Mode": "single",
969
+ "N_Size": 1
970
  },
971
  {
972
  "Model": "Qwen2-7B-Instruct",
 
976
  "No answer": "24.40",
977
  "Easy Puzzle Acc": "29.29",
978
  "Hard Puzzle Acc": "0.28",
979
+ "Small Puzzle Acc": "26.25",
980
+ "Medium Puzzle Acc": "0.00",
981
+ "Large Puzzle Acc": "0.00",
982
+ "XL Puzzle Acc": "0.00",
983
+ "Total Puzzles": 1000,
984
+ "Reason Lens": "1473.23",
985
+ "N_Mode": "single",
986
+ "N_Size": 1
987
+ },
988
+ {
989
+ "Model": "Llama-3.2-3B-Instruct@together",
990
+ "Mode": "greedy",
991
+ "Puzzle Acc": "7.40",
992
+ "Cell Acc": "13.14",
993
+ "No answer": "54.50",
994
+ "Easy Puzzle Acc": "25.71",
995
+ "Hard Puzzle Acc": "0.28",
996
+ "Small Puzzle Acc": "23.12",
997
+ "Medium Puzzle Acc": "0.00",
998
+ "Large Puzzle Acc": "0.00",
999
+ "XL Puzzle Acc": "0.00",
1000
  "Total Puzzles": 1000,
1001
+ "Reason Lens": "963.47",
1002
+ "N_Mode": "single",
1003
+ "N_Size": 1
1004
  },
1005
  {
1006
  "Model": "Phi-3.5-mini-instruct",
 
1010
  "No answer": "80.60",
1011
  "Easy Puzzle Acc": "21.79",
1012
  "Hard Puzzle Acc": "0.42",
1013
+ "Small Puzzle Acc": "19.38",
1014
+ "Medium Puzzle Acc": "0.71",
1015
+ "Large Puzzle Acc": "0.00",
1016
+ "XL Puzzle Acc": "0.00",
1017
  "Total Puzzles": 1000,
1018
+ "Reason Lens": "718.43",
1019
+ "N_Mode": "single",
1020
+ "N_Size": 1
1021
  },
1022
  {
1023
  "Model": "Qwen2.5-3B-Instruct",
 
1027
  "No answer": "56.70",
1028
  "Easy Puzzle Acc": "17.14",
1029
  "Hard Puzzle Acc": "0.00",
1030
+ "Small Puzzle Acc": "15.00",
1031
+ "Medium Puzzle Acc": "0.00",
1032
+ "Large Puzzle Acc": "0.00",
1033
+ "XL Puzzle Acc": "0.00",
1034
  "Total Puzzles": 1000,
1035
+ "Reason Lens": "906.58",
1036
+ "N_Mode": "single",
1037
+ "N_Size": 1
1038
  },
1039
  {
1040
  "Model": "gemma-2-2b-it",
 
1044
  "No answer": "57.20",
1045
  "Easy Puzzle Acc": "14.29",
1046
  "Hard Puzzle Acc": "0.28",
1047
+ "Small Puzzle Acc": "13.12",
1048
+ "Medium Puzzle Acc": "0.00",
1049
+ "Large Puzzle Acc": "0.00",
1050
+ "XL Puzzle Acc": "0.00",
1051
  "Total Puzzles": 1000,
1052
+ "Reason Lens": "1032.89",
1053
+ "N_Mode": "single",
1054
+ "N_Size": 1
1055
  },
1056
  {
1057
  "Model": "Yi-1.5-9B-Chat",
 
1061
  "No answer": "11.30",
1062
  "Easy Puzzle Acc": "8.21",
1063
  "Hard Puzzle Acc": "0.00",
1064
+ "Small Puzzle Acc": "7.19",
1065
+ "Medium Puzzle Acc": "0.00",
1066
+ "Large Puzzle Acc": "0.00",
1067
+ "XL Puzzle Acc": "0.00",
1068
  "Total Puzzles": 1000,
1069
+ "Reason Lens": "1592.60",
1070
+ "N_Mode": "single",
1071
+ "N_Size": 1
1072
  }
1073
  ]
app.py CHANGED
@@ -66,9 +66,12 @@ def _tab_leaderboard():
66
  # default_main_df_no_task = default_main_df.copy()
67
  default_mode = "greedy"
68
  default_main_df = df_filters(default_mode, False)
 
 
 
69
  with gr.Row():
70
  with gr.Column(scale=5):
71
- mode_selection_radio = gr.Radio(["greedy", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
72
  # with gr.Row():
73
  # with gr.Column(scale=2):
74
 
@@ -140,7 +143,7 @@ def _tab_submit():
140
  and apply for the access for the [private dataset](https://huggingface.co/datasets/WildEval/ZebraLogic) that contains the truth solutions.
141
  """
142
 
143
- gr.Markdown("## 🚀 Submit Your Results\n\n" + markdown_text, elem_classes="markdown-text")
144
 
145
 
146
 
@@ -159,7 +162,7 @@ def build_demo():
159
  _tab_leaderboard()
160
  with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
161
  _tab_explore()
162
- with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
163
  _tab_submit()
164
 
165
  with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
@@ -200,7 +203,9 @@ def data_load(result_file):
200
  pass
201
  original_df = pd.DataFrame(raw_data)
202
  original_df = original_df[original_df["Total Puzzles"] == 1000]
 
203
  original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
 
204
  # print(original_df.columns)
205
 
206
 
 
66
  # default_main_df_no_task = default_main_df.copy()
67
  default_mode = "greedy"
68
  default_main_df = df_filters(default_mode, False)
69
+ print(default_main_df.columns)
70
+ # drop the Mode column
71
+ default_main_df = default_main_df.drop(columns=["Mode"])
72
  with gr.Row():
73
  with gr.Column(scale=5):
74
+ mode_selection_radio = gr.Radio(["greedy", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode, visible=False)
75
  # with gr.Row():
76
  # with gr.Column(scale=2):
77
 
 
143
  and apply for the access for the [private dataset](https://huggingface.co/datasets/WildEval/ZebraLogic) that contains the truth solutions.
144
  """
145
 
146
+ gr.Markdown("## 🚀 Evaluate your models\n\n" + markdown_text, elem_classes="markdown-text")
147
 
148
 
149
 
 
162
  _tab_leaderboard()
163
  with gr.TabItem("🔍 Explore", elem_id="od-benchmark-tab-table", id=1):
164
  _tab_explore()
165
+ with gr.TabItem("🚀 Evaluate your models", elem_id="od-benchmark-tab-table", id=3):
166
  _tab_submit()
167
 
168
  with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
 
203
  pass
204
  original_df = pd.DataFrame(raw_data)
205
  original_df = original_df[original_df["Total Puzzles"] == 1000]
206
+
207
  original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
208
+ print(f"original_df.columns: {original_df.columns}")
209
  # print(original_df.columns)
210
 
211
 
constants.py CHANGED
@@ -36,12 +36,16 @@ CITATION_TEXT = """
36
 
37
  column_names = OrderedDict({
38
  "Model": "Model",
39
- "Mode": "Mode",
40
  "Puzzle Acc": "Puzzle Acc",
 
 
 
 
41
  "Cell Acc": "Cell Acc",
42
- "No answer": "No answer",
43
- "Easy Puzzle Acc": "Easy Puzzle Acc",
44
- "Hard Puzzle Acc": "Hard Puzzle Acc",
45
  # "Total Puzzles": "Total Puzzles",
46
  # "Reason Lens": "Reason Lens",
47
  })
@@ -64,10 +68,18 @@ ORDERED_COLUMN_NAMES = [
64
  "Model",
65
  "Mode",
66
  "Puzzle Acc",
67
- "Easy Puzzle Acc",
68
- "Hard Puzzle Acc",
 
 
 
 
 
 
 
 
69
  "Cell Acc",
70
- "No answer",
71
  ]
72
 
73
 
 
36
 
37
  column_names = OrderedDict({
38
  "Model": "Model",
39
+ # "Mode": "Mode",
40
  "Puzzle Acc": "Puzzle Acc",
41
+ "Small Puzzle Acc": "Small",
42
+ "Medium Puzzle Acc": "Medium",
43
+ "Large Puzzle Acc": "Large",
44
+ "XL Puzzle Acc": "XL",
45
  "Cell Acc": "Cell Acc",
46
+ # "No answer": "No answer",
47
+ # "Easy Puzzle Acc": "Easy Puzzle Acc",
48
+ # "Hard Puzzle Acc": "Hard Puzzle Acc",
49
  # "Total Puzzles": "Total Puzzles",
50
  # "Reason Lens": "Reason Lens",
51
  })
 
68
  "Model",
69
  "Mode",
70
  "Puzzle Acc",
71
+ "XL",
72
+ "Large",
73
+ "Medium",
74
+ "Small",
75
+ "XL Puzzle Acc",
76
+ "Large Puzzle Acc",
77
+ "Medium Puzzle Acc",
78
+ "Small Puzzle Acc",
79
+ # "Easy Puzzle Acc",
80
+ # "Hard Puzzle Acc",
81
  "Cell Acc",
82
+ # "No answer",
83
  ]
84
 
85
 
eval_utils.py CHANGED
@@ -1,6 +1,6 @@
1
- import json
2
  from collections import defaultdict
3
- import os
4
  from tabulate import tabulate
5
  from datasets import load_dataset
6
 
@@ -10,28 +10,28 @@ def load_private_solutions():
10
  global private_solutions
11
  private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
12
  for item in private_zebra_data:
13
- private_solutions[item["id"]] = item["solution"]
14
- return
15
 
16
  def load_model_results(run_name_folders):
17
  model_results = {}
18
  for run_name, folder in run_name_folders.items():
19
- # iterate all json files under the folder
20
  for filename in os.listdir(folder):
21
  filepath = os.path.join(folder, filename)
22
  if not filename.endswith(".json"):
23
  continue
24
- model_name = filename.replace(".json", "")
25
  model_name = f"{model_name}%{run_name}"
26
- model_results[model_name] = filepath
27
  return model_results
28
-
29
  def extract_last_complete_json(s):
30
  # Stack to keep track of opening and closing braces
31
  stack = []
32
  last_json_start = None
33
  last_json_str = None
34
-
35
  for i, char in enumerate(s):
36
  if char == '{':
37
  stack.append(i)
@@ -44,14 +44,14 @@ def extract_last_complete_json(s):
44
  # Complete JSON object found
45
  last_json_str = s[last_json_start:i+1]
46
  last_json_start = None
47
-
48
  # Load the last JSON object
49
  if last_json_str:
50
  try:
51
  return json.loads(last_json_str.replace("\n", ""))
52
  except json.JSONDecodeError:
53
  pass
54
-
55
  return None
56
 
57
  def eval_each_puzzle(id, prediction_table):
@@ -64,15 +64,15 @@ def eval_each_puzzle(id, prediction_table):
64
  columns = solution["header"]
65
  assert columns[0] == "House"
66
  solution_table = {}
67
- this_total_cells = 0
68
  for i in range(num_houses):
69
- solution_table[f'House {i+1}'] = {columns[j]: solution["rows"][i][j] for j in range(1, len(columns))}
70
  this_total_cells += len(columns) - 1
71
-
72
- this_correct_cells = 0 # number in the solution_table
73
  for house in solution_table:
74
- for column in solution_table[house]:
75
- # if prediction_table[house][column] not exist then pass
76
  if house in prediction_table and column in prediction_table[house]:
77
  truth_cell = solution_table[house][column].lower().strip()
78
  if prediction_table[house][column] is None or len(prediction_table[house][column]) == 0:
@@ -82,23 +82,24 @@ def eval_each_puzzle(id, prediction_table):
82
  elif type(prediction_table[house][column]) == str:
83
  predicted_cell = prediction_table[house][column].lower().strip()
84
  if truth_cell == predicted_cell:
85
- this_correct_cells += 1
86
  return this_total_cells, this_correct_cells, private_solutions[id]
87
 
 
88
  def eval_model(model, filepath):
89
  global private_solutions
90
  with open(filepath, "r") as f:
91
  print(f"Processing {filepath}")
92
  data = json.load(f)
93
 
94
- solved_puzzles = 0
95
  num_total_puzzles = len(data)
96
  correct_cells = 0
97
  total_cells = 0
98
- no_asnwer = 0
99
 
100
  num_total_puzzles_by_size = defaultdict(int)
101
- solved_puzzles_by_size = defaultdict(int)
102
  reason_lens = []
103
  for item in data:
104
  # solution = item["solution"]
@@ -106,20 +107,20 @@ def eval_model(model, filepath):
106
  size = item["size"]
107
  num_total_puzzles_by_size[size] += 1
108
 
109
- # Process the solution
110
  solution_table = {}
111
  num_houses = len(solution["rows"])
112
  columns = solution["header"]
113
  assert columns[0] == "House"
114
  solution_table = {}
115
- this_total_cells = 0
116
  for i in range(num_houses):
117
- solution_table[f'House {i+1}'] = {columns[j]: solution["rows"][i][j] for j in range(1, len(columns))}
118
  this_total_cells += len(columns) - 1
119
  total_cells += this_total_cells
120
 
121
  # Read and Parse the prediction from model output
122
- prediction_str = item["output"][0]
123
  prediction_json = extract_last_complete_json(prediction_str)
124
  if prediction_json is None or "solution" not in prediction_json:
125
  # print("-"*100)
@@ -128,16 +129,16 @@ def eval_model(model, filepath):
128
  # json.loads(prediction_str)
129
  no_asnwer += 1
130
  # print(item["id"])
131
- continue
132
  reason = prediction_json.get("reasoning", "")
133
  prediction_table = prediction_json["solution"]
134
-
135
  reason_lens.append(len(reason))
136
 
137
- this_correct_cells = 0 # number in the solution_table
138
  for house in solution_table:
139
- for column in solution_table[house]:
140
- # if prediction_table[house][column] not exist then pass
141
  if house in prediction_table and column in prediction_table[house]:
142
  truth_cell = solution_table[house][column].lower().strip()
143
  if prediction_table[house][column] is None or len(prediction_table[house][column]) == 0:
@@ -149,24 +150,24 @@ def eval_model(model, filepath):
149
  else:
150
  raise ValueError(f"Unknown type: {type(prediction_table[house][column])}")
151
  if truth_cell == predicted_cell:
152
- this_correct_cells += 1
153
  correct_cells += this_correct_cells
154
-
155
  # compute puzzle success rate
156
  if this_correct_cells == this_total_cells:
157
  solved_puzzles += 1
158
  solved_puzzles_by_size[size] += 1
159
 
160
-
161
-
162
 
163
- # # print the success rate by size; order the dict by size first
164
- sizes = sorted(num_total_puzzles_by_size.keys())
165
- easy_sizes = ['2*2', '2*3', '2*4', '2*5', '2*6', '3*2', '3*3',]
 
 
166
  hard_sizes = ['3*4', '3*5', '4*2', '3*6', '4*3', '4*4', '5*2', '6*2', '4*5', '4*6', '5*3', '5*4', '5*5', '5*6', '6*3', '6*4', '6*5', '6*6']
167
-
168
  easy_solved_puzzles = sum([solved_puzzles_by_size[size] for size in easy_sizes])
169
- easy_total_puzzles = sum([num_total_puzzles_by_size[size] for size in easy_sizes])
170
  hard_solved_puzzles = sum([solved_puzzles_by_size[size] for size in hard_sizes])
171
  hard_total_puzzles = sum([num_total_puzzles_by_size[size] for size in hard_sizes])
172
 
@@ -179,20 +180,20 @@ def eval_model(model, filepath):
179
  result["Puzzle Acc"] = f"{solved_puzzles/num_total_puzzles*100:.2f}"
180
  result["Cell Acc"] = f"{correct_cells/total_cells*100:.2f}"
181
  result["No answer"] = f"{no_asnwer/num_total_puzzles*100:.2f}"
182
- result["Easy Puzzle Acc"] = f"{easy_solved_puzzles/easy_total_puzzles*100:.2f}"
183
  result["Hard Puzzle Acc"] = f"{hard_solved_puzzles/hard_total_puzzles*100:.2f}"
184
  result["Total Puzzles"] = num_total_puzzles
185
  result["Reason Lens"] = f"{sum(reason_lens)/len(reason_lens):.2f}"
186
  return result
187
 
188
 
189
- def gen_results(run_name_folders):
190
  model_results = load_model_results(run_name_folders)
191
 
192
  columns = ["Model", "Mode", "Puzzle Acc", "Cell Acc", "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles", "Reason Lens"]
193
  rows = []
194
- for model_name, filepath in model_results.items():
195
- result = eval_model(model_name, filepath)
196
  rows.append(result)
197
 
198
  # sort the rows by puzzle accuracy
@@ -203,7 +204,7 @@ def gen_results(run_name_folders):
203
  print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))
204
  # print(tabulate(rows, headers=columns, tablefmt="github"))
205
 
206
- # write to json file
207
  with open("result_dirs/zebra-grid.summary.json", "w") as f:
208
  json.dump(rows, f, indent=2)
209
 
@@ -212,6 +213,7 @@ if __name__ == "__main__":
212
  run_name_folders = {
213
  "greedy": "result_dirs/zebra-grid",
214
  "sampling": "result_dirs/zebra-grid/sampling",
215
- }
216
  load_private_solutions()
217
  gen_results(run_name_folders)
 
 
1
+ import json
2
  from collections import defaultdict
3
+ import os
4
  from tabulate import tabulate
5
  from datasets import load_dataset
6
 
 
10
  global private_solutions
11
  private_zebra_data = load_dataset("WildEval/ZebraLogic", "grid_mode", split="test")
12
  for item in private_zebra_data:
13
+ private_solutions[item["id"]] = item["solution"]
14
+ return
15
 
16
  def load_model_results(run_name_folders):
17
  model_results = {}
18
  for run_name, folder in run_name_folders.items():
19
+ # iterate all json files under the folder
20
  for filename in os.listdir(folder):
21
  filepath = os.path.join(folder, filename)
22
  if not filename.endswith(".json"):
23
  continue
24
+ model_name = filename.replace(".json", "")
25
  model_name = f"{model_name}%{run_name}"
26
+ model_results[model_name] = filepath
27
  return model_results
28
+
29
  def extract_last_complete_json(s):
30
  # Stack to keep track of opening and closing braces
31
  stack = []
32
  last_json_start = None
33
  last_json_str = None
34
+
35
  for i, char in enumerate(s):
36
  if char == '{':
37
  stack.append(i)
 
44
  # Complete JSON object found
45
  last_json_str = s[last_json_start:i+1]
46
  last_json_start = None
47
+
48
  # Load the last JSON object
49
  if last_json_str:
50
  try:
51
  return json.loads(last_json_str.replace("\n", ""))
52
  except json.JSONDecodeError:
53
  pass
54
+
55
  return None
56
 
57
  def eval_each_puzzle(id, prediction_table):
 
64
  columns = solution["header"]
65
  assert columns[0] == "House"
66
  solution_table = {}
67
+ this_total_cells = 0
68
  for i in range(num_houses):
69
+ solution_table[f'House {i+1}'] = {columns[j]: solution["rows"][i][j] for j in range(1, len(columns))}
70
  this_total_cells += len(columns) - 1
71
+
72
+ this_correct_cells = 0 # number in the solution_table
73
  for house in solution_table:
74
+ for column in solution_table[house]:
75
+ # if prediction_table[house][column] not exist then pass
76
  if house in prediction_table and column in prediction_table[house]:
77
  truth_cell = solution_table[house][column].lower().strip()
78
  if prediction_table[house][column] is None or len(prediction_table[house][column]) == 0:
 
82
  elif type(prediction_table[house][column]) == str:
83
  predicted_cell = prediction_table[house][column].lower().strip()
84
  if truth_cell == predicted_cell:
85
+ this_correct_cells += 1
86
  return this_total_cells, this_correct_cells, private_solutions[id]
87
 
88
+ """
89
  def eval_model(model, filepath):
90
  global private_solutions
91
  with open(filepath, "r") as f:
92
  print(f"Processing {filepath}")
93
  data = json.load(f)
94
 
95
+ solved_puzzles = 0
96
  num_total_puzzles = len(data)
97
  correct_cells = 0
98
  total_cells = 0
99
+ no_asnwer = 0
100
 
101
  num_total_puzzles_by_size = defaultdict(int)
102
+ solved_puzzles_by_size = defaultdict(int)
103
  reason_lens = []
104
  for item in data:
105
  # solution = item["solution"]
 
107
  size = item["size"]
108
  num_total_puzzles_by_size[size] += 1
109
 
110
+ # Process the solution
111
  solution_table = {}
112
  num_houses = len(solution["rows"])
113
  columns = solution["header"]
114
  assert columns[0] == "House"
115
  solution_table = {}
116
+ this_total_cells = 0
117
  for i in range(num_houses):
118
+ solution_table[f'House {i+1}'] = {columns[j]: solution["rows"][i][j] for j in range(1, len(columns))}
119
  this_total_cells += len(columns) - 1
120
  total_cells += this_total_cells
121
 
122
  # Read and Parse the prediction from model output
123
+ prediction_str = item["output"][0]
124
  prediction_json = extract_last_complete_json(prediction_str)
125
  if prediction_json is None or "solution" not in prediction_json:
126
  # print("-"*100)
 
129
  # json.loads(prediction_str)
130
  no_asnwer += 1
131
  # print(item["id"])
132
+ continue
133
  reason = prediction_json.get("reasoning", "")
134
  prediction_table = prediction_json["solution"]
135
+
136
  reason_lens.append(len(reason))
137
 
138
+ this_correct_cells = 0 # number in the solution_table
139
  for house in solution_table:
140
+ for column in solution_table[house]:
141
+ # if prediction_table[house][column] not exist then pass
142
  if house in prediction_table and column in prediction_table[house]:
143
  truth_cell = solution_table[house][column].lower().strip()
144
  if prediction_table[house][column] is None or len(prediction_table[house][column]) == 0:
 
150
  else:
151
  raise ValueError(f"Unknown type: {type(prediction_table[house][column])}")
152
  if truth_cell == predicted_cell:
153
+ this_correct_cells += 1
154
  correct_cells += this_correct_cells
155
+
156
  # compute puzzle success rate
157
  if this_correct_cells == this_total_cells:
158
  solved_puzzles += 1
159
  solved_puzzles_by_size[size] += 1
160
 
 
 
161
 
162
+
163
+
164
+ # # print the success rate by size; order the dict by size first
165
+ sizes = sorted(num_total_puzzles_by_size.keys())
166
+ easy_sizes = ['2*2', '2*3', '2*4', '2*5', '2*6', '3*2', '3*3',]
167
  hard_sizes = ['3*4', '3*5', '4*2', '3*6', '4*3', '4*4', '5*2', '6*2', '4*5', '4*6', '5*3', '5*4', '5*5', '5*6', '6*3', '6*4', '6*5', '6*6']
168
+
169
  easy_solved_puzzles = sum([solved_puzzles_by_size[size] for size in easy_sizes])
170
+ easy_total_puzzles = sum([num_total_puzzles_by_size[size] for size in easy_sizes])
171
  hard_solved_puzzles = sum([solved_puzzles_by_size[size] for size in hard_sizes])
172
  hard_total_puzzles = sum([num_total_puzzles_by_size[size] for size in hard_sizes])
173
 
 
180
  result["Puzzle Acc"] = f"{solved_puzzles/num_total_puzzles*100:.2f}"
181
  result["Cell Acc"] = f"{correct_cells/total_cells*100:.2f}"
182
  result["No answer"] = f"{no_asnwer/num_total_puzzles*100:.2f}"
183
+ result["Easy Puzzle Acc"] = f"{easy_solved_puzzles/easy_total_puzzles*100:.2f}"
184
  result["Hard Puzzle Acc"] = f"{hard_solved_puzzles/hard_total_puzzles*100:.2f}"
185
  result["Total Puzzles"] = num_total_puzzles
186
  result["Reason Lens"] = f"{sum(reason_lens)/len(reason_lens):.2f}"
187
  return result
188
 
189
 
190
+ def gen_results(run_name_folders):
191
  model_results = load_model_results(run_name_folders)
192
 
193
  columns = ["Model", "Mode", "Puzzle Acc", "Cell Acc", "No answer", "Easy Puzzle Acc", "Hard Puzzle Acc", "Total Puzzles", "Reason Lens"]
194
  rows = []
195
+ for model_name, filepath in model_results.items():
196
+ result = eval_model(model_name, filepath)
197
  rows.append(result)
198
 
199
  # sort the rows by puzzle accuracy
 
204
  print(tabulate(table_data, headers=columns, tablefmt="fancy_outline", stralign="center", numalign="center"))
205
  # print(tabulate(rows, headers=columns, tablefmt="github"))
206
 
207
+ # write to json file
208
  with open("result_dirs/zebra-grid.summary.json", "w") as f:
209
  json.dump(rows, f, indent=2)
210
 
 
213
  run_name_folders = {
214
  "greedy": "result_dirs/zebra-grid",
215
  "sampling": "result_dirs/zebra-grid/sampling",
216
+ }
217
  load_private_solutions()
218
  gen_results(run_name_folders)
219
+ """