yuchenlin committed on
Commit
3c29637
•
1 Parent(s): b2043a7

add new models

Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json CHANGED
@@ -21,6 +21,28 @@
21
  "Total Puzzles": 1000,
22
  "Reason Lens": "1153.83"
23
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  {
25
  "Model": "gpt-4o-2024-05-13",
26
  "Mode": "sampling",
@@ -230,6 +252,17 @@
230
  "Total Puzzles": 1000,
231
  "Reason Lens": "809.95"
232
  },
 
 
 
 
 
 
 
 
 
 
 
233
  {
234
  "Model": "gemma-2-27b-it@nvidia",
235
  "Mode": "greedy",
@@ -252,6 +285,17 @@
252
  "Total Puzzles": 1000,
253
  "Reason Lens": "1015.06"
254
  },
 
 
 
 
 
 
 
 
 
 
 
255
  {
256
  "Model": "reka-core-20240501",
257
  "Mode": "greedy",
@@ -263,17 +307,6 @@
263
  "Total Puzzles": 1000,
264
  "Reason Lens": "1078.29"
265
  },
266
- {
267
- "Model": "gemma-2-9b-it",
268
- "Mode": "greedy",
269
- "Puzzle Acc": "12.90",
270
- "Cell Acc": "37.07",
271
- "No answer": "0.50",
272
- "Easy Puzzle Acc": "42.14",
273
- "Hard Puzzle Acc": "1.53",
274
- "Total Puzzles": 1000,
275
- "Reason Lens": "859.14"
276
- },
277
  {
278
  "Model": "gemma-2-9b-it@nvidia",
279
  "Mode": "greedy",
@@ -296,6 +329,28 @@
296
  "Total Puzzles": 1000,
297
  "Reason Lens": "1216.40"
298
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  {
300
  "Model": "Yi-1.5-34B-Chat",
301
  "Mode": "greedy",
@@ -329,6 +384,17 @@
329
  "Total Puzzles": 1000,
330
  "Reason Lens": "820.66"
331
  },
 
 
 
 
 
 
 
 
 
 
 
332
  {
333
  "Model": "reka-flash-20240226",
334
  "Mode": "greedy",
@@ -351,6 +417,17 @@
351
  "Total Puzzles": 1000,
352
  "Reason Lens": "1148.16"
353
  },
 
 
 
 
 
 
 
 
 
 
 
354
  {
355
  "Model": "Qwen2-7B-Instruct",
356
  "Mode": "greedy",
 
21
  "Total Puzzles": 1000,
22
  "Reason Lens": "1153.83"
23
  },
24
+ {
25
+ "Model": "Llama-3.1-405B-Instruct-Turbo",
26
+ "Mode": "greedy",
27
+ "Puzzle Acc": "32.60",
28
+ "Cell Acc": "45.80",
29
+ "No answer": "12.50",
30
+ "Easy Puzzle Acc": "87.14",
31
+ "Hard Puzzle Acc": "11.39",
32
+ "Total Puzzles": 1000,
33
+ "Reason Lens": "314.66"
34
+ },
35
+ {
36
+ "Model": "Llama-3.1-405B-Instruct-Turbo",
37
+ "Mode": "sampling",
38
+ "Puzzle Acc": "32.60",
39
+ "Cell Acc": "47.04",
40
+ "No answer": "10.80",
41
+ "Easy Puzzle Acc": "86.07",
42
+ "Hard Puzzle Acc": "11.81",
43
+ "Total Puzzles": 1000,
44
+ "Reason Lens": "439.96"
45
+ },
46
  {
47
  "Model": "gpt-4o-2024-05-13",
48
  "Mode": "sampling",
 
252
  "Total Puzzles": 1000,
253
  "Reason Lens": "809.95"
254
  },
255
+ {
256
+ "Model": "Athene-70B",
257
+ "Mode": "greedy",
258
+ "Puzzle Acc": "16.70",
259
+ "Cell Acc": "32.98",
260
+ "No answer": "21.10",
261
+ "Easy Puzzle Acc": "52.50",
262
+ "Hard Puzzle Acc": "2.78",
263
+ "Total Puzzles": 1000,
264
+ "Reason Lens": "391.19"
265
+ },
266
  {
267
  "Model": "gemma-2-27b-it@nvidia",
268
  "Mode": "greedy",
 
285
  "Total Puzzles": 1000,
286
  "Reason Lens": "1015.06"
287
  },
288
+ {
289
+ "Model": "command-r-plus",
290
+ "Mode": "greedy",
291
+ "Puzzle Acc": "13.90",
292
+ "Cell Acc": "39.01",
293
+ "No answer": "0.20",
294
+ "Easy Puzzle Acc": "44.64",
295
+ "Hard Puzzle Acc": "1.94",
296
+ "Total Puzzles": 1000,
297
+ "Reason Lens": "810.53"
298
+ },
299
  {
300
  "Model": "reka-core-20240501",
301
  "Mode": "greedy",
 
307
  "Total Puzzles": 1000,
308
  "Reason Lens": "1078.29"
309
  },
 
 
 
 
 
 
 
 
 
 
 
310
  {
311
  "Model": "gemma-2-9b-it@nvidia",
312
  "Mode": "greedy",
 
329
  "Total Puzzles": 1000,
330
  "Reason Lens": "1216.40"
331
  },
332
+ {
333
+ "Model": "Mistral-Nemo-Instruct-2407",
334
+ "Mode": "greedy",
335
+ "Puzzle Acc": "11.80",
336
+ "Cell Acc": "34.93",
337
+ "No answer": "1.60",
338
+ "Easy Puzzle Acc": "38.93",
339
+ "Hard Puzzle Acc": "1.25",
340
+ "Total Puzzles": 1000,
341
+ "Reason Lens": "925.88"
342
+ },
343
+ {
344
+ "Model": "Phi-3-mini-4k-instruct",
345
+ "Mode": "greedy",
346
+ "Puzzle Acc": "11.60",
347
+ "Cell Acc": "13.50",
348
+ "No answer": "59.00",
349
+ "Easy Puzzle Acc": "38.21",
350
+ "Hard Puzzle Acc": "1.25",
351
+ "Total Puzzles": 1000,
352
+ "Reason Lens": "790.29"
353
+ },
354
  {
355
  "Model": "Yi-1.5-34B-Chat",
356
  "Mode": "greedy",
 
384
  "Total Puzzles": 1000,
385
  "Reason Lens": "820.66"
386
  },
387
+ {
388
+ "Model": "command-r",
389
+ "Mode": "greedy",
390
+ "Puzzle Acc": "9.90",
391
+ "Cell Acc": "32.66",
392
+ "No answer": "1.50",
393
+ "Easy Puzzle Acc": "32.14",
394
+ "Hard Puzzle Acc": "1.25",
395
+ "Total Puzzles": 1000,
396
+ "Reason Lens": "1005.17"
397
+ },
398
  {
399
  "Model": "reka-flash-20240226",
400
  "Mode": "greedy",
 
417
  "Total Puzzles": 1000,
418
  "Reason Lens": "1148.16"
419
  },
420
+ {
421
+ "Model": "Mixtral-8x7B-Instruct-v0.1",
422
+ "Mode": "greedy",
423
+ "Puzzle Acc": "8.70",
424
+ "Cell Acc": "26.47",
425
+ "No answer": "20.30",
426
+ "Easy Puzzle Acc": "28.93",
427
+ "Hard Puzzle Acc": "0.83",
428
+ "Total Puzzles": 1000,
429
+ "Reason Lens": "1177.21"
430
+ },
431
  {
432
  "Model": "Qwen2-7B-Instruct",
433
  "Mode": "greedy",
model_info.json CHANGED
@@ -64,5 +64,9 @@
64
  "SELM-Llama-3-8B-Instruct-iter-3": {"pretty_name": "SELM (Llama3-8B-Inst-iter3)", "hf_model_id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3"},
65
  "nemotron-4-340b-instruct": {"pretty_name": "Nemotron-4-340B-Instruct", "hf_model_id": "nvidia/Nemotron-4-340B-Instruct"},
66
  "Llama-3-8B-Magpie-Align-v0.1": {"pretty_name": "Magpie-8B-Align-v0.1", "hf_model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1"},
67
- "mathstral-7B-v0.1":{"pretty_name": "mathstral-7B-v0.1", "hf_model_id": "mistralai/mathstral-7B-v0.1"}
 
 
 
 
68
  }
 
64
  "SELM-Llama-3-8B-Instruct-iter-3": {"pretty_name": "SELM (Llama3-8B-Inst-iter3)", "hf_model_id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3"},
65
  "nemotron-4-340b-instruct": {"pretty_name": "Nemotron-4-340B-Instruct", "hf_model_id": "nvidia/Nemotron-4-340B-Instruct"},
66
  "Llama-3-8B-Magpie-Align-v0.1": {"pretty_name": "Magpie-8B-Align-v0.1", "hf_model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1"},
67
+ "mathstral-7B-v0.1":{"pretty_name": "mathstral-7B-v0.1 🚨", "hf_model_id": "mistralai/mathstral-7B-v0.1"},
68
+ "Llama-3.1-405B-Instruct-Turbo": {"pretty_name": "Llama-3.1-405B-Instruct-Turbo 🚨", "hf_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct"},
69
+ "Mistral-Nemo-Instruct-2407": {"pretty_name": "Mistral-Nemo-Inst (12B) 🚨", "hf_model_id": "mistralai/Mistral-Nemo-Instruct-2407"},
70
+ "Phi-3-mini-4k-instruct": {"pretty_name": "Phi-3-mini-4k-instruct 🚨", "hf_model_id": "microsoft/Phi-3-mini-4k-instruct"},
71
+ "Athene-70B": {"pretty_name": "Athene-70B 🚨", "hf_model_id": "Nexusflow/Athene-70B"}
72
  }