SondosMB committed on
Commit
e01b14a
1 Parent(s): 90d84f8

Update big.json

Files changed (1)
big.json +13 -13
big.json CHANGED
@@ -5,9 +5,9 @@
  "model": "GPT-4o",
  "Average": 70.15,
  "MMLU": 70.09,
- "ARC":86.33,
+ "ARC":86.31,
  "WinoGrande":72.22,
- "PiQA":60.34,
+ "PIQA":60.34,
  "CommonsenseQA":70.28,
  "Race":67.87 ,
  "MedMCQA":57.85 ,
@@ -19,7 +19,7 @@
  "MMLU": 74.77,
  "ARC":82.68,
  "WinoGrande": 66.22,
- "PiQA": 61.64,
+ "PIQA": 61.64,
  "CommonsenseQA": 62.96,
  "Race": 67.05,
  "MedMCQA": 51.81,
@@ -27,11 +27,11 @@
  },
  {
  "model": "Claude-3 Opus",
- "Average": 62.68,
+ "Average": 62.53,
  "MMLU": 70.23,
- "ARC":76.62,
+ "ARC":75.47,
  "WinoGrande": 63.54,
- "PiQA": 59.05,
+ "PIQA": 59.05,
  "CommonsenseQA": 63.66,
  "Race": 66.22,
  "MedMCQA": 49.14,
@@ -43,7 +43,7 @@
  "MMLU": 68.76,
  "ARC":72.32,
  "WinoGrande": 56.83,
- "PiQA": 61.21,
+ "PIQA": 61.21,
  "CommonsenseQA": 55.35,
  "Race": 70.17,
  "MedMCQA": 43.44,
@@ -51,11 +51,11 @@
  },
  {
  "model": "GPT-3.5",
- "Average": 60.30,
+ "Average": 60.32,
  "MMLU": 65.38,
  "ARC":78.24,
  "WinoGrande": 64.56,
- "PiQA": 54.89,
+ "PIQA": 54.89,
  "CommonsenseQA": 67.89,
  "Race": 60.11,
  "MedMCQA": 41.42,
@@ -63,11 +63,11 @@
  },
  {
  "model": "Gemini 1.0 Pro",
- "Average": 54.04,
+ "Average": 54.06,
  "MMLU": 56.04,
- "ARC":72.23,
+ "ARC":72.35,
  "WinoGrande": 56.35,
- "PiQA": 47.70,
+ "PIQA": 47.70,
  "CommonsenseQA": 50.56,
  "Race": 61.02,
  "MedMCQA": 35.89,
@@ -79,7 +79,7 @@
  "MMLU": 59.67,
  "ARC":67.09,
  "WinoGrande": 57.14,
- "PiQA": 43.10,
+ "PIQA": 43.10,
  "CommonsenseQA": 55.49,
  "Race": 58.21,
  "MedMCQA": 41.67,