[
    {
        "model": "GPT-4o",
        "Average": 70.15,
        "MMLU": 70.09,
        "ARC": 86.31,
        "WinoGrande": 72.22,
        "PIQA": 60.34,
        "CommonsenseQA": 70.28,
        "Race": 67.87,
        "MedMCQA": 57.85,
        "OpenBookQA": 67.21
    },
    {
        "model": "GPT-4-1106-preview",
        "Average": 65.93,
        "MMLU": 74.77,
        "ARC": 82.68,
        "WinoGrande": 66.22,
        "PIQA": 61.64,
        "CommonsenseQA": 62.96,
        "Race": 67.05,
        "MedMCQA": 51.81,
        "OpenBookQA": 60.29
    },
    {
        "model": "Claude-3 Opus",
        "Average": 62.53,
        "MMLU": 70.23,
        "ARC": 75.47,
        "WinoGrande": 63.54,
        "PIQA": 59.05,
        "CommonsenseQA": 63.66,
        "Race": 66.22,
        "MedMCQA": 49.14,
        "OpenBookQA": 52.95
    },
    {
        "model": "Mistral Large",
        "Average": 60.48,
        "MMLU": 68.76,
        "ARC": 72.32,
        "WinoGrande": 56.83,
        "PIQA": 61.21,
        "CommonsenseQA": 55.35,
        "Race": 70.17,
        "MedMCQA": 43.44,
        "OpenBookQA": 58.66
    },
    {
        "model": "GPT-3.5",
        "Average": 60.32,
        "MMLU": 65.38,
        "ARC": 78.24,
        "WinoGrande": 64.56,
        "PIQA": 54.89,
        "CommonsenseQA": 67.89,
        "Race": 60.11,
        "MedMCQA": 41.42,
        "OpenBookQA": 49.90
    },
    {
        "model": "Gemini 1.0 Pro",
        "Average": 54.06,
        "MMLU": 56.04,
        "ARC": 72.35,
        "WinoGrande": 56.35,
        "PIQA": 47.70,
        "CommonsenseQA": 50.56,
        "Race": 61.02,
        "MedMCQA": 35.89,
        "OpenBookQA": 52.55
    },
    {
        "model": "Llama3-70b-instruct",
        "Average": 52.92,
        "MMLU": 59.67,
        "ARC": 67.09,
        "WinoGrande": 57.14,
        "PIQA": 43.10,
        "CommonsenseQA": 55.49,
        "Race": 58.21,
        "MedMCQA": 41.67,
        "OpenBookQA": 40.94
    }
]