[
    {
        "model": "GPT-4o",
        "Average": 70.15,
        "MMLU": 70.09,
        "ARC": 86.33,
        "WinoGrande": 72.22,
        "PiQA": 60.34,
        "CommonsenseQA": 70.28,
        "Race": 67.87,
        "MedMCQA": 57.85,
        "OpenbookQA": 67.21
    },
    {
        "model": "GPT-4",
        "Average": 65.93,
        "MMLU": 74.77,
        "ARC": 82.68,
        "WinoGrande": 66.22,
        "PiQA": 61.64,
        "CommonsenseQA": 62.96,
        "Race": 67.05,
        "MedMCQA": 51.81,
        "OpenbookQA": 60.29
    },
    {
        "model": "Claude-3 Opus",
        "Average": 62.68,
        "MMLU": 70.23,
        "ARC": 76.62,
        "WinoGrande": 63.54,
        "PiQA": 59.05,
        "CommonsenseQA": 63.66,
        "Race": 66.22,
        "MedMCQA": 49.14,
        "OpenbookQA": 52.95
    },
    {
        "model": "Mistral Large",
        "Average": 60.48,
        "MMLU": 68.76,
        "ARC": 72.32,
        "WinoGrande": 56.83,
        "PiQA": 61.21,
        "CommonsenseQA": 55.35,
        "Race": 70.17,
        "MedMCQA": 43.44,
        "OpenbookQA": 58.66
    },
    {
        "model": "GPT-3.5",
        "Average": 60.30,
        "MMLU": 65.38,
        "ARC": 78.24,
        "WinoGrande": 64.56,
        "PiQA": 54.89,
        "CommonsenseQA": 67.89,
        "Race": 60.11,
        "MedMCQA": 41.42,
        "OpenbookQA": 49.90
    },
    {
        "model": "Gemini 1.0 Pro",
        "Average": 54.04,
        "MMLU": 56.04,
        "ARC": 72.23,
        "WinoGrande": 56.35,
        "PiQA": 47.70,
        "CommonsenseQA": 50.56,
        "Race": 61.02,
        "MedMCQA": 35.89,
        "OpenbookQA": 52.55
    },
    {
        "model": "Llama3-70b-instruct",
        "Average": 52.92,
        "MMLU": 59.67,
        "ARC": 67.09,
        "WinoGrande": 57.14,
        "PiQA": 43.10,
        "CommonsenseQA": 55.49,
        "Race": 58.21,
        "MedMCQA": 41.67,
        "OpenbookQA": 40.94
    }
]