OSQ-Leaderboard / big.json
[
{
"model": "GPT-4o",
"Average": 70.15,
"MMLU": 70.09,
"ARC":86.31,
"WinoGrande":72.22,
"PIQA":60.34,
"CommonsenseQA":70.28,
"Race":67.87 ,
"MedMCQA":57.85 ,
"OpenkookQA":67.21
},
{
"model": "GPT-4-1106-preview",
"Average": 65.93,
"MMLU": 74.77,
"ARC":82.68,
"WinoGrande": 66.22,
"PIQA": 61.64,
"CommonsenseQA": 62.96,
"Race": 67.05,
"MedMCQA": 51.81,
"OpenkookQA": 60.29
},
{
"model": "Claude-3 Opus",
"Average": 62.53,
"MMLU": 70.23,
"ARC":75.47,
"WinoGrande": 63.54,
"PIQA": 59.05,
"CommonsenseQA": 63.66,
"Race": 66.22,
"MedMCQA": 49.14,
"OpenkookQA": 52.95
},
{
"model": "Mistral Large",
"Average": 60.48,
"MMLU": 68.76,
"ARC":72.32,
"WinoGrande": 56.83,
"PIQA": 61.21,
"CommonsenseQA": 55.35,
"Race": 70.17,
"MedMCQA": 43.44,
"OpenkookQA": 58.66
},
{
"model": "GPT-3.5",
"Average": 60.32,
"MMLU": 65.38,
"ARC":78.24,
"WinoGrande": 64.56,
"PIQA": 54.89,
"CommonsenseQA": 67.89,
"Race": 60.11,
"MedMCQA": 41.42,
"OpenkookQA": 49.90
},
{
"model": "Gemini 1.0 Pro",
"Average": 54.06,
"MMLU": 56.04,
"ARC":72.35,
"WinoGrande": 56.35,
"PIQA": 47.70,
"CommonsenseQA": 50.56,
"Race": 61.02,
"MedMCQA": 35.89,
"OpenkookQA": 52.55
},
{
"model": "Llama3-70b-instruct",
"Average": 52.92,
"MMLU": 59.67,
"ARC":67.09,
"WinoGrande": 57.14,
"PIQA": 43.10,
"CommonsenseQA": 55.49,
"Race": 58.21,
"MedMCQA": 41.67,
"OpenkookQA": 40.94
}
]
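
A minimal sketch of how this file might be consumed, assuming it is read from a local copy named big.json; only the field names ("model", "Average", "MMLU", ...) come from the file itself, while the path and the printed layout are illustrative assumptions, not part of the leaderboard code.

import json

# Load the leaderboard entries (assumed local path to this file).
with open("big.json") as f:
    rows = json.load(f)

# Rank models by their reported "Average" score, highest first.
rows.sort(key=lambda r: r["Average"], reverse=True)

for r in rows:
    print(f'{r["model"]:<22} Average={r["Average"]:.2f} MMLU={r["MMLU"]:.2f}')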