Spaces:
Runtime error
Runtime error
[
  {
    "model": "GPT-4o",
    "Average": 70.15,
    "MMLU": 70.09,
    "ARC": 86.31,
    "WinoGrande": 72.22,
    "PIQA": 60.34,
    "CommonsenseQA": 70.28,
    "Race": 67.87,
    "MedMCQA": 57.85,
    "OpenkookQA": 67.21
  },
  {
    "model": "GPT-4-1106-preview",
    "Average": 65.93,
    "MMLU": 74.77,
    "ARC": 82.68,
    "WinoGrande": 66.22,
    "PIQA": 61.64,
    "CommonsenseQA": 62.96,
    "Race": 67.05,
    "MedMCQA": 51.81,
    "OpenkookQA": 60.29
  },
  {
    "model": "Claude-3 Opus",
    "Average": 62.53,
    "MMLU": 70.23,
    "ARC": 75.47,
    "WinoGrande": 63.54,
    "PIQA": 59.05,
    "CommonsenseQA": 63.66,
    "Race": 66.22,
    "MedMCQA": 49.14,
    "OpenkookQA": 52.95
  },
  {
    "model": "Mistral Large",
    "Average": 60.48,
    "MMLU": 68.76,
    "ARC": 72.32,
    "WinoGrande": 56.83,
    "PIQA": 61.21,
    "CommonsenseQA": 55.35,
    "Race": 70.17,
    "MedMCQA": 43.44,
    "OpenkookQA": 58.66
  },
  {
    "model": "GPT-3.5",
    "Average": 60.32,
    "MMLU": 65.38,
    "ARC": 78.24,
    "WinoGrande": 64.56,
    "PIQA": 54.89,
    "CommonsenseQA": 67.89,
    "Race": 60.11,
    "MedMCQA": 41.42,
    "OpenkookQA": 49.90
  },
  {
    "model": "Gemini 1.0 Pro",
    "Average": 54.06,
    "MMLU": 56.04,
    "ARC": 72.35,
    "WinoGrande": 56.35,
    "PIQA": 47.70,
    "CommonsenseQA": 50.56,
    "Race": 61.02,
    "MedMCQA": 35.89,
    "OpenkookQA": 52.55
  },
  {
    "model": "Llama3-70b-instruct",
    "Average": 52.92,
    "MMLU": 59.67,
    "ARC": 67.09,
    "WinoGrande": 57.14,
    "PIQA": 43.10,
    "CommonsenseQA": 55.49,
    "Race": 58.21,
    "MedMCQA": 41.67,
    "OpenkookQA": 40.94
  }
]