[
{
"model": "Phi-3-mini-128k-instruct (3.8B)",
"Average": 40.00,
"MMLU": 36.97,
"ARC":60.94,
"WinoGrande": 46.88,
"PIQA": 32.04,
"CommonsenseQA": 49.15,
"Race": 37.81,
"MedMCQA": 22.61,
"OpenkookQA": 33.60
},
{
"model": "Qwen1.5 (1.8B)",
"Average": 21.68,
"MMLU": 9.99,
"ARC":15.84 ,
"WinoGrande": 40.96,
"PIQA": 15.52,
"CommonsenseQA": 31.13,
"Race": 34.91,
"MedMCQA": 4.7,
"OpenkookQA": 20.37
},
{
"model": "Gemma (2B)",
"Average": 16.66,
"MMLU": 17.52,
"ARC":23.93,
"WinoGrande": 16.10,
"PIQA": 15.09,
"CommonsenseQA": 27.46,
"Race": 14.32,
"MedMCQA": 4.57,
"OpenkookQA": 14.26
},
{
"model": "SlimPajama-DC (1.3B)",
"Average": 9.60,
"MMLU": 9.22,
"ARC":14.95,
"WinoGrande": 14.76,
"PIQA": 5.32,
"CommonsenseQA": 9.01,
"Race": 16.19,
"MedMCQA": 1.68,
"OpenkookQA": 5.70
},
{
"model": "RedPajama (1B)",
"Average": 9.00,
"MMLU": 9.21,
"ARC":13.5,
"WinoGrande": 16.97,
"PIQA": 0.86,
"CommonsenseQA": 11.41,
"Race": 14.35,
"MedMCQA": 1.86,
"OpenkookQA": 3.87
},
{
"model": "OLMo (1.2B)",
"Average": 8.85,
"MMLU": 8.54,
"ARC":13.18,
"WinoGrande": 6.16,
"PIQA": 8.05,
"CommonsenseQA": 13.10,
"Race": 13.61,
"MedMCQA": 2.07,
"OpenkookQA": 6.11
},
{
"model": "Pythia (1.4B)",
"Average": 8.79,
"MMLU": 9.66,
"ARC":14.69,
"WinoGrande": 11.52,
"PIQA": 4.17,
"CommonsenseQA": 9.01,
"Race": 12.76,
"MedMCQA": 3.19,
"OpenkookQA": 5.30
},
{
"model": "TinyLLama (1.1B)",
"Average": 8.45,
"MMLU": 8.94,
"ARC":13.31,
"WinoGrande": 12.23,
"PIQA": 3.59,
"CommonsenseQA": 6.06,
"Race": 16.7,
"MedMCQA": 2.07,
"OpenkookQA": 4.68
},
{
"model": "OPT (1.3B)",
"Average": 7.89,
"MMLU": 7.40,
"ARC":11.83,
"WinoGrande": 12.47,
"PIQA": 4.48,
"CommonsenseQA": 7.61,
"Race": 13.61,
"MedMCQA": 1.25,
"OpenkookQA": 4.48
},
{
"model": "GPT-Neo (1.3B)",
"Average": 7.42,
"MMLU": 6.94,
"ARC": 6.69,
"WinoGrande": 10.81,
"PIQA": 4.31,
"CommonsenseQA": 6.34,
"Race": 13.75,
"MedMCQA": 2.63,
"OpenkookQA": 4.89
},
{
"model": "Cerebras-GPT (1.3B)",
"Average": 4.86,
"MMLU": 5.37,
"ARC":4.43,
"WinoGrande": 9.31,
"PIQA": 2.16,
"CommonsenseQA": 6.2,
"Race": 6.9,
"MedMCQA": 1.04,
"OpenkookQA": 3.46
}
]