|
[
|
|
{
|
|
"config": {
|
|
"model_name": "ChatGPT-4o-latest (2024-09-03)",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 94.56827761,
|
|
"Standard Deviation": 0.009435818,
|
|
"Rank": 4
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 82.306,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 91.701,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Probability": {
|
|
"Average Score": 86.681,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Logical": {
|
|
"Average Score": 97.425,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Social": {
|
|
"Average Score": 91.333,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 90.77,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"CPP": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt-4o-2024-08-06",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 83.58608983,
|
|
"Standard Deviation": 4.528687523,
|
|
"Rank": 12
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 86.632,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 95.242,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Probability": {
|
|
"Average Score": 78.89,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Logical": {
|
|
"Average Score": 77.458,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Social": {
|
|
"Average Score": 70.351,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 80.088,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"CPP": {
|
|
"Average Score": 92.43090226400756,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt-4o-2024-05-13",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 89.45175971,
|
|
"Standard Deviation": 0.030431012,
|
|
"Rank": 8
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 82.859,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 90.056,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Probability": {
|
|
"Average Score": 82.051,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Logical": {
|
|
"Average Score": 86.969,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Social": {
|
|
"Average Score": 67.017,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 84.501,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"CPP": {
|
|
"Average Score": 79.1592634699295,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt-4-turbo-2024-04-09",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 89.34848344,
|
|
"Standard Deviation": 0.303734513,
|
|
"Rank": 9
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 79.296,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 84.668,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Probability": {
|
|
"Average Score": 77.859,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Logical": {
|
|
"Average Score": 88.359,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Social": {
|
|
"Average Score": 67.671,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 79.61,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"CPP": {
|
|
"Average Score": 70.73143363230263,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemini-1.5-pro-001",
|
|
"organization": "Google",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 83.17822062,
|
|
"Standard Deviation": 4.166312552,
|
|
"Rank": 13
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 84.696,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 98.832,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Probability": {
|
|
"Average Score": 74.233,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Logical": {
|
|
"Average Score": 77.421,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Social": {
|
|
"Average Score": 70.057,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen2-72b-instruct",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/09"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 80.78104505,
|
|
"Standard Deviation": 2.776695545,
|
|
"Rank": 15
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 70.775,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 95.816,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"Probability": {
|
|
"Average Score": 80.38,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Logical": {
|
|
"Average Score": 71.975,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Social": {
|
|
"Average Score": 50.407,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 76.621,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"CPP": {
|
|
"Average Score": 73.54037778797029,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt-4o-mini-2024-07-18",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 85.99929202,
|
|
"Standard Deviation": 2.479470643,
|
|
"Rank": 11
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 79.42,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 89.997,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Probability": {
|
|
"Average Score": 78.89,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Logical": {
|
|
"Average Score": 84.755,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Social": {
|
|
"Average Score": 72.014,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 76.194,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"CPP": {
|
|
"Average Score": 88.3877070580296,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3.5-sonnet",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2024/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 90.43169444,
|
|
"Standard Deviation": 0.123754719,
|
|
"Rank": 7
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 74.36,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 83.137,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Probability": {
|
|
"Average Score": 73.278,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Logical": {
|
|
"Average Score": 88.581,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
},
|
|
"Social": {
|
|
"Average Score": 97.694,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 86.294,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
},
|
|
"CPP": {
|
|
"Average Score": 82.37734076815008,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3.5-sonnet-20241022",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 82.08873036,
|
|
"Standard Deviation": 20.89052134,
|
|
"Rank": 14
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 74.362,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 89.387,
|
|
"Standard Deviation": null,
|
|
"Rank": 11
|
|
},
|
|
"Probability": {
|
|
"Average Score": 73.919,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Logical": {
|
|
"Average Score": 90.514,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Social": {
|
|
"Average Score": 84.505,
|
|
"Standard Deviation": null,
|
|
"Rank": 7
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 85.611,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "o1-mini",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 97.53705747,
|
|
"Standard Deviation": 0.013240268,
|
|
"Rank": 2
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Probability": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Logical": {
|
|
"Average Score": 96.558,
|
|
"Standard Deviation": null,
|
|
"Rank": 3
|
|
},
|
|
"Social": {
|
|
"Average Score": 84.884,
|
|
"Standard Deviation": null,
|
|
"Rank": 6
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 93.717,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "o1-preview",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 93.04608514,
|
|
"Standard Deviation": 0.005729293,
|
|
"Rank": 5
|
|
},
|
|
"Geometry": {
|
|
"Average Score": "N/A",
|
|
"Standard Deviation": "N/A",
|
|
"Rank": "N/A"
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 99.212,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Probability": {
|
|
"Average Score": 94.181,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Logical": {
|
|
"Average Score": 100.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 1
|
|
},
|
|
"Social": {
|
|
"Average Score": 96.978,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemini-1.5-flash-001",
|
|
"organization": "Google",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 64.39324213,
|
|
"Standard Deviation": 1.348364198,
|
|
"Rank": 20
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 65.135,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 84.28,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Probability": {
|
|
"Average Score": 67.22,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Logical": {
|
|
"Average Score": 71.975,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Social": {
|
|
"Average Score": 60.374,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 79.569,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"CPP": {
|
|
"Average Score": 72.1127762005651,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt4-1106",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2024/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 88.25145246,
|
|
"Standard Deviation": 0.889714647,
|
|
"Rank": 10
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 61.784,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 80.579,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Probability": {
|
|
"Average Score": 70.693,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Logical": {
|
|
"Average Score": 75.513,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Social": {
|
|
"Average Score": 40.498,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 73.251,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"CPP": {
|
|
"Average Score": 69.11824072252848,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2-27b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/06"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 71.08619043,
|
|
"Standard Deviation": 41.54124623,
|
|
"Rank": 19
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 56.805,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 76.352,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Probability": {
|
|
"Average Score": 65.472,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Logical": {
|
|
"Average Score": 71.976,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Social": {
|
|
"Average Score": 47.308,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 69.606,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"CPP": {
|
|
"Average Score": 63.28920072143611,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3-opus",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 79.97608403,
|
|
"Standard Deviation": 5.382942441,
|
|
"Rank": 16
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 56.54,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 75.405,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Probability": {
|
|
"Average Score": 67.208,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Logical": {
|
|
"Average Score": 77.458,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Social": {
|
|
"Average Score": 80.318,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 79.694,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"CPP": {
|
|
"Average Score": 73.5404403567132,
|
|
"Standard Deviation": null,
|
|
"Rank": 8
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2-9b-it-simpo",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": "N/A",
|
|
"Standard Deviation": "N/A",
|
|
"Rank": "N/A"
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 51.492,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 70.836,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Probability": {
|
|
"Average Score": 58.976,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Logical": {
|
|
"Average Score": 62.887,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Social": {
|
|
"Average Score": 70.351,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 85.813,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"CPP": {
|
|
"Average Score": 73.43757596214863,
|
|
"Standard Deviation": null,
|
|
"Rank": 9
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-72b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 62.1296631,
|
|
"Standard Deviation": 10.31242823,
|
|
"Rank": 21
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 47.314,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 69.575,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Probability": {
|
|
"Average Score": 49.066,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Logical": {
|
|
"Average Score": 36.931,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Social": {
|
|
"Average Score": 40.498,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 53.127,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"CPP": {
|
|
"Average Score": 48.69302376665551,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-32b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 46.27600711,
|
|
"Standard Deviation": 4.159365923,
|
|
"Rank": 30
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 43.846,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 63.321,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Probability": {
|
|
"Average Score": 48.15,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Logical": {
|
|
"Average Score": 41.573,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Social": {
|
|
"Average Score": 38.018,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 48.041,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"CPP": {
|
|
"Average Score": 45.14284028264288,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "google-gemma-2-9b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/06"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 59.59324506,
|
|
"Standard Deviation": 5.156822857,
|
|
"Rank": 23
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 51.184,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 64.38,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Probability": {
|
|
"Average Score": 63.362,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Logical": {
|
|
"Average Score": 69.422,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Social": {
|
|
"Average Score": 76.113,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 58.379,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"CPP": {
|
|
"Average Score": 54.03167523687635,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "yi-1.5-34b-chat",
|
|
"organization": "01 AI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2024/05"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 72.39079733,
|
|
"Standard Deviation": 98.90928937,
|
|
"Rank": 18
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 52.638,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 64.055,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Probability": {
|
|
"Average Score": 64.137,
|
|
"Standard Deviation": null,
|
|
"Rank": 20
|
|
},
|
|
"Logical": {
|
|
"Average Score": 65.671,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Social": {
|
|
"Average Score": 47.308,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 57.484,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"CPP": {
|
|
"Average Score": 52.148798061768964,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "meta-llama-3.1-70b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3.1 Community",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 73.27773635,
|
|
"Standard Deviation": 5.72723675,
|
|
"Rank": 17
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 65.135,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 80.579,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
},
|
|
"Probability": {
|
|
"Average Score": 65.472,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Logical": {
|
|
"Average Score": 72.879,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Social": {
|
|
"Average Score": 60.374,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 71.8,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"CPP": {
|
|
"Average Score": 84.36815192532764,
|
|
"Standard Deviation": null,
|
|
"Rank": 4
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "meta-llama-3.1-8b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3.1 Community",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 52.8664657,
|
|
"Standard Deviation": 3.607384863,
|
|
"Rank": 27
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 41.384,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 62.508,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Probability": {
|
|
"Average Score": 51.889,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Logical": {
|
|
"Average Score": 53.587,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Social": {
|
|
"Average Score": 34.405,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 45.032,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"CPP": {
|
|
"Average Score": 44.41846841004584,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gpt3.5-turbo-0125",
|
|
"organization": "OpenAI",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2021/09"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 33.7046204,
|
|
"Standard Deviation": 45.16937959,
|
|
"Rank": 40
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 50.19,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 60.978,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Probability": {
|
|
"Average Score": 46.284,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Logical": {
|
|
"Average Score": 20.595,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Social": {
|
|
"Average Score": 24.926,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 42.78,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"CPP": {
|
|
"Average Score": 40.46958736582551,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama-3-70b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3 Community",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 59.24245274,
|
|
"Standard Deviation": 4.878897527,
|
|
"Rank": 25
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 45.249,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 60.736,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Probability": {
|
|
"Average Score": 54.515,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Logical": {
|
|
"Average Score": 83.08,
|
|
"Standard Deviation": null,
|
|
"Rank": 12
|
|
},
|
|
"Social": {
|
|
"Average Score": 42.172,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 71.8,
|
|
"Standard Deviation": null,
|
|
"Rank": 18
|
|
},
|
|
"CPP": {
|
|
"Average Score": 65.32140697218945,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3-sonnet",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 61.81320888,
|
|
"Standard Deviation": 10.27472205,
|
|
"Rank": 22
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 50.185,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 58.739,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Probability": {
|
|
"Average Score": 54.182,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Logical": {
|
|
"Average Score": 65.118,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
},
|
|
"Social": {
|
|
"Average Score": 55.325,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 69.778,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
},
|
|
"CPP": {
|
|
"Average Score": 61.33538592327427,
|
|
"Standard Deviation": null,
|
|
"Rank": 15
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-14b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 43.97760317,
|
|
"Standard Deviation": 3.740375694,
|
|
"Rank": 31
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 35.5,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 57.821,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Probability": {
|
|
"Average Score": 38.886,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Logical": {
|
|
"Average Score": 34.775,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Social": {
|
|
"Average Score": 31.022,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 40.55,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"CPP": {
|
|
"Average Score": 38.552779976347026,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-3-haiku",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 55.60534246,
|
|
"Standard Deviation": 15.07600975,
|
|
"Rank": 26
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 41.806,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 54.298,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Probability": {
|
|
"Average Score": 49.344,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Logical": {
|
|
"Average Score": 61.904,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Social": {
|
|
"Average Score": 50.407,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 61.491,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
},
|
|
"CPP": {
|
|
"Average Score": 56.40200048817984,
|
|
"Standard Deviation": null,
|
|
"Rank": 16
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-2.1",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 40.35699809,
|
|
"Standard Deviation": 2.484317383,
|
|
"Rank": 35
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 49.899,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 53.574,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Probability": {
|
|
"Average Score": 44.011,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Logical": {
|
|
"Average Score": 59.855,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"Social": {
|
|
"Average Score": 33.888,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 51.038,
|
|
"Standard Deviation": null,
|
|
"Rank": 26
|
|
},
|
|
"CPP": {
|
|
"Average Score": 47.23672563994903,
|
|
"Standard Deviation": null,
|
|
"Rank": 21
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "mistral-8x7b-instruct-v0.1",
|
|
"organization": "Mistral",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 43.2937322,
|
|
"Standard Deviation": 2.659857412,
|
|
"Rank": 32
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 32.639,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 48.901,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Probability": {
|
|
"Average Score": 44.058,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Logical": {
|
|
"Average Score": 42.194,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Social": {
|
|
"Average Score": 26.702,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 47.192,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"CPP": {
|
|
"Average Score": 44.533118241976666,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "claude-2.0",
|
|
"organization": "Anthropic",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 30.83692551,
|
|
"Standard Deviation": 1.816269,
|
|
"Rank": 43
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 37.452,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 48.965,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Probability": {
|
|
"Average Score": 46.284,
|
|
"Standard Deviation": null,
|
|
"Rank": 29
|
|
},
|
|
"Logical": {
|
|
"Average Score": 55.657,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Social": {
|
|
"Average Score": 42.117,
|
|
"Standard Deviation": null,
|
|
"Rank": 25
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 55.869,
|
|
"Standard Deviation": null,
|
|
"Rank": 24
|
|
},
|
|
"CPP": {
|
|
"Average Score": 50.773143448036464,
|
|
"Standard Deviation": null,
|
|
"Rank": 19
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "starling-lm-7b-beta",
|
|
"organization": "Nexusflow",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2024/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 50.05304991,
|
|
"Standard Deviation": 3.017802027,
|
|
"Rank": 28
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 33.79,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 49.685,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Probability": {
|
|
"Average Score": 39.677,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Logical": {
|
|
"Average Score": 47.501,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Social": {
|
|
"Average Score": 37.7,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 40.274,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"CPP": {
|
|
"Average Score": 38.27587102395908,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemini-1.0-pro-001",
|
|
"organization": "Google",
|
|
"license": "Proprietary",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 38.71255653,
|
|
"Standard Deviation": 8.592349353,
|
|
"Rank": 37
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 34.596,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 48.159,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Probability": {
|
|
"Average Score": 29.585,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Logical": {
|
|
"Average Score": 23.882,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Social": {
|
|
"Average Score": 13.261,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 46.637,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"CPP": {
|
|
"Average Score": 45.22204471452975,
|
|
"Standard Deviation": null,
|
|
"Rank": 23
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openchat-3.5-0106",
|
|
"organization": "OpenChat",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2024/01"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 40.85094215,
|
|
"Standard Deviation": 6.631820541,
|
|
"Rank": 34
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 29.115,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 45.456,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Probability": {
|
|
"Average Score": 38.408,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Logical": {
|
|
"Average Score": 41.678,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
},
|
|
"Social": {
|
|
"Average Score": 28.236,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 34.68,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"CPP": {
|
|
"Average Score": 33.70639271807677,
|
|
"Standard Deviation": null,
|
|
"Rank": 33
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openchat-3.5",
|
|
"organization": "OpenChat",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 39.20699952,
|
|
"Standard Deviation": 1.576169927,
|
|
"Rank": 36
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 30.009,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 42.04,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Probability": {
|
|
"Average Score": 34.495,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Logical": {
|
|
"Average Score": 35.828,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Social": {
|
|
"Average Score": 33.096,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 36.737,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"CPP": {
|
|
"Average Score": 33.020911255646965,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "command-r-(08-2024)",
|
|
"organization": "Cohere",
|
|
"license": "CC-BY-NC-4.0",
|
|
"knowledge_cutoff": "2024/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 46.70245901,
|
|
"Standard Deviation": 3.665464964,
|
|
"Rank": 29
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 35.43,
|
|
"Standard Deviation": null,
|
|
"Rank": 32
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 41.852,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Probability": {
|
|
"Average Score": 36.535,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Logical": {
|
|
"Average Score": 25.941,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Social": {
|
|
"Average Score": 30.911,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 41.629,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"CPP": {
|
|
"Average Score": 39.61492485677676,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-1.1-7b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 32.61912991,
|
|
"Standard Deviation": 17.86038512,
|
|
"Rank": 41
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 25.149,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 40.456,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Probability": {
|
|
"Average Score": 29.307,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Logical": {
|
|
"Average Score": 41.543,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
},
|
|
"Social": {
|
|
"Average Score": 21.473,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 45.033,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"CPP": {
|
|
"Average Score": 42.666504105798204,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama3-8b-instruct",
|
|
"organization": "Meta",
|
|
"license": "Llama 3 Community",
|
|
"knowledge_cutoff": "2023/03"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 37.29361351,
|
|
"Standard Deviation": 8.841996174,
|
|
"Rank": 39
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 28.496,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 42.117,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Probability": {
|
|
"Average Score": 33.841,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Logical": {
|
|
"Average Score": 57.763,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"Social": {
|
|
"Average Score": 35.994,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 50.023,
|
|
"Standard Deviation": null,
|
|
"Rank": 27
|
|
},
|
|
"CPP": {
|
|
"Average Score": 45.35392139264795,
|
|
"Standard Deviation": null,
|
|
"Rank": 22
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 59.3544514,
|
|
"Standard Deviation": 14.50864762,
|
|
"Rank": 24
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 29.077,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 39.677,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Probability": {
|
|
"Average Score": 31.561,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Logical": {
|
|
"Average Score": 43.458,
|
|
"Standard Deviation": null,
|
|
"Rank": 31
|
|
},
|
|
"Social": {
|
|
"Average Score": 39.343,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 31.156,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"CPP": {
|
|
"Average Score": 30.53406933106768,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "starling-lm-7b-alpha",
|
|
"organization": "Nexusflow",
|
|
"license": "Apache-2.0",
|
|
"knowledge_cutoff": "2023/11"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 37.94593338,
|
|
"Standard Deviation": 1.40532208,
|
|
"Rank": 38
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 25.519,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 38.88,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Probability": {
|
|
"Average Score": 32.068,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Logical": {
|
|
"Average Score": 33.804,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"Social": {
|
|
"Average Score": 30.875,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 31.354,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"CPP": {
|
|
"Average Score": 30.07926487356878,
|
|
"Standard Deviation": null,
|
|
"Rank": 37
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "qwen1.5-4b-chat",
|
|
"organization": "Alibaba",
|
|
"license": "Qianwen LICENSE",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 9.779979052,
|
|
"Standard Deviation": 0.925129318,
|
|
"Rank": 54
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 15.672,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 31.21,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Probability": {
|
|
"Average Score": 13.853,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Logical": {
|
|
"Average Score": 13.842,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Social": {
|
|
"Average Score": 20.21,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 14.794,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"CPP": {
|
|
"Average Score": 13.21208067122554,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "command-r-(04-2024)",
|
|
"organization": "Cohere",
|
|
"license": "CC-BY-NC-4.0",
|
|
"knowledge_cutoff": "2024/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 42.49175095,
|
|
"Standard Deviation": 5.556047496,
|
|
"Rank": 33
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 23.438,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 31.204,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Probability": {
|
|
"Average Score": 30.726,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"Logical": {
|
|
"Average Score": 35.111,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Social": {
|
|
"Average Score": 30.623,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 42.316,
|
|
"Standard Deviation": null,
|
|
"Rank": 34
|
|
},
|
|
"CPP": {
|
|
"Average Score": 41.346336503003236,
|
|
"Standard Deviation": null,
|
|
"Rank": 28
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-33b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/08"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 27.90851915,
|
|
"Standard Deviation": 4.55056913,
|
|
"Rank": 44
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 16.634,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 25.075,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Probability": {
|
|
"Average Score": 20.901,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Logical": {
|
|
"Average Score": 22.962,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Social": {
|
|
"Average Score": 28.487,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 31.156,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
},
|
|
"CPP": {
|
|
"Average Score": 28.01838653090379,
|
|
"Standard Deviation": null,
|
|
"Rank": 38
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-7b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 25.25380823,
|
|
"Standard Deviation": 3.455163419,
|
|
"Rank": 46
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 19.626,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 23.272,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Probability": {
|
|
"Average Score": 16.98,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Logical": {
|
|
"Average Score": 24.359,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Social": {
|
|
"Average Score": 23.52,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 31.139,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"CPP": {
|
|
"Average Score": 28.014658234926813,
|
|
"Standard Deviation": null,
|
|
"Rank": 39
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "mistral-7b-instruct-2",
|
|
"organization": "Mistral",
|
|
"license": "Apache 2.0",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 31.49596208,
|
|
"Standard Deviation": 11.79471585,
|
|
"Rank": 42
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 16.847,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 23.287,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Probability": {
|
|
"Average Score": 24.868,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
},
|
|
"Logical": {
|
|
"Average Score": 28.755,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
},
|
|
"Social": {
|
|
"Average Score": 21.473,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 31.994,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
},
|
|
"CPP": {
|
|
"Average Score": 31.382959631870822,
|
|
"Standard Deviation": null,
|
|
"Rank": 35
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "mistral-7b-instruct-1",
|
|
"organization": "Mistral",
|
|
"license": "Apache 2.0",
|
|
"knowledge_cutoff": "2023/12"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 20.53586787,
|
|
"Standard Deviation": 2.95650198,
|
|
"Rank": 51
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 11.019,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 20.39,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Probability": {
|
|
"Average Score": 24.279,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"Logical": {
|
|
"Average Score": 16.823,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Social": {
|
|
"Average Score": 12.369,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 22.121,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"CPP": {
|
|
"Average Score": 18.929093202755805,
|
|
"Standard Deviation": null,
|
|
"Rank": 42
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-13b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 17.42296198,
|
|
"Standard Deviation": 4.480901647,
|
|
"Rank": 52
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 12.755,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 17.974,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Probability": {
|
|
"Average Score": 13.004,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Logical": {
|
|
"Average Score": 16.997,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Social": {
|
|
"Average Score": 14.314,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 25.307,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
},
|
|
"CPP": {
|
|
"Average Score": 21.840013221590294,
|
|
"Standard Deviation": null,
|
|
"Rank": 40
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "zephyr-7b-beta",
|
|
"organization": "HuggingFace",
|
|
"license": "MIT",
|
|
"knowledge_cutoff": "2023/10"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 9.323654861,
|
|
"Standard Deviation": 0.338544041,
|
|
"Rank": 55
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 8.222,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 13.006,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Probability": {
|
|
"Average Score": 7.573,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Logical": {
|
|
"Average Score": 7.364,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Social": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 17.18,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"CPP": {
|
|
"Average Score": 18.92902220864132,
|
|
"Standard Deviation": null,
|
|
"Rank": 43
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-1.1-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 22.44740296,
|
|
"Standard Deviation": 3.95922917,
|
|
"Rank": 49
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 12.834,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 12.291,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Probability": {
|
|
"Average Score": 8.228,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Logical": {
|
|
"Average Score": 10.822,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Social": {
|
|
"Average Score": 19.303,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 19.892,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"CPP": {
|
|
"Average Score": 20.724691953843916,
|
|
"Standard Deviation": null,
|
|
"Rank": 41
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama2-7b-chat",
|
|
"organization": "Meta",
|
|
"license": "Llama 2 Community",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 23.53840413,
|
|
"Standard Deviation": 4.565404574,
|
|
"Rank": 47
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 5.681,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 9.809,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Probability": {
|
|
"Average Score": 8.089,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Logical": {
|
|
"Average Score": 20.474,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Social": {
|
|
"Average Score": 15.968,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 18.153,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"CPP": {
|
|
"Average Score": 15.730513733660898,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "gemma-2b-it",
|
|
"organization": "Google",
|
|
"license": "Gemma License",
|
|
"knowledge_cutoff": "2024/02"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 20.86803148,
|
|
"Standard Deviation": 4.810898787,
|
|
"Rank": 50
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 15.137,
|
|
"Standard Deviation": null,
|
|
"Rank": 47
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 10.108,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Probability": {
|
|
"Average Score": 6.688,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Logical": {
|
|
"Average Score": 5.296,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Social": {
|
|
"Average Score": 9.63,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 18.153,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"CPP": {
|
|
"Average Score": 17.2715657115764,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama2-13b-chat",
|
|
"organization": "Meta",
|
|
"license": "Llama 2 Community",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 23.34503255,
|
|
"Standard Deviation": 4.939571996,
|
|
"Rank": 48
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 4.017,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 7.201,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Probability": {
|
|
"Average Score": 11.451,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Logical": {
|
|
"Average Score": 23.912,
|
|
"Standard Deviation": null,
|
|
"Rank": 44
|
|
},
|
|
"Social": {
|
|
"Average Score": 15.715,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 14.773,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"CPP": {
|
|
"Average Score": 13.17258252933903,
|
|
"Standard Deviation": null,
|
|
"Rank": 48
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "vicuna-7b",
|
|
"organization": "LMSYS",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/07"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 16.78668722,
|
|
"Standard Deviation": 4.782003459,
|
|
"Rank": 53
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 5.299,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 7.014,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Probability": {
|
|
"Average Score": 8.228,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"Logical": {
|
|
"Average Score": 11.753,
|
|
"Standard Deviation": null,
|
|
"Rank": 53
|
|
},
|
|
"Social": {
|
|
"Average Score": 11.326,
|
|
"Standard Deviation": null,
|
|
"Rank": 54
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 15.092,
|
|
"Standard Deviation": null,
|
|
"Rank": 52
|
|
},
|
|
"CPP": {
|
|
"Average Score": 14.255194156624162,
|
|
"Standard Deviation": null,
|
|
"Rank": 46
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "koala-13b",
|
|
"organization": "UC Berkeley",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 8.747324657,
|
|
"Standard Deviation": 0.645177403,
|
|
"Rank": 56
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.156,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 2.242,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Probability": {
|
|
"Average Score": 3.323,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Logical": {
|
|
"Average Score": 8.156,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Social": {
|
|
"Average Score": 9.649,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 6.672,
|
|
"Standard Deviation": null,
|
|
"Rank": 55
|
|
},
|
|
"CPP": {
|
|
"Average Score": 6.36433272373514,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "openassistant-pythia-12b",
|
|
"organization": "OpenAssistant",
|
|
"license": "Non-commercial",
|
|
"knowledge_cutoff": "2023/04"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 0,
|
|
"Standard Deviation": 0,
|
|
"Rank": 57
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Probability": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Logical": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 58
|
|
},
|
|
"Social": {
|
|
"Average Score": 1.637,
|
|
"Standard Deviation": null,
|
|
"Rank": 57
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 56
|
|
},
|
|
"CPP": {
|
|
"Average Score": 0.0,
|
|
"Standard Deviation": null,
|
|
"Rank": 50
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "nemotron-70b",
|
|
"organization": "NVIDIA",
|
|
"license": "Unknown",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 100,
|
|
"Standard Deviation": 0,
|
|
"Rank": 1
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 79.813,
|
|
"Standard Deviation": null,
|
|
"Rank": 17
|
|
},
|
|
"Geometry": {
|
|
"Average Score": 67.014,
|
|
"Standard Deviation": null,
|
|
"Rank": 13
|
|
},
|
|
"Probability": {
|
|
"Average Score": 75.535,
|
|
"Standard Deviation": null,
|
|
"Rank": 10
|
|
},
|
|
"Logical": {
|
|
"Average Score": 92.659,
|
|
"Standard Deviation": null,
|
|
"Rank": 5
|
|
},
|
|
"Social": {
|
|
"Average Score": 99.677,
|
|
"Standard Deviation": null,
|
|
"Rank": 2
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 76.262,
|
|
"Standard Deviation": null,
|
|
"Rank": 14
|
|
}
|
|
}
|
|
}, |
|
{ |
|
"config": { |
|
"model_name": "yi-lightning", |
|
"organization": "01 AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 96.84467293, |
|
"Standard Deviation": 0.033152361, |
|
"Rank": 3 |
|
}, |
|
"Geometry": { |
|
"Average Score": 77.667, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"Algebra": { |
|
"Average Score": 93.245, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 100.000, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Logical": { |
|
"Average Score": 94.660, |
|
"Standard Deviation": null, |
|
"Rank": 4 |
|
}, |
|
"Social": { |
|
"Average Score": 83.236, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"Probability": { |
|
"Average Score": 90.329, |
|
"Standard Deviation": null, |
|
"Rank": 3 |
|
} |
|
} |
|
}, |
|
{ |
|
"config": { |
|
"model_name": "glm-4-plus", |
|
"organization": "Zhipu AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "Unknown" |
|
}, |
|
"results": { |
|
"OVERALL": { |
|
"Average Score": 92.48932574, |
|
"Standard Deviation": 0.087973142, |
|
"Rank": 6 |
|
}, |
|
"Geometry": { |
|
"Average Score": 76.965, |
|
"Standard Deviation": null, |
|
"Rank": 9 |
|
}, |
|
"Algebra": { |
|
"Average Score": 91.701, |
|
"Standard Deviation": null, |
|
"Rank": 7 |
|
}, |
|
"Chemistry": { |
|
"Average Score": 83.527, |
|
"Standard Deviation": null, |
|
"Rank": 8 |
|
}, |
|
"Logical": { |
|
"Average Score": 92.348, |
|
"Standard Deviation": null, |
|
"Rank": 6 |
|
}, |
|
"Social": { |
|
"Average Score": 100.0, |
|
"Standard Deviation": null, |
|
"Rank": 1 |
|
}, |
|
"Probability": { |
|
"Average Score": 74.233, |
|
"Standard Deviation": null, |
|
"Rank": 12 |
|
} |
|
} |
|
},
|
|
{
|
|
"config": {
|
|
"model_name": "llama-3.2-3b-it",
|
|
"organization": "Meta",
|
|
"license": "Llama 3 Community",
|
|
"knowledge_cutoff": "Unknown"
|
|
},
|
|
"results": {
|
|
"OVERALL": {
|
|
"Average Score": 26.58569941,
|
|
"Standard Deviation": 4.191042423,
|
|
"Rank": 45
|
|
},
|
|
"Algebra": {
|
|
"Average Score": 56.545,
|
|
"Standard Deviation": null,
|
|
"Rank": 30
|
|
},
|
|
"Probability": {
|
|
"Average Score": 37.496,
|
|
"Standard Deviation": null,
|
|
"Rank": 36
|
|
},
|
|
"Logical": {
|
|
"Average Score": 15.188,
|
|
"Standard Deviation": null,
|
|
"Rank": 51
|
|
},
|
|
"Social": {
|
|
"Average Score": 15.924,
|
|
"Standard Deviation": null,
|
|
"Rank": 49
|
|
},
|
|
"Chemistry": {
|
|
"Average Score": 30.78,
|
|
"Standard Deviation": null,
|
|
"Rank": 45
|
|
}
|
|
}
|
|
}
|
|
] |