[
{
"config": {
"model_name": "ChatGPT-4o-latest (2024-09-03)",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 94.56827761,
"Standard Deviation": 0.009435818,
"Rank": 4
},
"Geometry": {
"Average Score": 82.306,
"Standard Deviation": null,
"Rank": 5
},
"Algebra": {
"Average Score": 91.701,
"Standard Deviation": null,
"Rank": 8
},
"Probability": {
"Average Score": 86.681,
"Standard Deviation": null,
"Rank": 4
},
"Logical": {
"Average Score": 97.425,
"Standard Deviation": null,
"Rank": 2
},
"Social": {
"Average Score": 91.333,
"Standard Deviation": null,
"Rank": 5
},
"Chemistry": {
"Average Score": 90.77,
"Standard Deviation": null,
"Rank": 3
},
"CPP": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-08-06",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 83.58608983,
"Standard Deviation": 4.528687523,
"Rank": 12
},
"Geometry": {
"Average Score": 86.632,
"Standard Deviation": null,
"Rank": 2
},
"Algebra": {
"Average Score": 95.242,
"Standard Deviation": null,
"Rank": 5
},
"Probability": {
"Average Score": 78.89,
"Standard Deviation": null,
"Rank": 8
},
"Logical": {
"Average Score": 77.458,
"Standard Deviation": null,
"Rank": 14
},
"Social": {
"Average Score": 70.351,
"Standard Deviation": null,
"Rank": 13
},
"Chemistry": {
"Average Score": 80.088,
"Standard Deviation": null,
"Rank": 9
},
"CPP": {
"Average Score": 92.43090226400756,
"Standard Deviation": null,
"Rank": 2
}
}
},
{
"config": {
"model_name": "gpt-4o-2024-05-13",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 89.45175971,
"Standard Deviation": 0.030431012,
"Rank": 8
},
"Geometry": {
"Average Score": 82.859,
"Standard Deviation": null,
"Rank": 4
},
"Algebra": {
"Average Score": 90.056,
"Standard Deviation": null,
"Rank": 9
},
"Probability": {
"Average Score": 82.051,
"Standard Deviation": null,
"Rank": 5
},
"Logical": {
"Average Score": 86.969,
"Standard Deviation": null,
"Rank": 10
},
"Social": {
"Average Score": 67.017,
"Standard Deviation": null,
"Rank": 16
},
"Chemistry": {
"Average Score": 84.501,
"Standard Deviation": null,
"Rank": 7
},
"CPP": {
"Average Score": 79.1592634699295,
"Standard Deviation": null,
"Rank": 6
}
}
},
{
"config": {
"model_name": "gpt-4-turbo-2024-04-09",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 89.34848344,
"Standard Deviation": 0.303734513,
"Rank": 9
},
"Geometry": {
"Average Score": 79.296,
"Standard Deviation": null,
"Rank": 7
},
"Algebra": {
"Average Score": 84.668,
"Standard Deviation": null,
"Rank": 12
},
"Probability": {
"Average Score": 77.859,
"Standard Deviation": null,
"Rank": 9
},
"Logical": {
"Average Score": 88.359,
"Standard Deviation": null,
"Rank": 9
},
"Social": {
"Average Score": 67.671,
"Standard Deviation": null,
"Rank": 15
},
"Chemistry": {
"Average Score": 79.61,
"Standard Deviation": null,
"Rank": 11
},
"CPP": {
"Average Score": 70.73143363230263,
"Standard Deviation": null,
"Rank": 11
}
}
},
{
"config": {
"model_name": "gemini-1.5-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 83.17822062,
"Standard Deviation": 4.166312552,
"Rank": 13
},
"Geometry": {
"Average Score": 84.696,
"Standard Deviation": null,
"Rank": 3
},
"Algebra": {
"Average Score": 98.832,
"Standard Deviation": null,
"Rank": 3
},
"Probability": {
"Average Score": 74.233,
"Standard Deviation": null,
"Rank": 11
},
"Logical": {
"Average Score": 77.421,
"Standard Deviation": null,
"Rank": 15
},
"Social": {
"Average Score": 70.057,
"Standard Deviation": null,
"Rank": 14
}
}
},
{
"config": {
"model_name": "qwen2-72b-instruct",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/09"
},
"results": {
"OVERALL": {
"Average Score": 80.78104505,
"Standard Deviation": 2.776695545,
"Rank": 15
},
"Geometry": {
"Average Score": 70.775,
"Standard Deviation": null,
"Rank": 12
},
"Algebra": {
"Average Score": 95.816,
"Standard Deviation": null,
"Rank": 4
},
"Probability": {
"Average Score": 80.38,
"Standard Deviation": null,
"Rank": 6
},
"Logical": {
"Average Score": 71.975,
"Standard Deviation": null,
"Rank": 20
},
"Social": {
"Average Score": 50.407,
"Standard Deviation": null,
"Rank": 20
},
"Chemistry": {
"Average Score": 76.621,
"Standard Deviation": null,
"Rank": 13
},
"CPP": {
"Average Score": 73.54037778797029,
"Standard Deviation": null,
"Rank": 7
}
}
},
{
"config": {
"model_name": "gpt-4o-mini-2024-07-18",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 85.99929202,
"Standard Deviation": 2.479470643,
"Rank": 11
},
"Geometry": {
"Average Score": 79.42,
"Standard Deviation": null,
"Rank": 6
},
"Algebra": {
"Average Score": 89.997,
"Standard Deviation": null,
"Rank": 10
},
"Probability": {
"Average Score": 78.89,
"Standard Deviation": null,
"Rank": 7
},
"Logical": {
"Average Score": 84.755,
"Standard Deviation": null,
"Rank": 11
},
"Social": {
"Average Score": 72.014,
"Standard Deviation": null,
"Rank": 11
},
"Chemistry": {
"Average Score": 76.194,
"Standard Deviation": null,
"Rank": 15
},
"CPP": {
"Average Score": 88.3877070580296,
"Standard Deviation": null,
"Rank": 3
}
}
},
{
"config": {
"model_name": "claude-3.5-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 90.43169444,
"Standard Deviation": 0.123754719,
"Rank": 7
},
"Geometry": {
"Average Score": 74.36,
"Standard Deviation": null,
"Rank": 11
},
"Algebra": {
"Average Score": 83.137,
"Standard Deviation": null,
"Rank": 14
},
"Probability": {
"Average Score": 73.278,
"Standard Deviation": null,
"Rank": 14
},
"Logical": {
"Average Score": 88.581,
"Standard Deviation": null,
"Rank": 8
},
"Social": {
"Average Score": 97.694,
"Standard Deviation": null,
"Rank": 3
},
"Chemistry": {
"Average Score": 86.294,
"Standard Deviation": null,
"Rank": 4
},
"CPP": {
"Average Score": 82.37734076815008,
"Standard Deviation": null,
"Rank": 5
}
}
},
{
"config": {
"model_name": "claude-3.5-sonnet-20241022",
"organization": "Anthropic",
"license": "Proprietary",
            "knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 82.08873036,
"Standard Deviation": 20.89052134,
"Rank": 14
},
"Geometry": {
"Average Score": 74.362,
"Standard Deviation": null,
"Rank": 10
},
"Algebra": {
"Average Score": 89.387,
"Standard Deviation": null,
"Rank": 11
},
"Probability": {
"Average Score": 73.919,
"Standard Deviation": null,
"Rank": 13
},
"Logical": {
"Average Score": 90.514,
"Standard Deviation": null,
"Rank": 7
},
"Social": {
"Average Score": 84.505,
"Standard Deviation": null,
"Rank": 7
},
"Chemistry": {
"Average Score": 85.611,
"Standard Deviation": null,
"Rank": 6
}
}
},
{
"config": {
"model_name": "o1-mini",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 97.53705747,
"Standard Deviation": 0.013240268,
"Rank": 2
},
"Geometry": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Algebra": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Probability": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Logical": {
"Average Score": 96.558,
"Standard Deviation": null,
"Rank": 3
},
"Social": {
"Average Score": 84.884,
"Standard Deviation": null,
"Rank": 6
},
"Chemistry": {
"Average Score": 93.717,
"Standard Deviation": null,
"Rank": 2
}
}
},
{
"config": {
"model_name": "o1-preview",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 93.04608514,
"Standard Deviation": 0.005729293,
"Rank": 5
},
"Geometry": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Algebra": {
"Average Score": 99.212,
"Standard Deviation": null,
"Rank": 2
},
"Probability": {
"Average Score": 94.181,
"Standard Deviation": null,
"Rank": 2
},
"Logical": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Social": {
"Average Score": 96.978,
"Standard Deviation": null,
"Rank": 4
}
}
},
{
"config": {
"model_name": "gemini-1.5-flash-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 64.39324213,
"Standard Deviation": 1.348364198,
"Rank": 20
},
"Geometry": {
"Average Score": 65.135,
"Standard Deviation": null,
"Rank": 14
},
"Algebra": {
"Average Score": 84.28,
"Standard Deviation": null,
"Rank": 13
},
"Probability": {
"Average Score": 67.22,
"Standard Deviation": null,
"Rank": 16
},
"Logical": {
"Average Score": 71.975,
"Standard Deviation": null,
"Rank": 19
},
"Social": {
"Average Score": 60.374,
"Standard Deviation": null,
"Rank": 18
},
"Chemistry": {
"Average Score": 79.569,
"Standard Deviation": null,
"Rank": 12
},
"CPP": {
"Average Score": 72.1127762005651,
"Standard Deviation": null,
"Rank": 10
}
}
},
{
"config": {
"model_name": "gpt4-1106",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 88.25145246,
"Standard Deviation": 0.889714647,
"Rank": 10
},
"Geometry": {
"Average Score": 61.784,
"Standard Deviation": null,
"Rank": 16
},
"Algebra": {
"Average Score": 80.579,
"Standard Deviation": null,
"Rank": 15
},
"Probability": {
"Average Score": 70.693,
"Standard Deviation": null,
"Rank": 15
},
"Logical": {
"Average Score": 75.513,
"Standard Deviation": null,
"Rank": 16
},
"Social": {
"Average Score": 40.498,
"Standard Deviation": null,
"Rank": 26
},
"Chemistry": {
"Average Score": 73.251,
"Standard Deviation": null,
"Rank": 16
},
"CPP": {
"Average Score": 69.11824072252848,
"Standard Deviation": null,
"Rank": 12
}
}
},
{
"config": {
"model_name": "gemma-2-27b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/06"
},
"results": {
"OVERALL": {
"Average Score": 71.08619043,
"Standard Deviation": 41.54124623,
"Rank": 19
},
"Geometry": {
"Average Score": 56.805,
"Standard Deviation": null,
"Rank": 17
},
"Algebra": {
"Average Score": 76.352,
"Standard Deviation": null,
"Rank": 18
},
"Probability": {
"Average Score": 65.472,
"Standard Deviation": null,
"Rank": 18
},
"Logical": {
"Average Score": 71.976,
"Standard Deviation": null,
"Rank": 18
},
"Social": {
"Average Score": 47.308,
"Standard Deviation": null,
"Rank": 22
},
"Chemistry": {
"Average Score": 69.606,
"Standard Deviation": null,
"Rank": 20
},
"CPP": {
"Average Score": 63.28920072143611,
"Standard Deviation": null,
"Rank": 14
}
}
},
{
"config": {
"model_name": "claude-3-opus",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 79.97608403,
"Standard Deviation": 5.382942441,
"Rank": 16
},
"Geometry": {
"Average Score": 56.54,
"Standard Deviation": null,
"Rank": 18
},
"Algebra": {
"Average Score": 75.405,
"Standard Deviation": null,
"Rank": 19
},
"Probability": {
"Average Score": 67.208,
"Standard Deviation": null,
"Rank": 17
},
"Logical": {
"Average Score": 77.458,
"Standard Deviation": null,
"Rank": 13
},
"Social": {
"Average Score": 80.318,
"Standard Deviation": null,
"Rank": 9
},
"Chemistry": {
"Average Score": 79.694,
"Standard Deviation": null,
"Rank": 10
},
"CPP": {
"Average Score": 73.5404403567132,
"Standard Deviation": null,
"Rank": 8
}
}
},
{
"config": {
"model_name": "gemma-2-9b-it-simpo",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/07"
},
"results": {
"OVERALL": {
"Average Score": "N/A",
"Standard Deviation": "N/A",
"Rank": "N/A"
},
"Geometry": {
"Average Score": 51.492,
"Standard Deviation": null,
"Rank": 20
},
"Algebra": {
"Average Score": 70.836,
"Standard Deviation": null,
"Rank": 20
},
"Probability": {
"Average Score": 58.976,
"Standard Deviation": null,
"Rank": 22
},
"Logical": {
"Average Score": 62.887,
"Standard Deviation": null,
"Rank": 24
},
"Social": {
"Average Score": 70.351,
"Standard Deviation": null,
"Rank": 12
},
"Chemistry": {
"Average Score": 85.813,
"Standard Deviation": null,
"Rank": 5
},
"CPP": {
"Average Score": 73.43757596214863,
"Standard Deviation": null,
"Rank": 9
}
}
},
{
"config": {
"model_name": "qwen1.5-72b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 62.1296631,
"Standard Deviation": 10.31242823,
"Rank": 21
},
"Geometry": {
"Average Score": 47.314,
"Standard Deviation": null,
"Rank": 25
},
"Algebra": {
"Average Score": 69.575,
"Standard Deviation": null,
"Rank": 21
},
"Probability": {
"Average Score": 49.066,
"Standard Deviation": null,
"Rank": 27
},
"Logical": {
"Average Score": 36.931,
"Standard Deviation": null,
"Rank": 36
},
"Social": {
"Average Score": 40.498,
"Standard Deviation": null,
"Rank": 27
},
"Chemistry": {
"Average Score": 53.127,
"Standard Deviation": null,
"Rank": 25
},
"CPP": {
"Average Score": 48.69302376665551,
"Standard Deviation": null,
"Rank": 20
}
}
},
{
"config": {
"model_name": "qwen1.5-32b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 46.27600711,
"Standard Deviation": 4.159365923,
"Rank": 30
},
"Geometry": {
"Average Score": 43.846,
"Standard Deviation": null,
"Rank": 27
},
"Algebra": {
"Average Score": 63.321,
"Standard Deviation": null,
"Rank": 24
},
"Probability": {
"Average Score": 48.15,
"Standard Deviation": null,
"Rank": 28
},
"Logical": {
"Average Score": 41.573,
"Standard Deviation": null,
"Rank": 34
},
"Social": {
"Average Score": 38.018,
"Standard Deviation": null,
"Rank": 29
},
"Chemistry": {
"Average Score": 48.041,
"Standard Deviation": null,
"Rank": 28
},
"CPP": {
"Average Score": 45.14284028264288,
"Standard Deviation": null,
"Rank": 24
}
}
},
{
"config": {
"model_name": "google-gemma-2-9b-it",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024/06"
},
"results": {
"OVERALL": {
"Average Score": 59.59324506,
"Standard Deviation": 5.156822857,
"Rank": 23
},
"Geometry": {
"Average Score": 51.184,
"Standard Deviation": null,
"Rank": 21
},
"Algebra": {
"Average Score": 64.38,
"Standard Deviation": null,
"Rank": 22
},
"Probability": {
"Average Score": 63.362,
"Standard Deviation": null,
"Rank": 21
},
"Logical": {
"Average Score": 69.422,
"Standard Deviation": null,
"Rank": 21
},
"Social": {
"Average Score": 76.113,
"Standard Deviation": null,
"Rank": 10
},
"Chemistry": {
"Average Score": 58.379,
"Standard Deviation": null,
"Rank": 22
},
"CPP": {
"Average Score": 54.03167523687635,
"Standard Deviation": null,
"Rank": 17
}
}
},
{
"config": {
"model_name": "yi-1.5-34b-chat",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "2024/05"
},
"results": {
"OVERALL": {
"Average Score": 72.39079733,
"Standard Deviation": 98.90928937,
"Rank": 18
},
"Geometry": {
"Average Score": 52.638,
"Standard Deviation": null,
"Rank": 19
},
"Algebra": {
"Average Score": 64.055,
"Standard Deviation": null,
"Rank": 23
},
"Probability": {
"Average Score": 64.137,
"Standard Deviation": null,
"Rank": 20
},
"Logical": {
"Average Score": 65.671,
"Standard Deviation": null,
"Rank": 22
},
"Social": {
"Average Score": 47.308,
"Standard Deviation": null,
"Rank": 23
},
"Chemistry": {
"Average Score": 57.484,
"Standard Deviation": null,
"Rank": 23
},
"CPP": {
"Average Score": 52.148798061768964,
"Standard Deviation": null,
"Rank": 18
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-70b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 73.27773635,
"Standard Deviation": 5.72723675,
"Rank": 17
},
"Geometry": {
"Average Score": 65.135,
"Standard Deviation": null,
"Rank": 15
},
"Algebra": {
"Average Score": 80.579,
"Standard Deviation": null,
"Rank": 16
},
"Probability": {
"Average Score": 65.472,
"Standard Deviation": null,
"Rank": 19
},
"Logical": {
"Average Score": 72.879,
"Standard Deviation": null,
"Rank": 17
},
"Social": {
"Average Score": 60.374,
"Standard Deviation": null,
"Rank": 17
},
"Chemistry": {
"Average Score": 71.8,
"Standard Deviation": null,
"Rank": 17
},
"CPP": {
"Average Score": 84.36815192532764,
"Standard Deviation": null,
"Rank": 4
}
}
},
{
"config": {
"model_name": "meta-llama-3.1-8b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 52.8664657,
"Standard Deviation": 3.607384863,
"Rank": 27
},
"Geometry": {
"Average Score": 41.384,
"Standard Deviation": null,
"Rank": 29
},
"Algebra": {
"Average Score": 62.508,
"Standard Deviation": null,
"Rank": 25
},
"Probability": {
"Average Score": 51.889,
"Standard Deviation": null,
"Rank": 25
},
"Logical": {
"Average Score": 53.587,
"Standard Deviation": null,
"Rank": 29
},
"Social": {
"Average Score": 34.405,
"Standard Deviation": null,
"Rank": 32
},
"Chemistry": {
"Average Score": 45.032,
"Standard Deviation": null,
"Rank": 32
},
"CPP": {
"Average Score": 44.41846841004584,
"Standard Deviation": null,
"Rank": 26
}
}
},
{
"config": {
"model_name": "gpt3.5-turbo-0125",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2021/09"
},
"results": {
"OVERALL": {
"Average Score": 33.7046204,
"Standard Deviation": 45.16937959,
"Rank": 40
},
"Geometry": {
"Average Score": 50.19,
"Standard Deviation": null,
"Rank": 22
},
"Algebra": {
"Average Score": 60.978,
"Standard Deviation": null,
"Rank": 26
},
"Probability": {
"Average Score": 46.284,
"Standard Deviation": null,
"Rank": 30
},
"Logical": {
"Average Score": 20.595,
"Standard Deviation": null,
"Rank": 47
},
"Social": {
"Average Score": 24.926,
"Standard Deviation": null,
"Rank": 42
},
"Chemistry": {
"Average Score": 42.78,
"Standard Deviation": null,
"Rank": 33
},
"CPP": {
"Average Score": 40.46958736582551,
"Standard Deviation": null,
"Rank": 29
}
}
},
{
"config": {
"model_name": "llama-3-70b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 59.24245274,
"Standard Deviation": 4.878897527,
"Rank": 25
},
"Geometry": {
"Average Score": 45.249,
"Standard Deviation": null,
"Rank": 26
},
"Algebra": {
"Average Score": 60.736,
"Standard Deviation": null,
"Rank": 27
},
"Probability": {
"Average Score": 54.515,
"Standard Deviation": null,
"Rank": 23
},
"Logical": {
"Average Score": 83.08,
"Standard Deviation": null,
"Rank": 12
},
"Social": {
"Average Score": 42.172,
"Standard Deviation": null,
"Rank": 24
},
"Chemistry": {
"Average Score": 71.8,
"Standard Deviation": null,
"Rank": 18
},
"CPP": {
"Average Score": 65.32140697218945,
"Standard Deviation": null,
"Rank": 13
}
}
},
{
"config": {
"model_name": "claude-3-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 61.81320888,
"Standard Deviation": 10.27472205,
"Rank": 22
},
"Geometry": {
"Average Score": 50.185,
"Standard Deviation": null,
"Rank": 23
},
"Algebra": {
"Average Score": 58.739,
"Standard Deviation": null,
"Rank": 28
},
"Probability": {
"Average Score": 54.182,
"Standard Deviation": null,
"Rank": 24
},
"Logical": {
"Average Score": 65.118,
"Standard Deviation": null,
"Rank": 23
},
"Social": {
"Average Score": 55.325,
"Standard Deviation": null,
"Rank": 19
},
"Chemistry": {
"Average Score": 69.778,
"Standard Deviation": null,
"Rank": 19
},
"CPP": {
"Average Score": 61.33538592327427,
"Standard Deviation": null,
"Rank": 15
}
}
},
{
"config": {
"model_name": "qwen1.5-14b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 43.97760317,
"Standard Deviation": 3.740375694,
"Rank": 31
},
"Geometry": {
"Average Score": 35.5,
"Standard Deviation": null,
"Rank": 31
},
"Algebra": {
"Average Score": 57.821,
"Standard Deviation": null,
"Rank": 29
},
"Probability": {
"Average Score": 38.886,
"Standard Deviation": null,
"Rank": 34
},
"Logical": {
"Average Score": 34.775,
"Standard Deviation": null,
"Rank": 39
},
"Social": {
"Average Score": 31.022,
"Standard Deviation": null,
"Rank": 35
},
"Chemistry": {
"Average Score": 40.55,
"Standard Deviation": null,
"Rank": 36
},
"CPP": {
"Average Score": 38.552779976347026,
"Standard Deviation": null,
"Rank": 31
}
}
},
{
"config": {
"model_name": "claude-3-haiku",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 55.60534246,
"Standard Deviation": 15.07600975,
"Rank": 26
},
"Geometry": {
"Average Score": 41.806,
"Standard Deviation": null,
"Rank": 28
},
"Algebra": {
"Average Score": 54.298,
"Standard Deviation": null,
"Rank": 31
},
"Probability": {
"Average Score": 49.344,
"Standard Deviation": null,
"Rank": 26
},
"Logical": {
"Average Score": 61.904,
"Standard Deviation": null,
"Rank": 25
},
"Social": {
"Average Score": 50.407,
"Standard Deviation": null,
"Rank": 21
},
"Chemistry": {
"Average Score": 61.491,
"Standard Deviation": null,
"Rank": 21
},
"CPP": {
"Average Score": 56.40200048817984,
"Standard Deviation": null,
"Rank": 16
}
}
},
{
"config": {
"model_name": "claude-2.1",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 40.35699809,
"Standard Deviation": 2.484317383,
"Rank": 35
},
"Geometry": {
"Average Score": 49.899,
"Standard Deviation": null,
"Rank": 24
},
"Algebra": {
"Average Score": 53.574,
"Standard Deviation": null,
"Rank": 32
},
"Probability": {
"Average Score": 44.011,
"Standard Deviation": null,
"Rank": 32
},
"Logical": {
"Average Score": 59.855,
"Standard Deviation": null,
"Rank": 26
},
"Social": {
"Average Score": 33.888,
"Standard Deviation": null,
"Rank": 33
},
"Chemistry": {
"Average Score": 51.038,
"Standard Deviation": null,
"Rank": 26
},
"CPP": {
"Average Score": 47.23672563994903,
"Standard Deviation": null,
"Rank": 21
}
}
},
{
"config": {
"model_name": "mistral-8x7b-instruct-v0.1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 43.2937322,
"Standard Deviation": 2.659857412,
"Rank": 32
},
"Geometry": {
"Average Score": 32.639,
"Standard Deviation": null,
"Rank": 35
},
"Algebra": {
"Average Score": 48.901,
"Standard Deviation": null,
"Rank": 35
},
"Probability": {
"Average Score": 44.058,
"Standard Deviation": null,
"Rank": 31
},
"Logical": {
"Average Score": 42.194,
"Standard Deviation": null,
"Rank": 32
},
"Social": {
"Average Score": 26.702,
"Standard Deviation": null,
"Rank": 41
},
"Chemistry": {
"Average Score": 47.192,
"Standard Deviation": null,
"Rank": 29
},
"CPP": {
"Average Score": 44.533118241976666,
"Standard Deviation": null,
"Rank": 25
}
}
},
{
"config": {
"model_name": "claude-2.0",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 30.83692551,
"Standard Deviation": 1.816269,
"Rank": 43
},
"Geometry": {
"Average Score": 37.452,
"Standard Deviation": null,
"Rank": 30
},
"Algebra": {
"Average Score": 48.965,
"Standard Deviation": null,
"Rank": 34
},
"Probability": {
"Average Score": 46.284,
"Standard Deviation": null,
"Rank": 29
},
"Logical": {
"Average Score": 55.657,
"Standard Deviation": null,
"Rank": 28
},
"Social": {
"Average Score": 42.117,
"Standard Deviation": null,
"Rank": 25
},
"Chemistry": {
"Average Score": 55.869,
"Standard Deviation": null,
"Rank": 24
},
"CPP": {
"Average Score": 50.773143448036464,
"Standard Deviation": null,
"Rank": 19
}
}
},
{
"config": {
"model_name": "starling-lm-7b-beta",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2024/03"
},
"results": {
"OVERALL": {
"Average Score": 50.05304991,
"Standard Deviation": 3.017802027,
"Rank": 28
},
"Geometry": {
"Average Score": 33.79,
"Standard Deviation": null,
"Rank": 34
},
"Algebra": {
"Average Score": 49.685,
"Standard Deviation": null,
"Rank": 33
},
"Probability": {
"Average Score": 39.677,
"Standard Deviation": null,
"Rank": 33
},
"Logical": {
"Average Score": 47.501,
"Standard Deviation": null,
"Rank": 30
},
"Social": {
"Average Score": 37.7,
"Standard Deviation": null,
"Rank": 30
},
"Chemistry": {
"Average Score": 40.274,
"Standard Deviation": null,
"Rank": 37
},
"CPP": {
"Average Score": 38.27587102395908,
"Standard Deviation": null,
"Rank": 32
}
}
},
{
"config": {
"model_name": "gemini-1.0-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 38.71255653,
"Standard Deviation": 8.592349353,
"Rank": 37
},
"Geometry": {
"Average Score": 34.596,
"Standard Deviation": null,
"Rank": 33
},
"Algebra": {
"Average Score": 48.159,
"Standard Deviation": null,
"Rank": 36
},
"Probability": {
"Average Score": 29.585,
"Standard Deviation": null,
"Rank": 43
},
"Logical": {
"Average Score": 23.882,
"Standard Deviation": null,
"Rank": 45
},
"Social": {
"Average Score": 13.261,
"Standard Deviation": null,
"Rank": 52
},
"Chemistry": {
"Average Score": 46.637,
"Standard Deviation": null,
"Rank": 30
},
"CPP": {
"Average Score": 45.22204471452975,
"Standard Deviation": null,
"Rank": 23
}
}
},
{
"config": {
"model_name": "openchat-3.5-0106",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2024/01"
},
"results": {
"OVERALL": {
"Average Score": 40.85094215,
"Standard Deviation": 6.631820541,
"Rank": 34
},
"Geometry": {
"Average Score": 29.115,
"Standard Deviation": null,
"Rank": 37
},
"Algebra": {
"Average Score": 45.456,
"Standard Deviation": null,
"Rank": 37
},
"Probability": {
"Average Score": 38.408,
"Standard Deviation": null,
"Rank": 35
},
"Logical": {
"Average Score": 41.678,
"Standard Deviation": null,
"Rank": 33
},
"Social": {
"Average Score": 28.236,
"Standard Deviation": null,
"Rank": 40
},
"Chemistry": {
"Average Score": 34.68,
"Standard Deviation": null,
"Rank": 39
},
"CPP": {
"Average Score": 33.70639271807677,
"Standard Deviation": null,
"Rank": 33
}
}
},
{
"config": {
"model_name": "openchat-3.5",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 39.20699952,
"Standard Deviation": 1.576169927,
"Rank": 36
},
"Geometry": {
"Average Score": 30.009,
"Standard Deviation": null,
"Rank": 36
},
"Algebra": {
"Average Score": 42.04,
"Standard Deviation": null,
"Rank": 39
},
"Probability": {
"Average Score": 34.495,
"Standard Deviation": null,
"Rank": 38
},
"Logical": {
"Average Score": 35.828,
"Standard Deviation": null,
"Rank": 37
},
"Social": {
"Average Score": 33.096,
"Standard Deviation": null,
"Rank": 34
},
"Chemistry": {
"Average Score": 36.737,
"Standard Deviation": null,
"Rank": 38
},
"CPP": {
"Average Score": 33.020911255646965,
"Standard Deviation": null,
"Rank": 34
}
}
},
{
"config": {
"model_name": "command-r-(08-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024/08"
},
"results": {
"OVERALL": {
"Average Score": 46.70245901,
"Standard Deviation": 3.665464964,
"Rank": 29
},
"Geometry": {
"Average Score": 35.43,
"Standard Deviation": null,
"Rank": 32
},
"Algebra": {
"Average Score": 41.852,
"Standard Deviation": null,
"Rank": 40
},
"Probability": {
"Average Score": 36.535,
"Standard Deviation": null,
"Rank": 37
},
"Logical": {
"Average Score": 25.941,
"Standard Deviation": null,
"Rank": 42
},
"Social": {
"Average Score": 30.911,
"Standard Deviation": null,
"Rank": 36
},
"Chemistry": {
"Average Score": 41.629,
"Standard Deviation": null,
"Rank": 35
},
"CPP": {
"Average Score": 39.61492485677676,
"Standard Deviation": null,
"Rank": 30
}
}
},
{
"config": {
"model_name": "gemma-1.1-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 32.61912991,
"Standard Deviation": 17.86038512,
"Rank": 41
},
"Geometry": {
"Average Score": 25.149,
"Standard Deviation": null,
"Rank": 41
},
"Algebra": {
"Average Score": 40.456,
"Standard Deviation": null,
"Rank": 41
},
"Probability": {
"Average Score": 29.307,
"Standard Deviation": null,
"Rank": 44
},
"Logical": {
"Average Score": 41.543,
"Standard Deviation": null,
"Rank": 35
},
"Social": {
"Average Score": 21.473,
"Standard Deviation": null,
"Rank": 45
},
"Chemistry": {
"Average Score": 45.033,
"Standard Deviation": null,
"Rank": 31
},
"CPP": {
"Average Score": 42.666504105798204,
"Standard Deviation": null,
"Rank": 27
}
}
},
{
"config": {
"model_name": "llama3-8b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2023/03"
},
"results": {
"OVERALL": {
"Average Score": 37.29361351,
"Standard Deviation": 8.841996174,
"Rank": 39
},
"Geometry": {
"Average Score": 28.496,
"Standard Deviation": null,
"Rank": 39
},
"Algebra": {
"Average Score": 42.117,
"Standard Deviation": null,
"Rank": 38
},
"Probability": {
"Average Score": 33.841,
"Standard Deviation": null,
"Rank": 39
},
"Logical": {
"Average Score": 57.763,
"Standard Deviation": null,
"Rank": 27
},
"Social": {
"Average Score": 35.994,
"Standard Deviation": null,
"Rank": 31
},
"Chemistry": {
"Average Score": 50.023,
"Standard Deviation": null,
"Rank": 27
},
"CPP": {
"Average Score": 45.35392139264795,
"Standard Deviation": null,
"Rank": 22
}
}
},
{
"config": {
"model_name": "gemma-2-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/07"
},
"results": {
"OVERALL": {
"Average Score": 59.3544514,
"Standard Deviation": 14.50864762,
"Rank": 24
},
"Geometry": {
"Average Score": 29.077,
"Standard Deviation": null,
"Rank": 38
},
"Algebra": {
"Average Score": 39.677,
"Standard Deviation": null,
"Rank": 42
},
"Probability": {
"Average Score": 31.561,
"Standard Deviation": null,
"Rank": 41
},
"Logical": {
"Average Score": 43.458,
"Standard Deviation": null,
"Rank": 31
},
"Social": {
"Average Score": 39.343,
"Standard Deviation": null,
"Rank": 28
},
"Chemistry": {
"Average Score": 31.156,
"Standard Deviation": null,
"Rank": 43
},
"CPP": {
"Average Score": 30.53406933106768,
"Standard Deviation": null,
"Rank": 36
}
}
},
{
"config": {
"model_name": "starling-lm-7b-alpha",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2023/11"
},
"results": {
"OVERALL": {
"Average Score": 37.94593338,
"Standard Deviation": 1.40532208,
"Rank": 38
},
"Geometry": {
"Average Score": 25.519,
"Standard Deviation": null,
"Rank": 40
},
"Algebra": {
"Average Score": 38.88,
"Standard Deviation": null,
"Rank": 43
},
"Probability": {
"Average Score": 32.068,
"Standard Deviation": null,
"Rank": 40
},
"Logical": {
"Average Score": 33.804,
"Standard Deviation": null,
"Rank": 40
},
"Social": {
"Average Score": 30.875,
"Standard Deviation": null,
"Rank": 37
},
"Chemistry": {
"Average Score": 31.354,
"Standard Deviation": null,
"Rank": 41
},
"CPP": {
"Average Score": 30.07926487356878,
"Standard Deviation": null,
"Rank": 37
}
}
},
{
"config": {
"model_name": "qwen1.5-4b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 9.779979052,
"Standard Deviation": 0.925129318,
"Rank": 54
},
"Geometry": {
"Average Score": 15.672,
"Standard Deviation": null,
"Rank": 46
},
"Algebra": {
"Average Score": 31.21,
"Standard Deviation": null,
"Rank": 44
},
"Probability": {
"Average Score": 13.853,
"Standard Deviation": null,
"Rank": 49
},
"Logical": {
"Average Score": 13.842,
"Standard Deviation": null,
"Rank": 52
},
"Social": {
"Average Score": 20.21,
"Standard Deviation": null,
"Rank": 46
},
"Chemistry": {
"Average Score": 14.794,
"Standard Deviation": null,
"Rank": 53
},
"CPP": {
"Average Score": 13.21208067122554,
"Standard Deviation": null,
"Rank": 47
}
}
},
{
"config": {
"model_name": "command-r-(04-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024/04"
},
"results": {
"OVERALL": {
"Average Score": 42.49175095,
"Standard Deviation": 5.556047496,
"Rank": 33
},
"Geometry": {
"Average Score": 23.438,
"Standard Deviation": null,
"Rank": 42
},
"Algebra": {
"Average Score": 31.204,
"Standard Deviation": null,
"Rank": 45
},
"Probability": {
"Average Score": 30.726,
"Standard Deviation": null,
"Rank": 42
},
"Logical": {
"Average Score": 35.111,
"Standard Deviation": null,
"Rank": 38
},
"Social": {
"Average Score": 30.623,
"Standard Deviation": null,
"Rank": 38
},
"Chemistry": {
"Average Score": 42.316,
"Standard Deviation": null,
"Rank": 34
},
"CPP": {
"Average Score": 41.346336503003236,
"Standard Deviation": null,
"Rank": 28
}
}
},
{
"config": {
"model_name": "vicuna-33b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/08"
},
"results": {
"OVERALL": {
"Average Score": 27.90851915,
"Standard Deviation": 4.55056913,
"Rank": 44
},
"Geometry": {
"Average Score": 16.634,
"Standard Deviation": null,
"Rank": 45
},
"Algebra": {
"Average Score": 25.075,
"Standard Deviation": null,
"Rank": 46
},
"Probability": {
"Average Score": 20.901,
"Standard Deviation": null,
"Rank": 47
},
"Logical": {
"Average Score": 22.962,
"Standard Deviation": null,
"Rank": 46
},
"Social": {
"Average Score": 28.487,
"Standard Deviation": null,
"Rank": 39
},
"Chemistry": {
"Average Score": 31.156,
"Standard Deviation": null,
"Rank": 42
},
"CPP": {
"Average Score": 28.01838653090379,
"Standard Deviation": null,
"Rank": 38
}
}
},
{
"config": {
"model_name": "gemma-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 25.25380823,
"Standard Deviation": 3.455163419,
"Rank": 46
},
"Geometry": {
"Average Score": 19.626,
"Standard Deviation": null,
"Rank": 43
},
"Algebra": {
"Average Score": 23.272,
"Standard Deviation": null,
"Rank": 48
},
"Probability": {
"Average Score": 16.98,
"Standard Deviation": null,
"Rank": 48
},
"Logical": {
"Average Score": 24.359,
"Standard Deviation": null,
"Rank": 43
},
"Social": {
"Average Score": 23.52,
"Standard Deviation": null,
"Rank": 43
},
"Chemistry": {
"Average Score": 31.139,
"Standard Deviation": null,
"Rank": 44
},
"CPP": {
"Average Score": 28.014658234926813,
"Standard Deviation": null,
"Rank": 39
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-2",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 31.49596208,
"Standard Deviation": 11.79471585,
"Rank": 42
},
"Geometry": {
"Average Score": 16.847,
"Standard Deviation": null,
"Rank": 44
},
"Algebra": {
"Average Score": 23.287,
"Standard Deviation": null,
"Rank": 47
},
"Probability": {
"Average Score": 24.868,
"Standard Deviation": null,
"Rank": 45
},
"Logical": {
"Average Score": 28.755,
"Standard Deviation": null,
"Rank": 41
},
"Social": {
"Average Score": 21.473,
"Standard Deviation": null,
"Rank": 44
},
"Chemistry": {
"Average Score": 31.994,
"Standard Deviation": null,
"Rank": 40
},
"CPP": {
"Average Score": 31.382959631870822,
"Standard Deviation": null,
"Rank": 35
}
}
},
{
"config": {
"model_name": "mistral-7b-instruct-1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023/12"
},
"results": {
"OVERALL": {
"Average Score": 20.53586787,
"Standard Deviation": 2.95650198,
"Rank": 51
},
"Geometry": {
"Average Score": 11.019,
"Standard Deviation": null,
"Rank": 50
},
"Algebra": {
"Average Score": 20.39,
"Standard Deviation": null,
"Rank": 49
},
"Probability": {
"Average Score": 24.279,
"Standard Deviation": null,
"Rank": 46
},
"Logical": {
"Average Score": 16.823,
"Standard Deviation": null,
"Rank": 50
},
"Social": {
"Average Score": 12.369,
"Standard Deviation": null,
"Rank": 53
},
"Chemistry": {
"Average Score": 22.121,
"Standard Deviation": null,
"Rank": 47
},
"CPP": {
"Average Score": 18.929093202755805,
"Standard Deviation": null,
"Rank": 42
}
}
},
{
"config": {
"model_name": "vicuna-13b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 17.42296198,
"Standard Deviation": 4.480901647,
"Rank": 52
},
"Geometry": {
"Average Score": 12.755,
"Standard Deviation": null,
"Rank": 49
},
"Algebra": {
"Average Score": 17.974,
"Standard Deviation": null,
"Rank": 50
},
"Probability": {
"Average Score": 13.004,
"Standard Deviation": null,
"Rank": 50
},
"Logical": {
"Average Score": 16.997,
"Standard Deviation": null,
"Rank": 49
},
"Social": {
"Average Score": 14.314,
"Standard Deviation": null,
"Rank": 51
},
"Chemistry": {
"Average Score": 25.307,
"Standard Deviation": null,
"Rank": 46
},
"CPP": {
"Average Score": 21.840013221590294,
"Standard Deviation": null,
"Rank": 40
}
}
},
{
"config": {
"model_name": "zephyr-7b-beta",
"organization": "HuggingFace",
"license": "MIT",
"knowledge_cutoff": "2023/10"
},
"results": {
"OVERALL": {
"Average Score": 9.323654861,
"Standard Deviation": 0.338544041,
"Rank": 55
},
"Geometry": {
"Average Score": 8.222,
"Standard Deviation": null,
"Rank": 51
},
"Algebra": {
"Average Score": 13.006,
"Standard Deviation": null,
"Rank": 51
},
"Probability": {
"Average Score": 7.573,
"Standard Deviation": null,
"Rank": 55
},
"Logical": {
"Average Score": 7.364,
"Standard Deviation": null,
"Rank": 56
},
"Social": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 58
},
"Chemistry": {
"Average Score": 17.18,
"Standard Deviation": null,
"Rank": 51
},
"CPP": {
"Average Score": 18.92902220864132,
"Standard Deviation": null,
"Rank": 43
}
}
},
{
"config": {
"model_name": "gemma-1.1-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 22.44740296,
"Standard Deviation": 3.95922917,
"Rank": 49
},
"Geometry": {
"Average Score": 12.834,
"Standard Deviation": null,
"Rank": 48
},
"Algebra": {
"Average Score": 12.291,
"Standard Deviation": null,
"Rank": 52
},
"Probability": {
"Average Score": 8.228,
"Standard Deviation": null,
"Rank": 53
},
"Logical": {
"Average Score": 10.822,
"Standard Deviation": null,
"Rank": 54
},
"Social": {
"Average Score": 19.303,
"Standard Deviation": null,
"Rank": 47
},
"Chemistry": {
"Average Score": 19.892,
"Standard Deviation": null,
"Rank": 48
},
"CPP": {
"Average Score": 20.724691953843916,
"Standard Deviation": null,
"Rank": 41
}
}
},
{
"config": {
"model_name": "llama2-7b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 23.53840413,
"Standard Deviation": 4.565404574,
"Rank": 47
},
"Geometry": {
"Average Score": 5.681,
"Standard Deviation": null,
"Rank": 52
},
"Algebra": {
"Average Score": 9.809,
"Standard Deviation": null,
"Rank": 54
},
"Probability": {
"Average Score": 8.089,
"Standard Deviation": null,
"Rank": 54
},
"Logical": {
"Average Score": 20.474,
"Standard Deviation": null,
"Rank": 48
},
"Social": {
"Average Score": 15.968,
"Standard Deviation": null,
"Rank": 48
},
"Chemistry": {
"Average Score": 18.153,
"Standard Deviation": null,
"Rank": 50
},
"CPP": {
"Average Score": 15.730513733660898,
"Standard Deviation": null,
"Rank": 45
}
}
},
{
"config": {
"model_name": "gemma-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024/02"
},
"results": {
"OVERALL": {
"Average Score": 20.86803148,
"Standard Deviation": 4.810898787,
"Rank": 50
},
"Geometry": {
"Average Score": 15.137,
"Standard Deviation": null,
"Rank": 47
},
"Algebra": {
"Average Score": 10.108,
"Standard Deviation": null,
"Rank": 53
},
"Probability": {
"Average Score": 6.688,
"Standard Deviation": null,
"Rank": 56
},
"Logical": {
"Average Score": 5.296,
"Standard Deviation": null,
"Rank": 57
},
"Social": {
"Average Score": 9.63,
"Standard Deviation": null,
"Rank": 56
},
"Chemistry": {
"Average Score": 18.153,
"Standard Deviation": null,
"Rank": 49
},
"CPP": {
"Average Score": 17.2715657115764,
"Standard Deviation": null,
"Rank": 44
}
}
},
{
"config": {
"model_name": "llama2-13b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 23.34503255,
"Standard Deviation": 4.939571996,
"Rank": 48
},
"Geometry": {
"Average Score": 4.017,
"Standard Deviation": null,
"Rank": 54
},
"Algebra": {
"Average Score": 7.201,
"Standard Deviation": null,
"Rank": 55
},
"Probability": {
"Average Score": 11.451,
"Standard Deviation": null,
"Rank": 51
},
"Logical": {
"Average Score": 23.912,
"Standard Deviation": null,
"Rank": 44
},
"Social": {
"Average Score": 15.715,
"Standard Deviation": null,
"Rank": 50
},
"Chemistry": {
"Average Score": 14.773,
"Standard Deviation": null,
"Rank": 54
},
"CPP": {
"Average Score": 13.17258252933903,
"Standard Deviation": null,
"Rank": 48
}
}
},
{
"config": {
"model_name": "vicuna-7b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023/07"
},
"results": {
"OVERALL": {
"Average Score": 16.78668722,
"Standard Deviation": 4.782003459,
"Rank": 53
},
"Geometry": {
"Average Score": 5.299,
"Standard Deviation": null,
"Rank": 53
},
"Algebra": {
"Average Score": 7.014,
"Standard Deviation": null,
"Rank": 56
},
"Probability": {
"Average Score": 8.228,
"Standard Deviation": null,
"Rank": 52
},
"Logical": {
"Average Score": 11.753,
"Standard Deviation": null,
"Rank": 53
},
"Social": {
"Average Score": 11.326,
"Standard Deviation": null,
"Rank": 54
},
"Chemistry": {
"Average Score": 15.092,
"Standard Deviation": null,
"Rank": 52
},
"CPP": {
"Average Score": 14.255194156624162,
"Standard Deviation": null,
"Rank": 46
}
}
},
{
"config": {
"model_name": "koala-13b",
"organization": "UC Berkeley",
"license": "Non-commercial",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 8.747324657,
"Standard Deviation": 0.645177403,
"Rank": 56
},
"Geometry": {
"Average Score": 0.156,
"Standard Deviation": null,
"Rank": 55
},
"Algebra": {
"Average Score": 2.242,
"Standard Deviation": null,
"Rank": 57
},
"Probability": {
"Average Score": 3.323,
"Standard Deviation": null,
"Rank": 57
},
"Logical": {
"Average Score": 8.156,
"Standard Deviation": null,
"Rank": 55
},
"Social": {
"Average Score": 9.649,
"Standard Deviation": null,
"Rank": 55
},
"Chemistry": {
"Average Score": 6.672,
"Standard Deviation": null,
"Rank": 55
},
"CPP": {
"Average Score": 6.36433272373514,
"Standard Deviation": null,
"Rank": 49
}
}
},
{
"config": {
"model_name": "openassistant-pythia-12b",
"organization": "OpenAssistant",
"license": "Non-commercial",
"knowledge_cutoff": "2023/04"
},
"results": {
"OVERALL": {
"Average Score": 0,
"Standard Deviation": 0,
"Rank": 57
},
"Geometry": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 56
},
"Algebra": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 58
},
"Probability": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 58
},
"Logical": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 58
},
"Social": {
"Average Score": 1.637,
"Standard Deviation": null,
"Rank": 57
},
"Chemistry": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 56
},
"CPP": {
"Average Score": 0.0,
"Standard Deviation": null,
"Rank": 50
}
}
},
{
"config": {
"model_name": "nemotron-70b",
"organization": "NVIDIA",
"license": "Unknown",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 100,
"Standard Deviation": 0,
"Rank": 1
},
"Algebra": {
"Average Score": 79.813,
"Standard Deviation": null,
"Rank": 17
},
"Geometry": {
"Average Score": 67.014,
"Standard Deviation": null,
"Rank": 13
},
"Probability": {
"Average Score": 75.535,
"Standard Deviation": null,
"Rank": 10
},
"Logical": {
"Average Score": 92.659,
"Standard Deviation": null,
"Rank": 5
},
"Social": {
"Average Score": 99.677,
"Standard Deviation": null,
"Rank": 2
},
"Chemistry": {
"Average Score": 76.262,
"Standard Deviation": null,
"Rank": 14
}
}
},
{
"config": {
"model_name": "yi-lightning",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 96.84467293,
"Standard Deviation": 0.033152361,
"Rank": 3
},
"Geometry": {
"Average Score": 77.667,
"Standard Deviation": null,
"Rank": 8
},
"Algebra": {
"Average Score": 93.245,
"Standard Deviation": null,
"Rank": 6
},
"Chemistry": {
"Average Score": 100.000,
"Standard Deviation": null,
"Rank": 1
},
"Logical": {
"Average Score": 94.660,
"Standard Deviation": null,
"Rank": 4
},
"Social": {
"Average Score": 83.236,
"Standard Deviation": null,
"Rank": 8
},
"Probability": {
"Average Score": 90.329,
"Standard Deviation": null,
"Rank": 3
}
}
},
{
"config": {
"model_name": "glm-4-plus",
"organization": "Zhipu AI",
"license": "Proprietary",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 92.48932574,
"Standard Deviation": 0.087973142,
"Rank": 6
},
"Geometry": {
"Average Score": 76.965,
"Standard Deviation": null,
"Rank": 9
},
"Algebra": {
"Average Score": 91.701,
"Standard Deviation": null,
"Rank": 7
},
"Chemistry": {
"Average Score": 83.527,
"Standard Deviation": null,
"Rank": 8
},
"Logical": {
"Average Score": 92.348,
"Standard Deviation": null,
"Rank": 6
},
"Social": {
"Average Score": 100.0,
"Standard Deviation": null,
"Rank": 1
},
"Probability": {
"Average Score": 74.233,
"Standard Deviation": null,
"Rank": 12
}
}
},
{
"config": {
"model_name": "llama-3.2-3b-it",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "Unknown"
},
"results": {
"OVERALL": {
"Average Score": 26.58569941,
"Standard Deviation": 4.191042423,
"Rank": 45
},
"Algebra": {
"Average Score": 56.545,
"Standard Deviation": null,
"Rank": 30
},
"Probability": {
"Average Score": 37.496,
"Standard Deviation": null,
"Rank": 36
},
"Logical": {
"Average Score": 15.188,
"Standard Deviation": null,
"Rank": 51
},
"Social": {
"Average Score": 15.924,
"Standard Deviation": null,
"Rank": 49
},
"Chemistry": {
"Average Score": 30.78,
"Standard Deviation": null,
"Rank": 45
}
}
}
]