[ { "config": { "model_name": "ChatGPT-4o-latest (2024-09-03)", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 94.56827761, "Standard Deviation": 0.009435818, "Rank": 4 }, "Geometry": { "Average Score": 82.306, "Standard Deviation": null, "Rank": 5 }, "Algebra": { "Average Score": 91.701, "Standard Deviation": null, "Rank": 8 }, "Probability": { "Average Score": 86.681, "Standard Deviation": null, "Rank": 4 }, "Logical": { "Average Score": 97.425, "Standard Deviation": null, "Rank": 2 }, "Social": { "Average Score": 91.333, "Standard Deviation": null, "Rank": 5 }, "Chemistry": { "Average Score": 90.77, "Standard Deviation": null, "Rank": 3 }, "CPP": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 } } }, { "config": { "model_name": "gpt-4o-2024-08-06", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 83.58608983, "Standard Deviation": 4.528687523, "Rank": 12 }, "Geometry": { "Average Score": 86.632, "Standard Deviation": null, "Rank": 2 }, "Algebra": { "Average Score": 95.242, "Standard Deviation": null, "Rank": 5 }, "Probability": { "Average Score": 78.89, "Standard Deviation": null, "Rank": 8 }, "Logical": { "Average Score": 77.458, "Standard Deviation": null, "Rank": 14 }, "Social": { "Average Score": 70.351, "Standard Deviation": null, "Rank": 13 }, "Chemistry": { "Average Score": 80.088, "Standard Deviation": null, "Rank": 9 }, "CPP": { "Average Score": 92.43090226400756, "Standard Deviation": null, "Rank": 2 } } }, { "config": { "model_name": "gpt-4o-2024-05-13", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 89.45175971, "Standard Deviation": 0.030431012, "Rank": 8 }, "Geometry": { "Average Score": 82.859, "Standard Deviation": null, "Rank": 4 }, "Algebra": { "Average Score": 90.056, "Standard Deviation": null, "Rank": 9 }, "Probability": { "Average Score": 82.051, "Standard Deviation": null, "Rank": 5 }, "Logical": { "Average Score": 86.969, "Standard Deviation": null, "Rank": 10 }, "Social": { "Average Score": 67.017, "Standard Deviation": null, "Rank": 16 }, "Chemistry": { "Average Score": 84.501, "Standard Deviation": null, "Rank": 7 }, "CPP": { "Average Score": 79.1592634699295, "Standard Deviation": null, "Rank": 6 } } }, { "config": { "model_name": "gpt-4-turbo-2024-04-09", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 89.34848344, "Standard Deviation": 0.303734513, "Rank": 9 }, "Geometry": { "Average Score": 79.296, "Standard Deviation": null, "Rank": 7 }, "Algebra": { "Average Score": 84.668, "Standard Deviation": null, "Rank": 12 }, "Probability": { "Average Score": 77.859, "Standard Deviation": null, "Rank": 9 }, "Logical": { "Average Score": 88.359, "Standard Deviation": null, "Rank": 9 }, "Social": { "Average Score": 67.671, "Standard Deviation": null, "Rank": 15 }, "Chemistry": { "Average Score": 79.61, "Standard Deviation": null, "Rank": 11 }, "CPP": { "Average Score": 70.73143363230263, "Standard Deviation": null, "Rank": 11 } } }, { "config": { "model_name": "gemini-1.5-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 83.17822062, "Standard Deviation": 4.166312552, "Rank": 13 }, "Geometry": { "Average Score": 84.696, "Standard Deviation": null, "Rank": 3 }, "Algebra": { "Average Score": 98.832, "Standard Deviation": null, "Rank": 3 }, "Probability": { "Average Score": 74.233, "Standard Deviation": null, "Rank": 11 }, "Logical": { "Average Score": 77.421, "Standard Deviation": null, "Rank": 15 }, "Social": { "Average Score": 70.057, "Standard Deviation": null, "Rank": 14 } } }, { "config": { "model_name": "qwen2-72b-instruct", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/09" }, "results": { "OVERALL": { "Average Score": 80.78104505, "Standard Deviation": 2.776695545, "Rank": 15 }, "Geometry": { "Average Score": 70.775, "Standard Deviation": null, "Rank": 12 }, "Algebra": { "Average Score": 95.816, "Standard Deviation": null, "Rank": 4 }, "Probability": { "Average Score": 80.38, "Standard Deviation": null, "Rank": 6 }, "Logical": { "Average Score": 71.975, "Standard Deviation": null, "Rank": 20 }, "Social": { "Average Score": 50.407, "Standard Deviation": null, "Rank": 20 }, "Chemistry": { "Average Score": 76.621, "Standard Deviation": null, "Rank": 13 }, "CPP": { "Average Score": 73.54037778797029, "Standard Deviation": null, "Rank": 7 } } }, { "config": { "model_name": "gpt-4o-mini-2024-07-18", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 85.99929202, "Standard Deviation": 2.479470643, "Rank": 11 }, "Geometry": { "Average Score": 79.42, "Standard Deviation": null, "Rank": 6 }, "Algebra": { "Average Score": 89.997, "Standard Deviation": null, "Rank": 10 }, "Probability": { "Average Score": 78.89, "Standard Deviation": null, "Rank": 7 }, "Logical": { "Average Score": 84.755, "Standard Deviation": null, "Rank": 11 }, "Social": { "Average Score": 72.014, "Standard Deviation": null, "Rank": 11 }, "Chemistry": { "Average Score": 76.194, "Standard Deviation": null, "Rank": 15 }, "CPP": { "Average Score": 88.3877070580296, "Standard Deviation": null, "Rank": 3 } } }, { "config": { "model_name": "claude-3.5-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 90.43169444, "Standard Deviation": 0.123754719, "Rank": 7 }, "Geometry": { "Average Score": 74.36, "Standard Deviation": null, "Rank": 11 }, "Algebra": { "Average Score": 83.137, "Standard Deviation": null, "Rank": 14 }, "Probability": { "Average Score": 73.278, "Standard Deviation": null, "Rank": 14 }, "Logical": { "Average Score": 88.581, "Standard Deviation": null, "Rank": 8 }, "Social": { "Average Score": 97.694, "Standard Deviation": null, "Rank": 3 }, "Chemistry": { "Average Score": 86.294, "Standard Deviation": null, "Rank": 4 }, "CPP": { "Average Score": 82.37734076815008, "Standard Deviation": null, "Rank": 5 } } }, { "config": { "model_name": "claude-3.5-sonnet-20241022", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "UNKNOW" }, "results": { "OVERALL": { "Average Score": 82.08873036, "Standard Deviation": 20.89052134, "Rank": 14 }, "Geometry": { "Average Score": 74.362, "Standard Deviation": null, "Rank": 10 }, "Algebra": { "Average Score": 89.387, "Standard Deviation": null, "Rank": 11 }, "Probability": { "Average Score": 73.919, "Standard Deviation": null, "Rank": 13 }, "Logical": { "Average Score": 90.514, "Standard Deviation": null, "Rank": 7 }, "Social": { "Average Score": 84.505, "Standard Deviation": null, "Rank": 7 }, "Chemistry": { "Average Score": 85.611, "Standard Deviation": null, "Rank": 6 } } }, { "config": { "model_name": "o1-mini", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 97.53705747, "Standard Deviation": 0.013240268, "Rank": 2 }, "Geometry": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Algebra": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Probability": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Logical": { "Average Score": 96.558, "Standard Deviation": null, "Rank": 3 }, "Social": { "Average Score": 84.884, "Standard Deviation": null, "Rank": 6 }, "Chemistry": { "Average Score": 93.717, "Standard Deviation": null, "Rank": 2 } } }, { "config": { "model_name": "o1-preview", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 93.04608514, "Standard Deviation": 0.005729293, "Rank": 5 }, "Geometry": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Algebra": { "Average Score": 99.212, "Standard Deviation": null, "Rank": 2 }, "Probability": { "Average Score": 94.181, "Standard Deviation": null, "Rank": 2 }, "Logical": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Social": { "Average Score": 96.978, "Standard Deviation": null, "Rank": 4 } } }, { "config": { "model_name": "gemini-1.5-flash-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 64.39324213, "Standard Deviation": 1.348364198, "Rank": 20 }, "Geometry": { "Average Score": 65.135, "Standard Deviation": null, "Rank": 14 }, "Algebra": { "Average Score": 84.28, "Standard Deviation": null, "Rank": 13 }, "Probability": { "Average Score": 67.22, "Standard Deviation": null, "Rank": 16 }, "Logical": { "Average Score": 71.975, "Standard Deviation": null, "Rank": 19 }, "Social": { "Average Score": 60.374, "Standard Deviation": null, "Rank": 18 }, "Chemistry": { "Average Score": 79.569, "Standard Deviation": null, "Rank": 12 }, "CPP": { "Average Score": 72.1127762005651, "Standard Deviation": null, "Rank": 10 } } }, { "config": { "model_name": "gpt4-1106", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 88.25145246, "Standard Deviation": 0.889714647, "Rank": 10 }, "Geometry": { "Average Score": 61.784, "Standard Deviation": null, "Rank": 16 }, "Algebra": { "Average Score": 80.579, "Standard Deviation": null, "Rank": 15 }, "Probability": { "Average Score": 70.693, "Standard Deviation": null, "Rank": 15 }, "Logical": { "Average Score": 75.513, "Standard Deviation": null, "Rank": 16 }, "Social": { "Average Score": 40.498, "Standard Deviation": null, "Rank": 26 }, "Chemistry": { "Average Score": 73.251, "Standard Deviation": null, "Rank": 16 }, "CPP": { "Average Score": 69.11824072252848, "Standard Deviation": null, "Rank": 12 } } }, { "config": { "model_name": "gemma-2-27b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/06" }, "results": { "OVERALL": { "Average Score": 71.08619043, "Standard Deviation": 41.54124623, "Rank": 19 }, "Geometry": { "Average Score": 56.805, "Standard Deviation": null, "Rank": 17 }, "Algebra": { "Average Score": 76.352, "Standard Deviation": null, "Rank": 18 }, "Probability": { "Average Score": 65.472, "Standard Deviation": null, "Rank": 18 }, "Logical": { "Average Score": 71.976, "Standard Deviation": null, "Rank": 18 }, "Social": { "Average Score": 47.308, "Standard Deviation": null, "Rank": 22 }, "Chemistry": { "Average Score": 69.606, "Standard Deviation": null, "Rank": 20 }, "CPP": { "Average Score": 63.28920072143611, "Standard Deviation": null, "Rank": 14 } } }, { "config": { "model_name": "claude-3-opus", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 79.97608403, "Standard Deviation": 5.382942441, "Rank": 16 }, "Geometry": { "Average Score": 56.54, "Standard Deviation": null, "Rank": 18 }, "Algebra": { "Average Score": 75.405, "Standard Deviation": null, "Rank": 19 }, "Probability": { "Average Score": 67.208, "Standard Deviation": null, "Rank": 17 }, "Logical": { "Average Score": 77.458, "Standard Deviation": null, "Rank": 13 }, "Social": { "Average Score": 80.318, "Standard Deviation": null, "Rank": 9 }, "Chemistry": { "Average Score": 79.694, "Standard Deviation": null, "Rank": 10 }, "CPP": { "Average Score": 73.5404403567132, "Standard Deviation": null, "Rank": 8 } } }, { "config": { "model_name": "gemma-2-9b-it-simpo", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" }, "results": { "OVERALL": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Geometry": { "Average Score": 51.492, "Standard Deviation": null, "Rank": 20 }, "Algebra": { "Average Score": 70.836, "Standard Deviation": null, "Rank": 20 }, "Probability": { "Average Score": 58.976, "Standard Deviation": null, "Rank": 22 }, "Logical": { "Average Score": 62.887, "Standard Deviation": null, "Rank": 24 }, "Social": { "Average Score": 70.351, "Standard Deviation": null, "Rank": 12 }, "Chemistry": { "Average Score": 85.813, "Standard Deviation": null, "Rank": 5 }, "CPP": { "Average Score": 73.43757596214863, "Standard Deviation": null, "Rank": 9 } } }, { "config": { "model_name": "qwen1.5-72b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 62.1296631, "Standard Deviation": 10.31242823, "Rank": 21 }, "Geometry": { "Average Score": 47.314, "Standard Deviation": null, "Rank": 25 }, "Algebra": { "Average Score": 69.575, "Standard Deviation": null, "Rank": 21 }, "Probability": { "Average Score": 49.066, "Standard Deviation": null, "Rank": 27 }, "Logical": { "Average Score": 36.931, "Standard Deviation": null, "Rank": 36 }, "Social": { "Average Score": 40.498, "Standard Deviation": null, "Rank": 27 }, "Chemistry": { "Average Score": 53.127, "Standard Deviation": null, "Rank": 25 }, "CPP": { "Average Score": 48.69302376665551, "Standard Deviation": null, "Rank": 20 } } }, { "config": { "model_name": "qwen1.5-32b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 46.27600711, "Standard Deviation": 4.159365923, "Rank": 30 }, "Geometry": { "Average Score": 43.846, "Standard Deviation": null, "Rank": 27 }, "Algebra": { "Average Score": 63.321, "Standard Deviation": null, "Rank": 24 }, "Probability": { "Average Score": 48.15, "Standard Deviation": null, "Rank": 28 }, "Logical": { "Average Score": 41.573, "Standard Deviation": null, "Rank": 34 }, "Social": { "Average Score": 38.018, "Standard Deviation": null, "Rank": 29 }, "Chemistry": { "Average Score": 48.041, "Standard Deviation": null, "Rank": 28 }, "CPP": { "Average Score": 45.14284028264288, "Standard Deviation": null, "Rank": 24 } } }, { "config": { "model_name": "google-gemma-2-9b-it", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2024/06" }, "results": { "OVERALL": { "Average Score": 59.59324506, "Standard Deviation": 5.156822857, "Rank": 23 }, "Geometry": { "Average Score": 51.184, "Standard Deviation": null, "Rank": 21 }, "Algebra": { "Average Score": 64.38, "Standard Deviation": null, "Rank": 22 }, "Probability": { "Average Score": 63.362, "Standard Deviation": null, "Rank": 21 }, "Logical": { "Average Score": 69.422, "Standard Deviation": null, "Rank": 21 }, "Social": { "Average Score": 76.113, "Standard Deviation": null, "Rank": 10 }, "Chemistry": { "Average Score": 58.379, "Standard Deviation": null, "Rank": 22 }, "CPP": { "Average Score": 54.03167523687635, "Standard Deviation": null, "Rank": 17 } } }, { "config": { "model_name": "yi-1.5-34b-chat", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "2024/05" }, "results": { "OVERALL": { "Average Score": 72.39079733, "Standard Deviation": 98.90928937, "Rank": 18 }, "Geometry": { "Average Score": 52.638, "Standard Deviation": null, "Rank": 19 }, "Algebra": { "Average Score": 64.055, "Standard Deviation": null, "Rank": 23 }, "Probability": { "Average Score": 64.137, "Standard Deviation": null, "Rank": 20 }, "Logical": { "Average Score": 65.671, "Standard Deviation": null, "Rank": 22 }, "Social": { "Average Score": 47.308, "Standard Deviation": null, "Rank": 23 }, "Chemistry": { "Average Score": 57.484, "Standard Deviation": null, "Rank": 23 }, "CPP": { "Average Score": 52.148798061768964, "Standard Deviation": null, "Rank": 18 } } }, { "config": { "model_name": "meta-llama-3.1-70b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 73.27773635, "Standard Deviation": 5.72723675, "Rank": 17 }, "Geometry": { "Average Score": 65.135, "Standard Deviation": null, "Rank": 15 }, "Algebra": { "Average Score": 80.579, "Standard Deviation": null, "Rank": 16 }, "Probability": { "Average Score": 65.472, "Standard Deviation": null, "Rank": 19 }, "Logical": { "Average Score": 72.879, "Standard Deviation": null, "Rank": 17 }, "Social": { "Average Score": 60.374, "Standard Deviation": null, "Rank": 17 }, "Chemistry": { "Average Score": 71.8, "Standard Deviation": null, "Rank": 17 }, "CPP": { "Average Score": 84.36815192532764, "Standard Deviation": null, "Rank": 4 } } }, { "config": { "model_name": "meta-llama-3.1-8b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 52.8664657, "Standard Deviation": 3.607384863, "Rank": 27 }, "Geometry": { "Average Score": 41.384, "Standard Deviation": null, "Rank": 29 }, "Algebra": { "Average Score": 62.508, "Standard Deviation": null, "Rank": 25 }, "Probability": { "Average Score": 51.889, "Standard Deviation": null, "Rank": 25 }, "Logical": { "Average Score": 53.587, "Standard Deviation": null, "Rank": 29 }, "Social": { "Average Score": 34.405, "Standard Deviation": null, "Rank": 32 }, "Chemistry": { "Average Score": 45.032, "Standard Deviation": null, "Rank": 32 }, "CPP": { "Average Score": 44.41846841004584, "Standard Deviation": null, "Rank": 26 } } }, { "config": { "model_name": "gpt3.5-turbo-0125", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2021/09" }, "results": { "OVERALL": { "Average Score": 33.7046204, "Standard Deviation": 45.16937959, "Rank": 40 }, "Geometry": { "Average Score": 50.19, "Standard Deviation": null, "Rank": 22 }, "Algebra": { "Average Score": 60.978, "Standard Deviation": null, "Rank": 26 }, "Probability": { "Average Score": 46.284, "Standard Deviation": null, "Rank": 30 }, "Logical": { "Average Score": 20.595, "Standard Deviation": null, "Rank": 47 }, "Social": { "Average Score": 24.926, "Standard Deviation": null, "Rank": 42 }, "Chemistry": { "Average Score": 42.78, "Standard Deviation": null, "Rank": 33 }, "CPP": { "Average Score": 40.46958736582551, "Standard Deviation": null, "Rank": 29 } } }, { "config": { "model_name": "llama-3-70b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 59.24245274, "Standard Deviation": 4.878897527, "Rank": 25 }, "Geometry": { "Average Score": 45.249, "Standard Deviation": null, "Rank": 26 }, "Algebra": { "Average Score": 60.736, "Standard Deviation": null, "Rank": 27 }, "Probability": { "Average Score": 54.515, "Standard Deviation": null, "Rank": 23 }, "Logical": { "Average Score": 83.08, "Standard Deviation": null, "Rank": 12 }, "Social": { "Average Score": 42.172, "Standard Deviation": null, "Rank": 24 }, "Chemistry": { "Average Score": 71.8, "Standard Deviation": null, "Rank": 18 }, "CPP": { "Average Score": 65.32140697218945, "Standard Deviation": null, "Rank": 13 } } }, { "config": { "model_name": "claude-3-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 61.81320888, "Standard Deviation": 10.27472205, "Rank": 22 }, "Geometry": { "Average Score": 50.185, "Standard Deviation": null, "Rank": 23 }, "Algebra": { "Average Score": 58.739, "Standard Deviation": null, "Rank": 28 }, "Probability": { "Average Score": 54.182, "Standard Deviation": null, "Rank": 24 }, "Logical": { "Average Score": 65.118, "Standard Deviation": null, "Rank": 23 }, "Social": { "Average Score": 55.325, "Standard Deviation": null, "Rank": 19 }, "Chemistry": { "Average Score": 69.778, "Standard Deviation": null, "Rank": 19 }, "CPP": { "Average Score": 61.33538592327427, "Standard Deviation": null, "Rank": 15 } } }, { "config": { "model_name": "qwen1.5-14b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 43.97760317, "Standard Deviation": 3.740375694, "Rank": 31 }, "Geometry": { "Average Score": 35.5, "Standard Deviation": null, "Rank": 31 }, "Algebra": { "Average Score": 57.821, "Standard Deviation": null, "Rank": 29 }, "Probability": { "Average Score": 38.886, "Standard Deviation": null, "Rank": 34 }, "Logical": { "Average Score": 34.775, "Standard Deviation": null, "Rank": 39 }, "Social": { "Average Score": 31.022, "Standard Deviation": null, "Rank": 35 }, "Chemistry": { "Average Score": 40.55, "Standard Deviation": null, "Rank": 36 }, "CPP": { "Average Score": 38.552779976347026, "Standard Deviation": null, "Rank": 31 } } }, { "config": { "model_name": "claude-3-haiku", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 55.60534246, "Standard Deviation": 15.07600975, "Rank": 26 }, "Geometry": { "Average Score": 41.806, "Standard Deviation": null, "Rank": 28 }, "Algebra": { "Average Score": 54.298, "Standard Deviation": null, "Rank": 31 }, "Probability": { "Average Score": 49.344, "Standard Deviation": null, "Rank": 26 }, "Logical": { "Average Score": 61.904, "Standard Deviation": null, "Rank": 25 }, "Social": { "Average Score": 50.407, "Standard Deviation": null, "Rank": 21 }, "Chemistry": { "Average Score": 61.491, "Standard Deviation": null, "Rank": 21 }, "CPP": { "Average Score": 56.40200048817984, "Standard Deviation": null, "Rank": 16 } } }, { "config": { "model_name": "claude-2.1", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 40.35699809, "Standard Deviation": 2.484317383, "Rank": 35 }, "Geometry": { "Average Score": 49.899, "Standard Deviation": null, "Rank": 24 }, "Algebra": { "Average Score": 53.574, "Standard Deviation": null, "Rank": 32 }, "Probability": { "Average Score": 44.011, "Standard Deviation": null, "Rank": 32 }, "Logical": { "Average Score": 59.855, "Standard Deviation": null, "Rank": 26 }, "Social": { "Average Score": 33.888, "Standard Deviation": null, "Rank": 33 }, "Chemistry": { "Average Score": 51.038, "Standard Deviation": null, "Rank": 26 }, "CPP": { "Average Score": 47.23672563994903, "Standard Deviation": null, "Rank": 21 } } }, { "config": { "model_name": "mistral-8x7b-instruct-v0.1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 43.2937322, "Standard Deviation": 2.659857412, "Rank": 32 }, "Geometry": { "Average Score": 32.639, "Standard Deviation": null, "Rank": 35 }, "Algebra": { "Average Score": 48.901, "Standard Deviation": null, "Rank": 35 }, "Probability": { "Average Score": 44.058, "Standard Deviation": null, "Rank": 31 }, "Logical": { "Average Score": 42.194, "Standard Deviation": null, "Rank": 32 }, "Social": { "Average Score": 26.702, "Standard Deviation": null, "Rank": 41 }, "Chemistry": { "Average Score": 47.192, "Standard Deviation": null, "Rank": 29 }, "CPP": { "Average Score": 44.533118241976666, "Standard Deviation": null, "Rank": 25 } } }, { "config": { "model_name": "claude-2.0", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 30.83692551, "Standard Deviation": 1.816269, "Rank": 43 }, "Geometry": { "Average Score": 37.452, "Standard Deviation": null, "Rank": 30 }, "Algebra": { "Average Score": 48.965, "Standard Deviation": null, "Rank": 34 }, "Probability": { "Average Score": 46.284, "Standard Deviation": null, "Rank": 29 }, "Logical": { "Average Score": 55.657, "Standard Deviation": null, "Rank": 28 }, "Social": { "Average Score": 42.117, "Standard Deviation": null, "Rank": 25 }, "Chemistry": { "Average Score": 55.869, "Standard Deviation": null, "Rank": 24 }, "CPP": { "Average Score": 50.773143448036464, "Standard Deviation": null, "Rank": 19 } } }, { "config": { "model_name": "starling-lm-7b-beta", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 50.05304991, "Standard Deviation": 3.017802027, "Rank": 28 }, "Geometry": { "Average Score": 33.79, "Standard Deviation": null, "Rank": 34 }, "Algebra": { "Average Score": 49.685, "Standard Deviation": null, "Rank": 33 }, "Probability": { "Average Score": 39.677, "Standard Deviation": null, "Rank": 33 }, "Logical": { "Average Score": 47.501, "Standard Deviation": null, "Rank": 30 }, "Social": { "Average Score": 37.7, "Standard Deviation": null, "Rank": 30 }, "Chemistry": { "Average Score": 40.274, "Standard Deviation": null, "Rank": 37 }, "CPP": { "Average Score": 38.27587102395908, "Standard Deviation": null, "Rank": 32 } } }, { "config": { "model_name": "gemini-1.0-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 38.71255653, "Standard Deviation": 8.592349353, "Rank": 37 }, "Geometry": { "Average Score": 34.596, "Standard Deviation": null, "Rank": 33 }, "Algebra": { "Average Score": 48.159, "Standard Deviation": null, "Rank": 36 }, "Probability": { "Average Score": 29.585, "Standard Deviation": null, "Rank": 43 }, "Logical": { "Average Score": 23.882, "Standard Deviation": null, "Rank": 45 }, "Social": { "Average Score": 13.261, "Standard Deviation": null, "Rank": 52 }, "Chemistry": { "Average Score": 46.637, "Standard Deviation": null, "Rank": 30 }, "CPP": { "Average Score": 45.22204471452975, "Standard Deviation": null, "Rank": 23 } } }, { "config": { "model_name": "openchat-3.5-0106", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2024/01" }, "results": { "OVERALL": { "Average Score": 40.85094215, "Standard Deviation": 6.631820541, "Rank": 34 }, "Geometry": { "Average Score": 29.115, "Standard Deviation": null, "Rank": 37 }, "Algebra": { "Average Score": 45.456, "Standard Deviation": null, "Rank": 37 }, "Probability": { "Average Score": 38.408, "Standard Deviation": null, "Rank": 35 }, "Logical": { "Average Score": 41.678, "Standard Deviation": null, "Rank": 33 }, "Social": { "Average Score": 28.236, "Standard Deviation": null, "Rank": 40 }, "Chemistry": { "Average Score": 34.68, "Standard Deviation": null, "Rank": 39 }, "CPP": { "Average Score": 33.70639271807677, "Standard Deviation": null, "Rank": 33 } } }, { "config": { "model_name": "openchat-3.5", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 39.20699952, "Standard Deviation": 1.576169927, "Rank": 36 }, "Geometry": { "Average Score": 30.009, "Standard Deviation": null, "Rank": 36 }, "Algebra": { "Average Score": 42.04, "Standard Deviation": null, "Rank": 39 }, "Probability": { "Average Score": 34.495, "Standard Deviation": null, "Rank": 38 }, "Logical": { "Average Score": 35.828, "Standard Deviation": null, "Rank": 37 }, "Social": { "Average Score": 33.096, "Standard Deviation": null, "Rank": 34 }, "Chemistry": { "Average Score": 36.737, "Standard Deviation": null, "Rank": 38 }, "CPP": { "Average Score": 33.020911255646965, "Standard Deviation": null, "Rank": 34 } } }, { "config": { "model_name": "command-r-(08-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024/08" }, "results": { "OVERALL": { "Average Score": 46.70245901, "Standard Deviation": 3.665464964, "Rank": 29 }, "Geometry": { "Average Score": 35.43, "Standard Deviation": null, "Rank": 32 }, "Algebra": { "Average Score": 41.852, "Standard Deviation": null, "Rank": 40 }, "Probability": { "Average Score": 36.535, "Standard Deviation": null, "Rank": 37 }, "Logical": { "Average Score": 25.941, "Standard Deviation": null, "Rank": 42 }, "Social": { "Average Score": 30.911, "Standard Deviation": null, "Rank": 36 }, "Chemistry": { "Average Score": 41.629, "Standard Deviation": null, "Rank": 35 }, "CPP": { "Average Score": 39.61492485677676, "Standard Deviation": null, "Rank": 30 } } }, { "config": { "model_name": "gemma-1.1-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 32.61912991, "Standard Deviation": 17.86038512, "Rank": 41 }, "Geometry": { "Average Score": 25.149, "Standard Deviation": null, "Rank": 41 }, "Algebra": { "Average Score": 40.456, "Standard Deviation": null, "Rank": 41 }, "Probability": { "Average Score": 29.307, "Standard Deviation": null, "Rank": 44 }, "Logical": { "Average Score": 41.543, "Standard Deviation": null, "Rank": 35 }, "Social": { "Average Score": 21.473, "Standard Deviation": null, "Rank": 45 }, "Chemistry": { "Average Score": 45.033, "Standard Deviation": null, "Rank": 31 }, "CPP": { "Average Score": 42.666504105798204, "Standard Deviation": null, "Rank": 27 } } }, { "config": { "model_name": "llama3-8b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/03" }, "results": { "OVERALL": { "Average Score": 37.29361351, "Standard Deviation": 8.841996174, "Rank": 39 }, "Geometry": { "Average Score": 28.496, "Standard Deviation": null, "Rank": 39 }, "Algebra": { "Average Score": 42.117, "Standard Deviation": null, "Rank": 38 }, "Probability": { "Average Score": 33.841, "Standard Deviation": null, "Rank": 39 }, "Logical": { "Average Score": 57.763, "Standard Deviation": null, "Rank": 27 }, "Social": { "Average Score": 35.994, "Standard Deviation": null, "Rank": 31 }, "Chemistry": { "Average Score": 50.023, "Standard Deviation": null, "Rank": 27 }, "CPP": { "Average Score": 45.35392139264795, "Standard Deviation": null, "Rank": 22 } } }, { "config": { "model_name": "gemma-2-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" }, "results": { "OVERALL": { "Average Score": 59.3544514, "Standard Deviation": 14.50864762, "Rank": 24 }, "Geometry": { "Average Score": 29.077, "Standard Deviation": null, "Rank": 38 }, "Algebra": { "Average Score": 39.677, "Standard Deviation": null, "Rank": 42 }, "Probability": { "Average Score": 31.561, "Standard Deviation": null, "Rank": 41 }, "Logical": { "Average Score": 43.458, "Standard Deviation": null, "Rank": 31 }, "Social": { "Average Score": 39.343, "Standard Deviation": null, "Rank": 28 }, "Chemistry": { "Average Score": 31.156, "Standard Deviation": null, "Rank": 43 }, "CPP": { "Average Score": 30.53406933106768, "Standard Deviation": null, "Rank": 36 } } }, { "config": { "model_name": "starling-lm-7b-alpha", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 37.94593338, "Standard Deviation": 1.40532208, "Rank": 38 }, "Geometry": { "Average Score": 25.519, "Standard Deviation": null, "Rank": 40 }, "Algebra": { "Average Score": 38.88, "Standard Deviation": null, "Rank": 43 }, "Probability": { "Average Score": 32.068, "Standard Deviation": null, "Rank": 40 }, "Logical": { "Average Score": 33.804, "Standard Deviation": null, "Rank": 40 }, "Social": { "Average Score": 30.875, "Standard Deviation": null, "Rank": 37 }, "Chemistry": { "Average Score": 31.354, "Standard Deviation": null, "Rank": 41 }, "CPP": { "Average Score": 30.07926487356878, "Standard Deviation": null, "Rank": 37 } } }, { "config": { "model_name": "qwen1.5-4b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 9.779979052, "Standard Deviation": 0.925129318, "Rank": 54 }, "Geometry": { "Average Score": 15.672, "Standard Deviation": null, "Rank": 46 }, "Algebra": { "Average Score": 31.21, "Standard Deviation": null, "Rank": 44 }, "Probability": { "Average Score": 13.853, "Standard Deviation": null, "Rank": 49 }, "Logical": { "Average Score": 13.842, "Standard Deviation": null, "Rank": 52 }, "Social": { "Average Score": 20.21, "Standard Deviation": null, "Rank": 46 }, "Chemistry": { "Average Score": 14.794, "Standard Deviation": null, "Rank": 53 }, "CPP": { "Average Score": 13.21208067122554, "Standard Deviation": null, "Rank": 47 } } }, { "config": { "model_name": "command-r-(04-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 42.49175095, "Standard Deviation": 5.556047496, "Rank": 33 }, "Geometry": { "Average Score": 23.438, "Standard Deviation": null, "Rank": 42 }, "Algebra": { "Average Score": 31.204, "Standard Deviation": null, "Rank": 45 }, "Probability": { "Average Score": 30.726, "Standard Deviation": null, "Rank": 42 }, "Logical": { "Average Score": 35.111, "Standard Deviation": null, "Rank": 38 }, "Social": { "Average Score": 30.623, "Standard Deviation": null, "Rank": 38 }, "Chemistry": { "Average Score": 42.316, "Standard Deviation": null, "Rank": 34 }, "CPP": { "Average Score": 41.346336503003236, "Standard Deviation": null, "Rank": 28 } } }, { "config": { "model_name": "vicuna-33b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 27.90851915, "Standard Deviation": 4.55056913, "Rank": 44 }, "Geometry": { "Average Score": 16.634, "Standard Deviation": null, "Rank": 45 }, "Algebra": { "Average Score": 25.075, "Standard Deviation": null, "Rank": 46 }, "Probability": { "Average Score": 20.901, "Standard Deviation": null, "Rank": 47 }, "Logical": { "Average Score": 22.962, "Standard Deviation": null, "Rank": 46 }, "Social": { "Average Score": 28.487, "Standard Deviation": null, "Rank": 39 }, "Chemistry": { "Average Score": 31.156, "Standard Deviation": null, "Rank": 42 }, "CPP": { "Average Score": 28.01838653090379, "Standard Deviation": null, "Rank": 38 } } }, { "config": { "model_name": "gemma-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 25.25380823, "Standard Deviation": 3.455163419, "Rank": 46 }, "Geometry": { "Average Score": 19.626, "Standard Deviation": null, "Rank": 43 }, "Algebra": { "Average Score": 23.272, "Standard Deviation": null, "Rank": 48 }, "Probability": { "Average Score": 16.98, "Standard Deviation": null, "Rank": 48 }, "Logical": { "Average Score": 24.359, "Standard Deviation": null, "Rank": 43 }, "Social": { "Average Score": 23.52, "Standard Deviation": null, "Rank": 43 }, "Chemistry": { "Average Score": 31.139, "Standard Deviation": null, "Rank": 44 }, "CPP": { "Average Score": 28.014658234926813, "Standard Deviation": null, "Rank": 39 } } }, { "config": { "model_name": "mistral-7b-instruct-2", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 31.49596208, "Standard Deviation": 11.79471585, "Rank": 42 }, "Geometry": { "Average Score": 16.847, "Standard Deviation": null, "Rank": 44 }, "Algebra": { "Average Score": 23.287, "Standard Deviation": null, "Rank": 47 }, "Probability": { "Average Score": 24.868, "Standard Deviation": null, "Rank": 45 }, "Logical": { "Average Score": 28.755, "Standard Deviation": null, "Rank": 41 }, "Social": { "Average Score": 21.473, "Standard Deviation": null, "Rank": 44 }, "Chemistry": { "Average Score": 31.994, "Standard Deviation": null, "Rank": 40 }, "CPP": { "Average Score": 31.382959631870822, "Standard Deviation": null, "Rank": 35 } } }, { "config": { "model_name": "mistral-7b-instruct-1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 20.53586787, "Standard Deviation": 2.95650198, "Rank": 51 }, "Geometry": { "Average Score": 11.019, "Standard Deviation": null, "Rank": 50 }, "Algebra": { "Average Score": 20.39, "Standard Deviation": null, "Rank": 49 }, "Probability": { "Average Score": 24.279, "Standard Deviation": null, "Rank": 46 }, "Logical": { "Average Score": 16.823, "Standard Deviation": null, "Rank": 50 }, "Social": { "Average Score": 12.369, "Standard Deviation": null, "Rank": 53 }, "Chemistry": { "Average Score": 22.121, "Standard Deviation": null, "Rank": 47 }, "CPP": { "Average Score": 18.929093202755805, "Standard Deviation": null, "Rank": 42 } } }, { "config": { "model_name": "vicuna-13b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 17.42296198, "Standard Deviation": 4.480901647, "Rank": 52 }, "Geometry": { "Average Score": 12.755, "Standard Deviation": null, "Rank": 49 }, "Algebra": { "Average Score": 17.974, "Standard Deviation": null, "Rank": 50 }, "Probability": { "Average Score": 13.004, "Standard Deviation": null, "Rank": 50 }, "Logical": { "Average Score": 16.997, "Standard Deviation": null, "Rank": 49 }, "Social": { "Average Score": 14.314, "Standard Deviation": null, "Rank": 51 }, "Chemistry": { "Average Score": 25.307, "Standard Deviation": null, "Rank": 46 }, "CPP": { "Average Score": 21.840013221590294, "Standard Deviation": null, "Rank": 40 } } }, { "config": { "model_name": "zephyr-7b-beta", "organization": "HuggingFace", "license": "MIT", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 9.323654861, "Standard Deviation": 0.338544041, "Rank": 55 }, "Geometry": { "Average Score": 8.222, "Standard Deviation": null, "Rank": 51 }, "Algebra": { "Average Score": 13.006, "Standard Deviation": null, "Rank": 51 }, "Probability": { "Average Score": 7.573, "Standard Deviation": null, "Rank": 55 }, "Logical": { "Average Score": 7.364, "Standard Deviation": null, "Rank": 56 }, "Social": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 58 }, "Chemistry": { "Average Score": 17.18, "Standard Deviation": null, "Rank": 51 }, "CPP": { "Average Score": 18.92902220864132, "Standard Deviation": null, "Rank": 43 } } }, { "config": { "model_name": "gemma-1.1-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 22.44740296, "Standard Deviation": 3.95922917, "Rank": 49 }, "Geometry": { "Average Score": 12.834, "Standard Deviation": null, "Rank": 48 }, "Algebra": { "Average Score": 12.291, "Standard Deviation": null, "Rank": 52 }, "Probability": { "Average Score": 8.228, "Standard Deviation": null, "Rank": 53 }, "Logical": { "Average Score": 10.822, "Standard Deviation": null, "Rank": 54 }, "Social": { "Average Score": 19.303, "Standard Deviation": null, "Rank": 47 }, "Chemistry": { "Average Score": 19.892, "Standard Deviation": null, "Rank": 48 }, "CPP": { "Average Score": 20.724691953843916, "Standard Deviation": null, "Rank": 41 } } }, { "config": { "model_name": "llama2-7b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 23.53840413, "Standard Deviation": 4.565404574, "Rank": 47 }, "Geometry": { "Average Score": 5.681, "Standard Deviation": null, "Rank": 52 }, "Algebra": { "Average Score": 9.809, "Standard Deviation": null, "Rank": 54 }, "Probability": { "Average Score": 8.089, "Standard Deviation": null, "Rank": 54 }, "Logical": { "Average Score": 20.474, "Standard Deviation": null, "Rank": 48 }, "Social": { "Average Score": 15.968, "Standard Deviation": null, "Rank": 48 }, "Chemistry": { "Average Score": 18.153, "Standard Deviation": null, "Rank": 50 }, "CPP": { "Average Score": 15.730513733660898, "Standard Deviation": null, "Rank": 45 } } }, { "config": { "model_name": "gemma-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 20.86803148, "Standard Deviation": 4.810898787, "Rank": 50 }, "Geometry": { "Average Score": 15.137, "Standard Deviation": null, "Rank": 47 }, "Algebra": { "Average Score": 10.108, "Standard Deviation": null, "Rank": 53 }, "Probability": { "Average Score": 6.688, "Standard Deviation": null, "Rank": 56 }, "Logical": { "Average Score": 5.296, "Standard Deviation": null, "Rank": 57 }, "Social": { "Average Score": 9.63, "Standard Deviation": null, "Rank": 56 }, "Chemistry": { "Average Score": 18.153, "Standard Deviation": null, "Rank": 49 }, "CPP": { "Average Score": 17.2715657115764, "Standard Deviation": null, "Rank": 44 } } }, { "config": { "model_name": "llama2-13b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 23.34503255, "Standard Deviation": 4.939571996, "Rank": 48 }, "Geometry": { "Average Score": 4.017, "Standard Deviation": null, "Rank": 54 }, "Algebra": { "Average Score": 7.201, "Standard Deviation": null, "Rank": 55 }, "Probability": { "Average Score": 11.451, "Standard Deviation": null, "Rank": 51 }, "Logical": { "Average Score": 23.912, "Standard Deviation": null, "Rank": 44 }, "Social": { "Average Score": 15.715, "Standard Deviation": null, "Rank": 50 }, "Chemistry": { "Average Score": 14.773, "Standard Deviation": null, "Rank": 54 }, "CPP": { "Average Score": 13.17258252933903, "Standard Deviation": null, "Rank": 48 } } }, { "config": { "model_name": "vicuna-7b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 16.78668722, "Standard Deviation": 4.782003459, "Rank": 53 }, "Geometry": { "Average Score": 5.299, "Standard Deviation": null, "Rank": 53 }, "Algebra": { "Average Score": 7.014, "Standard Deviation": null, "Rank": 56 }, "Probability": { "Average Score": 8.228, "Standard Deviation": null, "Rank": 52 }, "Logical": { "Average Score": 11.753, "Standard Deviation": null, "Rank": 53 }, "Social": { "Average Score": 11.326, "Standard Deviation": null, "Rank": 54 }, "Chemistry": { "Average Score": 15.092, "Standard Deviation": null, "Rank": 52 }, "CPP": { "Average Score": 14.255194156624162, "Standard Deviation": null, "Rank": 46 } } }, { "config": { "model_name": "koala-13b", "organization": "UC Berkeley", "license": "Non-commercial", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 8.747324657, "Standard Deviation": 0.645177403, "Rank": 56 }, "Geometry": { "Average Score": 0.156, "Standard Deviation": null, "Rank": 55 }, "Algebra": { "Average Score": 2.242, "Standard Deviation": null, "Rank": 57 }, "Probability": { "Average Score": 3.323, "Standard Deviation": null, "Rank": 57 }, "Logical": { "Average Score": 8.156, "Standard Deviation": null, "Rank": 55 }, "Social": { "Average Score": 9.649, "Standard Deviation": null, "Rank": 55 }, "Chemistry": { "Average Score": 6.672, "Standard Deviation": null, "Rank": 55 }, "CPP": { "Average Score": 6.36433272373514, "Standard Deviation": null, "Rank": 49 } } }, { "config": { "model_name": "openassistant-pythia-12b", "organization": "OpenAssistant", "license": "Non-commercial", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 0, "Standard Deviation": 0, "Rank": 57 }, "Geometry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 56 }, "Algebra": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 58 }, "Probability": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 58 }, "Logical": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 58 }, "Social": { "Average Score": 1.637, "Standard Deviation": null, "Rank": 57 }, "Chemistry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 56 }, "CPP": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 50 } } }, { "config": { "model_name": "nemotron-70b", "organization": "NVIDIA", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 100, "Standard Deviation": 0, "Rank": 1 }, "Algebra": { "Average Score": 79.813, "Standard Deviation": null, "Rank": 17 }, "Geometry": { "Average Score": 67.014, "Standard Deviation": null, "Rank": 13 }, "Probability": { "Average Score": 75.535, "Standard Deviation": null, "Rank": 10 }, "Logical": { "Average Score": 92.659, "Standard Deviation": null, "Rank": 5 }, "Social": { "Average Score": 99.677, "Standard Deviation": null, "Rank": 2 }, "Chemistry": { "Average Score": 76.262, "Standard Deviation": null, "Rank": 14 } } }, { "config": { "model_name": "yi-lightning", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 96.84467293, "Standard Deviation": 0.033152361, "Rank": 3 }, "Geometry": { "Average Score": 77.667, "Standard Deviation": null, "Rank": 8 }, "Algebra": { "Average Score": 93.245, "Standard Deviation": null, "Rank": 6 }, "Chemistry": { "Average Score": 100.000, "Standard Deviation": null, "Rank": 1 }, "Logical": { "Average Score": 94.660, "Standard Deviation": null, "Rank": 4 }, "Social": { "Average Score": 83.236, "Standard Deviation": null, "Rank": 8 }, "Probability": { "Average Score": 90.329, "Standard Deviation": null, "Rank": 3 } } }, { "config": { "model_name": "glm-4-plus", "organization": "Zhipu AI", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 92.48932574, "Standard Deviation": 0.087973142, "Rank": 6 }, "Geometry": { "Average Score": 76.965, "Standard Deviation": null, "Rank": 9 }, "Algebra": { "Average Score": 91.701, "Standard Deviation": null, "Rank": 7 }, "Chemistry": { "Average Score": 83.527, "Standard Deviation": null, "Rank": 8 }, "Logical": { "Average Score": 92.348, "Standard Deviation": null, "Rank": 6 }, "Social": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Probability": { "Average Score": 74.233, "Standard Deviation": null, "Rank": 12 } } }, { "config": { "model_name": "llama-3.2-3b-it", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 26.58569941, "Standard Deviation": 4.191042423, "Rank": 45 }, "Algebra": { "Average Score": 56.545, "Standard Deviation": null, "Rank": 30 }, "Probability": { "Average Score": 37.496, "Standard Deviation": null, "Rank": 36 }, "Logical": { "Average Score": 15.188, "Standard Deviation": null, "Rank": 51 }, "Social": { "Average Score": 15.924, "Standard Deviation": null, "Rank": 49 }, "Chemistry": { "Average Score": 30.78, "Standard Deviation": null, "Rank": 45 } } } ]