|
[ |
|
{"config": { |
|
"model_name": "ChatGPT-4o-latest (2024-09-03)", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 99.19484702, "Avg Rank": 1.666666667, "Min Rank": 1, "Max Rank": 3}, |
|
"math-probability": {"Score": 100, "Avg Rank": 1, "Min Rank": 1, "Max Rank": 1}, |
|
"reasoning-logical": {"Avg Rank": 1, "Min Rank": 1, "Max Rank": 1}, |
|
"overall": {"Avg Rank": 2, "Min Rank": 2, "Max Rank": 2} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gpt-4o-2024-08-06", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 98.38969404, "Avg Rank": 1.666666667, "Min Rank": 1, "Max Rank": 2}, |
|
"math-probability": {"Score": 96.49758454, "Avg Rank": 2.666666667, "Min Rank": 2, "Max Rank": 4}, |
|
"reasoning-logical": {"Avg Rank": 4.333333333, "Min Rank": 3, "Max Rank": 5}, |
|
"overall": {"Avg Rank": 7.33, "Min Rank": 4, "Max Rank": 9} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gpt-4o-2024-05-13", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/10" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 98.15480333, "Avg Rank": 2.666666667, "Min Rank": 2, "Max Rank": 3}, |
|
"math-probability": {"Score": 94.83939431, "Avg Rank": 3.666666667, "Min Rank": 2, "Max Rank": 5}, |
|
"reasoning-logical": {"Avg Rank": 6.333333333, "Min Rank": 3, "Max Rank": 8}, |
|
"overall": {"Avg Rank": 7.67, "Min Rank": 7, "Max Rank": 9} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gpt-4-turbo-2024-04-09", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023/12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 96.03195879, "Avg Rank": 4, "Min Rank": 4, "Max Rank": 4}, |
|
"math-probability": {"Score": 93.59903382, "Avg Rank": 6.666666667, "Min Rank": 6, "Max Rank": 8}, |
|
"reasoning-logical": {"Avg Rank": 4, "Min Rank": 2, "Max Rank": 7}, |
|
"overall": {"Avg Rank": 6, "Min Rank": 5, "Max Rank": 8} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemini-1.5-pro-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 94.7572213, "Avg Rank": 5, "Min Rank": 5, "Max Rank": 5}, |
|
"math-probability": {"Score": 91.42512077, "Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 10}, |
|
"reasoning-logical": {"Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 11}, |
|
"overall": {"Avg Rank": 14, "Min Rank": 13, "Max Rank": 15} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "qwen2-72b-instruct", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024-02" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 93.88818605, "Avg Rank": 6, "Min Rank": 6, "Max Rank": 6}, |
|
"math-probability": {"Score": 91.54326174, "Avg Rank": 4, "Min Rank": 3, "Max Rank": 5}, |
|
"reasoning-logical": {"Avg Rank": 15.66666667, "Min Rank": 15, "Max Rank": 17}, |
|
"overall": {"Avg Rank": 17, "Min Rank": 17, "Max Rank": 17} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gpt-4o-mini-2024-07-18", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-07" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 93.22073596, "Avg Rank": 7, "Min Rank": 7, "Max Rank": 7}, |
|
"math-probability": {"Score": 92.17351456, "Avg Rank": 3.666666667, "Min Rank": 3, "Max Rank": 5}, |
|
"reasoning-logical": {"Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 10}, |
|
"overall": {"Avg Rank": 7, "Min Rank": 5, "Max Rank": 8} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "claude-3.5-sonnet", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-03" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 91.5823805, "Avg Rank": 8.333333333, "Min Rank": 8, "Max Rank": 9}, |
|
"math-probability": {"Score": 91.55011915, "Avg Rank": 8, "Min Rank": 7, "Max Rank": 9}, |
|
"reasoning-logical": {"Avg Rank": 5, "Min Rank": 2, "Max Rank": 7}, |
|
"overall": {"Avg Rank": 5, "Min Rank": 4, "Max Rank": 7} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "o1-mini", |
|
"organization": "01 AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": None, |
|
"math-probability": None, |
|
"reasoning-logical": None, |
|
"overall": {"Avg Rank": 1, "Min Rank": 1, "Max Rank": 1} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "o1-preview", |
|
"organization": "01 AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": None, |
|
"math-probability": None, |
|
"reasoning-logical": None, |
|
"overall": {"Avg Rank": 3, "Min Rank": 3, "Max Rank": 3} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemini-1.5-flash-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-02" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 91.30211121, "Avg Rank": 11, "Min Rank": 11, "Max Rank": 11}, |
|
"math-probability": {"Score": 91.066099, "Avg Rank": 12, "Min Rank": 10, "Max Rank": 13}, |
|
"reasoning-logical": {"Avg Rank": 15.66666667, "Min Rank": 15, "Max Rank": 16}, |
|
"overall": {"Avg Rank": 14, "Min Rank": 13, "Max Rank": 15} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gpt4-1106", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-04" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 91.2227739, "Avg Rank": 12, "Min Rank": 12, "Max Rank": 12}, |
|
"math-probability": {"Score": 91.09550085, "Avg Rank": 11.66666667, "Min Rank": 11, "Max Rank": 12}, |
|
"reasoning-logical": {"Avg Rank": 12, "Min Rank": 12, "Max Rank": 12}, |
|
"overall": {"Avg Rank": 12, "Min Rank": 11, "Max Rank": 12} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemma-2-27b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024-03" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 91.08554346, "Avg Rank": 13.33333333, "Min Rank": 13, "Max Rank": 14}, |
|
"math-probability": {"Score": 91.09516215, "Avg Rank": 14, "Min Rank": 14, "Max Rank": 14}, |
|
"reasoning-logical": {"Avg Rank": 13, "Min Rank": 13, "Max Rank": 13}, |
|
"overall": {"Avg Rank": 13, "Min Rank": 12, "Max Rank": 14} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "claude-3-opus", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 89.75345785, "Avg Rank": 13.66666667, "Min Rank": 13, "Max Rank": 14}, |
|
"math-probability": {"Score": 91.06939607, "Avg Rank": 11.33333333, "Min Rank": 11, "Max Rank": 12}, |
|
"reasoning-logical": {"Avg Rank": 10.66666667, "Min Rank": 10, "Max Rank": 11}, |
|
"overall": {"Avg Rank": 12, "Min Rank": 10, "Max Rank": 15} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemma-2-9b-it-simpo", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2024-02" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 87.66368227, "Avg Rank": 15, "Min Rank": 15, "Max Rank": 15}, |
|
"math-probability": {"Score": 73.64665336, "Avg Rank": 17, "Min Rank": 17, "Max Rank": 17}, |
|
"reasoning-logical": {"Avg Rank": 19, "Min Rank": 19, "Max Rank": 19}, |
|
"overall": {"Avg Rank": 17, "Min Rank": 15, "Max Rank": 19} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "qwen1.5-72b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024-03" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 86.56207015, "Avg Rank": 16, "Min Rank": 16, "Max Rank": 16}, |
|
"math-probability": {"Score": 72.7735874, "Avg Rank": 21, "Min Rank": 20, "Max Rank": 22}, |
|
"reasoning-logical": {"Avg Rank": 29.66666667, "Min Rank": 28, "Max Rank": 31}, |
|
"overall": {"Avg Rank": 23, "Min Rank": 16, "Max Rank": 31} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "qwen1.5-32b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024-03" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 84.59439036, "Avg Rank": 17.33333333, "Min Rank": 17, "Max Rank": 18}, |
|
"math-probability": {"Score": 76.61348265, "Avg Rank": 22.33333333, "Min Rank": 22, "Max Rank": 23}, |
|
"reasoning-logical": {"Avg Rank": 28.66666667, "Min Rank": 27, "Max Rank": 30}, |
|
"overall": {"Avg Rank": 22, "Min Rank": 17, "Max Rank": 30} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "google-gemma-2-9b-it", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 84.18901776, "Avg Rank": 18, "Min Rank": 17, "Max Rank": 19}, |
|
"math-probability": {"Score": 74.46332504, "Avg Rank": 16, "Min Rank": 16, "Max Rank": 16}, |
|
"reasoning-logical": {"Avg Rank": 14, "Min Rank": 14, "Max Rank": 14}, |
|
"overall": {"Avg Rank": 16, "Min Rank": 14, "Max Rank": 19} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "yi-1.5-34b-chat", |
|
"organization": "01 AI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 81.82921677, "Avg Rank": 18.66666667, "Min Rank": 18, "Max Rank": 19}, |
|
"math-probability": {"Score": 77.41945842, "Avg Rank": 15, "Min Rank": 15, "Max Rank": 15}, |
|
"reasoning-logical": {"Avg Rank": 17.33333333, "Min Rank": 17, "Max Rank": 18}, |
|
"overall": {"Avg Rank": 18, "Min Rank": 15, "Max Rank": 19} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "meta-llama-3.1-8b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3.1 Community", |
|
"knowledge_cutoff": "2024-02" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 75.57121963, "Avg Rank": 20.33333333, "Min Rank": 20, "Max Rank": 21}, |
|
"math-probability": {"Score": 75.46243493, "Avg Rank": 20.33333333, "Min Rank": 20, "Max Rank": 21}, |
|
"reasoning-logical": {"Avg Rank": 23.66666667, "Min Rank": 23, "Max Rank": 24}, |
|
"overall": {"Avg Rank": 21, "Min Rank": 20, "Max Rank": 24} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gpt3.5-turbo-0125", |
|
"organization": "OpenAI", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 73.29235048, "Avg Rank": 21.33333333, "Min Rank": 21, "Max Rank": 22}, |
|
"math-probability": {"Score": 66.27452275, "Avg Rank": 24, "Min Rank": 24, "Max Rank": 24}, |
|
"reasoning-logical": {"Avg Rank": 42.66666667, "Min Rank": 42, "Max Rank": 44}, |
|
"overall": {"Avg Rank": 29, "Min Rank": 21, "Max Rank": 44} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "llama-3-70b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3 Community", |
|
"knowledge_cutoff": "2024-03" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 73.75419539, "Avg Rank": 21.33333333, "Min Rank": 20, "Max Rank": 22}, |
|
"math-probability": {"Score": 87.86358478, "Avg Rank": 18.33333333, "Min Rank": 18, "Max Rank": 19}, |
|
"reasoning-logical": {"Avg Rank": 3.333333333, "Min Rank": 2, "Max Rank": 4}, |
|
"overall": {"Avg Rank": 15, "Min Rank": 3, "Max Rank": 22} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "claude-3-sonnet", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-02" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 71.15353833, "Avg Rank": 23, "Min Rank": 23, "Max Rank": 23}, |
|
"math-probability": {"Score": 88.02362801, "Avg Rank": 18.66666667, "Min Rank": 18, "Max Rank": 19}, |
|
"reasoning-logical": {"Avg Rank": 17.33333333, "Min Rank": 16, "Max Rank": 18}, |
|
"overall": {"Avg Rank": 20, "Min Rank": 16, "Max Rank": 23} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "qwen1.5-14b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 69.70470323, "Avg Rank": 24, "Min Rank": 24, "Max Rank": 24}, |
|
"math-probability": {"Score": 66.41420544, "Avg Rank": 28.66666667, "Min Rank": 28, "Max Rank": 29}, |
|
"reasoning-logical": {"Avg Rank": 34, "Min Rank": 34, "Max Rank": 34}, |
|
"overall": {"Avg Rank": 28, "Min Rank": 24, "Max Rank": 34} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "claude-3-haiku", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 68.44060149, "Avg Rank": 25, "Min Rank": 25, "Max Rank": 25}, |
|
"math-probability": {"Score": 76.46075239, "Avg Rank": 22.33333333, "Min Rank": 21, "Max Rank": 23}, |
|
"reasoning-logical": {"Avg Rank": 20, "Min Rank": 20, "Max Rank": 20}, |
|
"overall": {"Avg Rank": 22, "Min Rank": 20, "Max Rank": 25} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "claude-2.1", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 67.59939121, "Avg Rank": 26, "Min Rank": 26, "Max Rank": 26}, |
|
"math-probability": {"Score": 68.89772398, "Avg Rank": 27, "Min Rank": 27, "Max Rank": 27}, |
|
"reasoning-logical": {"Avg Rank": 21, "Min Rank": 21, "Max Rank": 21}, |
|
"overall": {"Avg Rank": 25, "Min Rank": 21, "Max Rank": 27} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "mistral-8x7b-instruct-v0.1", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 64.71364004, "Avg Rank": 27, "Min Rank": 27, "Max Rank": 27}, |
|
"math-probability": {"Score": 67.67468595, "Avg Rank": 26, "Min Rank": 26, "Max Rank": 26}, |
|
"reasoning-logical": {"Avg Rank": 29, "Min Rank": 28, "Max Rank": 30}, |
|
"overall": {"Avg Rank": 27, "Min Rank": 26, "Max Rank": 30} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "claude-2.0", |
|
"organization": "Anthropic", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023-10" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 64.77311289, "Avg Rank": 28, "Min Rank": 28, "Max Rank": 28}, |
|
"math-probability": {"Score": 74.34063069, "Avg Rank": 25, "Min Rank": 25, "Max Rank": 25}, |
|
"reasoning-logical": {"Avg Rank": 23.33333333, "Min Rank": 23, "Max Rank": 24}, |
|
"overall": {"Avg Rank": 25, "Min Rank": 23, "Max Rank": 28} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "starling-lm-7b-beta", |
|
"organization": "Nexusflow", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 64.01222884, "Avg Rank": 29.33333333, "Min Rank": 29, "Max Rank": 30}, |
|
"math-probability": {"Score": 70.42025806, "Avg Rank": 28.33333333, "Min Rank": 28, "Max Rank": 29}, |
|
"reasoning-logical": {"Avg Rank": 25, "Min Rank": 25, "Max Rank": 25}, |
|
"overall": {"Avg Rank": 27, "Min Rank": 25, "Max Rank": 30} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemini-1.0-pro-001", |
|
"organization": "Google", |
|
"license": "Proprietary", |
|
"knowledge_cutoff": "2023-11" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 63.93365247, "Avg Rank": 29.66666667, "Min Rank": 29, "Max Rank": 30}, |
|
"math-probability": {"Score": 62.13077748, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 38}, |
|
"reasoning-logical": {"Avg Rank": 37.33333333, "Min Rank": 36, "Max Rank": 40}, |
|
"overall": {"Avg Rank": 34, "Min Rank": 29, "Max Rank": 40} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "openchat-3.5-0106", |
|
"organization": "OpenChat", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 63.02959506, "Avg Rank": 31, "Min Rank": 31, "Max Rank": 31}, |
|
"math-probability": {"Score": 61.00599665, "Avg Rank": 30, "Min Rank": 30, "Max Rank": 30}, |
|
"reasoning-logical": {"Avg Rank": 27.66666667, "Min Rank": 27, "Max Rank": 29}, |
|
"overall": {"Avg Rank": 29, "Min Rank": 27, "Max Rank": 31} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "openchat-3.5", |
|
"organization": "OpenChat", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 61.45954168, "Avg Rank": 32.33333333, "Min Rank": 32, "Max Rank": 33}, |
|
"math-probability": {"Score": 62.56195929, "Avg Rank": 32, "Min Rank": 32, "Max Rank": 32}, |
|
"reasoning-logical": {"Avg Rank": 32, "Min Rank": 31, "Max Rank": 33}, |
|
"overall": {"Avg Rank": 32, "Min Rank": 31, "Max Rank": 33} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "command-r-(08-2024)", |
|
"organization": "Cohere", |
|
"license": "CC-BY-NC-4.0", |
|
"knowledge_cutoff": "2024-08" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 61.0679475, "Avg Rank": 32.66666667, "Min Rank": 32, "Max Rank": 33}, |
|
"math-probability": {"Score": 66.00833826, "Avg Rank": 31, "Min Rank": 31, "Max Rank": 31}, |
|
"reasoning-logical": {"Avg Rank": 37.66666667, "Min Rank": 37, "Max Rank": 38}, |
|
"overall": {"Avg Rank": 34, "Min Rank": 31, "Max Rank": 38} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemma-1.1-7b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2023-11" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 60.92904194, "Avg Rank": 34.33333333, "Min Rank": 34, "Max Rank": 35}, |
|
"math-probability": {"Score": 62.17574935, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37}, |
|
"reasoning-logical": {"Avg Rank": 30.33333333, "Min Rank": 28, "Max Rank": 32}, |
|
"overall": {"Avg Rank": 34, "Min Rank": 28, "Max Rank": 37} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "llama3-8b-instruct", |
|
"organization": "Meta", |
|
"license": "Llama 3 Community", |
|
"knowledge_cutoff": "2024-01" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 61.06411319, "Avg Rank": 35, "Min Rank": 34, "Max Rank": 36}, |
|
"math-probability": {"Score": 62.13077748, "Avg Rank": 34.66666667, "Min Rank": 34, "Max Rank": 35}, |
|
"reasoning-logical": {"Avg Rank": 22, "Min Rank": 22, "Max Rank": 22}, |
|
"overall": {"Avg Rank": 30, "Min Rank": 22, "Max Rank": 36} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemma-2-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 59.70248014, "Avg Rank": 36, "Min Rank": 35, "Max Rank": 37}, |
|
"math-probability": {"Score": 61.08084527, "Avg Rank": 33.66666667, "Min Rank": 33, "Max Rank": 35}, |
|
"reasoning-logical": {"Avg Rank": 26, "Min Rank": 26, "Max Rank": 26}, |
|
"overall": {"Avg Rank": 32, "Min Rank": 26, "Max Rank": 37} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "starling-lm-7b-alpha", |
|
"organization": "Nexusflow", |
|
"license": "Apache-2.0", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 59.574329, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37}, |
|
"math-probability": {"Score": 64.03683254, "Avg Rank": 33.66666667, "Min Rank": 33, "Max Rank": 34}, |
|
"reasoning-logical": {"Avg Rank": 35, "Min Rank": 35, "Max Rank": 35}, |
|
"overall": {"Avg Rank": 35, "Min Rank": 33, "Max Rank": 37} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "qwen1.5-4b-chat", |
|
"organization": "Alibaba", |
|
"license": "Qianwen LICENSE", |
|
"knowledge_cutoff": "2024-02" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 56.66282914, "Avg Rank": 38.33333333, "Min Rank": 38, "Max Rank": 39}, |
|
"math-probability": {"Score": 57.39032697, "Avg Rank": 43, "Min Rank": 43, "Max Rank": 43}, |
|
"reasoning-logical": {"Avg Rank": 46, "Min Rank": 46, "Max Rank": 46}, |
|
"overall": {"Avg Rank": 42, "Min Rank": 38, "Max Rank": 46} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "command-r-(04-2024)", |
|
"organization": "Cohere", |
|
"license": "CC-BY-NC-4.0", |
|
"knowledge_cutoff": "2024-04" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 56.19063413, "Avg Rank": 38.66666667, "Min Rank": 38, "Max Rank": 39}, |
|
"math-probability": {"Score": 54.37641509, "Avg Rank": 37.66666667, "Min Rank": 37, "Max Rank": 38}, |
|
"reasoning-logical": {"Avg Rank": 32.66666667, "Min Rank": 32, "Max Rank": 33}, |
|
"overall": {"Avg Rank": 36, "Min Rank": 32, "Max Rank": 39} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "vicuna-33b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 54.71037983, "Avg Rank": 40.66666667, "Min Rank": 40, "Max Rank": 42}, |
|
"math-probability": {"Score": 55.02214588, "Avg Rank": 41, "Min Rank": 41, "Max Rank": 41}, |
|
"reasoning-logical": {"Avg Rank": 41, "Min Rank": 41, "Max Rank": 41}, |
|
"overall": {"Avg Rank": 41, "Min Rank": 40, "Max Rank": 42} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemma-7b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 54.35817186, "Avg Rank": 40.66666667, "Min Rank": 40, "Max Rank": 41}, |
|
"math-probability": {"Score": 58.19573446, "Avg Rank": 42, "Min Rank": 42, "Max Rank": 42}, |
|
"reasoning-logical": {"Avg Rank": 39.33333333, "Min Rank": 39, "Max Rank": 40}, |
|
"overall": {"Avg Rank": 41, "Min Rank": 39, "Max Rank": 42} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "mistral-7b-instruct-2", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 54.39240703, "Avg Rank": 41.66666667, "Min Rank": 41, "Max Rank": 42}, |
|
"math-probability": {"Score": 60.35257542, "Avg Rank": 39, "Min Rank": 39, "Max Rank": 39}, |
|
"reasoning-logical": {"Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37}, |
|
"overall": {"Avg Rank": 39, "Min Rank": 36, "Max Rank": 42} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "mistral-7b-instruct-1", |
|
"organization": "Mistral", |
|
"license": "Apache 2.0", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 53.80157944, "Avg Rank": 43, "Min Rank": 43, "Max Rank": 43}, |
|
"math-probability": {"Score": 56.51960666, "Avg Rank": 40, "Min Rank": 40, "Max Rank": 40}, |
|
"reasoning-logical": {"Avg Rank": 45, "Min Rank": 45, "Max Rank": 45}, |
|
"overall": {"Avg Rank": 43, "Min Rank": 40, "Max Rank": 45} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "vicuna-13b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023-11" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 53.5413765, "Avg Rank": 44, "Min Rank": 44, "Max Rank": 44}, |
|
"math-probability": {"Score": 53.53586693, "Avg Rank": 44, "Min Rank": 44, "Max Rank": 44}, |
|
"reasoning-logical": {"Avg Rank": 43.66666667, "Min Rank": 43, "Max Rank": 44}, |
|
"overall": {"Avg Rank": 44, "Min Rank": 43, "Max Rank": 44} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "zephyr-7b-beta", |
|
"organization": "HuggingFace", |
|
"license": "MIT", |
|
"knowledge_cutoff": "2023-10" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 52.23039742, "Avg Rank": 46, "Min Rank": 45, "Max Rank": 48}, |
|
"math-probability": {"Score": 51.67173535, "Avg Rank": 47.33333333, "Min Rank": 47, "Max Rank": 48}, |
|
"reasoning-logical": {"Avg Rank": 50, "Min Rank": 50, "Max Rank": 50}, |
|
"overall": {"Avg Rank": 48, "Min Rank": 45, "Max Rank": 50} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemma-1.1-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 52.22372428, "Avg Rank": 46, "Min Rank": 45, "Max Rank": 47}, |
|
"math-probability": {"Score": 51.74306688, "Avg Rank": 46.33333333, "Min Rank": 46, "Max Rank": 47}, |
|
"reasoning-logical": {"Avg Rank": 48, "Min Rank": 48, "Max Rank": 48}, |
|
"overall": {"Avg Rank": 47, "Min Rank": 45, "Max Rank": 48} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "llama2-7b-chat", |
|
"organization": "Meta", |
|
"license": "Llama 2 Community", |
|
"knowledge_cutoff": "2023-10" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 51.83025857, "Avg Rank": 46.33333333, "Min Rank": 46, "Max Rank": 47}, |
|
"math-probability": {"Score": 51.19585847, "Avg Rank": 47.33333333, "Min Rank": 46, "Max Rank": 48}, |
|
"reasoning-logical": {"Avg Rank": 42.66666667, "Min Rank": 42, "Max Rank": 43}, |
|
"overall": {"Avg Rank": 45, "Min Rank": 42, "Max Rank": 48} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "gemma-2b-it", |
|
"organization": "Google", |
|
"license": "Gemma License", |
|
"knowledge_cutoff": "2023-11" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 51.60281474, "Avg Rank": 47.66666667, "Min Rank": 47, "Max Rank": 48}, |
|
"math-probability": {"Score": 51.52250905, "Avg Rank": 50, "Min Rank": 50, "Max Rank": 50}, |
|
"reasoning-logical": {"Avg Rank": 51, "Min Rank": 51, "Max Rank": 51}, |
|
"overall": {"Avg Rank": 49, "Min Rank": 47, "Max Rank": 51} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "llama2-13b-chat", |
|
"organization": "Meta", |
|
"license": "Llama 2 Community", |
|
"knowledge_cutoff": "2023-12" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 51.21273132, "Avg Rank": 49, "Min Rank": 49, "Max Rank": 49}, |
|
"math-probability": {"Score": 51.72056522, "Avg Rank": 45, "Min Rank": 45, "Max Rank": 45}, |
|
"reasoning-logical": {"Avg Rank": 39, "Min Rank": 38, "Max Rank": 40}, |
|
"overall": {"Avg Rank": 44, "Min Rank": 38, "Max Rank": 49} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "vicuna-7b", |
|
"organization": "LMSYS", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023-11" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 51.31450547, "Avg Rank": 50, "Min Rank": 50, "Max Rank": 50}, |
|
"math-probability": {"Score": 52.72504618, "Avg Rank": 49, "Min Rank": 49, "Max Rank": 49}, |
|
"reasoning-logical": {"Avg Rank": 47, "Min Rank": 47, "Max Rank": 47}, |
|
"overall": {"Avg Rank": 48, "Min Rank": 47, "Max Rank": 50} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "koala-13b", |
|
"organization": "UC Berkeley", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023-10" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 50.19054677, "Avg Rank": 51, "Min Rank": 51, "Max Rank": 51}, |
|
"math-probability": {"Score": 50.741989, "Avg Rank": 51, "Min Rank": 51, "Max Rank": 51}, |
|
"reasoning-logical": {"Avg Rank": 49, "Min Rank": 49, "Max Rank": 49}, |
|
"overall": {"Avg Rank": 50, "Min Rank": 49, "Max Rank": 51} |
|
}}, |
|
|
|
{"config": { |
|
"model_name": "openassistant-pythia-12b", |
|
"organization": "OpenAssistant", |
|
"license": "Non-commercial", |
|
"knowledge_cutoff": "2023-09" |
|
}, |
|
"results": { |
|
"math-algebra": {"Score": 50, "Avg Rank": 52, "Min Rank": 52, "Max Rank": 52}, |
|
"math-probability": {"Score": 50, "Avg Rank": 52, "Min Rank": 52, "Max Rank": 52}, |
|
"reasoning-logical": {"Avg Rank": 52, "Min Rank": 52, "Max Rank": 52}, |
|
"overall": {"Avg Rank": 52, "Min Rank": 52, "Max Rank": 52} |
|
}} |
|
] |
|
|