[
{"config": {
"model_name": "ChatGPT-4o-latest (2024-09-03)",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"math-algebra": {"Score": 99.19484702, "Avg Rank": 1.666666667, "Min Rank": 1, "Max Rank": 3},
"math-probability": {"Score": 100, "Avg Rank": 1, "Min Rank": 1, "Max Rank": 1},
"reasoning-logical": {"Avg Rank": 1, "Min Rank": 1, "Max Rank": 1},
"overall": {"Avg Rank": 2, "Min Rank": 2, "Max Rank": 2}
}},
{"config": {
"model_name": "gpt-4o-2024-08-06",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"math-algebra": {"Score": 98.38969404, "Avg Rank": 1.666666667, "Min Rank": 1, "Max Rank": 2},
"math-probability": {"Score": 96.49758454, "Avg Rank": 2.666666667, "Min Rank": 2, "Max Rank": 4},
"reasoning-logical": {"Avg Rank": 4.333333333, "Min Rank": 3, "Max Rank": 5},
"overall": {"Avg Rank": 7.33, "Min Rank": 4, "Max Rank": 9}
}},
{"config": {
"model_name": "gpt-4o-2024-05-13",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/10"
},
"results": {
"math-algebra": {"Score": 98.15480333, "Avg Rank": 2.666666667, "Min Rank": 2, "Max Rank": 3},
"math-probability": {"Score": 94.83939431, "Avg Rank": 3.666666667, "Min Rank": 2, "Max Rank": 5},
"reasoning-logical": {"Avg Rank": 6.333333333, "Min Rank": 3, "Max Rank": 8},
"overall": {"Avg Rank": 7.67, "Min Rank": 7, "Max Rank": 9}
}},
{"config": {
"model_name": "gpt-4-turbo-2024-04-09",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023/12"
},
"results": {
"math-algebra": {"Score": 96.03195879, "Avg Rank": 4, "Min Rank": 4, "Max Rank": 4},
"math-probability": {"Score": 93.59903382, "Avg Rank": 6.666666667, "Min Rank": 6, "Max Rank": 8},
"reasoning-logical": {"Avg Rank": 4, "Min Rank": 2, "Max Rank": 7},
"overall": {"Avg Rank": 6, "Min Rank": 5, "Max Rank": 8}
}},
{"config": {
"model_name": "gemini-1.5-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 94.7572213, "Avg Rank": 5, "Min Rank": 5, "Max Rank": 5},
"math-probability": {"Score": 91.42512077, "Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 10},
"reasoning-logical": {"Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 11},
"overall": {"Avg Rank": 14, "Min Rank": 13, "Max Rank": 15}
}},
{"config": {
"model_name": "qwen2-72b-instruct",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-02"
},
"results": {
"math-algebra": {"Score": 93.88818605, "Avg Rank": 6, "Min Rank": 6, "Max Rank": 6},
"math-probability": {"Score": 91.54326174, "Avg Rank": 4, "Min Rank": 3, "Max Rank": 5},
"reasoning-logical": {"Avg Rank": 15.66666667, "Min Rank": 15, "Max Rank": 17},
"overall": {"Avg Rank": 17, "Min Rank": 17, "Max Rank": 17}
}},
{"config": {
"model_name": "gpt-4o-mini-2024-07-18",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024-07"
},
"results": {
"math-algebra": {"Score": 93.22073596, "Avg Rank": 7, "Min Rank": 7, "Max Rank": 7},
"math-probability": {"Score": 92.17351456, "Avg Rank": 3.666666667, "Min Rank": 3, "Max Rank": 5},
"reasoning-logical": {"Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 10},
"overall": {"Avg Rank": 7, "Min Rank": 5, "Max Rank": 8}
}},
{"config": {
"model_name": "claude-3.5-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024-03"
},
"results": {
"math-algebra": {"Score": 91.5823805, "Avg Rank": 8.333333333, "Min Rank": 8, "Max Rank": 9},
"math-probability": {"Score": 91.55011915, "Avg Rank": 8, "Min Rank": 7, "Max Rank": 9},
"reasoning-logical": {"Avg Rank": 5, "Min Rank": 2, "Max Rank": 7},
"overall": {"Avg Rank": 5, "Min Rank": 4, "Max Rank": 7}
}},
{"config": {
"model_name": "o1-mini",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": None,
"math-probability": None,
"reasoning-logical": None,
"overall": {"Avg Rank": 1, "Min Rank": 1, "Max Rank": 1}
}},
{"config": {
"model_name": "o1-preview",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": None,
"math-probability": None,
"reasoning-logical": None,
"overall": {"Avg Rank": 3, "Min Rank": 3, "Max Rank": 3}
}},
{"config": {
"model_name": "gemini-1.5-flash-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024-02"
},
"results": {
"math-algebra": {"Score": 91.30211121, "Avg Rank": 11, "Min Rank": 11, "Max Rank": 11},
"math-probability": {"Score": 91.066099, "Avg Rank": 12, "Min Rank": 10, "Max Rank": 13},
"reasoning-logical": {"Avg Rank": 15.66666667, "Min Rank": 15, "Max Rank": 16},
"overall": {"Avg Rank": 14, "Min Rank": 13, "Max Rank": 15}
}},
{"config": {
"model_name": "gpt4-1106",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2024-04"
},
"results": {
"math-algebra": {"Score": 91.2227739, "Avg Rank": 12, "Min Rank": 12, "Max Rank": 12},
"math-probability": {"Score": 91.09550085, "Avg Rank": 11.66666667, "Min Rank": 11, "Max Rank": 12},
"reasoning-logical": {"Avg Rank": 12, "Min Rank": 12, "Max Rank": 12},
"overall": {"Avg Rank": 12, "Min Rank": 11, "Max Rank": 12}
}},
{"config": {
"model_name": "gemma-2-27b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024-03"
},
"results": {
"math-algebra": {"Score": 91.08554346, "Avg Rank": 13.33333333, "Min Rank": 13, "Max Rank": 14},
"math-probability": {"Score": 91.09516215, "Avg Rank": 14, "Min Rank": 14, "Max Rank": 14},
"reasoning-logical": {"Avg Rank": 13, "Min Rank": 13, "Max Rank": 13},
"overall": {"Avg Rank": 13, "Min Rank": 12, "Max Rank": 14}
}},
{"config": {
"model_name": "claude-3-opus",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 89.75345785, "Avg Rank": 13.66666667, "Min Rank": 13, "Max Rank": 14},
"math-probability": {"Score": 91.06939607, "Avg Rank": 11.33333333, "Min Rank": 11, "Max Rank": 12},
"reasoning-logical": {"Avg Rank": 10.66666667, "Min Rank": 10, "Max Rank": 11},
"overall": {"Avg Rank": 12, "Min Rank": 10, "Max Rank": 15}
}},
{"config": {
"model_name": "gemma-2-9b-it-simpo",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2024-02"
},
"results": {
"math-algebra": {"Score": 87.66368227, "Avg Rank": 15, "Min Rank": 15, "Max Rank": 15},
"math-probability": {"Score": 73.64665336, "Avg Rank": 17, "Min Rank": 17, "Max Rank": 17},
"reasoning-logical": {"Avg Rank": 19, "Min Rank": 19, "Max Rank": 19},
"overall": {"Avg Rank": 17, "Min Rank": 15, "Max Rank": 19}
}},
{"config": {
"model_name": "qwen1.5-72b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-03"
},
"results": {
"math-algebra": {"Score": 86.56207015, "Avg Rank": 16, "Min Rank": 16, "Max Rank": 16},
"math-probability": {"Score": 72.7735874, "Avg Rank": 21, "Min Rank": 20, "Max Rank": 22},
"reasoning-logical": {"Avg Rank": 29.66666667, "Min Rank": 28, "Max Rank": 31},
"overall": {"Avg Rank": 23, "Min Rank": 16, "Max Rank": 31}
}},
{"config": {
"model_name": "qwen1.5-32b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-03"
},
"results": {
"math-algebra": {"Score": 84.59439036, "Avg Rank": 17.33333333, "Min Rank": 17, "Max Rank": 18},
"math-probability": {"Score": 76.61348265, "Avg Rank": 22.33333333, "Min Rank": 22, "Max Rank": 23},
"reasoning-logical": {"Avg Rank": 28.66666667, "Min Rank": 27, "Max Rank": 30},
"overall": {"Avg Rank": 22, "Min Rank": 17, "Max Rank": 30}
}},
{"config": {
"model_name": "google-gemma-2-9b-it",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 84.18901776, "Avg Rank": 18, "Min Rank": 17, "Max Rank": 19},
"math-probability": {"Score": 74.46332504, "Avg Rank": 16, "Min Rank": 16, "Max Rank": 16},
"reasoning-logical": {"Avg Rank": 14, "Min Rank": 14, "Max Rank": 14},
"overall": {"Avg Rank": 16, "Min Rank": 14, "Max Rank": 19}
}},
{"config": {
"model_name": "yi-1.5-34b-chat",
"organization": "01 AI",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 81.82921677, "Avg Rank": 18.66666667, "Min Rank": 18, "Max Rank": 19},
"math-probability": {"Score": 77.41945842, "Avg Rank": 15, "Min Rank": 15, "Max Rank": 15},
"reasoning-logical": {"Avg Rank": 17.33333333, "Min Rank": 17, "Max Rank": 18},
"overall": {"Avg Rank": 18, "Min Rank": 15, "Max Rank": 19}
}},
{"config": {
"model_name": "meta-llama-3.1-8b-instruct",
"organization": "Meta",
"license": "Llama 3.1 Community",
"knowledge_cutoff": "2024-02"
},
"results": {
"math-algebra": {"Score": 75.57121963, "Avg Rank": 20.33333333, "Min Rank": 20, "Max Rank": 21},
"math-probability": {"Score": 75.46243493, "Avg Rank": 20.33333333, "Min Rank": 20, "Max Rank": 21},
"reasoning-logical": {"Avg Rank": 23.66666667, "Min Rank": 23, "Max Rank": 24},
"overall": {"Avg Rank": 21, "Min Rank": 20, "Max Rank": 24}
}},
{"config": {
"model_name": "gpt3.5-turbo-0125",
"organization": "OpenAI",
"license": "Proprietary",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 73.29235048, "Avg Rank": 21.33333333, "Min Rank": 21, "Max Rank": 22},
"math-probability": {"Score": 66.27452275, "Avg Rank": 24, "Min Rank": 24, "Max Rank": 24},
"reasoning-logical": {"Avg Rank": 42.66666667, "Min Rank": 42, "Max Rank": 44},
"overall": {"Avg Rank": 29, "Min Rank": 21, "Max Rank": 44}
}},
{"config": {
"model_name": "llama-3-70b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2024-03"
},
"results": {
"math-algebra": {"Score": 73.75419539, "Avg Rank": 21.33333333, "Min Rank": 20, "Max Rank": 22},
"math-probability": {"Score": 87.86358478, "Avg Rank": 18.33333333, "Min Rank": 18, "Max Rank": 19},
"reasoning-logical": {"Avg Rank": 3.333333333, "Min Rank": 2, "Max Rank": 4},
"overall": {"Avg Rank": 15, "Min Rank": 3, "Max Rank": 22}
}},
{"config": {
"model_name": "claude-3-sonnet",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024-02"
},
"results": {
"math-algebra": {"Score": 71.15353833, "Avg Rank": 23, "Min Rank": 23, "Max Rank": 23},
"math-probability": {"Score": 88.02362801, "Avg Rank": 18.66666667, "Min Rank": 18, "Max Rank": 19},
"reasoning-logical": {"Avg Rank": 17.33333333, "Min Rank": 16, "Max Rank": 18},
"overall": {"Avg Rank": 20, "Min Rank": 16, "Max Rank": 23}
}},
{"config": {
"model_name": "qwen1.5-14b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 69.70470323, "Avg Rank": 24, "Min Rank": 24, "Max Rank": 24},
"math-probability": {"Score": 66.41420544, "Avg Rank": 28.66666667, "Min Rank": 28, "Max Rank": 29},
"reasoning-logical": {"Avg Rank": 34, "Min Rank": 34, "Max Rank": 34},
"overall": {"Avg Rank": 28, "Min Rank": 24, "Max Rank": 34}
}},
{"config": {
"model_name": "claude-3-haiku",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 68.44060149, "Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
"math-probability": {"Score": 76.46075239, "Avg Rank": 22.33333333, "Min Rank": 21, "Max Rank": 23},
"reasoning-logical": {"Avg Rank": 20, "Min Rank": 20, "Max Rank": 20},
"overall": {"Avg Rank": 22, "Min Rank": 20, "Max Rank": 25}
}},
{"config": {
"model_name": "claude-2.1",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 67.59939121, "Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
"math-probability": {"Score": 68.89772398, "Avg Rank": 27, "Min Rank": 27, "Max Rank": 27},
"reasoning-logical": {"Avg Rank": 21, "Min Rank": 21, "Max Rank": 21},
"overall": {"Avg Rank": 25, "Min Rank": 21, "Max Rank": 27}
}},
{"config": {
"model_name": "mistral-8x7b-instruct-v0.1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 64.71364004, "Avg Rank": 27, "Min Rank": 27, "Max Rank": 27},
"math-probability": {"Score": 67.67468595, "Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
"reasoning-logical": {"Avg Rank": 29, "Min Rank": 28, "Max Rank": 30},
"overall": {"Avg Rank": 27, "Min Rank": 26, "Max Rank": 30}
}},
{"config": {
"model_name": "claude-2.0",
"organization": "Anthropic",
"license": "Proprietary",
"knowledge_cutoff": "2023-10"
},
"results": {
"math-algebra": {"Score": 64.77311289, "Avg Rank": 28, "Min Rank": 28, "Max Rank": 28},
"math-probability": {"Score": 74.34063069, "Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
"reasoning-logical": {"Avg Rank": 23.33333333, "Min Rank": 23, "Max Rank": 24},
"overall": {"Avg Rank": 25, "Min Rank": 23, "Max Rank": 28}
}},
{"config": {
"model_name": "starling-lm-7b-beta",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 64.01222884, "Avg Rank": 29.33333333, "Min Rank": 29, "Max Rank": 30},
"math-probability": {"Score": 70.42025806, "Avg Rank": 28.33333333, "Min Rank": 28, "Max Rank": 29},
"reasoning-logical": {"Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
"overall": {"Avg Rank": 27, "Min Rank": 25, "Max Rank": 30}
}},
{"config": {
"model_name": "gemini-1.0-pro-001",
"organization": "Google",
"license": "Proprietary",
"knowledge_cutoff": "2023-11"
},
"results": {
"math-algebra": {"Score": 63.93365247, "Avg Rank": 29.66666667, "Min Rank": 29, "Max Rank": 30},
"math-probability": {"Score": 62.13077748, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 38},
"reasoning-logical": {"Avg Rank": 37.33333333, "Min Rank": 36, "Max Rank": 40},
"overall": {"Avg Rank": 34, "Min Rank": 29, "Max Rank": 40}
}},
{"config": {
"model_name": "openchat-3.5-0106",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 63.02959506, "Avg Rank": 31, "Min Rank": 31, "Max Rank": 31},
"math-probability": {"Score": 61.00599665, "Avg Rank": 30, "Min Rank": 30, "Max Rank": 30},
"reasoning-logical": {"Avg Rank": 27.66666667, "Min Rank": 27, "Max Rank": 29},
"overall": {"Avg Rank": 29, "Min Rank": 27, "Max Rank": 31}
}},
{"config": {
"model_name": "openchat-3.5",
"organization": "OpenChat",
"license": "Apache-2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 61.45954168, "Avg Rank": 32.33333333, "Min Rank": 32, "Max Rank": 33},
"math-probability": {"Score": 62.56195929, "Avg Rank": 32, "Min Rank": 32, "Max Rank": 32},
"reasoning-logical": {"Avg Rank": 32, "Min Rank": 31, "Max Rank": 33},
"overall": {"Avg Rank": 32, "Min Rank": 31, "Max Rank": 33}
}},
{"config": {
"model_name": "command-r-(08-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024-08"
},
"results": {
"math-algebra": {"Score": 61.0679475, "Avg Rank": 32.66666667, "Min Rank": 32, "Max Rank": 33},
"math-probability": {"Score": 66.00833826, "Avg Rank": 31, "Min Rank": 31, "Max Rank": 31},
"reasoning-logical": {"Avg Rank": 37.66666667, "Min Rank": 37, "Max Rank": 38},
"overall": {"Avg Rank": 34, "Min Rank": 31, "Max Rank": 38}
}},
{"config": {
"model_name": "gemma-1.1-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-11"
},
"results": {
"math-algebra": {"Score": 60.92904194, "Avg Rank": 34.33333333, "Min Rank": 34, "Max Rank": 35},
"math-probability": {"Score": 62.17574935, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
"reasoning-logical": {"Avg Rank": 30.33333333, "Min Rank": 28, "Max Rank": 32},
"overall": {"Avg Rank": 34, "Min Rank": 28, "Max Rank": 37}
}},
{"config": {
"model_name": "llama3-8b-instruct",
"organization": "Meta",
"license": "Llama 3 Community",
"knowledge_cutoff": "2024-01"
},
"results": {
"math-algebra": {"Score": 61.06411319, "Avg Rank": 35, "Min Rank": 34, "Max Rank": 36},
"math-probability": {"Score": 62.13077748, "Avg Rank": 34.66666667, "Min Rank": 34, "Max Rank": 35},
"reasoning-logical": {"Avg Rank": 22, "Min Rank": 22, "Max Rank": 22},
"overall": {"Avg Rank": 30, "Min Rank": 22, "Max Rank": 36}
}},
{"config": {
"model_name": "gemma-2-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 59.70248014, "Avg Rank": 36, "Min Rank": 35, "Max Rank": 37},
"math-probability": {"Score": 61.08084527, "Avg Rank": 33.66666667, "Min Rank": 33, "Max Rank": 35},
"reasoning-logical": {"Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
"overall": {"Avg Rank": 32, "Min Rank": 26, "Max Rank": 37}
}},
{"config": {
"model_name": "starling-lm-7b-alpha",
"organization": "Nexusflow",
"license": "Apache-2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 59.574329, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
"math-probability": {"Score": 64.03683254, "Avg Rank": 33.66666667, "Min Rank": 33, "Max Rank": 34},
"reasoning-logical": {"Avg Rank": 35, "Min Rank": 35, "Max Rank": 35},
"overall": {"Avg Rank": 35, "Min Rank": 33, "Max Rank": 37}
}},
{"config": {
"model_name": "qwen1.5-4b-chat",
"organization": "Alibaba",
"license": "Qianwen LICENSE",
"knowledge_cutoff": "2024-02"
},
"results": {
"math-algebra": {"Score": 56.66282914, "Avg Rank": 38.33333333, "Min Rank": 38, "Max Rank": 39},
"math-probability": {"Score": 57.39032697, "Avg Rank": 43, "Min Rank": 43, "Max Rank": 43},
"reasoning-logical": {"Avg Rank": 46, "Min Rank": 46, "Max Rank": 46},
"overall": {"Avg Rank": 42, "Min Rank": 38, "Max Rank": 46}
}},
{"config": {
"model_name": "command-r-(04-2024)",
"organization": "Cohere",
"license": "CC-BY-NC-4.0",
"knowledge_cutoff": "2024-04"
},
"results": {
"math-algebra": {"Score": 56.19063413, "Avg Rank": 38.66666667, "Min Rank": 38, "Max Rank": 39},
"math-probability": {"Score": 54.37641509, "Avg Rank": 37.66666667, "Min Rank": 37, "Max Rank": 38},
"reasoning-logical": {"Avg Rank": 32.66666667, "Min Rank": 32, "Max Rank": 33},
"overall": {"Avg Rank": 36, "Min Rank": 32, "Max Rank": 39}
}},
{"config": {
"model_name": "vicuna-33b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 54.71037983, "Avg Rank": 40.66666667, "Min Rank": 40, "Max Rank": 42},
"math-probability": {"Score": 55.02214588, "Avg Rank": 41, "Min Rank": 41, "Max Rank": 41},
"reasoning-logical": {"Avg Rank": 41, "Min Rank": 41, "Max Rank": 41},
"overall": {"Avg Rank": 41, "Min Rank": 40, "Max Rank": 42}
}},
{"config": {
"model_name": "gemma-7b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 54.35817186, "Avg Rank": 40.66666667, "Min Rank": 40, "Max Rank": 41},
"math-probability": {"Score": 58.19573446, "Avg Rank": 42, "Min Rank": 42, "Max Rank": 42},
"reasoning-logical": {"Avg Rank": 39.33333333, "Min Rank": 39, "Max Rank": 40},
"overall": {"Avg Rank": 41, "Min Rank": 39, "Max Rank": 42}
}},
{"config": {
"model_name": "mistral-7b-instruct-2",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 54.39240703, "Avg Rank": 41.66666667, "Min Rank": 41, "Max Rank": 42},
"math-probability": {"Score": 60.35257542, "Avg Rank": 39, "Min Rank": 39, "Max Rank": 39},
"reasoning-logical": {"Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
"overall": {"Avg Rank": 39, "Min Rank": 36, "Max Rank": 42}
}},
{"config": {
"model_name": "mistral-7b-instruct-1",
"organization": "Mistral",
"license": "Apache 2.0",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 53.80157944, "Avg Rank": 43, "Min Rank": 43, "Max Rank": 43},
"math-probability": {"Score": 56.51960666, "Avg Rank": 40, "Min Rank": 40, "Max Rank": 40},
"reasoning-logical": {"Avg Rank": 45, "Min Rank": 45, "Max Rank": 45},
"overall": {"Avg Rank": 43, "Min Rank": 40, "Max Rank": 45}
}},
{"config": {
"model_name": "vicuna-13b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023-11"
},
"results": {
"math-algebra": {"Score": 53.5413765, "Avg Rank": 44, "Min Rank": 44, "Max Rank": 44},
"math-probability": {"Score": 53.53586693, "Avg Rank": 44, "Min Rank": 44, "Max Rank": 44},
"reasoning-logical": {"Avg Rank": 43.66666667, "Min Rank": 43, "Max Rank": 44},
"overall": {"Avg Rank": 44, "Min Rank": 43, "Max Rank": 44}
}},
{"config": {
"model_name": "zephyr-7b-beta",
"organization": "HuggingFace",
"license": "MIT",
"knowledge_cutoff": "2023-10"
},
"results": {
"math-algebra": {"Score": 52.23039742, "Avg Rank": 46, "Min Rank": 45, "Max Rank": 48},
"math-probability": {"Score": 51.67173535, "Avg Rank": 47.33333333, "Min Rank": 47, "Max Rank": 48},
"reasoning-logical": {"Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
"overall": {"Avg Rank": 48, "Min Rank": 45, "Max Rank": 50}
}},
{"config": {
"model_name": "gemma-1.1-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 52.22372428, "Avg Rank": 46, "Min Rank": 45, "Max Rank": 47},
"math-probability": {"Score": 51.74306688, "Avg Rank": 46.33333333, "Min Rank": 46, "Max Rank": 47},
"reasoning-logical": {"Avg Rank": 48, "Min Rank": 48, "Max Rank": 48},
"overall": {"Avg Rank": 47, "Min Rank": 45, "Max Rank": 48}
}},
{"config": {
"model_name": "llama2-7b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023-10"
},
"results": {
"math-algebra": {"Score": 51.83025857, "Avg Rank": 46.33333333, "Min Rank": 46, "Max Rank": 47},
"math-probability": {"Score": 51.19585847, "Avg Rank": 47.33333333, "Min Rank": 46, "Max Rank": 48},
"reasoning-logical": {"Avg Rank": 42.66666667, "Min Rank": 42, "Max Rank": 43},
"overall": {"Avg Rank": 45, "Min Rank": 42, "Max Rank": 48}
}},
{"config": {
"model_name": "gemma-2b-it",
"organization": "Google",
"license": "Gemma License",
"knowledge_cutoff": "2023-11"
},
"results": {
"math-algebra": {"Score": 51.60281474, "Avg Rank": 47.66666667, "Min Rank": 47, "Max Rank": 48},
"math-probability": {"Score": 51.52250905, "Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
"reasoning-logical": {"Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
"overall": {"Avg Rank": 49, "Min Rank": 47, "Max Rank": 51}
}},
{"config": {
"model_name": "llama2-13b-chat",
"organization": "Meta",
"license": "Llama 2 Community",
"knowledge_cutoff": "2023-12"
},
"results": {
"math-algebra": {"Score": 51.21273132, "Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
"math-probability": {"Score": 51.72056522, "Avg Rank": 45, "Min Rank": 45, "Max Rank": 45},
"reasoning-logical": {"Avg Rank": 39, "Min Rank": 38, "Max Rank": 40},
"overall": {"Avg Rank": 44, "Min Rank": 38, "Max Rank": 49}
}},
{"config": {
"model_name": "vicuna-7b",
"organization": "LMSYS",
"license": "Non-commercial",
"knowledge_cutoff": "2023-11"
},
"results": {
"math-algebra": {"Score": 51.31450547, "Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
"math-probability": {"Score": 52.72504618, "Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
"reasoning-logical": {"Avg Rank": 47, "Min Rank": 47, "Max Rank": 47},
"overall": {"Avg Rank": 48, "Min Rank": 47, "Max Rank": 50}
}},
{"config": {
"model_name": "koala-13b",
"organization": "UC Berkeley",
"license": "Non-commercial",
"knowledge_cutoff": "2023-10"
},
"results": {
"math-algebra": {"Score": 50.19054677, "Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
"math-probability": {"Score": 50.741989, "Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
"reasoning-logical": {"Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
"overall": {"Avg Rank": 50, "Min Rank": 49, "Max Rank": 51}
}},
{"config": {
"model_name": "openassistant-pythia-12b",
"organization": "OpenAssistant",
"license": "Non-commercial",
"knowledge_cutoff": "2023-09"
},
"results": {
"math-algebra": {"Score": 50, "Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
"math-probability": {"Score": 50, "Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
"reasoning-logical": {"Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
"overall": {"Avg Rank": 52, "Min Rank": 52, "Max Rank": 52}
}}
]