yzabc007 committed
Commit aee5c0e · 1 Parent(s): 8643afc
app.py CHANGED
@@ -99,15 +99,8 @@ def init_leaderboard(dataframe):
         interactive=False,
     )
 
-    # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
-    # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
-    # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
-    # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
-    # model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
-    # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
-    # model_result_path = "./src/results/models_2024-10-10-06:18:54.263527.json"
-    # model_result_path = "./src/results/models_2024-10-18-14:06:13.588399.json"
-    model_result_path = "./src/results/models_2024-10-20-23:34:57.242641.json"
+    # model_result_path = "./src/results/models_2024-10-20-23:34:57.242641.json"
+    model_result_path = "./src/results/models_2024-10-24-08:08:59.127307.json"
     # model_leaderboard_df = get_model_leaderboard_df(model_result_path)
 
 
@@ -267,22 +260,36 @@ with demo:
 
 
         with gr.TabItem("🔢 Math", elem_id="math-tab-table", id=2):
-            DESCRIPTION_TEXT="""
-            Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
-            To mitigate the potential impact of data contimination, we have carefully selected the datasets from various sources.
-            We prioritize **recent math datasets** and focus on **college and beyond level** math questions.
-            The current datasets include
-            [MATH](https://arxiv.org/abs/2103.03874),
-            [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits),
-            [Omni](https://omni-math.github.io/),
-            [MathQA](https://arxiv.org/abs/1905.13319),
-            [MathBench](https://arxiv.org/abs/2405.12209),
-            [SciBench](https://arxiv.org/abs/2307.10635), and more!
-
-            We plan to include more math domains, such as calculus, number theory, and more in the future.
-            """
-            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
+            # DESCRIPTION_TEXT="""
+            # Algebra, Geometry, and Probability are the current three main math domains in the leaderboard.
+            # To mitigate the potential impact of data contamination, we have carefully selected the datasets from various sources.
+            # We prioritize **recent math datasets** and focus on **college and beyond level** math questions.
+            # The current datasets include
+            # [MATH](https://arxiv.org/abs/2103.03874),
+            # [MATH-500](https://github.com/openai/prm800k/tree/main/prm800k/math_splits),
+            # [Omni](https://omni-math.github.io/),
+            # [MathQA](https://arxiv.org/abs/1905.13319),
+            # [MathBench](https://arxiv.org/abs/2405.12209),
+            # [SciBench](https://arxiv.org/abs/2307.10635), and more!
+
+            # We plan to include more math domains, such as calculus, number theory, and more in the future.
+            # """
+            # gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
+
+            TEXT = (
+                f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+                '<b>Total #models: 57 (Last updated: 2024-10-21)</b>'
+                '</p>'
+                f'<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+                'This page provides a comprehensive overview of model ranks across various dimensions, based on their averaged ranks or scores. '
+                '(Missing values are due to slow or problematic model responses, to be fixed soon.)'
+                '</p>'
+                # '<p style="font-size:{INTRODUCTION_TEXT_FONT_SIZE}px;">'
+                # 'We present '
+                # '</p>'
+            )
+            gr.HTML(TEXT)
+
             # leaderboard = init_leaderboard(LEADERBOARD_DF)
             with gr.TabItem("🏆 Overview", elem_id="math_overview_subtab", id=0, elem_classes="subtab"):
 
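Note on the change above: the commented-out call `get_model_leaderboard_df(model_result_path)` suggests how the results file feeds the leaderboard table. The snippet below is only an illustrative sketch, assuming the new `models_2024-10-24-08:08:59.127307.json` keeps the per-model `config`/`results` record layout used by the older result files deleted in this commit; the helper name `load_model_results` and the flattening scheme are hypothetical, not the repository's actual implementation.

```python
import json

import pandas as pd


def load_model_results(model_result_path: str) -> pd.DataFrame:
    """Hypothetical sketch: flatten per-model records into one leaderboard row each."""
    with open(model_result_path) as f:
        records = json.load(f)  # assumed: a list of {"config": {...}, "results": {...}} objects

    rows = []
    for record in records:
        row = dict(record["config"])  # model_name, organization, license, knowledge_cutoff
        for category, metrics in record["results"].items():
            if not isinstance(metrics, dict):  # skip missing entries
                continue
            for metric_name, value in metrics.items():
                row[f"{category} ({metric_name})"] = value  # e.g. "math-algebra (Score)"
        rows.append(row)
    return pd.DataFrame(rows)


# Usage (hypothetical):
# model_leaderboard_df = load_model_results("./src/results/models_2024-10-24-08:08:59.127307.json")
```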
src/results/models_2024-10-07-14:50:12.666068.jsonl DELETED
@@ -1,677 +0,0 @@
1
- [
2
- {"config": {
3
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
4
- "organization": "OpenAI",
5
- "license": "Proprietary",
6
- "knowledge_cutoff": "2023/10"
7
- },
8
- "results": {
9
- "math-algebra": {"Score": 99.19484702, "Avg Rank": 1.666666667, "Min Rank": 1, "Max Rank": 3},
10
- "math-probability": {"Score": 100, "Avg Rank": 1, "Min Rank": 1, "Max Rank": 1},
11
- "reasoning-logical": {"Avg Rank": 1, "Min Rank": 1, "Max Rank": 1},
12
- "overall": {"Avg Rank": 2, "Min Rank": 2, "Max Rank": 2}
13
- }},
14
-
15
- {"config": {
16
- "model_name": "gpt-4o-2024-08-06",
17
- "organization": "OpenAI",
18
- "license": "Proprietary",
19
- "knowledge_cutoff": "2023/10"
20
- },
21
- "results": {
22
- "math-algebra": {"Score": 98.38969404, "Avg Rank": 1.666666667, "Min Rank": 1, "Max Rank": 2},
23
- "math-probability": {"Score": 96.49758454, "Avg Rank": 2.666666667, "Min Rank": 2, "Max Rank": 4},
24
- "reasoning-logical": {"Avg Rank": 4.333333333, "Min Rank": 3, "Max Rank": 5},
25
- "overall": {"Avg Rank": 7.33, "Min Rank": 4, "Max Rank": 9}
26
- }},
27
-
28
- {"config": {
29
- "model_name": "gpt-4o-2024-05-13",
30
- "organization": "OpenAI",
31
- "license": "Proprietary",
32
- "knowledge_cutoff": "2023/10"
33
- },
34
- "results": {
35
- "math-algebra": {"Score": 98.15480333, "Avg Rank": 2.666666667, "Min Rank": 2, "Max Rank": 3},
36
- "math-probability": {"Score": 94.83939431, "Avg Rank": 3.666666667, "Min Rank": 2, "Max Rank": 5},
37
- "reasoning-logical": {"Avg Rank": 6.333333333, "Min Rank": 3, "Max Rank": 8},
38
- "overall": {"Avg Rank": 7.67, "Min Rank": 7, "Max Rank": 9}
39
- }},
40
-
41
- {"config": {
42
- "model_name": "gpt-4-turbo-2024-04-09",
43
- "organization": "OpenAI",
44
- "license": "Proprietary",
45
- "knowledge_cutoff": "2023/12"
46
- },
47
- "results": {
48
- "math-algebra": {"Score": 96.03195879, "Avg Rank": 4, "Min Rank": 4, "Max Rank": 4},
49
- "math-probability": {"Score": 93.59903382, "Avg Rank": 6.666666667, "Min Rank": 6, "Max Rank": 8},
50
- "reasoning-logical": {"Avg Rank": 4, "Min Rank": 2, "Max Rank": 7},
51
- "overall": {"Avg Rank": 6, "Min Rank": 5, "Max Rank": 8}
52
- }},
53
-
54
- {"config": {
55
- "model_name": "gemini-1.5-pro-001",
56
- "organization": "Google",
57
- "license": "Proprietary",
58
- "knowledge_cutoff": "2024-01"
59
- },
60
- "results": {
61
- "math-algebra": {"Score": 94.7572213, "Avg Rank": 5, "Min Rank": 5, "Max Rank": 5},
62
- "math-probability": {"Score": 91.42512077, "Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 10},
63
- "reasoning-logical": {"Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 11},
64
- "overall": {"Avg Rank": 14, "Min Rank": 13, "Max Rank": 15}
65
- }},
66
-
67
- {"config": {
68
- "model_name": "qwen2-72b-instruct",
69
- "organization": "Alibaba",
70
- "license": "Qianwen LICENSE",
71
- "knowledge_cutoff": "2024-02"
72
- },
73
- "results": {
74
- "math-algebra": {"Score": 93.88818605, "Avg Rank": 6, "Min Rank": 6, "Max Rank": 6},
75
- "math-probability": {"Score": 91.54326174, "Avg Rank": 4, "Min Rank": 3, "Max Rank": 5},
76
- "reasoning-logical": {"Avg Rank": 15.66666667, "Min Rank": 15, "Max Rank": 17},
77
- "overall": {"Avg Rank": 17, "Min Rank": 17, "Max Rank": 17}
78
- }},
79
-
80
- {"config": {
81
- "model_name": "gpt-4o-mini-2024-07-18",
82
- "organization": "OpenAI",
83
- "license": "Proprietary",
84
- "knowledge_cutoff": "2024-07"
85
- },
86
- "results": {
87
- "math-algebra": {"Score": 93.22073596, "Avg Rank": 7, "Min Rank": 7, "Max Rank": 7},
88
- "math-probability": {"Score": 92.17351456, "Avg Rank": 3.666666667, "Min Rank": 3, "Max Rank": 5},
89
- "reasoning-logical": {"Avg Rank": 9.666666667, "Min Rank": 9, "Max Rank": 10},
90
- "overall": {"Avg Rank": 7, "Min Rank": 5, "Max Rank": 8}
91
- }},
92
-
93
- {"config": {
94
- "model_name": "claude-3.5-sonnet",
95
- "organization": "Anthropic",
96
- "license": "Proprietary",
97
- "knowledge_cutoff": "2024-03"
98
- },
99
- "results": {
100
- "math-algebra": {"Score": 91.5823805, "Avg Rank": 8.333333333, "Min Rank": 8, "Max Rank": 9},
101
- "math-probability": {"Score": 91.55011915, "Avg Rank": 8, "Min Rank": 7, "Max Rank": 9},
102
- "reasoning-logical": {"Avg Rank": 5, "Min Rank": 2, "Max Rank": 7},
103
- "overall": {"Avg Rank": 5, "Min Rank": 4, "Max Rank": 7}
104
- }},
105
-
106
- {"config": {
107
- "model_name": "o1-mini",
108
- "organization": "01 AI",
109
- "license": "Proprietary",
110
- "knowledge_cutoff": "2024-01"
111
- },
112
- "results": {
113
- "math-algebra": None,
114
- "math-probability": None,
115
- "reasoning-logical": None,
116
- "overall": {"Avg Rank": 1, "Min Rank": 1, "Max Rank": 1}
117
- }},
118
-
119
- {"config": {
120
- "model_name": "o1-preview",
121
- "organization": "01 AI",
122
- "license": "Proprietary",
123
- "knowledge_cutoff": "2024-01"
124
- },
125
- "results": {
126
- "math-algebra": None,
127
- "math-probability": None,
128
- "reasoning-logical": None,
129
- "overall": {"Avg Rank": 3, "Min Rank": 3, "Max Rank": 3}
130
- }},
131
-
132
- {"config": {
133
- "model_name": "gemini-1.5-flash-001",
134
- "organization": "Google",
135
- "license": "Proprietary",
136
- "knowledge_cutoff": "2024-02"
137
- },
138
- "results": {
139
- "math-algebra": {"Score": 91.30211121, "Avg Rank": 11, "Min Rank": 11, "Max Rank": 11},
140
- "math-probability": {"Score": 91.066099, "Avg Rank": 12, "Min Rank": 10, "Max Rank": 13},
141
- "reasoning-logical": {"Avg Rank": 15.66666667, "Min Rank": 15, "Max Rank": 16},
142
- "overall": {"Avg Rank": 14, "Min Rank": 13, "Max Rank": 15}
143
- }},
144
-
145
- {"config": {
146
- "model_name": "gpt4-1106",
147
- "organization": "OpenAI",
148
- "license": "Proprietary",
149
- "knowledge_cutoff": "2024-04"
150
- },
151
- "results": {
152
- "math-algebra": {"Score": 91.2227739, "Avg Rank": 12, "Min Rank": 12, "Max Rank": 12},
153
- "math-probability": {"Score": 91.09550085, "Avg Rank": 11.66666667, "Min Rank": 11, "Max Rank": 12},
154
- "reasoning-logical": {"Avg Rank": 12, "Min Rank": 12, "Max Rank": 12},
155
- "overall": {"Avg Rank": 12, "Min Rank": 11, "Max Rank": 12}
156
- }},
157
-
158
- {"config": {
159
- "model_name": "gemma-2-27b-it",
160
- "organization": "Google",
161
- "license": "Gemma License",
162
- "knowledge_cutoff": "2024-03"
163
- },
164
- "results": {
165
- "math-algebra": {"Score": 91.08554346, "Avg Rank": 13.33333333, "Min Rank": 13, "Max Rank": 14},
166
- "math-probability": {"Score": 91.09516215, "Avg Rank": 14, "Min Rank": 14, "Max Rank": 14},
167
- "reasoning-logical": {"Avg Rank": 13, "Min Rank": 13, "Max Rank": 13},
168
- "overall": {"Avg Rank": 13, "Min Rank": 12, "Max Rank": 14}
169
- }},
170
-
171
- {"config": {
172
- "model_name": "claude-3-opus",
173
- "organization": "Anthropic",
174
- "license": "Proprietary",
175
- "knowledge_cutoff": "2024-01"
176
- },
177
- "results": {
178
- "math-algebra": {"Score": 89.75345785, "Avg Rank": 13.66666667, "Min Rank": 13, "Max Rank": 14},
179
- "math-probability": {"Score": 91.06939607, "Avg Rank": 11.33333333, "Min Rank": 11, "Max Rank": 12},
180
- "reasoning-logical": {"Avg Rank": 10.66666667, "Min Rank": 10, "Max Rank": 11},
181
- "overall": {"Avg Rank": 12, "Min Rank": 10, "Max Rank": 15}
182
- }},
183
-
184
- {"config": {
185
- "model_name": "gemma-2-9b-it-simpo",
186
- "organization": "Google",
187
- "license": "Gemma License",
188
- "knowledge_cutoff": "2024-02"
189
- },
190
- "results": {
191
- "math-algebra": {"Score": 87.66368227, "Avg Rank": 15, "Min Rank": 15, "Max Rank": 15},
192
- "math-probability": {"Score": 73.64665336, "Avg Rank": 17, "Min Rank": 17, "Max Rank": 17},
193
- "reasoning-logical": {"Avg Rank": 19, "Min Rank": 19, "Max Rank": 19},
194
- "overall": {"Avg Rank": 17, "Min Rank": 15, "Max Rank": 19}
195
- }},
196
-
197
- {"config": {
198
- "model_name": "qwen1.5-72b-chat",
199
- "organization": "Alibaba",
200
- "license": "Qianwen LICENSE",
201
- "knowledge_cutoff": "2024-03"
202
- },
203
- "results": {
204
- "math-algebra": {"Score": 86.56207015, "Avg Rank": 16, "Min Rank": 16, "Max Rank": 16},
205
- "math-probability": {"Score": 72.7735874, "Avg Rank": 21, "Min Rank": 20, "Max Rank": 22},
206
- "reasoning-logical": {"Avg Rank": 29.66666667, "Min Rank": 28, "Max Rank": 31},
207
- "overall": {"Avg Rank": 23, "Min Rank": 16, "Max Rank": 31}
208
- }},
209
-
210
- {"config": {
211
- "model_name": "qwen1.5-32b-chat",
212
- "organization": "Alibaba",
213
- "license": "Qianwen LICENSE",
214
- "knowledge_cutoff": "2024-03"
215
- },
216
- "results": {
217
- "math-algebra": {"Score": 84.59439036, "Avg Rank": 17.33333333, "Min Rank": 17, "Max Rank": 18},
218
- "math-probability": {"Score": 76.61348265, "Avg Rank": 22.33333333, "Min Rank": 22, "Max Rank": 23},
219
- "reasoning-logical": {"Avg Rank": 28.66666667, "Min Rank": 27, "Max Rank": 30},
220
- "overall": {"Avg Rank": 22, "Min Rank": 17, "Max Rank": 30}
221
- }},
222
-
223
- {"config": {
224
- "model_name": "google-gemma-2-9b-it",
225
- "organization": "Google",
226
- "license": "Proprietary",
227
- "knowledge_cutoff": "2024-01"
228
- },
229
- "results": {
230
- "math-algebra": {"Score": 84.18901776, "Avg Rank": 18, "Min Rank": 17, "Max Rank": 19},
231
- "math-probability": {"Score": 74.46332504, "Avg Rank": 16, "Min Rank": 16, "Max Rank": 16},
232
- "reasoning-logical": {"Avg Rank": 14, "Min Rank": 14, "Max Rank": 14},
233
- "overall": {"Avg Rank": 16, "Min Rank": 14, "Max Rank": 19}
234
- }},
235
-
236
- {"config": {
237
- "model_name": "yi-1.5-34b-chat",
238
- "organization": "01 AI",
239
- "license": "Proprietary",
240
- "knowledge_cutoff": "2024-01"
241
- },
242
- "results": {
243
- "math-algebra": {"Score": 81.82921677, "Avg Rank": 18.66666667, "Min Rank": 18, "Max Rank": 19},
244
- "math-probability": {"Score": 77.41945842, "Avg Rank": 15, "Min Rank": 15, "Max Rank": 15},
245
- "reasoning-logical": {"Avg Rank": 17.33333333, "Min Rank": 17, "Max Rank": 18},
246
- "overall": {"Avg Rank": 18, "Min Rank": 15, "Max Rank": 19}
247
- }},
248
-
249
- {"config": {
250
- "model_name": "meta-llama-3.1-8b-instruct",
251
- "organization": "Meta",
252
- "license": "Llama 3.1 Community",
253
- "knowledge_cutoff": "2024-02"
254
- },
255
- "results": {
256
- "math-algebra": {"Score": 75.57121963, "Avg Rank": 20.33333333, "Min Rank": 20, "Max Rank": 21},
257
- "math-probability": {"Score": 75.46243493, "Avg Rank": 20.33333333, "Min Rank": 20, "Max Rank": 21},
258
- "reasoning-logical": {"Avg Rank": 23.66666667, "Min Rank": 23, "Max Rank": 24},
259
- "overall": {"Avg Rank": 21, "Min Rank": 20, "Max Rank": 24}
260
- }},
261
-
262
- {"config": {
263
- "model_name": "gpt3.5-turbo-0125",
264
- "organization": "OpenAI",
265
- "license": "Proprietary",
266
- "knowledge_cutoff": "2023-12"
267
- },
268
- "results": {
269
- "math-algebra": {"Score": 73.29235048, "Avg Rank": 21.33333333, "Min Rank": 21, "Max Rank": 22},
270
- "math-probability": {"Score": 66.27452275, "Avg Rank": 24, "Min Rank": 24, "Max Rank": 24},
271
- "reasoning-logical": {"Avg Rank": 42.66666667, "Min Rank": 42, "Max Rank": 44},
272
- "overall": {"Avg Rank": 29, "Min Rank": 21, "Max Rank": 44}
273
- }},
274
-
275
- {"config": {
276
- "model_name": "llama-3-70b-instruct",
277
- "organization": "Meta",
278
- "license": "Llama 3 Community",
279
- "knowledge_cutoff": "2024-03"
280
- },
281
- "results": {
282
- "math-algebra": {"Score": 73.75419539, "Avg Rank": 21.33333333, "Min Rank": 20, "Max Rank": 22},
283
- "math-probability": {"Score": 87.86358478, "Avg Rank": 18.33333333, "Min Rank": 18, "Max Rank": 19},
284
- "reasoning-logical": {"Avg Rank": 3.333333333, "Min Rank": 2, "Max Rank": 4},
285
- "overall": {"Avg Rank": 15, "Min Rank": 3, "Max Rank": 22}
286
- }},
287
-
288
- {"config": {
289
- "model_name": "claude-3-sonnet",
290
- "organization": "Anthropic",
291
- "license": "Proprietary",
292
- "knowledge_cutoff": "2024-02"
293
- },
294
- "results": {
295
- "math-algebra": {"Score": 71.15353833, "Avg Rank": 23, "Min Rank": 23, "Max Rank": 23},
296
- "math-probability": {"Score": 88.02362801, "Avg Rank": 18.66666667, "Min Rank": 18, "Max Rank": 19},
297
- "reasoning-logical": {"Avg Rank": 17.33333333, "Min Rank": 16, "Max Rank": 18},
298
- "overall": {"Avg Rank": 20, "Min Rank": 16, "Max Rank": 23}
299
- }},
300
-
301
- {"config": {
302
- "model_name": "qwen1.5-14b-chat",
303
- "organization": "Alibaba",
304
- "license": "Qianwen LICENSE",
305
- "knowledge_cutoff": "2024-01"
306
- },
307
- "results": {
308
- "math-algebra": {"Score": 69.70470323, "Avg Rank": 24, "Min Rank": 24, "Max Rank": 24},
309
- "math-probability": {"Score": 66.41420544, "Avg Rank": 28.66666667, "Min Rank": 28, "Max Rank": 29},
310
- "reasoning-logical": {"Avg Rank": 34, "Min Rank": 34, "Max Rank": 34},
311
- "overall": {"Avg Rank": 28, "Min Rank": 24, "Max Rank": 34}
312
- }},
313
-
314
- {"config": {
315
- "model_name": "claude-3-haiku",
316
- "organization": "Anthropic",
317
- "license": "Proprietary",
318
- "knowledge_cutoff": "2024-01"
319
- },
320
- "results": {
321
- "math-algebra": {"Score": 68.44060149, "Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
322
- "math-probability": {"Score": 76.46075239, "Avg Rank": 22.33333333, "Min Rank": 21, "Max Rank": 23},
323
- "reasoning-logical": {"Avg Rank": 20, "Min Rank": 20, "Max Rank": 20},
324
- "overall": {"Avg Rank": 22, "Min Rank": 20, "Max Rank": 25}
325
- }},
326
-
327
- {"config": {
328
- "model_name": "claude-2.1",
329
- "organization": "Anthropic",
330
- "license": "Proprietary",
331
- "knowledge_cutoff": "2023-12"
332
- },
333
- "results": {
334
- "math-algebra": {"Score": 67.59939121, "Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
335
- "math-probability": {"Score": 68.89772398, "Avg Rank": 27, "Min Rank": 27, "Max Rank": 27},
336
- "reasoning-logical": {"Avg Rank": 21, "Min Rank": 21, "Max Rank": 21},
337
- "overall": {"Avg Rank": 25, "Min Rank": 21, "Max Rank": 27}
338
- }},
339
-
340
- {"config": {
341
- "model_name": "mistral-8x7b-instruct-v0.1",
342
- "organization": "Mistral",
343
- "license": "Apache 2.0",
344
- "knowledge_cutoff": "2023-12"
345
- },
346
- "results": {
347
- "math-algebra": {"Score": 64.71364004, "Avg Rank": 27, "Min Rank": 27, "Max Rank": 27},
348
- "math-probability": {"Score": 67.67468595, "Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
349
- "reasoning-logical": {"Avg Rank": 29, "Min Rank": 28, "Max Rank": 30},
350
- "overall": {"Avg Rank": 27, "Min Rank": 26, "Max Rank": 30}
351
- }},
352
-
353
- {"config": {
354
- "model_name": "claude-2.0",
355
- "organization": "Anthropic",
356
- "license": "Proprietary",
357
- "knowledge_cutoff": "2023-10"
358
- },
359
- "results": {
360
- "math-algebra": {"Score": 64.77311289, "Avg Rank": 28, "Min Rank": 28, "Max Rank": 28},
361
- "math-probability": {"Score": 74.34063069, "Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
362
- "reasoning-logical": {"Avg Rank": 23.33333333, "Min Rank": 23, "Max Rank": 24},
363
- "overall": {"Avg Rank": 25, "Min Rank": 23, "Max Rank": 28}
364
- }},
365
-
366
- {"config": {
367
- "model_name": "starling-lm-7b-beta",
368
- "organization": "Nexusflow",
369
- "license": "Apache-2.0",
370
- "knowledge_cutoff": "2024-01"
371
- },
372
- "results": {
373
- "math-algebra": {"Score": 64.01222884, "Avg Rank": 29.33333333, "Min Rank": 29, "Max Rank": 30},
374
- "math-probability": {"Score": 70.42025806, "Avg Rank": 28.33333333, "Min Rank": 28, "Max Rank": 29},
375
- "reasoning-logical": {"Avg Rank": 25, "Min Rank": 25, "Max Rank": 25},
376
- "overall": {"Avg Rank": 27, "Min Rank": 25, "Max Rank": 30}
377
- }},
378
-
379
- {"config": {
380
- "model_name": "gemini-1.0-pro-001",
381
- "organization": "Google",
382
- "license": "Proprietary",
383
- "knowledge_cutoff": "2023-11"
384
- },
385
- "results": {
386
- "math-algebra": {"Score": 63.93365247, "Avg Rank": 29.66666667, "Min Rank": 29, "Max Rank": 30},
387
- "math-probability": {"Score": 62.13077748, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 38},
388
- "reasoning-logical": {"Avg Rank": 37.33333333, "Min Rank": 36, "Max Rank": 40},
389
- "overall": {"Avg Rank": 34, "Min Rank": 29, "Max Rank": 40}
390
- }},
391
-
392
- {"config": {
393
- "model_name": "openchat-3.5-0106",
394
- "organization": "OpenChat",
395
- "license": "Apache-2.0",
396
- "knowledge_cutoff": "2024-01"
397
- },
398
- "results": {
399
- "math-algebra": {"Score": 63.02959506, "Avg Rank": 31, "Min Rank": 31, "Max Rank": 31},
400
- "math-probability": {"Score": 61.00599665, "Avg Rank": 30, "Min Rank": 30, "Max Rank": 30},
401
- "reasoning-logical": {"Avg Rank": 27.66666667, "Min Rank": 27, "Max Rank": 29},
402
- "overall": {"Avg Rank": 29, "Min Rank": 27, "Max Rank": 31}
403
- }},
404
-
405
- {"config": {
406
- "model_name": "openchat-3.5",
407
- "organization": "OpenChat",
408
- "license": "Apache-2.0",
409
- "knowledge_cutoff": "2023-12"
410
- },
411
- "results": {
412
- "math-algebra": {"Score": 61.45954168, "Avg Rank": 32.33333333, "Min Rank": 32, "Max Rank": 33},
413
- "math-probability": {"Score": 62.56195929, "Avg Rank": 32, "Min Rank": 32, "Max Rank": 32},
414
- "reasoning-logical": {"Avg Rank": 32, "Min Rank": 31, "Max Rank": 33},
415
- "overall": {"Avg Rank": 32, "Min Rank": 31, "Max Rank": 33}
416
- }},
417
-
418
- {"config": {
419
- "model_name": "command-r-(08-2024)",
420
- "organization": "Cohere",
421
- "license": "CC-BY-NC-4.0",
422
- "knowledge_cutoff": "2024-08"
423
- },
424
- "results": {
425
- "math-algebra": {"Score": 61.0679475, "Avg Rank": 32.66666667, "Min Rank": 32, "Max Rank": 33},
426
- "math-probability": {"Score": 66.00833826, "Avg Rank": 31, "Min Rank": 31, "Max Rank": 31},
427
- "reasoning-logical": {"Avg Rank": 37.66666667, "Min Rank": 37, "Max Rank": 38},
428
- "overall": {"Avg Rank": 34, "Min Rank": 31, "Max Rank": 38}
429
- }},
430
-
431
- {"config": {
432
- "model_name": "gemma-1.1-7b-it",
433
- "organization": "Google",
434
- "license": "Gemma License",
435
- "knowledge_cutoff": "2023-11"
436
- },
437
- "results": {
438
- "math-algebra": {"Score": 60.92904194, "Avg Rank": 34.33333333, "Min Rank": 34, "Max Rank": 35},
439
- "math-probability": {"Score": 62.17574935, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
440
- "reasoning-logical": {"Avg Rank": 30.33333333, "Min Rank": 28, "Max Rank": 32},
441
- "overall": {"Avg Rank": 34, "Min Rank": 28, "Max Rank": 37}
442
- }},
443
-
444
- {"config": {
445
- "model_name": "llama3-8b-instruct",
446
- "organization": "Meta",
447
- "license": "Llama 3 Community",
448
- "knowledge_cutoff": "2024-01"
449
- },
450
- "results": {
451
- "math-algebra": {"Score": 61.06411319, "Avg Rank": 35, "Min Rank": 34, "Max Rank": 36},
452
- "math-probability": {"Score": 62.13077748, "Avg Rank": 34.66666667, "Min Rank": 34, "Max Rank": 35},
453
- "reasoning-logical": {"Avg Rank": 22, "Min Rank": 22, "Max Rank": 22},
454
- "overall": {"Avg Rank": 30, "Min Rank": 22, "Max Rank": 36}
455
- }},
456
-
457
- {"config": {
458
- "model_name": "gemma-2-2b-it",
459
- "organization": "Google",
460
- "license": "Gemma License",
461
- "knowledge_cutoff": "2023-12"
462
- },
463
- "results": {
464
- "math-algebra": {"Score": 59.70248014, "Avg Rank": 36, "Min Rank": 35, "Max Rank": 37},
465
- "math-probability": {"Score": 61.08084527, "Avg Rank": 33.66666667, "Min Rank": 33, "Max Rank": 35},
466
- "reasoning-logical": {"Avg Rank": 26, "Min Rank": 26, "Max Rank": 26},
467
- "overall": {"Avg Rank": 32, "Min Rank": 26, "Max Rank": 37}
468
- }},
469
-
470
- {"config": {
471
- "model_name": "starling-lm-7b-alpha",
472
- "organization": "Nexusflow",
473
- "license": "Apache-2.0",
474
- "knowledge_cutoff": "2023-12"
475
- },
476
- "results": {
477
- "math-algebra": {"Score": 59.574329, "Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
478
- "math-probability": {"Score": 64.03683254, "Avg Rank": 33.66666667, "Min Rank": 33, "Max Rank": 34},
479
- "reasoning-logical": {"Avg Rank": 35, "Min Rank": 35, "Max Rank": 35},
480
- "overall": {"Avg Rank": 35, "Min Rank": 33, "Max Rank": 37}
481
- }},
482
-
483
- {"config": {
484
- "model_name": "qwen1.5-4b-chat",
485
- "organization": "Alibaba",
486
- "license": "Qianwen LICENSE",
487
- "knowledge_cutoff": "2024-02"
488
- },
489
- "results": {
490
- "math-algebra": {"Score": 56.66282914, "Avg Rank": 38.33333333, "Min Rank": 38, "Max Rank": 39},
491
- "math-probability": {"Score": 57.39032697, "Avg Rank": 43, "Min Rank": 43, "Max Rank": 43},
492
- "reasoning-logical": {"Avg Rank": 46, "Min Rank": 46, "Max Rank": 46},
493
- "overall": {"Avg Rank": 42, "Min Rank": 38, "Max Rank": 46}
494
- }},
495
-
496
- {"config": {
497
- "model_name": "command-r-(04-2024)",
498
- "organization": "Cohere",
499
- "license": "CC-BY-NC-4.0",
500
- "knowledge_cutoff": "2024-04"
501
- },
502
- "results": {
503
- "math-algebra": {"Score": 56.19063413, "Avg Rank": 38.66666667, "Min Rank": 38, "Max Rank": 39},
504
- "math-probability": {"Score": 54.37641509, "Avg Rank": 37.66666667, "Min Rank": 37, "Max Rank": 38},
505
- "reasoning-logical": {"Avg Rank": 32.66666667, "Min Rank": 32, "Max Rank": 33},
506
- "overall": {"Avg Rank": 36, "Min Rank": 32, "Max Rank": 39}
507
- }},
508
-
509
- {"config": {
510
- "model_name": "vicuna-33b",
511
- "organization": "LMSYS",
512
- "license": "Non-commercial",
513
- "knowledge_cutoff": "2023-12"
514
- },
515
- "results": {
516
- "math-algebra": {"Score": 54.71037983, "Avg Rank": 40.66666667, "Min Rank": 40, "Max Rank": 42},
517
- "math-probability": {"Score": 55.02214588, "Avg Rank": 41, "Min Rank": 41, "Max Rank": 41},
518
- "reasoning-logical": {"Avg Rank": 41, "Min Rank": 41, "Max Rank": 41},
519
- "overall": {"Avg Rank": 41, "Min Rank": 40, "Max Rank": 42}
520
- }},
521
-
522
- {"config": {
523
- "model_name": "gemma-7b-it",
524
- "organization": "Google",
525
- "license": "Gemma License",
526
- "knowledge_cutoff": "2023-12"
527
- },
528
- "results": {
529
- "math-algebra": {"Score": 54.35817186, "Avg Rank": 40.66666667, "Min Rank": 40, "Max Rank": 41},
530
- "math-probability": {"Score": 58.19573446, "Avg Rank": 42, "Min Rank": 42, "Max Rank": 42},
531
- "reasoning-logical": {"Avg Rank": 39.33333333, "Min Rank": 39, "Max Rank": 40},
532
- "overall": {"Avg Rank": 41, "Min Rank": 39, "Max Rank": 42}
533
- }},
534
-
535
- {"config": {
536
- "model_name": "mistral-7b-instruct-2",
537
- "organization": "Mistral",
538
- "license": "Apache 2.0",
539
- "knowledge_cutoff": "2023-12"
540
- },
541
- "results": {
542
- "math-algebra": {"Score": 54.39240703, "Avg Rank": 41.66666667, "Min Rank": 41, "Max Rank": 42},
543
- "math-probability": {"Score": 60.35257542, "Avg Rank": 39, "Min Rank": 39, "Max Rank": 39},
544
- "reasoning-logical": {"Avg Rank": 36.66666667, "Min Rank": 36, "Max Rank": 37},
545
- "overall": {"Avg Rank": 39, "Min Rank": 36, "Max Rank": 42}
546
- }},
547
-
548
- {"config": {
549
- "model_name": "mistral-7b-instruct-1",
550
- "organization": "Mistral",
551
- "license": "Apache 2.0",
552
- "knowledge_cutoff": "2023-12"
553
- },
554
- "results": {
555
- "math-algebra": {"Score": 53.80157944, "Avg Rank": 43, "Min Rank": 43, "Max Rank": 43},
556
- "math-probability": {"Score": 56.51960666, "Avg Rank": 40, "Min Rank": 40, "Max Rank": 40},
557
- "reasoning-logical": {"Avg Rank": 45, "Min Rank": 45, "Max Rank": 45},
558
- "overall": {"Avg Rank": 43, "Min Rank": 40, "Max Rank": 45}
559
- }},
560
-
561
- {"config": {
562
- "model_name": "vicuna-13b",
563
- "organization": "LMSYS",
564
- "license": "Non-commercial",
565
- "knowledge_cutoff": "2023-11"
566
- },
567
- "results": {
568
- "math-algebra": {"Score": 53.5413765, "Avg Rank": 44, "Min Rank": 44, "Max Rank": 44},
569
- "math-probability": {"Score": 53.53586693, "Avg Rank": 44, "Min Rank": 44, "Max Rank": 44},
570
- "reasoning-logical": {"Avg Rank": 43.66666667, "Min Rank": 43, "Max Rank": 44},
571
- "overall": {"Avg Rank": 44, "Min Rank": 43, "Max Rank": 44}
572
- }},
573
-
574
- {"config": {
575
- "model_name": "zephyr-7b-beta",
576
- "organization": "HuggingFace",
577
- "license": "MIT",
578
- "knowledge_cutoff": "2023-10"
579
- },
580
- "results": {
581
- "math-algebra": {"Score": 52.23039742, "Avg Rank": 46, "Min Rank": 45, "Max Rank": 48},
582
- "math-probability": {"Score": 51.67173535, "Avg Rank": 47.33333333, "Min Rank": 47, "Max Rank": 48},
583
- "reasoning-logical": {"Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
584
- "overall": {"Avg Rank": 48, "Min Rank": 45, "Max Rank": 50}
585
- }},
586
-
587
- {"config": {
588
- "model_name": "gemma-1.1-2b-it",
589
- "organization": "Google",
590
- "license": "Gemma License",
591
- "knowledge_cutoff": "2023-12"
592
- },
593
- "results": {
594
- "math-algebra": {"Score": 52.22372428, "Avg Rank": 46, "Min Rank": 45, "Max Rank": 47},
595
- "math-probability": {"Score": 51.74306688, "Avg Rank": 46.33333333, "Min Rank": 46, "Max Rank": 47},
596
- "reasoning-logical": {"Avg Rank": 48, "Min Rank": 48, "Max Rank": 48},
597
- "overall": {"Avg Rank": 47, "Min Rank": 45, "Max Rank": 48}
598
- }},
599
-
600
- {"config": {
601
- "model_name": "llama2-7b-chat",
602
- "organization": "Meta",
603
- "license": "Llama 2 Community",
604
- "knowledge_cutoff": "2023-10"
605
- },
606
- "results": {
607
- "math-algebra": {"Score": 51.83025857, "Avg Rank": 46.33333333, "Min Rank": 46, "Max Rank": 47},
608
- "math-probability": {"Score": 51.19585847, "Avg Rank": 47.33333333, "Min Rank": 46, "Max Rank": 48},
609
- "reasoning-logical": {"Avg Rank": 42.66666667, "Min Rank": 42, "Max Rank": 43},
610
- "overall": {"Avg Rank": 45, "Min Rank": 42, "Max Rank": 48}
611
- }},
612
-
613
- {"config": {
614
- "model_name": "gemma-2b-it",
615
- "organization": "Google",
616
- "license": "Gemma License",
617
- "knowledge_cutoff": "2023-11"
618
- },
619
- "results": {
620
- "math-algebra": {"Score": 51.60281474, "Avg Rank": 47.66666667, "Min Rank": 47, "Max Rank": 48},
621
- "math-probability": {"Score": 51.52250905, "Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
622
- "reasoning-logical": {"Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
623
- "overall": {"Avg Rank": 49, "Min Rank": 47, "Max Rank": 51}
624
- }},
625
-
626
- {"config": {
627
- "model_name": "llama2-13b-chat",
628
- "organization": "Meta",
629
- "license": "Llama 2 Community",
630
- "knowledge_cutoff": "2023-12"
631
- },
632
- "results": {
633
- "math-algebra": {"Score": 51.21273132, "Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
634
- "math-probability": {"Score": 51.72056522, "Avg Rank": 45, "Min Rank": 45, "Max Rank": 45},
635
- "reasoning-logical": {"Avg Rank": 39, "Min Rank": 38, "Max Rank": 40},
636
- "overall": {"Avg Rank": 44, "Min Rank": 38, "Max Rank": 49}
637
- }},
638
-
639
- {"config": {
640
- "model_name": "vicuna-7b",
641
- "organization": "LMSYS",
642
- "license": "Non-commercial",
643
- "knowledge_cutoff": "2023-11"
644
- },
645
- "results": {
646
- "math-algebra": {"Score": 51.31450547, "Avg Rank": 50, "Min Rank": 50, "Max Rank": 50},
647
- "math-probability": {"Score": 52.72504618, "Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
648
- "reasoning-logical": {"Avg Rank": 47, "Min Rank": 47, "Max Rank": 47},
649
- "overall": {"Avg Rank": 48, "Min Rank": 47, "Max Rank": 50}
650
- }},
651
-
652
- {"config": {
653
- "model_name": "koala-13b",
654
- "organization": "UC Berkeley",
655
- "license": "Non-commercial",
656
- "knowledge_cutoff": "2023-10"
657
- },
658
- "results": {
659
- "math-algebra": {"Score": 50.19054677, "Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
660
- "math-probability": {"Score": 50.741989, "Avg Rank": 51, "Min Rank": 51, "Max Rank": 51},
661
- "reasoning-logical": {"Avg Rank": 49, "Min Rank": 49, "Max Rank": 49},
662
- "overall": {"Avg Rank": 50, "Min Rank": 49, "Max Rank": 51}
663
- }},
664
-
665
- {"config": {
666
- "model_name": "openassistant-pythia-12b",
667
- "organization": "OpenAssistant",
668
- "license": "Non-commercial",
669
- "knowledge_cutoff": "2023-09"
670
- },
671
- "results": {
672
- "math-algebra": {"Score": 50, "Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
673
- "math-probability": {"Score": 50, "Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
674
- "reasoning-logical": {"Avg Rank": 52, "Min Rank": 52, "Max Rank": 52},
675
- "overall": {"Avg Rank": 52, "Min Rank": 52, "Max Rank": 52}
676
- }}
677
- ]
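Side note on the deleted file above: despite the `.jsonl` extension it is a single JSON-like array, and some entries (see the o1-mini and o1-preview records) use Python-style `None` values, which strict `json.loads` rejects. If such a legacy dump ever needs to be read back, one minimal option is `ast.literal_eval`; the snippet below is an illustration only, not part of the repository.

```python
import ast


def load_legacy_results(path: str) -> list:
    # The legacy dump uses Python literals (including None), so parse it
    # with ast.literal_eval rather than json.loads.
    with open(path) as f:
        return ast.literal_eval(f.read())


# records = load_legacy_results("./src/results/models_2024-10-07-14:50:12.666068.jsonl")
# records[0]["config"]["model_name"]  # -> "ChatGPT-4o-latest (2024-09-03)"
```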
src/results/models_2024-10-08-03:10:26.811832.jsonl DELETED
@@ -1,1770 +0,0 @@
1
- [
2
- {
3
- "config": {
4
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
- "organization": "OpenAI",
6
- "license": "Proprietary",
7
- "knowledge_cutoff": "2023/10"
8
- },
9
- "results": {
10
- "OVERALL": {
11
- "Score": 0.974329609,
12
- "Standard Deviation": 0.005024959031
13
- },
14
- "Geometry": {
15
- "Score": 0.976028578,
16
- "Standard Deviation": 0.01507912373
17
- },
18
- "Algebra": {
19
- "Score": 0.951199453,
20
- "Standard Deviation": 0.08452452108
21
- },
22
- "Probability": {
23
- "Score": 0.842116641,
24
- "Standard Deviation": 0.006267759054
25
- },
26
- "Logical": {
27
- "Score": 0.828490728,
28
- "Standard Deviation": 0.009134213144
29
- },
30
- "Social": {
31
- "Score": 0.815902987,
32
- "Standard Deviation": 0.0196254222
33
- }
34
- }
35
- },
36
- {
37
- "config": {
38
- "model_name": "gpt-4o-2024-08-06",
39
- "organization": "OpenAI",
40
- "license": "Proprietary",
41
- "knowledge_cutoff": "2023/10"
42
- },
43
- "results": {
44
- "OVERALL": {
45
- "Score": 0.846571548,
46
- "Standard Deviation": 0.03394056554
47
- },
48
- "Geometry": {
49
- "Score": 0.99773096,
50
- "Standard Deviation": 0.002835555172
51
- },
52
- "Algebra": {
53
- "Score": 1.0,
54
- "Standard Deviation": 0.0
55
- },
56
- "Probability": {
57
- "Score": 0.78855795,
58
- "Standard Deviation": 0.008188675452
59
- },
60
- "Logical": {
61
- "Score": 0.668635768,
62
- "Standard Deviation": 0.03466314094
63
- },
64
- "Social": {
65
- "Score": 0.680417314,
66
- "Standard Deviation": 0.00656867063
67
- }
68
- }
69
- },
70
- {
71
- "config": {
72
- "model_name": "gpt-4o-2024-05-13",
73
- "organization": "OpenAI",
74
- "license": "Proprietary",
75
- "knowledge_cutoff": "2023/10"
76
- },
77
- "results": {
78
- "OVERALL": {
79
- "Score": 0.846334477,
80
- "Standard Deviation": 0.09377911572
81
- },
82
- "Geometry": {
83
- "Score": 0.972472377,
84
- "Standard Deviation": 0.01648274205
85
- },
86
- "Algebra": {
87
- "Score": 0.995511298,
88
- "Standard Deviation": 0.004097802515
89
- },
90
- "Probability": {
91
- "Score": 0.812149974,
92
- "Standard Deviation": 0.007669585485
93
- },
94
- "Logical": {
95
- "Score": 0.755019692,
96
- "Standard Deviation": 0.008149588572
97
- },
98
- "Social": {
99
- "Score": 0.609875087,
100
- "Standard Deviation": 0.038729239
101
- }
102
- }
103
- },
104
- {
105
- "config": {
106
- "model_name": "gpt-4-turbo-2024-04-09",
107
- "organization": "OpenAI",
108
- "license": "Proprietary",
109
- "knowledge_cutoff": "2023/12"
110
- },
111
- "results": {
112
- "OVERALL": {
113
- "Score": 0.855357972,
114
- "Standard Deviation": 0.1016986368
115
- },
116
- "Geometry": {
117
- "Score": 0.95374588,
118
- "Standard Deviation": 0.03109307166
119
- },
120
- "Algebra": {
121
- "Score": 0.930945223,
122
- "Standard Deviation": 0.06705136813
123
- },
124
- "Probability": {
125
- "Score": 0.750705448,
126
- "Standard Deviation": 0.05944483103
127
- },
128
- "Logical": {
129
- "Score": 0.77906699,
130
- "Standard Deviation": 0.007406734161
131
- },
132
- "Social": {
133
- "Score": 0.715935163,
134
- "Standard Deviation": 0.1209141409
135
- }
136
- }
137
- },
138
- {
139
- "config": {
140
- "model_name": "gemini-1.5-pro-001",
141
- "organization": "Google",
142
- "license": "Proprietary",
143
- "knowledge_cutoff": "2024-01"
144
- },
145
- "results": {
146
- "OVERALL": {
147
- "Score": 0.797187842,
148
- "Standard Deviation": 0.0272375249
149
- },
150
- "Geometry": {
151
- "Score": 0.9947169,
152
- "Standard Deviation": 0.009150597621
153
- },
154
- "Algebra": {
155
- "Score": 0.857464301,
156
- "Standard Deviation": 0.05014285338
157
- },
158
- "Probability": {
159
- "Score": 0.651781767,
160
- "Standard Deviation": 0.04156998547
161
- },
162
- "Logical": {
163
- "Score": 0.739745471,
164
- "Standard Deviation": 0.01631532019
165
- },
166
- "Social": {
167
- "Score": 0.649601885,
168
- "Standard Deviation": 0.104854889
169
- }
170
- }
171
- },
172
- {
173
- "config": {
174
- "model_name": "qwen2-72b-instruct",
175
- "organization": "Alibaba",
176
- "license": "Qianwen LICENSE",
177
- "knowledge_cutoff": "2024-02"
178
- },
179
- "results": {
180
- "OVERALL": {
181
- "Score": 0.737918558,
182
- "Standard Deviation": 0.09069077339
183
- },
184
- "Geometry": {
185
- "Score": 0.796870305,
186
- "Standard Deviation": 0.0509025346
187
- },
188
- "Algebra": {
189
- "Score": 0.836194231,
190
- "Standard Deviation": 0.04517093028
191
- },
192
- "Probability": {
193
- "Score": 0.788068004,
194
- "Standard Deviation": 0.007288989044
195
- },
196
- "Logical": {
197
- "Score": 0.619300904,
198
- "Standard Deviation": 0.06377931612
199
- },
200
- "Social": {
201
- "Score": 0.652578786,
202
- "Standard Deviation": 0.04259293171
203
- }
204
- }
205
- },
206
- {
207
- "config": {
208
- "model_name": "gpt-4o-mini-2024-07-18",
209
- "organization": "OpenAI",
210
- "license": "Proprietary",
211
- "knowledge_cutoff": "2024-07"
212
- },
213
- "results": {
214
- "OVERALL": {
215
- "Score": 0.847694133,
216
- "Standard Deviation": 0.02164304402
217
- },
218
- "Geometry": {
219
- "Score": 0.946650435,
220
- "Standard Deviation": 0.01831236482
221
- },
222
- "Algebra": {
223
- "Score": 0.796243022,
224
- "Standard Deviation": 0.05537539202
225
- },
226
- "Probability": {
227
- "Score": 0.798402685,
228
- "Standard Deviation": 0.009404491967
229
- },
230
- "Logical": {
231
- "Score": 0.727009735,
232
- "Standard Deviation": 0.02628110141
233
- },
234
- "Social": {
235
- "Score": 0.691949855,
236
- "Standard Deviation": 0.02072934333
237
- }
238
- }
239
- },
240
- {
241
- "config": {
242
- "model_name": "claude-3.5-sonnet",
243
- "organization": "Anthropic",
244
- "license": "Proprietary",
245
- "knowledge_cutoff": "2024-03"
246
- },
247
- "results": {
248
- "OVERALL": {
249
- "Score": 0.839004422,
250
- "Standard Deviation": 0.1461079564
251
- },
252
- "Geometry": {
253
- "Score": 0.95316419,
254
- "Standard Deviation": 0.02081192856
255
- },
256
- "Algebra": {
257
- "Score": 0.759789952,
258
- "Standard Deviation": 0.02611765096
259
- },
260
- "Probability": {
261
- "Score": 0.707730127,
262
- "Standard Deviation": 0.0394436664
263
- },
264
- "Logical": {
265
- "Score": 0.77342666,
266
- "Standard Deviation": 0.002892426458
267
- },
268
- "Social": {
269
- "Score": 0.790002247,
270
- "Standard Deviation": 0.1007410022
271
- }
272
- }
273
- },
274
- {
275
- "config": {
276
- "model_name": "o1-mini",
277
- "organization": "OpenAI",
278
- "license": "Proprietary",
279
- "knowledge_cutoff": "2024-01"
280
- },
281
- "results": {
282
- "OVERALL": {
283
- "Score": 1.0,
284
- "Standard Deviation": 0.0
285
- },
286
- "Geometry": {
287
- "Score": "N/A",
288
- "Standard Deviation": "N/A"
289
- },
290
- "Algebra": {
291
- "Score": "N/A",
292
- "Standard Deviation": "N/A"
293
- },
294
- "Probability": {
295
- "Score": 1.0,
296
- "Standard Deviation": 0.0
297
- },
298
- "Logical": {
299
- "Score": 1.0,
300
- "Standard Deviation": 0.0
301
- },
302
- "Social": {
303
- "Score": 0.993974241,
304
- "Standard Deviation": 0.001996882328
305
- }
306
- }
307
- },
308
- {
309
- "config": {
310
- "model_name": "o1-preview",
311
- "organization": "OpenAI",
312
- "license": "Proprietary",
313
- "knowledge_cutoff": "2024-01"
314
- },
315
- "results": {
316
- "OVERALL": {
317
- "Score": 0.945884589,
318
- "Standard Deviation": 0.01059250762
319
- },
320
- "Geometry": {
321
- "Score": "N/A",
322
- "Standard Deviation": "N/A"
323
- },
324
- "Algebra": {
325
- "Score": "N/A",
326
- "Standard Deviation": "N/A"
327
- },
328
- "Probability": {
329
- "Score": 0.964666392,
330
- "Standard Deviation": 0.003139983398
331
- },
332
- "Logical": {
333
- "Score": 0.987950057,
334
- "Standard Deviation": 0.004881220327
335
- },
336
- "Social": {
337
- "Score": 1.0,
338
- "Standard Deviation": 0.0
339
- }
340
- }
341
- },
342
- {
343
- "config": {
344
- "model_name": "gemini-1.5-flash-001",
345
- "organization": "Google",
346
- "license": "Proprietary",
347
- "knowledge_cutoff": "2024-02"
348
- },
349
- "results": {
350
- "OVERALL": {
351
- "Score": 0.726493401,
352
- "Standard Deviation": 0.01113913725
353
- },
354
- "Geometry": {
355
- "Score": 0.804144103,
356
- "Standard Deviation": 0.1327142178
357
- },
358
- "Algebra": {
359
- "Score": 0.731776765,
360
- "Standard Deviation": 0.02594657111
361
- },
362
- "Probability": {
363
- "Score": 0.614461891,
364
- "Standard Deviation": 0.04690131826
365
- },
366
- "Logical": {
367
- "Score": 0.630805991,
368
- "Standard Deviation": 0.04871350612
369
- },
370
- "Social": {
371
- "Score": 0.555933822,
372
- "Standard Deviation": 0.1029934524
373
- }
374
- }
375
- },
376
- {
377
- "config": {
378
- "model_name": "gpt4-1106",
379
- "organization": "OpenAI",
380
- "license": "Proprietary",
381
- "knowledge_cutoff": "2024-04"
382
- },
383
- "results": {
384
- "OVERALL": {
385
- "Score": 0.816347784,
386
- "Standard Deviation": 0.1566815755
387
- },
388
- "Geometry": {
389
- "Score": 0.71843088,
390
- "Standard Deviation": 0.04778038294
391
- },
392
- "Algebra": {
393
- "Score": 0.712910417,
394
- "Standard Deviation": 0.02581828898
395
- },
396
- "Probability": {
397
- "Score": 0.623947619,
398
- "Standard Deviation": 0.03502982933
399
- },
400
- "Logical": {
401
- "Score": 0.637482274,
402
- "Standard Deviation": 0.04158809888
403
- },
404
- "Social": {
405
- "Score": 0.450609816,
406
- "Standard Deviation": 0.05208655446
407
- }
408
- }
409
- },
410
- {
411
- "config": {
412
- "model_name": "gemma-2-27b-it",
413
- "organization": "Google",
414
- "license": "Gemma License",
415
- "knowledge_cutoff": "2024-03"
416
- },
417
- "results": {
418
- "OVERALL": {
419
- "Score": 0.624169623,
420
- "Standard Deviation": 0.1048365121
421
- },
422
- "Geometry": {
423
- "Score": 0.60112744,
424
- "Standard Deviation": 0.0469109952
425
- },
426
- "Algebra": {
427
- "Score": 0.687955914,
428
- "Standard Deviation": 0.01959958192
429
- },
430
- "Probability": {
431
- "Score": 0.589524771,
432
- "Standard Deviation": 0.03112689325
433
- },
434
- "Logical": {
435
- "Score": 0.614978944,
436
- "Standard Deviation": 0.05710657859
437
- },
438
- "Social": {
439
- "Score": 0.487844257,
440
- "Standard Deviation": 0.05857760809
441
- }
442
- }
443
- },
444
- {
445
- "config": {
446
- "model_name": "claude-3-opus",
447
- "organization": "Anthropic",
448
- "license": "Proprietary",
449
- "knowledge_cutoff": "2024-01"
450
- },
451
- "results": {
452
- "OVERALL": {
453
- "Score": 0.650636271,
454
- "Standard Deviation": 0.1197773541
455
- },
456
- "Geometry": {
457
- "Score": 0.7215743,
458
- "Standard Deviation": 0.04712598358
459
- },
460
- "Algebra": {
461
- "Score": 0.68777327,
462
- "Standard Deviation": 0.02382683713
463
- },
464
- "Probability": {
465
- "Score": 0.626471421,
466
- "Standard Deviation": 0.02911817976
467
- },
468
- "Logical": {
469
- "Score": 0.692346381,
470
- "Standard Deviation": 0.03617185198
471
- },
472
- "Social": {
473
- "Score": 0.663410854,
474
- "Standard Deviation": 0.09540220876
475
- }
476
- }
477
- },
478
- {
479
- "config": {
480
- "model_name": "gemma-2-9b-it-simpo",
481
- "organization": "Google",
482
- "license": "Gemma License",
483
- "knowledge_cutoff": "2024-02"
484
- },
485
- "results": {
486
- "OVERALL": {
487
- "Score": "N/A",
488
- "Standard Deviation": "N/A"
489
- },
490
- "Geometry": {
491
- "Score": 0.582787508,
492
- "Standard Deviation": 0.03965204074
493
- },
494
- "Algebra": {
495
- "Score": 0.658648133,
496
- "Standard Deviation": 0.02565919856
497
- },
498
- "Probability": {
499
- "Score": 0.547861265,
500
- "Standard Deviation": 0.02885209131
501
- },
502
- "Logical": {
503
- "Score": 0.540720893,
504
- "Standard Deviation": 0.01970134508
505
- },
506
- "Social": {
507
- "Score": 0.635266187,
508
- "Standard Deviation": 0.03620021751
509
- }
510
- }
511
- },
512
- {
513
- "config": {
514
- "model_name": "qwen1.5-72b-chat",
515
- "organization": "Alibaba",
516
- "license": "Qianwen LICENSE",
517
- "knowledge_cutoff": "2024-03"
518
- },
519
- "results": {
520
- "OVERALL": {
521
- "Score": 0.519549796,
522
- "Standard Deviation": 0.00903634343
523
- },
524
- "Geometry": {
525
- "Score": 0.543139301,
526
- "Standard Deviation": 0.03425202326
527
- },
528
- "Algebra": {
529
- "Score": 0.635228729,
530
- "Standard Deviation": 0.01944043425
531
- },
532
- "Probability": {
533
- "Score": 0.486948658,
534
- "Standard Deviation": 0.06064655315
535
- },
536
- "Logical": {
537
- "Score": 0.284069394,
538
- "Standard Deviation": 0.02686608506
539
- },
540
- "Social": {
541
- "Score": 0.415007627,
542
- "Standard Deviation": 0.03920053159
543
- }
544
- }
545
- },
546
- {
547
- "config": {
548
- "model_name": "qwen1.5-32b-chat",
549
- "organization": "Alibaba",
550
- "license": "Qianwen LICENSE",
551
- "knowledge_cutoff": "2024-03"
552
- },
553
- "results": {
554
- "OVERALL": {
555
- "Score": 0.393789407,
556
- "Standard Deviation": 0.05413770095
557
- },
558
- "Geometry": {
559
- "Score": 0.51086835,
560
- "Standard Deviation": 0.04052471998
561
- },
562
- "Algebra": {
563
- "Score": 0.609003168,
564
- "Standard Deviation": 0.04874143541
565
- },
566
- "Probability": {
567
- "Score": 0.476300002,
568
- "Standard Deviation": 0.05322403912
569
- },
570
- "Logical": {
571
- "Score": 0.331781014,
572
- "Standard Deviation": 0.004938997686
573
- },
574
- "Social": {
575
- "Score": 0.380987334,
576
- "Standard Deviation": 0.03762251776
577
- }
578
- }
579
- },
580
- {
581
- "config": {
582
- "model_name": "google-gemma-2-9b-it",
583
- "organization": "Google",
584
- "license": "Proprietary",
585
- "knowledge_cutoff": "2024-01"
586
- },
587
- "results": {
588
- "OVERALL": {
589
- "Score": 0.489663449,
590
- "Standard Deviation": 0.002595702019
591
- },
592
- "Geometry": {
593
- "Score": 0.575371308,
594
- "Standard Deviation": 0.03556220251
595
- },
596
- "Algebra": {
597
- "Score": 0.597045661,
598
- "Standard Deviation": 0.0313828123
599
- },
600
- "Probability": {
601
- "Score": 0.589221807,
602
- "Standard Deviation": 0.03110811656
603
- },
604
- "Logical": {
605
- "Score": 0.587579897,
606
- "Standard Deviation": 0.05512716783
607
- },
608
- "Social": {
609
- "Score": 0.768337958,
610
- "Standard Deviation": 0.04078610476
611
- }
612
- }
613
- },
614
- {
615
- "config": {
616
- "model_name": "yi-1.5-34b-chat",
617
- "organization": "01 AI",
618
- "license": "Proprietary",
619
- "knowledge_cutoff": "2024-01"
620
- },
621
- "results": {
622
- "OVERALL": {
623
- "Score": 0.607812897,
624
- "Standard Deviation": 0.1440881293
625
- },
626
- "Geometry": {
627
- "Score": 0.566666724,
628
- "Standard Deviation": 0.04001381658
629
- },
630
- "Algebra": {
631
- "Score": 0.590997292,
632
- "Standard Deviation": 0.03594087315
633
- },
634
- "Probability": {
635
- "Score": 0.589524589,
636
- "Standard Deviation": 0.03112618772
637
- },
638
- "Logical": {
639
- "Score": 0.574105508,
640
- "Standard Deviation": 0.03441737941
641
- },
642
- "Social": {
643
- "Score": 0.516980832,
644
- "Standard Deviation": 0.03369347985
645
- }
646
- }
647
- },
648
- {
649
- "config": {
650
- "model_name": "meta-llama-3.1-8b-instruct",
651
- "organization": "Meta",
652
- "license": "Llama 3.1 Community",
653
- "knowledge_cutoff": "2024-02"
654
- },
655
- "results": {
656
- "OVERALL": {
657
- "Score": 0.505936324,
658
- "Standard Deviation": 0.05286756493
659
- },
660
- "Geometry": {
661
- "Score": 0.522442162,
662
- "Standard Deviation": 0.03908236317
663
- },
664
- "Algebra": {
665
- "Score": 0.582702645,
666
- "Standard Deviation": 0.05002277711
667
- },
668
- "Probability": {
669
- "Score": 0.495001149,
670
- "Standard Deviation": 0.05244587037
671
- },
672
- "Logical": {
673
- "Score": 0.443030561,
674
- "Standard Deviation": 0.01343820628
675
- },
676
- "Social": {
677
- "Score": 0.329195941,
678
- "Standard Deviation": 0.03925019528
679
- }
680
- }
681
- },
682
- {
683
- "config": {
684
- "model_name": "gpt3.5-turbo-0125",
685
- "organization": "OpenAI",
686
- "license": "Proprietary",
687
- "knowledge_cutoff": "2023-12"
688
- },
689
- "results": {
690
- "OVERALL": {
691
- "Score": 0.313398088,
692
- "Standard Deviation": 0.09322528606
693
- },
694
- "Geometry": {
695
- "Score": 0.678714519,
696
- "Standard Deviation": 0.05926546762
697
- },
698
- "Algebra": {
699
- "Score": 0.569296173,
700
- "Standard Deviation": 0.05277281097
701
- },
702
- "Probability": {
703
- "Score": 0.448460767,
704
- "Standard Deviation": 0.05768095196
705
- },
706
- "Logical": {
707
- "Score": 0.148521348,
708
- "Standard Deviation": 0.04033712907
709
- },
710
- "Social": {
711
- "Score": 0.235071541,
712
- "Standard Deviation": 0.02632892457
713
- }
714
- }
715
- },
716
- {
717
- "config": {
718
- "model_name": "llama-3-70b-instruct",
719
- "organization": "Meta",
720
- "license": "Llama 3 Community",
721
- "knowledge_cutoff": "2024-03"
722
- },
723
- "results": {
724
- "OVERALL": {
725
- "Score": 0.456689885,
726
- "Standard Deviation": 0.01385989995
727
- },
728
- "Geometry": {
729
- "Score": 0.516865529,
730
- "Standard Deviation": 0.03858112564
731
- },
732
- "Algebra": {
733
- "Score": 0.566756531,
734
- "Standard Deviation": 0.03369826926
735
- },
736
- "Probability": {
737
- "Score": 0.513857306,
738
- "Standard Deviation": 0.05453699062
739
- },
740
- "Logical": {
741
- "Score": 0.713796415,
742
- "Standard Deviation": 0.02031215107
743
- },
744
- "Social": {
745
- "Score": 0.45872939,
746
- "Standard Deviation": 0.05347039576
747
- }
748
- }
749
- },
750
- {
751
- "config": {
752
- "model_name": "claude-3-sonnet",
753
- "organization": "Anthropic",
754
- "license": "Proprietary",
755
- "knowledge_cutoff": "2024-02"
756
- },
757
- "results": {
758
- "OVERALL": {
759
- "Score": 0.520010833,
760
- "Standard Deviation": 0.005030563799
761
- },
762
- "Geometry": {
763
- "Score": 0.675613638,
764
- "Standard Deviation": 0.05275594408
765
- },
766
- "Algebra": {
767
- "Score": 0.552025728,
768
- "Standard Deviation": 0.04122192409
769
- },
770
- "Probability": {
771
- "Score": 0.516192848,
772
- "Standard Deviation": 0.04152293217
773
- },
774
- "Logical": {
775
- "Score": 0.588545747,
776
- "Standard Deviation": 0.06068211943
777
- },
778
- "Social": {
779
- "Score": 0.570437582,
780
- "Standard Deviation": 0.08607040862
781
- }
782
- }
783
- },
784
- {
785
- "config": {
786
- "model_name": "qwen1.5-14b-chat",
787
- "organization": "Alibaba",
788
- "license": "Qianwen LICENSE",
789
- "knowledge_cutoff": "2024-01"
790
- },
791
- "results": {
792
- "OVERALL": {
793
- "Score": 0.415328996,
794
- "Standard Deviation": 0.0743938717
795
- },
796
- "Geometry": {
797
- "Score": 0.452504016,
798
- "Standard Deviation": 0.04225594393
799
- },
800
- "Algebra": {
801
- "Score": 0.538655725,
802
- "Standard Deviation": 0.03721542594
803
- },
804
- "Probability": {
805
- "Score": 0.397185975,
806
- "Standard Deviation": 0.05607695946
807
- },
808
- "Logical": {
809
- "Score": 0.264573129,
810
- "Standard Deviation": 0.03936133174
811
- },
812
- "Social": {
813
- "Score": 0.287370142,
814
- "Standard Deviation": 0.04264085315
815
- }
816
- }
817
- },
818
- {
819
- "config": {
820
- "model_name": "claude-3-haiku",
821
- "organization": "Anthropic",
822
- "license": "Proprietary",
823
- "knowledge_cutoff": "2024-01"
824
- },
825
- "results": {
826
- "OVERALL": {
827
- "Score": 0.453901163,
828
- "Standard Deviation": 0.003604084261
829
- },
830
- "Geometry": {
831
- "Score": 0.607993912,
832
- "Standard Deviation": 0.05793460748
833
- },
834
- "Algebra": {
835
- "Score": 0.520054055,
836
- "Standard Deviation": 0.03333544511
837
- },
838
- "Probability": {
839
- "Score": 0.474460688,
840
- "Standard Deviation": 0.0446501933
841
- },
842
- "Logical": {
843
- "Score": 0.512815976,
844
- "Standard Deviation": 0.0163264281
845
- },
846
- "Social": {
847
- "Score": 0.551083976,
848
- "Standard Deviation": 0.05374722539
849
- }
850
- }
851
- },
852
- {
853
- "config": {
854
- "model_name": "claude-2.1",
855
- "organization": "Anthropic",
856
- "license": "Proprietary",
857
- "knowledge_cutoff": "2023-12"
858
- },
859
- "results": {
860
- "OVERALL": {
861
- "Score": 0.35814708,
862
- "Standard Deviation": 0.09168134168
863
- },
864
- "Geometry": {
865
- "Score": 0.62752395,
866
- "Standard Deviation": 0.07232659398
867
- },
868
- "Algebra": {
869
- "Score": 0.508849609,
870
- "Standard Deviation": 0.0346897465
871
- },
872
- "Probability": {
873
- "Score": 0.41477086,
874
- "Standard Deviation": 0.05964060239
875
- },
876
- "Logical": {
877
- "Score": 0.482923674,
878
- "Standard Deviation": 0.01989147048
879
- },
880
- "Social": {
881
- "Score": 0.333804568,
882
- "Standard Deviation": 0.03775548253
883
- }
884
- }
885
- },
886
- {
887
- "config": {
888
- "model_name": "mistral-8x7b-instruct-v0.1",
889
- "organization": "Mistral",
890
- "license": "Apache 2.0",
891
- "knowledge_cutoff": "2023-12"
892
- },
893
- "results": {
894
- "OVERALL": {
895
- "Score": 0.382659161,
896
- "Standard Deviation": 0.07594496929
897
- },
898
- "Geometry": {
899
- "Score": 0.432216097,
900
- "Standard Deviation": 0.04747949254
901
- },
902
- "Algebra": {
903
- "Score": 0.478314888,
904
- "Standard Deviation": 0.01998797419
905
- },
906
- "Probability": {
907
- "Score": 0.427144725,
908
- "Standard Deviation": 0.0590923329
909
- },
910
- "Logical": {
911
- "Score": 0.340041983,
912
- "Standard Deviation": 0.008397574592
913
- },
914
- "Social": {
915
- "Score": 0.251949622,
916
- "Standard Deviation": 0.03346674405
917
- }
918
- }
919
- },
920
- {
921
- "config": {
922
- "model_name": "claude-2.0",
923
- "organization": "Anthropic",
924
- "license": "Proprietary",
925
- "knowledge_cutoff": "2023-10"
926
- },
927
- "results": {
928
- "OVERALL": {
929
- "Score": 0.322718057,
930
- "Standard Deviation": 0.08369883584
931
- },
932
- "Geometry": {
933
- "Score": 0.604141967,
934
- "Standard Deviation": 0.05116441826
935
- },
936
- "Algebra": {
937
- "Score": 0.474350734,
938
- "Standard Deviation": 0.01510393066
939
- },
940
- "Probability": {
941
- "Score": 0.437950412,
942
- "Standard Deviation": 0.05985594317
943
- },
944
- "Logical": {
945
- "Score": 0.445620646,
946
- "Standard Deviation": 0.01812614805
947
- },
948
- "Social": {
949
- "Score": 0.469422836,
950
- "Standard Deviation": 0.05999901796
951
- }
952
- }
953
- },
954
- {
955
- "config": {
956
- "model_name": "starling-lm-7b-beta",
957
- "organization": "Nexusflow",
958
- "license": "Apache-2.0",
959
- "knowledge_cutoff": "2024-01"
960
- },
961
- "results": {
962
- "OVERALL": {
963
- "Score": 0.479391856,
964
- "Standard Deviation": 0.04199990887
965
- },
966
- "Geometry": {
967
- "Score": 0.446654388,
968
- "Standard Deviation": 0.05637864999
969
- },
970
- "Algebra": {
971
- "Score": 0.473952749,
972
- "Standard Deviation": 0.01584301288
973
- },
974
- "Probability": {
975
- "Score": 0.395197837,
976
- "Standard Deviation": 0.05814798892
977
- },
978
- "Logical": {
979
- "Score": 0.39927199,
980
- "Standard Deviation": 0.02125277518
981
- },
982
- "Social": {
983
- "Score": 0.380021662,
984
- "Standard Deviation": 0.04622452748
985
- }
986
- }
987
- },
988
- {
989
- "config": {
990
- "model_name": "gemini-1.0-pro-001",
991
- "organization": "Google",
992
- "license": "Proprietary",
993
- "knowledge_cutoff": "2023-11"
994
- },
995
- "results": {
996
- "OVERALL": {
997
- "Score": 0.449040654,
998
- "Standard Deviation": 0.0450610177
999
- },
1000
- "Geometry": {
1001
- "Score": 0.578347959,
1002
- "Standard Deviation": 0.04242873607
1003
- },
1004
- "Algebra": {
1005
- "Score": 0.462417786,
1006
- "Standard Deviation": 0.01668313635
1007
- },
1008
- "Probability": {
1009
- "Score": 0.289836324,
1010
- "Standard Deviation": 0.05739831115
1011
- },
1012
- "Logical": {
1013
- "Score": 0.191140355,
1014
- "Standard Deviation": 0.03394652499
1015
- },
1016
- "Social": {
1017
- "Score": 0.130790863,
1018
- "Standard Deviation": 0.02800188173
1019
- }
1020
- }
1021
- },
1022
- {
1023
- "config": {
1024
- "model_name": "openchat-3.5-0106",
1025
- "organization": "OpenChat",
1026
- "license": "Apache-2.0",
1027
- "knowledge_cutoff": "2024-01"
1028
- },
1029
- "results": {
1030
- "OVERALL": {
1031
- "Score": 0.363929888,
1032
- "Standard Deviation": 0.08602347145
1033
- },
1034
- "Geometry": {
1035
- "Score": 0.38715246,
1036
- "Standard Deviation": 0.03701851946
1037
- },
1038
- "Algebra": {
1039
- "Score": 0.441233712,
1040
- "Standard Deviation": 0.01135753754
1041
- },
1042
- "Probability": {
1043
- "Score": 0.38802618,
1044
- "Standard Deviation": 0.05663879714
1045
- },
1046
- "Logical": {
1047
- "Score": 0.336754383,
1048
- "Standard Deviation": 0.01608478079
1049
- },
1050
- "Social": {
1051
- "Score": 0.250891608,
1052
- "Standard Deviation": 0.03253769914
1053
- }
1054
- }
1055
- },
1056
- {
1057
- "config": {
1058
- "model_name": "openchat-3.5",
1059
- "organization": "OpenChat",
1060
- "license": "Apache-2.0",
1061
- "knowledge_cutoff": "2023-12"
1062
- },
1063
- "results": {
1064
- "OVERALL": {
1065
- "Score": 0.361341296,
1066
- "Standard Deviation": 0.09034869493
1067
- },
1068
- "Geometry": {
1069
- "Score": 0.401699069,
1070
- "Standard Deviation": 0.03410726557
1071
- },
1072
- "Algebra": {
1073
- "Score": 0.414095336,
1074
- "Standard Deviation": 0.01881964261
1075
- },
1076
- "Probability": {
1077
- "Score": 0.349601002,
1078
- "Standard Deviation": 0.05077455539
1079
- },
1080
- "Logical": {
1081
- "Score": 0.331069242,
1082
- "Standard Deviation": 0.02180827173
1083
- },
1084
- "Social": {
1085
- "Score": 0.319991655,
1086
- "Standard Deviation": 0.04502478724
1087
- }
1088
- }
1089
- },
1090
- {
1091
- "config": {
1092
- "model_name": "command-r-(08-2024)",
1093
- "organization": "Cohere",
1094
- "license": "CC-BY-NC-4.0",
1095
- "knowledge_cutoff": "2024-08"
1096
- },
1097
- "results": {
1098
- "OVERALL": {
1099
- "Score": 0.427605298,
1100
- "Standard Deviation": 0.01747449163
1101
- },
1102
- "Geometry": {
1103
- "Score": 0.448300727,
1104
- "Standard Deviation": 0.04996362328
1105
- },
1106
- "Algebra": {
1107
- "Score": 0.417519167,
1108
- "Standard Deviation": 0.01822196902
1109
- },
1110
- "Probability": {
1111
- "Score": 0.366336281,
1112
- "Standard Deviation": 0.04716826942
1113
- },
1114
- "Logical": {
1115
- "Score": 0.214657906,
1116
- "Standard Deviation": 0.03003579835
1117
- },
1118
- "Social": {
1119
- "Score": 0.276088379,
1120
- "Standard Deviation": 0.03295234688
1121
- }
1122
- }
1123
- },
1124
- {
1125
- "config": {
1126
- "model_name": "gemma-1.1-7b-it",
1127
- "organization": "Google",
1128
- "license": "Gemma License",
1129
- "knowledge_cutoff": "2023-11"
1130
- },
1131
- "results": {
1132
- "OVERALL": {
1133
- "Score": 0.339506922,
1134
- "Standard Deviation": 0.1066279108
1135
- },
1136
- "Geometry": {
1137
- "Score": 0.324170977,
1138
- "Standard Deviation": 0.04668553765
1139
- },
1140
- "Algebra": {
1141
- "Score": 0.398684697,
1142
- "Standard Deviation": 0.01982398259
1143
- },
1144
- "Probability": {
1145
- "Score": 0.293253175,
1146
- "Standard Deviation": 0.05126192191
1147
- },
1148
- "Logical": {
1149
- "Score": 0.317750796,
1150
- "Standard Deviation": 0.01101933543
1151
- },
1152
- "Social": {
1153
- "Score": 0.179073276,
1154
- "Standard Deviation": 0.02009658805
1155
- }
1156
- }
1157
- },
1158
- {
1159
- "config": {
1160
- "model_name": "llama3-8b-instruct",
1161
- "organization": "Meta",
1162
- "license": "Llama 3 Community",
1163
- "knowledge_cutoff": "2024-01"
1164
- },
1165
- "results": {
1166
- "OVERALL": {
1167
- "Score": 0.367722676,
1168
- "Standard Deviation": 0.1071368221
1169
- },
1170
- "Geometry": {
1171
- "Score": 0.367143758,
1172
- "Standard Deviation": 0.04363680358
1173
- },
1174
- "Algebra": {
1175
- "Score": 0.391480973,
1176
- "Standard Deviation": 0.02757445266
1177
- },
1178
- "Probability": {
1179
- "Score": 0.317616445,
1180
- "Standard Deviation": 0.04300430361
1181
- },
1182
- "Logical": {
1183
- "Score": 0.461607495,
1184
- "Standard Deviation": 0.02185028842
1185
- },
1186
- "Social": {
1187
- "Score": 0.336373622,
1188
- "Standard Deviation": 0.05762408512
1189
- }
1190
- }
1191
- },
1192
- {
1193
- "config": {
1194
- "model_name": "gemma-2-2b-it",
1195
- "organization": "Google",
1196
- "license": "Gemma License",
1197
- "knowledge_cutoff": "2023-12"
1198
- },
1199
- "results": {
1200
- "OVERALL": {
1201
- "Score": 0.502167612,
1202
- "Standard Deviation": 0.04389786763
1203
- },
1204
- "Geometry": {
1205
- "Score": 0.395006676,
1206
- "Standard Deviation": 0.05882607713
1207
- },
1208
- "Algebra": {
1209
- "Score": 0.379391887,
1210
- "Standard Deviation": 0.01722410785
1211
- },
1212
- "Probability": {
1213
- "Score": 0.331231097,
1214
- "Standard Deviation": 0.05392499987
1215
- },
1216
- "Logical": {
1217
- "Score": 0.367687789,
1218
- "Standard Deviation": 0.02547968808
1219
- },
1220
- "Social": {
1221
- "Score": 0.393482094,
1222
- "Standard Deviation": 0.06450214024
1223
- }
1224
- }
1225
- },
1226
- {
1227
- "config": {
1228
- "model_name": "starling-lm-7b-alpha",
1229
- "organization": "Nexusflow",
1230
- "license": "Apache-2.0",
1231
- "knowledge_cutoff": "2023-12"
1232
- },
1233
- "results": {
1234
- "OVERALL": {
1235
- "Score": 0.366628765,
1236
- "Standard Deviation": 0.08405492929
1237
- },
1238
- "Geometry": {
1239
- "Score": 0.336782578,
1240
- "Standard Deviation": 0.04069449132
1241
- },
1242
- "Algebra": {
1243
- "Score": 0.371551932,
1244
- "Standard Deviation": 0.03367241745
1245
- },
1246
- "Probability": {
1247
- "Score": 0.331472505,
1248
- "Standard Deviation": 0.04833324282
1249
- },
1250
- "Logical": {
1251
- "Score": 0.260869624,
1252
- "Standard Deviation": 0.03562735237
1253
- },
1254
- "Social": {
1255
- "Score": 0.271975534,
1256
- "Standard Deviation": 0.04266753408
1257
- }
1258
- }
1259
- },
1260
- {
1261
- "config": {
1262
- "model_name": "qwen1.5-4b-chat",
1263
- "organization": "Alibaba",
1264
- "license": "Qianwen LICENSE",
1265
- "knowledge_cutoff": "2024-02"
1266
- },
1267
- "results": {
1268
- "OVERALL": {
1269
- "Score": 0.111876411,
1270
- "Standard Deviation": 0.04241022785
1271
- },
1272
- "Geometry": {
1273
- "Score": 0.215834522,
1274
- "Standard Deviation": 0.0363766363
1275
- },
1276
- "Algebra": {
1277
- "Score": 0.305589811,
1278
- "Standard Deviation": 0.02354198912
1279
- },
1280
- "Probability": {
1281
- "Score": 0.149365327,
1282
- "Standard Deviation": 0.03489672675
1283
- },
1284
- "Logical": {
1285
- "Score": 0.116210168,
1286
- "Standard Deviation": 0.005927966496
1287
- },
1288
- "Social": {
1289
- "Score": 0.18195615,
1290
- "Standard Deviation": 0.02269805277
1291
- }
1292
- }
1293
- },
1294
- {
1295
- "config": {
1296
- "model_name": "command-r-(04-2024)",
1297
- "organization": "Cohere",
1298
- "license": "CC-BY-NC-4.0",
1299
- "knowledge_cutoff": "2024-04"
1300
- },
1301
- "results": {
1302
- "OVERALL": {
1303
- "Score": 0.388783887,
1304
- "Standard Deviation": 0.07417186783
1305
- },
1306
- "Geometry": {
1307
- "Score": 0.300416698,
1308
- "Standard Deviation": 0.03485612736
1309
- },
1310
- "Algebra": {
1311
- "Score": 0.293120231,
1312
- "Standard Deviation": 0.032926484
1313
- },
1314
- "Probability": {
1315
- "Score": 0.281271304,
1316
- "Standard Deviation": 0.05697149867
1317
- },
1318
- "Logical": {
1319
- "Score": 0.276189906,
1320
- "Standard Deviation": 0.03562914754
1321
- },
1322
- "Social": {
1323
- "Score": 0.283882949,
1324
- "Standard Deviation": 0.03336901148
1325
- }
1326
- }
1327
- },
1328
- {
1329
- "config": {
1330
- "model_name": "vicuna-33b",
1331
- "organization": "LMSYS",
1332
- "license": "Non-commercial",
1333
- "knowledge_cutoff": "2023-12"
1334
- },
1335
- "results": {
1336
- "OVERALL": {
1337
- "Score": 0.316543555,
1338
- "Standard Deviation": 0.08922095647
1339
- },
1340
- "Geometry": {
1341
- "Score": 0.208284679,
1342
- "Standard Deviation": 0.03937771461
1343
- },
1344
- "Algebra": {
1345
- "Score": 0.248994048,
1346
- "Standard Deviation": 0.02668175054
1347
- },
1348
- "Probability": {
1349
- "Score": 0.222313995,
1350
- "Standard Deviation": 0.03978859759
1351
- },
1352
- "Logical": {
1353
- "Score": 0.180291222,
1354
- "Standard Deviation": 0.021886267
1355
- },
1356
- "Social": {
1357
- "Score": 0.257623798,
1358
- "Standard Deviation": 0.02653724437
1359
- }
1360
- }
1361
- },
1362
- {
1363
- "config": {
1364
- "model_name": "gemma-7b-it",
1365
- "organization": "Google",
1366
- "license": "Gemma License",
1367
- "knowledge_cutoff": "2023-12"
1368
- },
1369
- "results": {
1370
- "OVERALL": {
1371
- "Score": 0.285077558,
1372
- "Standard Deviation": 0.08871758453
1373
- },
1374
- "Geometry": {
1375
- "Score": 0.244791417,
1376
- "Standard Deviation": 0.0289612078
1377
- },
1378
- "Algebra": {
1379
- "Score": 0.250614794,
1380
- "Standard Deviation": 0.01991678295
1381
- },
1382
- "Probability": {
1383
- "Score": 0.174313053,
1384
- "Standard Deviation": 0.03765424728
1385
- },
1386
- "Logical": {
1387
- "Score": 0.197505536,
1388
- "Standard Deviation": 0.02050298885
1389
- },
1390
- "Social": {
1391
- "Score": 0.202138025,
1392
- "Standard Deviation": 0.02098346639
1393
- }
1394
- }
1395
- },
1396
- {
1397
- "config": {
1398
- "model_name": "mistral-7b-instruct-2",
1399
- "organization": "Mistral",
1400
- "license": "Apache 2.0",
1401
- "knowledge_cutoff": "2023-12"
1402
- },
1403
- "results": {
1404
- "OVERALL": {
1405
- "Score": 0.427513868,
1406
- "Standard Deviation": 0.05553921135
1407
- },
1408
- "Geometry": {
1409
- "Score": 0.216402626,
1410
- "Standard Deviation": 0.03338414918
1411
- },
1412
- "Algebra": {
1413
- "Score": 0.233777838,
1414
- "Standard Deviation": 0.0155226054
1415
- },
1416
- "Probability": {
1417
- "Score": 0.25118175,
1418
- "Standard Deviation": 0.04065514593
1419
- },
1420
- "Logical": {
1421
- "Score": 0.224469136,
1422
- "Standard Deviation": 0.03404706752
1423
- },
1424
- "Social": {
1425
- "Score": 0.209386782,
1426
- "Standard Deviation": 0.02738569921
1427
- }
1428
- }
1429
- },
1430
- {
1431
- "config": {
1432
- "model_name": "mistral-7b-instruct-1",
1433
- "organization": "Mistral",
1434
- "license": "Apache 2.0",
1435
- "knowledge_cutoff": "2023-12"
1436
- },
1437
- "results": {
1438
- "OVERALL": {
1439
- "Score": 0.23016314,
1440
- "Standard Deviation": 0.07137625271
1441
- },
1442
- "Geometry": {
1443
- "Score": 0.161799938,
1444
- "Standard Deviation": 0.03595278559
1445
- },
1446
- "Algebra": {
1447
- "Score": 0.210341624,
1448
- "Standard Deviation": 0.01736539119
1449
- },
1450
- "Probability": {
1451
- "Score": 0.238417922,
1452
- "Standard Deviation": 0.03744211933
1453
- },
1454
- "Logical": {
1455
- "Score": 0.142636601,
1456
- "Standard Deviation": 0.02080406365
1457
- },
1458
- "Social": {
1459
- "Score": 0.117646827,
1460
- "Standard Deviation": 0.009321202779
1461
- }
1462
- }
1463
- },
1464
- {
1465
- "config": {
1466
- "model_name": "vicuna-13b",
1467
- "organization": "LMSYS",
1468
- "license": "Non-commercial",
1469
- "knowledge_cutoff": "2023-11"
1470
- },
1471
- "results": {
1472
- "OVERALL": {
1473
- "Score": 0.201892849,
1474
- "Standard Deviation": 0.06021749802
1475
- },
1476
- "Geometry": {
1477
- "Score": 0.200941928,
1478
- "Standard Deviation": 0.03366817781
1479
- },
1480
- "Algebra": {
1481
- "Score": 0.196123323,
1482
- "Standard Deviation": 0.0135715643
1483
- },
1484
- "Probability": {
1485
- "Score": 0.141214079,
1486
- "Standard Deviation": 0.02721328211
1487
- },
1488
- "Logical": {
1489
- "Score": 0.148598631,
1490
- "Standard Deviation": 0.02241523892
1491
- },
1492
- "Social": {
1493
- "Score": 0.124655135,
1494
- "Standard Deviation": 0.01122382671
1495
- }
1496
- }
1497
- },
1498
- {
1499
- "config": {
1500
- "model_name": "zephyr-7b-beta",
1501
- "organization": "HuggingFace",
1502
- "license": "MIT",
1503
- "knowledge_cutoff": "2023-10"
1504
- },
1505
- "results": {
1506
- "OVERALL": {
1507
- "Score": 0.102705119,
1508
- "Standard Deviation": 0.03683757312
1509
- },
1510
- "Geometry": {
1511
- "Score": 0.114005544,
1512
- "Standard Deviation": 0.03144354365
1513
- },
1514
- "Algebra": {
1515
- "Score": 0.141766633,
1516
- "Standard Deviation": 0.03179520129
1517
- },
1518
- "Probability": {
1519
- "Score": 0.089050714,
1520
- "Standard Deviation": 0.002136754266
1521
- },
1522
- "Logical": {
1523
- "Score": 0.069520789,
1524
- "Standard Deviation": 0.004477840857
1525
- },
1526
- "Social": {
1527
- "Score": 0.0,
1528
- "Standard Deviation": 0.0
1529
- }
1530
- }
1531
- },
1532
- {
1533
- "config": {
1534
- "model_name": "gemma-1.1-2b-it",
1535
- "organization": "Google",
1536
- "license": "Gemma License",
1537
- "knowledge_cutoff": "2023-12"
1538
- },
1539
- "results": {
1540
- "OVERALL": {
1541
- "Score": 0.257700845,
1542
- "Standard Deviation": 0.07369021445
1543
- },
1544
- "Geometry": {
1545
- "Score": 0.183974034,
1546
- "Standard Deviation": 0.0215548886
1547
- },
1548
- "Algebra": {
1549
- "Score": 0.13422252,
1550
- "Standard Deviation": 0.01922819511
1551
- },
1552
- "Probability": {
1553
- "Score": 0.095628657,
1554
- "Standard Deviation": 0.007536076456
1555
- },
1556
- "Logical": {
1557
- "Score": 0.094965074,
1558
- "Standard Deviation": 0.005019175487
1559
- },
1560
- "Social": {
1561
- "Score": 0.167796727,
1562
- "Standard Deviation": 0.01666541942
1563
- }
1564
- }
1565
- },
1566
- {
1567
- "config": {
1568
- "model_name": "llama2-7b-chat",
1569
- "organization": "Meta",
1570
- "license": "Llama 2 Community",
1571
- "knowledge_cutoff": "2023-10"
1572
- },
1573
- "results": {
1574
- "OVERALL": {
1575
- "Score": 0.260189428,
1576
- "Standard Deviation": 0.08019299364
1577
- },
1578
- "Geometry": {
1579
- "Score": 0.087067276,
1580
- "Standard Deviation": 0.04274343402
1581
- },
1582
- "Algebra": {
1583
- "Score": 0.12308805,
1584
- "Standard Deviation": 0.01856053622
1585
- },
1586
- "Probability": {
1587
- "Score": 0.087515438,
1588
- "Standard Deviation": 0.006315053573
1589
- },
1590
- "Logical": {
1591
- "Score": 0.17312827,
1592
- "Standard Deviation": 0.01867044092
1593
- },
1594
- "Social": {
1595
- "Score": 0.152905272,
1596
- "Standard Deviation": 0.007166957097
1597
- }
1598
- }
1599
- },
1600
- {
1601
- "config": {
1602
- "model_name": "gemma-2b-it",
1603
- "organization": "Google",
1604
- "license": "Gemma License",
1605
- "knowledge_cutoff": "2023-11"
1606
- },
1607
- "results": {
1608
- "OVERALL": {
1609
- "Score": 0.234172069,
1610
- "Standard Deviation": 0.06522685718
1611
- },
1612
- "Geometry": {
1613
- "Score": 0.198571153,
1614
- "Standard Deviation": 0.01699161031
1615
- },
1616
- "Algebra": {
1617
- "Score": 0.109883009,
1618
- "Standard Deviation": 0.01520005833
1619
- },
1620
- "Probability": {
1621
- "Score": 0.06467432,
1622
- "Standard Deviation": 0.002117497231
1623
- },
1624
- "Logical": {
1625
- "Score": 0.039624492,
1626
- "Standard Deviation": 0.007606972686
1627
- },
1628
- "Social": {
1629
- "Score": 0.087452913,
1630
- "Standard Deviation": 0.008170146562
1631
- }
1632
- }
1633
- },
1634
- {
1635
- "config": {
1636
- "model_name": "llama2-13b-chat",
1637
- "organization": "Meta",
1638
- "license": "Llama 2 Community",
1639
- "knowledge_cutoff": "2023-12"
1640
- },
1641
- "results": {
1642
- "OVERALL": {
1643
- "Score": 0.263305684,
1644
- "Standard Deviation": 0.07283640689
1645
- },
1646
- "Geometry": {
1647
- "Score": 0.072729954,
1648
- "Standard Deviation": 0.02315988261
1649
- },
1650
- "Algebra": {
1651
- "Score": 0.080371692,
1652
- "Standard Deviation": 0.01277569453
1653
- },
1654
- "Probability": {
1655
- "Score": 0.117757344,
1656
- "Standard Deviation": 0.02418619619
1657
- },
1658
- "Logical": {
1659
- "Score": 0.193149889,
1660
- "Standard Deviation": 0.01776690764
1661
- },
1662
- "Social": {
1663
- "Score": 0.149125922,
1664
- "Standard Deviation": 0.01157416827
1665
- }
1666
- }
1667
- },
1668
- {
1669
- "config": {
1670
- "model_name": "vicuna-7b",
1671
- "organization": "LMSYS",
1672
- "license": "Non-commercial",
1673
- "knowledge_cutoff": "2023-11"
1674
- },
1675
- "results": {
1676
- "OVERALL": {
1677
- "Score": 0.198839786,
1678
- "Standard Deviation": 0.05725381576
1679
- },
1680
- "Geometry": {
1681
- "Score": 0.083457058,
1682
- "Standard Deviation": 0.02520989111
1683
- },
1684
- "Algebra": {
1685
- "Score": 0.070883882,
1686
- "Standard Deviation": 0.007315853253
1687
- },
1688
- "Probability": {
1689
- "Score": 0.080987673,
1690
- "Standard Deviation": 0.005474288861
1691
- },
1692
- "Logical": {
1693
- "Score": 0.100065588,
1694
- "Standard Deviation": 0.003561886452
1695
- },
1696
- "Social": {
1697
- "Score": 0.111076414,
1698
- "Standard Deviation": 0.004805626512
1699
- }
1700
- }
1701
- },
1702
- {
1703
- "config": {
1704
- "model_name": "koala-13b",
1705
- "organization": "UC Berkeley",
1706
- "license": "Non-commercial",
1707
- "knowledge_cutoff": "2023-10"
1708
- },
1709
- "results": {
1710
- "OVERALL": {
1711
- "Score": 0.09387188,
1712
- "Standard Deviation": 0.02642167489
1713
- },
1714
- "Geometry": {
1715
- "Score": 0.017374001,
1716
- "Standard Deviation": 0.01747053557
1717
- },
1718
- "Algebra": {
1719
- "Score": 0.018129197,
1720
- "Standard Deviation": 0.01054371383
1721
- },
1722
- "Probability": {
1723
- "Score": 0.043654362,
1724
- "Standard Deviation": 0.004288231886
1725
- },
1726
- "Logical": {
1727
- "Score": 0.074694053,
1728
- "Standard Deviation": 0.002674646998
1729
- },
1730
- "Social": {
1731
- "Score": 0.096983835,
1732
- "Standard Deviation": 0.007847059783
1733
- }
1734
- }
1735
- },
1736
- {
1737
- "config": {
1738
- "model_name": "openassistant-pythia-12b",
1739
- "organization": "OpenAssistant",
1740
- "license": "Non-commercial",
1741
- "knowledge_cutoff": "2023-09"
1742
- },
1743
- "results": {
1744
- "OVERALL": {
1745
- "Score": 0.0,
1746
- "Standard Deviation": 0.0
1747
- },
1748
- "Geometry": {
1749
- "Score": 0.0,
1750
- "Standard Deviation": 0.0
1751
- },
1752
- "Algebra": {
1753
- "Score": 0.0,
1754
- "Standard Deviation": 0.0
1755
- },
1756
- "Probability": {
1757
- "Score": 0.0,
1758
- "Standard Deviation": 0.0
1759
- },
1760
- "Logical": {
1761
- "Score": 0.0,
1762
- "Standard Deviation": 0.0
1763
- },
1764
- "Social": {
1765
- "Score": 0.030792528,
1766
- "Standard Deviation": 0.007518796391
1767
- }
1768
- }
1769
- }
1770
- ]
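Note on the deleted snapshot above: each record is an object with a "config" block (model_name, organization, license, knowledge_cutoff) and a "results" block mapping category names (OVERALL, Geometry, Algebra, Probability, Logical, Social) to a score and a standard deviation; the older snapshot deleted further below additionally carries "Average Score" and "Rank" fields. A minimal sketch of how such a snapshot could be flattened into one row per model is given here. The helper name, column labels, and the pandas dependency are illustrative assumptions, not part of the repository code.

import json

import pandas as pd  # assumed available; illustrative only


def load_results_snapshot(path):
    """Flatten a results snapshot (list of {config, results} records) into one row per model.

    Illustrative sketch only: newer snapshots store "Score", older ones store
    "Average Score", so both key names are handled. "N/A" entries are kept as-is.
    """
    with open(path) as f:
        records = json.load(f)  # despite some .jsonl names, these snapshots are single JSON arrays

    rows = []
    for rec in records:
        cfg = rec["config"]
        row = {
            "Model": cfg["model_name"],
            "Organization": cfg["organization"],
            "License": cfg["license"],
            "Knowledge Cutoff": cfg["knowledge_cutoff"],
        }
        for task, stats in rec["results"].items():
            # Prefer "Score"; fall back to "Average Score" for older snapshots.
            row[task] = stats.get("Score", stats.get("Average Score"))
        rows.append(row)
    return pd.DataFrame(rows)


# Hypothetical usage (path is a placeholder, not a file in this commit):
# df = load_results_snapshot("src/results/models_<timestamp>.json")
# print(df.head())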
src/results/models_2024-10-08-03:25:44.801310.jsonl DELETED
@@ -1,2082 +0,0 @@
1
- [
2
- {
3
- "config": {
4
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
- "organization": "OpenAI",
6
- "license": "Proprietary",
7
- "knowledge_cutoff": "2023/10"
8
- },
9
- "results": {
10
- "OVERALL": {
11
- "Average Score": 0.974329609,
12
- "Standard Deviation": 0.005024959031,
13
- "Rank": 2
14
- },
15
- "Geometry": {
16
- "Average Score": 0.976028578,
17
- "Standard Deviation": 0.01507912373,
18
- "Rank": 3
19
- },
20
- "Algebra": {
21
- "Average Score": 0.951199453,
22
- "Standard Deviation": 0.08452452108,
23
- "Rank": 3
24
- },
25
- "Probability": {
26
- "Average Score": 0.842116641,
27
- "Standard Deviation": 0.006267759054,
28
- "Rank": 3
29
- },
30
- "Logical": {
31
- "Average Score": 0.828490728,
32
- "Standard Deviation": 0.009134213144,
33
- "Rank": 3
34
- },
35
- "Social": {
36
- "Average Score": 0.815902987,
37
- "Standard Deviation": 0.0196254222,
38
- "Rank": 4
39
- }
40
- }
41
- },
42
- {
43
- "config": {
44
- "model_name": "gpt-4o-2024-08-06",
45
- "organization": "OpenAI",
46
- "license": "Proprietary",
47
- "knowledge_cutoff": "2023/10"
48
- },
49
- "results": {
50
- "OVERALL": {
51
- "Average Score": 0.846571548,
52
- "Standard Deviation": 0.03394056554,
53
- "Rank": 6
54
- },
55
- "Geometry": {
56
- "Average Score": 0.99773096,
57
- "Standard Deviation": 0.002835555172,
58
- "Rank": 1
59
- },
60
- "Algebra": {
61
- "Average Score": 1.0,
62
- "Standard Deviation": 0.0,
63
- "Rank": 1
64
- },
65
- "Probability": {
66
- "Average Score": 0.78855795,
67
- "Standard Deviation": 0.008188675452,
68
- "Rank": 6
69
- },
70
- "Logical": {
71
- "Average Score": 0.668635768,
72
- "Standard Deviation": 0.03466314094,
73
- "Rank": 11
74
- },
75
- "Social": {
76
- "Average Score": 0.680417314,
77
- "Standard Deviation": 0.00656867063,
78
- "Rank": 9
79
- }
80
- }
81
- },
82
- {
83
- "config": {
84
- "model_name": "gpt-4o-2024-05-13",
85
- "organization": "OpenAI",
86
- "license": "Proprietary",
87
- "knowledge_cutoff": "2023/10"
88
- },
89
- "results": {
90
- "OVERALL": {
91
- "Average Score": 0.846334477,
92
- "Standard Deviation": 0.09377911572,
93
- "Rank": 7
94
- },
95
- "Geometry": {
96
- "Average Score": 0.972472377,
97
- "Standard Deviation": 0.01648274205,
98
- "Rank": 4
99
- },
100
- "Algebra": {
101
- "Average Score": 0.995511298,
102
- "Standard Deviation": 0.004097802515,
103
- "Rank": 2
104
- },
105
- "Probability": {
106
- "Average Score": 0.812149974,
107
- "Standard Deviation": 0.007669585485,
108
- "Rank": 4
109
- },
110
- "Logical": {
111
- "Average Score": 0.755019692,
112
- "Standard Deviation": 0.008149588572,
113
- "Rank": 6
114
- },
115
- "Social": {
116
- "Average Score": 0.609875087,
117
- "Standard Deviation": 0.038729239,
118
- "Rank": 14
119
- }
120
- }
121
- },
122
- {
123
- "config": {
124
- "model_name": "gpt-4-turbo-2024-04-09",
125
- "organization": "OpenAI",
126
- "license": "Proprietary",
127
- "knowledge_cutoff": "2023/12"
128
- },
129
- "results": {
130
- "OVERALL": {
131
- "Average Score": 0.855357972,
132
- "Standard Deviation": 0.1016986368,
133
- "Rank": 4
134
- },
135
- "Geometry": {
136
- "Average Score": 0.95374588,
137
- "Standard Deviation": 0.03109307166,
138
- "Rank": 5
139
- },
140
- "Algebra": {
141
- "Average Score": 0.930945223,
142
- "Standard Deviation": 0.06705136813,
143
- "Rank": 4
144
- },
145
- "Probability": {
146
- "Average Score": 0.750705448,
147
- "Standard Deviation": 0.05944483103,
148
- "Rank": 8
149
- },
150
- "Logical": {
151
- "Average Score": 0.77906699,
152
- "Standard Deviation": 0.007406734161,
153
- "Rank": 4
154
- },
155
- "Social": {
156
- "Average Score": 0.715935163,
157
- "Standard Deviation": 0.1209141409,
158
- "Rank": 7
159
- }
160
- }
161
- },
162
- {
163
- "config": {
164
- "model_name": "gemini-1.5-pro-001",
165
- "organization": "Google",
166
- "license": "Proprietary",
167
- "knowledge_cutoff": "2024-01"
168
- },
169
- "results": {
170
- "OVERALL": {
171
- "Average Score": 0.797187842,
172
- "Standard Deviation": 0.0272375249,
173
- "Rank": 10
174
- },
175
- "Geometry": {
176
- "Average Score": 0.9947169,
177
- "Standard Deviation": 0.009150597621,
178
- "Rank": 2
179
- },
180
- "Algebra": {
181
- "Average Score": 0.857464301,
182
- "Standard Deviation": 0.05014285338,
183
- "Rank": 5
184
- },
185
- "Probability": {
186
- "Average Score": 0.651781767,
187
- "Standard Deviation": 0.04156998547,
188
- "Rank": 12
189
- },
190
- "Logical": {
191
- "Average Score": 0.739745471,
192
- "Standard Deviation": 0.01631532019,
193
- "Rank": 7
194
- },
195
- "Social": {
196
- "Average Score": 0.649601885,
197
- "Standard Deviation": 0.104854889,
198
- "Rank": 12
199
- }
200
- }
201
- },
202
- {
203
- "config": {
204
- "model_name": "qwen2-72b-instruct",
205
- "organization": "Alibaba",
206
- "license": "Qianwen LICENSE",
207
- "knowledge_cutoff": "2024-02"
208
- },
209
- "results": {
210
- "OVERALL": {
211
- "Average Score": 0.737918558,
212
- "Standard Deviation": 0.09069077339,
213
- "Rank": 11
214
- },
215
- "Geometry": {
216
- "Average Score": 0.796870305,
217
- "Standard Deviation": 0.0509025346,
218
- "Rank": 9
219
- },
220
- "Algebra": {
221
- "Average Score": 0.836194231,
222
- "Standard Deviation": 0.04517093028,
223
- "Rank": 6
224
- },
225
- "Probability": {
226
- "Average Score": 0.788068004,
227
- "Standard Deviation": 0.007288989044,
228
- "Rank": 7
229
- },
230
- "Logical": {
231
- "Average Score": 0.619300904,
232
- "Standard Deviation": 0.06377931612,
233
- "Rank": 15
234
- },
235
- "Social": {
236
- "Average Score": 0.652578786,
237
- "Standard Deviation": 0.04259293171,
238
- "Rank": 11
239
- }
240
- }
241
- },
242
- {
243
- "config": {
244
- "model_name": "gpt-4o-mini-2024-07-18",
245
- "organization": "OpenAI",
246
- "license": "Proprietary",
247
- "knowledge_cutoff": "2024-07"
248
- },
249
- "results": {
250
- "OVERALL": {
251
- "Average Score": 0.847694133,
252
- "Standard Deviation": 0.02164304402,
253
- "Rank": 5
254
- },
255
- "Geometry": {
256
- "Average Score": 0.946650435,
257
- "Standard Deviation": 0.01831236482,
258
- "Rank": 7
259
- },
260
- "Algebra": {
261
- "Average Score": 0.796243022,
262
- "Standard Deviation": 0.05537539202,
263
- "Rank": 7
264
- },
265
- "Probability": {
266
- "Average Score": 0.798402685,
267
- "Standard Deviation": 0.009404491967,
268
- "Rank": 5
269
- },
270
- "Logical": {
271
- "Average Score": 0.727009735,
272
- "Standard Deviation": 0.02628110141,
273
- "Rank": 8
274
- },
275
- "Social": {
276
- "Average Score": 0.691949855,
277
- "Standard Deviation": 0.02072934333,
278
- "Rank": 8
279
- }
280
- }
281
- },
282
- {
283
- "config": {
284
- "model_name": "claude-3.5-sonnet",
285
- "organization": "Anthropic",
286
- "license": "Proprietary",
287
- "knowledge_cutoff": "2024-03"
288
- },
289
- "results": {
290
- "OVERALL": {
291
- "Average Score": 0.839004422,
292
- "Standard Deviation": 0.1461079564,
293
- "Rank": 8
294
- },
295
- "Geometry": {
296
- "Average Score": 0.95316419,
297
- "Standard Deviation": 0.02081192856,
298
- "Rank": 6
299
- },
300
- "Algebra": {
301
- "Average Score": 0.759789952,
302
- "Standard Deviation": 0.02611765096,
303
- "Rank": 8
304
- },
305
- "Probability": {
306
- "Average Score": 0.707730127,
307
- "Standard Deviation": 0.0394436664,
308
- "Rank": 10
309
- },
310
- "Logical": {
311
- "Average Score": 0.77342666,
312
- "Standard Deviation": 0.002892426458,
313
- "Rank": 5
314
- },
315
- "Social": {
316
- "Average Score": 0.790002247,
317
- "Standard Deviation": 0.1007410022,
318
- "Rank": 5
319
- }
320
- }
321
- },
322
- {
323
- "config": {
324
- "model_name": "o1-mini",
325
- "organization": "OpenAI",
326
- "license": "Proprietary",
327
- "knowledge_cutoff": "2024-01"
328
- },
329
- "results": {
330
- "OVERALL": {
331
- "Average Score": 1.0,
332
- "Standard Deviation": 0.0,
333
- "Rank": 1
334
- },
335
- "Geometry": {
336
- "Average Score": "N/A",
337
- "Standard Deviation": "N/A",
338
- "Rank": "N/A"
339
- },
340
- "Algebra": {
341
- "Average Score": "N/A",
342
- "Standard Deviation": "N/A",
343
- "Rank": "N/A"
344
- },
345
- "Probability": {
346
- "Average Score": 1.0,
347
- "Standard Deviation": 0.0,
348
- "Rank": 1
349
- },
350
- "Logical": {
351
- "Average Score": 1.0,
352
- "Standard Deviation": 0.0,
353
- "Rank": 1
354
- },
355
- "Social": {
356
- "Average Score": 0.993974241,
357
- "Standard Deviation": 0.001996882328,
358
- "Rank": 2
359
- }
360
- }
361
- },
362
- {
363
- "config": {
364
- "model_name": "o1-preview",
365
- "organization": "OpenAI",
366
- "license": "Proprietary",
367
- "knowledge_cutoff": "2024-01"
368
- },
369
- "results": {
370
- "OVERALL": {
371
- "Average Score": 0.945884589,
372
- "Standard Deviation": 0.01059250762,
373
- "Rank": 3
374
- },
375
- "Geometry": {
376
- "Average Score": "N/A",
377
- "Standard Deviation": "N/A",
378
- "Rank": "N/A"
379
- },
380
- "Algebra": {
381
- "Average Score": "N/A",
382
- "Standard Deviation": "N/A",
383
- "Rank": "N/A"
384
- },
385
- "Probability": {
386
- "Average Score": 0.964666392,
387
- "Standard Deviation": 0.003139983398,
388
- "Rank": 2
389
- },
390
- "Logical": {
391
- "Average Score": 0.987950057,
392
- "Standard Deviation": 0.004881220327,
393
- "Rank": 2
394
- },
395
- "Social": {
396
- "Average Score": 1.0,
397
- "Standard Deviation": 0.0,
398
- "Rank": 1
399
- }
400
- }
401
- },
402
- {
403
- "config": {
404
- "model_name": "gemini-1.5-flash-001",
405
- "organization": "Google",
406
- "license": "Proprietary",
407
- "knowledge_cutoff": "2024-02"
408
- },
409
- "results": {
410
- "OVERALL": {
411
- "Average Score": 0.726493401,
412
- "Standard Deviation": 0.01113913725,
413
- "Rank": 12
414
- },
415
- "Geometry": {
416
- "Average Score": 0.804144103,
417
- "Standard Deviation": 0.1327142178,
418
- "Rank": 8
419
- },
420
- "Algebra": {
421
- "Average Score": 0.731776765,
422
- "Standard Deviation": 0.02594657111,
423
- "Rank": 11
424
- },
425
- "Probability": {
426
- "Average Score": 0.614461891,
427
- "Standard Deviation": 0.04690131826,
428
- "Rank": 15
429
- },
430
- "Logical": {
431
- "Average Score": 0.630805991,
432
- "Standard Deviation": 0.04871350612,
433
- "Rank": 13
434
- },
435
- "Social": {
436
- "Average Score": 0.555933822,
437
- "Standard Deviation": 0.1029934524,
438
- "Rank": 16
439
- }
440
- }
441
- },
442
- {
443
- "config": {
444
- "model_name": "gpt4-1106",
445
- "organization": "OpenAI",
446
- "license": "Proprietary",
447
- "knowledge_cutoff": "2024-04"
448
- },
449
- "results": {
450
- "OVERALL": {
451
- "Average Score": 0.816347784,
452
- "Standard Deviation": 0.1566815755,
453
- "Rank": 9
454
- },
455
- "Geometry": {
456
- "Average Score": 0.71843088,
457
- "Standard Deviation": 0.04778038294,
458
- "Rank": 13
459
- },
460
- "Algebra": {
461
- "Average Score": 0.712910417,
462
- "Standard Deviation": 0.02581828898,
463
- "Rank": 12
464
- },
465
- "Probability": {
466
- "Average Score": 0.623947619,
467
- "Standard Deviation": 0.03502982933,
468
- "Rank": 14
469
- },
470
- "Logical": {
471
- "Average Score": 0.637482274,
472
- "Standard Deviation": 0.04158809888,
473
- "Rank": 12
474
- },
475
- "Social": {
476
- "Average Score": 0.450609816,
477
- "Standard Deviation": 0.05208655446,
478
- "Rank": 23
479
- }
480
- }
481
- },
482
- {
483
- "config": {
484
- "model_name": "gemma-2-27b-it",
485
- "organization": "Google",
486
- "license": "Gemma License",
487
- "knowledge_cutoff": "2024-03"
488
- },
489
- "results": {
490
- "OVERALL": {
491
- "Average Score": 0.624169623,
492
- "Standard Deviation": 0.1048365121,
493
- "Rank": 15
494
- },
495
- "Geometry": {
496
- "Average Score": 0.60112744,
497
- "Standard Deviation": 0.0469109952,
498
- "Rank": 19
499
- },
500
- "Algebra": {
501
- "Average Score": 0.687955914,
502
- "Standard Deviation": 0.01959958192,
503
- "Rank": 13
504
- },
505
- "Probability": {
506
- "Average Score": 0.589524771,
507
- "Standard Deviation": 0.03112689325,
508
- "Rank": 16
509
- },
510
- "Logical": {
511
- "Average Score": 0.614978944,
512
- "Standard Deviation": 0.05710657859,
513
- "Rank": 16
514
- },
515
- "Social": {
516
- "Average Score": 0.487844257,
517
- "Standard Deviation": 0.05857760809,
518
- "Rank": 20
519
- }
520
- }
521
- },
522
- {
523
- "config": {
524
- "model_name": "claude-3-opus",
525
- "organization": "Anthropic",
526
- "license": "Proprietary",
527
- "knowledge_cutoff": "2024-01"
528
- },
529
- "results": {
530
- "OVERALL": {
531
- "Average Score": 0.650636271,
532
- "Standard Deviation": 0.1197773541,
533
- "Rank": 14
534
- },
535
- "Geometry": {
536
- "Average Score": 0.7215743,
537
- "Standard Deviation": 0.04712598358,
538
- "Rank": 12
539
- },
540
- "Algebra": {
541
- "Average Score": 0.68777327,
542
- "Standard Deviation": 0.02382683713,
543
- "Rank": 14
544
- },
545
- "Probability": {
546
- "Average Score": 0.626471421,
547
- "Standard Deviation": 0.02911817976,
548
- "Rank": 13
549
- },
550
- "Logical": {
551
- "Average Score": 0.692346381,
552
- "Standard Deviation": 0.03617185198,
553
- "Rank": 10
554
- },
555
- "Social": {
556
- "Average Score": 0.663410854,
557
- "Standard Deviation": 0.09540220876,
558
- "Rank": 10
559
- }
560
- }
561
- },
562
- {
563
- "config": {
564
- "model_name": "gemma-2-9b-it-simpo",
565
- "organization": "Google",
566
- "license": "Gemma License",
567
- "knowledge_cutoff": "2024-02"
568
- },
569
- "results": {
570
- "OVERALL": {
571
- "Average Score": "N/A",
572
- "Standard Deviation": "N/A",
573
- "Rank": "N/A"
574
- },
575
- "Geometry": {
576
- "Average Score": 0.582787508,
577
- "Standard Deviation": 0.03965204074,
578
- "Rank": 20
579
- },
580
- "Algebra": {
581
- "Average Score": 0.658648133,
582
- "Standard Deviation": 0.02565919856,
583
- "Rank": 15
584
- },
585
- "Probability": {
586
- "Average Score": 0.547861265,
587
- "Standard Deviation": 0.02885209131,
588
- "Rank": 19
589
- },
590
- "Logical": {
591
- "Average Score": 0.540720893,
592
- "Standard Deviation": 0.01970134508,
593
- "Rank": 20
594
- },
595
- "Social": {
596
- "Average Score": 0.635266187,
597
- "Standard Deviation": 0.03620021751,
598
- "Rank": 13
599
- }
600
- }
601
- },
602
- {
603
- "config": {
604
- "model_name": "qwen1.5-72b-chat",
605
- "organization": "Alibaba",
606
- "license": "Qianwen LICENSE",
607
- "knowledge_cutoff": "2024-03"
608
- },
609
- "results": {
610
- "OVERALL": {
611
- "Average Score": 0.519549796,
612
- "Standard Deviation": 0.00903634343,
613
- "Rank": 18
614
- },
615
- "Geometry": {
616
- "Average Score": 0.543139301,
617
- "Standard Deviation": 0.03425202326,
618
- "Rank": 24
619
- },
620
- "Algebra": {
621
- "Average Score": 0.635228729,
622
- "Standard Deviation": 0.01944043425,
623
- "Rank": 16
624
- },
625
- "Probability": {
626
- "Average Score": 0.486948658,
627
- "Standard Deviation": 0.06064655315,
628
- "Rank": 23
629
- },
630
- "Logical": {
631
- "Average Score": 0.284069394,
632
- "Standard Deviation": 0.02686608506,
633
- "Rank": 33
634
- },
635
- "Social": {
636
- "Average Score": 0.415007627,
637
- "Standard Deviation": 0.03920053159,
638
- "Rank": 24
639
- }
640
- }
641
- },
642
- {
643
- "config": {
644
- "model_name": "qwen1.5-32b-chat",
645
- "organization": "Alibaba",
646
- "license": "Qianwen LICENSE",
647
- "knowledge_cutoff": "2024-03"
648
- },
649
- "results": {
650
- "OVERALL": {
651
- "Average Score": 0.393789407,
652
- "Standard Deviation": 0.05413770095,
653
- "Rank": 29
654
- },
655
- "Geometry": {
656
- "Average Score": 0.51086835,
657
- "Standard Deviation": 0.04052471998,
658
- "Rank": 27
659
- },
660
- "Algebra": {
661
- "Average Score": 0.609003168,
662
- "Standard Deviation": 0.04874143541,
663
- "Rank": 17
664
- },
665
- "Probability": {
666
- "Average Score": 0.476300002,
667
- "Standard Deviation": 0.05322403912,
668
- "Rank": 24
669
- },
670
- "Logical": {
671
- "Average Score": 0.331781014,
672
- "Standard Deviation": 0.004938997686,
673
- "Rank": 30
674
- },
675
- "Social": {
676
- "Average Score": 0.380987334,
677
- "Standard Deviation": 0.03762251776,
678
- "Rank": 26
679
- }
680
- }
681
- },
682
- {
683
- "config": {
684
- "model_name": "google-gemma-2-9b-it",
685
- "organization": "Google",
686
- "license": "Proprietary",
687
- "knowledge_cutoff": "2024-01"
688
- },
689
- "results": {
690
- "OVERALL": {
691
- "Average Score": 0.489663449,
692
- "Standard Deviation": 0.002595702019,
693
- "Rank": 21
694
- },
695
- "Geometry": {
696
- "Average Score": 0.575371308,
697
- "Standard Deviation": 0.03556220251,
698
- "Rank": 22
699
- },
700
- "Algebra": {
701
- "Average Score": 0.597045661,
702
- "Standard Deviation": 0.0313828123,
703
- "Rank": 18
704
- },
705
- "Probability": {
706
- "Average Score": 0.589221807,
707
- "Standard Deviation": 0.03110811656,
708
- "Rank": 18
709
- },
710
- "Logical": {
711
- "Average Score": 0.587579897,
712
- "Standard Deviation": 0.05512716783,
713
- "Rank": 18
714
- },
715
- "Social": {
716
- "Average Score": 0.768337958,
717
- "Standard Deviation": 0.04078610476,
718
- "Rank": 6
719
- }
720
- }
721
- },
722
- {
723
- "config": {
724
- "model_name": "yi-1.5-34b-chat",
725
- "organization": "01 AI",
726
- "license": "Proprietary",
727
- "knowledge_cutoff": "2024-01"
728
- },
729
- "results": {
730
- "OVERALL": {
731
- "Average Score": 0.607812897,
732
- "Standard Deviation": 0.1440881293,
733
- "Rank": 16
734
- },
735
- "Geometry": {
736
- "Average Score": 0.566666724,
737
- "Standard Deviation": 0.04001381658,
738
- "Rank": 23
739
- },
740
- "Algebra": {
741
- "Average Score": 0.590997292,
742
- "Standard Deviation": 0.03594087315,
743
- "Rank": 19
744
- },
745
- "Probability": {
746
- "Average Score": 0.589524589,
747
- "Standard Deviation": 0.03112618772,
748
- "Rank": 17
749
- },
750
- "Logical": {
751
- "Average Score": 0.574105508,
752
- "Standard Deviation": 0.03441737941,
753
- "Rank": 19
754
- },
755
- "Social": {
756
- "Average Score": 0.516980832,
757
- "Standard Deviation": 0.03369347985,
758
- "Rank": 19
759
- }
760
- }
761
- },
762
- {
763
- "config": {
764
- "model_name": "meta-llama-3.1-8b-instruct",
765
- "organization": "Meta",
766
- "license": "Llama 3.1 Community",
767
- "knowledge_cutoff": "2024-02"
768
- },
769
- "results": {
770
- "OVERALL": {
771
- "Average Score": 0.505936324,
772
- "Standard Deviation": 0.05286756493,
773
- "Rank": 19
774
- },
775
- "Geometry": {
776
- "Average Score": 0.522442162,
777
- "Standard Deviation": 0.03908236317,
778
- "Rank": 25
779
- },
780
- "Algebra": {
781
- "Average Score": 0.582702645,
782
- "Standard Deviation": 0.05002277711,
783
- "Rank": 20
784
- },
785
- "Probability": {
786
- "Average Score": 0.495001149,
787
- "Standard Deviation": 0.05244587037,
788
- "Rank": 22
789
- },
790
- "Logical": {
791
- "Average Score": 0.443030561,
792
- "Standard Deviation": 0.01343820628,
793
- "Rank": 25
794
- },
795
- "Social": {
796
- "Average Score": 0.329195941,
797
- "Standard Deviation": 0.03925019528,
798
- "Rank": 30
799
- }
800
- }
801
- },
802
- {
803
- "config": {
804
- "model_name": "gpt3.5-turbo-0125",
805
- "organization": "OpenAI",
806
- "license": "Proprietary",
807
- "knowledge_cutoff": "2023-12"
808
- },
809
- "results": {
810
- "OVERALL": {
811
- "Average Score": 0.313398088,
812
- "Standard Deviation": 0.09322528606,
813
- "Rank": 40
814
- },
815
- "Geometry": {
816
- "Average Score": 0.678714519,
817
- "Standard Deviation": 0.05926546762,
818
- "Rank": 14
819
- },
820
- "Algebra": {
821
- "Average Score": 0.569296173,
822
- "Standard Deviation": 0.05277281097,
823
- "Rank": 21
824
- },
825
- "Probability": {
826
- "Average Score": 0.448460767,
827
- "Standard Deviation": 0.05768095196,
828
- "Rank": 26
829
- },
830
- "Logical": {
831
- "Average Score": 0.148521348,
832
- "Standard Deviation": 0.04033712907,
833
- "Rank": 45
834
- },
835
- "Social": {
836
- "Average Score": 0.235071541,
837
- "Standard Deviation": 0.02632892457,
838
- "Rank": 39
839
- }
840
- }
841
- },
842
- {
843
- "config": {
844
- "model_name": "llama-3-70b-instruct",
845
- "organization": "Meta",
846
- "license": "Llama 3 Community",
847
- "knowledge_cutoff": "2024-03"
848
- },
849
- "results": {
850
- "OVERALL": {
851
- "Average Score": 0.456689885,
852
- "Standard Deviation": 0.01385989995,
853
- "Rank": 23
854
- },
855
- "Geometry": {
856
- "Average Score": 0.516865529,
857
- "Standard Deviation": 0.03858112564,
858
- "Rank": 26
859
- },
860
- "Algebra": {
861
- "Average Score": 0.566756531,
862
- "Standard Deviation": 0.03369826926,
863
- "Rank": 22
864
- },
865
- "Probability": {
866
- "Average Score": 0.513857306,
867
- "Standard Deviation": 0.05453699062,
868
- "Rank": 21
869
- },
870
- "Logical": {
871
- "Average Score": 0.713796415,
872
- "Standard Deviation": 0.02031215107,
873
- "Rank": 9
874
- },
875
- "Social": {
876
- "Average Score": 0.45872939,
877
- "Standard Deviation": 0.05347039576,
878
- "Rank": 22
879
- }
880
- }
881
- },
882
- {
883
- "config": {
884
- "model_name": "claude-3-sonnet",
885
- "organization": "Anthropic",
886
- "license": "Proprietary",
887
- "knowledge_cutoff": "2024-02"
888
- },
889
- "results": {
890
- "OVERALL": {
891
- "Average Score": 0.520010833,
892
- "Standard Deviation": 0.005030563799,
893
- "Rank": 17
894
- },
895
- "Geometry": {
896
- "Average Score": 0.675613638,
897
- "Standard Deviation": 0.05275594408,
898
- "Rank": 15
899
- },
900
- "Algebra": {
901
- "Average Score": 0.552025728,
902
- "Standard Deviation": 0.04122192409,
903
- "Rank": 23
904
- },
905
- "Probability": {
906
- "Average Score": 0.516192848,
907
- "Standard Deviation": 0.04152293217,
908
- "Rank": 20
909
- },
910
- "Logical": {
911
- "Average Score": 0.588545747,
912
- "Standard Deviation": 0.06068211943,
913
- "Rank": 17
914
- },
915
- "Social": {
916
- "Average Score": 0.570437582,
917
- "Standard Deviation": 0.08607040862,
918
- "Rank": 15
919
- }
920
- }
921
- },
922
- {
923
- "config": {
924
- "model_name": "qwen1.5-14b-chat",
925
- "organization": "Alibaba",
926
- "license": "Qianwen LICENSE",
927
- "knowledge_cutoff": "2024-01"
928
- },
929
- "results": {
930
- "OVERALL": {
931
- "Average Score": 0.415328996,
932
- "Standard Deviation": 0.0743938717,
933
- "Rank": 28
934
- },
935
- "Geometry": {
936
- "Average Score": 0.452504016,
937
- "Standard Deviation": 0.04225594393,
938
- "Rank": 28
939
- },
940
- "Algebra": {
941
- "Average Score": 0.538655725,
942
- "Standard Deviation": 0.03721542594,
943
- "Rank": 24
944
- },
945
- "Probability": {
946
- "Average Score": 0.397185975,
947
- "Standard Deviation": 0.05607695946,
948
- "Rank": 30
949
- },
950
- "Logical": {
951
- "Average Score": 0.264573129,
952
- "Standard Deviation": 0.03936133174,
953
- "Rank": 35
954
- },
955
- "Social": {
956
- "Average Score": 0.287370142,
957
- "Standard Deviation": 0.04264085315,
958
- "Rank": 32
959
- }
960
- }
961
- },
962
- {
963
- "config": {
964
- "model_name": "claude-3-haiku",
965
- "organization": "Anthropic",
966
- "license": "Proprietary",
967
- "knowledge_cutoff": "2024-01"
968
- },
969
- "results": {
970
- "OVERALL": {
971
- "Average Score": 0.453901163,
972
- "Standard Deviation": 0.003604084261,
973
- "Rank": 24
974
- },
975
- "Geometry": {
976
- "Average Score": 0.607993912,
977
- "Standard Deviation": 0.05793460748,
978
- "Rank": 17
979
- },
980
- "Algebra": {
981
- "Average Score": 0.520054055,
982
- "Standard Deviation": 0.03333544511,
983
- "Rank": 25
984
- },
985
- "Probability": {
986
- "Average Score": 0.474460688,
987
- "Standard Deviation": 0.0446501933,
988
- "Rank": 25
989
- },
990
- "Logical": {
991
- "Average Score": 0.512815976,
992
- "Standard Deviation": 0.0163264281,
993
- "Rank": 21
994
- },
995
- "Social": {
996
- "Average Score": 0.551083976,
997
- "Standard Deviation": 0.05374722539,
998
- "Rank": 17
999
- }
1000
- }
1001
- },
1002
- {
1003
- "config": {
1004
- "model_name": "claude-2.1",
1005
- "organization": "Anthropic",
1006
- "license": "Proprietary",
1007
- "knowledge_cutoff": "2023-12"
1008
- },
1009
- "results": {
1010
- "OVERALL": {
1011
- "Average Score": 0.35814708,
1012
- "Standard Deviation": 0.09168134168,
1013
- "Rank": 36
1014
- },
1015
- "Geometry": {
1016
- "Average Score": 0.62752395,
1017
- "Standard Deviation": 0.07232659398,
1018
- "Rank": 16
1019
- },
1020
- "Algebra": {
1021
- "Average Score": 0.508849609,
1022
- "Standard Deviation": 0.0346897465,
1023
- "Rank": 26
1024
- },
1025
- "Probability": {
1026
- "Average Score": 0.41477086,
1027
- "Standard Deviation": 0.05964060239,
1028
- "Rank": 29
1029
- },
1030
- "Logical": {
1031
- "Average Score": 0.482923674,
1032
- "Standard Deviation": 0.01989147048,
1033
- "Rank": 22
1034
- },
1035
- "Social": {
1036
- "Average Score": 0.333804568,
1037
- "Standard Deviation": 0.03775548253,
1038
- "Rank": 29
1039
- }
1040
- }
1041
- },
1042
- {
1043
- "config": {
1044
- "model_name": "mistral-8x7b-instruct-v0.1",
1045
- "organization": "Mistral",
1046
- "license": "Apache 2.0",
1047
- "knowledge_cutoff": "2023-12"
1048
- },
1049
- "results": {
1050
- "OVERALL": {
1051
- "Average Score": 0.382659161,
1052
- "Standard Deviation": 0.07594496929,
1053
- "Rank": 31
1054
- },
1055
- "Geometry": {
1056
- "Average Score": 0.432216097,
1057
- "Standard Deviation": 0.04747949254,
1058
- "Rank": 31
1059
- },
1060
- "Algebra": {
1061
- "Average Score": 0.478314888,
1062
- "Standard Deviation": 0.01998797419,
1063
- "Rank": 27
1064
- },
1065
- "Probability": {
1066
- "Average Score": 0.427144725,
1067
- "Standard Deviation": 0.0590923329,
1068
- "Rank": 28
1069
- },
1070
- "Logical": {
1071
- "Average Score": 0.340041983,
1072
- "Standard Deviation": 0.008397574592,
1073
- "Rank": 28
1074
- },
1075
- "Social": {
1076
- "Average Score": 0.251949622,
1077
- "Standard Deviation": 0.03346674405,
1078
- "Rank": 37
1079
- }
1080
- }
1081
- },
1082
- {
1083
- "config": {
1084
- "model_name": "claude-2.0",
1085
- "organization": "Anthropic",
1086
- "license": "Proprietary",
1087
- "knowledge_cutoff": "2023-10"
1088
- },
1089
- "results": {
1090
- "OVERALL": {
1091
- "Average Score": 0.322718057,
1092
- "Standard Deviation": 0.08369883584,
1093
- "Rank": 38
1094
- },
1095
- "Geometry": {
1096
- "Average Score": 0.604141967,
1097
- "Standard Deviation": 0.05116441826,
1098
- "Rank": 18
1099
- },
1100
- "Algebra": {
1101
- "Average Score": 0.474350734,
1102
- "Standard Deviation": 0.01510393066,
1103
- "Rank": 28
1104
- },
1105
- "Probability": {
1106
- "Average Score": 0.437950412,
1107
- "Standard Deviation": 0.05985594317,
1108
- "Rank": 27
1109
- },
1110
- "Logical": {
1111
- "Average Score": 0.445620646,
1112
- "Standard Deviation": 0.01812614805,
1113
- "Rank": 24
1114
- },
1115
- "Social": {
1116
- "Average Score": 0.469422836,
1117
- "Standard Deviation": 0.05999901796,
1118
- "Rank": 21
1119
- }
1120
- }
1121
- },
1122
- {
1123
- "config": {
1124
- "model_name": "starling-lm-7b-beta",
1125
- "organization": "Nexusflow",
1126
- "license": "Apache-2.0",
1127
- "knowledge_cutoff": "2024-01"
1128
- },
1129
- "results": {
1130
- "OVERALL": {
1131
- "Average Score": 0.479391856,
1132
- "Standard Deviation": 0.04199990887,
1133
- "Rank": 22
1134
- },
1135
- "Geometry": {
1136
- "Average Score": 0.446654388,
1137
- "Standard Deviation": 0.05637864999,
1138
- "Rank": 30
1139
- },
1140
- "Algebra": {
1141
- "Average Score": 0.473952749,
1142
- "Standard Deviation": 0.01584301288,
1143
- "Rank": 29
1144
- },
1145
- "Probability": {
1146
- "Average Score": 0.395197837,
1147
- "Standard Deviation": 0.05814798892,
1148
- "Rank": 31
1149
- },
1150
- "Logical": {
1151
- "Average Score": 0.39927199,
1152
- "Standard Deviation": 0.02125277518,
1153
- "Rank": 26
1154
- },
1155
- "Social": {
1156
- "Average Score": 0.380021662,
1157
- "Standard Deviation": 0.04622452748,
1158
- "Rank": 27
1159
- }
1160
- }
1161
- },
1162
- {
1163
- "config": {
1164
- "model_name": "gemini-1.0-pro-001",
1165
- "organization": "Google",
1166
- "license": "Proprietary",
1167
- "knowledge_cutoff": "2023-11"
1168
- },
1169
- "results": {
1170
- "OVERALL": {
1171
- "Average Score": 0.449040654,
1172
- "Standard Deviation": 0.0450610177,
1173
- "Rank": 25
1174
- },
1175
- "Geometry": {
1176
- "Average Score": 0.578347959,
1177
- "Standard Deviation": 0.04242873607,
1178
- "Rank": 21
1179
- },
1180
- "Algebra": {
1181
- "Average Score": 0.462417786,
1182
- "Standard Deviation": 0.01668313635,
1183
- "Rank": 30
1184
- },
1185
- "Probability": {
1186
- "Average Score": 0.289836324,
1187
- "Standard Deviation": 0.05739831115,
1188
- "Rank": 39
1189
- },
1190
- "Logical": {
1191
- "Average Score": 0.191140355,
1192
- "Standard Deviation": 0.03394652499,
1193
- "Rank": 41
1194
- },
1195
- "Social": {
1196
- "Average Score": 0.130790863,
1197
- "Standard Deviation": 0.02800188173,
1198
- "Rank": 47
1199
- }
1200
- }
1201
- },
1202
- {
1203
- "config": {
1204
- "model_name": "openchat-3.5-0106",
1205
- "organization": "OpenChat",
1206
- "license": "Apache-2.0",
1207
- "knowledge_cutoff": "2024-01"
1208
- },
1209
- "results": {
1210
- "OVERALL": {
1211
- "Average Score": 0.363929888,
1212
- "Standard Deviation": 0.08602347145,
1213
- "Rank": 34
1214
- },
1215
- "Geometry": {
1216
- "Average Score": 0.38715246,
1217
- "Standard Deviation": 0.03701851946,
1218
- "Rank": 34
1219
- },
1220
- "Algebra": {
1221
- "Average Score": 0.441233712,
1222
- "Standard Deviation": 0.01135753754,
1223
- "Rank": 31
1224
- },
1225
- "Probability": {
1226
- "Average Score": 0.38802618,
1227
- "Standard Deviation": 0.05663879714,
1228
- "Rank": 32
1229
- },
1230
- "Logical": {
1231
- "Average Score": 0.336754383,
1232
- "Standard Deviation": 0.01608478079,
1233
- "Rank": 29
1234
- },
1235
- "Social": {
1236
- "Average Score": 0.250891608,
1237
- "Standard Deviation": 0.03253769914,
1238
- "Rank": 38
1239
- }
1240
- }
1241
- },
1242
- {
1243
- "config": {
1244
- "model_name": "openchat-3.5",
1245
- "organization": "OpenChat",
1246
- "license": "Apache-2.0",
1247
- "knowledge_cutoff": "2023-12"
1248
- },
1249
- "results": {
1250
- "OVERALL": {
1251
- "Average Score": 0.361341296,
1252
- "Standard Deviation": 0.09034869493,
1253
- "Rank": 35
1254
- },
1255
- "Geometry": {
1256
- "Average Score": 0.401699069,
1257
- "Standard Deviation": 0.03410726557,
1258
- "Rank": 32
1259
- },
1260
- "Algebra": {
1261
- "Average Score": 0.414095336,
1262
- "Standard Deviation": 0.01881964261,
1263
- "Rank": 33
1264
- },
1265
- "Probability": {
1266
- "Average Score": 0.349601002,
1267
- "Standard Deviation": 0.05077455539,
1268
- "Rank": 34
1269
- },
1270
- "Logical": {
1271
- "Average Score": 0.331069242,
1272
- "Standard Deviation": 0.02180827173,
1273
- "Rank": 31
1274
- },
1275
- "Social": {
1276
- "Average Score": 0.319991655,
1277
- "Standard Deviation": 0.04502478724,
1278
- "Rank": 31
1279
- }
1280
- }
1281
- },
1282
- {
1283
- "config": {
1284
- "model_name": "command-r-(08-2024)",
1285
- "organization": "Cohere",
1286
- "license": "CC-BY-NC-4.0",
1287
- "knowledge_cutoff": "2024-08"
1288
- },
1289
- "results": {
1290
- "OVERALL": {
1291
- "Average Score": 0.427605298,
1292
- "Standard Deviation": 0.01747449163,
1293
- "Rank": 26
1294
- },
1295
- "Geometry": {
1296
- "Average Score": 0.448300727,
1297
- "Standard Deviation": 0.04996362328,
1298
- "Rank": 29
1299
- },
1300
- "Algebra": {
1301
- "Average Score": 0.417519167,
1302
- "Standard Deviation": 0.01822196902,
1303
- "Rank": 32
1304
- },
1305
- "Probability": {
1306
- "Average Score": 0.366336281,
1307
- "Standard Deviation": 0.04716826942,
1308
- "Rank": 33
1309
- },
1310
- "Logical": {
1311
- "Average Score": 0.214657906,
1312
- "Standard Deviation": 0.03003579835,
1313
- "Rank": 38
1314
- },
1315
- "Social": {
1316
- "Average Score": 0.276088379,
1317
- "Standard Deviation": 0.03295234688,
1318
- "Rank": 34
1319
- }
1320
- }
1321
- },
1322
- {
1323
- "config": {
1324
- "model_name": "gemma-1.1-7b-it",
1325
- "organization": "Google",
1326
- "license": "Gemma License",
1327
- "knowledge_cutoff": "2023-11"
1328
- },
1329
- "results": {
1330
- "OVERALL": {
1331
- "Average Score": 0.339506922,
1332
- "Standard Deviation": 0.1066279108,
1333
- "Rank": 37
1334
- },
1335
- "Geometry": {
1336
- "Average Score": 0.324170977,
1337
- "Standard Deviation": 0.04668553765,
1338
- "Rank": 37
1339
- },
1340
- "Algebra": {
1341
- "Average Score": 0.398684697,
1342
- "Standard Deviation": 0.01982398259,
1343
- "Rank": 34
1344
- },
1345
- "Probability": {
1346
- "Average Score": 0.293253175,
1347
- "Standard Deviation": 0.05126192191,
1348
- "Rank": 38
1349
- },
1350
- "Logical": {
1351
- "Average Score": 0.317750796,
1352
- "Standard Deviation": 0.01101933543,
1353
- "Rank": 32
1354
- },
1355
- "Social": {
1356
- "Average Score": 0.179073276,
1357
- "Standard Deviation": 0.02009658805,
1358
- "Rank": 43
1359
- }
1360
- }
1361
- },
1362
- {
1363
- "config": {
1364
- "model_name": "llama3-8b-instruct",
1365
- "organization": "Meta",
1366
- "license": "Llama 3 Community",
1367
- "knowledge_cutoff": "2024-01"
1368
- },
1369
- "results": {
1370
- "OVERALL": {
1371
- "Average Score": 0.367722676,
1372
- "Standard Deviation": 0.1071368221,
1373
- "Rank": 32
1374
- },
1375
- "Geometry": {
1376
- "Average Score": 0.367143758,
1377
- "Standard Deviation": 0.04363680358,
1378
- "Rank": 35
1379
- },
1380
- "Algebra": {
1381
- "Average Score": 0.391480973,
1382
- "Standard Deviation": 0.02757445266,
1383
- "Rank": 35
1384
- },
1385
- "Probability": {
1386
- "Average Score": 0.317616445,
1387
- "Standard Deviation": 0.04300430361,
1388
- "Rank": 37
1389
- },
1390
- "Logical": {
1391
- "Average Score": 0.461607495,
1392
- "Standard Deviation": 0.02185028842,
1393
- "Rank": 23
1394
- },
1395
- "Social": {
1396
- "Average Score": 0.336373622,
1397
- "Standard Deviation": 0.05762408512,
1398
- "Rank": 28
1399
- }
1400
- }
1401
- },
1402
- {
1403
- "config": {
1404
- "model_name": "gemma-2-2b-it",
1405
- "organization": "Google",
1406
- "license": "Gemma License",
1407
- "knowledge_cutoff": "2023-12"
1408
- },
1409
- "results": {
1410
- "OVERALL": {
1411
- "Average Score": 0.502167612,
1412
- "Standard Deviation": 0.04389786763,
1413
- "Rank": 20
1414
- },
1415
- "Geometry": {
1416
- "Average Score": 0.395006676,
1417
- "Standard Deviation": 0.05882607713,
1418
- "Rank": 33
1419
- },
1420
- "Algebra": {
1421
- "Average Score": 0.379391887,
1422
- "Standard Deviation": 0.01722410785,
1423
- "Rank": 36
1424
- },
1425
- "Probability": {
1426
- "Average Score": 0.331231097,
1427
- "Standard Deviation": 0.05392499987,
1428
- "Rank": 36
1429
- },
1430
- "Logical": {
1431
- "Average Score": 0.367687789,
1432
- "Standard Deviation": 0.02547968808,
1433
- "Rank": 27
1434
- },
1435
- "Social": {
1436
- "Average Score": 0.393482094,
1437
- "Standard Deviation": 0.06450214024,
1438
- "Rank": 25
1439
- }
1440
- }
1441
- },
1442
- {
1443
- "config": {
1444
- "model_name": "starling-lm-7b-alpha",
1445
- "organization": "Nexusflow",
1446
- "license": "Apache-2.0",
1447
- "knowledge_cutoff": "2023-12"
1448
- },
1449
- "results": {
1450
- "OVERALL": {
1451
- "Average Score": 0.366628765,
1452
- "Standard Deviation": 0.08405492929,
1453
- "Rank": 33
1454
- },
1455
- "Geometry": {
1456
- "Average Score": 0.336782578,
1457
- "Standard Deviation": 0.04069449132,
1458
- "Rank": 36
1459
- },
1460
- "Algebra": {
1461
- "Average Score": 0.371551932,
1462
- "Standard Deviation": 0.03367241745,
1463
- "Rank": 37
1464
- },
1465
- "Probability": {
1466
- "Average Score": 0.331472505,
1467
- "Standard Deviation": 0.04833324282,
1468
- "Rank": 35
1469
- },
1470
- "Logical": {
1471
- "Average Score": 0.260869624,
1472
- "Standard Deviation": 0.03562735237,
1473
- "Rank": 36
1474
- },
1475
- "Social": {
1476
- "Average Score": 0.271975534,
1477
- "Standard Deviation": 0.04266753408,
1478
- "Rank": 35
1479
- }
1480
- }
1481
- },
1482
- {
1483
- "config": {
1484
- "model_name": "qwen1.5-4b-chat",
1485
- "organization": "Alibaba",
1486
- "license": "Qianwen LICENSE",
1487
- "knowledge_cutoff": "2024-02"
1488
- },
1489
- "results": {
1490
- "OVERALL": {
1491
- "Average Score": 0.111876411,
1492
- "Standard Deviation": 0.04241022785,
1493
- "Rank": 49
1494
- },
1495
- "Geometry": {
1496
- "Average Score": 0.215834522,
1497
- "Standard Deviation": 0.0363766363,
1498
- "Rank": 41
1499
- },
1500
- "Algebra": {
1501
- "Average Score": 0.305589811,
1502
- "Standard Deviation": 0.02354198912,
1503
- "Rank": 38
1504
- },
1505
- "Probability": {
1506
- "Average Score": 0.149365327,
1507
- "Standard Deviation": 0.03489672675,
1508
- "Rank": 45
1509
- },
1510
- "Logical": {
1511
- "Average Score": 0.116210168,
1512
- "Standard Deviation": 0.005927966496,
1513
- "Rank": 47
1514
- },
1515
- "Social": {
1516
- "Average Score": 0.18195615,
1517
- "Standard Deviation": 0.02269805277,
1518
- "Rank": 42
1519
- }
1520
- }
1521
- },
1522
- {
1523
- "config": {
1524
- "model_name": "command-r-(04-2024)",
1525
- "organization": "Cohere",
1526
- "license": "CC-BY-NC-4.0",
1527
- "knowledge_cutoff": "2024-04"
1528
- },
1529
- "results": {
1530
- "OVERALL": {
1531
- "Average Score": 0.388783887,
1532
- "Standard Deviation": 0.07417186783,
1533
- "Rank": 30
1534
- },
1535
- "Geometry": {
1536
- "Average Score": 0.300416698,
1537
- "Standard Deviation": 0.03485612736,
1538
- "Rank": 38
1539
- },
1540
- "Algebra": {
1541
- "Average Score": 0.293120231,
1542
- "Standard Deviation": 0.032926484,
1543
- "Rank": 39
1544
- },
1545
- "Probability": {
1546
- "Average Score": 0.281271304,
1547
- "Standard Deviation": 0.05697149867,
1548
- "Rank": 40
1549
- },
1550
- "Logical": {
1551
- "Average Score": 0.276189906,
1552
- "Standard Deviation": 0.03562914754,
1553
- "Rank": 34
1554
- },
1555
- "Social": {
1556
- "Average Score": 0.283882949,
1557
- "Standard Deviation": 0.03336901148,
1558
- "Rank": 33
1559
- }
1560
- }
1561
- },
1562
- {
1563
- "config": {
1564
- "model_name": "vicuna-33b",
1565
- "organization": "LMSYS",
1566
- "license": "Non-commercial",
1567
- "knowledge_cutoff": "2023-12"
1568
- },
1569
- "results": {
1570
- "OVERALL": {
1571
- "Average Score": 0.316543555,
1572
- "Standard Deviation": 0.08922095647,
1573
- "Rank": 39
1574
- },
1575
- "Geometry": {
1576
- "Average Score": 0.208284679,
1577
- "Standard Deviation": 0.03937771461,
1578
- "Rank": 42
1579
- },
1580
- "Algebra": {
1581
- "Average Score": 0.248994048,
1582
- "Standard Deviation": 0.02668175054,
1583
- "Rank": 41
1584
- },
1585
- "Probability": {
1586
- "Average Score": 0.222313995,
1587
- "Standard Deviation": 0.03978859759,
1588
- "Rank": 43
1589
- },
1590
- "Logical": {
1591
- "Average Score": 0.180291222,
1592
- "Standard Deviation": 0.021886267,
1593
- "Rank": 42
1594
- },
1595
- "Social": {
1596
- "Average Score": 0.257623798,
1597
- "Standard Deviation": 0.02653724437,
1598
- "Rank": 36
1599
- }
1600
- }
1601
- },
1602
- {
1603
- "config": {
1604
- "model_name": "gemma-7b-it",
1605
- "organization": "Google",
1606
- "license": "Gemma License",
1607
- "knowledge_cutoff": "2023-12"
1608
- },
1609
- "results": {
1610
- "OVERALL": {
1611
- "Average Score": 0.285077558,
1612
- "Standard Deviation": 0.08871758453,
1613
- "Rank": 41
1614
- },
1615
- "Geometry": {
1616
- "Average Score": 0.244791417,
1617
- "Standard Deviation": 0.0289612078,
1618
- "Rank": 39
1619
- },
1620
- "Algebra": {
1621
- "Average Score": 0.250614794,
1622
- "Standard Deviation": 0.01991678295,
1623
- "Rank": 40
1624
- },
1625
- "Probability": {
1626
- "Average Score": 0.174313053,
1627
- "Standard Deviation": 0.03765424728,
1628
- "Rank": 44
1629
- },
1630
- "Logical": {
1631
- "Average Score": 0.197505536,
1632
- "Standard Deviation": 0.02050298885,
1633
- "Rank": 39
1634
- },
1635
- "Social": {
1636
- "Average Score": 0.202138025,
1637
- "Standard Deviation": 0.02098346639,
1638
- "Rank": 41
1639
- }
1640
- }
1641
- },
1642
- {
1643
- "config": {
1644
- "model_name": "mistral-7b-instruct-2",
1645
- "organization": "Mistral",
1646
- "license": "Apache 2.0",
1647
- "knowledge_cutoff": "2023-12"
1648
- },
1649
- "results": {
1650
- "OVERALL": {
1651
- "Average Score": 0.427513868,
1652
- "Standard Deviation": 0.05553921135,
1653
- "Rank": 27
1654
- },
1655
- "Geometry": {
1656
- "Average Score": 0.216402626,
1657
- "Standard Deviation": 0.03338414918,
1658
- "Rank": 40
1659
- },
1660
- "Algebra": {
1661
- "Average Score": 0.233777838,
1662
- "Standard Deviation": 0.0155226054,
1663
- "Rank": 42
1664
- },
1665
- "Probability": {
1666
- "Average Score": 0.25118175,
1667
- "Standard Deviation": 0.04065514593,
1668
- "Rank": 41
1669
- },
1670
- "Logical": {
1671
- "Average Score": 0.224469136,
1672
- "Standard Deviation": 0.03404706752,
1673
- "Rank": 37
1674
- },
1675
- "Social": {
1676
- "Average Score": 0.209386782,
1677
- "Standard Deviation": 0.02738569921,
1678
- "Rank": 40
1679
- }
1680
- }
1681
- },
1682
- {
1683
- "config": {
1684
- "model_name": "mistral-7b-instruct-1",
1685
- "organization": "Mistral",
1686
- "license": "Apache 2.0",
1687
- "knowledge_cutoff": "2023-12"
1688
- },
1689
- "results": {
1690
- "OVERALL": {
1691
- "Average Score": 0.23016314,
1692
- "Standard Deviation": 0.07137625271,
1693
- "Rank": 46
1694
- },
1695
- "Geometry": {
1696
- "Average Score": 0.161799938,
1697
- "Standard Deviation": 0.03595278559,
1698
- "Rank": 46
1699
- },
1700
- "Algebra": {
1701
- "Average Score": 0.210341624,
1702
- "Standard Deviation": 0.01736539119,
1703
- "Rank": 43
1704
- },
1705
- "Probability": {
1706
- "Average Score": 0.238417922,
1707
- "Standard Deviation": 0.03744211933,
1708
- "Rank": 42
1709
- },
1710
- "Logical": {
1711
- "Average Score": 0.142636601,
1712
- "Standard Deviation": 0.02080406365,
1713
- "Rank": 46
1714
- },
1715
- "Social": {
1716
- "Average Score": 0.117646827,
1717
- "Standard Deviation": 0.009321202779,
1718
- "Rank": 49
1719
- }
1720
- }
1721
- },
1722
- {
1723
- "config": {
1724
- "model_name": "vicuna-13b",
1725
- "organization": "LMSYS",
1726
- "license": "Non-commercial",
1727
- "knowledge_cutoff": "2023-11"
1728
- },
1729
- "results": {
1730
- "OVERALL": {
1731
- "Average Score": 0.201892849,
1732
- "Standard Deviation": 0.06021749802,
1733
- "Rank": 47
1734
- },
1735
- "Geometry": {
1736
- "Average Score": 0.200941928,
1737
- "Standard Deviation": 0.03366817781,
1738
- "Rank": 43
1739
- },
1740
- "Algebra": {
1741
- "Average Score": 0.196123323,
1742
- "Standard Deviation": 0.0135715643,
1743
- "Rank": 44
1744
- },
1745
- "Probability": {
1746
- "Average Score": 0.141214079,
1747
- "Standard Deviation": 0.02721328211,
1748
- "Rank": 46
1749
- },
1750
- "Logical": {
1751
- "Average Score": 0.148598631,
1752
- "Standard Deviation": 0.02241523892,
1753
- "Rank": 44
1754
- },
1755
- "Social": {
1756
- "Average Score": 0.124655135,
1757
- "Standard Deviation": 0.01122382671,
1758
- "Rank": 48
1759
- }
1760
- }
1761
- },
1762
- {
1763
- "config": {
1764
- "model_name": "zephyr-7b-beta",
1765
- "organization": "HuggingFace",
1766
- "license": "MIT",
1767
- "knowledge_cutoff": "2023-10"
1768
- },
1769
- "results": {
1770
- "OVERALL": {
1771
- "Average Score": 0.102705119,
1772
- "Standard Deviation": 0.03683757312,
1773
- "Rank": 50
1774
- },
1775
- "Geometry": {
1776
- "Average Score": 0.114005544,
1777
- "Standard Deviation": 0.03144354365,
1778
- "Rank": 47
1779
- },
1780
- "Algebra": {
1781
- "Average Score": 0.141766633,
1782
- "Standard Deviation": 0.03179520129,
1783
- "Rank": 45
1784
- },
1785
- "Probability": {
1786
- "Average Score": 0.089050714,
1787
- "Standard Deviation": 0.002136754266,
1788
- "Rank": 49
1789
- },
1790
- "Logical": {
1791
- "Average Score": 0.069520789,
1792
- "Standard Deviation": 0.004477840857,
1793
- "Rank": 51
1794
- },
1795
- "Social": {
1796
- "Average Score": 0.0,
1797
- "Standard Deviation": 0.0,
1798
- "Rank": 54
1799
- }
1800
- }
1801
- },
1802
- {
1803
- "config": {
1804
- "model_name": "gemma-1.1-2b-it",
1805
- "organization": "Google",
1806
- "license": "Gemma License",
1807
- "knowledge_cutoff": "2023-12"
1808
- },
1809
- "results": {
1810
- "OVERALL": {
1811
- "Average Score": 0.257700845,
1812
- "Standard Deviation": 0.07369021445,
1813
- "Rank": 44
1814
- },
1815
- "Geometry": {
1816
- "Average Score": 0.183974034,
1817
- "Standard Deviation": 0.0215548886,
1818
- "Rank": 45
1819
- },
1820
- "Algebra": {
1821
- "Average Score": 0.13422252,
1822
- "Standard Deviation": 0.01922819511,
1823
- "Rank": 46
1824
- },
1825
- "Probability": {
1826
- "Average Score": 0.095628657,
1827
- "Standard Deviation": 0.007536076456,
1828
- "Rank": 48
1829
- },
1830
- "Logical": {
1831
- "Average Score": 0.094965074,
1832
- "Standard Deviation": 0.005019175487,
1833
- "Rank": 49
1834
- },
1835
- "Social": {
1836
- "Average Score": 0.167796727,
1837
- "Standard Deviation": 0.01666541942,
1838
- "Rank": 44
1839
- }
1840
- }
1841
- },
1842
- {
1843
- "config": {
1844
- "model_name": "llama2-7b-chat",
1845
- "organization": "Meta",
1846
- "license": "Llama 2 Community",
1847
- "knowledge_cutoff": "2023-10"
1848
- },
1849
- "results": {
1850
- "OVERALL": {
1851
- "Average Score": 0.260189428,
1852
- "Standard Deviation": 0.08019299364,
1853
- "Rank": 43
1854
- },
1855
- "Geometry": {
1856
- "Average Score": 0.087067276,
1857
- "Standard Deviation": 0.04274343402,
1858
- "Rank": 48
1859
- },
1860
- "Algebra": {
1861
- "Average Score": 0.12308805,
1862
- "Standard Deviation": 0.01856053622,
1863
- "Rank": 47
1864
- },
1865
- "Probability": {
1866
- "Average Score": 0.087515438,
1867
- "Standard Deviation": 0.006315053573,
1868
- "Rank": 50
1869
- },
1870
- "Logical": {
1871
- "Average Score": 0.17312827,
1872
- "Standard Deviation": 0.01867044092,
1873
- "Rank": 43
1874
- },
1875
- "Social": {
1876
- "Average Score": 0.152905272,
1877
- "Standard Deviation": 0.007166957097,
1878
- "Rank": 45
1879
- }
1880
- }
1881
- },
1882
- {
1883
- "config": {
1884
- "model_name": "gemma-2b-it",
1885
- "organization": "Google",
1886
- "license": "Gemma License",
1887
- "knowledge_cutoff": "2023-11"
1888
- },
1889
- "results": {
1890
- "OVERALL": {
1891
- "Average Score": 0.234172069,
1892
- "Standard Deviation": 0.06522685718,
1893
- "Rank": 45
1894
- },
1895
- "Geometry": {
1896
- "Average Score": 0.198571153,
1897
- "Standard Deviation": 0.01699161031,
1898
- "Rank": 44
1899
- },
1900
- "Algebra": {
1901
- "Average Score": 0.109883009,
1902
- "Standard Deviation": 0.01520005833,
1903
- "Rank": 48
1904
- },
1905
- "Probability": {
1906
- "Average Score": 0.06467432,
1907
- "Standard Deviation": 0.002117497231,
1908
- "Rank": 52
1909
- },
1910
- "Logical": {
1911
- "Average Score": 0.039624492,
1912
- "Standard Deviation": 0.007606972686,
1913
- "Rank": 52
1914
- },
1915
- "Social": {
1916
- "Average Score": 0.087452913,
1917
- "Standard Deviation": 0.008170146562,
1918
- "Rank": 52
1919
- }
1920
- }
1921
- },
1922
- {
1923
- "config": {
1924
- "model_name": "llama2-13b-chat",
1925
- "organization": "Meta",
1926
- "license": "Llama 2 Community",
1927
- "knowledge_cutoff": "2023-12"
1928
- },
1929
- "results": {
1930
- "OVERALL": {
1931
- "Average Score": 0.263305684,
1932
- "Standard Deviation": 0.07283640689,
1933
- "Rank": 42
1934
- },
1935
- "Geometry": {
1936
- "Average Score": 0.072729954,
1937
- "Standard Deviation": 0.02315988261,
1938
- "Rank": 50
1939
- },
1940
- "Algebra": {
1941
- "Average Score": 0.080371692,
1942
- "Standard Deviation": 0.01277569453,
1943
- "Rank": 49
1944
- },
1945
- "Probability": {
1946
- "Average Score": 0.117757344,
1947
- "Standard Deviation": 0.02418619619,
1948
- "Rank": 47
1949
- },
1950
- "Logical": {
1951
- "Average Score": 0.193149889,
1952
- "Standard Deviation": 0.01776690764,
1953
- "Rank": 40
1954
- },
1955
- "Social": {
1956
- "Average Score": 0.149125922,
1957
- "Standard Deviation": 0.01157416827,
1958
- "Rank": 46
1959
- }
1960
- }
1961
- },
1962
- {
1963
- "config": {
1964
- "model_name": "vicuna-7b",
1965
- "organization": "LMSYS",
1966
- "license": "Non-commercial",
1967
- "knowledge_cutoff": "2023-11"
1968
- },
1969
- "results": {
1970
- "OVERALL": {
1971
- "Average Score": 0.198839786,
1972
- "Standard Deviation": 0.05725381576,
1973
- "Rank": 48
1974
- },
1975
- "Geometry": {
1976
- "Average Score": 0.083457058,
1977
- "Standard Deviation": 0.02520989111,
1978
- "Rank": 49
1979
- },
1980
- "Algebra": {
1981
- "Average Score": 0.070883882,
1982
- "Standard Deviation": 0.007315853253,
1983
- "Rank": 50
1984
- },
1985
- "Probability": {
1986
- "Average Score": 0.080987673,
1987
- "Standard Deviation": 0.005474288861,
1988
- "Rank": 51
1989
- },
1990
- "Logical": {
1991
- "Average Score": 0.100065588,
1992
- "Standard Deviation": 0.003561886452,
1993
- "Rank": 48
1994
- },
1995
- "Social": {
1996
- "Average Score": 0.111076414,
1997
- "Standard Deviation": 0.004805626512,
1998
- "Rank": 50
1999
- }
2000
- }
2001
- },
2002
- {
2003
- "config": {
2004
- "model_name": "koala-13b",
2005
- "organization": "UC Berkeley",
2006
- "license": "Non-commercial",
2007
- "knowledge_cutoff": "2023-10"
2008
- },
2009
- "results": {
2010
- "OVERALL": {
2011
- "Average Score": 0.09387188,
2012
- "Standard Deviation": 0.02642167489,
2013
- "Rank": 51
2014
- },
2015
- "Geometry": {
2016
- "Average Score": 0.017374001,
2017
- "Standard Deviation": 0.01747053557,
2018
- "Rank": 51
2019
- },
2020
- "Algebra": {
2021
- "Average Score": 0.018129197,
2022
- "Standard Deviation": 0.01054371383,
2023
- "Rank": 51
2024
- },
2025
- "Probability": {
2026
- "Average Score": 0.043654362,
2027
- "Standard Deviation": 0.004288231886,
2028
- "Rank": 53
2029
- },
2030
- "Logical": {
2031
- "Average Score": 0.074694053,
2032
- "Standard Deviation": 0.002674646998,
2033
- "Rank": 50
2034
- },
2035
- "Social": {
2036
- "Average Score": 0.096983835,
2037
- "Standard Deviation": 0.007847059783,
2038
- "Rank": 51
2039
- }
2040
- }
2041
- },
2042
- {
2043
- "config": {
2044
- "model_name": "openassistant-pythia-12b",
2045
- "organization": "OpenAssistant",
2046
- "license": "Non-commercial",
2047
- "knowledge_cutoff": "2023-09"
2048
- },
2049
- "results": {
2050
- "OVERALL": {
2051
- "Average Score": 0.0,
2052
- "Standard Deviation": 0.0,
2053
- "Rank": 52
2054
- },
2055
- "Geometry": {
2056
- "Average Score": 0.0,
2057
- "Standard Deviation": 0.0,
2058
- "Rank": 52
2059
- },
2060
- "Algebra": {
2061
- "Average Score": 0.0,
2062
- "Standard Deviation": 0.0,
2063
- "Rank": 52
2064
- },
2065
- "Probability": {
2066
- "Average Score": 0.0,
2067
- "Standard Deviation": 0.0,
2068
- "Rank": 54
2069
- },
2070
- "Logical": {
2071
- "Average Score": 0.0,
2072
- "Standard Deviation": 0.0,
2073
- "Rank": 53
2074
- },
2075
- "Social": {
2076
- "Average Score": 0.030792528,
2077
- "Standard Deviation": 0.007518796391,
2078
- "Rank": 53
2079
- }
2080
- }
2081
- }
2082
- ]
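The deleted results files above and below share one schema: a JSON array of entries, each holding a "config" block (model_name, organization, license, knowledge_cutoff) and a "results" block keyed by domain (OVERALL, Geometry, Algebra, Probability, Logical, Social), where every domain carries an Average Score, a Standard Deviation, and a Rank (or the string "N/A"). Below is a minimal sketch of how such a file could be flattened into a per-domain leaderboard table; the load_leaderboard helper, its column names, and the placeholder path are illustrative assumptions, not code from this repository. Note that although the files use a .jsonl extension, the content shown is a single pretty-printed JSON array, so a plain json.load is assumed.

import json
import pandas as pd

def load_leaderboard(path: str, domain: str = "OVERALL") -> pd.DataFrame:
    """Hypothetical helper: flatten one results file into a leaderboard table for one domain."""
    with open(path) as f:
        entries = json.load(f)  # despite the .jsonl name, the content shown is one JSON array

    rows = []
    for entry in entries:
        cfg = entry["config"]
        res = entry["results"].get(domain, {})
        rows.append({
            "Model": cfg["model_name"],
            "Organization": cfg["organization"],
            "License": cfg["license"],
            "Knowledge Cutoff": cfg["knowledge_cutoff"],
            # Scores may be floats or the literal string "N/A" (see the o1-mini / o1-preview entries below).
            "Average Score": res.get("Average Score", "N/A"),
            "Standard Deviation": res.get("Standard Deviation", "N/A"),
            "Rank": res.get("Rank", "N/A"),
        })
    return pd.DataFrame(rows)

# Example usage (hypothetical path): df = load_leaderboard("./src/results/<results-file>.json", domain="Algebra")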
 
src/results/models_2024-10-08-17:39:21.001582.jsonl DELETED
@@ -1,2082 +0,0 @@
1
- [
2
- {
3
- "config": {
4
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
- "organization": "OpenAI",
6
- "license": "Proprietary",
7
- "knowledge_cutoff": "2023/10"
8
- },
9
- "results": {
10
- "OVERALL": {
11
- "Average Score": 0.974329609,
12
- "Standard Deviation": 0.005024959031,
13
- "Rank": 2
14
- },
15
- "Geometry": {
16
- "Average Score": 0.976028578,
17
- "Standard Deviation": 0.01507912373,
18
- "Rank": 3
19
- },
20
- "Algebra": {
21
- "Average Score": 0.951199453,
22
- "Standard Deviation": 0.08452452108,
23
- "Rank": 3
24
- },
25
- "Probability": {
26
- "Average Score": 0.842116641,
27
- "Standard Deviation": 0.006267759054,
28
- "Rank": 3
29
- },
30
- "Logical": {
31
- "Average Score": 0.828490728,
32
- "Standard Deviation": 0.009134213144,
33
- "Rank": 3
34
- },
35
- "Social": {
36
- "Average Score": 0.815902987,
37
- "Standard Deviation": 0.0196254222,
38
- "Rank": 4
39
- }
40
- }
41
- },
42
- {
43
- "config": {
44
- "model_name": "gpt-4o-2024-08-06",
45
- "organization": "OpenAI",
46
- "license": "Proprietary",
47
- "knowledge_cutoff": "2023/10"
48
- },
49
- "results": {
50
- "OVERALL": {
51
- "Average Score": 0.846571548,
52
- "Standard Deviation": 0.03394056554,
53
- "Rank": 6
54
- },
55
- "Geometry": {
56
- "Average Score": 0.99773096,
57
- "Standard Deviation": 0.002835555172,
58
- "Rank": 1
59
- },
60
- "Algebra": {
61
- "Average Score": 1.0,
62
- "Standard Deviation": 0.0,
63
- "Rank": 1
64
- },
65
- "Probability": {
66
- "Average Score": 0.78855795,
67
- "Standard Deviation": 0.008188675452,
68
- "Rank": 6
69
- },
70
- "Logical": {
71
- "Average Score": 0.668635768,
72
- "Standard Deviation": 0.03466314094,
73
- "Rank": 11
74
- },
75
- "Social": {
76
- "Average Score": 0.680417314,
77
- "Standard Deviation": 0.00656867063,
78
- "Rank": 9
79
- }
80
- }
81
- },
82
- {
83
- "config": {
84
- "model_name": "gpt-4o-2024-05-13",
85
- "organization": "OpenAI",
86
- "license": "Proprietary",
87
- "knowledge_cutoff": "2023/10"
88
- },
89
- "results": {
90
- "OVERALL": {
91
- "Average Score": 0.846334477,
92
- "Standard Deviation": 0.09377911572,
93
- "Rank": 7
94
- },
95
- "Geometry": {
96
- "Average Score": 0.972472377,
97
- "Standard Deviation": 0.01648274205,
98
- "Rank": 4
99
- },
100
- "Algebra": {
101
- "Average Score": 0.995511298,
102
- "Standard Deviation": 0.004097802515,
103
- "Rank": 2
104
- },
105
- "Probability": {
106
- "Average Score": 0.812149974,
107
- "Standard Deviation": 0.007669585485,
108
- "Rank": 4
109
- },
110
- "Logical": {
111
- "Average Score": 0.755019692,
112
- "Standard Deviation": 0.008149588572,
113
- "Rank": 6
114
- },
115
- "Social": {
116
- "Average Score": 0.609875087,
117
- "Standard Deviation": 0.038729239,
118
- "Rank": 14
119
- }
120
- }
121
- },
122
- {
123
- "config": {
124
- "model_name": "gpt-4-turbo-2024-04-09",
125
- "organization": "OpenAI",
126
- "license": "Proprietary",
127
- "knowledge_cutoff": "2023/12"
128
- },
129
- "results": {
130
- "OVERALL": {
131
- "Average Score": 0.855357972,
132
- "Standard Deviation": 0.1016986368,
133
- "Rank": 4
134
- },
135
- "Geometry": {
136
- "Average Score": 0.95374588,
137
- "Standard Deviation": 0.03109307166,
138
- "Rank": 5
139
- },
140
- "Algebra": {
141
- "Average Score": 0.930945223,
142
- "Standard Deviation": 0.06705136813,
143
- "Rank": 4
144
- },
145
- "Probability": {
146
- "Average Score": 0.750705448,
147
- "Standard Deviation": 0.05944483103,
148
- "Rank": 8
149
- },
150
- "Logical": {
151
- "Average Score": 0.77906699,
152
- "Standard Deviation": 0.007406734161,
153
- "Rank": 4
154
- },
155
- "Social": {
156
- "Average Score": 0.715935163,
157
- "Standard Deviation": 0.1209141409,
158
- "Rank": 7
159
- }
160
- }
161
- },
162
- {
163
- "config": {
164
- "model_name": "gemini-1.5-pro-001",
165
- "organization": "Google",
166
- "license": "Proprietary",
167
- "knowledge_cutoff": "2023/11"
168
- },
169
- "results": {
170
- "OVERALL": {
171
- "Average Score": 0.797187842,
172
- "Standard Deviation": 0.0272375249,
173
- "Rank": 10
174
- },
175
- "Geometry": {
176
- "Average Score": 0.9947169,
177
- "Standard Deviation": 0.009150597621,
178
- "Rank": 2
179
- },
180
- "Algebra": {
181
- "Average Score": 0.857464301,
182
- "Standard Deviation": 0.05014285338,
183
- "Rank": 5
184
- },
185
- "Probability": {
186
- "Average Score": 0.651781767,
187
- "Standard Deviation": 0.04156998547,
188
- "Rank": 12
189
- },
190
- "Logical": {
191
- "Average Score": 0.739745471,
192
- "Standard Deviation": 0.01631532019,
193
- "Rank": 7
194
- },
195
- "Social": {
196
- "Average Score": 0.649601885,
197
- "Standard Deviation": 0.104854889,
198
- "Rank": 12
199
- }
200
- }
201
- },
202
- {
203
- "config": {
204
- "model_name": "qwen2-72b-instruct",
205
- "organization": "Alibaba",
206
- "license": "Qianwen LICENSE",
207
- "knowledge_cutoff": "2024/09"
208
- },
209
- "results": {
210
- "OVERALL": {
211
- "Average Score": 0.737918558,
212
- "Standard Deviation": 0.09069077339,
213
- "Rank": 11
214
- },
215
- "Geometry": {
216
- "Average Score": 0.796870305,
217
- "Standard Deviation": 0.0509025346,
218
- "Rank": 9
219
- },
220
- "Algebra": {
221
- "Average Score": 0.836194231,
222
- "Standard Deviation": 0.04517093028,
223
- "Rank": 6
224
- },
225
- "Probability": {
226
- "Average Score": 0.788068004,
227
- "Standard Deviation": 0.007288989044,
228
- "Rank": 7
229
- },
230
- "Logical": {
231
- "Average Score": 0.619300904,
232
- "Standard Deviation": 0.06377931612,
233
- "Rank": 15
234
- },
235
- "Social": {
236
- "Average Score": 0.652578786,
237
- "Standard Deviation": 0.04259293171,
238
- "Rank": 11
239
- }
240
- }
241
- },
242
- {
243
- "config": {
244
- "model_name": "gpt-4o-mini-2024-07-18",
245
- "organization": "OpenAI",
246
- "license": "Proprietary",
247
- "knowledge_cutoff": "2023/10"
248
- },
249
- "results": {
250
- "OVERALL": {
251
- "Average Score": 0.847694133,
252
- "Standard Deviation": 0.02164304402,
253
- "Rank": 5
254
- },
255
- "Geometry": {
256
- "Average Score": 0.946650435,
257
- "Standard Deviation": 0.01831236482,
258
- "Rank": 7
259
- },
260
- "Algebra": {
261
- "Average Score": 0.796243022,
262
- "Standard Deviation": 0.05537539202,
263
- "Rank": 7
264
- },
265
- "Probability": {
266
- "Average Score": 0.798402685,
267
- "Standard Deviation": 0.009404491967,
268
- "Rank": 5
269
- },
270
- "Logical": {
271
- "Average Score": 0.727009735,
272
- "Standard Deviation": 0.02628110141,
273
- "Rank": 8
274
- },
275
- "Social": {
276
- "Average Score": 0.691949855,
277
- "Standard Deviation": 0.02072934333,
278
- "Rank": 8
279
- }
280
- }
281
- },
282
- {
283
- "config": {
284
- "model_name": "claude-3.5-sonnet",
285
- "organization": "Anthropic",
286
- "license": "Proprietary",
287
- "knowledge_cutoff": "2024/04"
288
- },
289
- "results": {
290
- "OVERALL": {
291
- "Average Score": 0.839004422,
292
- "Standard Deviation": 0.1461079564,
293
- "Rank": 8
294
- },
295
- "Geometry": {
296
- "Average Score": 0.95316419,
297
- "Standard Deviation": 0.02081192856,
298
- "Rank": 6
299
- },
300
- "Algebra": {
301
- "Average Score": 0.759789952,
302
- "Standard Deviation": 0.02611765096,
303
- "Rank": 8
304
- },
305
- "Probability": {
306
- "Average Score": 0.707730127,
307
- "Standard Deviation": 0.0394436664,
308
- "Rank": 10
309
- },
310
- "Logical": {
311
- "Average Score": 0.77342666,
312
- "Standard Deviation": 0.002892426458,
313
- "Rank": 5
314
- },
315
- "Social": {
316
- "Average Score": 0.790002247,
317
- "Standard Deviation": 0.1007410022,
318
- "Rank": 5
319
- }
320
- }
321
- },
322
- {
323
- "config": {
324
- "model_name": "o1-mini",
325
- "organization": "OpenAI",
326
- "license": "Proprietary",
327
- "knowledge_cutoff": "2023/10"
328
- },
329
- "results": {
330
- "OVERALL": {
331
- "Average Score": 1.0,
332
- "Standard Deviation": 0.0,
333
- "Rank": 1
334
- },
335
- "Geometry": {
336
- "Average Score": "N/A",
337
- "Standard Deviation": "N/A",
338
- "Rank": "N/A"
339
- },
340
- "Algebra": {
341
- "Average Score": "N/A",
342
- "Standard Deviation": "N/A",
343
- "Rank": "N/A"
344
- },
345
- "Probability": {
346
- "Average Score": 1.0,
347
- "Standard Deviation": 0.0,
348
- "Rank": 1
349
- },
350
- "Logical": {
351
- "Average Score": 1.0,
352
- "Standard Deviation": 0.0,
353
- "Rank": 1
354
- },
355
- "Social": {
356
- "Average Score": 0.993974241,
357
- "Standard Deviation": 0.001996882328,
358
- "Rank": 2
359
- }
360
- }
361
- },
362
- {
363
- "config": {
364
- "model_name": "o1-preview",
365
- "organization": "OpenAI",
366
- "license": "Proprietary",
367
- "knowledge_cutoff": "2023/10"
368
- },
369
- "results": {
370
- "OVERALL": {
371
- "Average Score": 0.945884589,
372
- "Standard Deviation": 0.01059250762,
373
- "Rank": 3
374
- },
375
- "Geometry": {
376
- "Average Score": "N/A",
377
- "Standard Deviation": "N/A",
378
- "Rank": "N/A"
379
- },
380
- "Algebra": {
381
- "Average Score": "N/A",
382
- "Standard Deviation": "N/A",
383
- "Rank": "N/A"
384
- },
385
- "Probability": {
386
- "Average Score": 0.964666392,
387
- "Standard Deviation": 0.003139983398,
388
- "Rank": 2
389
- },
390
- "Logical": {
391
- "Average Score": 0.987950057,
392
- "Standard Deviation": 0.004881220327,
393
- "Rank": 2
394
- },
395
- "Social": {
396
- "Average Score": 1.0,
397
- "Standard Deviation": 0.0,
398
- "Rank": 1
399
- }
400
- }
401
- },
402
- {
403
- "config": {
404
- "model_name": "gemini-1.5-flash-001",
405
- "organization": "Google",
406
- "license": "Proprietary",
407
- "knowledge_cutoff": "2023/11"
408
- },
409
- "results": {
410
- "OVERALL": {
411
- "Average Score": 0.726493401,
412
- "Standard Deviation": 0.01113913725,
413
- "Rank": 12
414
- },
415
- "Geometry": {
416
- "Average Score": 0.804144103,
417
- "Standard Deviation": 0.1327142178,
418
- "Rank": 8
419
- },
420
- "Algebra": {
421
- "Average Score": 0.731776765,
422
- "Standard Deviation": 0.02594657111,
423
- "Rank": 11
424
- },
425
- "Probability": {
426
- "Average Score": 0.614461891,
427
- "Standard Deviation": 0.04690131826,
428
- "Rank": 15
429
- },
430
- "Logical": {
431
- "Average Score": 0.630805991,
432
- "Standard Deviation": 0.04871350612,
433
- "Rank": 13
434
- },
435
- "Social": {
436
- "Average Score": 0.555933822,
437
- "Standard Deviation": 0.1029934524,
438
- "Rank": 16
439
- }
440
- }
441
- },
442
- {
443
- "config": {
444
- "model_name": "gpt4-1106",
445
- "organization": "OpenAI",
446
- "license": "Proprietary",
447
- "knowledge_cutoff": "2024/04"
448
- },
449
- "results": {
450
- "OVERALL": {
451
- "Average Score": 0.816347784,
452
- "Standard Deviation": 0.1566815755,
453
- "Rank": 9
454
- },
455
- "Geometry": {
456
- "Average Score": 0.71843088,
457
- "Standard Deviation": 0.04778038294,
458
- "Rank": 13
459
- },
460
- "Algebra": {
461
- "Average Score": 0.712910417,
462
- "Standard Deviation": 0.02581828898,
463
- "Rank": 12
464
- },
465
- "Probability": {
466
- "Average Score": 0.623947619,
467
- "Standard Deviation": 0.03502982933,
468
- "Rank": 14
469
- },
470
- "Logical": {
471
- "Average Score": 0.637482274,
472
- "Standard Deviation": 0.04158809888,
473
- "Rank": 12
474
- },
475
- "Social": {
476
- "Average Score": 0.450609816,
477
- "Standard Deviation": 0.05208655446,
478
- "Rank": 23
479
- }
480
- }
481
- },
482
- {
483
- "config": {
484
- "model_name": "gemma-2-27b-it",
485
- "organization": "Google",
486
- "license": "Gemma License",
487
- "knowledge_cutoff": "2024/06"
488
- },
489
- "results": {
490
- "OVERALL": {
491
- "Average Score": 0.624169623,
492
- "Standard Deviation": 0.1048365121,
493
- "Rank": 15
494
- },
495
- "Geometry": {
496
- "Average Score": 0.60112744,
497
- "Standard Deviation": 0.0469109952,
498
- "Rank": 19
499
- },
500
- "Algebra": {
501
- "Average Score": 0.687955914,
502
- "Standard Deviation": 0.01959958192,
503
- "Rank": 13
504
- },
505
- "Probability": {
506
- "Average Score": 0.589524771,
507
- "Standard Deviation": 0.03112689325,
508
- "Rank": 16
509
- },
510
- "Logical": {
511
- "Average Score": 0.614978944,
512
- "Standard Deviation": 0.05710657859,
513
- "Rank": 16
514
- },
515
- "Social": {
516
- "Average Score": 0.487844257,
517
- "Standard Deviation": 0.05857760809,
518
- "Rank": 20
519
- }
520
- }
521
- },
522
- {
523
- "config": {
524
- "model_name": "claude-3-opus",
525
- "organization": "Anthropic",
526
- "license": "Proprietary",
527
- "knowledge_cutoff": "2023/08"
528
- },
529
- "results": {
530
- "OVERALL": {
531
- "Average Score": 0.650636271,
532
- "Standard Deviation": 0.1197773541,
533
- "Rank": 14
534
- },
535
- "Geometry": {
536
- "Average Score": 0.7215743,
537
- "Standard Deviation": 0.04712598358,
538
- "Rank": 12
539
- },
540
- "Algebra": {
541
- "Average Score": 0.68777327,
542
- "Standard Deviation": 0.02382683713,
543
- "Rank": 14
544
- },
545
- "Probability": {
546
- "Average Score": 0.626471421,
547
- "Standard Deviation": 0.02911817976,
548
- "Rank": 13
549
- },
550
- "Logical": {
551
- "Average Score": 0.692346381,
552
- "Standard Deviation": 0.03617185198,
553
- "Rank": 10
554
- },
555
- "Social": {
556
- "Average Score": 0.663410854,
557
- "Standard Deviation": 0.09540220876,
558
- "Rank": 10
559
- }
560
- }
561
- },
562
- {
563
- "config": {
564
- "model_name": "gemma-2-9b-it-simpo",
565
- "organization": "Google",
566
- "license": "Gemma License",
567
- "knowledge_cutoff": "2024/07"
568
- },
569
- "results": {
570
- "OVERALL": {
571
- "Average Score": "N/A",
572
- "Standard Deviation": "N/A",
573
- "Rank": "N/A"
574
- },
575
- "Geometry": {
576
- "Average Score": 0.582787508,
577
- "Standard Deviation": 0.03965204074,
578
- "Rank": 20
579
- },
580
- "Algebra": {
581
- "Average Score": 0.658648133,
582
- "Standard Deviation": 0.02565919856,
583
- "Rank": 15
584
- },
585
- "Probability": {
586
- "Average Score": 0.547861265,
587
- "Standard Deviation": 0.02885209131,
588
- "Rank": 19
589
- },
590
- "Logical": {
591
- "Average Score": 0.540720893,
592
- "Standard Deviation": 0.01970134508,
593
- "Rank": 20
594
- },
595
- "Social": {
596
- "Average Score": 0.635266187,
597
- "Standard Deviation": 0.03620021751,
598
- "Rank": 13
599
- }
600
- }
601
- },
602
- {
603
- "config": {
604
- "model_name": "qwen1.5-72b-chat",
605
- "organization": "Alibaba",
606
- "license": "Qianwen LICENSE",
607
- "knowledge_cutoff": "2024/03"
608
- },
609
- "results": {
610
- "OVERALL": {
611
- "Average Score": 0.519549796,
612
- "Standard Deviation": 0.00903634343,
613
- "Rank": 18
614
- },
615
- "Geometry": {
616
- "Average Score": 0.543139301,
617
- "Standard Deviation": 0.03425202326,
618
- "Rank": 24
619
- },
620
- "Algebra": {
621
- "Average Score": 0.635228729,
622
- "Standard Deviation": 0.01944043425,
623
- "Rank": 16
624
- },
625
- "Probability": {
626
- "Average Score": 0.486948658,
627
- "Standard Deviation": 0.06064655315,
628
- "Rank": 23
629
- },
630
- "Logical": {
631
- "Average Score": 0.284069394,
632
- "Standard Deviation": 0.02686608506,
633
- "Rank": 33
634
- },
635
- "Social": {
636
- "Average Score": 0.415007627,
637
- "Standard Deviation": 0.03920053159,
638
- "Rank": 24
639
- }
640
- }
641
- },
642
- {
643
- "config": {
644
- "model_name": "qwen1.5-32b-chat",
645
- "organization": "Alibaba",
646
- "license": "Qianwen LICENSE",
647
- "knowledge_cutoff": "2024/03"
648
- },
649
- "results": {
650
- "OVERALL": {
651
- "Average Score": 0.393789407,
652
- "Standard Deviation": 0.05413770095,
653
- "Rank": 29
654
- },
655
- "Geometry": {
656
- "Average Score": 0.51086835,
657
- "Standard Deviation": 0.04052471998,
658
- "Rank": 27
659
- },
660
- "Algebra": {
661
- "Average Score": 0.609003168,
662
- "Standard Deviation": 0.04874143541,
663
- "Rank": 17
664
- },
665
- "Probability": {
666
- "Average Score": 0.476300002,
667
- "Standard Deviation": 0.05322403912,
668
- "Rank": 24
669
- },
670
- "Logical": {
671
- "Average Score": 0.331781014,
672
- "Standard Deviation": 0.004938997686,
673
- "Rank": 30
674
- },
675
- "Social": {
676
- "Average Score": 0.380987334,
677
- "Standard Deviation": 0.03762251776,
678
- "Rank": 26
679
- }
680
- }
681
- },
682
- {
683
- "config": {
684
- "model_name": "google-gemma-2-9b-it",
685
- "organization": "Google",
686
- "license": "Proprietary",
687
- "knowledge_cutoff": "2024/06"
688
- },
689
- "results": {
690
- "OVERALL": {
691
- "Average Score": 0.489663449,
692
- "Standard Deviation": 0.002595702019,
693
- "Rank": 21
694
- },
695
- "Geometry": {
696
- "Average Score": 0.575371308,
697
- "Standard Deviation": 0.03556220251,
698
- "Rank": 22
699
- },
700
- "Algebra": {
701
- "Average Score": 0.597045661,
702
- "Standard Deviation": 0.0313828123,
703
- "Rank": 18
704
- },
705
- "Probability": {
706
- "Average Score": 0.589221807,
707
- "Standard Deviation": 0.03110811656,
708
- "Rank": 18
709
- },
710
- "Logical": {
711
- "Average Score": 0.587579897,
712
- "Standard Deviation": 0.05512716783,
713
- "Rank": 18
714
- },
715
- "Social": {
716
- "Average Score": 0.768337958,
717
- "Standard Deviation": 0.04078610476,
718
- "Rank": 6
719
- }
720
- }
721
- },
722
- {
723
- "config": {
724
- "model_name": "yi-1.5-34b-chat",
725
- "organization": "01 AI",
726
- "license": "Proprietary",
727
- "knowledge_cutoff": "2024/05"
728
- },
729
- "results": {
730
- "OVERALL": {
731
- "Average Score": 0.607812897,
732
- "Standard Deviation": 0.1440881293,
733
- "Rank": 16
734
- },
735
- "Geometry": {
736
- "Average Score": 0.566666724,
737
- "Standard Deviation": 0.04001381658,
738
- "Rank": 23
739
- },
740
- "Algebra": {
741
- "Average Score": 0.590997292,
742
- "Standard Deviation": 0.03594087315,
743
- "Rank": 19
744
- },
745
- "Probability": {
746
- "Average Score": 0.589524589,
747
- "Standard Deviation": 0.03112618772,
748
- "Rank": 17
749
- },
750
- "Logical": {
751
- "Average Score": 0.574105508,
752
- "Standard Deviation": 0.03441737941,
753
- "Rank": 19
754
- },
755
- "Social": {
756
- "Average Score": 0.516980832,
757
- "Standard Deviation": 0.03369347985,
758
- "Rank": 19
759
- }
760
- }
761
- },
762
- {
763
- "config": {
764
- "model_name": "meta-llama-3.1-8b-instruct",
765
- "organization": "Meta",
766
- "license": "Llama 3.1 Community",
767
- "knowledge_cutoff": "2023/12"
768
- },
769
- "results": {
770
- "OVERALL": {
771
- "Average Score": 0.505936324,
772
- "Standard Deviation": 0.05286756493,
773
- "Rank": 19
774
- },
775
- "Geometry": {
776
- "Average Score": 0.522442162,
777
- "Standard Deviation": 0.03908236317,
778
- "Rank": 25
779
- },
780
- "Algebra": {
781
- "Average Score": 0.582702645,
782
- "Standard Deviation": 0.05002277711,
783
- "Rank": 20
784
- },
785
- "Probability": {
786
- "Average Score": 0.495001149,
787
- "Standard Deviation": 0.05244587037,
788
- "Rank": 22
789
- },
790
- "Logical": {
791
- "Average Score": 0.443030561,
792
- "Standard Deviation": 0.01343820628,
793
- "Rank": 25
794
- },
795
- "Social": {
796
- "Average Score": 0.329195941,
797
- "Standard Deviation": 0.03925019528,
798
- "Rank": 30
799
- }
800
- }
801
- },
802
- {
803
- "config": {
804
- "model_name": "gpt3.5-turbo-0125",
805
- "organization": "OpenAI",
806
- "license": "Proprietary",
807
- "knowledge_cutoff": "2021/09"
808
- },
809
- "results": {
810
- "OVERALL": {
811
- "Average Score": 0.313398088,
812
- "Standard Deviation": 0.09322528606,
813
- "Rank": 40
814
- },
815
- "Geometry": {
816
- "Average Score": 0.678714519,
817
- "Standard Deviation": 0.05926546762,
818
- "Rank": 14
819
- },
820
- "Algebra": {
821
- "Average Score": 0.569296173,
822
- "Standard Deviation": 0.05277281097,
823
- "Rank": 21
824
- },
825
- "Probability": {
826
- "Average Score": 0.448460767,
827
- "Standard Deviation": 0.05768095196,
828
- "Rank": 26
829
- },
830
- "Logical": {
831
- "Average Score": 0.148521348,
832
- "Standard Deviation": 0.04033712907,
833
- "Rank": 45
834
- },
835
- "Social": {
836
- "Average Score": 0.235071541,
837
- "Standard Deviation": 0.02632892457,
838
- "Rank": 39
839
- }
840
- }
841
- },
842
- {
843
- "config": {
844
- "model_name": "llama-3-70b-instruct",
845
- "organization": "Meta",
846
- "license": "Llama 3 Community",
847
- "knowledge_cutoff": "2023/12"
848
- },
849
- "results": {
850
- "OVERALL": {
851
- "Average Score": 0.456689885,
852
- "Standard Deviation": 0.01385989995,
853
- "Rank": 23
854
- },
855
- "Geometry": {
856
- "Average Score": 0.516865529,
857
- "Standard Deviation": 0.03858112564,
858
- "Rank": 26
859
- },
860
- "Algebra": {
861
- "Average Score": 0.566756531,
862
- "Standard Deviation": 0.03369826926,
863
- "Rank": 22
864
- },
865
- "Probability": {
866
- "Average Score": 0.513857306,
867
- "Standard Deviation": 0.05453699062,
868
- "Rank": 21
869
- },
870
- "Logical": {
871
- "Average Score": 0.713796415,
872
- "Standard Deviation": 0.02031215107,
873
- "Rank": 9
874
- },
875
- "Social": {
876
- "Average Score": 0.45872939,
877
- "Standard Deviation": 0.05347039576,
878
- "Rank": 22
879
- }
880
- }
881
- },
882
- {
883
- "config": {
884
- "model_name": "claude-3-sonnet",
885
- "organization": "Anthropic",
886
- "license": "Proprietary",
887
- "knowledge_cutoff": "2023/08"
888
- },
889
- "results": {
890
- "OVERALL": {
891
- "Average Score": 0.520010833,
892
- "Standard Deviation": 0.005030563799,
893
- "Rank": 17
894
- },
895
- "Geometry": {
896
- "Average Score": 0.675613638,
897
- "Standard Deviation": 0.05275594408,
898
- "Rank": 15
899
- },
900
- "Algebra": {
901
- "Average Score": 0.552025728,
902
- "Standard Deviation": 0.04122192409,
903
- "Rank": 23
904
- },
905
- "Probability": {
906
- "Average Score": 0.516192848,
907
- "Standard Deviation": 0.04152293217,
908
- "Rank": 20
909
- },
910
- "Logical": {
911
- "Average Score": 0.588545747,
912
- "Standard Deviation": 0.06068211943,
913
- "Rank": 17
914
- },
915
- "Social": {
916
- "Average Score": 0.570437582,
917
- "Standard Deviation": 0.08607040862,
918
- "Rank": 15
919
- }
920
- }
921
- },
922
- {
923
- "config": {
924
- "model_name": "qwen1.5-14b-chat",
925
- "organization": "Alibaba",
926
- "license": "Qianwen LICENSE",
927
- "knowledge_cutoff": "2024/02"
928
- },
929
- "results": {
930
- "OVERALL": {
931
- "Average Score": 0.415328996,
932
- "Standard Deviation": 0.0743938717,
933
- "Rank": 28
934
- },
935
- "Geometry": {
936
- "Average Score": 0.452504016,
937
- "Standard Deviation": 0.04225594393,
938
- "Rank": 28
939
- },
940
- "Algebra": {
941
- "Average Score": 0.538655725,
942
- "Standard Deviation": 0.03721542594,
943
- "Rank": 24
944
- },
945
- "Probability": {
946
- "Average Score": 0.397185975,
947
- "Standard Deviation": 0.05607695946,
948
- "Rank": 30
949
- },
950
- "Logical": {
951
- "Average Score": 0.264573129,
952
- "Standard Deviation": 0.03936133174,
953
- "Rank": 35
954
- },
955
- "Social": {
956
- "Average Score": 0.287370142,
957
- "Standard Deviation": 0.04264085315,
958
- "Rank": 32
959
- }
960
- }
961
- },
962
- {
963
- "config": {
964
- "model_name": "claude-3-haiku",
965
- "organization": "Anthropic",
966
- "license": "Proprietary",
967
- "knowledge_cutoff": "2023/08"
968
- },
969
- "results": {
970
- "OVERALL": {
971
- "Average Score": 0.453901163,
972
- "Standard Deviation": 0.003604084261,
973
- "Rank": 24
974
- },
975
- "Geometry": {
976
- "Average Score": 0.607993912,
977
- "Standard Deviation": 0.05793460748,
978
- "Rank": 17
979
- },
980
- "Algebra": {
981
- "Average Score": 0.520054055,
982
- "Standard Deviation": 0.03333544511,
983
- "Rank": 25
984
- },
985
- "Probability": {
986
- "Average Score": 0.474460688,
987
- "Standard Deviation": 0.0446501933,
988
- "Rank": 25
989
- },
990
- "Logical": {
991
- "Average Score": 0.512815976,
992
- "Standard Deviation": 0.0163264281,
993
- "Rank": 21
994
- },
995
- "Social": {
996
- "Average Score": 0.551083976,
997
- "Standard Deviation": 0.05374722539,
998
- "Rank": 17
999
- }
1000
- }
1001
- },
1002
- {
1003
- "config": {
1004
- "model_name": "claude-2.1",
1005
- "organization": "Anthropic",
1006
- "license": "Proprietary",
1007
- "knowledge_cutoff": "Unknown"
1008
- },
1009
- "results": {
1010
- "OVERALL": {
1011
- "Average Score": 0.35814708,
1012
- "Standard Deviation": 0.09168134168,
1013
- "Rank": 36
1014
- },
1015
- "Geometry": {
1016
- "Average Score": 0.62752395,
1017
- "Standard Deviation": 0.07232659398,
1018
- "Rank": 16
1019
- },
1020
- "Algebra": {
1021
- "Average Score": 0.508849609,
1022
- "Standard Deviation": 0.0346897465,
1023
- "Rank": 26
1024
- },
1025
- "Probability": {
1026
- "Average Score": 0.41477086,
1027
- "Standard Deviation": 0.05964060239,
1028
- "Rank": 29
1029
- },
1030
- "Logical": {
1031
- "Average Score": 0.482923674,
1032
- "Standard Deviation": 0.01989147048,
1033
- "Rank": 22
1034
- },
1035
- "Social": {
1036
- "Average Score": 0.333804568,
1037
- "Standard Deviation": 0.03775548253,
1038
- "Rank": 29
1039
- }
1040
- }
1041
- },
1042
- {
1043
- "config": {
1044
- "model_name": "mistral-8x7b-instruct-v0.1",
1045
- "organization": "Mistral",
1046
- "license": "Apache 2.0",
1047
- "knowledge_cutoff": "2023/12"
1048
- },
1049
- "results": {
1050
- "OVERALL": {
1051
- "Average Score": 0.382659161,
1052
- "Standard Deviation": 0.07594496929,
1053
- "Rank": 31
1054
- },
1055
- "Geometry": {
1056
- "Average Score": 0.432216097,
1057
- "Standard Deviation": 0.04747949254,
1058
- "Rank": 31
1059
- },
1060
- "Algebra": {
1061
- "Average Score": 0.478314888,
1062
- "Standard Deviation": 0.01998797419,
1063
- "Rank": 27
1064
- },
1065
- "Probability": {
1066
- "Average Score": 0.427144725,
1067
- "Standard Deviation": 0.0590923329,
1068
- "Rank": 28
1069
- },
1070
- "Logical": {
1071
- "Average Score": 0.340041983,
1072
- "Standard Deviation": 0.008397574592,
1073
- "Rank": 28
1074
- },
1075
- "Social": {
1076
- "Average Score": 0.251949622,
1077
- "Standard Deviation": 0.03346674405,
1078
- "Rank": 37
1079
- }
1080
- }
1081
- },
1082
- {
1083
- "config": {
1084
- "model_name": "claude-2.0",
1085
- "organization": "Anthropic",
1086
- "license": "Proprietary",
1087
- "knowledge_cutoff": "Unknown"
1088
- },
1089
- "results": {
1090
- "OVERALL": {
1091
- "Average Score": 0.322718057,
1092
- "Standard Deviation": 0.08369883584,
1093
- "Rank": 38
1094
- },
1095
- "Geometry": {
1096
- "Average Score": 0.604141967,
1097
- "Standard Deviation": 0.05116441826,
1098
- "Rank": 18
1099
- },
1100
- "Algebra": {
1101
- "Average Score": 0.474350734,
1102
- "Standard Deviation": 0.01510393066,
1103
- "Rank": 28
1104
- },
1105
- "Probability": {
1106
- "Average Score": 0.437950412,
1107
- "Standard Deviation": 0.05985594317,
1108
- "Rank": 27
1109
- },
1110
- "Logical": {
1111
- "Average Score": 0.445620646,
1112
- "Standard Deviation": 0.01812614805,
1113
- "Rank": 24
1114
- },
1115
- "Social": {
1116
- "Average Score": 0.469422836,
1117
- "Standard Deviation": 0.05999901796,
1118
- "Rank": 21
1119
- }
1120
- }
1121
- },
1122
- {
1123
- "config": {
1124
- "model_name": "starling-lm-7b-beta",
1125
- "organization": "Nexusflow",
1126
- "license": "Apache-2.0",
1127
- "knowledge_cutoff": "2024/03"
1128
- },
1129
- "results": {
1130
- "OVERALL": {
1131
- "Average Score": 0.479391856,
1132
- "Standard Deviation": 0.04199990887,
1133
- "Rank": 22
1134
- },
1135
- "Geometry": {
1136
- "Average Score": 0.446654388,
1137
- "Standard Deviation": 0.05637864999,
1138
- "Rank": 30
1139
- },
1140
- "Algebra": {
1141
- "Average Score": 0.473952749,
1142
- "Standard Deviation": 0.01584301288,
1143
- "Rank": 29
1144
- },
1145
- "Probability": {
1146
- "Average Score": 0.395197837,
1147
- "Standard Deviation": 0.05814798892,
1148
- "Rank": 31
1149
- },
1150
- "Logical": {
1151
- "Average Score": 0.39927199,
1152
- "Standard Deviation": 0.02125277518,
1153
- "Rank": 26
1154
- },
1155
- "Social": {
1156
- "Average Score": 0.380021662,
1157
- "Standard Deviation": 0.04622452748,
1158
- "Rank": 27
1159
- }
1160
- }
1161
- },
1162
- {
1163
- "config": {
1164
- "model_name": "gemini-1.0-pro-001",
1165
- "organization": "Google",
1166
- "license": "Proprietary",
1167
- "knowledge_cutoff": "2023/04"
1168
- },
1169
- "results": {
1170
- "OVERALL": {
1171
- "Average Score": 0.449040654,
1172
- "Standard Deviation": 0.0450610177,
1173
- "Rank": 25
1174
- },
1175
- "Geometry": {
1176
- "Average Score": 0.578347959,
1177
- "Standard Deviation": 0.04242873607,
1178
- "Rank": 21
1179
- },
1180
- "Algebra": {
1181
- "Average Score": 0.462417786,
1182
- "Standard Deviation": 0.01668313635,
1183
- "Rank": 30
1184
- },
1185
- "Probability": {
1186
- "Average Score": 0.289836324,
1187
- "Standard Deviation": 0.05739831115,
1188
- "Rank": 39
1189
- },
1190
- "Logical": {
1191
- "Average Score": 0.191140355,
1192
- "Standard Deviation": 0.03394652499,
1193
- "Rank": 41
1194
- },
1195
- "Social": {
1196
- "Average Score": 0.130790863,
1197
- "Standard Deviation": 0.02800188173,
1198
- "Rank": 47
1199
- }
1200
- }
1201
- },
1202
- {
1203
- "config": {
1204
- "model_name": "openchat-3.5-0106",
1205
- "organization": "OpenChat",
1206
- "license": "Apache-2.0",
1207
- "knowledge_cutoff": "2024/01"
1208
- },
1209
- "results": {
1210
- "OVERALL": {
1211
- "Average Score": 0.363929888,
1212
- "Standard Deviation": 0.08602347145,
1213
- "Rank": 34
1214
- },
1215
- "Geometry": {
1216
- "Average Score": 0.38715246,
1217
- "Standard Deviation": 0.03701851946,
1218
- "Rank": 34
1219
- },
1220
- "Algebra": {
1221
- "Average Score": 0.441233712,
1222
- "Standard Deviation": 0.01135753754,
1223
- "Rank": 31
1224
- },
1225
- "Probability": {
1226
- "Average Score": 0.38802618,
1227
- "Standard Deviation": 0.05663879714,
1228
- "Rank": 32
1229
- },
1230
- "Logical": {
1231
- "Average Score": 0.336754383,
1232
- "Standard Deviation": 0.01608478079,
1233
- "Rank": 29
1234
- },
1235
- "Social": {
1236
- "Average Score": 0.250891608,
1237
- "Standard Deviation": 0.03253769914,
1238
- "Rank": 38
1239
- }
1240
- }
1241
- },
1242
- {
1243
- "config": {
1244
- "model_name": "openchat-3.5",
1245
- "organization": "OpenChat",
1246
- "license": "Apache-2.0",
1247
- "knowledge_cutoff": "2023/11"
1248
- },
1249
- "results": {
1250
- "OVERALL": {
1251
- "Average Score": 0.361341296,
1252
- "Standard Deviation": 0.09034869493,
1253
- "Rank": 35
1254
- },
1255
- "Geometry": {
1256
- "Average Score": 0.401699069,
1257
- "Standard Deviation": 0.03410726557,
1258
- "Rank": 32
1259
- },
1260
- "Algebra": {
1261
- "Average Score": 0.414095336,
1262
- "Standard Deviation": 0.01881964261,
1263
- "Rank": 33
1264
- },
1265
- "Probability": {
1266
- "Average Score": 0.349601002,
1267
- "Standard Deviation": 0.05077455539,
1268
- "Rank": 34
1269
- },
1270
- "Logical": {
1271
- "Average Score": 0.331069242,
1272
- "Standard Deviation": 0.02180827173,
1273
- "Rank": 31
1274
- },
1275
- "Social": {
1276
- "Average Score": 0.319991655,
1277
- "Standard Deviation": 0.04502478724,
1278
- "Rank": 31
1279
- }
1280
- }
1281
- },
1282
- {
1283
- "config": {
1284
- "model_name": "command-r-(08-2024)",
1285
- "organization": "Cohere",
1286
- "license": "CC-BY-NC-4.0",
1287
- "knowledge_cutoff": "2024/08"
1288
- },
1289
- "results": {
1290
- "OVERALL": {
1291
- "Average Score": 0.427605298,
1292
- "Standard Deviation": 0.01747449163,
1293
- "Rank": 26
1294
- },
1295
- "Geometry": {
1296
- "Average Score": 0.448300727,
1297
- "Standard Deviation": 0.04996362328,
1298
- "Rank": 29
1299
- },
1300
- "Algebra": {
1301
- "Average Score": 0.417519167,
1302
- "Standard Deviation": 0.01822196902,
1303
- "Rank": 32
1304
- },
1305
- "Probability": {
1306
- "Average Score": 0.366336281,
1307
- "Standard Deviation": 0.04716826942,
1308
- "Rank": 33
1309
- },
1310
- "Logical": {
1311
- "Average Score": 0.214657906,
1312
- "Standard Deviation": 0.03003579835,
1313
- "Rank": 38
1314
- },
1315
- "Social": {
1316
- "Average Score": 0.276088379,
1317
- "Standard Deviation": 0.03295234688,
1318
- "Rank": 34
1319
- }
1320
- }
1321
- },
1322
- {
1323
- "config": {
1324
- "model_name": "gemma-1.1-7b-it",
1325
- "organization": "Google",
1326
- "license": "Gemma License",
1327
- "knowledge_cutoff": "2024/02"
1328
- },
1329
- "results": {
1330
- "OVERALL": {
1331
- "Average Score": 0.339506922,
1332
- "Standard Deviation": 0.1066279108,
1333
- "Rank": 37
1334
- },
1335
- "Geometry": {
1336
- "Average Score": 0.324170977,
1337
- "Standard Deviation": 0.04668553765,
1338
- "Rank": 37
1339
- },
1340
- "Algebra": {
1341
- "Average Score": 0.398684697,
1342
- "Standard Deviation": 0.01982398259,
1343
- "Rank": 34
1344
- },
1345
- "Probability": {
1346
- "Average Score": 0.293253175,
1347
- "Standard Deviation": 0.05126192191,
1348
- "Rank": 38
1349
- },
1350
- "Logical": {
1351
- "Average Score": 0.317750796,
1352
- "Standard Deviation": 0.01101933543,
1353
- "Rank": 32
1354
- },
1355
- "Social": {
1356
- "Average Score": 0.179073276,
1357
- "Standard Deviation": 0.02009658805,
1358
- "Rank": 43
1359
- }
1360
- }
1361
- },
1362
- {
1363
- "config": {
1364
- "model_name": "llama3-8b-instruct",
1365
- "organization": "Meta",
1366
- "license": "Llama 3 Community",
1367
- "knowledge_cutoff": "2023/03"
1368
- },
1369
- "results": {
1370
- "OVERALL": {
1371
- "Average Score": 0.367722676,
1372
- "Standard Deviation": 0.1071368221,
1373
- "Rank": 32
1374
- },
1375
- "Geometry": {
1376
- "Average Score": 0.367143758,
1377
- "Standard Deviation": 0.04363680358,
1378
- "Rank": 35
1379
- },
1380
- "Algebra": {
1381
- "Average Score": 0.391480973,
1382
- "Standard Deviation": 0.02757445266,
1383
- "Rank": 35
1384
- },
1385
- "Probability": {
1386
- "Average Score": 0.317616445,
1387
- "Standard Deviation": 0.04300430361,
1388
- "Rank": 37
1389
- },
1390
- "Logical": {
1391
- "Average Score": 0.461607495,
1392
- "Standard Deviation": 0.02185028842,
1393
- "Rank": 23
1394
- },
1395
- "Social": {
1396
- "Average Score": 0.336373622,
1397
- "Standard Deviation": 0.05762408512,
1398
- "Rank": 28
1399
- }
1400
- }
1401
- },
1402
- {
1403
- "config": {
1404
- "model_name": "gemma-2-2b-it",
1405
- "organization": "Google",
1406
- "license": "Gemma License",
1407
- "knowledge_cutoff": "2024/07"
1408
- },
1409
- "results": {
1410
- "OVERALL": {
1411
- "Average Score": 0.502167612,
1412
- "Standard Deviation": 0.04389786763,
1413
- "Rank": 20
1414
- },
1415
- "Geometry": {
1416
- "Average Score": 0.395006676,
1417
- "Standard Deviation": 0.05882607713,
1418
- "Rank": 33
1419
- },
1420
- "Algebra": {
1421
- "Average Score": 0.379391887,
1422
- "Standard Deviation": 0.01722410785,
1423
- "Rank": 36
1424
- },
1425
- "Probability": {
1426
- "Average Score": 0.331231097,
1427
- "Standard Deviation": 0.05392499987,
1428
- "Rank": 36
1429
- },
1430
- "Logical": {
1431
- "Average Score": 0.367687789,
1432
- "Standard Deviation": 0.02547968808,
1433
- "Rank": 27
1434
- },
1435
- "Social": {
1436
- "Average Score": 0.393482094,
1437
- "Standard Deviation": 0.06450214024,
1438
- "Rank": 25
1439
- }
1440
- }
1441
- },
1442
- {
1443
- "config": {
1444
- "model_name": "starling-lm-7b-alpha",
1445
- "organization": "Nexusflow",
1446
- "license": "Apache-2.0",
1447
- "knowledge_cutoff": "2023/11"
1448
- },
1449
- "results": {
1450
- "OVERALL": {
1451
- "Average Score": 0.366628765,
1452
- "Standard Deviation": 0.08405492929,
1453
- "Rank": 33
1454
- },
1455
- "Geometry": {
1456
- "Average Score": 0.336782578,
1457
- "Standard Deviation": 0.04069449132,
1458
- "Rank": 36
1459
- },
1460
- "Algebra": {
1461
- "Average Score": 0.371551932,
1462
- "Standard Deviation": 0.03367241745,
1463
- "Rank": 37
1464
- },
1465
- "Probability": {
1466
- "Average Score": 0.331472505,
1467
- "Standard Deviation": 0.04833324282,
1468
- "Rank": 35
1469
- },
1470
- "Logical": {
1471
- "Average Score": 0.260869624,
1472
- "Standard Deviation": 0.03562735237,
1473
- "Rank": 36
1474
- },
1475
- "Social": {
1476
- "Average Score": 0.271975534,
1477
- "Standard Deviation": 0.04266753408,
1478
- "Rank": 35
1479
- }
1480
- }
1481
- },
1482
- {
1483
- "config": {
1484
- "model_name": "qwen1.5-4b-chat",
1485
- "organization": "Alibaba",
1486
- "license": "Qianwen LICENSE",
1487
- "knowledge_cutoff": "2024/02"
1488
- },
1489
- "results": {
1490
- "OVERALL": {
1491
- "Average Score": 0.111876411,
1492
- "Standard Deviation": 0.04241022785,
1493
- "Rank": 49
1494
- },
1495
- "Geometry": {
1496
- "Average Score": 0.215834522,
1497
- "Standard Deviation": 0.0363766363,
1498
- "Rank": 41
1499
- },
1500
- "Algebra": {
1501
- "Average Score": 0.305589811,
1502
- "Standard Deviation": 0.02354198912,
1503
- "Rank": 38
1504
- },
1505
- "Probability": {
1506
- "Average Score": 0.149365327,
1507
- "Standard Deviation": 0.03489672675,
1508
- "Rank": 45
1509
- },
1510
- "Logical": {
1511
- "Average Score": 0.116210168,
1512
- "Standard Deviation": 0.005927966496,
1513
- "Rank": 47
1514
- },
1515
- "Social": {
1516
- "Average Score": 0.18195615,
1517
- "Standard Deviation": 0.02269805277,
1518
- "Rank": 42
1519
- }
1520
- }
1521
- },
1522
- {
1523
- "config": {
1524
- "model_name": "command-r-(04-2024)",
1525
- "organization": "Cohere",
1526
- "license": "CC-BY-NC-4.0",
1527
- "knowledge_cutoff": "2024/04"
1528
- },
1529
- "results": {
1530
- "OVERALL": {
1531
- "Average Score": 0.388783887,
1532
- "Standard Deviation": 0.07417186783,
1533
- "Rank": 30
1534
- },
1535
- "Geometry": {
1536
- "Average Score": 0.300416698,
1537
- "Standard Deviation": 0.03485612736,
1538
- "Rank": 38
1539
- },
1540
- "Algebra": {
1541
- "Average Score": 0.293120231,
1542
- "Standard Deviation": 0.032926484,
1543
- "Rank": 39
1544
- },
1545
- "Probability": {
1546
- "Average Score": 0.281271304,
1547
- "Standard Deviation": 0.05697149867,
1548
- "Rank": 40
1549
- },
1550
- "Logical": {
1551
- "Average Score": 0.276189906,
1552
- "Standard Deviation": 0.03562914754,
1553
- "Rank": 34
1554
- },
1555
- "Social": {
1556
- "Average Score": 0.283882949,
1557
- "Standard Deviation": 0.03336901148,
1558
- "Rank": 33
1559
- }
1560
- }
1561
- },
1562
- {
1563
- "config": {
1564
- "model_name": "vicuna-33b",
1565
- "organization": "LMSYS",
1566
- "license": "Non-commercial",
1567
- "knowledge_cutoff": "2023/08"
1568
- },
1569
- "results": {
1570
- "OVERALL": {
1571
- "Average Score": 0.316543555,
1572
- "Standard Deviation": 0.08922095647,
1573
- "Rank": 39
1574
- },
1575
- "Geometry": {
1576
- "Average Score": 0.208284679,
1577
- "Standard Deviation": 0.03937771461,
1578
- "Rank": 42
1579
- },
1580
- "Algebra": {
1581
- "Average Score": 0.248994048,
1582
- "Standard Deviation": 0.02668175054,
1583
- "Rank": 41
1584
- },
1585
- "Probability": {
1586
- "Average Score": 0.222313995,
1587
- "Standard Deviation": 0.03978859759,
1588
- "Rank": 43
1589
- },
1590
- "Logical": {
1591
- "Average Score": 0.180291222,
1592
- "Standard Deviation": 0.021886267,
1593
- "Rank": 42
1594
- },
1595
- "Social": {
1596
- "Average Score": 0.257623798,
1597
- "Standard Deviation": 0.02653724437,
1598
- "Rank": 36
1599
- }
1600
- }
1601
- },
1602
- {
1603
- "config": {
1604
- "model_name": "gemma-7b-it",
1605
- "organization": "Google",
1606
- "license": "Gemma License",
1607
- "knowledge_cutoff": "2024/02"
1608
- },
1609
- "results": {
1610
- "OVERALL": {
1611
- "Average Score": 0.285077558,
1612
- "Standard Deviation": 0.08871758453,
1613
- "Rank": 41
1614
- },
1615
- "Geometry": {
1616
- "Average Score": 0.244791417,
1617
- "Standard Deviation": 0.0289612078,
1618
- "Rank": 39
1619
- },
1620
- "Algebra": {
1621
- "Average Score": 0.250614794,
1622
- "Standard Deviation": 0.01991678295,
1623
- "Rank": 40
1624
- },
1625
- "Probability": {
1626
- "Average Score": 0.174313053,
1627
- "Standard Deviation": 0.03765424728,
1628
- "Rank": 44
1629
- },
1630
- "Logical": {
1631
- "Average Score": 0.197505536,
1632
- "Standard Deviation": 0.02050298885,
1633
- "Rank": 39
1634
- },
1635
- "Social": {
1636
- "Average Score": 0.202138025,
1637
- "Standard Deviation": 0.02098346639,
1638
- "Rank": 41
1639
- }
1640
- }
1641
- },
1642
- {
1643
- "config": {
1644
- "model_name": "mistral-7b-instruct-2",
1645
- "organization": "Mistral",
1646
- "license": "Apache 2.0",
1647
- "knowledge_cutoff": "2023/12"
1648
- },
1649
- "results": {
1650
- "OVERALL": {
1651
- "Average Score": 0.427513868,
1652
- "Standard Deviation": 0.05553921135,
1653
- "Rank": 27
1654
- },
1655
- "Geometry": {
1656
- "Average Score": 0.216402626,
1657
- "Standard Deviation": 0.03338414918,
1658
- "Rank": 40
1659
- },
1660
- "Algebra": {
1661
- "Average Score": 0.233777838,
1662
- "Standard Deviation": 0.0155226054,
1663
- "Rank": 42
1664
- },
1665
- "Probability": {
1666
- "Average Score": 0.25118175,
1667
- "Standard Deviation": 0.04065514593,
1668
- "Rank": 41
1669
- },
1670
- "Logical": {
1671
- "Average Score": 0.224469136,
1672
- "Standard Deviation": 0.03404706752,
1673
- "Rank": 37
1674
- },
1675
- "Social": {
1676
- "Average Score": 0.209386782,
1677
- "Standard Deviation": 0.02738569921,
1678
- "Rank": 40
1679
- }
1680
- }
1681
- },
1682
- {
1683
- "config": {
1684
- "model_name": "mistral-7b-instruct-1",
1685
- "organization": "Mistral",
1686
- "license": "Apache 2.0",
1687
- "knowledge_cutoff": "2023/12"
1688
- },
1689
- "results": {
1690
- "OVERALL": {
1691
- "Average Score": 0.23016314,
1692
- "Standard Deviation": 0.07137625271,
1693
- "Rank": 46
1694
- },
1695
- "Geometry": {
1696
- "Average Score": 0.161799938,
1697
- "Standard Deviation": 0.03595278559,
1698
- "Rank": 46
1699
- },
1700
- "Algebra": {
1701
- "Average Score": 0.210341624,
1702
- "Standard Deviation": 0.01736539119,
1703
- "Rank": 43
1704
- },
1705
- "Probability": {
1706
- "Average Score": 0.238417922,
1707
- "Standard Deviation": 0.03744211933,
1708
- "Rank": 42
1709
- },
1710
- "Logical": {
1711
- "Average Score": 0.142636601,
1712
- "Standard Deviation": 0.02080406365,
1713
- "Rank": 46
1714
- },
1715
- "Social": {
1716
- "Average Score": 0.117646827,
1717
- "Standard Deviation": 0.009321202779,
1718
- "Rank": 49
1719
- }
1720
- }
1721
- },
1722
- {
1723
- "config": {
1724
- "model_name": "vicuna-13b",
1725
- "organization": "LMSYS",
1726
- "license": "Non-commercial",
1727
- "knowledge_cutoff": "2023/07"
1728
- },
1729
- "results": {
1730
- "OVERALL": {
1731
- "Average Score": 0.201892849,
1732
- "Standard Deviation": 0.06021749802,
1733
- "Rank": 47
1734
- },
1735
- "Geometry": {
1736
- "Average Score": 0.200941928,
1737
- "Standard Deviation": 0.03366817781,
1738
- "Rank": 43
1739
- },
1740
- "Algebra": {
1741
- "Average Score": 0.196123323,
1742
- "Standard Deviation": 0.0135715643,
1743
- "Rank": 44
1744
- },
1745
- "Probability": {
1746
- "Average Score": 0.141214079,
1747
- "Standard Deviation": 0.02721328211,
1748
- "Rank": 46
1749
- },
1750
- "Logical": {
1751
- "Average Score": 0.148598631,
1752
- "Standard Deviation": 0.02241523892,
1753
- "Rank": 44
1754
- },
1755
- "Social": {
1756
- "Average Score": 0.124655135,
1757
- "Standard Deviation": 0.01122382671,
1758
- "Rank": 48
1759
- }
1760
- }
1761
- },
1762
- {
1763
- "config": {
1764
- "model_name": "zephyr-7b-beta",
1765
- "organization": "HuggingFace",
1766
- "license": "MIT",
1767
- "knowledge_cutoff": "2023/10"
1768
- },
1769
- "results": {
1770
- "OVERALL": {
1771
- "Average Score": 0.102705119,
1772
- "Standard Deviation": 0.03683757312,
1773
- "Rank": 50
1774
- },
1775
- "Geometry": {
1776
- "Average Score": 0.114005544,
1777
- "Standard Deviation": 0.03144354365,
1778
- "Rank": 47
1779
- },
1780
- "Algebra": {
1781
- "Average Score": 0.141766633,
1782
- "Standard Deviation": 0.03179520129,
1783
- "Rank": 45
1784
- },
1785
- "Probability": {
1786
- "Average Score": 0.089050714,
1787
- "Standard Deviation": 0.002136754266,
1788
- "Rank": 49
1789
- },
1790
- "Logical": {
1791
- "Average Score": 0.069520789,
1792
- "Standard Deviation": 0.004477840857,
1793
- "Rank": 51
1794
- },
1795
- "Social": {
1796
- "Average Score": 0.0,
1797
- "Standard Deviation": 0.0,
1798
- "Rank": 54
1799
- }
1800
- }
1801
- },
1802
- {
1803
- "config": {
1804
- "model_name": "gemma-1.1-2b-it",
1805
- "organization": "Google",
1806
- "license": "Gemma License",
1807
- "knowledge_cutoff": "2024/02"
1808
- },
1809
- "results": {
1810
- "OVERALL": {
1811
- "Average Score": 0.257700845,
1812
- "Standard Deviation": 0.07369021445,
1813
- "Rank": 44
1814
- },
1815
- "Geometry": {
1816
- "Average Score": 0.183974034,
1817
- "Standard Deviation": 0.0215548886,
1818
- "Rank": 45
1819
- },
1820
- "Algebra": {
1821
- "Average Score": 0.13422252,
1822
- "Standard Deviation": 0.01922819511,
1823
- "Rank": 46
1824
- },
1825
- "Probability": {
1826
- "Average Score": 0.095628657,
1827
- "Standard Deviation": 0.007536076456,
1828
- "Rank": 48
1829
- },
1830
- "Logical": {
1831
- "Average Score": 0.094965074,
1832
- "Standard Deviation": 0.005019175487,
1833
- "Rank": 49
1834
- },
1835
- "Social": {
1836
- "Average Score": 0.167796727,
1837
- "Standard Deviation": 0.01666541942,
1838
- "Rank": 44
1839
- }
1840
- }
1841
- },
1842
- {
1843
- "config": {
1844
- "model_name": "llama2-7b-chat",
1845
- "organization": "Meta",
1846
- "license": "Llama 2 Community",
1847
- "knowledge_cutoff": "2023/07"
1848
- },
1849
- "results": {
1850
- "OVERALL": {
1851
- "Average Score": 0.260189428,
1852
- "Standard Deviation": 0.08019299364,
1853
- "Rank": 43
1854
- },
1855
- "Geometry": {
1856
- "Average Score": 0.087067276,
1857
- "Standard Deviation": 0.04274343402,
1858
- "Rank": 48
1859
- },
1860
- "Algebra": {
1861
- "Average Score": 0.12308805,
1862
- "Standard Deviation": 0.01856053622,
1863
- "Rank": 47
1864
- },
1865
- "Probability": {
1866
- "Average Score": 0.087515438,
1867
- "Standard Deviation": 0.006315053573,
1868
- "Rank": 50
1869
- },
1870
- "Logical": {
1871
- "Average Score": 0.17312827,
1872
- "Standard Deviation": 0.01867044092,
1873
- "Rank": 43
1874
- },
1875
- "Social": {
1876
- "Average Score": 0.152905272,
1877
- "Standard Deviation": 0.007166957097,
1878
- "Rank": 45
1879
- }
1880
- }
1881
- },
1882
- {
1883
- "config": {
1884
- "model_name": "gemma-2b-it",
1885
- "organization": "Google",
1886
- "license": "Gemma License",
1887
- "knowledge_cutoff": "2024/02"
1888
- },
1889
- "results": {
1890
- "OVERALL": {
1891
- "Average Score": 0.234172069,
1892
- "Standard Deviation": 0.06522685718,
1893
- "Rank": 45
1894
- },
1895
- "Geometry": {
1896
- "Average Score": 0.198571153,
1897
- "Standard Deviation": 0.01699161031,
1898
- "Rank": 44
1899
- },
1900
- "Algebra": {
1901
- "Average Score": 0.109883009,
1902
- "Standard Deviation": 0.01520005833,
1903
- "Rank": 48
1904
- },
1905
- "Probability": {
1906
- "Average Score": 0.06467432,
1907
- "Standard Deviation": 0.002117497231,
1908
- "Rank": 52
1909
- },
1910
- "Logical": {
1911
- "Average Score": 0.039624492,
1912
- "Standard Deviation": 0.007606972686,
1913
- "Rank": 52
1914
- },
1915
- "Social": {
1916
- "Average Score": 0.087452913,
1917
- "Standard Deviation": 0.008170146562,
1918
- "Rank": 52
1919
- }
1920
- }
1921
- },
1922
- {
1923
- "config": {
1924
- "model_name": "llama2-13b-chat",
1925
- "organization": "Meta",
1926
- "license": "Llama 2 Community",
1927
- "knowledge_cutoff": "2023/07"
1928
- },
1929
- "results": {
1930
- "OVERALL": {
1931
- "Average Score": 0.263305684,
1932
- "Standard Deviation": 0.07283640689,
1933
- "Rank": 42
1934
- },
1935
- "Geometry": {
1936
- "Average Score": 0.072729954,
1937
- "Standard Deviation": 0.02315988261,
1938
- "Rank": 50
1939
- },
1940
- "Algebra": {
1941
- "Average Score": 0.080371692,
1942
- "Standard Deviation": 0.01277569453,
1943
- "Rank": 49
1944
- },
1945
- "Probability": {
1946
- "Average Score": 0.117757344,
1947
- "Standard Deviation": 0.02418619619,
1948
- "Rank": 47
1949
- },
1950
- "Logical": {
1951
- "Average Score": 0.193149889,
1952
- "Standard Deviation": 0.01776690764,
1953
- "Rank": 40
1954
- },
1955
- "Social": {
1956
- "Average Score": 0.149125922,
1957
- "Standard Deviation": 0.01157416827,
1958
- "Rank": 46
1959
- }
1960
- }
1961
- },
1962
- {
1963
- "config": {
1964
- "model_name": "vicuna-7b",
1965
- "organization": "LMSYS",
1966
- "license": "Non-commercial",
1967
- "knowledge_cutoff": "2023/07"
1968
- },
1969
- "results": {
1970
- "OVERALL": {
1971
- "Average Score": 0.198839786,
1972
- "Standard Deviation": 0.05725381576,
1973
- "Rank": 48
1974
- },
1975
- "Geometry": {
1976
- "Average Score": 0.083457058,
1977
- "Standard Deviation": 0.02520989111,
1978
- "Rank": 49
1979
- },
1980
- "Algebra": {
1981
- "Average Score": 0.070883882,
1982
- "Standard Deviation": 0.007315853253,
1983
- "Rank": 50
1984
- },
1985
- "Probability": {
1986
- "Average Score": 0.080987673,
1987
- "Standard Deviation": 0.005474288861,
1988
- "Rank": 51
1989
- },
1990
- "Logical": {
1991
- "Average Score": 0.100065588,
1992
- "Standard Deviation": 0.003561886452,
1993
- "Rank": 48
1994
- },
1995
- "Social": {
1996
- "Average Score": 0.111076414,
1997
- "Standard Deviation": 0.004805626512,
1998
- "Rank": 50
1999
- }
2000
- }
2001
- },
2002
- {
2003
- "config": {
2004
- "model_name": "koala-13b",
2005
- "organization": "UC Berkeley",
2006
- "license": "Non-commercial",
2007
- "knowledge_cutoff": "2023/04"
2008
- },
2009
- "results": {
2010
- "OVERALL": {
2011
- "Average Score": 0.09387188,
2012
- "Standard Deviation": 0.02642167489,
2013
- "Rank": 51
2014
- },
2015
- "Geometry": {
2016
- "Average Score": 0.017374001,
2017
- "Standard Deviation": 0.01747053557,
2018
- "Rank": 51
2019
- },
2020
- "Algebra": {
2021
- "Average Score": 0.018129197,
2022
- "Standard Deviation": 0.01054371383,
2023
- "Rank": 51
2024
- },
2025
- "Probability": {
2026
- "Average Score": 0.043654362,
2027
- "Standard Deviation": 0.004288231886,
2028
- "Rank": 53
2029
- },
2030
- "Logical": {
2031
- "Average Score": 0.074694053,
2032
- "Standard Deviation": 0.002674646998,
2033
- "Rank": 50
2034
- },
2035
- "Social": {
2036
- "Average Score": 0.096983835,
2037
- "Standard Deviation": 0.007847059783,
2038
- "Rank": 51
2039
- }
2040
- }
2041
- },
2042
- {
2043
- "config": {
2044
- "model_name": "openassistant-pythia-12b",
2045
- "organization": "OpenAssistant",
2046
- "license": "Non-commercial",
2047
- "knowledge_cutoff": "2023/04"
2048
- },
2049
- "results": {
2050
- "OVERALL": {
2051
- "Average Score": 0.0,
2052
- "Standard Deviation": 0.0,
2053
- "Rank": 52
2054
- },
2055
- "Geometry": {
2056
- "Average Score": 0.0,
2057
- "Standard Deviation": 0.0,
2058
- "Rank": 52
2059
- },
2060
- "Algebra": {
2061
- "Average Score": 0.0,
2062
- "Standard Deviation": 0.0,
2063
- "Rank": 52
2064
- },
2065
- "Probability": {
2066
- "Average Score": 0.0,
2067
- "Standard Deviation": 0.0,
2068
- "Rank": 54
2069
- },
2070
- "Logical": {
2071
- "Average Score": 0.0,
2072
- "Standard Deviation": 0.0,
2073
- "Rank": 53
2074
- },
2075
- "Social": {
2076
- "Average Score": 0.030792528,
2077
- "Standard Deviation": 0.007518796391,
2078
- "Rank": 53
2079
- }
2080
- }
2081
- }
2082
- ]
 
src/results/models_2024-10-09-05:17:38.810960.json DELETED
@@ -1,2327 +0,0 @@
1
- [
2
- {
3
- "config": {
4
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
- "organization": "OpenAI",
6
- "license": "Proprietary",
7
- "knowledge_cutoff": "2023/10"
8
- },
9
- "results": {
10
- "OVERALL": {
11
- "Average Score": 0.974329609,
12
- "Standard Deviation": 0.005024959031,
13
- "Rank": 2
14
- },
15
- "Geometry": {
16
- "Average Score": 0.976028578,
17
- "Standard Deviation": 0.01507912373,
18
- "Rank": 3
19
- },
20
- "Algebra": {
21
- "Average Score": 0.951199453,
22
- "Standard Deviation": 0.08452452108,
23
- "Rank": 3
24
- },
25
- "Probability": {
26
- "Average Score": 0.842116641,
27
- "Standard Deviation": 0.006267759054,
28
- "Rank": 3
29
- },
30
- "Logical": {
31
- "Average Score": 0.828490728,
32
- "Standard Deviation": 0.009134213144,
33
- "Rank": 3
34
- },
35
- "Social": {
36
- "Average Score": 0.815902987,
37
- "Standard Deviation": 0.0196254222,
38
- "Rank": 4
39
- },
40
- "Chemistry": {
41
- "Average Score": 100.0,
42
- "Standard Deviation": null,
43
- "Rank": 1
44
- }
45
- }
46
- },
47
- {
48
- "config": {
49
- "model_name": "gpt-4o-2024-08-06",
50
- "organization": "OpenAI",
51
- "license": "Proprietary",
52
- "knowledge_cutoff": "2023/10"
53
- },
54
- "results": {
55
- "OVERALL": {
56
- "Average Score": 0.846571548,
57
- "Standard Deviation": 0.03394056554,
58
- "Rank": 6
59
- },
60
- "Geometry": {
61
- "Average Score": 0.99773096,
62
- "Standard Deviation": 0.002835555172,
63
- "Rank": 1
64
- },
65
- "Algebra": {
66
- "Average Score": 1.0,
67
- "Standard Deviation": 0.0,
68
- "Rank": 1
69
- },
70
- "Probability": {
71
- "Average Score": 0.78855795,
72
- "Standard Deviation": 0.008188675452,
73
- "Rank": 6
74
- },
75
- "Logical": {
76
- "Average Score": 0.668635768,
77
- "Standard Deviation": 0.03466314094,
78
- "Rank": 11
79
- },
80
- "Social": {
81
- "Average Score": 0.680417314,
82
- "Standard Deviation": 0.00656867063,
83
- "Rank": 9
84
- },
85
- "Chemistry": {
86
- "Average Score": 92.43090226400756,
87
- "Standard Deviation": null,
88
- "Rank": 2
89
- }
90
- }
91
- },
92
- {
93
- "config": {
94
- "model_name": "gpt-4o-2024-05-13",
95
- "organization": "OpenAI",
96
- "license": "Proprietary",
97
- "knowledge_cutoff": "2023/10"
98
- },
99
- "results": {
100
- "OVERALL": {
101
- "Average Score": 0.846334477,
102
- "Standard Deviation": 0.09377911572,
103
- "Rank": 7
104
- },
105
- "Geometry": {
106
- "Average Score": 0.972472377,
107
- "Standard Deviation": 0.01648274205,
108
- "Rank": 4
109
- },
110
- "Algebra": {
111
- "Average Score": 0.995511298,
112
- "Standard Deviation": 0.004097802515,
113
- "Rank": 2
114
- },
115
- "Probability": {
116
- "Average Score": 0.812149974,
117
- "Standard Deviation": 0.007669585485,
118
- "Rank": 4
119
- },
120
- "Logical": {
121
- "Average Score": 0.755019692,
122
- "Standard Deviation": 0.008149588572,
123
- "Rank": 6
124
- },
125
- "Social": {
126
- "Average Score": 0.609875087,
127
- "Standard Deviation": 0.038729239,
128
- "Rank": 14
129
- },
130
- "Chemistry": {
131
- "Average Score": 79.1592634699295,
132
- "Standard Deviation": null,
133
- "Rank": 7
134
- }
135
- }
136
- },
137
- {
138
- "config": {
139
- "model_name": "gpt-4-turbo-2024-04-09",
140
- "organization": "OpenAI",
141
- "license": "Proprietary",
142
- "knowledge_cutoff": "2023/12"
143
- },
144
- "results": {
145
- "OVERALL": {
146
- "Average Score": 0.855357972,
147
- "Standard Deviation": 0.1016986368,
148
- "Rank": 4
149
- },
150
- "Geometry": {
151
- "Average Score": 0.95374588,
152
- "Standard Deviation": 0.03109307166,
153
- "Rank": 5
154
- },
155
- "Algebra": {
156
- "Average Score": 0.930945223,
157
- "Standard Deviation": 0.06705136813,
158
- "Rank": 4
159
- },
160
- "Probability": {
161
- "Average Score": 0.750705448,
162
- "Standard Deviation": 0.05944483103,
163
- "Rank": 8
164
- },
165
- "Logical": {
166
- "Average Score": 0.77906699,
167
- "Standard Deviation": 0.007406734161,
168
- "Rank": 4
169
- },
170
- "Social": {
171
- "Average Score": 0.715935163,
172
- "Standard Deviation": 0.1209141409,
173
- "Rank": 7
174
- },
175
- "Chemistry": {
176
- "Average Score": 70.73143363230263,
177
- "Standard Deviation": null,
178
- "Rank": 12
179
- }
180
- }
181
- },
182
- {
183
- "config": {
184
- "model_name": "gemini-1.5-pro-001",
185
- "organization": "Google",
186
- "license": "Proprietary",
187
- "knowledge_cutoff": "2023/11"
188
- },
189
- "results": {
190
- "OVERALL": {
191
- "Average Score": 0.797187842,
192
- "Standard Deviation": 0.0272375249,
193
- "Rank": 10
194
- },
195
- "Geometry": {
196
- "Average Score": 0.9947169,
197
- "Standard Deviation": 0.009150597621,
198
- "Rank": 2
199
- },
200
- "Algebra": {
201
- "Average Score": 0.857464301,
202
- "Standard Deviation": 0.05014285338,
203
- "Rank": 5
204
- },
205
- "Probability": {
206
- "Average Score": 0.651781767,
207
- "Standard Deviation": 0.04156998547,
208
- "Rank": 12
209
- },
210
- "Logical": {
211
- "Average Score": 0.739745471,
212
- "Standard Deviation": 0.01631532019,
213
- "Rank": 7
214
- },
215
- "Social": {
216
- "Average Score": 0.649601885,
217
- "Standard Deviation": 0.104854889,
218
- "Rank": 12
219
- }
220
- }
221
- },
222
- {
223
- "config": {
224
- "model_name": "qwen2-72b-instruct",
225
- "organization": "Alibaba",
226
- "license": "Qianwen LICENSE",
227
- "knowledge_cutoff": "2024/09"
228
- },
229
- "results": {
230
- "OVERALL": {
231
- "Average Score": 0.737918558,
232
- "Standard Deviation": 0.09069077339,
233
- "Rank": 11
234
- },
235
- "Geometry": {
236
- "Average Score": 0.796870305,
237
- "Standard Deviation": 0.0509025346,
238
- "Rank": 9
239
- },
240
- "Algebra": {
241
- "Average Score": 0.836194231,
242
- "Standard Deviation": 0.04517093028,
243
- "Rank": 6
244
- },
245
- "Probability": {
246
- "Average Score": 0.788068004,
247
- "Standard Deviation": 0.007288989044,
248
- "Rank": 7
249
- },
250
- "Logical": {
251
- "Average Score": 0.619300904,
252
- "Standard Deviation": 0.06377931612,
253
- "Rank": 15
254
- },
255
- "Social": {
256
- "Average Score": 0.652578786,
257
- "Standard Deviation": 0.04259293171,
258
- "Rank": 11
259
- },
260
- "Chemistry": {
261
- "Average Score": 73.54037778797029,
262
- "Standard Deviation": null,
263
- "Rank": 8
264
- }
265
- }
266
- },
267
- {
268
- "config": {
269
- "model_name": "gpt-4o-mini-2024-07-18",
270
- "organization": "OpenAI",
271
- "license": "Proprietary",
272
- "knowledge_cutoff": "2023/10"
273
- },
274
- "results": {
275
- "OVERALL": {
276
- "Average Score": 0.847694133,
277
- "Standard Deviation": 0.02164304402,
278
- "Rank": 5
279
- },
280
- "Geometry": {
281
- "Average Score": 0.946650435,
282
- "Standard Deviation": 0.01831236482,
283
- "Rank": 7
284
- },
285
- "Algebra": {
286
- "Average Score": 0.796243022,
287
- "Standard Deviation": 0.05537539202,
288
- "Rank": 7
289
- },
290
- "Probability": {
291
- "Average Score": 0.798402685,
292
- "Standard Deviation": 0.009404491967,
293
- "Rank": 5
294
- },
295
- "Logical": {
296
- "Average Score": 0.727009735,
297
- "Standard Deviation": 0.02628110141,
298
- "Rank": 8
299
- },
300
- "Social": {
301
- "Average Score": 0.691949855,
302
- "Standard Deviation": 0.02072934333,
303
- "Rank": 8
304
- },
305
- "Chemistry": {
306
- "Average Score": 88.3877070580296,
307
- "Standard Deviation": null,
308
- "Rank": 3
309
- }
310
- }
311
- },
312
- {
313
- "config": {
314
- "model_name": "claude-3.5-sonnet",
315
- "organization": "Anthropic",
316
- "license": "Proprietary",
317
- "knowledge_cutoff": "2024/04"
318
- },
319
- "results": {
320
- "OVERALL": {
321
- "Average Score": 0.839004422,
322
- "Standard Deviation": 0.1461079564,
323
- "Rank": 8
324
- },
325
- "Geometry": {
326
- "Average Score": 0.95316419,
327
- "Standard Deviation": 0.02081192856,
328
- "Rank": 6
329
- },
330
- "Algebra": {
331
- "Average Score": 0.759789952,
332
- "Standard Deviation": 0.02611765096,
333
- "Rank": 8
334
- },
335
- "Probability": {
336
- "Average Score": 0.707730127,
337
- "Standard Deviation": 0.0394436664,
338
- "Rank": 10
339
- },
340
- "Logical": {
341
- "Average Score": 0.77342666,
342
- "Standard Deviation": 0.002892426458,
343
- "Rank": 5
344
- },
345
- "Social": {
346
- "Average Score": 0.790002247,
347
- "Standard Deviation": 0.1007410022,
348
- "Rank": 5
349
- },
350
- "Chemistry": {
351
- "Average Score": 82.37734076815008,
352
- "Standard Deviation": null,
353
- "Rank": 6
354
- }
355
- }
356
- },
357
- {
358
- "config": {
359
- "model_name": "o1-mini",
360
- "organization": "OpenAI",
361
- "license": "Proprietary",
362
- "knowledge_cutoff": "2023/10"
363
- },
364
- "results": {
365
- "OVERALL": {
366
- "Average Score": 1.0,
367
- "Standard Deviation": 0.0,
368
- "Rank": 1
369
- },
370
- "Geometry": {
371
- "Average Score": "N/A",
372
- "Standard Deviation": "N/A",
373
- "Rank": "N/A"
374
- },
375
- "Algebra": {
376
- "Average Score": "N/A",
377
- "Standard Deviation": "N/A",
378
- "Rank": "N/A"
379
- },
380
- "Probability": {
381
- "Average Score": 1.0,
382
- "Standard Deviation": 0.0,
383
- "Rank": 1
384
- },
385
- "Logical": {
386
- "Average Score": 1.0,
387
- "Standard Deviation": 0.0,
388
- "Rank": 1
389
- },
390
- "Social": {
391
- "Average Score": 0.993974241,
392
- "Standard Deviation": 0.001996882328,
393
- "Rank": 2
394
- }
395
- }
396
- },
397
- {
398
- "config": {
399
- "model_name": "o1-preview",
400
- "organization": "OpenAI",
401
- "license": "Proprietary",
402
- "knowledge_cutoff": "2023/10"
403
- },
404
- "results": {
405
- "OVERALL": {
406
- "Average Score": 0.945884589,
407
- "Standard Deviation": 0.01059250762,
408
- "Rank": 3
409
- },
410
- "Geometry": {
411
- "Average Score": "N/A",
412
- "Standard Deviation": "N/A",
413
- "Rank": "N/A"
414
- },
415
- "Algebra": {
416
- "Average Score": "N/A",
417
- "Standard Deviation": "N/A",
418
- "Rank": "N/A"
419
- },
420
- "Probability": {
421
- "Average Score": 0.964666392,
422
- "Standard Deviation": 0.003139983398,
423
- "Rank": 2
424
- },
425
- "Logical": {
426
- "Average Score": 0.987950057,
427
- "Standard Deviation": 0.004881220327,
428
- "Rank": 2
429
- },
430
- "Social": {
431
- "Average Score": 1.0,
432
- "Standard Deviation": 0.0,
433
- "Rank": 1
434
- }
435
- }
436
- },
437
- {
438
- "config": {
439
- "model_name": "gemini-1.5-flash-001",
440
- "organization": "Google",
441
- "license": "Proprietary",
442
- "knowledge_cutoff": "2023/11"
443
- },
444
- "results": {
445
- "OVERALL": {
446
- "Average Score": 0.726493401,
447
- "Standard Deviation": 0.01113913725,
448
- "Rank": 12
449
- },
450
- "Geometry": {
451
- "Average Score": 0.804144103,
452
- "Standard Deviation": 0.1327142178,
453
- "Rank": 8
454
- },
455
- "Algebra": {
456
- "Average Score": 0.731776765,
457
- "Standard Deviation": 0.02594657111,
458
- "Rank": 11
459
- },
460
- "Probability": {
461
- "Average Score": 0.614461891,
462
- "Standard Deviation": 0.04690131826,
463
- "Rank": 15
464
- },
465
- "Logical": {
466
- "Average Score": 0.630805991,
467
- "Standard Deviation": 0.04871350612,
468
- "Rank": 13
469
- },
470
- "Social": {
471
- "Average Score": 0.555933822,
472
- "Standard Deviation": 0.1029934524,
473
- "Rank": 16
474
- },
475
- "Chemistry": {
476
- "Average Score": 72.1127762005651,
477
- "Standard Deviation": null,
478
- "Rank": 11
479
- }
480
- }
481
- },
482
- {
483
- "config": {
484
- "model_name": "gpt4-1106",
485
- "organization": "OpenAI",
486
- "license": "Proprietary",
487
- "knowledge_cutoff": "2024/04"
488
- },
489
- "results": {
490
- "OVERALL": {
491
- "Average Score": 0.816347784,
492
- "Standard Deviation": 0.1566815755,
493
- "Rank": 9
494
- },
495
- "Geometry": {
496
- "Average Score": 0.71843088,
497
- "Standard Deviation": 0.04778038294,
498
- "Rank": 13
499
- },
500
- "Algebra": {
501
- "Average Score": 0.712910417,
502
- "Standard Deviation": 0.02581828898,
503
- "Rank": 12
504
- },
505
- "Probability": {
506
- "Average Score": 0.623947619,
507
- "Standard Deviation": 0.03502982933,
508
- "Rank": 14
509
- },
510
- "Logical": {
511
- "Average Score": 0.637482274,
512
- "Standard Deviation": 0.04158809888,
513
- "Rank": 12
514
- },
515
- "Social": {
516
- "Average Score": 0.450609816,
517
- "Standard Deviation": 0.05208655446,
518
- "Rank": 23
519
- },
520
- "Chemistry": {
521
- "Average Score": 69.11824072252848,
522
- "Standard Deviation": null,
523
- "Rank": 13
524
- }
525
- }
526
- },
527
- {
528
- "config": {
529
- "model_name": "gemma-2-27b-it",
530
- "organization": "Google",
531
- "license": "Gemma License",
532
- "knowledge_cutoff": "2024/06"
533
- },
534
- "results": {
535
- "OVERALL": {
536
- "Average Score": 0.624169623,
537
- "Standard Deviation": 0.1048365121,
538
- "Rank": 15
539
- },
540
- "Geometry": {
541
- "Average Score": 0.60112744,
542
- "Standard Deviation": 0.0469109952,
543
- "Rank": 19
544
- },
545
- "Algebra": {
546
- "Average Score": 0.687955914,
547
- "Standard Deviation": 0.01959958192,
548
- "Rank": 13
549
- },
550
- "Probability": {
551
- "Average Score": 0.589524771,
552
- "Standard Deviation": 0.03112689325,
553
- "Rank": 16
554
- },
555
- "Logical": {
556
- "Average Score": 0.614978944,
557
- "Standard Deviation": 0.05710657859,
558
- "Rank": 16
559
- },
560
- "Social": {
561
- "Average Score": 0.487844257,
562
- "Standard Deviation": 0.05857760809,
563
- "Rank": 20
564
- },
565
- "Chemistry": {
566
- "Average Score": 63.28920072143611,
567
- "Standard Deviation": null,
568
- "Rank": 15
569
- }
570
- }
571
- },
572
- {
573
- "config": {
574
- "model_name": "claude-3-opus",
575
- "organization": "Anthropic",
576
- "license": "Proprietary",
577
- "knowledge_cutoff": "2023/08"
578
- },
579
- "results": {
580
- "OVERALL": {
581
- "Average Score": 0.650636271,
582
- "Standard Deviation": 0.1197773541,
583
- "Rank": 14
584
- },
585
- "Geometry": {
586
- "Average Score": 0.7215743,
587
- "Standard Deviation": 0.04712598358,
588
- "Rank": 12
589
- },
590
- "Algebra": {
591
- "Average Score": 0.68777327,
592
- "Standard Deviation": 0.02382683713,
593
- "Rank": 14
594
- },
595
- "Probability": {
596
- "Average Score": 0.626471421,
597
- "Standard Deviation": 0.02911817976,
598
- "Rank": 13
599
- },
600
- "Logical": {
601
- "Average Score": 0.692346381,
602
- "Standard Deviation": 0.03617185198,
603
- "Rank": 10
604
- },
605
- "Social": {
606
- "Average Score": 0.663410854,
607
- "Standard Deviation": 0.09540220876,
608
- "Rank": 10
609
- },
610
- "Chemistry": {
611
- "Average Score": 73.5404403567132,
612
- "Standard Deviation": null,
613
- "Rank": 9
614
- }
615
- }
616
- },
617
- {
618
- "config": {
619
- "model_name": "gemma-2-9b-it-simpo",
620
- "organization": "Google",
621
- "license": "Gemma License",
622
- "knowledge_cutoff": "2024/07"
623
- },
624
- "results": {
625
- "OVERALL": {
626
- "Average Score": "N/A",
627
- "Standard Deviation": "N/A",
628
- "Rank": "N/A"
629
- },
630
- "Geometry": {
631
- "Average Score": 0.582787508,
632
- "Standard Deviation": 0.03965204074,
633
- "Rank": 20
634
- },
635
- "Algebra": {
636
- "Average Score": 0.658648133,
637
- "Standard Deviation": 0.02565919856,
638
- "Rank": 15
639
- },
640
- "Probability": {
641
- "Average Score": 0.547861265,
642
- "Standard Deviation": 0.02885209131,
643
- "Rank": 19
644
- },
645
- "Logical": {
646
- "Average Score": 0.540720893,
647
- "Standard Deviation": 0.01970134508,
648
- "Rank": 20
649
- },
650
- "Social": {
651
- "Average Score": 0.635266187,
652
- "Standard Deviation": 0.03620021751,
653
- "Rank": 13
654
- },
655
- "Chemistry": {
656
- "Average Score": 73.43757596214863,
657
- "Standard Deviation": null,
658
- "Rank": 10
659
- }
660
- }
661
- },
662
- {
663
- "config": {
664
- "model_name": "qwen1.5-72b-chat",
665
- "organization": "Alibaba",
666
- "license": "Qianwen LICENSE",
667
- "knowledge_cutoff": "2024/03"
668
- },
669
- "results": {
670
- "OVERALL": {
671
- "Average Score": 0.519549796,
672
- "Standard Deviation": 0.00903634343,
673
- "Rank": 18
674
- },
675
- "Geometry": {
676
- "Average Score": 0.543139301,
677
- "Standard Deviation": 0.03425202326,
678
- "Rank": 24
679
- },
680
- "Algebra": {
681
- "Average Score": 0.635228729,
682
- "Standard Deviation": 0.01944043425,
683
- "Rank": 16
684
- },
685
- "Probability": {
686
- "Average Score": 0.486948658,
687
- "Standard Deviation": 0.06064655315,
688
- "Rank": 23
689
- },
690
- "Logical": {
691
- "Average Score": 0.284069394,
692
- "Standard Deviation": 0.02686608506,
693
- "Rank": 33
694
- },
695
- "Social": {
696
- "Average Score": 0.415007627,
697
- "Standard Deviation": 0.03920053159,
698
- "Rank": 24
699
- },
700
- "Chemistry": {
701
- "Average Score": 48.69302376665551,
702
- "Standard Deviation": null,
703
- "Rank": 21
704
- }
705
- }
706
- },
707
- {
708
- "config": {
709
- "model_name": "qwen1.5-32b-chat",
710
- "organization": "Alibaba",
711
- "license": "Qianwen LICENSE",
712
- "knowledge_cutoff": "2024/03"
713
- },
714
- "results": {
715
- "OVERALL": {
716
- "Average Score": 0.393789407,
717
- "Standard Deviation": 0.05413770095,
718
- "Rank": 29
719
- },
720
- "Geometry": {
721
- "Average Score": 0.51086835,
722
- "Standard Deviation": 0.04052471998,
723
- "Rank": 27
724
- },
725
- "Algebra": {
726
- "Average Score": 0.609003168,
727
- "Standard Deviation": 0.04874143541,
728
- "Rank": 17
729
- },
730
- "Probability": {
731
- "Average Score": 0.476300002,
732
- "Standard Deviation": 0.05322403912,
733
- "Rank": 24
734
- },
735
- "Logical": {
736
- "Average Score": 0.331781014,
737
- "Standard Deviation": 0.004938997686,
738
- "Rank": 30
739
- },
740
- "Social": {
741
- "Average Score": 0.380987334,
742
- "Standard Deviation": 0.03762251776,
743
- "Rank": 26
744
- },
745
- "Chemistry": {
746
- "Average Score": 45.14284028264288,
747
- "Standard Deviation": null,
748
- "Rank": 25
749
- }
750
- }
751
- },
752
- {
753
- "config": {
754
- "model_name": "google-gemma-2-9b-it",
755
- "organization": "Google",
756
- "license": "Proprietary",
757
- "knowledge_cutoff": "2024/06"
758
- },
759
- "results": {
760
- "OVERALL": {
761
- "Average Score": 0.489663449,
762
- "Standard Deviation": 0.002595702019,
763
- "Rank": 21
764
- },
765
- "Geometry": {
766
- "Average Score": 0.575371308,
767
- "Standard Deviation": 0.03556220251,
768
- "Rank": 22
769
- },
770
- "Algebra": {
771
- "Average Score": 0.597045661,
772
- "Standard Deviation": 0.0313828123,
773
- "Rank": 18
774
- },
775
- "Probability": {
776
- "Average Score": 0.589221807,
777
- "Standard Deviation": 0.03110811656,
778
- "Rank": 18
779
- },
780
- "Logical": {
781
- "Average Score": 0.587579897,
782
- "Standard Deviation": 0.05512716783,
783
- "Rank": 18
784
- },
785
- "Social": {
786
- "Average Score": 0.768337958,
787
- "Standard Deviation": 0.04078610476,
788
- "Rank": 6
789
- },
790
- "Chemistry": {
791
- "Average Score": 54.03167523687635,
792
- "Standard Deviation": null,
793
- "Rank": 18
794
- }
795
- }
796
- },
797
- {
798
- "config": {
799
- "model_name": "yi-1.5-34b-chat",
800
- "organization": "01 AI",
801
- "license": "Proprietary",
802
- "knowledge_cutoff": "2024/05"
803
- },
804
- "results": {
805
- "OVERALL": {
806
- "Average Score": 0.607812897,
807
- "Standard Deviation": 0.1440881293,
808
- "Rank": 16
809
- },
810
- "Geometry": {
811
- "Average Score": 0.566666724,
812
- "Standard Deviation": 0.04001381658,
813
- "Rank": 23
814
- },
815
- "Algebra": {
816
- "Average Score": 0.590997292,
817
- "Standard Deviation": 0.03594087315,
818
- "Rank": 19
819
- },
820
- "Probability": {
821
- "Average Score": 0.589524589,
822
- "Standard Deviation": 0.03112618772,
823
- "Rank": 17
824
- },
825
- "Logical": {
826
- "Average Score": 0.574105508,
827
- "Standard Deviation": 0.03441737941,
828
- "Rank": 19
829
- },
830
- "Social": {
831
- "Average Score": 0.516980832,
832
- "Standard Deviation": 0.03369347985,
833
- "Rank": 19
834
- },
835
- "Chemistry": {
836
- "Average Score": 52.148798061768964,
837
- "Standard Deviation": null,
838
- "Rank": 19
839
- }
840
- }
841
- },
842
- {
843
- "config": {
844
- "model_name": "meta-llama-3.1-8b-instruct",
845
- "organization": "Meta",
846
- "license": "Llama 3.1 Community",
847
- "knowledge_cutoff": "2023/12"
848
- },
849
- "results": {
850
- "OVERALL": {
851
- "Average Score": 0.505936324,
852
- "Standard Deviation": 0.05286756493,
853
- "Rank": 19
854
- },
855
- "Geometry": {
856
- "Average Score": 0.522442162,
857
- "Standard Deviation": 0.03908236317,
858
- "Rank": 25
859
- },
860
- "Algebra": {
861
- "Average Score": 0.582702645,
862
- "Standard Deviation": 0.05002277711,
863
- "Rank": 20
864
- },
865
- "Probability": {
866
- "Average Score": 0.495001149,
867
- "Standard Deviation": 0.05244587037,
868
- "Rank": 22
869
- },
870
- "Logical": {
871
- "Average Score": 0.443030561,
872
- "Standard Deviation": 0.01343820628,
873
- "Rank": 25
874
- },
875
- "Social": {
876
- "Average Score": 0.329195941,
877
- "Standard Deviation": 0.03925019528,
878
- "Rank": 30
879
- },
880
- "Chemistry": {
881
- "Average Score": 44.41846841004584,
882
- "Standard Deviation": null,
883
- "Rank": 27
884
- }
885
- }
886
- },
887
- {
888
- "config": {
889
- "model_name": "gpt3.5-turbo-0125",
890
- "organization": "OpenAI",
891
- "license": "Proprietary",
892
- "knowledge_cutoff": "2021/09"
893
- },
894
- "results": {
895
- "OVERALL": {
896
- "Average Score": 0.313398088,
897
- "Standard Deviation": 0.09322528606,
898
- "Rank": 40
899
- },
900
- "Geometry": {
901
- "Average Score": 0.678714519,
902
- "Standard Deviation": 0.05926546762,
903
- "Rank": 14
904
- },
905
- "Algebra": {
906
- "Average Score": 0.569296173,
907
- "Standard Deviation": 0.05277281097,
908
- "Rank": 21
909
- },
910
- "Probability": {
911
- "Average Score": 0.448460767,
912
- "Standard Deviation": 0.05768095196,
913
- "Rank": 26
914
- },
915
- "Logical": {
916
- "Average Score": 0.148521348,
917
- "Standard Deviation": 0.04033712907,
918
- "Rank": 45
919
- },
920
- "Social": {
921
- "Average Score": 0.235071541,
922
- "Standard Deviation": 0.02632892457,
923
- "Rank": 39
924
- },
925
- "Chemistry": {
926
- "Average Score": 40.46958736582551,
927
- "Standard Deviation": null,
928
- "Rank": 30
929
- }
930
- }
931
- },
932
- {
933
- "config": {
934
- "model_name": "llama-3-70b-instruct",
935
- "organization": "Meta",
936
- "license": "Llama 3 Community",
937
- "knowledge_cutoff": "2023/12"
938
- },
939
- "results": {
940
- "OVERALL": {
941
- "Average Score": 0.456689885,
942
- "Standard Deviation": 0.01385989995,
943
- "Rank": 23
944
- },
945
- "Geometry": {
946
- "Average Score": 0.516865529,
947
- "Standard Deviation": 0.03858112564,
948
- "Rank": 26
949
- },
950
- "Algebra": {
951
- "Average Score": 0.566756531,
952
- "Standard Deviation": 0.03369826926,
953
- "Rank": 22
954
- },
955
- "Probability": {
956
- "Average Score": 0.513857306,
957
- "Standard Deviation": 0.05453699062,
958
- "Rank": 21
959
- },
960
- "Logical": {
961
- "Average Score": 0.713796415,
962
- "Standard Deviation": 0.02031215107,
963
- "Rank": 9
964
- },
965
- "Social": {
966
- "Average Score": 0.45872939,
967
- "Standard Deviation": 0.05347039576,
968
- "Rank": 22
969
- },
970
- "Chemistry": {
971
- "Average Score": 65.32140697218945,
972
- "Standard Deviation": null,
973
- "Rank": 14
974
- }
975
- }
976
- },
977
- {
978
- "config": {
979
- "model_name": "claude-3-sonnet",
980
- "organization": "Anthropic",
981
- "license": "Proprietary",
982
- "knowledge_cutoff": "2023/08"
983
- },
984
- "results": {
985
- "OVERALL": {
986
- "Average Score": 0.520010833,
987
- "Standard Deviation": 0.005030563799,
988
- "Rank": 17
989
- },
990
- "Geometry": {
991
- "Average Score": 0.675613638,
992
- "Standard Deviation": 0.05275594408,
993
- "Rank": 15
994
- },
995
- "Algebra": {
996
- "Average Score": 0.552025728,
997
- "Standard Deviation": 0.04122192409,
998
- "Rank": 23
999
- },
1000
- "Probability": {
1001
- "Average Score": 0.516192848,
1002
- "Standard Deviation": 0.04152293217,
1003
- "Rank": 20
1004
- },
1005
- "Logical": {
1006
- "Average Score": 0.588545747,
1007
- "Standard Deviation": 0.06068211943,
1008
- "Rank": 17
1009
- },
1010
- "Social": {
1011
- "Average Score": 0.570437582,
1012
- "Standard Deviation": 0.08607040862,
1013
- "Rank": 15
1014
- },
1015
- "Chemistry": {
1016
- "Average Score": 61.33538592327427,
1017
- "Standard Deviation": null,
1018
- "Rank": 16
1019
- }
1020
- }
1021
- },
1022
- {
1023
- "config": {
1024
- "model_name": "qwen1.5-14b-chat",
1025
- "organization": "Alibaba",
1026
- "license": "Qianwen LICENSE",
1027
- "knowledge_cutoff": "2024/02"
1028
- },
1029
- "results": {
1030
- "OVERALL": {
1031
- "Average Score": 0.415328996,
1032
- "Standard Deviation": 0.0743938717,
1033
- "Rank": 28
1034
- },
1035
- "Geometry": {
1036
- "Average Score": 0.452504016,
1037
- "Standard Deviation": 0.04225594393,
1038
- "Rank": 28
1039
- },
1040
- "Algebra": {
1041
- "Average Score": 0.538655725,
1042
- "Standard Deviation": 0.03721542594,
1043
- "Rank": 24
1044
- },
1045
- "Probability": {
1046
- "Average Score": 0.397185975,
1047
- "Standard Deviation": 0.05607695946,
1048
- "Rank": 30
1049
- },
1050
- "Logical": {
1051
- "Average Score": 0.264573129,
1052
- "Standard Deviation": 0.03936133174,
1053
- "Rank": 35
1054
- },
1055
- "Social": {
1056
- "Average Score": 0.287370142,
1057
- "Standard Deviation": 0.04264085315,
1058
- "Rank": 32
1059
- },
1060
- "Chemistry": {
1061
- "Average Score": 38.552779976347026,
1062
- "Standard Deviation": null,
1063
- "Rank": 32
1064
- }
1065
- }
1066
- },
1067
- {
1068
- "config": {
1069
- "model_name": "claude-3-haiku",
1070
- "organization": "Anthropic",
1071
- "license": "Proprietary",
1072
- "knowledge_cutoff": "2023/08"
1073
- },
1074
- "results": {
1075
- "OVERALL": {
1076
- "Average Score": 0.453901163,
1077
- "Standard Deviation": 0.003604084261,
1078
- "Rank": 24
1079
- },
1080
- "Geometry": {
1081
- "Average Score": 0.607993912,
1082
- "Standard Deviation": 0.05793460748,
1083
- "Rank": 17
1084
- },
1085
- "Algebra": {
1086
- "Average Score": 0.520054055,
1087
- "Standard Deviation": 0.03333544511,
1088
- "Rank": 25
1089
- },
1090
- "Probability": {
1091
- "Average Score": 0.474460688,
1092
- "Standard Deviation": 0.0446501933,
1093
- "Rank": 25
1094
- },
1095
- "Logical": {
1096
- "Average Score": 0.512815976,
1097
- "Standard Deviation": 0.0163264281,
1098
- "Rank": 21
1099
- },
1100
- "Social": {
1101
- "Average Score": 0.551083976,
1102
- "Standard Deviation": 0.05374722539,
1103
- "Rank": 17
1104
- },
1105
- "Chemistry": {
1106
- "Average Score": 56.40200048817984,
1107
- "Standard Deviation": null,
1108
- "Rank": 17
1109
- }
1110
- }
1111
- },
1112
- {
1113
- "config": {
1114
- "model_name": "claude-2.1",
1115
- "organization": "Anthropic",
1116
- "license": "Proprietary",
1117
- "knowledge_cutoff": "Unknown"
1118
- },
1119
- "results": {
1120
- "OVERALL": {
1121
- "Average Score": 0.35814708,
1122
- "Standard Deviation": 0.09168134168,
1123
- "Rank": 36
1124
- },
1125
- "Geometry": {
1126
- "Average Score": 0.62752395,
1127
- "Standard Deviation": 0.07232659398,
1128
- "Rank": 16
1129
- },
1130
- "Algebra": {
1131
- "Average Score": 0.508849609,
1132
- "Standard Deviation": 0.0346897465,
1133
- "Rank": 26
1134
- },
1135
- "Probability": {
1136
- "Average Score": 0.41477086,
1137
- "Standard Deviation": 0.05964060239,
1138
- "Rank": 29
1139
- },
1140
- "Logical": {
1141
- "Average Score": 0.482923674,
1142
- "Standard Deviation": 0.01989147048,
1143
- "Rank": 22
1144
- },
1145
- "Social": {
1146
- "Average Score": 0.333804568,
1147
- "Standard Deviation": 0.03775548253,
1148
- "Rank": 29
1149
- },
1150
- "Chemistry": {
1151
- "Average Score": 47.23672563994903,
1152
- "Standard Deviation": null,
1153
- "Rank": 22
1154
- }
1155
- }
1156
- },
1157
- {
1158
- "config": {
1159
- "model_name": "mistral-8x7b-instruct-v0.1",
1160
- "organization": "Mistral",
1161
- "license": "Apache 2.0",
1162
- "knowledge_cutoff": "2023/12"
1163
- },
1164
- "results": {
1165
- "OVERALL": {
1166
- "Average Score": 0.382659161,
1167
- "Standard Deviation": 0.07594496929,
1168
- "Rank": 31
1169
- },
1170
- "Geometry": {
1171
- "Average Score": 0.432216097,
1172
- "Standard Deviation": 0.04747949254,
1173
- "Rank": 31
1174
- },
1175
- "Algebra": {
1176
- "Average Score": 0.478314888,
1177
- "Standard Deviation": 0.01998797419,
1178
- "Rank": 27
1179
- },
1180
- "Probability": {
1181
- "Average Score": 0.427144725,
1182
- "Standard Deviation": 0.0590923329,
1183
- "Rank": 28
1184
- },
1185
- "Logical": {
1186
- "Average Score": 0.340041983,
1187
- "Standard Deviation": 0.008397574592,
1188
- "Rank": 28
1189
- },
1190
- "Social": {
1191
- "Average Score": 0.251949622,
1192
- "Standard Deviation": 0.03346674405,
1193
- "Rank": 37
1194
- },
1195
- "Chemistry": {
1196
- "Average Score": 44.533118241976666,
1197
- "Standard Deviation": null,
1198
- "Rank": 26
1199
- }
1200
- }
1201
- },
1202
- {
1203
- "config": {
1204
- "model_name": "claude-2.0",
1205
- "organization": "Anthropic",
1206
- "license": "Proprietary",
1207
- "knowledge_cutoff": "Unknown"
1208
- },
1209
- "results": {
1210
- "OVERALL": {
1211
- "Average Score": 0.322718057,
1212
- "Standard Deviation": 0.08369883584,
1213
- "Rank": 38
1214
- },
1215
- "Geometry": {
1216
- "Average Score": 0.604141967,
1217
- "Standard Deviation": 0.05116441826,
1218
- "Rank": 18
1219
- },
1220
- "Algebra": {
1221
- "Average Score": 0.474350734,
1222
- "Standard Deviation": 0.01510393066,
1223
- "Rank": 28
1224
- },
1225
- "Probability": {
1226
- "Average Score": 0.437950412,
1227
- "Standard Deviation": 0.05985594317,
1228
- "Rank": 27
1229
- },
1230
- "Logical": {
1231
- "Average Score": 0.445620646,
1232
- "Standard Deviation": 0.01812614805,
1233
- "Rank": 24
1234
- },
1235
- "Social": {
1236
- "Average Score": 0.469422836,
1237
- "Standard Deviation": 0.05999901796,
1238
- "Rank": 21
1239
- },
1240
- "Chemistry": {
1241
- "Average Score": 50.773143448036464,
1242
- "Standard Deviation": null,
1243
- "Rank": 20
1244
- }
1245
- }
1246
- },
1247
- {
1248
- "config": {
1249
- "model_name": "starling-lm-7b-beta",
1250
- "organization": "Nexusflow",
1251
- "license": "Apache-2.0",
1252
- "knowledge_cutoff": "2024/03"
1253
- },
1254
- "results": {
1255
- "OVERALL": {
1256
- "Average Score": 0.479391856,
1257
- "Standard Deviation": 0.04199990887,
1258
- "Rank": 22
1259
- },
1260
- "Geometry": {
1261
- "Average Score": 0.446654388,
1262
- "Standard Deviation": 0.05637864999,
1263
- "Rank": 30
1264
- },
1265
- "Algebra": {
1266
- "Average Score": 0.473952749,
1267
- "Standard Deviation": 0.01584301288,
1268
- "Rank": 29
1269
- },
1270
- "Probability": {
1271
- "Average Score": 0.395197837,
1272
- "Standard Deviation": 0.05814798892,
1273
- "Rank": 31
1274
- },
1275
- "Logical": {
1276
- "Average Score": 0.39927199,
1277
- "Standard Deviation": 0.02125277518,
1278
- "Rank": 26
1279
- },
1280
- "Social": {
1281
- "Average Score": 0.380021662,
1282
- "Standard Deviation": 0.04622452748,
1283
- "Rank": 27
1284
- },
1285
- "Chemistry": {
1286
- "Average Score": 38.27587102395908,
1287
- "Standard Deviation": null,
1288
- "Rank": 33
1289
- }
1290
- }
1291
- },
1292
- {
1293
- "config": {
1294
- "model_name": "gemini-1.0-pro-001",
1295
- "organization": "Google",
1296
- "license": "Proprietary",
1297
- "knowledge_cutoff": "2023/04"
1298
- },
1299
- "results": {
1300
- "OVERALL": {
1301
- "Average Score": 0.449040654,
1302
- "Standard Deviation": 0.0450610177,
1303
- "Rank": 25
1304
- },
1305
- "Geometry": {
1306
- "Average Score": 0.578347959,
1307
- "Standard Deviation": 0.04242873607,
1308
- "Rank": 21
1309
- },
1310
- "Algebra": {
1311
- "Average Score": 0.462417786,
1312
- "Standard Deviation": 0.01668313635,
1313
- "Rank": 30
1314
- },
1315
- "Probability": {
1316
- "Average Score": 0.289836324,
1317
- "Standard Deviation": 0.05739831115,
1318
- "Rank": 39
1319
- },
1320
- "Logical": {
1321
- "Average Score": 0.191140355,
1322
- "Standard Deviation": 0.03394652499,
1323
- "Rank": 41
1324
- },
1325
- "Social": {
1326
- "Average Score": 0.130790863,
1327
- "Standard Deviation": 0.02800188173,
1328
- "Rank": 47
1329
- },
1330
- "Chemistry": {
1331
- "Average Score": 45.22204471452975,
1332
- "Standard Deviation": null,
1333
- "Rank": 24
1334
- }
1335
- }
1336
- },
1337
- {
1338
- "config": {
1339
- "model_name": "openchat-3.5-0106",
1340
- "organization": "OpenChat",
1341
- "license": "Apache-2.0",
1342
- "knowledge_cutoff": "2024/01"
1343
- },
1344
- "results": {
1345
- "OVERALL": {
1346
- "Average Score": 0.363929888,
1347
- "Standard Deviation": 0.08602347145,
1348
- "Rank": 34
1349
- },
1350
- "Geometry": {
1351
- "Average Score": 0.38715246,
1352
- "Standard Deviation": 0.03701851946,
1353
- "Rank": 34
1354
- },
1355
- "Algebra": {
1356
- "Average Score": 0.441233712,
1357
- "Standard Deviation": 0.01135753754,
1358
- "Rank": 31
1359
- },
1360
- "Probability": {
1361
- "Average Score": 0.38802618,
1362
- "Standard Deviation": 0.05663879714,
1363
- "Rank": 32
1364
- },
1365
- "Logical": {
1366
- "Average Score": 0.336754383,
1367
- "Standard Deviation": 0.01608478079,
1368
- "Rank": 29
1369
- },
1370
- "Social": {
1371
- "Average Score": 0.250891608,
1372
- "Standard Deviation": 0.03253769914,
1373
- "Rank": 38
1374
- },
1375
- "Chemistry": {
1376
- "Average Score": 33.70639271807677,
1377
- "Standard Deviation": null,
1378
- "Rank": 34
1379
- }
1380
- }
1381
- },
1382
- {
1383
- "config": {
1384
- "model_name": "openchat-3.5",
1385
- "organization": "OpenChat",
1386
- "license": "Apache-2.0",
1387
- "knowledge_cutoff": "2023/11"
1388
- },
1389
- "results": {
1390
- "OVERALL": {
1391
- "Average Score": 0.361341296,
1392
- "Standard Deviation": 0.09034869493,
1393
- "Rank": 35
1394
- },
1395
- "Geometry": {
1396
- "Average Score": 0.401699069,
1397
- "Standard Deviation": 0.03410726557,
1398
- "Rank": 32
1399
- },
1400
- "Algebra": {
1401
- "Average Score": 0.414095336,
1402
- "Standard Deviation": 0.01881964261,
1403
- "Rank": 33
1404
- },
1405
- "Probability": {
1406
- "Average Score": 0.349601002,
1407
- "Standard Deviation": 0.05077455539,
1408
- "Rank": 34
1409
- },
1410
- "Logical": {
1411
- "Average Score": 0.331069242,
1412
- "Standard Deviation": 0.02180827173,
1413
- "Rank": 31
1414
- },
1415
- "Social": {
1416
- "Average Score": 0.319991655,
1417
- "Standard Deviation": 0.04502478724,
1418
- "Rank": 31
1419
- },
1420
- "Chemistry": {
1421
- "Average Score": 33.020911255646965,
1422
- "Standard Deviation": null,
1423
- "Rank": 35
1424
- }
1425
- }
1426
- },
1427
- {
1428
- "config": {
1429
- "model_name": "command-r-(08-2024)",
1430
- "organization": "Cohere",
1431
- "license": "CC-BY-NC-4.0",
1432
- "knowledge_cutoff": "2024/08"
1433
- },
1434
- "results": {
1435
- "OVERALL": {
1436
- "Average Score": 0.427605298,
1437
- "Standard Deviation": 0.01747449163,
1438
- "Rank": 26
1439
- },
1440
- "Geometry": {
1441
- "Average Score": 0.448300727,
1442
- "Standard Deviation": 0.04996362328,
1443
- "Rank": 29
1444
- },
1445
- "Algebra": {
1446
- "Average Score": 0.417519167,
1447
- "Standard Deviation": 0.01822196902,
1448
- "Rank": 32
1449
- },
1450
- "Probability": {
1451
- "Average Score": 0.366336281,
1452
- "Standard Deviation": 0.04716826942,
1453
- "Rank": 33
1454
- },
1455
- "Logical": {
1456
- "Average Score": 0.214657906,
1457
- "Standard Deviation": 0.03003579835,
1458
- "Rank": 38
1459
- },
1460
- "Social": {
1461
- "Average Score": 0.276088379,
1462
- "Standard Deviation": 0.03295234688,
1463
- "Rank": 34
1464
- },
1465
- "Chemistry": {
1466
- "Average Score": 39.61492485677676,
1467
- "Standard Deviation": null,
1468
- "Rank": 31
1469
- }
1470
- }
1471
- },
1472
- {
1473
- "config": {
1474
- "model_name": "gemma-1.1-7b-it",
1475
- "organization": "Google",
1476
- "license": "Gemma License",
1477
- "knowledge_cutoff": "2024/02"
1478
- },
1479
- "results": {
1480
- "OVERALL": {
1481
- "Average Score": 0.339506922,
1482
- "Standard Deviation": 0.1066279108,
1483
- "Rank": 37
1484
- },
1485
- "Geometry": {
1486
- "Average Score": 0.324170977,
1487
- "Standard Deviation": 0.04668553765,
1488
- "Rank": 37
1489
- },
1490
- "Algebra": {
1491
- "Average Score": 0.398684697,
1492
- "Standard Deviation": 0.01982398259,
1493
- "Rank": 34
1494
- },
1495
- "Probability": {
1496
- "Average Score": 0.293253175,
1497
- "Standard Deviation": 0.05126192191,
1498
- "Rank": 38
1499
- },
1500
- "Logical": {
1501
- "Average Score": 0.317750796,
1502
- "Standard Deviation": 0.01101933543,
1503
- "Rank": 32
1504
- },
1505
- "Social": {
1506
- "Average Score": 0.179073276,
1507
- "Standard Deviation": 0.02009658805,
1508
- "Rank": 43
1509
- },
1510
- "Chemistry": {
1511
- "Average Score": 42.666504105798204,
1512
- "Standard Deviation": null,
1513
- "Rank": 28
1514
- }
1515
- }
1516
- },
1517
- {
1518
- "config": {
1519
- "model_name": "llama3-8b-instruct",
1520
- "organization": "Meta",
1521
- "license": "Llama 3 Community",
1522
- "knowledge_cutoff": "2023/03"
1523
- },
1524
- "results": {
1525
- "OVERALL": {
1526
- "Average Score": 0.367722676,
1527
- "Standard Deviation": 0.1071368221,
1528
- "Rank": 32
1529
- },
1530
- "Geometry": {
1531
- "Average Score": 0.367143758,
1532
- "Standard Deviation": 0.04363680358,
1533
- "Rank": 35
1534
- },
1535
- "Algebra": {
1536
- "Average Score": 0.391480973,
1537
- "Standard Deviation": 0.02757445266,
1538
- "Rank": 35
1539
- },
1540
- "Probability": {
1541
- "Average Score": 0.317616445,
1542
- "Standard Deviation": 0.04300430361,
1543
- "Rank": 37
1544
- },
1545
- "Logical": {
1546
- "Average Score": 0.461607495,
1547
- "Standard Deviation": 0.02185028842,
1548
- "Rank": 23
1549
- },
1550
- "Social": {
1551
- "Average Score": 0.336373622,
1552
- "Standard Deviation": 0.05762408512,
1553
- "Rank": 28
1554
- },
1555
- "Chemistry": {
1556
- "Average Score": 45.35392139264795,
1557
- "Standard Deviation": null,
1558
- "Rank": 23
1559
- }
1560
- }
1561
- },
1562
- {
1563
- "config": {
1564
- "model_name": "gemma-2-2b-it",
1565
- "organization": "Google",
1566
- "license": "Gemma License",
1567
- "knowledge_cutoff": "2024/07"
1568
- },
1569
- "results": {
1570
- "OVERALL": {
1571
- "Average Score": 0.502167612,
1572
- "Standard Deviation": 0.04389786763,
1573
- "Rank": 20
1574
- },
1575
- "Geometry": {
1576
- "Average Score": 0.395006676,
1577
- "Standard Deviation": 0.05882607713,
1578
- "Rank": 33
1579
- },
1580
- "Algebra": {
1581
- "Average Score": 0.379391887,
1582
- "Standard Deviation": 0.01722410785,
1583
- "Rank": 36
1584
- },
1585
- "Probability": {
1586
- "Average Score": 0.331231097,
1587
- "Standard Deviation": 0.05392499987,
1588
- "Rank": 36
1589
- },
1590
- "Logical": {
1591
- "Average Score": 0.367687789,
1592
- "Standard Deviation": 0.02547968808,
1593
- "Rank": 27
1594
- },
1595
- "Social": {
1596
- "Average Score": 0.393482094,
1597
- "Standard Deviation": 0.06450214024,
1598
- "Rank": 25
1599
- },
1600
- "Chemistry": {
1601
- "Average Score": 30.53406933106768,
1602
- "Standard Deviation": null,
1603
- "Rank": 37
1604
- }
1605
- }
1606
- },
1607
- {
1608
- "config": {
1609
- "model_name": "starling-lm-7b-alpha",
1610
- "organization": "Nexusflow",
1611
- "license": "Apache-2.0",
1612
- "knowledge_cutoff": "2023/11"
1613
- },
1614
- "results": {
1615
- "OVERALL": {
1616
- "Average Score": 0.366628765,
1617
- "Standard Deviation": 0.08405492929,
1618
- "Rank": 33
1619
- },
1620
- "Geometry": {
1621
- "Average Score": 0.336782578,
1622
- "Standard Deviation": 0.04069449132,
1623
- "Rank": 36
1624
- },
1625
- "Algebra": {
1626
- "Average Score": 0.371551932,
1627
- "Standard Deviation": 0.03367241745,
1628
- "Rank": 37
1629
- },
1630
- "Probability": {
1631
- "Average Score": 0.331472505,
1632
- "Standard Deviation": 0.04833324282,
1633
- "Rank": 35
1634
- },
1635
- "Logical": {
1636
- "Average Score": 0.260869624,
1637
- "Standard Deviation": 0.03562735237,
1638
- "Rank": 36
1639
- },
1640
- "Social": {
1641
- "Average Score": 0.271975534,
1642
- "Standard Deviation": 0.04266753408,
1643
- "Rank": 35
1644
- },
1645
- "Chemistry": {
1646
- "Average Score": 30.07926487356878,
1647
- "Standard Deviation": null,
1648
- "Rank": 38
1649
- }
1650
- }
1651
- },
1652
- {
1653
- "config": {
1654
- "model_name": "qwen1.5-4b-chat",
1655
- "organization": "Alibaba",
1656
- "license": "Qianwen LICENSE",
1657
- "knowledge_cutoff": "2024/02"
1658
- },
1659
- "results": {
1660
- "OVERALL": {
1661
- "Average Score": 0.111876411,
1662
- "Standard Deviation": 0.04241022785,
1663
- "Rank": 49
1664
- },
1665
- "Geometry": {
1666
- "Average Score": 0.215834522,
1667
- "Standard Deviation": 0.0363766363,
1668
- "Rank": 41
1669
- },
1670
- "Algebra": {
1671
- "Average Score": 0.305589811,
1672
- "Standard Deviation": 0.02354198912,
1673
- "Rank": 38
1674
- },
1675
- "Probability": {
1676
- "Average Score": 0.149365327,
1677
- "Standard Deviation": 0.03489672675,
1678
- "Rank": 45
1679
- },
1680
- "Logical": {
1681
- "Average Score": 0.116210168,
1682
- "Standard Deviation": 0.005927966496,
1683
- "Rank": 47
1684
- },
1685
- "Social": {
1686
- "Average Score": 0.18195615,
1687
- "Standard Deviation": 0.02269805277,
1688
- "Rank": 42
1689
- },
1690
- "Chemistry": {
1691
- "Average Score": 13.21208067122554,
1692
- "Standard Deviation": null,
1693
- "Rank": 48
1694
- }
1695
- }
1696
- },
1697
- {
1698
- "config": {
1699
- "model_name": "command-r-(04-2024)",
1700
- "organization": "Cohere",
1701
- "license": "CC-BY-NC-4.0",
1702
- "knowledge_cutoff": "2024/04"
1703
- },
1704
- "results": {
1705
- "OVERALL": {
1706
- "Average Score": 0.388783887,
1707
- "Standard Deviation": 0.07417186783,
1708
- "Rank": 30
1709
- },
1710
- "Geometry": {
1711
- "Average Score": 0.300416698,
1712
- "Standard Deviation": 0.03485612736,
1713
- "Rank": 38
1714
- },
1715
- "Algebra": {
1716
- "Average Score": 0.293120231,
1717
- "Standard Deviation": 0.032926484,
1718
- "Rank": 39
1719
- },
1720
- "Probability": {
1721
- "Average Score": 0.281271304,
1722
- "Standard Deviation": 0.05697149867,
1723
- "Rank": 40
1724
- },
1725
- "Logical": {
1726
- "Average Score": 0.276189906,
1727
- "Standard Deviation": 0.03562914754,
1728
- "Rank": 34
1729
- },
1730
- "Social": {
1731
- "Average Score": 0.283882949,
1732
- "Standard Deviation": 0.03336901148,
1733
- "Rank": 33
1734
- },
1735
- "Chemistry": {
1736
- "Average Score": 41.346336503003236,
1737
- "Standard Deviation": null,
1738
- "Rank": 29
1739
- }
1740
- }
1741
- },
1742
- {
1743
- "config": {
1744
- "model_name": "vicuna-33b",
1745
- "organization": "LMSYS",
1746
- "license": "Non-commercial",
1747
- "knowledge_cutoff": "2023/08"
1748
- },
1749
- "results": {
1750
- "OVERALL": {
1751
- "Average Score": 0.316543555,
1752
- "Standard Deviation": 0.08922095647,
1753
- "Rank": 39
1754
- },
1755
- "Geometry": {
1756
- "Average Score": 0.208284679,
1757
- "Standard Deviation": 0.03937771461,
1758
- "Rank": 42
1759
- },
1760
- "Algebra": {
1761
- "Average Score": 0.248994048,
1762
- "Standard Deviation": 0.02668175054,
1763
- "Rank": 41
1764
- },
1765
- "Probability": {
1766
- "Average Score": 0.222313995,
1767
- "Standard Deviation": 0.03978859759,
1768
- "Rank": 43
1769
- },
1770
- "Logical": {
1771
- "Average Score": 0.180291222,
1772
- "Standard Deviation": 0.021886267,
1773
- "Rank": 42
1774
- },
1775
- "Social": {
1776
- "Average Score": 0.257623798,
1777
- "Standard Deviation": 0.02653724437,
1778
- "Rank": 36
1779
- },
1780
- "Chemistry": {
1781
- "Average Score": 28.01838653090379,
1782
- "Standard Deviation": null,
1783
- "Rank": 39
1784
- }
1785
- }
1786
- },
1787
- {
1788
- "config": {
1789
- "model_name": "gemma-7b-it",
1790
- "organization": "Google",
1791
- "license": "Gemma License",
1792
- "knowledge_cutoff": "2024/02"
1793
- },
1794
- "results": {
1795
- "OVERALL": {
1796
- "Average Score": 0.285077558,
1797
- "Standard Deviation": 0.08871758453,
1798
- "Rank": 41
1799
- },
1800
- "Geometry": {
1801
- "Average Score": 0.244791417,
1802
- "Standard Deviation": 0.0289612078,
1803
- "Rank": 39
1804
- },
1805
- "Algebra": {
1806
- "Average Score": 0.250614794,
1807
- "Standard Deviation": 0.01991678295,
1808
- "Rank": 40
1809
- },
1810
- "Probability": {
1811
- "Average Score": 0.174313053,
1812
- "Standard Deviation": 0.03765424728,
1813
- "Rank": 44
1814
- },
1815
- "Logical": {
1816
- "Average Score": 0.197505536,
1817
- "Standard Deviation": 0.02050298885,
1818
- "Rank": 39
1819
- },
1820
- "Social": {
1821
- "Average Score": 0.202138025,
1822
- "Standard Deviation": 0.02098346639,
1823
- "Rank": 41
1824
- },
1825
- "Chemistry": {
1826
- "Average Score": 28.014658234926813,
1827
- "Standard Deviation": null,
1828
- "Rank": 40
1829
- }
1830
- }
1831
- },
1832
- {
1833
- "config": {
1834
- "model_name": "mistral-7b-instruct-2",
1835
- "organization": "Mistral",
1836
- "license": "Apache 2.0",
1837
- "knowledge_cutoff": "2023/12"
1838
- },
1839
- "results": {
1840
- "OVERALL": {
1841
- "Average Score": 0.427513868,
1842
- "Standard Deviation": 0.05553921135,
1843
- "Rank": 27
1844
- },
1845
- "Geometry": {
1846
- "Average Score": 0.216402626,
1847
- "Standard Deviation": 0.03338414918,
1848
- "Rank": 40
1849
- },
1850
- "Algebra": {
1851
- "Average Score": 0.233777838,
1852
- "Standard Deviation": 0.0155226054,
1853
- "Rank": 42
1854
- },
1855
- "Probability": {
1856
- "Average Score": 0.25118175,
1857
- "Standard Deviation": 0.04065514593,
1858
- "Rank": 41
1859
- },
1860
- "Logical": {
1861
- "Average Score": 0.224469136,
1862
- "Standard Deviation": 0.03404706752,
1863
- "Rank": 37
1864
- },
1865
- "Social": {
1866
- "Average Score": 0.209386782,
1867
- "Standard Deviation": 0.02738569921,
1868
- "Rank": 40
1869
- },
1870
- "Chemistry": {
1871
- "Average Score": 31.382959631870822,
1872
- "Standard Deviation": null,
1873
- "Rank": 36
1874
- }
1875
- }
1876
- },
1877
- {
1878
- "config": {
1879
- "model_name": "mistral-7b-instruct-1",
1880
- "organization": "Mistral",
1881
- "license": "Apache 2.0",
1882
- "knowledge_cutoff": "2023/12"
1883
- },
1884
- "results": {
1885
- "OVERALL": {
1886
- "Average Score": 0.23016314,
1887
- "Standard Deviation": 0.07137625271,
1888
- "Rank": 46
1889
- },
1890
- "Geometry": {
1891
- "Average Score": 0.161799938,
1892
- "Standard Deviation": 0.03595278559,
1893
- "Rank": 46
1894
- },
1895
- "Algebra": {
1896
- "Average Score": 0.210341624,
1897
- "Standard Deviation": 0.01736539119,
1898
- "Rank": 43
1899
- },
1900
- "Probability": {
1901
- "Average Score": 0.238417922,
1902
- "Standard Deviation": 0.03744211933,
1903
- "Rank": 42
1904
- },
1905
- "Logical": {
1906
- "Average Score": 0.142636601,
1907
- "Standard Deviation": 0.02080406365,
1908
- "Rank": 46
1909
- },
1910
- "Social": {
1911
- "Average Score": 0.117646827,
1912
- "Standard Deviation": 0.009321202779,
1913
- "Rank": 49
1914
- },
1915
- "Chemistry": {
1916
- "Average Score": 18.929093202755805,
1917
- "Standard Deviation": null,
1918
- "Rank": 43
1919
- }
1920
- }
1921
- },
1922
- {
1923
- "config": {
1924
- "model_name": "vicuna-13b",
1925
- "organization": "LMSYS",
1926
- "license": "Non-commercial",
1927
- "knowledge_cutoff": "2023/07"
1928
- },
1929
- "results": {
1930
- "OVERALL": {
1931
- "Average Score": 0.201892849,
1932
- "Standard Deviation": 0.06021749802,
1933
- "Rank": 47
1934
- },
1935
- "Geometry": {
1936
- "Average Score": 0.200941928,
1937
- "Standard Deviation": 0.03366817781,
1938
- "Rank": 43
1939
- },
1940
- "Algebra": {
1941
- "Average Score": 0.196123323,
1942
- "Standard Deviation": 0.0135715643,
1943
- "Rank": 44
1944
- },
1945
- "Probability": {
1946
- "Average Score": 0.141214079,
1947
- "Standard Deviation": 0.02721328211,
1948
- "Rank": 46
1949
- },
1950
- "Logical": {
1951
- "Average Score": 0.148598631,
1952
- "Standard Deviation": 0.02241523892,
1953
- "Rank": 44
1954
- },
1955
- "Social": {
1956
- "Average Score": 0.124655135,
1957
- "Standard Deviation": 0.01122382671,
1958
- "Rank": 48
1959
- },
1960
- "Chemistry": {
1961
- "Average Score": 21.840013221590294,
1962
- "Standard Deviation": null,
1963
- "Rank": 41
1964
- }
1965
- }
1966
- },
1967
- {
1968
- "config": {
1969
- "model_name": "zephyr-7b-beta",
1970
- "organization": "HuggingFace",
1971
- "license": "MIT",
1972
- "knowledge_cutoff": "2023/10"
1973
- },
1974
- "results": {
1975
- "OVERALL": {
1976
- "Average Score": 0.102705119,
1977
- "Standard Deviation": 0.03683757312,
1978
- "Rank": 50
1979
- },
1980
- "Geometry": {
1981
- "Average Score": 0.114005544,
1982
- "Standard Deviation": 0.03144354365,
1983
- "Rank": 47
1984
- },
1985
- "Algebra": {
1986
- "Average Score": 0.141766633,
1987
- "Standard Deviation": 0.03179520129,
1988
- "Rank": 45
1989
- },
1990
- "Probability": {
1991
- "Average Score": 0.089050714,
1992
- "Standard Deviation": 0.002136754266,
1993
- "Rank": 49
1994
- },
1995
- "Logical": {
1996
- "Average Score": 0.069520789,
1997
- "Standard Deviation": 0.004477840857,
1998
- "Rank": 51
1999
- },
2000
- "Social": {
2001
- "Average Score": 0.0,
2002
- "Standard Deviation": 0.0,
2003
- "Rank": 54
2004
- },
2005
- "Chemistry": {
2006
- "Average Score": 18.92902220864132,
2007
- "Standard Deviation": null,
2008
- "Rank": 44
2009
- }
2010
- }
2011
- },
2012
- {
2013
- "config": {
2014
- "model_name": "gemma-1.1-2b-it",
2015
- "organization": "Google",
2016
- "license": "Gemma License",
2017
- "knowledge_cutoff": "2024/02"
2018
- },
2019
- "results": {
2020
- "OVERALL": {
2021
- "Average Score": 0.257700845,
2022
- "Standard Deviation": 0.07369021445,
2023
- "Rank": 44
2024
- },
2025
- "Geometry": {
2026
- "Average Score": 0.183974034,
2027
- "Standard Deviation": 0.0215548886,
2028
- "Rank": 45
2029
- },
2030
- "Algebra": {
2031
- "Average Score": 0.13422252,
2032
- "Standard Deviation": 0.01922819511,
2033
- "Rank": 46
2034
- },
2035
- "Probability": {
2036
- "Average Score": 0.095628657,
2037
- "Standard Deviation": 0.007536076456,
2038
- "Rank": 48
2039
- },
2040
- "Logical": {
2041
- "Average Score": 0.094965074,
2042
- "Standard Deviation": 0.005019175487,
2043
- "Rank": 49
2044
- },
2045
- "Social": {
2046
- "Average Score": 0.167796727,
2047
- "Standard Deviation": 0.01666541942,
2048
- "Rank": 44
2049
- },
2050
- "Chemistry": {
2051
- "Average Score": 20.724691953843916,
2052
- "Standard Deviation": null,
2053
- "Rank": 42
2054
- }
2055
- }
2056
- },
2057
- {
2058
- "config": {
2059
- "model_name": "llama2-7b-chat",
2060
- "organization": "Meta",
2061
- "license": "Llama 2 Community",
2062
- "knowledge_cutoff": "2023/07"
2063
- },
2064
- "results": {
2065
- "OVERALL": {
2066
- "Average Score": 0.260189428,
2067
- "Standard Deviation": 0.08019299364,
2068
- "Rank": 43
2069
- },
2070
- "Geometry": {
2071
- "Average Score": 0.087067276,
2072
- "Standard Deviation": 0.04274343402,
2073
- "Rank": 48
2074
- },
2075
- "Algebra": {
2076
- "Average Score": 0.12308805,
2077
- "Standard Deviation": 0.01856053622,
2078
- "Rank": 47
2079
- },
2080
- "Probability": {
2081
- "Average Score": 0.087515438,
2082
- "Standard Deviation": 0.006315053573,
2083
- "Rank": 50
2084
- },
2085
- "Logical": {
2086
- "Average Score": 0.17312827,
2087
- "Standard Deviation": 0.01867044092,
2088
- "Rank": 43
2089
- },
2090
- "Social": {
2091
- "Average Score": 0.152905272,
2092
- "Standard Deviation": 0.007166957097,
2093
- "Rank": 45
2094
- },
2095
- "Chemistry": {
2096
- "Average Score": 15.730513733660898,
2097
- "Standard Deviation": null,
2098
- "Rank": 46
2099
- }
2100
- }
2101
- },
2102
- {
2103
- "config": {
2104
- "model_name": "gemma-2b-it",
2105
- "organization": "Google",
2106
- "license": "Gemma License",
2107
- "knowledge_cutoff": "2024/02"
2108
- },
2109
- "results": {
2110
- "OVERALL": {
2111
- "Average Score": 0.234172069,
2112
- "Standard Deviation": 0.06522685718,
2113
- "Rank": 45
2114
- },
2115
- "Geometry": {
2116
- "Average Score": 0.198571153,
2117
- "Standard Deviation": 0.01699161031,
2118
- "Rank": 44
2119
- },
2120
- "Algebra": {
2121
- "Average Score": 0.109883009,
2122
- "Standard Deviation": 0.01520005833,
2123
- "Rank": 48
2124
- },
2125
- "Probability": {
2126
- "Average Score": 0.06467432,
2127
- "Standard Deviation": 0.002117497231,
2128
- "Rank": 52
2129
- },
2130
- "Logical": {
2131
- "Average Score": 0.039624492,
2132
- "Standard Deviation": 0.007606972686,
2133
- "Rank": 52
2134
- },
2135
- "Social": {
2136
- "Average Score": 0.087452913,
2137
- "Standard Deviation": 0.008170146562,
2138
- "Rank": 52
2139
- },
2140
- "Chemistry": {
2141
- "Average Score": 17.2715657115764,
2142
- "Standard Deviation": null,
2143
- "Rank": 45
2144
- }
2145
- }
2146
- },
2147
- {
2148
- "config": {
2149
- "model_name": "llama2-13b-chat",
2150
- "organization": "Meta",
2151
- "license": "Llama 2 Community",
2152
- "knowledge_cutoff": "2023/07"
2153
- },
2154
- "results": {
2155
- "OVERALL": {
2156
- "Average Score": 0.263305684,
2157
- "Standard Deviation": 0.07283640689,
2158
- "Rank": 42
2159
- },
2160
- "Geometry": {
2161
- "Average Score": 0.072729954,
2162
- "Standard Deviation": 0.02315988261,
2163
- "Rank": 50
2164
- },
2165
- "Algebra": {
2166
- "Average Score": 0.080371692,
2167
- "Standard Deviation": 0.01277569453,
2168
- "Rank": 49
2169
- },
2170
- "Probability": {
2171
- "Average Score": 0.117757344,
2172
- "Standard Deviation": 0.02418619619,
2173
- "Rank": 47
2174
- },
2175
- "Logical": {
2176
- "Average Score": 0.193149889,
2177
- "Standard Deviation": 0.01776690764,
2178
- "Rank": 40
2179
- },
2180
- "Social": {
2181
- "Average Score": 0.149125922,
2182
- "Standard Deviation": 0.01157416827,
2183
- "Rank": 46
2184
- },
2185
- "Chemistry": {
2186
- "Average Score": 13.17258252933903,
2187
- "Standard Deviation": null,
2188
- "Rank": 49
2189
- }
2190
- }
2191
- },
2192
- {
2193
- "config": {
2194
- "model_name": "vicuna-7b",
2195
- "organization": "LMSYS",
2196
- "license": "Non-commercial",
2197
- "knowledge_cutoff": "2023/07"
2198
- },
2199
- "results": {
2200
- "OVERALL": {
2201
- "Average Score": 0.198839786,
2202
- "Standard Deviation": 0.05725381576,
2203
- "Rank": 48
2204
- },
2205
- "Geometry": {
2206
- "Average Score": 0.083457058,
2207
- "Standard Deviation": 0.02520989111,
2208
- "Rank": 49
2209
- },
2210
- "Algebra": {
2211
- "Average Score": 0.070883882,
2212
- "Standard Deviation": 0.007315853253,
2213
- "Rank": 50
2214
- },
2215
- "Probability": {
2216
- "Average Score": 0.080987673,
2217
- "Standard Deviation": 0.005474288861,
2218
- "Rank": 51
2219
- },
2220
- "Logical": {
2221
- "Average Score": 0.100065588,
2222
- "Standard Deviation": 0.003561886452,
2223
- "Rank": 48
2224
- },
2225
- "Social": {
2226
- "Average Score": 0.111076414,
2227
- "Standard Deviation": 0.004805626512,
2228
- "Rank": 50
2229
- },
2230
- "Chemistry": {
2231
- "Average Score": 14.255194156624162,
2232
- "Standard Deviation": null,
2233
- "Rank": 47
2234
- }
2235
- }
2236
- },
2237
- {
2238
- "config": {
2239
- "model_name": "koala-13b",
2240
- "organization": "UC Berkeley",
2241
- "license": "Non-commercial",
2242
- "knowledge_cutoff": "2023/04"
2243
- },
2244
- "results": {
2245
- "OVERALL": {
2246
- "Average Score": 0.09387188,
2247
- "Standard Deviation": 0.02642167489,
2248
- "Rank": 51
2249
- },
2250
- "Geometry": {
2251
- "Average Score": 0.017374001,
2252
- "Standard Deviation": 0.01747053557,
2253
- "Rank": 51
2254
- },
2255
- "Algebra": {
2256
- "Average Score": 0.018129197,
2257
- "Standard Deviation": 0.01054371383,
2258
- "Rank": 51
2259
- },
2260
- "Probability": {
2261
- "Average Score": 0.043654362,
2262
- "Standard Deviation": 0.004288231886,
2263
- "Rank": 53
2264
- },
2265
- "Logical": {
2266
- "Average Score": 0.074694053,
2267
- "Standard Deviation": 0.002674646998,
2268
- "Rank": 50
2269
- },
2270
- "Social": {
2271
- "Average Score": 0.096983835,
2272
- "Standard Deviation": 0.007847059783,
2273
- "Rank": 51
2274
- },
2275
- "Chemistry": {
2276
- "Average Score": 6.36433272373514,
2277
- "Standard Deviation": null,
2278
- "Rank": 50
2279
- }
2280
- }
2281
- },
2282
- {
2283
- "config": {
2284
- "model_name": "openassistant-pythia-12b",
2285
- "organization": "OpenAssistant",
2286
- "license": "Non-commercial",
2287
- "knowledge_cutoff": "2023/04"
2288
- },
2289
- "results": {
2290
- "OVERALL": {
2291
- "Average Score": 0.0,
2292
- "Standard Deviation": 0.0,
2293
- "Rank": 52
2294
- },
2295
- "Geometry": {
2296
- "Average Score": 0.0,
2297
- "Standard Deviation": 0.0,
2298
- "Rank": 52
2299
- },
2300
- "Algebra": {
2301
- "Average Score": 0.0,
2302
- "Standard Deviation": 0.0,
2303
- "Rank": 52
2304
- },
2305
- "Probability": {
2306
- "Average Score": 0.0,
2307
- "Standard Deviation": 0.0,
2308
- "Rank": 54
2309
- },
2310
- "Logical": {
2311
- "Average Score": 0.0,
2312
- "Standard Deviation": 0.0,
2313
- "Rank": 53
2314
- },
2315
- "Social": {
2316
- "Average Score": 0.030792528,
2317
- "Standard Deviation": 0.007518796391,
2318
- "Rank": 53
2319
- },
2320
- "Chemistry": {
2321
- "Average Score": 0.0,
2322
- "Standard Deviation": null,
2323
- "Rank": 51
2324
- }
2325
- }
2326
- }
2327
- ]
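
For reference, the deleted results JSON files in this commit share one schema: a top-level array of entries, each holding a "config" block (model_name, organization, license, knowledge_cutoff) and a "results" block keyed by domain, where each domain carries an "Average Score", "Standard Deviation", and "Rank". Below is a minimal, hypothetical sketch of flattening such a file into a table; it assumes pandas is available, and the function and column names are illustrative only, not part of this repository's code.

    import json
    import pandas as pd  # assumption: pandas is available in this environment

    def flatten_results(path: str) -> pd.DataFrame:
        """Flatten a results JSON (list of {config, results}) into one row per model."""
        with open(path) as f:
            entries = json.load(f)
        rows = []
        for entry in entries:
            cfg = entry["config"]
            row = {
                "Model": cfg["model_name"],
                "Organization": cfg["organization"],
                "License": cfg["license"],
                "Knowledge Cutoff": cfg["knowledge_cutoff"],
            }
            for domain, stats in entry["results"].items():
                # Scores may be floats, the string "N/A", or (for Chemistry) on a 0-100 scale.
                row[domain] = stats.get("Average Score")
            rows.append(row)
        return pd.DataFrame(rows)
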
 
src/results/models_2024-10-09-06:22:21.122422.json DELETED
@@ -1,2372 +0,0 @@
1
- [
2
- {
3
- "config": {
4
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
- "organization": "OpenAI",
6
- "license": "Proprietary",
7
- "knowledge_cutoff": "2023/10"
8
- },
9
- "results": {
10
- "OVERALL": {
11
- "Average Score": 0.974329609,
12
- "Standard Deviation": 0.005024959031,
13
- "Rank": 2
14
- },
15
- "Geometry": {
16
- "Average Score": 0.976028578,
17
- "Standard Deviation": 0.01507912373,
18
- "Rank": 3
19
- },
20
- "Algebra": {
21
- "Average Score": 0.951199453,
22
- "Standard Deviation": 0.08452452108,
23
- "Rank": 3
24
- },
25
- "Probability": {
26
- "Average Score": 0.842116641,
27
- "Standard Deviation": 0.006267759054,
28
- "Rank": 3
29
- },
30
- "Logical": {
31
- "Average Score": 0.828490728,
32
- "Standard Deviation": 0.009134213144,
33
- "Rank": 3
34
- },
35
- "Social": {
36
- "Average Score": 0.815902987,
37
- "Standard Deviation": 0.0196254222,
38
- "Rank": 3
39
- },
40
- "Chemistry": {
41
- "Average Score": 100.0,
42
- "Standard Deviation": null,
43
- "Rank": 1
44
- }
45
- }
46
- },
47
- {
48
- "config": {
49
- "model_name": "meta-llama-3.1-70b-instruct",
50
- "organization": "Meta",
51
- "license": "Llama 3.1 Community",
52
- "knowledge_cutoff": "2023/12"
53
- },
54
- "results": {
55
- "OVERALL": {
56
- "Average Score": 0.708874896,
57
- "Standard Deviation": 0.1315111956,
58
- "Rank": 13
59
- },
60
- "Geometry": {
61
- "Average Score": 0.76184398,
62
- "Standard Deviation": 0.01790377984,
63
- "Rank": 10
64
- },
65
- "Algebra": {
66
- "Average Score": 0.732041699,
67
- "Standard Deviation": 0.02621439062,
68
- "Rank": 9
69
- },
70
- "Probability": {
71
- "Average Score": 0.676208383,
72
- "Standard Deviation": 0.05131201636,
73
- "Rank": 10
74
- },
75
- "Logical": {
76
- "Average Score": 0.620018631,
77
- "Standard Deviation": 0.02518873821,
78
- "Rank": 14
79
- },
80
- "Social": {
81
- "Average Score": 0.45872939,
82
- "Standard Deviation": 0.05347039576,
83
- "Rank": 20
84
- },
85
- "Chemistry": {
86
- "Average Score": 84.36815192532764,
87
- "Standard Deviation": null,
88
- "Rank": 4
89
- }
90
- }
91
- },
92
- {
93
- "config": {
94
- "model_name": "gpt-4o-2024-08-06",
95
- "organization": "OpenAI",
96
- "license": "Proprietary",
97
- "knowledge_cutoff": "2023/10"
98
- },
99
- "results": {
100
- "OVERALL": {
101
- "Average Score": 0.846571548,
102
- "Standard Deviation": 0.03394056554,
103
- "Rank": 6
104
- },
105
- "Geometry": {
106
- "Average Score": 0.99773096,
107
- "Standard Deviation": 0.002835555172,
108
- "Rank": 1
109
- },
110
- "Algebra": {
111
- "Average Score": 1.0,
112
- "Standard Deviation": 0.0,
113
- "Rank": 1
114
- },
115
- "Probability": {
116
- "Average Score": 0.78855795,
117
- "Standard Deviation": 0.008188675452,
118
- "Rank": 6
119
- },
120
- "Logical": {
121
- "Average Score": 0.668635768,
122
- "Standard Deviation": 0.03466314094,
123
- "Rank": 11
124
- },
125
- "Social": {
126
- "Average Score": 0.680417314,
127
- "Standard Deviation": 0.00656867063,
128
- "Rank": 8
129
- },
130
- "Chemistry": {
131
- "Average Score": 92.43090226400756,
132
- "Standard Deviation": null,
133
- "Rank": 2
134
- }
135
- }
136
- },
137
- {
138
- "config": {
139
- "model_name": "gpt-4o-2024-05-13",
140
- "organization": "OpenAI",
141
- "license": "Proprietary",
142
- "knowledge_cutoff": "2023/10"
143
- },
144
- "results": {
145
- "OVERALL": {
146
- "Average Score": 0.846334477,
147
- "Standard Deviation": 0.09377911572,
148
- "Rank": 7
149
- },
150
- "Geometry": {
151
- "Average Score": 0.972472377,
152
- "Standard Deviation": 0.01648274205,
153
- "Rank": 4
154
- },
155
- "Algebra": {
156
- "Average Score": 0.995511298,
157
- "Standard Deviation": 0.004097802515,
158
- "Rank": 2
159
- },
160
- "Probability": {
161
- "Average Score": 0.812149974,
162
- "Standard Deviation": 0.007669585485,
163
- "Rank": 4
164
- },
165
- "Logical": {
166
- "Average Score": 0.755019692,
167
- "Standard Deviation": 0.008149588572,
168
- "Rank": 6
169
- },
170
- "Social": {
171
- "Average Score": 0.609875087,
172
- "Standard Deviation": 0.038729239,
173
- "Rank": 13
174
- },
175
- "Chemistry": {
176
- "Average Score": 79.1592634699295,
177
- "Standard Deviation": null,
178
- "Rank": 6
179
- }
180
- }
181
- },
182
- {
183
- "config": {
184
- "model_name": "gpt-4-turbo-2024-04-09",
185
- "organization": "OpenAI",
186
- "license": "Proprietary",
187
- "knowledge_cutoff": "2023/12"
188
- },
189
- "results": {
190
- "OVERALL": {
191
- "Average Score": 0.855357972,
192
- "Standard Deviation": 0.1016986368,
193
- "Rank": 4
194
- },
195
- "Geometry": {
196
- "Average Score": 0.95374588,
197
- "Standard Deviation": 0.03109307166,
198
- "Rank": 5
199
- },
200
- "Algebra": {
201
- "Average Score": 0.930945223,
202
- "Standard Deviation": 0.06705136813,
203
- "Rank": 4
204
- },
205
- "Probability": {
206
- "Average Score": 0.750705448,
207
- "Standard Deviation": 0.05944483103,
208
- "Rank": 8
209
- },
210
- "Logical": {
211
- "Average Score": 0.77906699,
212
- "Standard Deviation": 0.007406734161,
213
- "Rank": 4
214
- },
215
- "Social": {
216
- "Average Score": 0.715935163,
217
- "Standard Deviation": 0.1209141409,
218
- "Rank": 6
219
- },
220
- "Chemistry": {
221
- "Average Score": 70.73143363230263,
222
- "Standard Deviation": null,
223
- "Rank": 11
224
- }
225
- }
226
- },
227
- {
228
- "config": {
229
- "model_name": "gemini-1.5-pro-001",
230
- "organization": "Google",
231
- "license": "Proprietary",
232
- "knowledge_cutoff": "2023/11"
233
- },
234
- "results": {
235
- "OVERALL": {
236
- "Average Score": 0.797187842,
237
- "Standard Deviation": 0.0272375249,
238
- "Rank": 10
239
- },
240
- "Geometry": {
241
- "Average Score": 0.9947169,
242
- "Standard Deviation": 0.009150597621,
243
- "Rank": 2
244
- },
245
- "Algebra": {
246
- "Average Score": 0.857464301,
247
- "Standard Deviation": 0.05014285338,
248
- "Rank": 5
249
- },
250
- "Probability": {
251
- "Average Score": 0.651781767,
252
- "Standard Deviation": 0.04156998547,
253
- "Rank": 11
254
- },
255
- "Logical": {
256
- "Average Score": 0.739745471,
257
- "Standard Deviation": 0.01631532019,
258
- "Rank": 7
259
- },
260
- "Social": {
261
- "Average Score": 0.649601885,
262
- "Standard Deviation": 0.104854889,
263
- "Rank": 11
264
- }
265
- }
266
- },
267
- {
268
- "config": {
269
- "model_name": "qwen2-72b-instruct",
270
- "organization": "Alibaba",
271
- "license": "Qianwen LICENSE",
272
- "knowledge_cutoff": "2024/09"
273
- },
274
- "results": {
275
- "OVERALL": {
276
- "Average Score": 0.737918558,
277
- "Standard Deviation": 0.09069077339,
278
- "Rank": 11
279
- },
280
- "Geometry": {
281
- "Average Score": 0.796870305,
282
- "Standard Deviation": 0.0509025346,
283
- "Rank": 9
284
- },
285
- "Algebra": {
286
- "Average Score": 0.836194231,
287
- "Standard Deviation": 0.04517093028,
288
- "Rank": 6
289
- },
290
- "Probability": {
291
- "Average Score": 0.788068004,
292
- "Standard Deviation": 0.007288989044,
293
- "Rank": 7
294
- },
295
- "Logical": {
296
- "Average Score": 0.619300904,
297
- "Standard Deviation": 0.06377931612,
298
- "Rank": 15
299
- },
300
- "Social": {
301
- "Average Score": 0.652578786,
302
- "Standard Deviation": 0.04259293171,
303
- "Rank": 10
304
- },
305
- "Chemistry": {
306
- "Average Score": 73.54037778797029,
307
- "Standard Deviation": null,
308
- "Rank": 8
309
- }
310
- }
311
- },
312
- {
313
- "config": {
314
- "model_name": "gpt-4o-mini-2024-07-18",
315
- "organization": "OpenAI",
316
- "license": "Proprietary",
317
- "knowledge_cutoff": "2023/10"
318
- },
319
- "results": {
320
- "OVERALL": {
321
- "Average Score": 0.847694133,
322
- "Standard Deviation": 0.02164304402,
323
- "Rank": 5
324
- },
325
- "Geometry": {
326
- "Average Score": 0.946650435,
327
- "Standard Deviation": 0.01831236482,
328
- "Rank": 7
329
- },
330
- "Algebra": {
331
- "Average Score": 0.796243022,
332
- "Standard Deviation": 0.05537539202,
333
- "Rank": 7
334
- },
335
- "Probability": {
336
- "Average Score": 0.798402685,
337
- "Standard Deviation": 0.009404491967,
338
- "Rank": 5
339
- },
340
- "Logical": {
341
- "Average Score": 0.727009735,
342
- "Standard Deviation": 0.02628110141,
343
- "Rank": 8
344
- },
345
- "Social": {
346
- "Average Score": 0.691949855,
347
- "Standard Deviation": 0.02072934333,
348
- "Rank": 7
349
- },
350
- "Chemistry": {
351
- "Average Score": 88.3877070580296,
352
- "Standard Deviation": null,
353
- "Rank": 3
354
- }
355
- }
356
- },
357
- {
358
- "config": {
359
- "model_name": "claude-3.5-sonnet",
360
- "organization": "Anthropic",
361
- "license": "Proprietary",
362
- "knowledge_cutoff": "2024/04"
363
- },
364
- "results": {
365
- "OVERALL": {
366
- "Average Score": 0.839004422,
367
- "Standard Deviation": 0.1461079564,
368
- "Rank": 8
369
- },
370
- "Geometry": {
371
- "Average Score": 0.95316419,
372
- "Standard Deviation": 0.02081192856,
373
- "Rank": 6
374
- },
375
- "Algebra": {
376
- "Average Score": 0.759789952,
377
- "Standard Deviation": 0.02611765096,
378
- "Rank": 8
379
- },
380
- "Probability": {
381
- "Average Score": 0.707730127,
382
- "Standard Deviation": 0.0394436664,
383
- "Rank": 9
384
- },
385
- "Logical": {
386
- "Average Score": 0.77342666,
387
- "Standard Deviation": 0.002892426458,
388
- "Rank": 5
389
- },
390
- "Social": {
391
- "Average Score": 0.790002247,
392
- "Standard Deviation": 0.1007410022,
393
- "Rank": 4
394
- },
395
- "Chemistry": {
396
- "Average Score": 82.37734076815008,
397
- "Standard Deviation": null,
398
- "Rank": 5
399
- }
400
- }
401
- },
402
- {
403
- "config": {
404
- "model_name": "o1-mini",
405
- "organization": "OpenAI",
406
- "license": "Proprietary",
407
- "knowledge_cutoff": "2023/10"
408
- },
409
- "results": {
410
- "OVERALL": {
411
- "Average Score": 1.0,
412
- "Standard Deviation": 0.0,
413
- "Rank": 1
414
- },
415
- "Geometry": {
416
- "Average Score": "N/A",
417
- "Standard Deviation": "N/A",
418
- "Rank": "N/A"
419
- },
420
- "Algebra": {
421
- "Average Score": "N/A",
422
- "Standard Deviation": "N/A",
423
- "Rank": "N/A"
424
- },
425
- "Probability": {
426
- "Average Score": 1.0,
427
- "Standard Deviation": 0.0,
428
- "Rank": 1
429
- },
430
- "Logical": {
431
- "Average Score": 1.0,
432
- "Standard Deviation": 0.0,
433
- "Rank": 1
434
- },
435
- "Social": {
436
- "Average Score": 0.993974241,
437
- "Standard Deviation": 0.001996882328,
438
- "Rank": 2
439
- }
440
- }
441
- },
442
- {
443
- "config": {
444
- "model_name": "o1-preview",
445
- "organization": "OpenAI",
446
- "license": "Proprietary",
447
- "knowledge_cutoff": "2023/10"
448
- },
449
- "results": {
450
- "OVERALL": {
451
- "Average Score": 0.945884589,
452
- "Standard Deviation": 0.01059250762,
453
- "Rank": 3
454
- },
455
- "Geometry": {
456
- "Average Score": "N/A",
457
- "Standard Deviation": "N/A",
458
- "Rank": "N/A"
459
- },
460
- "Algebra": {
461
- "Average Score": "N/A",
462
- "Standard Deviation": "N/A",
463
- "Rank": "N/A"
464
- },
465
- "Probability": {
466
- "Average Score": 0.964666392,
467
- "Standard Deviation": 0.003139983398,
468
- "Rank": 2
469
- },
470
- "Logical": {
471
- "Average Score": 0.987950057,
472
- "Standard Deviation": 0.004881220327,
473
- "Rank": 2
474
- },
475
- "Social": {
476
- "Average Score": 1.0,
477
- "Standard Deviation": 0.0,
478
- "Rank": 1
479
- }
480
- }
481
- },
482
- {
483
- "config": {
484
- "model_name": "gemini-1.5-flash-001",
485
- "organization": "Google",
486
- "license": "Proprietary",
487
- "knowledge_cutoff": "2023/11"
488
- },
489
- "results": {
490
- "OVERALL": {
491
- "Average Score": 0.726493401,
492
- "Standard Deviation": 0.01113913725,
493
- "Rank": 12
494
- },
495
- "Geometry": {
496
- "Average Score": 0.804144103,
497
- "Standard Deviation": 0.1327142178,
498
- "Rank": 8
499
- },
500
- "Algebra": {
501
- "Average Score": 0.731776765,
502
- "Standard Deviation": 0.02594657111,
503
- "Rank": 10
504
- },
505
- "Probability": {
506
- "Average Score": 0.614461891,
507
- "Standard Deviation": 0.04690131826,
508
- "Rank": 14
509
- },
510
- "Logical": {
511
- "Average Score": 0.630805991,
512
- "Standard Deviation": 0.04871350612,
513
- "Rank": 13
514
- },
515
- "Social": {
516
- "Average Score": 0.555933822,
517
- "Standard Deviation": 0.1029934524,
518
- "Rank": 15
519
- },
520
- "Chemistry": {
521
- "Average Score": 72.1127762005651,
522
- "Standard Deviation": null,
523
- "Rank": 10
524
- }
525
- }
526
- },
527
- {
528
- "config": {
529
- "model_name": "gpt4-1106",
530
- "organization": "OpenAI",
531
- "license": "Proprietary",
532
- "knowledge_cutoff": "2024/04"
533
- },
534
- "results": {
535
- "OVERALL": {
536
- "Average Score": 0.816347784,
537
- "Standard Deviation": 0.1566815755,
538
- "Rank": 9
539
- },
540
- "Geometry": {
541
- "Average Score": 0.71843088,
542
- "Standard Deviation": 0.04778038294,
543
- "Rank": 12
544
- },
545
- "Algebra": {
546
- "Average Score": 0.712910417,
547
- "Standard Deviation": 0.02581828898,
548
- "Rank": 11
549
- },
550
- "Probability": {
551
- "Average Score": 0.623947619,
552
- "Standard Deviation": 0.03502982933,
553
- "Rank": 13
554
- },
555
- "Logical": {
556
- "Average Score": 0.637482274,
557
- "Standard Deviation": 0.04158809888,
558
- "Rank": 12
559
- },
560
- "Social": {
561
- "Average Score": 0.450609816,
562
- "Standard Deviation": 0.05208655446,
563
- "Rank": 22
564
- },
565
- "Chemistry": {
566
- "Average Score": 69.11824072252848,
567
- "Standard Deviation": null,
568
- "Rank": 12
569
- }
570
- }
571
- },
572
- {
573
- "config": {
574
- "model_name": "gemma-2-27b-it",
575
- "organization": "Google",
576
- "license": "Gemma License",
577
- "knowledge_cutoff": "2024/06"
578
- },
579
- "results": {
580
- "OVERALL": {
581
- "Average Score": 0.624169623,
582
- "Standard Deviation": 0.1048365121,
583
- "Rank": 15
584
- },
585
- "Geometry": {
586
- "Average Score": 0.60112744,
587
- "Standard Deviation": 0.0469109952,
588
- "Rank": 18
589
- },
590
- "Algebra": {
591
- "Average Score": 0.687955914,
592
- "Standard Deviation": 0.01959958192,
593
- "Rank": 12
594
- },
595
- "Probability": {
596
- "Average Score": 0.589524771,
597
- "Standard Deviation": 0.03112689325,
598
- "Rank": 15
599
- },
600
- "Logical": {
601
- "Average Score": 0.614978944,
602
- "Standard Deviation": 0.05710657859,
603
- "Rank": 16
604
- },
605
- "Social": {
606
- "Average Score": 0.487844257,
607
- "Standard Deviation": 0.05857760809,
608
- "Rank": 18
609
- },
610
- "Chemistry": {
611
- "Average Score": 63.28920072143611,
612
- "Standard Deviation": null,
613
- "Rank": 14
614
- }
615
- }
616
- },
617
- {
618
- "config": {
619
- "model_name": "claude-3-opus",
620
- "organization": "Anthropic",
621
- "license": "Proprietary",
622
- "knowledge_cutoff": "2023/08"
623
- },
624
- "results": {
625
- "OVERALL": {
626
- "Average Score": 0.650636271,
627
- "Standard Deviation": 0.1197773541,
628
- "Rank": 14
629
- },
630
- "Geometry": {
631
- "Average Score": 0.7215743,
632
- "Standard Deviation": 0.04712598358,
633
- "Rank": 11
634
- },
635
- "Algebra": {
636
- "Average Score": 0.68777327,
637
- "Standard Deviation": 0.02382683713,
638
- "Rank": 13
639
- },
640
- "Probability": {
641
- "Average Score": 0.626471421,
642
- "Standard Deviation": 0.02911817976,
643
- "Rank": 12
644
- },
645
- "Logical": {
646
- "Average Score": 0.692346381,
647
- "Standard Deviation": 0.03617185198,
648
- "Rank": 10
649
- },
650
- "Social": {
651
- "Average Score": 0.663410854,
652
- "Standard Deviation": 0.09540220876,
653
- "Rank": 9
654
- },
655
- "Chemistry": {
656
- "Average Score": 73.5404403567132,
657
- "Standard Deviation": null,
658
- "Rank": 7
659
- }
660
- }
661
- },
662
- {
663
- "config": {
664
- "model_name": "gemma-2-9b-it-simpo",
665
- "organization": "Google",
666
- "license": "Gemma License",
667
- "knowledge_cutoff": "2024/07"
668
- },
669
- "results": {
670
- "OVERALL": {
671
- "Average Score": "N/A",
672
- "Standard Deviation": "N/A",
673
- "Rank": "N/A"
674
- },
675
- "Geometry": {
676
- "Average Score": 0.582787508,
677
- "Standard Deviation": 0.03965204074,
678
- "Rank": 19
679
- },
680
- "Algebra": {
681
- "Average Score": 0.658648133,
682
- "Standard Deviation": 0.02565919856,
683
- "Rank": 14
684
- },
685
- "Probability": {
686
- "Average Score": 0.547861265,
687
- "Standard Deviation": 0.02885209131,
688
- "Rank": 18
689
- },
690
- "Logical": {
691
- "Average Score": 0.540720893,
692
- "Standard Deviation": 0.01970134508,
693
- "Rank": 20
694
- },
695
- "Social": {
696
- "Average Score": 0.635266187,
697
- "Standard Deviation": 0.03620021751,
698
- "Rank": 12
699
- },
700
- "Chemistry": {
701
- "Average Score": 73.43757596214863,
702
- "Standard Deviation": null,
703
- "Rank": 9
704
- }
705
- }
706
- },
707
- {
708
- "config": {
709
- "model_name": "qwen1.5-72b-chat",
710
- "organization": "Alibaba",
711
- "license": "Qianwen LICENSE",
712
- "knowledge_cutoff": "2024/03"
713
- },
714
- "results": {
715
- "OVERALL": {
716
- "Average Score": 0.519549796,
717
- "Standard Deviation": 0.00903634343,
718
- "Rank": 18
719
- },
720
- "Geometry": {
721
- "Average Score": 0.543139301,
722
- "Standard Deviation": 0.03425202326,
723
- "Rank": 23
724
- },
725
- "Algebra": {
726
- "Average Score": 0.635228729,
727
- "Standard Deviation": 0.01944043425,
728
- "Rank": 15
729
- },
730
- "Probability": {
731
- "Average Score": 0.486948658,
732
- "Standard Deviation": 0.06064655315,
733
- "Rank": 22
734
- },
735
- "Logical": {
736
- "Average Score": 0.284069394,
737
- "Standard Deviation": 0.02686608506,
738
- "Rank": 33
739
- },
740
- "Social": {
741
- "Average Score": 0.415007627,
742
- "Standard Deviation": 0.03920053159,
743
- "Rank": 23
744
- },
745
- "Chemistry": {
746
- "Average Score": 48.69302376665551,
747
- "Standard Deviation": null,
748
- "Rank": 20
749
- }
750
- }
751
- },
752
- {
753
- "config": {
754
- "model_name": "qwen1.5-32b-chat",
755
- "organization": "Alibaba",
756
- "license": "Qianwen LICENSE",
757
- "knowledge_cutoff": "2024/03"
758
- },
759
- "results": {
760
- "OVERALL": {
761
- "Average Score": 0.393789407,
762
- "Standard Deviation": 0.05413770095,
763
- "Rank": 29
764
- },
765
- "Geometry": {
766
- "Average Score": 0.51086835,
767
- "Standard Deviation": 0.04052471998,
768
- "Rank": 26
769
- },
770
- "Algebra": {
771
- "Average Score": 0.609003168,
772
- "Standard Deviation": 0.04874143541,
773
- "Rank": 16
774
- },
775
- "Probability": {
776
- "Average Score": 0.476300002,
777
- "Standard Deviation": 0.05322403912,
778
- "Rank": 23
779
- },
780
- "Logical": {
781
- "Average Score": 0.331781014,
782
- "Standard Deviation": 0.004938997686,
783
- "Rank": 30
784
- },
785
- "Social": {
786
- "Average Score": 0.380987334,
787
- "Standard Deviation": 0.03762251776,
788
- "Rank": 25
789
- },
790
- "Chemistry": {
791
- "Average Score": 45.14284028264288,
792
- "Standard Deviation": null,
793
- "Rank": 24
794
- }
795
- }
796
- },
797
- {
798
- "config": {
799
- "model_name": "google-gemma-2-9b-it",
800
- "organization": "Google",
801
- "license": "Proprietary",
802
- "knowledge_cutoff": "2024/06"
803
- },
804
- "results": {
805
- "OVERALL": {
806
- "Average Score": 0.489663449,
807
- "Standard Deviation": 0.002595702019,
808
- "Rank": 21
809
- },
810
- "Geometry": {
811
- "Average Score": 0.575371308,
812
- "Standard Deviation": 0.03556220251,
813
- "Rank": 21
814
- },
815
- "Algebra": {
816
- "Average Score": 0.597045661,
817
- "Standard Deviation": 0.0313828123,
818
- "Rank": 17
819
- },
820
- "Probability": {
821
- "Average Score": 0.589221807,
822
- "Standard Deviation": 0.03110811656,
823
- "Rank": 17
824
- },
825
- "Logical": {
826
- "Average Score": 0.587579897,
827
- "Standard Deviation": 0.05512716783,
828
- "Rank": 18
829
- },
830
- "Social": {
831
- "Average Score": 0.768337958,
832
- "Standard Deviation": 0.04078610476,
833
- "Rank": 5
834
- },
835
- "Chemistry": {
836
- "Average Score": 54.03167523687635,
837
- "Standard Deviation": null,
838
- "Rank": 17
839
- }
840
- }
841
- },
842
- {
843
- "config": {
844
- "model_name": "yi-1.5-34b-chat",
845
- "organization": "01 AI",
846
- "license": "Proprietary",
847
- "knowledge_cutoff": "2024/05"
848
- },
849
- "results": {
850
- "OVERALL": {
851
- "Average Score": 0.607812897,
852
- "Standard Deviation": 0.1440881293,
853
- "Rank": 16
854
- },
855
- "Geometry": {
856
- "Average Score": 0.566666724,
857
- "Standard Deviation": 0.04001381658,
858
- "Rank": 22
859
- },
860
- "Algebra": {
861
- "Average Score": 0.590997292,
862
- "Standard Deviation": 0.03594087315,
863
- "Rank": 18
864
- },
865
- "Probability": {
866
- "Average Score": 0.589524589,
867
- "Standard Deviation": 0.03112618772,
868
- "Rank": 16
869
- },
870
- "Logical": {
871
- "Average Score": 0.574105508,
872
- "Standard Deviation": 0.03441737941,
873
- "Rank": 19
874
- },
875
- "Social": {
876
- "Average Score": 0.516980832,
877
- "Standard Deviation": 0.03369347985,
878
- "Rank": 17
879
- },
880
- "Chemistry": {
881
- "Average Score": 52.148798061768964,
882
- "Standard Deviation": null,
883
- "Rank": 18
884
- }
885
- }
886
- },
887
- {
888
- "config": {
889
- "model_name": "meta-llama-3.1-8b-instruct",
890
- "organization": "Meta",
891
- "license": "Llama 3.1 Community",
892
- "knowledge_cutoff": "2023/12"
893
- },
894
- "results": {
895
- "OVERALL": {
896
- "Average Score": 0.505936324,
897
- "Standard Deviation": 0.05286756493,
898
- "Rank": 19
899
- },
900
- "Geometry": {
901
- "Average Score": 0.522442162,
902
- "Standard Deviation": 0.03908236317,
903
- "Rank": 24
904
- },
905
- "Algebra": {
906
- "Average Score": 0.582702645,
907
- "Standard Deviation": 0.05002277711,
908
- "Rank": 19
909
- },
910
- "Probability": {
911
- "Average Score": 0.495001149,
912
- "Standard Deviation": 0.05244587037,
913
- "Rank": 21
914
- },
915
- "Logical": {
916
- "Average Score": 0.443030561,
917
- "Standard Deviation": 0.01343820628,
918
- "Rank": 25
919
- },
920
- "Social": {
921
- "Average Score": 0.329195941,
922
- "Standard Deviation": 0.03925019528,
923
- "Rank": 29
924
- },
925
- "Chemistry": {
926
- "Average Score": 44.41846841004584,
927
- "Standard Deviation": null,
928
- "Rank": 26
929
- }
930
- }
931
- },
932
- {
933
- "config": {
934
- "model_name": "gpt3.5-turbo-0125",
935
- "organization": "OpenAI",
936
- "license": "Proprietary",
937
- "knowledge_cutoff": "2021/09"
938
- },
939
- "results": {
940
- "OVERALL": {
941
- "Average Score": 0.313398088,
942
- "Standard Deviation": 0.09322528606,
943
- "Rank": 40
944
- },
945
- "Geometry": {
946
- "Average Score": 0.678714519,
947
- "Standard Deviation": 0.05926546762,
948
- "Rank": 13
949
- },
950
- "Algebra": {
951
- "Average Score": 0.569296173,
952
- "Standard Deviation": 0.05277281097,
953
- "Rank": 20
954
- },
955
- "Probability": {
956
- "Average Score": 0.448460767,
957
- "Standard Deviation": 0.05768095196,
958
- "Rank": 25
959
- },
960
- "Logical": {
961
- "Average Score": 0.148521348,
962
- "Standard Deviation": 0.04033712907,
963
- "Rank": 45
964
- },
965
- "Social": {
966
- "Average Score": 0.235071541,
967
- "Standard Deviation": 0.02632892457,
968
- "Rank": 38
969
- },
970
- "Chemistry": {
971
- "Average Score": 40.46958736582551,
972
- "Standard Deviation": null,
973
- "Rank": 29
974
- }
975
- }
976
- },
977
- {
978
- "config": {
979
- "model_name": "llama-3-70b-instruct",
980
- "organization": "Meta",
981
- "license": "Llama 3 Community",
982
- "knowledge_cutoff": "2023/12"
983
- },
984
- "results": {
985
- "OVERALL": {
986
- "Average Score": 0.456689885,
987
- "Standard Deviation": 0.01385989995,
988
- "Rank": 23
989
- },
990
- "Geometry": {
991
- "Average Score": 0.516865529,
992
- "Standard Deviation": 0.03858112564,
993
- "Rank": 25
994
- },
995
- "Algebra": {
996
- "Average Score": 0.566756531,
997
- "Standard Deviation": 0.03369826926,
998
- "Rank": 21
999
- },
1000
- "Probability": {
1001
- "Average Score": 0.513857306,
1002
- "Standard Deviation": 0.05453699062,
1003
- "Rank": 20
1004
- },
1005
- "Logical": {
1006
- "Average Score": 0.713796415,
1007
- "Standard Deviation": 0.02031215107,
1008
- "Rank": 9
1009
- },
1010
- "Social": {
1011
- "Average Score": 0.45872939,
1012
- "Standard Deviation": 0.05347039576,
1013
- "Rank": 21
1014
- },
1015
- "Chemistry": {
1016
- "Average Score": 65.32140697218945,
1017
- "Standard Deviation": null,
1018
- "Rank": 13
1019
- }
1020
- }
1021
- },
1022
- {
1023
- "config": {
1024
- "model_name": "claude-3-sonnet",
1025
- "organization": "Anthropic",
1026
- "license": "Proprietary",
1027
- "knowledge_cutoff": "2023/08"
1028
- },
1029
- "results": {
1030
- "OVERALL": {
1031
- "Average Score": 0.520010833,
1032
- "Standard Deviation": 0.005030563799,
1033
- "Rank": 17
1034
- },
1035
- "Geometry": {
1036
- "Average Score": 0.675613638,
1037
- "Standard Deviation": 0.05275594408,
1038
- "Rank": 14
1039
- },
1040
- "Algebra": {
1041
- "Average Score": 0.552025728,
1042
- "Standard Deviation": 0.04122192409,
1043
- "Rank": 22
1044
- },
1045
- "Probability": {
1046
- "Average Score": 0.516192848,
1047
- "Standard Deviation": 0.04152293217,
1048
- "Rank": 19
1049
- },
1050
- "Logical": {
1051
- "Average Score": 0.588545747,
1052
- "Standard Deviation": 0.06068211943,
1053
- "Rank": 17
1054
- },
1055
- "Social": {
1056
- "Average Score": 0.570437582,
1057
- "Standard Deviation": 0.08607040862,
1058
- "Rank": 14
1059
- },
1060
- "Chemistry": {
1061
- "Average Score": 61.33538592327427,
1062
- "Standard Deviation": null,
1063
- "Rank": 15
1064
- }
1065
- }
1066
- },
1067
- {
1068
- "config": {
1069
- "model_name": "qwen1.5-14b-chat",
1070
- "organization": "Alibaba",
1071
- "license": "Qianwen LICENSE",
1072
- "knowledge_cutoff": "2024/02"
1073
- },
1074
- "results": {
1075
- "OVERALL": {
1076
- "Average Score": 0.415328996,
1077
- "Standard Deviation": 0.0743938717,
1078
- "Rank": 28
1079
- },
1080
- "Geometry": {
1081
- "Average Score": 0.452504016,
1082
- "Standard Deviation": 0.04225594393,
1083
- "Rank": 27
1084
- },
1085
- "Algebra": {
1086
- "Average Score": 0.538655725,
1087
- "Standard Deviation": 0.03721542594,
1088
- "Rank": 23
1089
- },
1090
- "Probability": {
1091
- "Average Score": 0.397185975,
1092
- "Standard Deviation": 0.05607695946,
1093
- "Rank": 29
1094
- },
1095
- "Logical": {
1096
- "Average Score": 0.264573129,
1097
- "Standard Deviation": 0.03936133174,
1098
- "Rank": 35
1099
- },
1100
- "Social": {
1101
- "Average Score": 0.287370142,
1102
- "Standard Deviation": 0.04264085315,
1103
- "Rank": 31
1104
- },
1105
- "Chemistry": {
1106
- "Average Score": 38.552779976347026,
1107
- "Standard Deviation": null,
1108
- "Rank": 31
1109
- }
1110
- }
1111
- },
1112
- {
1113
- "config": {
1114
- "model_name": "claude-3-haiku",
1115
- "organization": "Anthropic",
1116
- "license": "Proprietary",
1117
- "knowledge_cutoff": "2023/08"
1118
- },
1119
- "results": {
1120
- "OVERALL": {
1121
- "Average Score": 0.453901163,
1122
- "Standard Deviation": 0.003604084261,
1123
- "Rank": 24
1124
- },
1125
- "Geometry": {
1126
- "Average Score": 0.607993912,
1127
- "Standard Deviation": 0.05793460748,
1128
- "Rank": 16
1129
- },
1130
- "Algebra": {
1131
- "Average Score": 0.520054055,
1132
- "Standard Deviation": 0.03333544511,
1133
- "Rank": 24
1134
- },
1135
- "Probability": {
1136
- "Average Score": 0.474460688,
1137
- "Standard Deviation": 0.0446501933,
1138
- "Rank": 24
1139
- },
1140
- "Logical": {
1141
- "Average Score": 0.512815976,
1142
- "Standard Deviation": 0.0163264281,
1143
- "Rank": 21
1144
- },
1145
- "Social": {
1146
- "Average Score": 0.551083976,
1147
- "Standard Deviation": 0.05374722539,
1148
- "Rank": 16
1149
- },
1150
- "Chemistry": {
1151
- "Average Score": 56.40200048817984,
1152
- "Standard Deviation": null,
1153
- "Rank": 16
1154
- }
1155
- }
1156
- },
1157
- {
1158
- "config": {
1159
- "model_name": "claude-2.1",
1160
- "organization": "Anthropic",
1161
- "license": "Proprietary",
1162
- "knowledge_cutoff": "Unknown"
1163
- },
1164
- "results": {
1165
- "OVERALL": {
1166
- "Average Score": 0.35814708,
1167
- "Standard Deviation": 0.09168134168,
1168
- "Rank": 36
1169
- },
1170
- "Geometry": {
1171
- "Average Score": 0.62752395,
1172
- "Standard Deviation": 0.07232659398,
1173
- "Rank": 15
1174
- },
1175
- "Algebra": {
1176
- "Average Score": 0.508849609,
1177
- "Standard Deviation": 0.0346897465,
1178
- "Rank": 25
1179
- },
1180
- "Probability": {
1181
- "Average Score": 0.41477086,
1182
- "Standard Deviation": 0.05964060239,
1183
- "Rank": 28
1184
- },
1185
- "Logical": {
1186
- "Average Score": 0.482923674,
1187
- "Standard Deviation": 0.01989147048,
1188
- "Rank": 22
1189
- },
1190
- "Social": {
1191
- "Average Score": 0.333804568,
1192
- "Standard Deviation": 0.03775548253,
1193
- "Rank": 28
1194
- },
1195
- "Chemistry": {
1196
- "Average Score": 47.23672563994903,
1197
- "Standard Deviation": null,
1198
- "Rank": 21
1199
- }
1200
- }
1201
- },
1202
- {
1203
- "config": {
1204
- "model_name": "mistral-8x7b-instruct-v0.1",
1205
- "organization": "Mistral",
1206
- "license": "Apache 2.0",
1207
- "knowledge_cutoff": "2023/12"
1208
- },
1209
- "results": {
1210
- "OVERALL": {
1211
- "Average Score": 0.382659161,
1212
- "Standard Deviation": 0.07594496929,
1213
- "Rank": 31
1214
- },
1215
- "Geometry": {
1216
- "Average Score": 0.432216097,
1217
- "Standard Deviation": 0.04747949254,
1218
- "Rank": 30
1219
- },
1220
- "Algebra": {
1221
- "Average Score": 0.478314888,
1222
- "Standard Deviation": 0.01998797419,
1223
- "Rank": 26
1224
- },
1225
- "Probability": {
1226
- "Average Score": 0.427144725,
1227
- "Standard Deviation": 0.0590923329,
1228
- "Rank": 27
1229
- },
1230
- "Logical": {
1231
- "Average Score": 0.340041983,
1232
- "Standard Deviation": 0.008397574592,
1233
- "Rank": 28
1234
- },
1235
- "Social": {
1236
- "Average Score": 0.251949622,
1237
- "Standard Deviation": 0.03346674405,
1238
- "Rank": 36
1239
- },
1240
- "Chemistry": {
1241
- "Average Score": 44.533118241976666,
1242
- "Standard Deviation": null,
1243
- "Rank": 25
1244
- }
1245
- }
1246
- },
1247
- {
1248
- "config": {
1249
- "model_name": "claude-2.0",
1250
- "organization": "Anthropic",
1251
- "license": "Proprietary",
1252
- "knowledge_cutoff": "Unknown"
1253
- },
1254
- "results": {
1255
- "OVERALL": {
1256
- "Average Score": 0.322718057,
1257
- "Standard Deviation": 0.08369883584,
1258
- "Rank": 38
1259
- },
1260
- "Geometry": {
1261
- "Average Score": 0.604141967,
1262
- "Standard Deviation": 0.05116441826,
1263
- "Rank": 17
1264
- },
1265
- "Algebra": {
1266
- "Average Score": 0.474350734,
1267
- "Standard Deviation": 0.01510393066,
1268
- "Rank": 27
1269
- },
1270
- "Probability": {
1271
- "Average Score": 0.437950412,
1272
- "Standard Deviation": 0.05985594317,
1273
- "Rank": 26
1274
- },
1275
- "Logical": {
1276
- "Average Score": 0.445620646,
1277
- "Standard Deviation": 0.01812614805,
1278
- "Rank": 24
1279
- },
1280
- "Social": {
1281
- "Average Score": 0.469422836,
1282
- "Standard Deviation": 0.05999901796,
1283
- "Rank": 19
1284
- },
1285
- "Chemistry": {
1286
- "Average Score": 50.773143448036464,
1287
- "Standard Deviation": null,
1288
- "Rank": 19
1289
- }
1290
- }
1291
- },
1292
- {
1293
- "config": {
1294
- "model_name": "starling-lm-7b-beta",
1295
- "organization": "Nexusflow",
1296
- "license": "Apache-2.0",
1297
- "knowledge_cutoff": "2024/03"
1298
- },
1299
- "results": {
1300
- "OVERALL": {
1301
- "Average Score": 0.479391856,
1302
- "Standard Deviation": 0.04199990887,
1303
- "Rank": 22
1304
- },
1305
- "Geometry": {
1306
- "Average Score": 0.446654388,
1307
- "Standard Deviation": 0.05637864999,
1308
- "Rank": 29
1309
- },
1310
- "Algebra": {
1311
- "Average Score": 0.473952749,
1312
- "Standard Deviation": 0.01584301288,
1313
- "Rank": 28
1314
- },
1315
- "Probability": {
1316
- "Average Score": 0.395197837,
1317
- "Standard Deviation": 0.05814798892,
1318
- "Rank": 30
1319
- },
1320
- "Logical": {
1321
- "Average Score": 0.39927199,
1322
- "Standard Deviation": 0.02125277518,
1323
- "Rank": 26
1324
- },
1325
- "Social": {
1326
- "Average Score": 0.380021662,
1327
- "Standard Deviation": 0.04622452748,
1328
- "Rank": 26
1329
- },
1330
- "Chemistry": {
1331
- "Average Score": 38.27587102395908,
1332
- "Standard Deviation": null,
1333
- "Rank": 32
1334
- }
1335
- }
1336
- },
1337
- {
1338
- "config": {
1339
- "model_name": "gemini-1.0-pro-001",
1340
- "organization": "Google",
1341
- "license": "Proprietary",
1342
- "knowledge_cutoff": "2023/04"
1343
- },
1344
- "results": {
1345
- "OVERALL": {
1346
- "Average Score": 0.449040654,
1347
- "Standard Deviation": 0.0450610177,
1348
- "Rank": 25
1349
- },
1350
- "Geometry": {
1351
- "Average Score": 0.578347959,
1352
- "Standard Deviation": 0.04242873607,
1353
- "Rank": 20
1354
- },
1355
- "Algebra": {
1356
- "Average Score": 0.462417786,
1357
- "Standard Deviation": 0.01668313635,
1358
- "Rank": 29
1359
- },
1360
- "Probability": {
1361
- "Average Score": 0.289836324,
1362
- "Standard Deviation": 0.05739831115,
1363
- "Rank": 38
1364
- },
1365
- "Logical": {
1366
- "Average Score": 0.191140355,
1367
- "Standard Deviation": 0.03394652499,
1368
- "Rank": 41
1369
- },
1370
- "Social": {
1371
- "Average Score": 0.130790863,
1372
- "Standard Deviation": 0.02800188173,
1373
- "Rank": 46
1374
- },
1375
- "Chemistry": {
1376
- "Average Score": 45.22204471452975,
1377
- "Standard Deviation": null,
1378
- "Rank": 23
1379
- }
1380
- }
1381
- },
1382
- {
1383
- "config": {
1384
- "model_name": "openchat-3.5-0106",
1385
- "organization": "OpenChat",
1386
- "license": "Apache-2.0",
1387
- "knowledge_cutoff": "2024/01"
1388
- },
1389
- "results": {
1390
- "OVERALL": {
1391
- "Average Score": 0.363929888,
1392
- "Standard Deviation": 0.08602347145,
1393
- "Rank": 34
1394
- },
1395
- "Geometry": {
1396
- "Average Score": 0.38715246,
1397
- "Standard Deviation": 0.03701851946,
1398
- "Rank": 33
1399
- },
1400
- "Algebra": {
1401
- "Average Score": 0.441233712,
1402
- "Standard Deviation": 0.01135753754,
1403
- "Rank": 30
1404
- },
1405
- "Probability": {
1406
- "Average Score": 0.38802618,
1407
- "Standard Deviation": 0.05663879714,
1408
- "Rank": 31
1409
- },
1410
- "Logical": {
1411
- "Average Score": 0.336754383,
1412
- "Standard Deviation": 0.01608478079,
1413
- "Rank": 29
1414
- },
1415
- "Social": {
1416
- "Average Score": 0.250891608,
1417
- "Standard Deviation": 0.03253769914,
1418
- "Rank": 37
1419
- },
1420
- "Chemistry": {
1421
- "Average Score": 33.70639271807677,
1422
- "Standard Deviation": null,
1423
- "Rank": 33
1424
- }
1425
- }
1426
- },
1427
- {
1428
- "config": {
1429
- "model_name": "openchat-3.5",
1430
- "organization": "OpenChat",
1431
- "license": "Apache-2.0",
1432
- "knowledge_cutoff": "2023/11"
1433
- },
1434
- "results": {
1435
- "OVERALL": {
1436
- "Average Score": 0.361341296,
1437
- "Standard Deviation": 0.09034869493,
1438
- "Rank": 35
1439
- },
1440
- "Geometry": {
1441
- "Average Score": 0.401699069,
1442
- "Standard Deviation": 0.03410726557,
1443
- "Rank": 31
1444
- },
1445
- "Algebra": {
1446
- "Average Score": 0.414095336,
1447
- "Standard Deviation": 0.01881964261,
1448
- "Rank": 32
1449
- },
1450
- "Probability": {
1451
- "Average Score": 0.349601002,
1452
- "Standard Deviation": 0.05077455539,
1453
- "Rank": 33
1454
- },
1455
- "Logical": {
1456
- "Average Score": 0.331069242,
1457
- "Standard Deviation": 0.02180827173,
1458
- "Rank": 31
1459
- },
1460
- "Social": {
1461
- "Average Score": 0.319991655,
1462
- "Standard Deviation": 0.04502478724,
1463
- "Rank": 30
1464
- },
1465
- "Chemistry": {
1466
- "Average Score": 33.020911255646965,
1467
- "Standard Deviation": null,
1468
- "Rank": 34
1469
- }
1470
- }
1471
- },
1472
- {
1473
- "config": {
1474
- "model_name": "command-r-(08-2024)",
1475
- "organization": "Cohere",
1476
- "license": "CC-BY-NC-4.0",
1477
- "knowledge_cutoff": "2024/08"
1478
- },
1479
- "results": {
1480
- "OVERALL": {
1481
- "Average Score": 0.427605298,
1482
- "Standard Deviation": 0.01747449163,
1483
- "Rank": 26
1484
- },
1485
- "Geometry": {
1486
- "Average Score": 0.448300727,
1487
- "Standard Deviation": 0.04996362328,
1488
- "Rank": 28
1489
- },
1490
- "Algebra": {
1491
- "Average Score": 0.417519167,
1492
- "Standard Deviation": 0.01822196902,
1493
- "Rank": 31
1494
- },
1495
- "Probability": {
1496
- "Average Score": 0.366336281,
1497
- "Standard Deviation": 0.04716826942,
1498
- "Rank": 32
1499
- },
1500
- "Logical": {
1501
- "Average Score": 0.214657906,
1502
- "Standard Deviation": 0.03003579835,
1503
- "Rank": 38
1504
- },
1505
- "Social": {
1506
- "Average Score": 0.276088379,
1507
- "Standard Deviation": 0.03295234688,
1508
- "Rank": 33
1509
- },
1510
- "Chemistry": {
1511
- "Average Score": 39.61492485677676,
1512
- "Standard Deviation": null,
1513
- "Rank": 30
1514
- }
1515
- }
1516
- },
1517
- {
1518
- "config": {
1519
- "model_name": "gemma-1.1-7b-it",
1520
- "organization": "Google",
1521
- "license": "Gemma License",
1522
- "knowledge_cutoff": "2024/02"
1523
- },
1524
- "results": {
1525
- "OVERALL": {
1526
- "Average Score": 0.339506922,
1527
- "Standard Deviation": 0.1066279108,
1528
- "Rank": 37
1529
- },
1530
- "Geometry": {
1531
- "Average Score": 0.324170977,
1532
- "Standard Deviation": 0.04668553765,
1533
- "Rank": 36
1534
- },
1535
- "Algebra": {
1536
- "Average Score": 0.398684697,
1537
- "Standard Deviation": 0.01982398259,
1538
- "Rank": 33
1539
- },
1540
- "Probability": {
1541
- "Average Score": 0.293253175,
1542
- "Standard Deviation": 0.05126192191,
1543
- "Rank": 37
1544
- },
1545
- "Logical": {
1546
- "Average Score": 0.317750796,
1547
- "Standard Deviation": 0.01101933543,
1548
- "Rank": 32
1549
- },
1550
- "Social": {
1551
- "Average Score": 0.179073276,
1552
- "Standard Deviation": 0.02009658805,
1553
- "Rank": 42
1554
- },
1555
- "Chemistry": {
1556
- "Average Score": 42.666504105798204,
1557
- "Standard Deviation": null,
1558
- "Rank": 27
1559
- }
1560
- }
1561
- },
1562
- {
1563
- "config": {
1564
- "model_name": "llama3-8b-instruct",
1565
- "organization": "Meta",
1566
- "license": "Llama 3 Community",
1567
- "knowledge_cutoff": "2023/03"
1568
- },
1569
- "results": {
1570
- "OVERALL": {
1571
- "Average Score": 0.367722676,
1572
- "Standard Deviation": 0.1071368221,
1573
- "Rank": 32
1574
- },
1575
- "Geometry": {
1576
- "Average Score": 0.367143758,
1577
- "Standard Deviation": 0.04363680358,
1578
- "Rank": 34
1579
- },
1580
- "Algebra": {
1581
- "Average Score": 0.391480973,
1582
- "Standard Deviation": 0.02757445266,
1583
- "Rank": 34
1584
- },
1585
- "Probability": {
1586
- "Average Score": 0.317616445,
1587
- "Standard Deviation": 0.04300430361,
1588
- "Rank": 36
1589
- },
1590
- "Logical": {
1591
- "Average Score": 0.461607495,
1592
- "Standard Deviation": 0.02185028842,
1593
- "Rank": 23
1594
- },
1595
- "Social": {
1596
- "Average Score": 0.336373622,
1597
- "Standard Deviation": 0.05762408512,
1598
- "Rank": 27
1599
- },
1600
- "Chemistry": {
1601
- "Average Score": 45.35392139264795,
1602
- "Standard Deviation": null,
1603
- "Rank": 22
1604
- }
1605
- }
1606
- },
1607
- {
1608
- "config": {
1609
- "model_name": "gemma-2-2b-it",
1610
- "organization": "Google",
1611
- "license": "Gemma License",
1612
- "knowledge_cutoff": "2024/07"
1613
- },
1614
- "results": {
1615
- "OVERALL": {
1616
- "Average Score": 0.502167612,
1617
- "Standard Deviation": 0.04389786763,
1618
- "Rank": 20
1619
- },
1620
- "Geometry": {
1621
- "Average Score": 0.395006676,
1622
- "Standard Deviation": 0.05882607713,
1623
- "Rank": 32
1624
- },
1625
- "Algebra": {
1626
- "Average Score": 0.379391887,
1627
- "Standard Deviation": 0.01722410785,
1628
- "Rank": 35
1629
- },
1630
- "Probability": {
1631
- "Average Score": 0.331231097,
1632
- "Standard Deviation": 0.05392499987,
1633
- "Rank": 35
1634
- },
1635
- "Logical": {
1636
- "Average Score": 0.367687789,
1637
- "Standard Deviation": 0.02547968808,
1638
- "Rank": 27
1639
- },
1640
- "Social": {
1641
- "Average Score": 0.393482094,
1642
- "Standard Deviation": 0.06450214024,
1643
- "Rank": 24
1644
- },
1645
- "Chemistry": {
1646
- "Average Score": 30.53406933106768,
1647
- "Standard Deviation": null,
1648
- "Rank": 36
1649
- }
1650
- }
1651
- },
1652
- {
1653
- "config": {
1654
- "model_name": "starling-lm-7b-alpha",
1655
- "organization": "Nexusflow",
1656
- "license": "Apache-2.0",
1657
- "knowledge_cutoff": "2023/11"
1658
- },
1659
- "results": {
1660
- "OVERALL": {
1661
- "Average Score": 0.366628765,
1662
- "Standard Deviation": 0.08405492929,
1663
- "Rank": 33
1664
- },
1665
- "Geometry": {
1666
- "Average Score": 0.336782578,
1667
- "Standard Deviation": 0.04069449132,
1668
- "Rank": 35
1669
- },
1670
- "Algebra": {
1671
- "Average Score": 0.371551932,
1672
- "Standard Deviation": 0.03367241745,
1673
- "Rank": 36
1674
- },
1675
- "Probability": {
1676
- "Average Score": 0.331472505,
1677
- "Standard Deviation": 0.04833324282,
1678
- "Rank": 34
1679
- },
1680
- "Logical": {
1681
- "Average Score": 0.260869624,
1682
- "Standard Deviation": 0.03562735237,
1683
- "Rank": 36
1684
- },
1685
- "Social": {
1686
- "Average Score": 0.271975534,
1687
- "Standard Deviation": 0.04266753408,
1688
- "Rank": 34
1689
- },
1690
- "Chemistry": {
1691
- "Average Score": 30.07926487356878,
1692
- "Standard Deviation": null,
1693
- "Rank": 37
1694
- }
1695
- }
1696
- },
1697
- {
1698
- "config": {
1699
- "model_name": "qwen1.5-4b-chat",
1700
- "organization": "Alibaba",
1701
- "license": "Qianwen LICENSE",
1702
- "knowledge_cutoff": "2024/02"
1703
- },
1704
- "results": {
1705
- "OVERALL": {
1706
- "Average Score": 0.111876411,
1707
- "Standard Deviation": 0.04241022785,
1708
- "Rank": 49
1709
- },
1710
- "Geometry": {
1711
- "Average Score": 0.215834522,
1712
- "Standard Deviation": 0.0363766363,
1713
- "Rank": 40
1714
- },
1715
- "Algebra": {
1716
- "Average Score": 0.305589811,
1717
- "Standard Deviation": 0.02354198912,
1718
- "Rank": 37
1719
- },
1720
- "Probability": {
1721
- "Average Score": 0.149365327,
1722
- "Standard Deviation": 0.03489672675,
1723
- "Rank": 44
1724
- },
1725
- "Logical": {
1726
- "Average Score": 0.116210168,
1727
- "Standard Deviation": 0.005927966496,
1728
- "Rank": 47
1729
- },
1730
- "Social": {
1731
- "Average Score": 0.18195615,
1732
- "Standard Deviation": 0.02269805277,
1733
- "Rank": 41
1734
- },
1735
- "Chemistry": {
1736
- "Average Score": 13.21208067122554,
1737
- "Standard Deviation": null,
1738
- "Rank": 47
1739
- }
1740
- }
1741
- },
1742
- {
1743
- "config": {
1744
- "model_name": "command-r-(04-2024)",
1745
- "organization": "Cohere",
1746
- "license": "CC-BY-NC-4.0",
1747
- "knowledge_cutoff": "2024/04"
1748
- },
1749
- "results": {
1750
- "OVERALL": {
1751
- "Average Score": 0.388783887,
1752
- "Standard Deviation": 0.07417186783,
1753
- "Rank": 30
1754
- },
1755
- "Geometry": {
1756
- "Average Score": 0.300416698,
1757
- "Standard Deviation": 0.03485612736,
1758
- "Rank": 37
1759
- },
1760
- "Algebra": {
1761
- "Average Score": 0.293120231,
1762
- "Standard Deviation": 0.032926484,
1763
- "Rank": 38
1764
- },
1765
- "Probability": {
1766
- "Average Score": 0.281271304,
1767
- "Standard Deviation": 0.05697149867,
1768
- "Rank": 39
1769
- },
1770
- "Logical": {
1771
- "Average Score": 0.276189906,
1772
- "Standard Deviation": 0.03562914754,
1773
- "Rank": 34
1774
- },
1775
- "Social": {
1776
- "Average Score": 0.283882949,
1777
- "Standard Deviation": 0.03336901148,
1778
- "Rank": 32
1779
- },
1780
- "Chemistry": {
1781
- "Average Score": 41.346336503003236,
1782
- "Standard Deviation": null,
1783
- "Rank": 28
1784
- }
1785
- }
1786
- },
1787
- {
1788
- "config": {
1789
- "model_name": "vicuna-33b",
1790
- "organization": "LMSYS",
1791
- "license": "Non-commercial",
1792
- "knowledge_cutoff": "2023/08"
1793
- },
1794
- "results": {
1795
- "OVERALL": {
1796
- "Average Score": 0.316543555,
1797
- "Standard Deviation": 0.08922095647,
1798
- "Rank": 39
1799
- },
1800
- "Geometry": {
1801
- "Average Score": 0.208284679,
1802
- "Standard Deviation": 0.03937771461,
1803
- "Rank": 41
1804
- },
1805
- "Algebra": {
1806
- "Average Score": 0.248994048,
1807
- "Standard Deviation": 0.02668175054,
1808
- "Rank": 40
1809
- },
1810
- "Probability": {
1811
- "Average Score": 0.222313995,
1812
- "Standard Deviation": 0.03978859759,
1813
- "Rank": 42
1814
- },
1815
- "Logical": {
1816
- "Average Score": 0.180291222,
1817
- "Standard Deviation": 0.021886267,
1818
- "Rank": 42
1819
- },
1820
- "Social": {
1821
- "Average Score": 0.257623798,
1822
- "Standard Deviation": 0.02653724437,
1823
- "Rank": 35
1824
- },
1825
- "Chemistry": {
1826
- "Average Score": 28.01838653090379,
1827
- "Standard Deviation": null,
1828
- "Rank": 38
1829
- }
1830
- }
1831
- },
1832
- {
1833
- "config": {
1834
- "model_name": "gemma-7b-it",
1835
- "organization": "Google",
1836
- "license": "Gemma License",
1837
- "knowledge_cutoff": "2024/02"
1838
- },
1839
- "results": {
1840
- "OVERALL": {
1841
- "Average Score": 0.285077558,
1842
- "Standard Deviation": 0.08871758453,
1843
- "Rank": 41
1844
- },
1845
- "Geometry": {
1846
- "Average Score": 0.244791417,
1847
- "Standard Deviation": 0.0289612078,
1848
- "Rank": 38
1849
- },
1850
- "Algebra": {
1851
- "Average Score": 0.250614794,
1852
- "Standard Deviation": 0.01991678295,
1853
- "Rank": 39
1854
- },
1855
- "Probability": {
1856
- "Average Score": 0.174313053,
1857
- "Standard Deviation": 0.03765424728,
1858
- "Rank": 43
1859
- },
1860
- "Logical": {
1861
- "Average Score": 0.197505536,
1862
- "Standard Deviation": 0.02050298885,
1863
- "Rank": 39
1864
- },
1865
- "Social": {
1866
- "Average Score": 0.202138025,
1867
- "Standard Deviation": 0.02098346639,
1868
- "Rank": 40
1869
- },
1870
- "Chemistry": {
1871
- "Average Score": 28.014658234926813,
1872
- "Standard Deviation": null,
1873
- "Rank": 39
1874
- }
1875
- }
1876
- },
1877
- {
1878
- "config": {
1879
- "model_name": "mistral-7b-instruct-2",
1880
- "organization": "Mistral",
1881
- "license": "Apache 2.0",
1882
- "knowledge_cutoff": "2023/12"
1883
- },
1884
- "results": {
1885
- "OVERALL": {
1886
- "Average Score": 0.427513868,
1887
- "Standard Deviation": 0.05553921135,
1888
- "Rank": 27
1889
- },
1890
- "Geometry": {
1891
- "Average Score": 0.216402626,
1892
- "Standard Deviation": 0.03338414918,
1893
- "Rank": 39
1894
- },
1895
- "Algebra": {
1896
- "Average Score": 0.233777838,
1897
- "Standard Deviation": 0.0155226054,
1898
- "Rank": 41
1899
- },
1900
- "Probability": {
1901
- "Average Score": 0.25118175,
1902
- "Standard Deviation": 0.04065514593,
1903
- "Rank": 40
1904
- },
1905
- "Logical": {
1906
- "Average Score": 0.224469136,
1907
- "Standard Deviation": 0.03404706752,
1908
- "Rank": 37
1909
- },
1910
- "Social": {
1911
- "Average Score": 0.209386782,
1912
- "Standard Deviation": 0.02738569921,
1913
- "Rank": 39
1914
- },
1915
- "Chemistry": {
1916
- "Average Score": 31.382959631870822,
1917
- "Standard Deviation": null,
1918
- "Rank": 35
1919
- }
1920
- }
1921
- },
1922
- {
1923
- "config": {
1924
- "model_name": "mistral-7b-instruct-1",
1925
- "organization": "Mistral",
1926
- "license": "Apache 2.0",
1927
- "knowledge_cutoff": "2023/12"
1928
- },
1929
- "results": {
1930
- "OVERALL": {
1931
- "Average Score": 0.23016314,
1932
- "Standard Deviation": 0.07137625271,
1933
- "Rank": 46
1934
- },
1935
- "Geometry": {
1936
- "Average Score": 0.161799938,
1937
- "Standard Deviation": 0.03595278559,
1938
- "Rank": 45
1939
- },
1940
- "Algebra": {
1941
- "Average Score": 0.210341624,
1942
- "Standard Deviation": 0.01736539119,
1943
- "Rank": 42
1944
- },
1945
- "Probability": {
1946
- "Average Score": 0.238417922,
1947
- "Standard Deviation": 0.03744211933,
1948
- "Rank": 41
1949
- },
1950
- "Logical": {
1951
- "Average Score": 0.142636601,
1952
- "Standard Deviation": 0.02080406365,
1953
- "Rank": 46
1954
- },
1955
- "Social": {
1956
- "Average Score": 0.117646827,
1957
- "Standard Deviation": 0.009321202779,
1958
- "Rank": 48
1959
- },
1960
- "Chemistry": {
1961
- "Average Score": 18.929093202755805,
1962
- "Standard Deviation": null,
1963
- "Rank": 42
1964
- }
1965
- }
1966
- },
1967
- {
1968
- "config": {
1969
- "model_name": "vicuna-13b",
1970
- "organization": "LMSYS",
1971
- "license": "Non-commercial",
1972
- "knowledge_cutoff": "2023/07"
1973
- },
1974
- "results": {
1975
- "OVERALL": {
1976
- "Average Score": 0.201892849,
1977
- "Standard Deviation": 0.06021749802,
1978
- "Rank": 47
1979
- },
1980
- "Geometry": {
1981
- "Average Score": 0.200941928,
1982
- "Standard Deviation": 0.03366817781,
1983
- "Rank": 42
1984
- },
1985
- "Algebra": {
1986
- "Average Score": 0.196123323,
1987
- "Standard Deviation": 0.0135715643,
1988
- "Rank": 43
1989
- },
1990
- "Probability": {
1991
- "Average Score": 0.141214079,
1992
- "Standard Deviation": 0.02721328211,
1993
- "Rank": 45
1994
- },
1995
- "Logical": {
1996
- "Average Score": 0.148598631,
1997
- "Standard Deviation": 0.02241523892,
1998
- "Rank": 44
1999
- },
2000
- "Social": {
2001
- "Average Score": 0.124655135,
2002
- "Standard Deviation": 0.01122382671,
2003
- "Rank": 47
2004
- },
2005
- "Chemistry": {
2006
- "Average Score": 21.840013221590294,
2007
- "Standard Deviation": null,
2008
- "Rank": 40
2009
- }
2010
- }
2011
- },
2012
- {
2013
- "config": {
2014
- "model_name": "zephyr-7b-beta",
2015
- "organization": "HuggingFace",
2016
- "license": "MIT",
2017
- "knowledge_cutoff": "2023/10"
2018
- },
2019
- "results": {
2020
- "OVERALL": {
2021
- "Average Score": 0.102705119,
2022
- "Standard Deviation": 0.03683757312,
2023
- "Rank": 50
2024
- },
2025
- "Geometry": {
2026
- "Average Score": 0.114005544,
2027
- "Standard Deviation": 0.03144354365,
2028
- "Rank": 46
2029
- },
2030
- "Algebra": {
2031
- "Average Score": 0.141766633,
2032
- "Standard Deviation": 0.03179520129,
2033
- "Rank": 44
2034
- },
2035
- "Probability": {
2036
- "Average Score": 0.089050714,
2037
- "Standard Deviation": 0.002136754266,
2038
- "Rank": 48
2039
- },
2040
- "Logical": {
2041
- "Average Score": 0.069520789,
2042
- "Standard Deviation": 0.004477840857,
2043
- "Rank": 51
2044
- },
2045
- "Social": {
2046
- "Average Score": 0.0,
2047
- "Standard Deviation": 0.0,
2048
- "Rank": 53
2049
- },
2050
- "Chemistry": {
2051
- "Average Score": 18.92902220864132,
2052
- "Standard Deviation": null,
2053
- "Rank": 43
2054
- }
2055
- }
2056
- },
2057
- {
2058
- "config": {
2059
- "model_name": "gemma-1.1-2b-it",
2060
- "organization": "Google",
2061
- "license": "Gemma License",
2062
- "knowledge_cutoff": "2024/02"
2063
- },
2064
- "results": {
2065
- "OVERALL": {
2066
- "Average Score": 0.257700845,
2067
- "Standard Deviation": 0.07369021445,
2068
- "Rank": 44
2069
- },
2070
- "Geometry": {
2071
- "Average Score": 0.183974034,
2072
- "Standard Deviation": 0.0215548886,
2073
- "Rank": 44
2074
- },
2075
- "Algebra": {
2076
- "Average Score": 0.13422252,
2077
- "Standard Deviation": 0.01922819511,
2078
- "Rank": 45
2079
- },
2080
- "Probability": {
2081
- "Average Score": 0.095628657,
2082
- "Standard Deviation": 0.007536076456,
2083
- "Rank": 47
2084
- },
2085
- "Logical": {
2086
- "Average Score": 0.094965074,
2087
- "Standard Deviation": 0.005019175487,
2088
- "Rank": 49
2089
- },
2090
- "Social": {
2091
- "Average Score": 0.167796727,
2092
- "Standard Deviation": 0.01666541942,
2093
- "Rank": 43
2094
- },
2095
- "Chemistry": {
2096
- "Average Score": 20.724691953843916,
2097
- "Standard Deviation": null,
2098
- "Rank": 41
2099
- }
2100
- }
2101
- },
2102
- {
2103
- "config": {
2104
- "model_name": "llama2-7b-chat",
2105
- "organization": "Meta",
2106
- "license": "Llama 2 Community",
2107
- "knowledge_cutoff": "2023/07"
2108
- },
2109
- "results": {
2110
- "OVERALL": {
2111
- "Average Score": 0.260189428,
2112
- "Standard Deviation": 0.08019299364,
2113
- "Rank": 43
2114
- },
2115
- "Geometry": {
2116
- "Average Score": 0.087067276,
2117
- "Standard Deviation": 0.04274343402,
2118
- "Rank": 47
2119
- },
2120
- "Algebra": {
2121
- "Average Score": 0.12308805,
2122
- "Standard Deviation": 0.01856053622,
2123
- "Rank": 46
2124
- },
2125
- "Probability": {
2126
- "Average Score": 0.087515438,
2127
- "Standard Deviation": 0.006315053573,
2128
- "Rank": 49
2129
- },
2130
- "Logical": {
2131
- "Average Score": 0.17312827,
2132
- "Standard Deviation": 0.01867044092,
2133
- "Rank": 43
2134
- },
2135
- "Social": {
2136
- "Average Score": 0.152905272,
2137
- "Standard Deviation": 0.007166957097,
2138
- "Rank": 44
2139
- },
2140
- "Chemistry": {
2141
- "Average Score": 15.730513733660898,
2142
- "Standard Deviation": null,
2143
- "Rank": 45
2144
- }
2145
- }
2146
- },
2147
- {
2148
- "config": {
2149
- "model_name": "gemma-2b-it",
2150
- "organization": "Google",
2151
- "license": "Gemma License",
2152
- "knowledge_cutoff": "2024/02"
2153
- },
2154
- "results": {
2155
- "OVERALL": {
2156
- "Average Score": 0.234172069,
2157
- "Standard Deviation": 0.06522685718,
2158
- "Rank": 45
2159
- },
2160
- "Geometry": {
2161
- "Average Score": 0.198571153,
2162
- "Standard Deviation": 0.01699161031,
2163
- "Rank": 43
2164
- },
2165
- "Algebra": {
2166
- "Average Score": 0.109883009,
2167
- "Standard Deviation": 0.01520005833,
2168
- "Rank": 47
2169
- },
2170
- "Probability": {
2171
- "Average Score": 0.06467432,
2172
- "Standard Deviation": 0.002117497231,
2173
- "Rank": 51
2174
- },
2175
- "Logical": {
2176
- "Average Score": 0.039624492,
2177
- "Standard Deviation": 0.007606972686,
2178
- "Rank": 52
2179
- },
2180
- "Social": {
2181
- "Average Score": 0.087452913,
2182
- "Standard Deviation": 0.008170146562,
2183
- "Rank": 51
2184
- },
2185
- "Chemistry": {
2186
- "Average Score": 17.2715657115764,
2187
- "Standard Deviation": null,
2188
- "Rank": 44
2189
- }
2190
- }
2191
- },
2192
- {
2193
- "config": {
2194
- "model_name": "llama2-13b-chat",
2195
- "organization": "Meta",
2196
- "license": "Llama 2 Community",
2197
- "knowledge_cutoff": "2023/07"
2198
- },
2199
- "results": {
2200
- "OVERALL": {
2201
- "Average Score": 0.263305684,
2202
- "Standard Deviation": 0.07283640689,
2203
- "Rank": 42
2204
- },
2205
- "Geometry": {
2206
- "Average Score": 0.072729954,
2207
- "Standard Deviation": 0.02315988261,
2208
- "Rank": 49
2209
- },
2210
- "Algebra": {
2211
- "Average Score": 0.080371692,
2212
- "Standard Deviation": 0.01277569453,
2213
- "Rank": 48
2214
- },
2215
- "Probability": {
2216
- "Average Score": 0.117757344,
2217
- "Standard Deviation": 0.02418619619,
2218
- "Rank": 46
2219
- },
2220
- "Logical": {
2221
- "Average Score": 0.193149889,
2222
- "Standard Deviation": 0.01776690764,
2223
- "Rank": 40
2224
- },
2225
- "Social": {
2226
- "Average Score": 0.149125922,
2227
- "Standard Deviation": 0.01157416827,
2228
- "Rank": 45
2229
- },
2230
- "Chemistry": {
2231
- "Average Score": 13.17258252933903,
2232
- "Standard Deviation": null,
2233
- "Rank": 48
2234
- }
2235
- }
2236
- },
2237
- {
2238
- "config": {
2239
- "model_name": "vicuna-7b",
2240
- "organization": "LMSYS",
2241
- "license": "Non-commercial",
2242
- "knowledge_cutoff": "2023/07"
2243
- },
2244
- "results": {
2245
- "OVERALL": {
2246
- "Average Score": 0.198839786,
2247
- "Standard Deviation": 0.05725381576,
2248
- "Rank": 48
2249
- },
2250
- "Geometry": {
2251
- "Average Score": 0.083457058,
2252
- "Standard Deviation": 0.02520989111,
2253
- "Rank": 48
2254
- },
2255
- "Algebra": {
2256
- "Average Score": 0.070883882,
2257
- "Standard Deviation": 0.007315853253,
2258
- "Rank": 49
2259
- },
2260
- "Probability": {
2261
- "Average Score": 0.080987673,
2262
- "Standard Deviation": 0.005474288861,
2263
- "Rank": 50
2264
- },
2265
- "Logical": {
2266
- "Average Score": 0.100065588,
2267
- "Standard Deviation": 0.003561886452,
2268
- "Rank": 48
2269
- },
2270
- "Social": {
2271
- "Average Score": 0.111076414,
2272
- "Standard Deviation": 0.004805626512,
2273
- "Rank": 49
2274
- },
2275
- "Chemistry": {
2276
- "Average Score": 14.255194156624162,
2277
- "Standard Deviation": null,
2278
- "Rank": 46
2279
- }
2280
- }
2281
- },
2282
- {
2283
- "config": {
2284
- "model_name": "koala-13b",
2285
- "organization": "UC Berkeley",
2286
- "license": "Non-commercial",
2287
- "knowledge_cutoff": "2023/04"
2288
- },
2289
- "results": {
2290
- "OVERALL": {
2291
- "Average Score": 0.09387188,
2292
- "Standard Deviation": 0.02642167489,
2293
- "Rank": 51
2294
- },
2295
- "Geometry": {
2296
- "Average Score": 0.017374001,
2297
- "Standard Deviation": 0.01747053557,
2298
- "Rank": 50
2299
- },
2300
- "Algebra": {
2301
- "Average Score": 0.018129197,
2302
- "Standard Deviation": 0.01054371383,
2303
- "Rank": 50
2304
- },
2305
- "Probability": {
2306
- "Average Score": 0.043654362,
2307
- "Standard Deviation": 0.004288231886,
2308
- "Rank": 52
2309
- },
2310
- "Logical": {
2311
- "Average Score": 0.074694053,
2312
- "Standard Deviation": 0.002674646998,
2313
- "Rank": 50
2314
- },
2315
- "Social": {
2316
- "Average Score": 0.096983835,
2317
- "Standard Deviation": 0.007847059783,
2318
- "Rank": 50
2319
- },
2320
- "Chemistry": {
2321
- "Average Score": 6.36433272373514,
2322
- "Standard Deviation": null,
2323
- "Rank": 49
2324
- }
2325
- }
2326
- },
2327
- {
2328
- "config": {
2329
- "model_name": "openassistant-pythia-12b",
2330
- "organization": "OpenAssistant",
2331
- "license": "Non-commercial",
2332
- "knowledge_cutoff": "2023/04"
2333
- },
2334
- "results": {
2335
- "OVERALL": {
2336
- "Average Score": 0.0,
2337
- "Standard Deviation": 0.0,
2338
- "Rank": 52
2339
- },
2340
- "Geometry": {
2341
- "Average Score": 0.0,
2342
- "Standard Deviation": 0.0,
2343
- "Rank": 51
2344
- },
2345
- "Algebra": {
2346
- "Average Score": 0.0,
2347
- "Standard Deviation": 0.0,
2348
- "Rank": 51
2349
- },
2350
- "Probability": {
2351
- "Average Score": 0.0,
2352
- "Standard Deviation": 0.0,
2353
- "Rank": 53
2354
- },
2355
- "Logical": {
2356
- "Average Score": 0.0,
2357
- "Standard Deviation": 0.0,
2358
- "Rank": 53
2359
- },
2360
- "Social": {
2361
- "Average Score": 0.030792528,
2362
- "Standard Deviation": 0.007518796391,
2363
- "Rank": 52
2364
- },
2365
- "Chemistry": {
2366
- "Average Score": 0.0,
2367
- "Standard Deviation": null,
2368
- "Rank": 50
2369
- }
2370
- }
2371
- }
2372
- ]
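The deleted result files in this commit share the same per-model schema: a "config" block with model metadata (model_name, organization, license, knowledge_cutoff) and a "results" block keyed by domain (e.g. "OVERALL", "Algebra", "Chemistry"), each holding an Average Score, Standard Deviation, and Rank. For reference, below is a minimal sketch of reading such a file into flat rows; it is not the Space's own loader, and the function name and example path are illustrative only.

# Minimal sketch (illustrative, not the repository's loader) of parsing a
# results JSON like the ones deleted in this commit.
import json

def load_results(path):
    # Each entry pairs a "config" block (model metadata) with a "results"
    # block keyed by domain; some entries use "N/A" for missing scores.
    # Note: in these files the Chemistry/CPP scores are on a 0-100 scale,
    # while the other domains are on a 0-1 scale.
    with open(path) as f:
        entries = json.load(f)
    rows = []
    for entry in entries:
        row = {"model": entry["config"]["model_name"]}
        for domain, stats in entry["results"].items():
            row[domain] = stats.get("Average Score")
        rows.append(row)
    return rows

# Example (hypothetical path):
# rows = load_results("./src/results/models_<timestamp>.json")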
src/results/models_2024-10-10-06:18:54.263527.json DELETED
@@ -1,2622 +0,0 @@
1
- [
2
- {
3
- "config": {
4
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
- "organization": "OpenAI",
6
- "license": "Proprietary",
7
- "knowledge_cutoff": "2023/10"
8
- },
9
- "results": {
10
- "OVERALL": {
11
- "Average Score": 0.974329609,
12
- "Standard Deviation": 0.005024959031,
13
- "Rank": 2
14
- },
15
- "Geometry": {
16
- "Average Score": 0.976028578,
17
- "Standard Deviation": 0.01507912373,
18
- "Rank": 3
19
- },
20
- "Algebra": {
21
- "Average Score": 0.951199453,
22
- "Standard Deviation": 0.08452452108,
23
- "Rank": 3
24
- },
25
- "Probability": {
26
- "Average Score": 0.842116641,
27
- "Standard Deviation": 0.006267759054,
28
- "Rank": 3
29
- },
30
- "Logical": {
31
- "Average Score": 0.828490728,
32
- "Standard Deviation": 0.009134213144,
33
- "Rank": 3
34
- },
35
- "Social": {
36
- "Average Score": 0.815902987,
37
- "Standard Deviation": 0.0196254222,
38
- "Rank": 3
39
- },
40
- "Chemistry": {
41
- "Average Score": 100.0,
42
- "Standard Deviation": null,
43
- "Rank": 1
44
- },
45
- "CPP": {
46
- "Average Score": 100.0,
47
- "Standard Deviation": null,
48
- "Rank": 1
49
- }
50
- }
51
- },
52
- {
53
- "config": {
54
- "model_name": "gpt-4o-2024-08-06",
55
- "organization": "OpenAI",
56
- "license": "Proprietary",
57
- "knowledge_cutoff": "2023/10"
58
- },
59
- "results": {
60
- "OVERALL": {
61
- "Average Score": 0.846571548,
62
- "Standard Deviation": 0.03394056554,
63
- "Rank": 6
64
- },
65
- "Geometry": {
66
- "Average Score": 0.99773096,
67
- "Standard Deviation": 0.002835555172,
68
- "Rank": 1
69
- },
70
- "Algebra": {
71
- "Average Score": 1.0,
72
- "Standard Deviation": 0.0,
73
- "Rank": 1
74
- },
75
- "Probability": {
76
- "Average Score": 0.78855795,
77
- "Standard Deviation": 0.008188675452,
78
- "Rank": 6
79
- },
80
- "Logical": {
81
- "Average Score": 0.668635768,
82
- "Standard Deviation": 0.03466314094,
83
- "Rank": 11
84
- },
85
- "Social": {
86
- "Average Score": 0.680417314,
87
- "Standard Deviation": 0.00656867063,
88
- "Rank": 8
89
- },
90
- "Chemistry": {
91
- "Average Score": 92.43090226400756,
92
- "Standard Deviation": null,
93
- "Rank": 2
94
- },
95
- "CPP": {
96
- "Average Score": 92.43090226400756,
97
- "Standard Deviation": null,
98
- "Rank": 2
99
- }
100
- }
101
- },
102
- {
103
- "config": {
104
- "model_name": "gpt-4o-2024-05-13",
105
- "organization": "OpenAI",
106
- "license": "Proprietary",
107
- "knowledge_cutoff": "2023/10"
108
- },
109
- "results": {
110
- "OVERALL": {
111
- "Average Score": 0.846334477,
112
- "Standard Deviation": 0.09377911572,
113
- "Rank": 7
114
- },
115
- "Geometry": {
116
- "Average Score": 0.972472377,
117
- "Standard Deviation": 0.01648274205,
118
- "Rank": 4
119
- },
120
- "Algebra": {
121
- "Average Score": 0.995511298,
122
- "Standard Deviation": 0.004097802515,
123
- "Rank": 2
124
- },
125
- "Probability": {
126
- "Average Score": 0.812149974,
127
- "Standard Deviation": 0.007669585485,
128
- "Rank": 4
129
- },
130
- "Logical": {
131
- "Average Score": 0.755019692,
132
- "Standard Deviation": 0.008149588572,
133
- "Rank": 6
134
- },
135
- "Social": {
136
- "Average Score": 0.609875087,
137
- "Standard Deviation": 0.038729239,
138
- "Rank": 13
139
- },
140
- "Chemistry": {
141
- "Average Score": 79.1592634699295,
142
- "Standard Deviation": null,
143
- "Rank": 5
144
- },
145
- "CPP": {
146
- "Average Score": 79.1592634699295,
147
- "Standard Deviation": null,
148
- "Rank": 6
149
- }
150
- }
151
- },
152
- {
153
- "config": {
154
- "model_name": "gpt-4-turbo-2024-04-09",
155
- "organization": "OpenAI",
156
- "license": "Proprietary",
157
- "knowledge_cutoff": "2023/12"
158
- },
159
- "results": {
160
- "OVERALL": {
161
- "Average Score": 0.855357972,
162
- "Standard Deviation": 0.1016986368,
163
- "Rank": 4
164
- },
165
- "Geometry": {
166
- "Average Score": 0.95374588,
167
- "Standard Deviation": 0.03109307166,
168
- "Rank": 5
169
- },
170
- "Algebra": {
171
- "Average Score": 0.930945223,
172
- "Standard Deviation": 0.06705136813,
173
- "Rank": 4
174
- },
175
- "Probability": {
176
- "Average Score": 0.750705448,
177
- "Standard Deviation": 0.05944483103,
178
- "Rank": 8
179
- },
180
- "Logical": {
181
- "Average Score": 0.77906699,
182
- "Standard Deviation": 0.007406734161,
183
- "Rank": 4
184
- },
185
- "Social": {
186
- "Average Score": 0.715935163,
187
- "Standard Deviation": 0.1209141409,
188
- "Rank": 6
189
- },
190
- "Chemistry": {
191
- "Average Score": 70.73143363230263,
192
- "Standard Deviation": null,
193
- "Rank": 10
194
- },
195
- "CPP": {
196
- "Average Score": 70.73143363230263,
197
- "Standard Deviation": null,
198
- "Rank": 11
199
- }
200
- }
201
- },
202
- {
203
- "config": {
204
- "model_name": "gemini-1.5-pro-001",
205
- "organization": "Google",
206
- "license": "Proprietary",
207
- "knowledge_cutoff": "2023/11"
208
- },
209
- "results": {
210
- "OVERALL": {
211
- "Average Score": 0.797187842,
212
- "Standard Deviation": 0.0272375249,
213
- "Rank": 10
214
- },
215
- "Geometry": {
216
- "Average Score": 0.9947169,
217
- "Standard Deviation": 0.009150597621,
218
- "Rank": 2
219
- },
220
- "Algebra": {
221
- "Average Score": 0.857464301,
222
- "Standard Deviation": 0.05014285338,
223
- "Rank": 5
224
- },
225
- "Probability": {
226
- "Average Score": 0.651781767,
227
- "Standard Deviation": 0.04156998547,
228
- "Rank": 10
229
- },
230
- "Logical": {
231
- "Average Score": 0.739745471,
232
- "Standard Deviation": 0.01631532019,
233
- "Rank": 7
234
- },
235
- "Social": {
236
- "Average Score": 0.649601885,
237
- "Standard Deviation": 0.104854889,
238
- "Rank": 11
239
- }
240
- }
241
- },
242
- {
243
- "config": {
244
- "model_name": "qwen2-72b-instruct",
245
- "organization": "Alibaba",
246
- "license": "Qianwen LICENSE",
247
- "knowledge_cutoff": "2024/09"
248
- },
249
- "results": {
250
- "OVERALL": {
251
- "Average Score": 0.737918558,
252
- "Standard Deviation": 0.09069077339,
253
- "Rank": 11
254
- },
255
- "Geometry": {
256
- "Average Score": 0.796870305,
257
- "Standard Deviation": 0.0509025346,
258
- "Rank": 9
259
- },
260
- "Algebra": {
261
- "Average Score": 0.836194231,
262
- "Standard Deviation": 0.04517093028,
263
- "Rank": 6
264
- },
265
- "Probability": {
266
- "Average Score": 0.788068004,
267
- "Standard Deviation": 0.007288989044,
268
- "Rank": 7
269
- },
270
- "Logical": {
271
- "Average Score": 0.619300904,
272
- "Standard Deviation": 0.06377931612,
273
- "Rank": 14
274
- },
275
- "Social": {
276
- "Average Score": 0.652578786,
277
- "Standard Deviation": 0.04259293171,
278
- "Rank": 10
279
- },
280
- "Chemistry": {
281
- "Average Score": 73.54037778797029,
282
- "Standard Deviation": null,
283
- "Rank": 7
284
- },
285
- "CPP": {
286
- "Average Score": 73.54037778797029,
287
- "Standard Deviation": null,
288
- "Rank": 7
289
- }
290
- }
291
- },
292
- {
293
- "config": {
294
- "model_name": "gpt-4o-mini-2024-07-18",
295
- "organization": "OpenAI",
296
- "license": "Proprietary",
297
- "knowledge_cutoff": "2023/10"
298
- },
299
- "results": {
300
- "OVERALL": {
301
- "Average Score": 0.847694133,
302
- "Standard Deviation": 0.02164304402,
303
- "Rank": 5
304
- },
305
- "Geometry": {
306
- "Average Score": 0.946650435,
307
- "Standard Deviation": 0.01831236482,
308
- "Rank": 7
309
- },
310
- "Algebra": {
311
- "Average Score": 0.796243022,
312
- "Standard Deviation": 0.05537539202,
313
- "Rank": 7
314
- },
315
- "Probability": {
316
- "Average Score": 0.798402685,
317
- "Standard Deviation": 0.009404491967,
318
- "Rank": 5
319
- },
320
- "Logical": {
321
- "Average Score": 0.727009735,
322
- "Standard Deviation": 0.02628110141,
323
- "Rank": 8
324
- },
325
- "Social": {
326
- "Average Score": 0.691949855,
327
- "Standard Deviation": 0.02072934333,
328
- "Rank": 7
329
- },
330
- "Chemistry": {
331
- "Average Score": 88.3877070580296,
332
- "Standard Deviation": null,
333
- "Rank": 3
334
- },
335
- "CPP": {
336
- "Average Score": 88.3877070580296,
337
- "Standard Deviation": null,
338
- "Rank": 3
339
- }
340
- }
341
- },
342
- {
343
- "config": {
344
- "model_name": "claude-3.5-sonnet",
345
- "organization": "Anthropic",
346
- "license": "Proprietary",
347
- "knowledge_cutoff": "2024/04"
348
- },
349
- "results": {
350
- "OVERALL": {
351
- "Average Score": 0.839004422,
352
- "Standard Deviation": 0.1461079564,
353
- "Rank": 8
354
- },
355
- "Geometry": {
356
- "Average Score": 0.95316419,
357
- "Standard Deviation": 0.02081192856,
358
- "Rank": 6
359
- },
360
- "Algebra": {
361
- "Average Score": 0.759789952,
362
- "Standard Deviation": 0.02611765096,
363
- "Rank": 8
364
- },
365
- "Probability": {
366
- "Average Score": 0.707730127,
367
- "Standard Deviation": 0.0394436664,
368
- "Rank": 9
369
- },
370
- "Logical": {
371
- "Average Score": 0.77342666,
372
- "Standard Deviation": 0.002892426458,
373
- "Rank": 5
374
- },
375
- "Social": {
376
- "Average Score": 0.790002247,
377
- "Standard Deviation": 0.1007410022,
378
- "Rank": 4
379
- },
380
- "Chemistry": {
381
- "Average Score": 82.37734076815008,
382
- "Standard Deviation": null,
383
- "Rank": 4
384
- },
385
- "CPP": {
386
- "Average Score": 82.37734076815008,
387
- "Standard Deviation": null,
388
- "Rank": 5
389
- }
390
- }
391
- },
392
- {
393
- "config": {
394
- "model_name": "o1-mini",
395
- "organization": "OpenAI",
396
- "license": "Proprietary",
397
- "knowledge_cutoff": "2023/10"
398
- },
399
- "results": {
400
- "OVERALL": {
401
- "Average Score": 1.0,
402
- "Standard Deviation": 0.0,
403
- "Rank": 1
404
- },
405
- "Geometry": {
406
- "Average Score": "N/A",
407
- "Standard Deviation": "N/A",
408
- "Rank": "N/A"
409
- },
410
- "Algebra": {
411
- "Average Score": "N/A",
412
- "Standard Deviation": "N/A",
413
- "Rank": "N/A"
414
- },
415
- "Probability": {
416
- "Average Score": 1.0,
417
- "Standard Deviation": 0.0,
418
- "Rank": 1
419
- },
420
- "Logical": {
421
- "Average Score": 1.0,
422
- "Standard Deviation": 0.0,
423
- "Rank": 1
424
- },
425
- "Social": {
426
- "Average Score": 0.993974241,
427
- "Standard Deviation": 0.001996882328,
428
- "Rank": 2
429
- }
430
- }
431
- },
432
- {
433
- "config": {
434
- "model_name": "o1-preview",
435
- "organization": "OpenAI",
436
- "license": "Proprietary",
437
- "knowledge_cutoff": "2023/10"
438
- },
439
- "results": {
440
- "OVERALL": {
441
- "Average Score": 0.945884589,
442
- "Standard Deviation": 0.01059250762,
443
- "Rank": 3
444
- },
445
- "Geometry": {
446
- "Average Score": "N/A",
447
- "Standard Deviation": "N/A",
448
- "Rank": "N/A"
449
- },
450
- "Algebra": {
451
- "Average Score": "N/A",
452
- "Standard Deviation": "N/A",
453
- "Rank": "N/A"
454
- },
455
- "Probability": {
456
- "Average Score": 0.964666392,
457
- "Standard Deviation": 0.003139983398,
458
- "Rank": 2
459
- },
460
- "Logical": {
461
- "Average Score": 0.987950057,
462
- "Standard Deviation": 0.004881220327,
463
- "Rank": 2
464
- },
465
- "Social": {
466
- "Average Score": 1.0,
467
- "Standard Deviation": 0.0,
468
- "Rank": 1
469
- }
470
- }
471
- },
472
- {
473
- "config": {
474
- "model_name": "gemini-1.5-flash-001",
475
- "organization": "Google",
476
- "license": "Proprietary",
477
- "knowledge_cutoff": "2023/11"
478
- },
479
- "results": {
480
- "OVERALL": {
481
- "Average Score": 0.726493401,
482
- "Standard Deviation": 0.01113913725,
483
- "Rank": 12
484
- },
485
- "Geometry": {
486
- "Average Score": 0.804144103,
487
- "Standard Deviation": 0.1327142178,
488
- "Rank": 8
489
- },
490
- "Algebra": {
491
- "Average Score": 0.731776765,
492
- "Standard Deviation": 0.02594657111,
493
- "Rank": 9
494
- },
495
- "Probability": {
496
- "Average Score": 0.614461891,
497
- "Standard Deviation": 0.04690131826,
498
- "Rank": 13
499
- },
500
- "Logical": {
501
- "Average Score": 0.630805991,
502
- "Standard Deviation": 0.04871350612,
503
- "Rank": 13
504
- },
505
- "Social": {
506
- "Average Score": 0.555933822,
507
- "Standard Deviation": 0.1029934524,
508
- "Rank": 15
509
- },
510
- "Chemistry": {
511
- "Average Score": 72.1127762005651,
512
- "Standard Deviation": null,
513
- "Rank": 9
514
- },
515
- "CPP": {
516
- "Average Score": 72.1127762005651,
517
- "Standard Deviation": null,
518
- "Rank": 10
519
- }
520
- }
521
- },
522
- {
523
- "config": {
524
- "model_name": "gpt4-1106",
525
- "organization": "OpenAI",
526
- "license": "Proprietary",
527
- "knowledge_cutoff": "2024/04"
528
- },
529
- "results": {
530
- "OVERALL": {
531
- "Average Score": 0.816347784,
532
- "Standard Deviation": 0.1566815755,
533
- "Rank": 9
534
- },
535
- "Geometry": {
536
- "Average Score": 0.71843088,
537
- "Standard Deviation": 0.04778038294,
538
- "Rank": 11
539
- },
540
- "Algebra": {
541
- "Average Score": 0.712910417,
542
- "Standard Deviation": 0.02581828898,
543
- "Rank": 10
544
- },
545
- "Probability": {
546
- "Average Score": 0.623947619,
547
- "Standard Deviation": 0.03502982933,
548
- "Rank": 12
549
- },
550
- "Logical": {
551
- "Average Score": 0.637482274,
552
- "Standard Deviation": 0.04158809888,
553
- "Rank": 12
554
- },
555
- "Social": {
556
- "Average Score": 0.450609816,
557
- "Standard Deviation": 0.05208655446,
558
- "Rank": 21
559
- },
560
- "Chemistry": {
561
- "Average Score": 69.11824072252848,
562
- "Standard Deviation": null,
563
- "Rank": 11
564
- },
565
- "CPP": {
566
- "Average Score": 69.11824072252848,
567
- "Standard Deviation": null,
568
- "Rank": 12
569
- }
570
- }
571
- },
572
- {
573
- "config": {
574
- "model_name": "gemma-2-27b-it",
575
- "organization": "Google",
576
- "license": "Gemma License",
577
- "knowledge_cutoff": "2024/06"
578
- },
579
- "results": {
580
- "OVERALL": {
581
- "Average Score": 0.624169623,
582
- "Standard Deviation": 0.1048365121,
583
- "Rank": 14
584
- },
585
- "Geometry": {
586
- "Average Score": 0.60112744,
587
- "Standard Deviation": 0.0469109952,
588
- "Rank": 17
589
- },
590
- "Algebra": {
591
- "Average Score": 0.687955914,
592
- "Standard Deviation": 0.01959958192,
593
- "Rank": 11
594
- },
595
- "Probability": {
596
- "Average Score": 0.589524771,
597
- "Standard Deviation": 0.03112689325,
598
- "Rank": 14
599
- },
600
- "Logical": {
601
- "Average Score": 0.614978944,
602
- "Standard Deviation": 0.05710657859,
603
- "Rank": 15
604
- },
605
- "Social": {
606
- "Average Score": 0.487844257,
607
- "Standard Deviation": 0.05857760809,
608
- "Rank": 18
609
- },
610
- "Chemistry": {
611
- "Average Score": 63.28920072143611,
612
- "Standard Deviation": null,
613
- "Rank": 13
614
- },
615
- "CPP": {
616
- "Average Score": 63.28920072143611,
617
- "Standard Deviation": null,
618
- "Rank": 14
619
- }
620
- }
621
- },
622
- {
623
- "config": {
624
- "model_name": "claude-3-opus",
625
- "organization": "Anthropic",
626
- "license": "Proprietary",
627
- "knowledge_cutoff": "2023/08"
628
- },
629
- "results": {
630
- "OVERALL": {
631
- "Average Score": 0.650636271,
632
- "Standard Deviation": 0.1197773541,
633
- "Rank": 13
634
- },
635
- "Geometry": {
636
- "Average Score": 0.7215743,
637
- "Standard Deviation": 0.04712598358,
638
- "Rank": 10
639
- },
640
- "Algebra": {
641
- "Average Score": 0.68777327,
642
- "Standard Deviation": 0.02382683713,
643
- "Rank": 12
644
- },
645
- "Probability": {
646
- "Average Score": 0.626471421,
647
- "Standard Deviation": 0.02911817976,
648
- "Rank": 11
649
- },
650
- "Logical": {
651
- "Average Score": 0.692346381,
652
- "Standard Deviation": 0.03617185198,
653
- "Rank": 10
654
- },
655
- "Social": {
656
- "Average Score": 0.663410854,
657
- "Standard Deviation": 0.09540220876,
658
- "Rank": 9
659
- },
660
- "Chemistry": {
661
- "Average Score": 73.5404403567132,
662
- "Standard Deviation": null,
663
- "Rank": 6
664
- },
665
- "CPP": {
666
- "Average Score": 73.5404403567132,
667
- "Standard Deviation": null,
668
- "Rank": 8
669
- }
670
- }
671
- },
672
- {
673
- "config": {
674
- "model_name": "gemma-2-9b-it-simpo",
675
- "organization": "Google",
676
- "license": "Gemma License",
677
- "knowledge_cutoff": "2024/07"
678
- },
679
- "results": {
680
- "OVERALL": {
681
- "Average Score": "N/A",
682
- "Standard Deviation": "N/A",
683
- "Rank": "N/A"
684
- },
685
- "Geometry": {
686
- "Average Score": 0.582787508,
687
- "Standard Deviation": 0.03965204074,
688
- "Rank": 18
689
- },
690
- "Algebra": {
691
- "Average Score": 0.658648133,
692
- "Standard Deviation": 0.02565919856,
693
- "Rank": 13
694
- },
695
- "Probability": {
696
- "Average Score": 0.547861265,
697
- "Standard Deviation": 0.02885209131,
698
- "Rank": 17
699
- },
700
- "Logical": {
701
- "Average Score": 0.540720893,
702
- "Standard Deviation": 0.01970134508,
703
- "Rank": 19
704
- },
705
- "Social": {
706
- "Average Score": 0.635266187,
707
- "Standard Deviation": 0.03620021751,
708
- "Rank": 12
709
- },
710
- "Chemistry": {
711
- "Average Score": 73.43757596214863,
712
- "Standard Deviation": null,
713
- "Rank": 8
714
- },
715
- "CPP": {
716
- "Average Score": 73.43757596214863,
717
- "Standard Deviation": null,
718
- "Rank": 9
719
- }
720
- }
721
- },
722
- {
723
- "config": {
724
- "model_name": "qwen1.5-72b-chat",
725
- "organization": "Alibaba",
726
- "license": "Qianwen LICENSE",
727
- "knowledge_cutoff": "2024/03"
728
- },
729
- "results": {
730
- "OVERALL": {
731
- "Average Score": 0.519549796,
732
- "Standard Deviation": 0.00903634343,
733
- "Rank": 17
734
- },
735
- "Geometry": {
736
- "Average Score": 0.543139301,
737
- "Standard Deviation": 0.03425202326,
738
- "Rank": 22
739
- },
740
- "Algebra": {
741
- "Average Score": 0.635228729,
742
- "Standard Deviation": 0.01944043425,
743
- "Rank": 14
744
- },
745
- "Probability": {
746
- "Average Score": 0.486948658,
747
- "Standard Deviation": 0.06064655315,
748
- "Rank": 21
749
- },
750
- "Logical": {
751
- "Average Score": 0.284069394,
752
- "Standard Deviation": 0.02686608506,
753
- "Rank": 32
754
- },
755
- "Social": {
756
- "Average Score": 0.415007627,
757
- "Standard Deviation": 0.03920053159,
758
- "Rank": 22
759
- },
760
- "Chemistry": {
761
- "Average Score": 48.69302376665551,
762
- "Standard Deviation": null,
763
- "Rank": 19
764
- },
765
- "CPP": {
766
- "Average Score": 48.69302376665551,
767
- "Standard Deviation": null,
768
- "Rank": 20
769
- }
770
- }
771
- },
772
- {
773
- "config": {
774
- "model_name": "qwen1.5-32b-chat",
775
- "organization": "Alibaba",
776
- "license": "Qianwen LICENSE",
777
- "knowledge_cutoff": "2024/03"
778
- },
779
- "results": {
780
- "OVERALL": {
781
- "Average Score": 0.393789407,
782
- "Standard Deviation": 0.05413770095,
783
- "Rank": 28
784
- },
785
- "Geometry": {
786
- "Average Score": 0.51086835,
787
- "Standard Deviation": 0.04052471998,
788
- "Rank": 25
789
- },
790
- "Algebra": {
791
- "Average Score": 0.609003168,
792
- "Standard Deviation": 0.04874143541,
793
- "Rank": 15
794
- },
795
- "Probability": {
796
- "Average Score": 0.476300002,
797
- "Standard Deviation": 0.05322403912,
798
- "Rank": 22
799
- },
800
- "Logical": {
801
- "Average Score": 0.331781014,
802
- "Standard Deviation": 0.004938997686,
803
- "Rank": 29
804
- },
805
- "Social": {
806
- "Average Score": 0.380987334,
807
- "Standard Deviation": 0.03762251776,
808
- "Rank": 24
809
- },
810
- "Chemistry": {
811
- "Average Score": 45.14284028264288,
812
- "Standard Deviation": null,
813
- "Rank": 23
814
- },
815
- "CPP": {
816
- "Average Score": 45.14284028264288,
817
- "Standard Deviation": null,
818
- "Rank": 24
819
- }
820
- }
821
- },
822
- {
823
- "config": {
824
- "model_name": "google-gemma-2-9b-it",
825
- "organization": "Google",
826
- "license": "Proprietary",
827
- "knowledge_cutoff": "2024/06"
828
- },
829
- "results": {
830
- "OVERALL": {
831
- "Average Score": 0.489663449,
832
- "Standard Deviation": 0.002595702019,
833
- "Rank": 20
834
- },
835
- "Geometry": {
836
- "Average Score": 0.575371308,
837
- "Standard Deviation": 0.03556220251,
838
- "Rank": 20
839
- },
840
- "Algebra": {
841
- "Average Score": 0.597045661,
842
- "Standard Deviation": 0.0313828123,
843
- "Rank": 16
844
- },
845
- "Probability": {
846
- "Average Score": 0.589221807,
847
- "Standard Deviation": 0.03110811656,
848
- "Rank": 16
849
- },
850
- "Logical": {
851
- "Average Score": 0.587579897,
852
- "Standard Deviation": 0.05512716783,
853
- "Rank": 17
854
- },
855
- "Social": {
856
- "Average Score": 0.768337958,
857
- "Standard Deviation": 0.04078610476,
858
- "Rank": 5
859
- },
860
- "Chemistry": {
861
- "Average Score": 54.03167523687635,
862
- "Standard Deviation": null,
863
- "Rank": 16
864
- },
865
- "CPP": {
866
- "Average Score": 54.03167523687635,
867
- "Standard Deviation": null,
868
- "Rank": 17
869
- }
870
- }
871
- },
872
- {
873
- "config": {
874
- "model_name": "yi-1.5-34b-chat",
875
- "organization": "01 AI",
876
- "license": "Proprietary",
877
- "knowledge_cutoff": "2024/05"
878
- },
879
- "results": {
880
- "OVERALL": {
881
- "Average Score": 0.607812897,
882
- "Standard Deviation": 0.1440881293,
883
- "Rank": 15
884
- },
885
- "Geometry": {
886
- "Average Score": 0.566666724,
887
- "Standard Deviation": 0.04001381658,
888
- "Rank": 21
889
- },
890
- "Algebra": {
891
- "Average Score": 0.590997292,
892
- "Standard Deviation": 0.03594087315,
893
- "Rank": 17
894
- },
895
- "Probability": {
896
- "Average Score": 0.589524589,
897
- "Standard Deviation": 0.03112618772,
898
- "Rank": 15
899
- },
900
- "Logical": {
901
- "Average Score": 0.574105508,
902
- "Standard Deviation": 0.03441737941,
903
- "Rank": 18
904
- },
905
- "Social": {
906
- "Average Score": 0.516980832,
907
- "Standard Deviation": 0.03369347985,
908
- "Rank": 17
909
- },
910
- "Chemistry": {
911
- "Average Score": 52.148798061768964,
912
- "Standard Deviation": null,
913
- "Rank": 17
914
- },
915
- "CPP": {
916
- "Average Score": 52.148798061768964,
917
- "Standard Deviation": null,
918
- "Rank": 18
919
- }
920
- }
921
- },
922
- {
923
- "config": {
924
- "model_name": "meta-llama-3.1-70b-instruct",
925
- "organization": "Meta",
926
- "license": "Llama 3.1 Community",
927
- "knowledge_cutoff": "2023/12"
928
- },
929
- "results": {
930
- "OVERALL": {
931
- "Average Score": 0.708874896,
932
- "Standard Deviation": 0.1315111956,
933
- "Rank": 13
934
- },
935
- "Geometry": {
936
- "Average Score": 0.76184398,
937
- "Standard Deviation": 0.01790377984,
938
- "Rank": 10
939
- },
940
- "Algebra": {
941
- "Average Score": 0.732041699,
942
- "Standard Deviation": 0.02621439062,
943
- "Rank": 9
944
- },
945
- "Probability": {
946
- "Average Score": 0.676208383,
947
- "Standard Deviation": 0.05131201636,
948
- "Rank": 10
949
- },
950
- "Logical": {
951
- "Average Score": 0.620018631,
952
- "Standard Deviation": 0.02518873821,
953
- "Rank": 14
954
- },
955
- "Social": {
956
- "Average Score": 0.45872939,
957
- "Standard Deviation": 0.05347039576,
958
- "Rank": 20
959
- },
960
- "Chemistry": {
961
- "Average Score": 84.36815192532764,
962
- "Standard Deviation": null,
963
- "Rank": 4
964
- },
965
- "CPP": {
966
- "Average Score": 84.36815192532764,
967
- "Standard Deviation": null,
968
- "Rank": 4
969
- }
970
- }
971
- },
972
- {
973
- "config": {
974
- "model_name": "meta-llama-3.1-8b-instruct",
975
- "organization": "Meta",
976
- "license": "Llama 3.1 Community",
977
- "knowledge_cutoff": "2023/12"
978
- },
979
- "results": {
980
- "OVERALL": {
981
- "Average Score": 0.505936324,
982
- "Standard Deviation": 0.05286756493,
983
- "Rank": 18
984
- },
985
- "Geometry": {
986
- "Average Score": 0.522442162,
987
- "Standard Deviation": 0.03908236317,
988
- "Rank": 23
989
- },
990
- "Algebra": {
991
- "Average Score": 0.582702645,
992
- "Standard Deviation": 0.05002277711,
993
- "Rank": 18
994
- },
995
- "Probability": {
996
- "Average Score": 0.495001149,
997
- "Standard Deviation": 0.05244587037,
998
- "Rank": 20
999
- },
1000
- "Logical": {
1001
- "Average Score": 0.443030561,
1002
- "Standard Deviation": 0.01343820628,
1003
- "Rank": 24
1004
- },
1005
- "Social": {
1006
- "Average Score": 0.329195941,
1007
- "Standard Deviation": 0.03925019528,
1008
- "Rank": 28
1009
- },
1010
- "Chemistry": {
1011
- "Average Score": 44.41846841004584,
1012
- "Standard Deviation": null,
1013
- "Rank": 25
1014
- },
1015
- "CPP": {
1016
- "Average Score": 44.41846841004584,
1017
- "Standard Deviation": null,
1018
- "Rank": 26
1019
- }
1020
- }
1021
- },
1022
- {
1023
- "config": {
1024
- "model_name": "gpt3.5-turbo-0125",
1025
- "organization": "OpenAI",
1026
- "license": "Proprietary",
1027
- "knowledge_cutoff": "2021/09"
1028
- },
1029
- "results": {
1030
- "OVERALL": {
1031
- "Average Score": 0.313398088,
1032
- "Standard Deviation": 0.09322528606,
1033
- "Rank": 39
1034
- },
1035
- "Geometry": {
1036
- "Average Score": 0.678714519,
1037
- "Standard Deviation": 0.05926546762,
1038
- "Rank": 12
1039
- },
1040
- "Algebra": {
1041
- "Average Score": 0.569296173,
1042
- "Standard Deviation": 0.05277281097,
1043
- "Rank": 19
1044
- },
1045
- "Probability": {
1046
- "Average Score": 0.448460767,
1047
- "Standard Deviation": 0.05768095196,
1048
- "Rank": 24
1049
- },
1050
- "Logical": {
1051
- "Average Score": 0.148521348,
1052
- "Standard Deviation": 0.04033712907,
1053
- "Rank": 44
1054
- },
1055
- "Social": {
1056
- "Average Score": 0.235071541,
1057
- "Standard Deviation": 0.02632892457,
1058
- "Rank": 37
1059
- },
1060
- "Chemistry": {
1061
- "Average Score": 40.46958736582551,
1062
- "Standard Deviation": null,
1063
- "Rank": 28
1064
- },
1065
- "CPP": {
1066
- "Average Score": 40.46958736582551,
1067
- "Standard Deviation": null,
1068
- "Rank": 29
1069
- }
1070
- }
1071
- },
1072
- {
1073
- "config": {
1074
- "model_name": "llama-3-70b-instruct",
1075
- "organization": "Meta",
1076
- "license": "Llama 3 Community",
1077
- "knowledge_cutoff": "2023/12"
1078
- },
1079
- "results": {
1080
- "OVERALL": {
1081
- "Average Score": 0.456689885,
1082
- "Standard Deviation": 0.01385989995,
1083
- "Rank": 22
1084
- },
1085
- "Geometry": {
1086
- "Average Score": 0.516865529,
1087
- "Standard Deviation": 0.03858112564,
1088
- "Rank": 24
1089
- },
1090
- "Algebra": {
1091
- "Average Score": 0.566756531,
1092
- "Standard Deviation": 0.03369826926,
1093
- "Rank": 20
1094
- },
1095
- "Probability": {
1096
- "Average Score": 0.513857306,
1097
- "Standard Deviation": 0.05453699062,
1098
- "Rank": 19
1099
- },
1100
- "Logical": {
1101
- "Average Score": 0.713796415,
1102
- "Standard Deviation": 0.02031215107,
1103
- "Rank": 9
1104
- },
1105
- "Social": {
1106
- "Average Score": 0.45872939,
1107
- "Standard Deviation": 0.05347039576,
1108
- "Rank": 20
1109
- },
1110
- "Chemistry": {
1111
- "Average Score": 65.32140697218945,
1112
- "Standard Deviation": null,
1113
- "Rank": 12
1114
- },
1115
- "CPP": {
1116
- "Average Score": 65.32140697218945,
1117
- "Standard Deviation": null,
1118
- "Rank": 13
1119
- }
1120
- }
1121
- },
1122
- {
1123
- "config": {
1124
- "model_name": "claude-3-sonnet",
1125
- "organization": "Anthropic",
1126
- "license": "Proprietary",
1127
- "knowledge_cutoff": "2023/08"
1128
- },
1129
- "results": {
1130
- "OVERALL": {
1131
- "Average Score": 0.520010833,
1132
- "Standard Deviation": 0.005030563799,
1133
- "Rank": 16
1134
- },
1135
- "Geometry": {
1136
- "Average Score": 0.675613638,
1137
- "Standard Deviation": 0.05275594408,
1138
- "Rank": 13
1139
- },
1140
- "Algebra": {
1141
- "Average Score": 0.552025728,
1142
- "Standard Deviation": 0.04122192409,
1143
- "Rank": 21
1144
- },
1145
- "Probability": {
1146
- "Average Score": 0.516192848,
1147
- "Standard Deviation": 0.04152293217,
1148
- "Rank": 18
1149
- },
1150
- "Logical": {
1151
- "Average Score": 0.588545747,
1152
- "Standard Deviation": 0.06068211943,
1153
- "Rank": 16
1154
- },
1155
- "Social": {
1156
- "Average Score": 0.570437582,
1157
- "Standard Deviation": 0.08607040862,
1158
- "Rank": 14
1159
- },
1160
- "Chemistry": {
1161
- "Average Score": 61.33538592327427,
1162
- "Standard Deviation": null,
1163
- "Rank": 14
1164
- },
1165
- "CPP": {
1166
- "Average Score": 61.33538592327427,
1167
- "Standard Deviation": null,
1168
- "Rank": 15
1169
- }
1170
- }
1171
- },
1172
- {
1173
- "config": {
1174
- "model_name": "qwen1.5-14b-chat",
1175
- "organization": "Alibaba",
1176
- "license": "Qianwen LICENSE",
1177
- "knowledge_cutoff": "2024/02"
1178
- },
1179
- "results": {
1180
- "OVERALL": {
1181
- "Average Score": 0.415328996,
1182
- "Standard Deviation": 0.0743938717,
1183
- "Rank": 27
1184
- },
1185
- "Geometry": {
1186
- "Average Score": 0.452504016,
1187
- "Standard Deviation": 0.04225594393,
1188
- "Rank": 26
1189
- },
1190
- "Algebra": {
1191
- "Average Score": 0.538655725,
1192
- "Standard Deviation": 0.03721542594,
1193
- "Rank": 22
1194
- },
1195
- "Probability": {
1196
- "Average Score": 0.397185975,
1197
- "Standard Deviation": 0.05607695946,
1198
- "Rank": 28
1199
- },
1200
- "Logical": {
1201
- "Average Score": 0.264573129,
1202
- "Standard Deviation": 0.03936133174,
1203
- "Rank": 34
1204
- },
1205
- "Social": {
1206
- "Average Score": 0.287370142,
1207
- "Standard Deviation": 0.04264085315,
1208
- "Rank": 30
1209
- },
1210
- "Chemistry": {
1211
- "Average Score": 38.552779976347026,
1212
- "Standard Deviation": null,
1213
- "Rank": 30
1214
- },
1215
- "CPP": {
1216
- "Average Score": 38.552779976347026,
1217
- "Standard Deviation": null,
1218
- "Rank": 31
1219
- }
1220
- }
1221
- },
1222
- {
1223
- "config": {
1224
- "model_name": "claude-3-haiku",
1225
- "organization": "Anthropic",
1226
- "license": "Proprietary",
1227
- "knowledge_cutoff": "2023/08"
1228
- },
1229
- "results": {
1230
- "OVERALL": {
1231
- "Average Score": 0.453901163,
1232
- "Standard Deviation": 0.003604084261,
1233
- "Rank": 23
1234
- },
1235
- "Geometry": {
1236
- "Average Score": 0.607993912,
1237
- "Standard Deviation": 0.05793460748,
1238
- "Rank": 15
1239
- },
1240
- "Algebra": {
1241
- "Average Score": 0.520054055,
1242
- "Standard Deviation": 0.03333544511,
1243
- "Rank": 23
1244
- },
1245
- "Probability": {
1246
- "Average Score": 0.474460688,
1247
- "Standard Deviation": 0.0446501933,
1248
- "Rank": 23
1249
- },
1250
- "Logical": {
1251
- "Average Score": 0.512815976,
1252
- "Standard Deviation": 0.0163264281,
1253
- "Rank": 20
1254
- },
1255
- "Social": {
1256
- "Average Score": 0.551083976,
1257
- "Standard Deviation": 0.05374722539,
1258
- "Rank": 16
1259
- },
1260
- "Chemistry": {
1261
- "Average Score": 56.40200048817984,
1262
- "Standard Deviation": null,
1263
- "Rank": 15
1264
- },
1265
- "CPP": {
1266
- "Average Score": 56.40200048817984,
1267
- "Standard Deviation": null,
1268
- "Rank": 16
1269
- }
1270
- }
1271
- },
1272
- {
1273
- "config": {
1274
- "model_name": "claude-2.1",
1275
- "organization": "Anthropic",
1276
- "license": "Proprietary",
1277
- "knowledge_cutoff": "Unknown"
1278
- },
1279
- "results": {
1280
- "OVERALL": {
1281
- "Average Score": 0.35814708,
1282
- "Standard Deviation": 0.09168134168,
1283
- "Rank": 35
1284
- },
1285
- "Geometry": {
1286
- "Average Score": 0.62752395,
1287
- "Standard Deviation": 0.07232659398,
1288
- "Rank": 14
1289
- },
1290
- "Algebra": {
1291
- "Average Score": 0.508849609,
1292
- "Standard Deviation": 0.0346897465,
1293
- "Rank": 24
1294
- },
1295
- "Probability": {
1296
- "Average Score": 0.41477086,
1297
- "Standard Deviation": 0.05964060239,
1298
- "Rank": 27
1299
- },
1300
- "Logical": {
1301
- "Average Score": 0.482923674,
1302
- "Standard Deviation": 0.01989147048,
1303
- "Rank": 21
1304
- },
1305
- "Social": {
1306
- "Average Score": 0.333804568,
1307
- "Standard Deviation": 0.03775548253,
1308
- "Rank": 27
1309
- },
1310
- "Chemistry": {
1311
- "Average Score": 47.23672563994903,
1312
- "Standard Deviation": null,
1313
- "Rank": 20
1314
- },
1315
- "CPP": {
1316
- "Average Score": 47.23672563994903,
1317
- "Standard Deviation": null,
1318
- "Rank": 21
1319
- }
1320
- }
1321
- },
1322
- {
1323
- "config": {
1324
- "model_name": "mistral-8x7b-instruct-v0.1",
1325
- "organization": "Mistral",
1326
- "license": "Apache 2.0",
1327
- "knowledge_cutoff": "2023/12"
1328
- },
1329
- "results": {
1330
- "OVERALL": {
1331
- "Average Score": 0.382659161,
1332
- "Standard Deviation": 0.07594496929,
1333
- "Rank": 30
1334
- },
1335
- "Geometry": {
1336
- "Average Score": 0.432216097,
1337
- "Standard Deviation": 0.04747949254,
1338
- "Rank": 29
1339
- },
1340
- "Algebra": {
1341
- "Average Score": 0.478314888,
1342
- "Standard Deviation": 0.01998797419,
1343
- "Rank": 25
1344
- },
1345
- "Probability": {
1346
- "Average Score": 0.427144725,
1347
- "Standard Deviation": 0.0590923329,
1348
- "Rank": 26
1349
- },
1350
- "Logical": {
1351
- "Average Score": 0.340041983,
1352
- "Standard Deviation": 0.008397574592,
1353
- "Rank": 27
1354
- },
1355
- "Social": {
1356
- "Average Score": 0.251949622,
1357
- "Standard Deviation": 0.03346674405,
1358
- "Rank": 35
1359
- },
1360
- "Chemistry": {
1361
- "Average Score": 44.533118241976666,
1362
- "Standard Deviation": null,
1363
- "Rank": 24
1364
- },
1365
- "CPP": {
1366
- "Average Score": 44.533118241976666,
1367
- "Standard Deviation": null,
1368
- "Rank": 25
1369
- }
1370
- }
1371
- },
1372
- {
1373
- "config": {
1374
- "model_name": "claude-2.0",
1375
- "organization": "Anthropic",
1376
- "license": "Proprietary",
1377
- "knowledge_cutoff": "Unknown"
1378
- },
1379
- "results": {
1380
- "OVERALL": {
1381
- "Average Score": 0.322718057,
1382
- "Standard Deviation": 0.08369883584,
1383
- "Rank": 37
1384
- },
1385
- "Geometry": {
1386
- "Average Score": 0.604141967,
1387
- "Standard Deviation": 0.05116441826,
1388
- "Rank": 16
1389
- },
1390
- "Algebra": {
1391
- "Average Score": 0.474350734,
1392
- "Standard Deviation": 0.01510393066,
1393
- "Rank": 26
1394
- },
1395
- "Probability": {
1396
- "Average Score": 0.437950412,
1397
- "Standard Deviation": 0.05985594317,
1398
- "Rank": 25
1399
- },
1400
- "Logical": {
1401
- "Average Score": 0.445620646,
1402
- "Standard Deviation": 0.01812614805,
1403
- "Rank": 23
1404
- },
1405
- "Social": {
1406
- "Average Score": 0.469422836,
1407
- "Standard Deviation": 0.05999901796,
1408
- "Rank": 19
1409
- },
1410
- "Chemistry": {
1411
- "Average Score": 50.773143448036464,
1412
- "Standard Deviation": null,
1413
- "Rank": 18
1414
- },
1415
- "CPP": {
1416
- "Average Score": 50.773143448036464,
1417
- "Standard Deviation": null,
1418
- "Rank": 19
1419
- }
1420
- }
1421
- },
1422
- {
1423
- "config": {
1424
- "model_name": "starling-lm-7b-beta",
1425
- "organization": "Nexusflow",
1426
- "license": "Apache-2.0",
1427
- "knowledge_cutoff": "2024/03"
1428
- },
1429
- "results": {
1430
- "OVERALL": {
1431
- "Average Score": 0.479391856,
1432
- "Standard Deviation": 0.04199990887,
1433
- "Rank": 21
1434
- },
1435
- "Geometry": {
1436
- "Average Score": 0.446654388,
1437
- "Standard Deviation": 0.05637864999,
1438
- "Rank": 28
1439
- },
1440
- "Algebra": {
1441
- "Average Score": 0.473952749,
1442
- "Standard Deviation": 0.01584301288,
1443
- "Rank": 27
1444
- },
1445
- "Probability": {
1446
- "Average Score": 0.395197837,
1447
- "Standard Deviation": 0.05814798892,
1448
- "Rank": 29
1449
- },
1450
- "Logical": {
1451
- "Average Score": 0.39927199,
1452
- "Standard Deviation": 0.02125277518,
1453
- "Rank": 25
1454
- },
1455
- "Social": {
1456
- "Average Score": 0.380021662,
1457
- "Standard Deviation": 0.04622452748,
1458
- "Rank": 25
1459
- },
1460
- "Chemistry": {
1461
- "Average Score": 38.27587102395908,
1462
- "Standard Deviation": null,
1463
- "Rank": 31
1464
- },
1465
- "CPP": {
1466
- "Average Score": 38.27587102395908,
1467
- "Standard Deviation": null,
1468
- "Rank": 32
1469
- }
1470
- }
1471
- },
1472
- {
1473
- "config": {
1474
- "model_name": "gemini-1.0-pro-001",
1475
- "organization": "Google",
1476
- "license": "Proprietary",
1477
- "knowledge_cutoff": "2023/04"
1478
- },
1479
- "results": {
1480
- "OVERALL": {
1481
- "Average Score": 0.449040654,
1482
- "Standard Deviation": 0.0450610177,
1483
- "Rank": 24
1484
- },
1485
- "Geometry": {
1486
- "Average Score": 0.578347959,
1487
- "Standard Deviation": 0.04242873607,
1488
- "Rank": 19
1489
- },
1490
- "Algebra": {
1491
- "Average Score": 0.462417786,
1492
- "Standard Deviation": 0.01668313635,
1493
- "Rank": 28
1494
- },
1495
- "Probability": {
1496
- "Average Score": 0.289836324,
1497
- "Standard Deviation": 0.05739831115,
1498
- "Rank": 37
1499
- },
1500
- "Logical": {
1501
- "Average Score": 0.191140355,
1502
- "Standard Deviation": 0.03394652499,
1503
- "Rank": 40
1504
- },
1505
- "Social": {
1506
- "Average Score": 0.130790863,
1507
- "Standard Deviation": 0.02800188173,
1508
- "Rank": 45
1509
- },
1510
- "Chemistry": {
1511
- "Average Score": 45.22204471452975,
1512
- "Standard Deviation": null,
1513
- "Rank": 22
1514
- },
1515
- "CPP": {
1516
- "Average Score": 45.22204471452975,
1517
- "Standard Deviation": null,
1518
- "Rank": 23
1519
- }
1520
- }
1521
- },
1522
- {
1523
- "config": {
1524
- "model_name": "openchat-3.5-0106",
1525
- "organization": "OpenChat",
1526
- "license": "Apache-2.0",
1527
- "knowledge_cutoff": "2024/01"
1528
- },
1529
- "results": {
1530
- "OVERALL": {
1531
- "Average Score": 0.363929888,
1532
- "Standard Deviation": 0.08602347145,
1533
- "Rank": 33
1534
- },
1535
- "Geometry": {
1536
- "Average Score": 0.38715246,
1537
- "Standard Deviation": 0.03701851946,
1538
- "Rank": 32
1539
- },
1540
- "Algebra": {
1541
- "Average Score": 0.441233712,
1542
- "Standard Deviation": 0.01135753754,
1543
- "Rank": 29
1544
- },
1545
- "Probability": {
1546
- "Average Score": 0.38802618,
1547
- "Standard Deviation": 0.05663879714,
1548
- "Rank": 30
1549
- },
1550
- "Logical": {
1551
- "Average Score": 0.336754383,
1552
- "Standard Deviation": 0.01608478079,
1553
- "Rank": 28
1554
- },
1555
- "Social": {
1556
- "Average Score": 0.250891608,
1557
- "Standard Deviation": 0.03253769914,
1558
- "Rank": 36
1559
- },
1560
- "Chemistry": {
1561
- "Average Score": 33.70639271807677,
1562
- "Standard Deviation": null,
1563
- "Rank": 32
1564
- },
1565
- "CPP": {
1566
- "Average Score": 33.70639271807677,
1567
- "Standard Deviation": null,
1568
- "Rank": 33
1569
- }
1570
- }
1571
- },
1572
- {
1573
- "config": {
1574
- "model_name": "openchat-3.5",
1575
- "organization": "OpenChat",
1576
- "license": "Apache-2.0",
1577
- "knowledge_cutoff": "2023/11"
1578
- },
1579
- "results": {
1580
- "OVERALL": {
1581
- "Average Score": 0.361341296,
1582
- "Standard Deviation": 0.09034869493,
1583
- "Rank": 34
1584
- },
1585
- "Geometry": {
1586
- "Average Score": 0.401699069,
1587
- "Standard Deviation": 0.03410726557,
1588
- "Rank": 30
1589
- },
1590
- "Algebra": {
1591
- "Average Score": 0.414095336,
1592
- "Standard Deviation": 0.01881964261,
1593
- "Rank": 31
1594
- },
1595
- "Probability": {
1596
- "Average Score": 0.349601002,
1597
- "Standard Deviation": 0.05077455539,
1598
- "Rank": 32
1599
- },
1600
- "Logical": {
1601
- "Average Score": 0.331069242,
1602
- "Standard Deviation": 0.02180827173,
1603
- "Rank": 30
1604
- },
1605
- "Social": {
1606
- "Average Score": 0.319991655,
1607
- "Standard Deviation": 0.04502478724,
1608
- "Rank": 29
1609
- },
1610
- "Chemistry": {
1611
- "Average Score": 33.020911255646965,
1612
- "Standard Deviation": null,
1613
- "Rank": 33
1614
- },
1615
- "CPP": {
1616
- "Average Score": 33.020911255646965,
1617
- "Standard Deviation": null,
1618
- "Rank": 34
1619
- }
1620
- }
1621
- },
1622
- {
1623
- "config": {
1624
- "model_name": "command-r-(08-2024)",
1625
- "organization": "Cohere",
1626
- "license": "CC-BY-NC-4.0",
1627
- "knowledge_cutoff": "2024/08"
1628
- },
1629
- "results": {
1630
- "OVERALL": {
1631
- "Average Score": 0.427605298,
1632
- "Standard Deviation": 0.01747449163,
1633
- "Rank": 25
1634
- },
1635
- "Geometry": {
1636
- "Average Score": 0.448300727,
1637
- "Standard Deviation": 0.04996362328,
1638
- "Rank": 27
1639
- },
1640
- "Algebra": {
1641
- "Average Score": 0.417519167,
1642
- "Standard Deviation": 0.01822196902,
1643
- "Rank": 30
1644
- },
1645
- "Probability": {
1646
- "Average Score": 0.366336281,
1647
- "Standard Deviation": 0.04716826942,
1648
- "Rank": 31
1649
- },
1650
- "Logical": {
1651
- "Average Score": 0.214657906,
1652
- "Standard Deviation": 0.03003579835,
1653
- "Rank": 37
1654
- },
1655
- "Social": {
1656
- "Average Score": 0.276088379,
1657
- "Standard Deviation": 0.03295234688,
1658
- "Rank": 32
1659
- },
1660
- "Chemistry": {
1661
- "Average Score": 39.61492485677676,
1662
- "Standard Deviation": null,
1663
- "Rank": 29
1664
- },
1665
- "CPP": {
1666
- "Average Score": 39.61492485677676,
1667
- "Standard Deviation": null,
1668
- "Rank": 30
1669
- }
1670
- }
1671
- },
1672
- {
1673
- "config": {
1674
- "model_name": "gemma-1.1-7b-it",
1675
- "organization": "Google",
1676
- "license": "Gemma License",
1677
- "knowledge_cutoff": "2024/02"
1678
- },
1679
- "results": {
1680
- "OVERALL": {
1681
- "Average Score": 0.339506922,
1682
- "Standard Deviation": 0.1066279108,
1683
- "Rank": 36
1684
- },
1685
- "Geometry": {
1686
- "Average Score": 0.324170977,
1687
- "Standard Deviation": 0.04668553765,
1688
- "Rank": 35
1689
- },
1690
- "Algebra": {
1691
- "Average Score": 0.398684697,
1692
- "Standard Deviation": 0.01982398259,
1693
- "Rank": 32
1694
- },
1695
- "Probability": {
1696
- "Average Score": 0.293253175,
1697
- "Standard Deviation": 0.05126192191,
1698
- "Rank": 36
1699
- },
1700
- "Logical": {
1701
- "Average Score": 0.317750796,
1702
- "Standard Deviation": 0.01101933543,
1703
- "Rank": 31
1704
- },
1705
- "Social": {
1706
- "Average Score": 0.179073276,
1707
- "Standard Deviation": 0.02009658805,
1708
- "Rank": 41
1709
- },
1710
- "Chemistry": {
1711
- "Average Score": 42.666504105798204,
1712
- "Standard Deviation": null,
1713
- "Rank": 26
1714
- },
1715
- "CPP": {
1716
- "Average Score": 42.666504105798204,
1717
- "Standard Deviation": null,
1718
- "Rank": 27
1719
- }
1720
- }
1721
- },
1722
- {
1723
- "config": {
1724
- "model_name": "llama3-8b-instruct",
1725
- "organization": "Meta",
1726
- "license": "Llama 3 Community",
1727
- "knowledge_cutoff": "2023/03"
1728
- },
1729
- "results": {
1730
- "OVERALL": {
1731
- "Average Score": 0.367722676,
1732
- "Standard Deviation": 0.1071368221,
1733
- "Rank": 31
1734
- },
1735
- "Geometry": {
1736
- "Average Score": 0.367143758,
1737
- "Standard Deviation": 0.04363680358,
1738
- "Rank": 33
1739
- },
1740
- "Algebra": {
1741
- "Average Score": 0.391480973,
1742
- "Standard Deviation": 0.02757445266,
1743
- "Rank": 33
1744
- },
1745
- "Probability": {
1746
- "Average Score": 0.317616445,
1747
- "Standard Deviation": 0.04300430361,
1748
- "Rank": 35
1749
- },
1750
- "Logical": {
1751
- "Average Score": 0.461607495,
1752
- "Standard Deviation": 0.02185028842,
1753
- "Rank": 22
1754
- },
1755
- "Social": {
1756
- "Average Score": 0.336373622,
1757
- "Standard Deviation": 0.05762408512,
1758
- "Rank": 26
1759
- },
1760
- "Chemistry": {
1761
- "Average Score": 45.35392139264795,
1762
- "Standard Deviation": null,
1763
- "Rank": 21
1764
- },
1765
- "CPP": {
1766
- "Average Score": 45.35392139264795,
1767
- "Standard Deviation": null,
1768
- "Rank": 22
1769
- }
1770
- }
1771
- },
1772
- {
1773
- "config": {
1774
- "model_name": "gemma-2-2b-it",
1775
- "organization": "Google",
1776
- "license": "Gemma License",
1777
- "knowledge_cutoff": "2024/07"
1778
- },
1779
- "results": {
1780
- "OVERALL": {
1781
- "Average Score": 0.502167612,
1782
- "Standard Deviation": 0.04389786763,
1783
- "Rank": 19
1784
- },
1785
- "Geometry": {
1786
- "Average Score": 0.395006676,
1787
- "Standard Deviation": 0.05882607713,
1788
- "Rank": 31
1789
- },
1790
- "Algebra": {
1791
- "Average Score": 0.379391887,
1792
- "Standard Deviation": 0.01722410785,
1793
- "Rank": 34
1794
- },
1795
- "Probability": {
1796
- "Average Score": 0.331231097,
1797
- "Standard Deviation": 0.05392499987,
1798
- "Rank": 34
1799
- },
1800
- "Logical": {
1801
- "Average Score": 0.367687789,
1802
- "Standard Deviation": 0.02547968808,
1803
- "Rank": 26
1804
- },
1805
- "Social": {
1806
- "Average Score": 0.393482094,
1807
- "Standard Deviation": 0.06450214024,
1808
- "Rank": 23
1809
- },
1810
- "Chemistry": {
1811
- "Average Score": 30.53406933106768,
1812
- "Standard Deviation": null,
1813
- "Rank": 35
1814
- },
1815
- "CPP": {
1816
- "Average Score": 30.53406933106768,
1817
- "Standard Deviation": null,
1818
- "Rank": 36
1819
- }
1820
- }
1821
- },
1822
- {
1823
- "config": {
1824
- "model_name": "starling-lm-7b-alpha",
1825
- "organization": "Nexusflow",
1826
- "license": "Apache-2.0",
1827
- "knowledge_cutoff": "2023/11"
1828
- },
1829
- "results": {
1830
- "OVERALL": {
1831
- "Average Score": 0.366628765,
1832
- "Standard Deviation": 0.08405492929,
1833
- "Rank": 32
1834
- },
1835
- "Geometry": {
1836
- "Average Score": 0.336782578,
1837
- "Standard Deviation": 0.04069449132,
1838
- "Rank": 34
1839
- },
1840
- "Algebra": {
1841
- "Average Score": 0.371551932,
1842
- "Standard Deviation": 0.03367241745,
1843
- "Rank": 35
1844
- },
1845
- "Probability": {
1846
- "Average Score": 0.331472505,
1847
- "Standard Deviation": 0.04833324282,
1848
- "Rank": 33
1849
- },
1850
- "Logical": {
1851
- "Average Score": 0.260869624,
1852
- "Standard Deviation": 0.03562735237,
1853
- "Rank": 35
1854
- },
1855
- "Social": {
1856
- "Average Score": 0.271975534,
1857
- "Standard Deviation": 0.04266753408,
1858
- "Rank": 33
1859
- },
1860
- "Chemistry": {
1861
- "Average Score": 30.07926487356878,
1862
- "Standard Deviation": null,
1863
- "Rank": 36
1864
- },
1865
- "CPP": {
1866
- "Average Score": 30.07926487356878,
1867
- "Standard Deviation": null,
1868
- "Rank": 37
1869
- }
1870
- }
1871
- },
1872
- {
1873
- "config": {
1874
- "model_name": "qwen1.5-4b-chat",
1875
- "organization": "Alibaba",
1876
- "license": "Qianwen LICENSE",
1877
- "knowledge_cutoff": "2024/02"
1878
- },
1879
- "results": {
1880
- "OVERALL": {
1881
- "Average Score": 0.111876411,
1882
- "Standard Deviation": 0.04241022785,
1883
- "Rank": 48
1884
- },
1885
- "Geometry": {
1886
- "Average Score": 0.215834522,
1887
- "Standard Deviation": 0.0363766363,
1888
- "Rank": 39
1889
- },
1890
- "Algebra": {
1891
- "Average Score": 0.305589811,
1892
- "Standard Deviation": 0.02354198912,
1893
- "Rank": 36
1894
- },
1895
- "Probability": {
1896
- "Average Score": 0.149365327,
1897
- "Standard Deviation": 0.03489672675,
1898
- "Rank": 43
1899
- },
1900
- "Logical": {
1901
- "Average Score": 0.116210168,
1902
- "Standard Deviation": 0.005927966496,
1903
- "Rank": 46
1904
- },
1905
- "Social": {
1906
- "Average Score": 0.18195615,
1907
- "Standard Deviation": 0.02269805277,
1908
- "Rank": 40
1909
- },
1910
- "Chemistry": {
1911
- "Average Score": 13.21208067122554,
1912
- "Standard Deviation": null,
1913
- "Rank": 46
1914
- },
1915
- "CPP": {
1916
- "Average Score": 13.21208067122554,
1917
- "Standard Deviation": null,
1918
- "Rank": 47
1919
- }
1920
- }
1921
- },
1922
- {
1923
- "config": {
1924
- "model_name": "command-r-(04-2024)",
1925
- "organization": "Cohere",
1926
- "license": "CC-BY-NC-4.0",
1927
- "knowledge_cutoff": "2024/04"
1928
- },
1929
- "results": {
1930
- "OVERALL": {
1931
- "Average Score": 0.388783887,
1932
- "Standard Deviation": 0.07417186783,
1933
- "Rank": 29
1934
- },
1935
- "Geometry": {
1936
- "Average Score": 0.300416698,
1937
- "Standard Deviation": 0.03485612736,
1938
- "Rank": 36
1939
- },
1940
- "Algebra": {
1941
- "Average Score": 0.293120231,
1942
- "Standard Deviation": 0.032926484,
1943
- "Rank": 37
1944
- },
1945
- "Probability": {
1946
- "Average Score": 0.281271304,
1947
- "Standard Deviation": 0.05697149867,
1948
- "Rank": 38
1949
- },
1950
- "Logical": {
1951
- "Average Score": 0.276189906,
1952
- "Standard Deviation": 0.03562914754,
1953
- "Rank": 33
1954
- },
1955
- "Social": {
1956
- "Average Score": 0.283882949,
1957
- "Standard Deviation": 0.03336901148,
1958
- "Rank": 31
1959
- },
1960
- "Chemistry": {
1961
- "Average Score": 41.346336503003236,
1962
- "Standard Deviation": null,
1963
- "Rank": 27
1964
- },
1965
- "CPP": {
1966
- "Average Score": 41.346336503003236,
1967
- "Standard Deviation": null,
1968
- "Rank": 28
1969
- }
1970
- }
1971
- },
1972
- {
1973
- "config": {
1974
- "model_name": "vicuna-33b",
1975
- "organization": "LMSYS",
1976
- "license": "Non-commercial",
1977
- "knowledge_cutoff": "2023/08"
1978
- },
1979
- "results": {
1980
- "OVERALL": {
1981
- "Average Score": 0.316543555,
1982
- "Standard Deviation": 0.08922095647,
1983
- "Rank": 38
1984
- },
1985
- "Geometry": {
1986
- "Average Score": 0.208284679,
1987
- "Standard Deviation": 0.03937771461,
1988
- "Rank": 40
1989
- },
1990
- "Algebra": {
1991
- "Average Score": 0.248994048,
1992
- "Standard Deviation": 0.02668175054,
1993
- "Rank": 39
1994
- },
1995
- "Probability": {
1996
- "Average Score": 0.222313995,
1997
- "Standard Deviation": 0.03978859759,
1998
- "Rank": 41
1999
- },
2000
- "Logical": {
2001
- "Average Score": 0.180291222,
2002
- "Standard Deviation": 0.021886267,
2003
- "Rank": 41
2004
- },
2005
- "Social": {
2006
- "Average Score": 0.257623798,
2007
- "Standard Deviation": 0.02653724437,
2008
- "Rank": 34
2009
- },
2010
- "Chemistry": {
2011
- "Average Score": 28.01838653090379,
2012
- "Standard Deviation": null,
2013
- "Rank": 37
2014
- },
2015
- "CPP": {
2016
- "Average Score": 28.01838653090379,
2017
- "Standard Deviation": null,
2018
- "Rank": 38
2019
- }
2020
- }
2021
- },
2022
- {
2023
- "config": {
2024
- "model_name": "gemma-7b-it",
2025
- "organization": "Google",
2026
- "license": "Gemma License",
2027
- "knowledge_cutoff": "2024/02"
2028
- },
2029
- "results": {
2030
- "OVERALL": {
2031
- "Average Score": 0.285077558,
2032
- "Standard Deviation": 0.08871758453,
2033
- "Rank": 40
2034
- },
2035
- "Geometry": {
2036
- "Average Score": 0.244791417,
2037
- "Standard Deviation": 0.0289612078,
2038
- "Rank": 37
2039
- },
2040
- "Algebra": {
2041
- "Average Score": 0.250614794,
2042
- "Standard Deviation": 0.01991678295,
2043
- "Rank": 38
2044
- },
2045
- "Probability": {
2046
- "Average Score": 0.174313053,
2047
- "Standard Deviation": 0.03765424728,
2048
- "Rank": 42
2049
- },
2050
- "Logical": {
2051
- "Average Score": 0.197505536,
2052
- "Standard Deviation": 0.02050298885,
2053
- "Rank": 38
2054
- },
2055
- "Social": {
2056
- "Average Score": 0.202138025,
2057
- "Standard Deviation": 0.02098346639,
2058
- "Rank": 39
2059
- },
2060
- "Chemistry": {
2061
- "Average Score": 28.014658234926813,
2062
- "Standard Deviation": null,
2063
- "Rank": 38
2064
- },
2065
- "CPP": {
2066
- "Average Score": 28.014658234926813,
2067
- "Standard Deviation": null,
2068
- "Rank": 39
2069
- }
2070
- }
2071
- },
2072
- {
2073
- "config": {
2074
- "model_name": "mistral-7b-instruct-2",
2075
- "organization": "Mistral",
2076
- "license": "Apache 2.0",
2077
- "knowledge_cutoff": "2023/12"
2078
- },
2079
- "results": {
2080
- "OVERALL": {
2081
- "Average Score": 0.427513868,
2082
- "Standard Deviation": 0.05553921135,
2083
- "Rank": 26
2084
- },
2085
- "Geometry": {
2086
- "Average Score": 0.216402626,
2087
- "Standard Deviation": 0.03338414918,
2088
- "Rank": 38
2089
- },
2090
- "Algebra": {
2091
- "Average Score": 0.233777838,
2092
- "Standard Deviation": 0.0155226054,
2093
- "Rank": 40
2094
- },
2095
- "Probability": {
2096
- "Average Score": 0.25118175,
2097
- "Standard Deviation": 0.04065514593,
2098
- "Rank": 39
2099
- },
2100
- "Logical": {
2101
- "Average Score": 0.224469136,
2102
- "Standard Deviation": 0.03404706752,
2103
- "Rank": 36
2104
- },
2105
- "Social": {
2106
- "Average Score": 0.209386782,
2107
- "Standard Deviation": 0.02738569921,
2108
- "Rank": 38
2109
- },
2110
- "Chemistry": {
2111
- "Average Score": 31.382959631870822,
2112
- "Standard Deviation": null,
2113
- "Rank": 34
2114
- },
2115
- "CPP": {
2116
- "Average Score": 31.382959631870822,
2117
- "Standard Deviation": null,
2118
- "Rank": 35
2119
- }
2120
- }
2121
- },
2122
- {
2123
- "config": {
2124
- "model_name": "mistral-7b-instruct-1",
2125
- "organization": "Mistral",
2126
- "license": "Apache 2.0",
2127
- "knowledge_cutoff": "2023/12"
2128
- },
2129
- "results": {
2130
- "OVERALL": {
2131
- "Average Score": 0.23016314,
2132
- "Standard Deviation": 0.07137625271,
2133
- "Rank": 45
2134
- },
2135
- "Geometry": {
2136
- "Average Score": 0.161799938,
2137
- "Standard Deviation": 0.03595278559,
2138
- "Rank": 44
2139
- },
2140
- "Algebra": {
2141
- "Average Score": 0.210341624,
2142
- "Standard Deviation": 0.01736539119,
2143
- "Rank": 41
2144
- },
2145
- "Probability": {
2146
- "Average Score": 0.238417922,
2147
- "Standard Deviation": 0.03744211933,
2148
- "Rank": 40
2149
- },
2150
- "Logical": {
2151
- "Average Score": 0.142636601,
2152
- "Standard Deviation": 0.02080406365,
2153
- "Rank": 45
2154
- },
2155
- "Social": {
2156
- "Average Score": 0.117646827,
2157
- "Standard Deviation": 0.009321202779,
2158
- "Rank": 47
2159
- },
2160
- "Chemistry": {
2161
- "Average Score": 18.929093202755805,
2162
- "Standard Deviation": null,
2163
- "Rank": 41
2164
- },
2165
- "CPP": {
2166
- "Average Score": 18.929093202755805,
2167
- "Standard Deviation": null,
2168
- "Rank": 42
2169
- }
2170
- }
2171
- },
2172
- {
2173
- "config": {
2174
- "model_name": "vicuna-13b",
2175
- "organization": "LMSYS",
2176
- "license": "Non-commercial",
2177
- "knowledge_cutoff": "2023/07"
2178
- },
2179
- "results": {
2180
- "OVERALL": {
2181
- "Average Score": 0.201892849,
2182
- "Standard Deviation": 0.06021749802,
2183
- "Rank": 46
2184
- },
2185
- "Geometry": {
2186
- "Average Score": 0.200941928,
2187
- "Standard Deviation": 0.03366817781,
2188
- "Rank": 41
2189
- },
2190
- "Algebra": {
2191
- "Average Score": 0.196123323,
2192
- "Standard Deviation": 0.0135715643,
2193
- "Rank": 42
2194
- },
2195
- "Probability": {
2196
- "Average Score": 0.141214079,
2197
- "Standard Deviation": 0.02721328211,
2198
- "Rank": 44
2199
- },
2200
- "Logical": {
2201
- "Average Score": 0.148598631,
2202
- "Standard Deviation": 0.02241523892,
2203
- "Rank": 43
2204
- },
2205
- "Social": {
2206
- "Average Score": 0.124655135,
2207
- "Standard Deviation": 0.01122382671,
2208
- "Rank": 46
2209
- },
2210
- "Chemistry": {
2211
- "Average Score": 21.840013221590294,
2212
- "Standard Deviation": null,
2213
- "Rank": 39
2214
- },
2215
- "CPP": {
2216
- "Average Score": 21.840013221590294,
2217
- "Standard Deviation": null,
2218
- "Rank": 40
2219
- }
2220
- }
2221
- },
2222
- {
2223
- "config": {
2224
- "model_name": "zephyr-7b-beta",
2225
- "organization": "HuggingFace",
2226
- "license": "MIT",
2227
- "knowledge_cutoff": "2023/10"
2228
- },
2229
- "results": {
2230
- "OVERALL": {
2231
- "Average Score": 0.102705119,
2232
- "Standard Deviation": 0.03683757312,
2233
- "Rank": 49
2234
- },
2235
- "Geometry": {
2236
- "Average Score": 0.114005544,
2237
- "Standard Deviation": 0.03144354365,
2238
- "Rank": 45
2239
- },
2240
- "Algebra": {
2241
- "Average Score": 0.141766633,
2242
- "Standard Deviation": 0.03179520129,
2243
- "Rank": 43
2244
- },
2245
- "Probability": {
2246
- "Average Score": 0.089050714,
2247
- "Standard Deviation": 0.002136754266,
2248
- "Rank": 47
2249
- },
2250
- "Logical": {
2251
- "Average Score": 0.069520789,
2252
- "Standard Deviation": 0.004477840857,
2253
- "Rank": 50
2254
- },
2255
- "Social": {
2256
- "Average Score": 0.0,
2257
- "Standard Deviation": 0.0,
2258
- "Rank": 52
2259
- },
2260
- "Chemistry": {
2261
- "Average Score": 18.92902220864132,
2262
- "Standard Deviation": null,
2263
- "Rank": 42
2264
- },
2265
- "CPP": {
2266
- "Average Score": 18.92902220864132,
2267
- "Standard Deviation": null,
2268
- "Rank": 43
2269
- }
2270
- }
2271
- },
2272
- {
2273
- "config": {
2274
- "model_name": "gemma-1.1-2b-it",
2275
- "organization": "Google",
2276
- "license": "Gemma License",
2277
- "knowledge_cutoff": "2024/02"
2278
- },
2279
- "results": {
2280
- "OVERALL": {
2281
- "Average Score": 0.257700845,
2282
- "Standard Deviation": 0.07369021445,
2283
- "Rank": 43
2284
- },
2285
- "Geometry": {
2286
- "Average Score": 0.183974034,
2287
- "Standard Deviation": 0.0215548886,
2288
- "Rank": 43
2289
- },
2290
- "Algebra": {
2291
- "Average Score": 0.13422252,
2292
- "Standard Deviation": 0.01922819511,
2293
- "Rank": 44
2294
- },
2295
- "Probability": {
2296
- "Average Score": 0.095628657,
2297
- "Standard Deviation": 0.007536076456,
2298
- "Rank": 46
2299
- },
2300
- "Logical": {
2301
- "Average Score": 0.094965074,
2302
- "Standard Deviation": 0.005019175487,
2303
- "Rank": 48
2304
- },
2305
- "Social": {
2306
- "Average Score": 0.167796727,
2307
- "Standard Deviation": 0.01666541942,
2308
- "Rank": 42
2309
- },
2310
- "Chemistry": {
2311
- "Average Score": 20.724691953843916,
2312
- "Standard Deviation": null,
2313
- "Rank": 40
2314
- },
2315
- "CPP": {
2316
- "Average Score": 20.724691953843916,
2317
- "Standard Deviation": null,
2318
- "Rank": 41
2319
- }
2320
- }
2321
- },
2322
- {
2323
- "config": {
2324
- "model_name": "llama2-7b-chat",
2325
- "organization": "Meta",
2326
- "license": "Llama 2 Community",
2327
- "knowledge_cutoff": "2023/07"
2328
- },
2329
- "results": {
2330
- "OVERALL": {
2331
- "Average Score": 0.260189428,
2332
- "Standard Deviation": 0.08019299364,
2333
- "Rank": 42
2334
- },
2335
- "Geometry": {
2336
- "Average Score": 0.087067276,
2337
- "Standard Deviation": 0.04274343402,
2338
- "Rank": 46
2339
- },
2340
- "Algebra": {
2341
- "Average Score": 0.12308805,
2342
- "Standard Deviation": 0.01856053622,
2343
- "Rank": 45
2344
- },
2345
- "Probability": {
2346
- "Average Score": 0.087515438,
2347
- "Standard Deviation": 0.006315053573,
2348
- "Rank": 48
2349
- },
2350
- "Logical": {
2351
- "Average Score": 0.17312827,
2352
- "Standard Deviation": 0.01867044092,
2353
- "Rank": 42
2354
- },
2355
- "Social": {
2356
- "Average Score": 0.152905272,
2357
- "Standard Deviation": 0.007166957097,
2358
- "Rank": 43
2359
- },
2360
- "Chemistry": {
2361
- "Average Score": 15.730513733660898,
2362
- "Standard Deviation": null,
2363
- "Rank": 44
2364
- },
2365
- "CPP": {
2366
- "Average Score": 15.730513733660898,
2367
- "Standard Deviation": null,
2368
- "Rank": 45
2369
- }
2370
- }
2371
- },
2372
- {
2373
- "config": {
2374
- "model_name": "gemma-2b-it",
2375
- "organization": "Google",
2376
- "license": "Gemma License",
2377
- "knowledge_cutoff": "2024/02"
2378
- },
2379
- "results": {
2380
- "OVERALL": {
2381
- "Average Score": 0.234172069,
2382
- "Standard Deviation": 0.06522685718,
2383
- "Rank": 44
2384
- },
2385
- "Geometry": {
2386
- "Average Score": 0.198571153,
2387
- "Standard Deviation": 0.01699161031,
2388
- "Rank": 42
2389
- },
2390
- "Algebra": {
2391
- "Average Score": 0.109883009,
2392
- "Standard Deviation": 0.01520005833,
2393
- "Rank": 46
2394
- },
2395
- "Probability": {
2396
- "Average Score": 0.06467432,
2397
- "Standard Deviation": 0.002117497231,
2398
- "Rank": 50
2399
- },
2400
- "Logical": {
2401
- "Average Score": 0.039624492,
2402
- "Standard Deviation": 0.007606972686,
2403
- "Rank": 51
2404
- },
2405
- "Social": {
2406
- "Average Score": 0.087452913,
2407
- "Standard Deviation": 0.008170146562,
2408
- "Rank": 50
2409
- },
2410
- "Chemistry": {
2411
- "Average Score": 17.2715657115764,
2412
- "Standard Deviation": null,
2413
- "Rank": 43
2414
- },
2415
- "CPP": {
2416
- "Average Score": 17.2715657115764,
2417
- "Standard Deviation": null,
2418
- "Rank": 44
2419
- }
2420
- }
2421
- },
2422
- {
2423
- "config": {
2424
- "model_name": "llama2-13b-chat",
2425
- "organization": "Meta",
2426
- "license": "Llama 2 Community",
2427
- "knowledge_cutoff": "2023/07"
2428
- },
2429
- "results": {
2430
- "OVERALL": {
2431
- "Average Score": 0.263305684,
2432
- "Standard Deviation": 0.07283640689,
2433
- "Rank": 41
2434
- },
2435
- "Geometry": {
2436
- "Average Score": 0.072729954,
2437
- "Standard Deviation": 0.02315988261,
2438
- "Rank": 48
2439
- },
2440
- "Algebra": {
2441
- "Average Score": 0.080371692,
2442
- "Standard Deviation": 0.01277569453,
2443
- "Rank": 47
2444
- },
2445
- "Probability": {
2446
- "Average Score": 0.117757344,
2447
- "Standard Deviation": 0.02418619619,
2448
- "Rank": 45
2449
- },
2450
- "Logical": {
2451
- "Average Score": 0.193149889,
2452
- "Standard Deviation": 0.01776690764,
2453
- "Rank": 39
2454
- },
2455
- "Social": {
2456
- "Average Score": 0.149125922,
2457
- "Standard Deviation": 0.01157416827,
2458
- "Rank": 44
2459
- },
2460
- "Chemistry": {
2461
- "Average Score": 13.17258252933903,
2462
- "Standard Deviation": null,
2463
- "Rank": 47
2464
- },
2465
- "CPP": {
2466
- "Average Score": 13.17258252933903,
2467
- "Standard Deviation": null,
2468
- "Rank": 48
2469
- }
2470
- }
2471
- },
2472
- {
2473
- "config": {
2474
- "model_name": "vicuna-7b",
2475
- "organization": "LMSYS",
2476
- "license": "Non-commercial",
2477
- "knowledge_cutoff": "2023/07"
2478
- },
2479
- "results": {
2480
- "OVERALL": {
2481
- "Average Score": 0.198839786,
2482
- "Standard Deviation": 0.05725381576,
2483
- "Rank": 47
2484
- },
2485
- "Geometry": {
2486
- "Average Score": 0.083457058,
2487
- "Standard Deviation": 0.02520989111,
2488
- "Rank": 47
2489
- },
2490
- "Algebra": {
2491
- "Average Score": 0.070883882,
2492
- "Standard Deviation": 0.007315853253,
2493
- "Rank": 48
2494
- },
2495
- "Probability": {
2496
- "Average Score": 0.080987673,
2497
- "Standard Deviation": 0.005474288861,
2498
- "Rank": 49
2499
- },
2500
- "Logical": {
2501
- "Average Score": 0.100065588,
2502
- "Standard Deviation": 0.003561886452,
2503
- "Rank": 47
2504
- },
2505
- "Social": {
2506
- "Average Score": 0.111076414,
2507
- "Standard Deviation": 0.004805626512,
2508
- "Rank": 48
2509
- },
2510
- "Chemistry": {
2511
- "Average Score": 14.255194156624162,
2512
- "Standard Deviation": null,
2513
- "Rank": 45
2514
- },
2515
- "CPP": {
2516
- "Average Score": 14.255194156624162,
2517
- "Standard Deviation": null,
2518
- "Rank": 46
2519
- }
2520
- }
2521
- },
2522
- {
2523
- "config": {
2524
- "model_name": "koala-13b",
2525
- "organization": "UC Berkeley",
2526
- "license": "Non-commercial",
2527
- "knowledge_cutoff": "2023/04"
2528
- },
2529
- "results": {
2530
- "OVERALL": {
2531
- "Average Score": 0.09387188,
2532
- "Standard Deviation": 0.02642167489,
2533
- "Rank": 50
2534
- },
2535
- "Geometry": {
2536
- "Average Score": 0.017374001,
2537
- "Standard Deviation": 0.01747053557,
2538
- "Rank": 49
2539
- },
2540
- "Algebra": {
2541
- "Average Score": 0.018129197,
2542
- "Standard Deviation": 0.01054371383,
2543
- "Rank": 49
2544
- },
2545
- "Probability": {
2546
- "Average Score": 0.043654362,
2547
- "Standard Deviation": 0.004288231886,
2548
- "Rank": 51
2549
- },
2550
- "Logical": {
2551
- "Average Score": 0.074694053,
2552
- "Standard Deviation": 0.002674646998,
2553
- "Rank": 49
2554
- },
2555
- "Social": {
2556
- "Average Score": 0.096983835,
2557
- "Standard Deviation": 0.007847059783,
2558
- "Rank": 49
2559
- },
2560
- "Chemistry": {
2561
- "Average Score": 6.36433272373514,
2562
- "Standard Deviation": null,
2563
- "Rank": 48
2564
- },
2565
- "CPP": {
2566
- "Average Score": 6.36433272373514,
2567
- "Standard Deviation": null,
2568
- "Rank": 49
2569
- }
2570
- }
2571
- },
2572
- {
2573
- "config": {
2574
- "model_name": "openassistant-pythia-12b",
2575
- "organization": "OpenAssistant",
2576
- "license": "Non-commercial",
2577
- "knowledge_cutoff": "2023/04"
2578
- },
2579
- "results": {
2580
- "OVERALL": {
2581
- "Average Score": 0.0,
2582
- "Standard Deviation": 0.0,
2583
- "Rank": 51
2584
- },
2585
- "Geometry": {
2586
- "Average Score": 0.0,
2587
- "Standard Deviation": 0.0,
2588
- "Rank": 50
2589
- },
2590
- "Algebra": {
2591
- "Average Score": 0.0,
2592
- "Standard Deviation": 0.0,
2593
- "Rank": 50
2594
- },
2595
- "Probability": {
2596
- "Average Score": 0.0,
2597
- "Standard Deviation": 0.0,
2598
- "Rank": 52
2599
- },
2600
- "Logical": {
2601
- "Average Score": 0.0,
2602
- "Standard Deviation": 0.0,
2603
- "Rank": 52
2604
- },
2605
- "Social": {
2606
- "Average Score": 0.030792528,
2607
- "Standard Deviation": 0.007518796391,
2608
- "Rank": 51
2609
- },
2610
- "Chemistry": {
2611
- "Average Score": 0.0,
2612
- "Standard Deviation": null,
2613
- "Rank": 49
2614
- },
2615
- "CPP": {
2616
- "Average Score": 0.0,
2617
- "Standard Deviation": null,
2618
- "Rank": 50
2619
- }
2620
- }
2621
- }
2622
- ]

src/results/models_2024-10-18-14:06:13.588399.json DELETED
@@ -1,2732 +0,0 @@
1
- [
2
- {
3
- "config": {
4
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
5
- "organization": "OpenAI",
6
- "license": "Proprietary",
7
- "knowledge_cutoff": "2023/10"
8
- },
9
- "results": {
10
- "OVERALL": {
11
- "Average Score": 87.33082346779815,
12
- "Standard Deviation": 1.4853337406399776,
13
- "Rank": 3
14
- },
15
- "Geometry": {
16
- "Average Score": 0.976028578,
17
- "Standard Deviation": 0.01507912373,
18
- "Rank": 3
19
- },
20
- "Algebra": {
21
- "Average Score": 0.951199453,
22
- "Standard Deviation": 0.08452452108,
23
- "Rank": 3
24
- },
25
- "Probability": {
26
- "Average Score": 80.1332207690739,
27
- "Standard Deviation": null,
28
- "Rank": 7
29
- },
30
- "Logical": {
31
- "Average Score": 84.12975867250425,
32
- "Standard Deviation": 0.21211547702245045,
33
- "Rank": 6
34
- },
35
- "Social": {
36
- "Average Score": 0.815902987,
37
- "Standard Deviation": 0.0196254222,
38
- "Rank": 3
39
- },
40
- "Chemistry": {
41
- "Average Score": 89.92480228064885,
42
- "Standard Deviation": null,
43
- "Rank": 4
44
- },
45
- "CPP": {
46
- "Average Score": 100.0,
47
- "Standard Deviation": null,
48
- "Rank": 1
49
- }
50
- }
51
- },
52
- {
53
- "config": {
54
- "model_name": "gpt-4o-2024-08-06",
55
- "organization": "OpenAI",
56
- "license": "Proprietary",
57
- "knowledge_cutoff": "2023/10"
58
- },
59
- "results": {
60
- "OVERALL": {
61
- "Average Score": 77.7818546246671,
62
- "Standard Deviation": 2.7097581088879505,
63
- "Rank": 5
64
- },
65
- "Geometry": {
66
- "Average Score": 0.99773096,
67
- "Standard Deviation": 0.002835555172,
68
- "Rank": 1
69
- },
70
- "Algebra": {
71
- "Average Score": 1.0,
72
- "Standard Deviation": 0.0,
73
- "Rank": 1
74
- },
75
- "Probability": {
76
- "Average Score": 74.97136205481755,
77
- "Standard Deviation": null,
78
- "Rank": 11
79
- },
80
- "Logical": {
81
- "Average Score": 66.0597109743056,
82
- "Standard Deviation": 1.5021351704575163,
83
- "Rank": 14
84
- },
85
- "Social": {
86
- "Average Score": 0.680417314,
87
- "Standard Deviation": 0.00656867063,
88
- "Rank": 8
89
- },
90
- "Chemistry": {
91
- "Average Score": 82.55189735524202,
92
- "Standard Deviation": null,
93
- "Rank": 7
94
- },
95
- "CPP": {
96
- "Average Score": 92.43090226400756,
97
- "Standard Deviation": null,
98
- "Rank": 2
99
- }
100
- }
101
- },
102
- {
103
- "config": {
104
- "model_name": "gpt-4o-2024-05-13",
105
- "organization": "OpenAI",
106
- "license": "Proprietary",
107
- "knowledge_cutoff": "2023/10"
108
- },
109
- "results": {
110
- "OVERALL": {
111
- "Average Score": 72.6093654197998,
112
- "Standard Deviation": 13.515345690976028,
113
- "Rank": 10
114
- },
115
- "Geometry": {
116
- "Average Score": 0.972472377,
117
- "Standard Deviation": 0.01648274205,
118
- "Rank": 4
119
- },
120
- "Algebra": {
121
- "Average Score": 0.995511298,
122
- "Standard Deviation": 0.004097802515,
123
- "Rank": 2
124
- },
125
- "Probability": {
126
- "Average Score": 77.97816201050715,
127
- "Standard Deviation": null,
128
- "Rank": 8
129
- },
130
- "Logical": {
131
- "Average Score": 75.65058939137873,
132
- "Standard Deviation": 0.07522785572103825,
133
- "Rank": 9
134
- },
135
- "Social": {
136
- "Average Score": 0.609875087,
137
- "Standard Deviation": 0.038729239,
138
- "Rank": 13
139
- },
140
- "Chemistry": {
141
- "Average Score": 76.03377031297643,
142
- "Standard Deviation": null,
143
- "Rank": 9
144
- },
145
- "CPP": {
146
- "Average Score": 79.1592634699295,
147
- "Standard Deviation": null,
148
- "Rank": 6
149
- }
150
- }
151
- },
152
- {
153
- "config": {
154
- "model_name": "gpt-4-turbo-2024-04-09",
155
- "organization": "OpenAI",
156
- "license": "Proprietary",
157
- "knowledge_cutoff": "2023/12"
158
- },
159
- "results": {
160
- "OVERALL": {
161
- "Average Score": 73.32308543749606,
162
- "Standard Deviation": 6.562777844134629,
163
- "Rank": 9
164
- },
165
- "Geometry": {
166
- "Average Score": 0.95374588,
167
- "Standard Deviation": 0.03109307166,
168
- "Rank": 5
169
- },
170
- "Algebra": {
171
- "Average Score": 0.930945223,
172
- "Standard Deviation": 0.06705136813,
173
- "Rank": 4
174
- },
175
- "Probability": {
176
- "Average Score": 74.97144205445957,
177
- "Standard Deviation": null,
178
- "Rank": 12
179
- },
180
- "Logical": {
181
- "Average Score": 76.82291715624933,
182
- "Standard Deviation": 0.03462548327631355,
183
- "Rank": 7
184
- },
185
- "Social": {
186
- "Average Score": 0.715935163,
187
- "Standard Deviation": 0.1209141409,
188
- "Rank": 6
189
- },
190
- "Chemistry": {
191
- "Average Score": 70.44329321394066,
192
- "Standard Deviation": null,
193
- "Rank": 12
194
- },
195
- "CPP": {
196
- "Average Score": 70.73143363230263,
197
- "Standard Deviation": null,
198
- "Rank": 11
199
- }
200
- }
201
- },
202
- {
203
- "config": {
204
- "model_name": "gemini-1.5-pro-001",
205
- "organization": "Google",
206
- "license": "Proprietary",
207
- "knowledge_cutoff": "2023/11"
208
- },
209
- "results": {
210
- "OVERALL": {
211
- "Average Score": 74.27365448117855,
212
- "Standard Deviation": 3.9515447172901847,
213
- "Rank": 8
214
- },
215
- "Geometry": {
216
- "Average Score": 0.9947169,
217
- "Standard Deviation": 0.009150597621,
218
- "Rank": 2
219
- },
220
- "Algebra": {
221
- "Average Score": 0.857464301,
222
- "Standard Deviation": 0.05014285338,
223
- "Rank": 5
224
- },
225
- "Probability": {
226
- "Average Score": 64.77713215500482,
227
- "Standard Deviation": null,
228
- "Rank": 15
229
- },
230
- "Logical": {
231
- "Average Score": 74.3275461555815,
232
- "Standard Deviation": 0.8092355737847541,
233
- "Rank": 10
234
- },
235
- "Social": {
236
- "Average Score": 0.649601885,
237
- "Standard Deviation": 0.104854889,
238
- "Rank": 11
239
- }
240
- }
241
- },
242
- {
243
- "config": {
244
- "model_name": "qwen2-72b-instruct",
245
- "organization": "Alibaba",
246
- "license": "Qianwen LICENSE",
247
- "knowledge_cutoff": "2024/09"
248
- },
249
- "results": {
250
- "OVERALL": {
251
- "Average Score": 71.00423311357184,
252
- "Standard Deviation": 1.6189609141983887,
253
- "Rank": 12
254
- },
255
- "Geometry": {
256
- "Average Score": 0.796870305,
257
- "Standard Deviation": 0.0509025346,
258
- "Rank": 9
259
- },
260
- "Algebra": {
261
- "Average Score": 0.836194231,
262
- "Standard Deviation": 0.04517093028,
263
- "Rank": 6
264
- },
265
- "Probability": {
266
- "Average Score": 76.33751777233937,
267
- "Standard Deviation": null,
268
- "Rank": 10
269
- },
270
- "Logical": {
271
- "Average Score": 61.22020517318166,
272
- "Standard Deviation": 10.241399997578569,
273
- "Rank": 17
274
- },
275
- "Social": {
276
- "Average Score": 0.652578786,
277
- "Standard Deviation": 0.04259293171,
278
- "Rank": 10
279
- },
280
- "Chemistry": {
281
- "Average Score": 70.44342338869497,
282
- "Standard Deviation": null,
283
- "Rank": 12
284
- },
285
- "CPP": {
286
- "Average Score": 73.54037778797029,
287
- "Standard Deviation": null,
288
- "Rank": 7
289
- }
290
- }
291
- },
292
- {
293
- "config": {
294
- "model_name": "gpt-4o-mini-2024-07-18",
295
- "organization": "OpenAI",
296
- "license": "Proprietary",
297
- "knowledge_cutoff": "2023/10"
298
- },
299
- "results": {
300
- "OVERALL": {
301
- "Average Score": 77.35427394420829,
302
- "Standard Deviation": 3.162321541714492,
303
- "Rank": 6
304
- },
305
- "Geometry": {
306
- "Average Score": 0.946650435,
307
- "Standard Deviation": 0.01831236482,
308
- "Rank": 7
309
- },
310
- "Algebra": {
311
- "Average Score": 0.796243022,
312
- "Standard Deviation": 0.05537539202,
313
- "Rank": 7
314
- },
315
- "Probability": {
316
- "Average Score": 77.63972720989734,
317
- "Standard Deviation": null,
318
- "Rank": 9
319
- },
320
- "Logical": {
321
- "Average Score": 71.81267717239906,
322
- "Standard Deviation": 0.3393593163824375,
323
- "Rank": 11
324
- },
325
- "Social": {
326
- "Average Score": 0.691949855,
327
- "Standard Deviation": 0.02072934333,
328
- "Rank": 7
329
- },
330
- "Chemistry": {
331
- "Average Score": 78.10636943659426,
332
- "Standard Deviation": null,
333
- "Rank": 8
334
- },
335
- "CPP": {
336
- "Average Score": 88.3877070580296,
337
- "Standard Deviation": null,
338
- "Rank": 3
339
- }
340
- }
341
- },
342
- {
343
- "config": {
344
- "model_name": "claude-3.5-sonnet",
345
- "organization": "Anthropic",
346
- "license": "Proprietary",
347
- "knowledge_cutoff": "2024/04"
348
- },
349
- "results": {
350
- "OVERALL": {
351
- "Average Score": 75.97534774560863,
352
- "Standard Deviation": 9.237316832705584,
353
- "Rank": 7
354
- },
355
- "Geometry": {
356
- "Average Score": 0.95316419,
357
- "Standard Deviation": 0.02081192856,
358
- "Rank": 6
359
- },
360
- "Algebra": {
361
- "Average Score": 0.759789952,
362
- "Standard Deviation": 0.02611765096,
363
- "Rank": 8
364
- },
365
- "Probability": {
366
- "Average Score": 65.4531881044298,
367
- "Standard Deviation": null,
368
- "Rank": 14
369
- },
370
- "Logical": {
371
- "Average Score": 76.47424588300288,
372
- "Standard Deviation": 0.07699328617321737,
373
- "Rank": 8
374
- },
375
- "Social": {
376
- "Average Score": 0.790002247,
377
- "Standard Deviation": 0.1007410022,
378
- "Rank": 4
379
- },
380
- "Chemistry": {
381
- "Average Score": 85.17654674052096,
382
- "Standard Deviation": null,
383
- "Rank": 6
384
- },
385
- "CPP": {
386
- "Average Score": 82.37734076815008,
387
- "Standard Deviation": null,
388
- "Rank": 5
389
- }
390
- }
391
- },
392
- {
393
- "config": {
394
- "model_name": "o1-mini",
395
- "organization": "OpenAI",
396
- "license": "Proprietary",
397
- "knowledge_cutoff": "2023/10"
398
- },
399
- "results": {
400
- "OVERALL": {
401
- "Average Score": 87.92989248183513,
402
- "Standard Deviation": 1.3401058431409953,
403
- "Rank": 2
404
- },
405
- "Geometry": {
406
- "Average Score": "N/A",
407
- "Standard Deviation": "N/A",
408
- "Rank": "N/A"
409
- },
410
- "Algebra": {
411
- "Average Score": "N/A",
412
- "Standard Deviation": "N/A",
413
- "Rank": "N/A"
414
- },
415
- "Probability": {
416
- "Average Score": 100.0,
417
- "Standard Deviation": null,
418
- "Rank": 1
419
- },
420
- "Logical": {
421
- "Average Score": 99.15920225407733,
422
- "Standard Deviation": 0.49801294410288666,
423
- "Rank": 2
424
- },
425
- "Social": {
426
- "Average Score": 0.993974241,
427
- "Standard Deviation": 0.001996882328,
428
- "Rank": 2
429
- }
430
- }
431
- },
432
- {
433
- "config": {
434
- "model_name": "o1-preview",
435
- "organization": "OpenAI",
436
- "license": "Proprietary",
437
- "knowledge_cutoff": "2023/10"
438
- },
439
- "results": {
440
- "OVERALL": {
441
- "Average Score": 85.40247108906188,
442
- "Standard Deviation": 1.5796898764998464,
443
- "Rank": 4
444
- },
445
- "Geometry": {
446
- "Average Score": "N/A",
447
- "Standard Deviation": "N/A",
448
- "Rank": "N/A"
449
- },
450
- "Algebra": {
451
- "Average Score": "N/A",
452
- "Standard Deviation": "N/A",
453
- "Rank": "N/A"
454
- },
455
- "Probability": {
456
- "Average Score": 90.32625019320989,
457
- "Standard Deviation": null,
458
- "Rank": 5
459
- },
460
- "Logical": {
461
- "Average Score": 98.18241651273537,
462
- "Standard Deviation": 0.16231417987288874,
463
- "Rank": 4
464
- },
465
- "Social": {
466
- "Average Score": 1.0,
467
- "Standard Deviation": 0.0,
468
- "Rank": 1
469
- }
470
- }
471
- },
472
- {
473
- "config": {
474
- "model_name": "gemini-1.5-flash-001",
475
- "organization": "Google",
476
- "license": "Proprietary",
477
- "knowledge_cutoff": "2023/11"
478
- },
479
- "results": {
480
- "OVERALL": {
481
- "Average Score": 67.67997467963976,
482
- "Standard Deviation": 2.624276751646549,
483
- "Rank": 13
484
- },
485
- "Geometry": {
486
- "Average Score": 0.804144103,
487
- "Standard Deviation": 0.1327142178,
488
- "Rank": 8
489
- },
490
- "Algebra": {
491
- "Average Score": 0.731776765,
492
- "Standard Deviation": 0.02594657111,
493
- "Rank": 9
494
- },
495
- "Probability": {
496
- "Average Score": 61.17190439316032,
497
- "Standard Deviation": null,
498
- "Rank": 19
499
- },
500
- "Logical": {
501
- "Average Score": 62.284381466778335,
502
- "Standard Deviation": 3.9592476945909674,
503
- "Rank": 16
504
- },
505
- "Social": {
506
- "Average Score": 0.555933822,
507
- "Standard Deviation": 0.1029934524,
508
- "Rank": 15
509
- },
510
- "Chemistry": {
511
- "Average Score": 70.24726462490831,
512
- "Standard Deviation": null,
513
- "Rank": 15
514
- },
515
- "CPP": {
516
- "Average Score": 72.1127762005651,
517
- "Standard Deviation": null,
518
- "Rank": 10
519
- }
520
- }
521
- },
522
- {
523
- "config": {
524
- "model_name": "gpt4-1106",
525
- "organization": "OpenAI",
526
- "license": "Proprietary",
527
- "knowledge_cutoff": "2024/04"
528
- },
529
- "results": {
530
- "OVERALL": {
531
- "Average Score": 72.24829405851214,
532
- "Standard Deviation": 13.633826990442946,
533
- "Rank": 11
534
- },
535
- "Geometry": {
536
- "Average Score": 0.71843088,
537
- "Standard Deviation": 0.04778038294,
538
- "Rank": 11
539
- },
540
- "Algebra": {
541
- "Average Score": 0.712910417,
542
- "Standard Deviation": 0.02581828898,
543
- "Rank": 10
544
- },
545
- "Probability": {
546
- "Average Score": 63.29462909293814,
547
- "Standard Deviation": null,
548
- "Rank": 16
549
- },
550
- "Logical": {
551
- "Average Score": 62.987098158883875,
552
- "Standard Deviation": 4.027795425350514,
553
- "Rank": 15
554
- },
555
- "Social": {
556
- "Average Score": 0.450609816,
557
- "Standard Deviation": 0.05208655446,
558
- "Rank": 21
559
- },
560
- "Chemistry": {
561
- "Average Score": 67.34047237109209,
562
- "Standard Deviation": null,
563
- "Rank": 16
564
- },
565
- "CPP": {
566
- "Average Score": 69.11824072252848,
567
- "Standard Deviation": null,
568
- "Rank": 12
569
- }
570
- }
571
- },
572
- {
573
- "config": {
574
- "model_name": "gemma-2-27b-it",
575
- "organization": "Google",
576
- "license": "Gemma License",
577
- "knowledge_cutoff": "2024/06"
578
- },
579
- "results": {
580
- "OVERALL": {
581
- "Average Score": 62.70975283121063,
582
- "Standard Deviation": 6.376450054715319,
583
- "Rank": 15
584
- },
585
- "Geometry": {
586
- "Average Score": 0.60112744,
587
- "Standard Deviation": 0.0469109952,
588
- "Rank": 17
589
- },
590
- "Algebra": {
591
- "Average Score": 0.687955914,
592
- "Standard Deviation": 0.01959958192,
593
- "Rank": 11
594
- },
595
- "Probability": {
596
- "Average Score": 60.04180799425261,
597
- "Standard Deviation": null,
598
- "Rank": 20
599
- },
600
- "Logical": {
601
- "Average Score": 60.77082327163094,
602
- "Standard Deviation": 7.2164902432618625,
603
- "Rank": 19
604
- },
605
- "Social": {
606
- "Average Score": 0.487844257,
607
- "Standard Deviation": 0.05857760809,
608
- "Rank": 18
609
- },
610
- "Chemistry": {
611
- "Average Score": 61.68181926111706,
612
- "Standard Deviation": null,
613
- "Rank": 18
614
- },
615
- "CPP": {
616
- "Average Score": 63.28920072143611,
617
- "Standard Deviation": null,
618
- "Rank": 14
619
- }
620
- }
621
- },
622
- {
623
- "config": {
624
- "model_name": "claude-3-opus",
625
- "organization": "Anthropic",
626
- "license": "Proprietary",
627
- "knowledge_cutoff": "2023/08"
628
- },
629
- "results": {
630
- "OVERALL": {
631
- "Average Score": 60.56449573632771,
632
- "Standard Deviation": 8.485936885427277,
633
- "Rank": 17
634
- },
635
- "Geometry": {
636
- "Average Score": 0.7215743,
637
- "Standard Deviation": 0.04712598358,
638
- "Rank": 10
639
- },
640
- "Algebra": {
641
- "Average Score": 0.68777327,
642
- "Standard Deviation": 0.02382683713,
643
- "Rank": 12
644
- },
645
- "Probability": {
646
- "Average Score": 62.296041016641176,
647
- "Standard Deviation": null,
648
- "Rank": 17
649
- },
650
- "Logical": {
651
- "Average Score": 68.36295609287292,
652
- "Standard Deviation": 1.6558271236588655,
653
- "Rank": 13
654
- },
655
- "Social": {
656
- "Average Score": 0.663410854,
657
- "Standard Deviation": 0.09540220876,
658
- "Rank": 9
659
- },
660
- "Chemistry": {
661
- "Average Score": 70.44337273504232,
662
- "Standard Deviation": null,
663
- "Rank": 12
664
- },
665
- "CPP": {
666
- "Average Score": 73.5404403567132,
667
- "Standard Deviation": null,
668
- "Rank": 8
669
- }
670
- }
671
- },
672
- {
673
- "config": {
674
- "model_name": "gemma-2-9b-it-simpo",
675
- "organization": "Google",
676
- "license": "Gemma License",
677
- "knowledge_cutoff": "2024/07"
678
- },
679
- "results": {
680
- "OVERALL": {
681
- "Average Score": "N/A",
682
- "Standard Deviation": "N/A",
683
- "Rank": "N/A"
684
- },
685
- "Geometry": {
686
- "Average Score": 0.582787508,
687
- "Standard Deviation": 0.03965204074,
688
- "Rank": 18
689
- },
690
- "Algebra": {
691
- "Average Score": 0.658648133,
692
- "Standard Deviation": 0.02565919856,
693
- "Rank": 13
694
- },
695
- "Probability": {
696
- "Average Score": 57.545408188912894,
697
- "Standard Deviation": null,
698
- "Rank": 23
699
- },
700
- "Logical": {
701
- "Average Score": 53.1996479262466,
702
- "Standard Deviation": 2.690106544431167,
703
- "Rank": 23
704
- },
705
- "Social": {
706
- "Average Score": 0.635266187,
707
- "Standard Deviation": 0.03620021751,
708
- "Rank": 12
709
- },
710
- "Chemistry": {
711
- "Average Score": 74.44267231381626,
712
- "Standard Deviation": null,
713
- "Rank": 11
714
- },
715
- "CPP": {
716
- "Average Score": 73.43757596214863,
717
- "Standard Deviation": null,
718
- "Rank": 9
719
- }
720
- }
721
- },
722
- {
723
- "config": {
724
- "model_name": "qwen1.5-72b-chat",
725
- "organization": "Alibaba",
726
- "license": "Qianwen LICENSE",
727
- "knowledge_cutoff": "2024/03"
728
- },
729
- "results": {
730
- "OVERALL": {
731
- "Average Score": 52.983715751652085,
732
- "Standard Deviation": 3.097613966427763,
733
- "Rank": 18
734
- },
735
- "Geometry": {
736
- "Average Score": 0.543139301,
737
- "Standard Deviation": 0.03425202326,
738
- "Rank": 22
739
- },
740
- "Algebra": {
741
- "Average Score": 0.635228729,
742
- "Standard Deviation": 0.01944043425,
743
- "Rank": 14
744
- },
745
- "Probability": {
746
- "Average Score": 52.650033879924905,
747
- "Standard Deviation": null,
748
- "Rank": 26
749
- },
750
- "Logical": {
751
- "Average Score": 32.628853250402074,
752
- "Standard Deviation": 3.227745519436025,
753
- "Rank": 37
754
- },
755
- "Social": {
756
- "Average Score": 0.415007627,
757
- "Standard Deviation": 0.03920053159,
758
- "Rank": 22
759
- },
760
- "Chemistry": {
761
- "Average Score": 47.5126781973184,
762
- "Standard Deviation": null,
763
- "Rank": 24
764
- },
765
- "CPP": {
766
- "Average Score": 48.69302376665551,
767
- "Standard Deviation": null,
768
- "Rank": 20
769
- }
770
- }
771
- },
772
- {
773
- "config": {
774
- "model_name": "qwen1.5-32b-chat",
775
- "organization": "Alibaba",
776
- "license": "Qianwen LICENSE",
777
- "knowledge_cutoff": "2024/03"
778
- },
779
- "results": {
780
- "OVERALL": {
781
- "Average Score": 26.978561942890224,
782
- "Standard Deviation": 1.575986887925592,
783
- "Rank": 32
784
- },
785
- "Geometry": {
786
- "Average Score": 0.51086835,
787
- "Standard Deviation": 0.04052471998,
788
- "Rank": 25
789
- },
790
- "Algebra": {
791
- "Average Score": 0.609003168,
792
- "Standard Deviation": 0.04874143541,
793
- "Rank": 15
794
- },
795
- "Probability": {
796
- "Average Score": 49.50617919486678,
797
- "Standard Deviation": null,
798
- "Rank": 29
799
- },
800
- "Logical": {
801
- "Average Score": 34.07387941414556,
802
- "Standard Deviation": 4.616974831074921,
803
- "Rank": 34
804
- },
805
- "Social": {
806
- "Average Score": 0.380987334,
807
- "Standard Deviation": 0.03762251776,
808
- "Rank": 24
809
- },
810
- "Chemistry": {
811
- "Average Score": 44.06627265183811,
812
- "Standard Deviation": null,
813
- "Rank": 28
814
- },
815
- "CPP": {
816
- "Average Score": 45.14284028264288,
817
- "Standard Deviation": null,
818
- "Rank": 24
819
- }
820
- }
821
- },
822
- {
823
- "config": {
824
- "model_name": "google-gemma-2-9b-it",
825
- "organization": "Google",
826
- "license": "Proprietary",
827
- "knowledge_cutoff": "2024/06"
828
- },
829
- "results": {
830
- "OVERALL": {
831
- "Average Score": 52.23013018580635,
832
- "Standard Deviation": 3.3939236141078495,
833
- "Rank": 19
834
- },
835
- "Geometry": {
836
- "Average Score": 0.575371308,
837
- "Standard Deviation": 0.03556220251,
838
- "Rank": 20
839
- },
840
- "Algebra": {
841
- "Average Score": 0.597045661,
842
- "Standard Deviation": 0.0313828123,
843
- "Rank": 16
844
- },
845
- "Probability": {
846
- "Average Score": 58.73062101843859,
847
- "Standard Deviation": null,
848
- "Rank": 21
849
- },
850
- "Logical": {
851
- "Average Score": 58.01791397899675,
852
- "Standard Deviation": 5.751983660134971,
853
- "Rank": 21
854
- },
855
- "Social": {
856
- "Average Score": 0.768337958,
857
- "Standard Deviation": 0.04078610476,
858
- "Rank": 5
859
- },
860
- "Chemistry": {
861
- "Average Score": 52.69494515004607,
862
- "Standard Deviation": null,
863
- "Rank": 21
864
- },
865
- "CPP": {
866
- "Average Score": 54.03167523687635,
867
- "Standard Deviation": null,
868
- "Rank": 17
869
- }
870
- }
871
- },
872
- {
873
- "config": {
874
- "model_name": "yi-1.5-34b-chat",
875
- "organization": "01 AI",
876
- "license": "Proprietary",
877
- "knowledge_cutoff": "2024/05"
878
- },
879
- "results": {
880
- "OVERALL": {
881
- "Average Score": 62.568637878216464,
882
- "Standard Deviation": 8.554205798418673,
883
- "Rank": 16
884
- },
885
- "Geometry": {
886
- "Average Score": 0.566666724,
887
- "Standard Deviation": 0.04001381658,
888
- "Rank": 21
889
- },
890
- "Algebra": {
891
- "Average Score": 0.590997292,
892
- "Standard Deviation": 0.03594087315,
893
- "Rank": 17
894
- },
895
- "Probability": {
896
- "Average Score": 57.545207891104354,
897
- "Standard Deviation": null,
898
- "Rank": 22
899
- },
900
- "Logical": {
901
- "Average Score": 56.598158131627194,
902
- "Standard Deviation": 1.1072821075127297,
903
- "Rank": 22
904
- },
905
- "Social": {
906
- "Average Score": 0.516980832,
907
- "Standard Deviation": 0.03369347985,
908
- "Rank": 17
909
- },
910
- "Chemistry": {
911
- "Average Score": 50.867343712131174,
912
- "Standard Deviation": null,
913
- "Rank": 22
914
- },
915
- "CPP": {
916
- "Average Score": 52.148798061768964,
917
- "Standard Deviation": null,
918
- "Rank": 18
919
- }
920
- }
921
- },
922
- {
923
- "config": {
924
- "model_name": "meta-llama-3.1-70b-instruct",
925
- "organization": "Meta",
926
- "license": "Llama 3.1 Community",
927
- "knowledge_cutoff": "2023/12"
928
- },
929
- "results": {
930
- "OVERALL": {
931
- "Average Score": 65.61302047306724,
932
- "Standard Deviation": 7.113338386318571,
933
- "Rank": 14
934
- },
935
- "Geometry": {
936
- "Average Score": 0.76184398,
937
- "Standard Deviation": 0.01790377984,
938
- "Rank": 10
939
- },
940
- "Algebra": {
941
- "Average Score": 0.732041699,
942
- "Standard Deviation": 0.02621439062,
943
- "Rank": 9
944
- },
945
- "Probability": {
946
- "Average Score": 65.4531285887158,
947
- "Standard Deviation": null,
948
- "Rank": 13
949
- },
950
- "Logical": {
951
- "Average Score": 61.16321386785366,
952
- "Standard Deviation": 0.8920966760646541,
953
- "Rank": 18
954
- },
955
- "Social": {
956
- "Average Score": 0.45872939,
957
- "Standard Deviation": 0.05347039576,
958
- "Rank": 20
959
- },
960
- "Chemistry": {
961
- "Average Score": 76.03374498429748,
962
- "Standard Deviation": null,
963
- "Rank": 9
964
- },
965
- "CPP": {
966
- "Average Score": 84.36815192532764,
967
- "Standard Deviation": null,
968
- "Rank": 4
969
- }
970
- }
971
- },
972
- {
973
- "config": {
974
- "model_name": "meta-llama-3.1-8b-instruct",
975
- "organization": "Meta",
976
- "license": "Llama 3.1 Community",
977
- "knowledge_cutoff": "2023/12"
978
- },
979
- "results": {
980
- "OVERALL": {
981
- "Average Score": 48.86242501618216,
982
- "Standard Deviation": 3.7761459978540257,
983
- "Rank": 21
984
- },
985
- "Geometry": {
986
- "Average Score": 0.522442162,
987
- "Standard Deviation": 0.03908236317,
988
- "Rank": 23
989
- },
990
- "Algebra": {
991
- "Average Score": 0.582702645,
992
- "Standard Deviation": 0.05002277711,
993
- "Rank": 18
994
- },
995
- "Probability": {
996
- "Average Score": 52.44179989233465,
997
- "Standard Deviation": null,
998
- "Rank": 27
999
- },
1000
- "Logical": {
1001
- "Average Score": 43.3706774850582,
1002
- "Standard Deviation": 2.820707319899787,
1003
- "Rank": 28
1004
- },
1005
- "Social": {
1006
- "Average Score": 0.329195941,
1007
- "Standard Deviation": 0.03925019528,
1008
- "Rank": 28
1009
- },
1010
- "Chemistry": {
1011
- "Average Score": 43.36264580455019,
1012
- "Standard Deviation": null,
1013
- "Rank": 30
1014
- },
1015
- "CPP": {
1016
- "Average Score": 44.41846841004584,
1017
- "Standard Deviation": null,
1018
- "Rank": 26
1019
- }
1020
- }
1021
- },
1022
- {
1023
- "config": {
1024
- "model_name": "gpt3.5-turbo-0125",
1025
- "organization": "OpenAI",
1026
- "license": "Proprietary",
1027
- "knowledge_cutoff": "2021/09"
1028
- },
1029
- "results": {
1030
- "OVERALL": {
1031
- "Average Score": 18.951737690142235,
1032
- "Standard Deviation": 0.7967088395458379,
1033
- "Rank": 42
1034
- },
1035
- "Geometry": {
1036
- "Average Score": 0.678714519,
1037
- "Standard Deviation": 0.05926546762,
1038
- "Rank": 12
1039
- },
1040
- "Algebra": {
1041
- "Average Score": 0.569296173,
1042
- "Standard Deviation": 0.05277281097,
1043
- "Rank": 19
1044
- },
1045
- "Probability": {
1046
- "Average Score": 45.77959177088119,
1047
- "Standard Deviation": null,
1048
- "Rank": 30
1049
- },
1050
- "Logical": {
1051
- "Average Score": 17.159084771200394,
1052
- "Standard Deviation": 2.5845422782742546,
1053
- "Rank": 48
1054
- },
1055
- "Social": {
1056
- "Average Score": 0.235071541,
1057
- "Standard Deviation": 0.02632892457,
1058
- "Rank": 37
1059
- },
1060
- "Chemistry": {
1061
- "Average Score": 39.52885225927276,
1062
- "Standard Deviation": null,
1063
- "Rank": 33
1064
- },
1065
- "CPP": {
1066
- "Average Score": 40.46958736582551,
1067
- "Standard Deviation": null,
1068
- "Rank": 29
1069
- }
1070
- }
1071
- },
1072
- {
1073
- "config": {
1074
- "model_name": "llama-3-70b-instruct",
1075
- "organization": "Meta",
1076
- "license": "Llama 3 Community",
1077
- "knowledge_cutoff": "2023/12"
1078
- },
1079
- "results": {
1080
- "OVERALL": {
1081
- "Average Score": 40.57810915454436,
1082
- "Standard Deviation": 1.3134243733127455,
1083
- "Rank": 26
1084
- },
1085
- "Geometry": {
1086
- "Average Score": 0.516865529,
1087
- "Standard Deviation": 0.03858112564,
1088
- "Rank": 24
1089
- },
1090
- "Algebra": {
1091
- "Average Score": 0.566756531,
1092
- "Standard Deviation": 0.03369826926,
1093
- "Rank": 20
1094
- },
1095
- "Probability": {
1096
- "Average Score": 52.64997876875813,
1097
- "Standard Deviation": null,
1098
- "Rank": 25
1099
- },
1100
- "Logical": {
1101
- "Average Score": 70.51651844158742,
1102
- "Standard Deviation": 0.12355022869457871,
1103
- "Rank": 12
1104
- },
1105
- "Social": {
1106
- "Average Score": 0.45872939,
1107
- "Standard Deviation": 0.05347039576,
1108
- "Rank": 20
1109
- },
1110
- "Chemistry": {
1111
- "Average Score": 63.65476403379996,
1112
- "Standard Deviation": null,
1113
- "Rank": 17
1114
- },
1115
- "CPP": {
1116
- "Average Score": 65.32140697218945,
1117
- "Standard Deviation": null,
1118
- "Rank": 13
1119
- }
1120
- }
1121
- },
1122
- {
1123
- "config": {
1124
- "model_name": "claude-3-sonnet",
1125
- "organization": "Anthropic",
1126
- "license": "Proprietary",
1127
- "knowledge_cutoff": "2023/08"
1128
- },
1129
- "results": {
1130
- "OVERALL": {
1131
- "Average Score": 52.19088595402735,
1132
- "Standard Deviation": 3.743258734262917,
1133
- "Rank": 20
1134
- },
1135
- "Geometry": {
1136
- "Average Score": 0.675613638,
1137
- "Standard Deviation": 0.05275594408,
1138
- "Rank": 13
1139
- },
1140
- "Algebra": {
1141
- "Average Score": 0.552025728,
1142
- "Standard Deviation": 0.04122192409,
1143
- "Rank": 21
1144
- },
1145
- "Probability": {
1146
- "Average Score": 54.0284459891417,
1147
- "Standard Deviation": null,
1148
- "Rank": 24
1149
- },
1150
- "Logical": {
1151
- "Average Score": 58.099761779812475,
1152
- "Standard Deviation": 7.815595203680491,
1153
- "Rank": 20
1154
- },
1155
- "Social": {
1156
- "Average Score": 0.570437582,
1157
- "Standard Deviation": 0.08607040862,
1158
- "Rank": 14
1159
- },
1160
- "Chemistry": {
1161
- "Average Score": 59.784958090634056,
1162
- "Standard Deviation": null,
1163
- "Rank": 19
1164
- },
1165
- "CPP": {
1166
- "Average Score": 61.33538592327427,
1167
- "Standard Deviation": null,
1168
- "Rank": 15
1169
- }
1170
- }
1171
- },
1172
- {
1173
- "config": {
1174
- "model_name": "qwen1.5-14b-chat",
1175
- "organization": "Alibaba",
1176
- "license": "Qianwen LICENSE",
1177
- "knowledge_cutoff": "2024/02"
1178
- },
1179
- "results": {
1180
- "OVERALL": {
1181
- "Average Score": 31.56999734729493,
1182
- "Standard Deviation": 5.42704987916441,
1183
- "Rank": 29
1184
- },
1185
- "Geometry": {
1186
- "Average Score": 0.452504016,
1187
- "Standard Deviation": 0.04225594393,
1188
- "Rank": 26
1189
- },
1190
- "Algebra": {
1191
- "Average Score": 0.538655725,
1192
- "Standard Deviation": 0.03721542594,
1193
- "Rank": 22
1194
- },
1195
- "Probability": {
1196
- "Average Score": 41.027908758027046,
1197
- "Standard Deviation": null,
1198
- "Rank": 35
1199
- },
1200
- "Logical": {
1201
- "Average Score": 31.638560769720616,
1202
- "Standard Deviation": 3.175225377796435,
1203
- "Rank": 38
1204
- },
1205
- "Social": {
1206
- "Average Score": 0.287370142,
1207
- "Standard Deviation": 0.04264085315,
1208
- "Rank": 30
1209
- },
1210
- "Chemistry": {
1211
- "Average Score": 37.667977565724996,
1212
- "Standard Deviation": null,
1213
- "Rank": 35
1214
- },
1215
- "CPP": {
1216
- "Average Score": 38.552779976347026,
1217
- "Standard Deviation": null,
1218
- "Rank": 31
1219
- }
1220
- }
1221
- },
1222
- {
1223
- "config": {
1224
- "model_name": "claude-3-haiku",
1225
- "organization": "Anthropic",
1226
- "license": "Proprietary",
1227
- "knowledge_cutoff": "2023/08"
1228
- },
1229
- "results": {
1230
- "OVERALL": {
1231
- "Average Score": 42.975259650014074,
1232
- "Standard Deviation": 2.248602505751528,
1233
- "Rank": 25
1234
- },
1235
- "Geometry": {
1236
- "Average Score": 0.607993912,
1237
- "Standard Deviation": 0.05793460748,
1238
- "Rank": 15
1239
- },
1240
- "Algebra": {
1241
- "Average Score": 0.520054055,
1242
- "Standard Deviation": 0.03333544511,
1243
- "Rank": 23
1244
- },
1245
- "Probability": {
1246
- "Average Score": 52.44184603289214,
1247
- "Standard Deviation": null,
1248
- "Rank": 28
1249
- },
1250
- "Logical": {
1251
- "Average Score": 50.38523351226464,
1252
- "Standard Deviation": 1.9928131873345676,
1253
- "Rank": 24
1254
- },
1255
- "Social": {
1256
- "Average Score": 0.551083976,
1257
- "Standard Deviation": 0.05374722539,
1258
- "Rank": 16
1259
- },
1260
- "Chemistry": {
1261
- "Average Score": 54.99584531372778,
1262
- "Standard Deviation": null,
1263
- "Rank": 20
1264
- },
1265
- "CPP": {
1266
- "Average Score": 56.40200048817984,
1267
- "Standard Deviation": null,
1268
- "Rank": 16
1269
- }
1270
- }
1271
- },
1272
- {
1273
- "config": {
1274
- "model_name": "claude-2.1",
1275
- "organization": "Anthropic",
1276
- "license": "Proprietary",
1277
- "knowledge_cutoff": "Unknown"
1278
- },
1279
- "results": {
1280
- "OVERALL": {
1281
- "Average Score": 23.82704986290717,
1282
- "Standard Deviation": 1.6337262681919007,
1283
- "Rank": 37
1284
- },
1285
- "Geometry": {
1286
- "Average Score": 0.62752395,
1287
- "Standard Deviation": 0.07232659398,
1288
- "Rank": 14
1289
- },
1290
- "Algebra": {
1291
- "Average Score": 0.508849609,
1292
- "Standard Deviation": 0.0346897465,
1293
- "Rank": 24
1294
- },
1295
- "Probability": {
1296
- "Average Score": 42.82280874207299,
1297
- "Standard Deviation": null,
1298
- "Rank": 32
1299
- },
1300
- "Logical": {
1301
- "Average Score": 47.40647506260718,
1302
- "Standard Deviation": 3.5140099122016686,
1303
- "Rank": 25
1304
- },
1305
- "Social": {
1306
- "Average Score": 0.333804568,
1307
- "Standard Deviation": 0.03775548253,
1308
- "Rank": 27
1309
- },
1310
- "Chemistry": {
1311
- "Average Score": 46.09889239661357,
1312
- "Standard Deviation": null,
1313
- "Rank": 25
1314
- },
1315
- "CPP": {
1316
- "Average Score": 47.23672563994903,
1317
- "Standard Deviation": null,
1318
- "Rank": 21
1319
- }
1320
- }
1321
- },
1322
- {
1323
- "config": {
1324
- "model_name": "mistral-8x7b-instruct-v0.1",
1325
- "organization": "Mistral",
1326
- "license": "Apache 2.0",
1327
- "knowledge_cutoff": "2023/12"
1328
- },
1329
- "results": {
1330
- "OVERALL": {
1331
- "Average Score": 26.279729527476174,
1332
- "Standard Deviation": 1.7823676900027476,
1333
- "Rank": 33
1334
- },
1335
- "Geometry": {
1336
- "Average Score": 0.432216097,
1337
- "Standard Deviation": 0.04747949254,
1338
- "Rank": 29
1339
- },
1340
- "Algebra": {
1341
- "Average Score": 0.478314888,
1342
- "Standard Deviation": 0.01998797419,
1343
- "Rank": 25
1344
- },
1345
- "Probability": {
1346
- "Average Score": 42.27303178662447,
1347
- "Standard Deviation": null,
1348
- "Rank": 33
1349
- },
1350
- "Logical": {
1351
- "Average Score": 34.58281320758576,
1352
- "Standard Deviation": 2.5548927504271073,
1353
- "Rank": 33
1354
- },
1355
- "Social": {
1356
- "Average Score": 0.251949622,
1357
- "Standard Deviation": 0.03346674405,
1358
- "Rank": 35
1359
- },
1360
- "Chemistry": {
1361
- "Average Score": 43.47423835615602,
1362
- "Standard Deviation": null,
1363
- "Rank": 29
1364
- },
1365
- "CPP": {
1366
- "Average Score": 44.533118241976666,
1367
- "Standard Deviation": null,
1368
- "Rank": 25
1369
- }
1370
- }
1371
- },
1372
- {
1373
- "config": {
1374
- "model_name": "claude-2.0",
1375
- "organization": "Anthropic",
1376
- "license": "Proprietary",
1377
- "knowledge_cutoff": "Unknown"
1378
- },
1379
- "results": {
1380
- "OVERALL": {
1381
- "Average Score": 20.490629074737296,
1382
- "Standard Deviation": 0.4821482730133453,
1383
- "Rank": 40
1384
- },
1385
- "Geometry": {
1386
- "Average Score": 0.604141967,
1387
- "Standard Deviation": 0.05116441826,
1388
- "Rank": 16
1389
- },
1390
- "Algebra": {
1391
- "Average Score": 0.474350734,
1392
- "Standard Deviation": 0.01510393066,
1393
- "Rank": 26
1394
- },
1395
- "Probability": {
1396
- "Average Score": 45.15580067803421,
1397
- "Standard Deviation": null,
1398
- "Rank": 31
1399
- },
1400
- "Logical": {
1401
- "Average Score": 43.65660021552717,
1402
- "Standard Deviation": 4.959029305063026,
1403
- "Rank": 27
1404
- },
1405
- "Social": {
1406
- "Average Score": 0.469422836,
1407
- "Standard Deviation": 0.05999901796,
1408
- "Rank": 19
1409
- },
1410
- "Chemistry": {
1411
- "Average Score": 49.53201090067431,
1412
- "Standard Deviation": null,
1413
- "Rank": 23
1414
- },
1415
- "CPP": {
1416
- "Average Score": 50.773143448036464,
1417
- "Standard Deviation": null,
1418
- "Rank": 19
1419
- }
1420
- }
1421
- },
1422
- {
1423
- "config": {
1424
- "model_name": "starling-lm-7b-beta",
1425
- "organization": "Nexusflow",
1426
- "license": "Apache-2.0",
1427
- "knowledge_cutoff": "2024/03"
1428
- },
1429
- "results": {
1430
- "OVERALL": {
1431
- "Average Score": 43.0415265396966,
1432
- "Standard Deviation": 0.8770524316858576,
1433
- "Rank": 24
1434
- },
1435
- "Geometry": {
1436
- "Average Score": 0.446654388,
1437
- "Standard Deviation": 0.05637864999,
1438
- "Rank": 28
1439
- },
1440
- "Algebra": {
1441
- "Average Score": 0.473952749,
1442
- "Standard Deviation": 0.01584301288,
1443
- "Rank": 27
1444
- },
1445
- "Probability": {
1446
- "Average Score": 41.320066911500234,
1447
- "Standard Deviation": null,
1448
- "Rank": 34
1449
- },
1450
- "Logical": {
1451
- "Average Score": 39.79665241383638,
1452
- "Standard Deviation": 3.4711628274016544,
1453
- "Rank": 30
1454
- },
1455
- "Social": {
1456
- "Average Score": 0.380021662,
1457
- "Standard Deviation": 0.04622452748,
1458
- "Rank": 25
1459
- },
1460
- "Chemistry": {
1461
- "Average Score": 37.39896886078588,
1462
- "Standard Deviation": null,
1463
- "Rank": 36
1464
- },
1465
- "CPP": {
1466
- "Average Score": 38.27587102395908,
1467
- "Standard Deviation": null,
1468
- "Rank": 32
1469
- }
1470
- }
1471
- },
1472
- {
1473
- "config": {
1474
- "model_name": "gemini-1.0-pro-001",
1475
- "organization": "Google",
1476
- "license": "Proprietary",
1477
- "knowledge_cutoff": "2023/04"
1478
- },
1479
- "results": {
1480
- "OVERALL": {
1481
- "Average Score": 45.78126809517331,
1482
- "Standard Deviation": 3.7275133674569783,
1483
- "Rank": 23
1484
- },
1485
- "Geometry": {
1486
- "Average Score": 0.578347959,
1487
- "Standard Deviation": 0.04242873607,
1488
- "Rank": 19
1489
- },
1490
- "Algebra": {
1491
- "Average Score": 0.462417786,
1492
- "Standard Deviation": 0.01668313635,
1493
- "Rank": 28
1494
- },
1495
- "Probability": {
1496
- "Average Score": 31.410607001114293,
1497
- "Standard Deviation": null,
1498
- "Rank": 42
1499
- },
1500
- "Logical": {
1501
- "Average Score": 21.717362428653246,
1502
- "Standard Deviation": 4.392290522642325,
1503
- "Rank": 44
1504
- },
1505
- "Social": {
1506
- "Average Score": 0.130790863,
1507
- "Standard Deviation": 0.02800188173,
1508
- "Rank": 45
1509
- },
1510
- "Chemistry": {
1511
- "Average Score": 44.14314678087462,
1512
- "Standard Deviation": null,
1513
- "Rank": 27
1514
- },
1515
- "CPP": {
1516
- "Average Score": 45.22204471452975,
1517
- "Standard Deviation": null,
1518
- "Rank": 23
1519
- }
1520
- }
1521
- },
1522
- {
1523
- "config": {
1524
- "model_name": "openchat-3.5-0106",
1525
- "organization": "OpenChat",
1526
- "license": "Apache-2.0",
1527
- "knowledge_cutoff": "2024/01"
1528
- },
1529
- "results": {
1530
- "OVERALL": {
1531
- "Average Score": 23.85666609339201,
1532
- "Standard Deviation": 1.341285455536348,
1533
- "Rank": 36
1534
- },
1535
- "Geometry": {
1536
- "Average Score": 0.38715246,
1537
- "Standard Deviation": 0.03701851946,
1538
- "Rank": 32
1539
- },
1540
- "Algebra": {
1541
- "Average Score": 0.441233712,
1542
- "Standard Deviation": 0.01135753754,
1543
- "Rank": 29
1544
- },
1545
- "Probability": {
1546
- "Average Score": 40.37790468557232,
1547
- "Standard Deviation": null,
1548
- "Rank": 36
1549
- },
1550
- "Logical": {
1551
- "Average Score": 35.1573373260624,
1552
- "Standard Deviation": 2.485128777146724,
1553
- "Rank": 32
1554
- },
1555
- "Social": {
1556
- "Average Score": 0.250891608,
1557
- "Standard Deviation": 0.03253769914,
1558
- "Rank": 36
1559
- },
1560
- "Chemistry": {
1561
- "Average Score": 32.96322247853182,
1562
- "Standard Deviation": null,
1563
- "Rank": 37
1564
- },
1565
- "CPP": {
1566
- "Average Score": 33.70639271807677,
1567
- "Standard Deviation": null,
1568
- "Rank": 33
1569
- }
1570
- }
1571
- },
1572
- {
1573
- "config": {
1574
- "model_name": "openchat-3.5",
1575
- "organization": "OpenChat",
1576
- "license": "Apache-2.0",
1577
- "knowledge_cutoff": "2023/11"
1578
- },
1579
- "results": {
1580
- "OVERALL": {
1581
- "Average Score": 23.63538251797928,
1582
- "Standard Deviation": 2.0516295921862095,
1583
- "Rank": 38
1584
- },
1585
- "Geometry": {
1586
- "Average Score": 0.401699069,
1587
- "Standard Deviation": 0.03410726557,
1588
- "Rank": 30
1589
- },
1590
- "Algebra": {
1591
- "Average Score": 0.414095336,
1592
- "Standard Deviation": 0.01881964261,
1593
- "Rank": 31
1594
- },
1595
- "Probability": {
1596
- "Average Score": 36.00454588244476,
1597
- "Standard Deviation": null,
1598
- "Rank": 38
1599
- },
1600
- "Logical": {
1601
- "Average Score": 34.029859502735654,
1602
- "Standard Deviation": 3.354098427500673,
1603
- "Rank": 35
1604
- },
1605
- "Social": {
1606
- "Average Score": 0.319991655,
1607
- "Standard Deviation": 0.04502478724,
1608
- "Rank": 29
1609
- },
1610
- "Chemistry": {
1611
- "Average Score": 32.29778226319944,
1612
- "Standard Deviation": null,
1613
- "Rank": 38
1614
- },
1615
- "CPP": {
1616
- "Average Score": 33.020911255646965,
1617
- "Standard Deviation": null,
1618
- "Rank": 34
1619
- }
1620
- }
1621
- },
1622
- {
1623
- "config": {
1624
- "model_name": "command-r-(08-2024)",
1625
- "organization": "Cohere",
1626
- "license": "CC-BY-NC-4.0",
1627
- "knowledge_cutoff": "2024/08"
1628
- },
1629
- "results": {
1630
- "OVERALL": {
1631
- "Average Score": 38.783798277856995,
1632
- "Standard Deviation": 1.1948096596199191,
1633
- "Rank": 27
1634
- },
1635
- "Geometry": {
1636
- "Average Score": 0.448300727,
1637
- "Standard Deviation": 0.04996362328,
1638
- "Rank": 27
1639
- },
1640
- "Algebra": {
1641
- "Average Score": 0.417519167,
1642
- "Standard Deviation": 0.01822196902,
1643
- "Rank": 30
1644
- },
1645
- "Probability": {
1646
- "Average Score": 38.019523941917335,
1647
- "Standard Deviation": null,
1648
- "Rank": 37
1649
- },
1650
- "Logical": {
1651
- "Average Score": 23.408826179018206,
1652
- "Standard Deviation": 0.9355701468205376,
1653
- "Rank": 42
1654
- },
1655
- "Social": {
1656
- "Average Score": 0.276088379,
1657
- "Standard Deviation": 0.03295234688,
1658
- "Rank": 32
1659
- },
1660
- "Chemistry": {
1661
- "Average Score": 38.699171059988636,
1662
- "Standard Deviation": null,
1663
- "Rank": 34
1664
- },
1665
- "CPP": {
1666
- "Average Score": 39.61492485677676,
1667
- "Standard Deviation": null,
1668
- "Rank": 30
1669
- }
1670
- }
1671
- },
1672
- {
1673
- "config": {
1674
- "model_name": "gemma-1.1-7b-it",
1675
- "organization": "Google",
1676
- "license": "Gemma License",
1677
- "knowledge_cutoff": "2024/02"
1678
- },
1679
- "results": {
1680
- "OVERALL": {
1681
- "Average Score": 20.965269549151657,
1682
- "Standard Deviation": 0.6031600560715249,
1683
- "Rank": 39
1684
- },
1685
- "Geometry": {
1686
- "Average Score": 0.324170977,
1687
- "Standard Deviation": 0.04668553765,
1688
- "Rank": 35
1689
- },
1690
- "Algebra": {
1691
- "Average Score": 0.398684697,
1692
- "Standard Deviation": 0.01982398259,
1693
- "Rank": 32
1694
- },
1695
- "Probability": {
1696
- "Average Score": 30.98345832281905,
1697
- "Standard Deviation": null,
1698
- "Rank": 43
1699
- },
1700
- "Logical": {
1701
- "Average Score": 33.36570116785516,
1702
- "Standard Deviation": 3.8824795120929765,
1703
- "Rank": 36
1704
- },
1705
- "Social": {
1706
- "Average Score": 0.179073276,
1707
- "Standard Deviation": 0.02009658805,
1708
- "Rank": 41
1709
- },
1710
- "Chemistry": {
1711
- "Average Score": 41.66173653808921,
1712
- "Standard Deviation": null,
1713
- "Rank": 31
1714
- },
1715
- "CPP": {
1716
- "Average Score": 42.666504105798204,
1717
- "Standard Deviation": null,
1718
- "Rank": 27
1719
- }
1720
- }
1721
- },
1722
- {
1723
- "config": {
1724
- "model_name": "llama3-8b-instruct",
1725
- "organization": "Meta",
1726
- "license": "Llama 3 Community",
1727
- "knowledge_cutoff": "2023/03"
1728
- },
1729
- "results": {
1730
- "OVERALL": {
1731
- "Average Score": 30.183633696164936,
1732
- "Standard Deviation": 3.5901082045571266,
1733
- "Rank": 31
1734
- },
1735
- "Geometry": {
1736
- "Average Score": 0.367143758,
1737
- "Standard Deviation": 0.04363680358,
1738
- "Rank": 33
1739
- },
1740
- "Algebra": {
1741
- "Average Score": 0.391480973,
1742
- "Standard Deviation": 0.02757445266,
1743
- "Rank": 33
1744
- },
1745
- "Probability": {
1746
- "Average Score": 34.51621975866105,
1747
- "Standard Deviation": null,
1748
- "Rank": 39
1749
- },
1750
- "Logical": {
1751
- "Average Score": 45.27560737491475,
1752
- "Standard Deviation": 4.639305724878496,
1753
- "Rank": 26
1754
- },
1755
- "Social": {
1756
- "Average Score": 0.336373622,
1757
- "Standard Deviation": 0.05762408512,
1758
- "Rank": 26
1759
- },
1760
- "Chemistry": {
1761
- "Average Score": 44.271144265487514,
1762
- "Standard Deviation": null,
1763
- "Rank": 26
1764
- },
1765
- "CPP": {
1766
- "Average Score": 45.35392139264795,
1767
- "Standard Deviation": null,
1768
- "Rank": 22
1769
- }
1770
- }
1771
- },
1772
- {
1773
- "config": {
1774
- "model_name": "gemma-2-2b-it",
1775
- "organization": "Google",
1776
- "license": "Gemma License",
1777
- "knowledge_cutoff": "2024/07"
1778
- },
1779
- "results": {
1780
- "OVERALL": {
1781
- "Average Score": 47.37377937645159,
1782
- "Standard Deviation": 2.72420190928707,
1783
- "Rank": 22
1784
- },
1785
- "Geometry": {
1786
- "Average Score": 0.395006676,
1787
- "Standard Deviation": 0.05882607713,
1788
- "Rank": 31
1789
- },
1790
- "Algebra": {
1791
- "Average Score": 0.379391887,
1792
- "Standard Deviation": 0.01722410785,
1793
- "Rank": 34
1794
- },
1795
- "Probability": {
1796
- "Average Score": 33.90530403382374,
1797
- "Standard Deviation": null,
1798
- "Rank": 41
1799
- },
1800
- "Logical": {
1801
- "Average Score": 37.64262561604027,
1802
- "Standard Deviation": 3.0627256408495804,
1803
- "Rank": 31
1804
- },
1805
- "Social": {
1806
- "Average Score": 0.393482094,
1807
- "Standard Deviation": 0.06450214024,
1808
- "Rank": 23
1809
- },
1810
- "Chemistry": {
1811
- "Average Score": 29.883648650177584,
1812
- "Standard Deviation": null,
1813
- "Rank": 40
1814
- },
1815
- "CPP": {
1816
- "Average Score": 30.53406933106768,
1817
- "Standard Deviation": null,
1818
- "Rank": 36
1819
- }
1820
- }
1821
- },
1822
- {
1823
- "config": {
1824
- "model_name": "starling-lm-7b-alpha",
1825
- "organization": "Nexusflow",
1826
- "license": "Apache-2.0",
1827
- "knowledge_cutoff": "2023/11"
1828
- },
1829
- "results": {
1830
- "OVERALL": {
1831
- "Average Score": 24.34505731078066,
1832
- "Standard Deviation": 1.4660872513914562,
1833
- "Rank": 35
1834
- },
1835
- "Geometry": {
1836
- "Average Score": 0.336782578,
1837
- "Standard Deviation": 0.04069449132,
1838
- "Rank": 34
1839
- },
1840
- "Algebra": {
1841
- "Average Score": 0.371551932,
1842
- "Standard Deviation": 0.03367241745,
1843
- "Rank": 35
1844
- },
1845
- "Probability": {
1846
- "Average Score": 34.51613212227484,
1847
- "Standard Deviation": null,
1848
- "Rank": 40
1849
- },
1850
- "Logical": {
1851
- "Average Score": 29.88612695085449,
1852
- "Standard Deviation": 2.4070524024678672,
1853
- "Rank": 40
1854
- },
1855
- "Social": {
1856
- "Average Score": 0.271975534,
1857
- "Standard Deviation": 0.04266753408,
1858
- "Rank": 33
1859
- },
1860
- "Chemistry": {
1861
- "Average Score": 29.442057363491365,
1862
- "Standard Deviation": null,
1863
- "Rank": 41
1864
- },
1865
- "CPP": {
1866
- "Average Score": 30.07926487356878,
1867
- "Standard Deviation": null,
1868
- "Rank": 37
1869
- }
1870
- }
1871
- },
1872
- {
1873
- "config": {
1874
- "model_name": "qwen1.5-4b-chat",
1875
- "organization": "Alibaba",
1876
- "license": "Qianwen LICENSE",
1877
- "knowledge_cutoff": "2024/02"
1878
- },
1879
- "results": {
1880
- "OVERALL": {
1881
- "Average Score": 7.19753150259024,
1882
- "Standard Deviation": 0.6175113365944395,
1883
- "Rank": 52
1884
- },
1885
- "Geometry": {
1886
- "Average Score": 0.215834522,
1887
- "Standard Deviation": 0.0363766363,
1888
- "Rank": 39
1889
- },
1890
- "Algebra": {
1891
- "Average Score": 0.305589811,
1892
- "Standard Deviation": 0.02354198912,
1893
- "Rank": 36
1894
- },
1895
- "Probability": {
1896
- "Average Score": 15.124506890648007,
1897
- "Standard Deviation": null,
1898
- "Rank": 49
1899
- },
1900
- "Logical": {
1901
- "Average Score": 11.67206257803879,
1902
- "Standard Deviation": 1.140401009846497,
1903
- "Rank": 51
1904
- },
1905
- "Social": {
1906
- "Average Score": 0.18195615,
1907
- "Standard Deviation": 0.02269805277,
1908
- "Rank": 40
1909
- },
1910
- "Chemistry": {
1911
- "Average Score": 12.825435835657133,
1912
- "Standard Deviation": null,
1913
- "Rank": 52
1914
- },
1915
- "CPP": {
1916
- "Average Score": 13.21208067122554,
1917
- "Standard Deviation": null,
1918
- "Rank": 47
1919
- }
1920
- }
1921
- },
1922
- {
1923
- "config": {
1924
- "model_name": "command-r-(04-2024)",
1925
- "organization": "Cohere",
1926
- "license": "CC-BY-NC-4.0",
1927
- "knowledge_cutoff": "2024/04"
1928
- },
1929
- "results": {
1930
- "OVERALL": {
1931
- "Average Score": 26.20787727166716,
1932
- "Standard Deviation": 1.6793980036057201,
1933
- "Rank": 34
1934
- },
1935
- "Geometry": {
1936
- "Average Score": 0.300416698,
1937
- "Standard Deviation": 0.03485612736,
1938
- "Rank": 36
1939
- },
1940
- "Algebra": {
1941
- "Average Score": 0.293120231,
1942
- "Standard Deviation": 0.032926484,
1943
- "Rank": 37
1944
- },
1945
- "Probability": {
1946
- "Average Score": 28.551833516483626,
1947
- "Standard Deviation": null,
1948
- "Rank": 44
1949
- },
1950
- "Logical": {
1951
- "Average Score": 30.83782425033377,
1952
- "Standard Deviation": 3.4266833154577383,
1953
- "Rank": 39
1954
- },
1955
- "Social": {
1956
- "Average Score": 0.283882949,
1957
- "Standard Deviation": 0.03336901148,
1958
- "Rank": 31
1959
- },
1960
- "Chemistry": {
1961
- "Average Score": 40.38004181614496,
1962
- "Standard Deviation": null,
1963
- "Rank": 32
1964
- },
1965
- "CPP": {
1966
- "Average Score": 41.346336503003236,
1967
- "Standard Deviation": null,
1968
- "Rank": 28
1969
- }
1970
- }
1971
- },
1972
- {
1973
- "config": {
1974
- "model_name": "vicuna-33b",
1975
- "organization": "LMSYS",
1976
- "license": "Non-commercial",
1977
- "knowledge_cutoff": "2023/08"
1978
- },
1979
- "results": {
1980
- "OVERALL": {
1981
- "Average Score": 19.726298678709266,
1982
- "Standard Deviation": 1.0771354692793496,
1983
- "Rank": 41
1984
- },
1985
- "Geometry": {
1986
- "Average Score": 0.208284679,
1987
- "Standard Deviation": 0.03937771461,
1988
- "Rank": 40
1989
- },
1990
- "Algebra": {
1991
- "Average Score": 0.248994048,
1992
- "Standard Deviation": 0.02668175054,
1993
- "Rank": 39
1994
- },
1995
- "Probability": {
1996
- "Average Score": 23.2308538772627,
1997
- "Standard Deviation": null,
1998
- "Rank": 47
1999
- },
2000
- "Logical": {
2001
- "Average Score": 19.488409585540122,
2002
- "Standard Deviation": 0.7913465863319494,
2003
- "Rank": 46
2004
- },
2005
- "Social": {
2006
- "Average Score": 0.257623798,
2007
- "Standard Deviation": 0.02653724437,
2008
- "Rank": 34
2009
- },
2010
- "Chemistry": {
2011
- "Average Score": 27.198874596635843,
2012
- "Standard Deviation": null,
2013
- "Rank": 43
2014
- },
2015
- "CPP": {
2016
- "Average Score": 28.01838653090379,
2017
- "Standard Deviation": null,
2018
- "Rank": 38
2019
- }
2020
- }
2021
- },
2022
- {
2023
- "config": {
2024
- "model_name": "gemma-7b-it",
2025
- "organization": "Google",
2026
- "license": "Gemma License",
2027
- "knowledge_cutoff": "2024/02"
2028
- },
2029
- "results": {
2030
- "OVERALL": {
2031
- "Average Score": 18.339626858215343,
2032
- "Standard Deviation": 0.1553156123023995,
2033
- "Rank": 43
2034
- },
2035
- "Geometry": {
2036
- "Average Score": 0.244791417,
2037
- "Standard Deviation": 0.0289612078,
2038
- "Rank": 37
2039
- },
2040
- "Algebra": {
2041
- "Average Score": 0.250614794,
2042
- "Standard Deviation": 0.01991678295,
2043
- "Rank": 38
2044
- },
2045
- "Probability": {
2046
- "Average Score": 18.066869704202595,
2047
- "Standard Deviation": null,
2048
- "Rank": 48
2049
- },
2050
- "Logical": {
2051
- "Average Score": 22.446113532575186,
2052
- "Standard Deviation": 1.1759308097806727,
2053
- "Rank": 43
2054
- },
2055
- "Social": {
2056
- "Average Score": 0.202138025,
2057
- "Standard Deviation": 0.02098346639,
2058
- "Rank": 39
2059
- },
2060
- "Chemistry": {
2061
- "Average Score": 27.195166540671735,
2062
- "Standard Deviation": null,
2063
- "Rank": 43
2064
- },
2065
- "CPP": {
2066
- "Average Score": 28.014658234926813,
2067
- "Standard Deviation": null,
2068
- "Rank": 39
2069
- }
2070
- }
2071
- },
2072
- {
2073
- "config": {
2074
- "model_name": "mistral-7b-instruct-2",
2075
- "organization": "Mistral",
2076
- "license": "Apache 2.0",
2077
- "knowledge_cutoff": "2023/12"
2078
- },
2079
- "results": {
2080
- "OVERALL": {
2081
- "Average Score": 32.27919528900069,
2082
- "Standard Deviation": 2.070593349377193,
2083
- "Rank": 28
2084
- },
2085
- "Geometry": {
2086
- "Average Score": 0.216402626,
2087
- "Standard Deviation": 0.03338414918,
2088
- "Rank": 38
2089
- },
2090
- "Algebra": {
2091
- "Average Score": 0.233777838,
2092
- "Standard Deviation": 0.0155226054,
2093
- "Rank": 40
2094
- },
2095
- "Probability": {
2096
- "Average Score": 25.70261650740474,
2097
- "Standard Deviation": null,
2098
- "Rank": 45
2099
- },
2100
- "Logical": {
2101
- "Average Score": 26.165635051797608,
2102
- "Standard Deviation": 1.5009510944001014,
2103
- "Rank": 41
2104
- },
2105
- "Social": {
2106
- "Average Score": 0.209386782,
2107
- "Standard Deviation": 0.02738569921,
2108
- "Rank": 38
2109
- },
2110
- "Chemistry": {
2111
- "Average Score": 30.70773868184025,
2112
- "Standard Deviation": null,
2113
- "Rank": 39
2114
- },
2115
- "CPP": {
2116
- "Average Score": 31.382959631870822,
2117
- "Standard Deviation": null,
2118
- "Rank": 35
2119
- }
2120
- }
2121
- },
2122
- {
2123
- "config": {
2124
- "model_name": "mistral-7b-instruct-1",
2125
- "organization": "Mistral",
2126
- "license": "Apache 2.0",
2127
- "knowledge_cutoff": "2023/12"
2128
- },
2129
- "results": {
2130
- "OVERALL": {
2131
- "Average Score": 14.750363553682964,
2132
- "Standard Deviation": 0.442399072321264,
2133
- "Rank": 48
2134
- },
2135
- "Geometry": {
2136
- "Average Score": 0.161799938,
2137
- "Standard Deviation": 0.03595278559,
2138
- "Rank": 44
2139
- },
2140
- "Algebra": {
2141
- "Average Score": 0.210341624,
2142
- "Standard Deviation": 0.01736539119,
2143
- "Rank": 41
2144
- },
2145
- "Probability": {
2146
- "Average Score": 24.69501890202338,
2147
- "Standard Deviation": null,
2148
- "Rank": 46
2149
- },
2150
- "Logical": {
2151
- "Average Score": 15.957706802740889,
2152
- "Standard Deviation": 2.080778273455708,
2153
- "Rank": 50
2154
- },
2155
- "Social": {
2156
- "Average Score": 0.117646827,
2157
- "Standard Deviation": 0.009321202779,
2158
- "Rank": 47
2159
- },
2160
- "Chemistry": {
2161
- "Average Score": 18.375111202411667,
2162
- "Standard Deviation": null,
2163
- "Rank": 47
2164
- },
2165
- "CPP": {
2166
- "Average Score": 18.929093202755805,
2167
- "Standard Deviation": null,
2168
- "Rank": 42
2169
- }
2170
- }
2171
- },
2172
- {
2173
- "config": {
2174
- "model_name": "vicuna-13b",
2175
- "organization": "LMSYS",
2176
- "license": "Non-commercial",
2177
- "knowledge_cutoff": "2023/07"
2178
- },
2179
- "results": {
2180
- "OVERALL": {
2181
- "Average Score": 13.302607436757697,
2182
- "Standard Deviation": 0.570272227659312,
2183
- "Rank": 50
2184
- },
2185
- "Geometry": {
2186
- "Average Score": 0.200941928,
2187
- "Standard Deviation": 0.03366817781,
2188
- "Rank": 41
2189
- },
2190
- "Algebra": {
2191
- "Average Score": 0.196123323,
2192
- "Standard Deviation": 0.0135715643,
2193
- "Rank": 42
2194
- },
2195
- "Probability": {
2196
- "Average Score": 15.08476669604627,
2197
- "Standard Deviation": null,
2198
- "Rank": 50
2199
- },
2200
- "Logical": {
2201
- "Average Score": 16.548339412104294,
2202
- "Standard Deviation": 3.443370777556759,
2203
- "Rank": 49
2204
- },
2205
- "Social": {
2206
- "Average Score": 0.124655135,
2207
- "Standard Deviation": 0.01122382671,
2208
- "Rank": 46
2209
- },
2210
- "Chemistry": {
2211
- "Average Score": 21.201173318496842,
2212
- "Standard Deviation": null,
2213
- "Rank": 45
2214
- },
2215
- "CPP": {
2216
- "Average Score": 21.840013221590294,
2217
- "Standard Deviation": null,
2218
- "Rank": 40
2219
- }
2220
- }
2221
- },
2222
- {
2223
- "config": {
2224
- "model_name": "zephyr-7b-beta",
2225
- "organization": "HuggingFace",
2226
- "license": "MIT",
2227
- "knowledge_cutoff": "2023/10"
2228
- },
2229
- "results": {
2230
- "OVERALL": {
2231
- "Average Score": 7.378234886105356,
2232
- "Standard Deviation": 1.1456147261693999,
2233
- "Rank": 51
2234
- },
2235
- "Geometry": {
2236
- "Average Score": 0.114005544,
2237
- "Standard Deviation": 0.03144354365,
2238
- "Rank": 45
2239
- },
2240
- "Algebra": {
2241
- "Average Score": 0.141766633,
2242
- "Standard Deviation": 0.03179520129,
2243
- "Rank": 43
2244
- },
2245
- "Probability": {
2246
- "Average Score": 8.92696070171298,
2247
- "Standard Deviation": null,
2248
- "Rank": 53
2249
- },
2250
- "Logical": {
2251
- "Average Score": 6.971377981442089,
2252
- "Standard Deviation": 0.31669853263737413,
2253
- "Rank": 55
2254
- },
2255
- "Social": {
2256
- "Average Score": 0.0,
2257
- "Standard Deviation": 0.0,
2258
- "Rank": 52
2259
- },
2260
- "Chemistry": {
2261
- "Average Score": 18.374948840997902,
2262
- "Standard Deviation": null,
2263
- "Rank": 47
2264
- },
2265
- "CPP": {
2266
- "Average Score": 18.92902220864132,
2267
- "Standard Deviation": null,
2268
- "Rank": 43
2269
- }
2270
- }
2271
- },
2272
- {
2273
- "config": {
2274
- "model_name": "gemma-1.1-2b-it",
2275
- "organization": "Google",
2276
- "license": "Gemma License",
2277
- "knowledge_cutoff": "2024/02"
2278
- },
2279
- "results": {
2280
- "OVERALL": {
2281
- "Average Score": 16.083251992757752,
2282
- "Standard Deviation": 0.7340624884005772,
2283
- "Rank": 46
2284
- },
2285
- "Geometry": {
2286
- "Average Score": 0.183974034,
2287
- "Standard Deviation": 0.0215548886,
2288
- "Rank": 43
2289
- },
2290
- "Algebra": {
2291
- "Average Score": 0.13422252,
2292
- "Standard Deviation": 0.01922819511,
2293
- "Rank": 44
2294
- },
2295
- "Probability": {
2296
- "Average Score": 9.992136776217318,
2297
- "Standard Deviation": null,
2298
- "Rank": 52
2299
- },
2300
- "Logical": {
2301
- "Average Score": 9.537233946101678,
2302
- "Standard Deviation": 0.7567112693269967,
2303
- "Rank": 53
2304
- },
2305
- "Social": {
2306
- "Average Score": 0.167796727,
2307
- "Standard Deviation": 0.01666541942,
2308
- "Rank": 42
2309
- },
2310
- "Chemistry": {
2311
- "Average Score": 20.11834233400297,
2312
- "Standard Deviation": null,
2313
- "Rank": 46
2314
- },
2315
- "CPP": {
2316
- "Average Score": 20.724691953843916,
2317
- "Standard Deviation": null,
2318
- "Rank": 41
2319
- }
2320
- }
2321
- },
2322
- {
2323
- "config": {
2324
- "model_name": "llama2-7b-chat",
2325
- "organization": "Meta",
2326
- "license": "Llama 2 Community",
2327
- "knowledge_cutoff": "2023/07"
2328
- },
2329
- "results": {
2330
- "OVERALL": {
2331
- "Average Score": 17.319161859655946,
2332
- "Standard Deviation": 0.495520710612214,
2333
- "Rank": 45
2334
- },
2335
- "Geometry": {
2336
- "Average Score": 0.087067276,
2337
- "Standard Deviation": 0.04274343402,
2338
- "Rank": 46
2339
- },
2340
- "Algebra": {
2341
- "Average Score": 0.12308805,
2342
- "Standard Deviation": 0.01856053622,
2343
- "Rank": 45
2344
- },
2345
- "Probability": {
2346
- "Average Score": 8.860911732515305,
2347
- "Standard Deviation": null,
2348
- "Rank": 54
2349
- },
2350
- "Logical": {
2351
- "Average Score": 18.812132126028335,
2352
- "Standard Deviation": 3.0846832107977433,
2353
- "Rank": 47
2354
- },
2355
- "Social": {
2356
- "Average Score": 0.152905272,
2357
- "Standard Deviation": 0.007166957097,
2358
- "Rank": 43
2359
- },
2360
- "Chemistry": {
2361
- "Average Score": 15.270334671133512,
2362
- "Standard Deviation": null,
2363
- "Rank": 50
2364
- },
2365
- "CPP": {
2366
- "Average Score": 15.730513733660898,
2367
- "Standard Deviation": null,
2368
- "Rank": 45
2369
- }
2370
- }
2371
- },
2372
- {
2373
- "config": {
2374
- "model_name": "gemma-2b-it",
2375
- "organization": "Google",
2376
- "license": "Gemma License",
2377
- "knowledge_cutoff": "2024/02"
2378
- },
2379
- "results": {
2380
- "OVERALL": {
2381
- "Average Score": 15.029602991101632,
2382
- "Standard Deviation": 0.4529017602377039,
2383
- "Rank": 47
2384
- },
2385
- "Geometry": {
2386
- "Average Score": 0.198571153,
2387
- "Standard Deviation": 0.01699161031,
2388
- "Rank": 42
2389
- },
2390
- "Algebra": {
2391
- "Average Score": 0.109883009,
2392
- "Standard Deviation": 0.01520005833,
2393
- "Rank": 46
2394
- },
2395
- "Probability": {
2396
- "Average Score": 6.561360414966015,
2397
- "Standard Deviation": null,
2398
- "Rank": 56
2399
- },
2400
- "Logical": {
2401
- "Average Score": 3.9858662356708785,
2402
- "Standard Deviation": 0.5609499073366407,
2403
- "Rank": 56
2404
- },
2405
- "Social": {
2406
- "Average Score": 0.087452913,
2407
- "Standard Deviation": 0.008170146562,
2408
- "Rank": 50
2409
- },
2410
- "Chemistry": {
2411
- "Average Score": 16.766144078336097,
2412
- "Standard Deviation": null,
2413
- "Rank": 49
2414
- },
2415
- "CPP": {
2416
- "Average Score": 17.2715657115764,
2417
- "Standard Deviation": null,
2418
- "Rank": 44
2419
- }
2420
- }
2421
- },
2422
- {
2423
- "config": {
2424
- "model_name": "llama2-13b-chat",
2425
- "organization": "Meta",
2426
- "license": "Llama 2 Community",
2427
- "knowledge_cutoff": "2023/07"
2428
- },
2429
- "results": {
2430
- "OVERALL": {
2431
- "Average Score": 17.47902371074294,
2432
- "Standard Deviation": 0.4047581815962028,
2433
- "Rank": 44
2434
- },
2435
- "Geometry": {
2436
- "Average Score": 0.072729954,
2437
- "Standard Deviation": 0.02315988261,
2438
- "Rank": 48
2439
- },
2440
- "Algebra": {
2441
- "Average Score": 0.080371692,
2442
- "Standard Deviation": 0.01277569453,
2443
- "Rank": 47
2444
- },
2445
- "Probability": {
2446
- "Average Score": 12.738302754764042,
2447
- "Standard Deviation": null,
2448
- "Rank": 51
2449
- },
2450
- "Logical": {
2451
- "Average Score": 21.708359515217182,
2452
- "Standard Deviation": 1.4862481594434973,
2453
- "Rank": 45
2454
- },
2455
- "Social": {
2456
- "Average Score": 0.149125922,
2457
- "Standard Deviation": 0.01157416827,
2458
- "Rank": 44
2459
- },
2460
- "Chemistry": {
2461
- "Average Score": 12.786967781868814,
2462
- "Standard Deviation": null,
2463
- "Rank": 53
2464
- },
2465
- "CPP": {
2466
- "Average Score": 13.17258252933903,
2467
- "Standard Deviation": null,
2468
- "Rank": 48
2469
- }
2470
- }
2471
- },
2472
- {
2473
- "config": {
2474
- "model_name": "vicuna-7b",
2475
- "organization": "LMSYS",
2476
- "license": "Non-commercial",
2477
- "knowledge_cutoff": "2023/07"
2478
- },
2479
- "results": {
2480
- "OVERALL": {
2481
- "Average Score": 13.31896682669754,
2482
- "Standard Deviation": 0.30441157156016124,
2483
- "Rank": 49
2484
- },
2485
- "Geometry": {
2486
- "Average Score": 0.083457058,
2487
- "Standard Deviation": 0.02520989111,
2488
- "Rank": 47
2489
- },
2490
- "Algebra": {
2491
- "Average Score": 0.070883882,
2492
- "Standard Deviation": 0.007315853253,
2493
- "Rank": 48
2494
- },
2495
- "Probability": {
2496
- "Average Score": 8.255246380068842,
2497
- "Standard Deviation": null,
2498
- "Rank": 55
2499
- },
2500
- "Logical": {
2501
- "Average Score": 10.046676845257544,
2502
- "Standard Deviation": 0.6816182835206797,
2503
- "Rank": 52
2504
- },
2505
- "Social": {
2506
- "Average Score": 0.111076414,
2507
- "Standard Deviation": 0.004805626512,
2508
- "Rank": 48
2509
- },
2510
- "Chemistry": {
2511
- "Average Score": 13.838150481781991,
2512
- "Standard Deviation": null,
2513
- "Rank": 51
2514
- },
2515
- "CPP": {
2516
- "Average Score": 14.255194156624162,
2517
- "Standard Deviation": null,
2518
- "Rank": 46
2519
- }
2520
- }
2521
- },
2522
- {
2523
- "config": {
2524
- "model_name": "koala-13b",
2525
- "organization": "UC Berkeley",
2526
- "license": "Non-commercial",
2527
- "knowledge_cutoff": "2023/04"
2528
- },
2529
- "results": {
2530
- "OVERALL": {
2531
- "Average Score": 6.419305623111718,
2532
- "Standard Deviation": 0.19611070515647736,
2533
- "Rank": 53
2534
- },
2535
- "Geometry": {
2536
- "Average Score": 0.017374001,
2537
- "Standard Deviation": 0.01747053557,
2538
- "Rank": 49
2539
- },
2540
- "Algebra": {
2541
- "Average Score": 0.018129197,
2542
- "Standard Deviation": 0.01054371383,
2543
- "Rank": 49
2544
- },
2545
- "Probability": {
2546
- "Average Score": 4.1717283559090035,
2547
- "Standard Deviation": null,
2548
- "Rank": 57
2549
- },
2550
- "Logical": {
2551
- "Average Score": 7.484701131693112,
2552
- "Standard Deviation": 0.172417770163525,
2553
- "Rank": 54
2554
- },
2555
- "Social": {
2556
- "Average Score": 0.096983835,
2557
- "Standard Deviation": 0.007847059783,
2558
- "Rank": 49
2559
- },
2560
- "Chemistry": {
2561
- "Average Score": 6.177985738164252,
2562
- "Standard Deviation": null,
2563
- "Rank": 54
2564
- },
2565
- "CPP": {
2566
- "Average Score": 6.36433272373514,
2567
- "Standard Deviation": null,
2568
- "Rank": 49
2569
- }
2570
- }
2571
- },
2572
- {
2573
- "config": {
2574
- "model_name": "openassistant-pythia-12b",
2575
- "organization": "OpenAssistant",
2576
- "license": "Non-commercial",
2577
- "knowledge_cutoff": "2023/04"
2578
- },
2579
- "results": {
2580
- "OVERALL": {
2581
- "Average Score": 0.0,
2582
- "Standard Deviation": 0.0,
2583
- "Rank": 54
2584
- },
2585
- "Geometry": {
2586
- "Average Score": 0.0,
2587
- "Standard Deviation": 0.0,
2588
- "Rank": 50
2589
- },
2590
- "Algebra": {
2591
- "Average Score": 0.0,
2592
- "Standard Deviation": 0.0,
2593
- "Rank": 50
2594
- },
2595
- "Probability": {
2596
- "Average Score": 0.0,
2597
- "Standard Deviation": null,
2598
- "Rank": 58
2599
- },
2600
- "Logical": {
2601
- "Average Score": 0.0,
2602
- "Standard Deviation": 0.0,
2603
- "Rank": 57
2604
- },
2605
- "Social": {
2606
- "Average Score": 0.030792528,
2607
- "Standard Deviation": 0.007518796391,
2608
- "Rank": 51
2609
- },
2610
- "Chemistry": {
2611
- "Average Score": 0.0,
2612
- "Standard Deviation": null,
2613
- "Rank": 55
2614
- },
2615
- "CPP": {
2616
- "Average Score": 0.0,
2617
- "Standard Deviation": null,
2618
- "Rank": 50
2619
- }
2620
- }
2621
- },
2622
- {
2623
- "config": {
2624
- "model_name": "nemotron-70b",
2625
- "organization": "NVIDIA",
2626
- "license": "Unknown",
2627
- "knowledge_cutoff": "Unknown"
2628
- },
2629
- "results": {
2630
- "OVERALL": {
2631
- "Average Score": 100.0,
2632
- "Standard Deviation": 0.0,
2633
- "Rank": 1
2634
- },
2635
- "Chemistry": {
2636
- "Average Score": 96.00601450276388,
2637
- "Standard Deviation": null,
2638
- "Rank": 3
2639
- },
2640
- "Logical": {
2641
- "Average Score": 98.08807085219765,
2642
- "Standard Deviation": 0.832489959144682,
2643
- "Rank": 5
2644
- },
2645
- "Probability": {
2646
- "Average Score": 91.16755514126538,
2647
- "Standard Deviation": null,
2648
- "Rank": 4
2649
- }
2650
- }
2651
- },
2652
- {
2653
- "config": {
2654
- "model_name": "llama-3.2-3b-it",
2655
- "organization": "Meta",
2656
- "license": "Llama 3 Community",
2657
- "knowledge_cutoff": "Unknown"
2658
- },
2659
- "results": {
2660
- "OVERALL": {
2661
- "Average Score": 30.40742747938681,
2662
- "Standard Deviation": 1.6816556668351852,
2663
- "Rank": 30
2664
- },
2665
- "Chemistry": {
2666
- "Average Score": 27.43049468475638,
2667
- "Standard Deviation": null,
2668
- "Rank": 42
2669
- },
2670
- "Logical": {
2671
- "Average Score": 41.58905844173492,
2672
- "Standard Deviation": 5.2798221527591,
2673
- "Rank": 29
2674
- },
2675
- "Probability": {
2676
- "Average Score": 62.02868227997844,
2677
- "Standard Deviation": null,
2678
- "Rank": 18
2679
- }
2680
- }
2681
- },
2682
- {
2683
- "config": {
2684
- "model_name": "yi-lightning",
2685
- "organization": "01 AI",
2686
- "license": "Proprietary",
2687
- "knowledge_cutoff": "Unknown"
2688
- },
2689
- "results": {
2690
- "Chemistry": {
2691
- "Average Score": 100.0,
2692
- "Standard Deviation": null,
2693
- "Rank": 1
2694
- },
2695
- "Logical": {
2696
- "Average Score": 98.816765663456,
2697
- "Standard Deviation": 0.3271335810663529,
2698
- "Rank": 3
2699
- },
2700
- "Probability": {
2701
- "Average Score": 95.8842044402052,
2702
- "Standard Deviation": null,
2703
- "Rank": 2
2704
- }
2705
- }
2706
- },
2707
- {
2708
- "config": {
2709
- "model_name": "glm-4-plus",
2710
- "organization": "Zhipu AI",
2711
- "license": "Proprietary",
2712
- "knowledge_cutoff": "Unknown"
2713
- },
2714
- "results": {
2715
- "Chemistry": {
2716
- "Average Score": 99.05822908668402,
2717
- "Standard Deviation": null,
2718
- "Rank": 2
2719
- },
2720
- "Logical": {
2721
- "Average Score": 99.45307787995229,
2722
- "Standard Deviation": 0.5982476107949444,
2723
- "Rank": 1
2724
- },
2725
- "Probability": {
2726
- "Average Score": 92.04426702796823,
2727
- "Standard Deviation": null,
2728
- "Rank": 3
2729
- }
2730
- }
2731
- }
2732
- ]
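
The removed file above is a JSON array of per-model records, each with a "config" block (model_name, organization, license, knowledge_cutoff) and a "results" block keyed by category (OVERALL, Geometry, Algebra, Probability, Logical, Social, Chemistry, CPP). Each category carries an Average Score, a Standard Deviation (often null), and a Rank (occasionally the string "N/A"). The sketch below is only an illustration, not the repository's own loading code; the function name, file path, and column names are assumptions. It shows how a file in this schema could be flattened into one leaderboard row per model.

import json

import pandas as pd


def load_results_df(path: str) -> pd.DataFrame:
    """Flatten the per-model results JSON into a table with one row per model."""
    with open(path) as f:
        records = json.load(f)

    rows = []
    for rec in records:
        # "config" carries model_name, organization, license, knowledge_cutoff.
        row = dict(rec["config"])
        for category, stats in rec["results"].items():
            score = stats.get("Average Score")
            # Scores may be floats, null, or the literal string "N/A" in this schema.
            row[f"{category} Score"] = None if score in (None, "N/A") else float(score)
            rank = stats.get("Rank")
            row[f"{category} Rank"] = None if rank in (None, "N/A") else int(rank)
        rows.append(row)

    # Models missing a category (e.g. records with no OVERALL entry) simply get NaN there.
    return pd.DataFrame(rows)


# Hypothetical usage; the real files live under ./src/results/ in this repository.
if __name__ == "__main__":
    df = load_results_df("./src/results/models_example.json")
    print(df.sort_values("OVERALL Rank").head())

Note that in this file the Geometry, Algebra, and Social scores are on a 0 to 1 scale while the other categories are on a 0 to 100 scale, so a consumer of the table would likely rescale them before displaying the columns side by side.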
src/results/{models_2024-10-20-23:34:57.242641.json → models_2024-10-24-08:08:59.127307.json} RENAMED
The diff for this file is too large to render.