wuqing157 committed
Commit 3d0b0cf
Parent: 5a867a4
app.py CHANGED
@@ -102,9 +102,7 @@ def init_leaderboard(dataframe):
     # model_result_path = "./src/results/models_2024-10-07-14:50:12.666068.jsonl"
     # model_result_path = "./src/results/models_2024-10-08-03:10:26.811832.jsonl"
     # model_result_path = "./src/results/models_2024-10-08-03:25:44.801310.jsonl"
-    # model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
-    model_result_path = "./src/results/models_2024-10-09-05:17:38.810960.json"
-    # model_result_path = "./src/results/models_2024-10-09-06:22:21.122422.json"
+    model_result_path = "./src/results/models_2024-10-08-17:39:21.001582.jsonl"
     # model_leaderboard_df = get_model_leaderboard_df(model_result_path)


@@ -138,7 +136,7 @@ with demo:
        with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):

            DESCRIPTION_TEXT = """
-           Total #models: 53 (Last updated: 2024-10-08)
+           Total #models: 52 (Last updated: 2024-10-08)

            This page provides a comprehensive overview of model ranks across various dimensions. Models are sorted by their average rank across all dimensions.
            (Some missing values are due to slow or problematic model responses; we will update the leaderboard once we have the complete results.)
@@ -157,7 +155,6 @@ with demo:
                        AutoEvalColumn.rank_math_probability.name,
                        AutoEvalColumn.rank_reason_logical.name,
                        AutoEvalColumn.rank_reason_social.name,
-                       AutoEvalColumn.rank_chemistry.name,
                    ],
                    rank_col=[],
                )
@@ -275,7 +272,6 @@ with demo:
            [SocialIQA](https://arxiv.org/abs/1904.09728),
            [NormBank](https://arxiv.org/abs/2305.17008), covering challenging social reasoning tasks,
            such as social commonsense reasoning, social normative reasoning, Theory of Mind (ToM) reasoning, etc.
-           More fine-grained types of reasoning, such as symbolic, analogical, counterfactual reasoning, are planned to be added in the future.

            """
            gr.Markdown(DESCRIPTION_TEXT, elem_classes="markdown-text")
@@ -316,10 +312,9 @@ with demo:

        with gr.TabItem("🔬 Science", elem_id="science-table", id=4):
            CURRENT_TEXT = """
-           Scientific tasks are crucial for evaluating LLMs, requiring both domain-specific knowledge and reasoning capabilities.
-
-           We are adding several fine-grained scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
-           We have diversely and aggressively collected recent scientific datasets, including but not limited to
+           # Coming soon!
+           We are working on adding more tasks on scientific domains to the leaderboard. The forthcoming ones are biology, chemistry, and physics.
+           We have diversely and aggressively collected recent science datasets, including but not limited to
            [GPQA](https://arxiv.org/abs/2311.12022),
            [JEEBench](https://aclanthology.org/2023.emnlp-main.468/),
            [MMLU-Pro](https://arxiv.org/abs/2406.01574),
@@ -328,41 +323,12 @@ with demo:
            [SciEval](https://arxiv.org/abs/2308.13149).
            """
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
-           with gr.TabItem("🧪 Chemistry", elem_id="chemistry_subtab", id=0, elem_classes="subtab"):
-               leaderboard = overall_leaderboard(
-                   get_model_leaderboard_df(
-                       model_result_path,
-                       benchmark_cols=[
-                           AutoEvalColumn.rank_chemistry.name,
-                           AutoEvalColumn.model.name,
-                           AutoEvalColumn.score_chemistry.name,
-                           # AutoEvalColumn.sd_reason_social.name,
-                           AutoEvalColumn.license.name,
-                           AutoEvalColumn.organization.name,
-                           AutoEvalColumn.knowledge_cutoff.name,
-                       ],
-                       rank_col=[AutoEvalColumn.rank_chemistry.name],
-                   )
-               )
-
-           with gr.TabItem("⚛️ Physics", elem_id="physics_subtab", id=1, elem_classes="subtab"):
-               CURRENT_TEXT = """
-               # Coming soon!
-               """
-               gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
-
-           with gr.TabItem("🧬 Biology", elem_id="biology_subtab", id=2, elem_classes="subtab"):
-               CURRENT_TEXT = """
-               # Coming soon!
-               """
-               gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")


        with gr.TabItem("</> Coding", elem_id="coding-table", id=5):
            CURRENT_TEXT = """
-           We are working on adding more fine-grained tasks in coding domains to the leaderboard.
+           # Coming soon!
+           We are working on adding more tasks in coding domains to the leaderboard.
            The forthcoming ones focus on Python, Java, and C++, with plans to expand to more languages.
            We collect a variety of recent coding datasets, including
            [HumanEval](https://huggingface.co/datasets/openai/openai_humaneval),
@@ -373,24 +339,6 @@ with demo:
            Our efforts also include synthesizing new code-related queries to ensure diversity!
            """
            gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
-           with gr.TabItem("🐍 Python", elem_id="python_subtab", id=0, elem_classes="subtab"):
-               CURRENT_TEXT = """
-               # Coming soon!
-               """
-               gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
-           with gr.TabItem("☕ Java", elem_id="java_subtab", id=1, elem_classes="subtab"):
-               CURRENT_TEXT = """
-               # Coming soon!
-               """
-               gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")
-
-           with gr.TabItem("➕ C++", elem_id="cpp_subtab", id=2, elem_classes="subtab"):
-               CURRENT_TEXT = """
-               # Coming soon!
-               """
-               gr.Markdown(CURRENT_TEXT, elem_classes="markdown-text")



@@ -406,7 +354,7 @@ with demo:

    ## Team members
    Yanbin Yin, [Zhen Wang](https://zhenwang9102.github.io/), [Kun Zhou](https://lancelot39.github.io/), Xiangdong Zhang,
-   [Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), [Jieyuan Liu](https://www.linkedin.com/in/jieyuan-liu/), [Somanshu Singla](https://www.linkedin.com/in/somanshu-singla-105636214/), [Tianyang Liu](https://leolty.github.io/),
+   [Shibo Hao](https://ber666.github.io/), [Yi Gu](https://www.yigu.page/), Jieyuan Liu, Somanshu Singla, [Tianyang Liu](https://leolty.github.io/),
    [Eric P. Xing](https://www.cs.cmu.edu/~epxing/), [Zhengzhong Liu](https://hunterhector.github.io/), [Haojian Jin](https://www.haojianj.in/),
    [Zhiting Hu](https://zhiting.ucsd.edu/)
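The path switch above reverts to the .jsonl results file; both formats hold the same records, namely a "config" block plus per-dimension "results" entries, as visible in the deleted JSON file at the bottom of this commit. A minimal loading sketch, assuming the .jsonl variant stores one record per line; `load_results` is a hypothetical helper, and the repo's actual reader lives in src/leaderboard/read_evals.py:

```python
import json
import pandas as pd

def load_results(path: str) -> pd.DataFrame:
    # Hypothetical helper: flatten results records into a leaderboard frame.
    # Record schema mirrors the deleted results file: {"config": {...},
    # "results": {dimension: {"Average Score", "Standard Deviation", "Rank"}}}.
    if path.endswith(".jsonl"):
        with open(path) as f:
            records = [json.loads(line) for line in f if line.strip()]
    else:  # .json: a single array of records
        with open(path) as f:
            records = json.load(f)
    rows = []
    for rec in records:
        row = {
            "Model": rec["config"]["model_name"],
            "Organization": rec["config"]["organization"],
            "License": rec["config"]["license"],
            "Knowledge cutoff": rec["config"]["knowledge_cutoff"],
        }
        for dim, vals in rec["results"].items():
            row[f"Rank ({dim})"] = vals.get("Rank")
        rows.append(row)
    return pd.DataFrame(rows)
```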
requirements.txt CHANGED
@@ -2,7 +2,7 @@ APScheduler
 black
 datasets
 gradio
-# gradio[oauth]
+gradio[oauth]
 gradio_leaderboard==0.0.9
 gradio_client
 huggingface-hub>=0.18.0
src/display/utils.py CHANGED
@@ -63,7 +63,7 @@ auto_eval_column_dict.append(["score", ColumnContent, field(default_factory=lamb
 auto_eval_column_dict.append(["score_sd", ColumnContent, field(default_factory=lambda: ColumnContent("Score SD", "number", True))])
 auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])

-# fine-grained dimensions
+# fine-graine dimensions
 auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])
 auto_eval_column_dict.append(["score_math_algebra", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Algebra)", "number", True))])
 auto_eval_column_dict.append(["score_math_geometry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Math Geometry)", "number", True))])
@@ -85,9 +85,6 @@ auto_eval_column_dict.append(["rank_math_probability", ColumnContent, field(defa
 auto_eval_column_dict.append(["rank_reason_logical", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Logical Reasoning)", "number", True))])
 auto_eval_column_dict.append(["rank_reason_social", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Social Reasoning)", "number", True))])

-auto_eval_column_dict.append(["score_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Chemistry)", "number", True))])
-auto_eval_column_dict.append(["sd_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Std dev (Chemistry)", "number", True))])
-auto_eval_column_dict.append(["rank_chemistry", ColumnContent, field(default_factory=lambda: ColumnContent("Rank (Chemistry)", "number", True))])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
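For context on how `auto_eval_column_dict` is consumed: in leaderboard Spaces derived from the Hugging Face template, the `[name, type, field(...)]` triples are handed to `dataclasses.make_dataclass` to build a frozen namespace of column metadata. A minimal sketch, assuming a `ColumnContent` definition like the one below (the real definition is not shown in this diff):

```python
from dataclasses import dataclass, field, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                    # column header shown in the UI
    type: str                    # gradio dataframe dtype, e.g. "number" or "markdown"
    displayed_by_default: bool

auto_eval_column_dict = []
auto_eval_column_dict.append(["rank", ColumnContent, field(default_factory=lambda: ColumnContent("Rank", "number", True))])
auto_eval_column_dict.append(["score_overall", ColumnContent, field(default_factory=lambda: ColumnContent("Score (Overall)", "number", True))])

# make_dataclass builds the class from the triples; instantiating it once
# materializes every default_factory, giving a namespace of ColumnContent values.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)()

print(AutoEvalColumn.score_overall.name)  # -> "Score (Overall)"
```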
src/leaderboard/read_evals.py CHANGED
@@ -115,7 +115,7 @@ class ModelResult:
                new_v[kk] = vv

        new_results[k] = new_v
-
+
    # Extract results available in this file (some results are split in several files)
    # results = {}
    # for domain in Domains:
@@ -185,10 +185,6 @@ class ModelResult:
            AutoEvalColumn.rank_reason_logical.name: self.results.get("Logical").get("Rank", None),
            AutoEvalColumn.rank_reason_social.name: self.results.get("Social").get("Rank", None),

-           AutoEvalColumn.score_chemistry.name: self.results.get("Chemistry").get("Average Score", None) if self.results.get("Chemistry") else None,
-           AutoEvalColumn.sd_chemistry.name: self.results.get("Chemistry").get("Standard Deviation", None) if self.results.get("Chemistry") else None,
-           AutoEvalColumn.rank_chemistry.name: self.results.get("Chemistry").get("Rank", None) if self.results.get("Chemistry") else None,
-
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.organization.name: self.org,
            AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
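Worth noting about the surviving lines: `self.results.get("Logical").get("Rank", None)` raises AttributeError when the domain key is missing, because `dict.get` returns None and None has no `.get` method; that is exactly why the removed Chemistry entries carried the `if self.results.get("Chemistry") else None` guard. A small sketch of the guarded pattern, where `safe_get` is a hypothetical helper, not part of the repo:

```python
def safe_get(results: dict, domain: str, key: str):
    # results.get(domain) is None for missing domains; guard before .get(key).
    block = results.get(domain)
    return block.get(key) if block else None

results = {"Logical": {"Rank": 3}}          # no "Chemistry" block
assert safe_get(results, "Logical", "Rank") == 3
assert safe_get(results, "Chemistry", "Rank") is None
```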
src/populate.py CHANGED
@@ -24,17 +24,11 @@ def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: lis
    if rank_col:  # if there is one col in rank_col, sort by that column and remove NaN values
        df = df.dropna(subset=benchmark_cols)
        df = df.sort_values(by=[rank_col[0]], ascending=True)
-       # print(rank_col)
-   else:
-       # when rank_col, the first in benchmark_cols is empty, sort by averaging all the benchmarks, except the first one
-       avg_rank = df.iloc[:, 1:].mean(axis=1)
-       df["Average Rank"] = avg_rank.round(decimals=4)
-       df = df.sort_values(by=["Average Rank"], ascending=True)
-       df["Average Rank"] = df["Average Rank"].map('{:.4f}'.format)
-
-   # we'll skip NaN, instrad of deleting the whole row
+   else:  # when rank_col is empty, sort by averaging all the benchmarks, except the first one
+       avg_rank = df.iloc[:, 1:].mean(axis=1)  # we'll skip NaN, instead of deleting the whole row
+       df["Average Rank"] = avg_rank
+       df = df.sort_values(by=["Average Rank"], ascending=True)
    df = df.fillna('--')
-   # insert a rank column
    rank = np.arange(1, len(df)+1)
    df.insert(0, 'Rank', rank)

@@ -43,10 +37,7 @@
        # print(col)
        # if 'Std dev' in col or 'Score' in col:
        if 'Std dev' in col or 'Score' in col:
-           if "Chemistry" in col:
-               df[col] = (df[col]).map('{:.2f}'.format)
-           else:
-               df[col] = (df[col]*100).map('{:.2f}'.format)
+           df[col] = (df[col]*100).map('{:.2f}'.format)
            # df[col] = df[col].round(decimals=2)

        # df = df.sort_values(by=[AutoEvalColumn.score.name], ascending=True)
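The surviving `else` branch is the path the Overview tab exercises (`rank_col=[]`): average the per-dimension rank columns, sort ascending, and keep rows with missing values rather than dropping them. A self-contained sketch of that flow on a toy frame; the column names are illustrative:

```python
import numpy as np
import pandas as pd

# Toy frame in the shape get_model_leaderboard_df works on: first column is
# the model, the rest are per-dimension ranks (NaN = missing result).
df = pd.DataFrame({
    "Model": ["A", "B", "C"],
    "Rank (Algebra)": [1.0, 2.0, np.nan],
    "Rank (Geometry)": [2.0, 1.0, 3.0],
})

avg_rank = df.iloc[:, 1:].mean(axis=1)   # mean skips NaN, so C still gets a value
df["Average Rank"] = avg_rank
df = df.sort_values(by=["Average Rank"], ascending=True)

df = df.fillna('--')                     # keep incomplete rows, display '--'
df.insert(0, 'Rank', np.arange(1, len(df) + 1))  # final 1..N position column
print(df)
```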
src/results/models_2024-10-09-05:17:38.810960.json DELETED
@@ -1,2327 +0,0 @@
- [
-   {
-     "config": {
-       "model_name": "ChatGPT-4o-latest (2024-09-03)",
-       "organization": "OpenAI",
-       "license": "Proprietary",
-       "knowledge_cutoff": "2023/10"
-     },
-     "results": {
-       "OVERALL":     {"Average Score": 0.974329609, "Standard Deviation": 0.005024959031, "Rank": 2},
-       "Geometry":    {"Average Score": 0.976028578, "Standard Deviation": 0.01507912373,  "Rank": 3},
-       "Algebra":     {"Average Score": 0.951199453, "Standard Deviation": 0.08452452108,  "Rank": 3},
-       "Probability": {"Average Score": 0.842116641, "Standard Deviation": 0.006267759054, "Rank": 3},
-       "Logical":     {"Average Score": 0.828490728, "Standard Deviation": 0.009134213144, "Rank": 3},
-       "Social":      {"Average Score": 0.815902987, "Standard Deviation": 0.0196254222,   "Rank": 4},
-       "Chemistry":   {"Average Score": 100.0, "Standard Deviation": null, "Rank": 1}
-     }
-   },
-   ... (2,327 lines deleted in total: one record of this shape per evaluated model, continuing with gpt-4o-2024-08-06, gpt-4o-2024-05-13, gpt-4-turbo-2024-04-09, gemini-1.5-pro-001, qwen2-72b-instruct, and the rest of the roster; "Chemistry" blocks use a 0-100 score scale with a null standard deviation, unlike the 0-1 scale of the other dimensions; the extracted dump is truncated partway through the gemma-7b-it record)
- ]
1803
- "Rank": 39
1804
- },
1805
- "Algebra": {
1806
- "Average Score": 0.250614794,
1807
- "Standard Deviation": 0.01991678295,
1808
- "Rank": 40
1809
- },
1810
- "Probability": {
1811
- "Average Score": 0.174313053,
1812
- "Standard Deviation": 0.03765424728,
1813
- "Rank": 44
1814
- },
1815
- "Logical": {
1816
- "Average Score": 0.197505536,
1817
- "Standard Deviation": 0.02050298885,
1818
- "Rank": 39
1819
- },
1820
- "Social": {
1821
- "Average Score": 0.202138025,
1822
- "Standard Deviation": 0.02098346639,
1823
- "Rank": 41
1824
- },
1825
- "Chemistry": {
1826
- "Average Score": 28.014658234926813,
1827
- "Standard Deviation": null,
1828
- "Rank": 40
1829
- }
1830
- }
1831
- },
1832
- {
1833
- "config": {
1834
- "model_name": "mistral-7b-instruct-2",
1835
- "organization": "Mistral",
1836
- "license": "Apache 2.0",
1837
- "knowledge_cutoff": "2023/12"
1838
- },
1839
- "results": {
1840
- "OVERALL": {
1841
- "Average Score": 0.427513868,
1842
- "Standard Deviation": 0.05553921135,
1843
- "Rank": 27
1844
- },
1845
- "Geometry": {
1846
- "Average Score": 0.216402626,
1847
- "Standard Deviation": 0.03338414918,
1848
- "Rank": 40
1849
- },
1850
- "Algebra": {
1851
- "Average Score": 0.233777838,
1852
- "Standard Deviation": 0.0155226054,
1853
- "Rank": 42
1854
- },
1855
- "Probability": {
1856
- "Average Score": 0.25118175,
1857
- "Standard Deviation": 0.04065514593,
1858
- "Rank": 41
1859
- },
1860
- "Logical": {
1861
- "Average Score": 0.224469136,
1862
- "Standard Deviation": 0.03404706752,
1863
- "Rank": 37
1864
- },
1865
- "Social": {
1866
- "Average Score": 0.209386782,
1867
- "Standard Deviation": 0.02738569921,
1868
- "Rank": 40
1869
- },
1870
- "Chemistry": {
1871
- "Average Score": 31.382959631870822,
1872
- "Standard Deviation": null,
1873
- "Rank": 36
1874
- }
1875
- }
1876
- },
1877
- {
1878
- "config": {
1879
- "model_name": "mistral-7b-instruct-1",
1880
- "organization": "Mistral",
1881
- "license": "Apache 2.0",
1882
- "knowledge_cutoff": "2023/12"
1883
- },
1884
- "results": {
1885
- "OVERALL": {
1886
- "Average Score": 0.23016314,
1887
- "Standard Deviation": 0.07137625271,
1888
- "Rank": 46
1889
- },
1890
- "Geometry": {
1891
- "Average Score": 0.161799938,
1892
- "Standard Deviation": 0.03595278559,
1893
- "Rank": 46
1894
- },
1895
- "Algebra": {
1896
- "Average Score": 0.210341624,
1897
- "Standard Deviation": 0.01736539119,
1898
- "Rank": 43
1899
- },
1900
- "Probability": {
1901
- "Average Score": 0.238417922,
1902
- "Standard Deviation": 0.03744211933,
1903
- "Rank": 42
1904
- },
1905
- "Logical": {
1906
- "Average Score": 0.142636601,
1907
- "Standard Deviation": 0.02080406365,
1908
- "Rank": 46
1909
- },
1910
- "Social": {
1911
- "Average Score": 0.117646827,
1912
- "Standard Deviation": 0.009321202779,
1913
- "Rank": 49
1914
- },
1915
- "Chemistry": {
1916
- "Average Score": 18.929093202755805,
1917
- "Standard Deviation": null,
1918
- "Rank": 43
1919
- }
1920
- }
1921
- },
1922
- {
1923
- "config": {
1924
- "model_name": "vicuna-13b",
1925
- "organization": "LMSYS",
1926
- "license": "Non-commercial",
1927
- "knowledge_cutoff": "2023/07"
1928
- },
1929
- "results": {
1930
- "OVERALL": {
1931
- "Average Score": 0.201892849,
1932
- "Standard Deviation": 0.06021749802,
1933
- "Rank": 47
1934
- },
1935
- "Geometry": {
1936
- "Average Score": 0.200941928,
1937
- "Standard Deviation": 0.03366817781,
1938
- "Rank": 43
1939
- },
1940
- "Algebra": {
1941
- "Average Score": 0.196123323,
1942
- "Standard Deviation": 0.0135715643,
1943
- "Rank": 44
1944
- },
1945
- "Probability": {
1946
- "Average Score": 0.141214079,
1947
- "Standard Deviation": 0.02721328211,
1948
- "Rank": 46
1949
- },
1950
- "Logical": {
1951
- "Average Score": 0.148598631,
1952
- "Standard Deviation": 0.02241523892,
1953
- "Rank": 44
1954
- },
1955
- "Social": {
1956
- "Average Score": 0.124655135,
1957
- "Standard Deviation": 0.01122382671,
1958
- "Rank": 48
1959
- },
1960
- "Chemistry": {
1961
- "Average Score": 21.840013221590294,
1962
- "Standard Deviation": null,
1963
- "Rank": 41
1964
- }
1965
- }
1966
- },
1967
- {
1968
- "config": {
1969
- "model_name": "zephyr-7b-beta",
1970
- "organization": "HuggingFace",
1971
- "license": "MIT",
1972
- "knowledge_cutoff": "2023/10"
1973
- },
1974
- "results": {
1975
- "OVERALL": {
1976
- "Average Score": 0.102705119,
1977
- "Standard Deviation": 0.03683757312,
1978
- "Rank": 50
1979
- },
1980
- "Geometry": {
1981
- "Average Score": 0.114005544,
1982
- "Standard Deviation": 0.03144354365,
1983
- "Rank": 47
1984
- },
1985
- "Algebra": {
1986
- "Average Score": 0.141766633,
1987
- "Standard Deviation": 0.03179520129,
1988
- "Rank": 45
1989
- },
1990
- "Probability": {
1991
- "Average Score": 0.089050714,
1992
- "Standard Deviation": 0.002136754266,
1993
- "Rank": 49
1994
- },
1995
- "Logical": {
1996
- "Average Score": 0.069520789,
1997
- "Standard Deviation": 0.004477840857,
1998
- "Rank": 51
1999
- },
2000
- "Social": {
2001
- "Average Score": 0.0,
2002
- "Standard Deviation": 0.0,
2003
- "Rank": 54
2004
- },
2005
- "Chemistry": {
2006
- "Average Score": 18.92902220864132,
2007
- "Standard Deviation": null,
2008
- "Rank": 44
2009
- }
2010
- }
2011
- },
2012
- {
2013
- "config": {
2014
- "model_name": "gemma-1.1-2b-it",
2015
- "organization": "Google",
2016
- "license": "Gemma License",
2017
- "knowledge_cutoff": "2024/02"
2018
- },
2019
- "results": {
2020
- "OVERALL": {
2021
- "Average Score": 0.257700845,
2022
- "Standard Deviation": 0.07369021445,
2023
- "Rank": 44
2024
- },
2025
- "Geometry": {
2026
- "Average Score": 0.183974034,
2027
- "Standard Deviation": 0.0215548886,
2028
- "Rank": 45
2029
- },
2030
- "Algebra": {
2031
- "Average Score": 0.13422252,
2032
- "Standard Deviation": 0.01922819511,
2033
- "Rank": 46
2034
- },
2035
- "Probability": {
2036
- "Average Score": 0.095628657,
2037
- "Standard Deviation": 0.007536076456,
2038
- "Rank": 48
2039
- },
2040
- "Logical": {
2041
- "Average Score": 0.094965074,
2042
- "Standard Deviation": 0.005019175487,
2043
- "Rank": 49
2044
- },
2045
- "Social": {
2046
- "Average Score": 0.167796727,
2047
- "Standard Deviation": 0.01666541942,
2048
- "Rank": 44
2049
- },
2050
- "Chemistry": {
2051
- "Average Score": 20.724691953843916,
2052
- "Standard Deviation": null,
2053
- "Rank": 42
2054
- }
2055
- }
2056
- },
2057
- {
2058
- "config": {
2059
- "model_name": "llama2-7b-chat",
2060
- "organization": "Meta",
2061
- "license": "Llama 2 Community",
2062
- "knowledge_cutoff": "2023/07"
2063
- },
2064
- "results": {
2065
- "OVERALL": {
2066
- "Average Score": 0.260189428,
2067
- "Standard Deviation": 0.08019299364,
2068
- "Rank": 43
2069
- },
2070
- "Geometry": {
2071
- "Average Score": 0.087067276,
2072
- "Standard Deviation": 0.04274343402,
2073
- "Rank": 48
2074
- },
2075
- "Algebra": {
2076
- "Average Score": 0.12308805,
2077
- "Standard Deviation": 0.01856053622,
2078
- "Rank": 47
2079
- },
2080
- "Probability": {
2081
- "Average Score": 0.087515438,
2082
- "Standard Deviation": 0.006315053573,
2083
- "Rank": 50
2084
- },
2085
- "Logical": {
2086
- "Average Score": 0.17312827,
2087
- "Standard Deviation": 0.01867044092,
2088
- "Rank": 43
2089
- },
2090
- "Social": {
2091
- "Average Score": 0.152905272,
2092
- "Standard Deviation": 0.007166957097,
2093
- "Rank": 45
2094
- },
2095
- "Chemistry": {
2096
- "Average Score": 15.730513733660898,
2097
- "Standard Deviation": null,
2098
- "Rank": 46
2099
- }
2100
- }
2101
- },
2102
- {
2103
- "config": {
2104
- "model_name": "gemma-2b-it",
2105
- "organization": "Google",
2106
- "license": "Gemma License",
2107
- "knowledge_cutoff": "2024/02"
2108
- },
2109
- "results": {
2110
- "OVERALL": {
2111
- "Average Score": 0.234172069,
2112
- "Standard Deviation": 0.06522685718,
2113
- "Rank": 45
2114
- },
2115
- "Geometry": {
2116
- "Average Score": 0.198571153,
2117
- "Standard Deviation": 0.01699161031,
2118
- "Rank": 44
2119
- },
2120
- "Algebra": {
2121
- "Average Score": 0.109883009,
2122
- "Standard Deviation": 0.01520005833,
2123
- "Rank": 48
2124
- },
2125
- "Probability": {
2126
- "Average Score": 0.06467432,
2127
- "Standard Deviation": 0.002117497231,
2128
- "Rank": 52
2129
- },
2130
- "Logical": {
2131
- "Average Score": 0.039624492,
2132
- "Standard Deviation": 0.007606972686,
2133
- "Rank": 52
2134
- },
2135
- "Social": {
2136
- "Average Score": 0.087452913,
2137
- "Standard Deviation": 0.008170146562,
2138
- "Rank": 52
2139
- },
2140
- "Chemistry": {
2141
- "Average Score": 17.2715657115764,
2142
- "Standard Deviation": null,
2143
- "Rank": 45
2144
- }
2145
- }
2146
- },
2147
- {
2148
- "config": {
2149
- "model_name": "llama2-13b-chat",
2150
- "organization": "Meta",
2151
- "license": "Llama 2 Community",
2152
- "knowledge_cutoff": "2023/07"
2153
- },
2154
- "results": {
2155
- "OVERALL": {
2156
- "Average Score": 0.263305684,
2157
- "Standard Deviation": 0.07283640689,
2158
- "Rank": 42
2159
- },
2160
- "Geometry": {
2161
- "Average Score": 0.072729954,
2162
- "Standard Deviation": 0.02315988261,
2163
- "Rank": 50
2164
- },
2165
- "Algebra": {
2166
- "Average Score": 0.080371692,
2167
- "Standard Deviation": 0.01277569453,
2168
- "Rank": 49
2169
- },
2170
- "Probability": {
2171
- "Average Score": 0.117757344,
2172
- "Standard Deviation": 0.02418619619,
2173
- "Rank": 47
2174
- },
2175
- "Logical": {
2176
- "Average Score": 0.193149889,
2177
- "Standard Deviation": 0.01776690764,
2178
- "Rank": 40
2179
- },
2180
- "Social": {
2181
- "Average Score": 0.149125922,
2182
- "Standard Deviation": 0.01157416827,
2183
- "Rank": 46
2184
- },
2185
- "Chemistry": {
2186
- "Average Score": 13.17258252933903,
2187
- "Standard Deviation": null,
2188
- "Rank": 49
2189
- }
2190
- }
2191
- },
2192
- {
2193
- "config": {
2194
- "model_name": "vicuna-7b",
2195
- "organization": "LMSYS",
2196
- "license": "Non-commercial",
2197
- "knowledge_cutoff": "2023/07"
2198
- },
2199
- "results": {
2200
- "OVERALL": {
2201
- "Average Score": 0.198839786,
2202
- "Standard Deviation": 0.05725381576,
2203
- "Rank": 48
2204
- },
2205
- "Geometry": {
2206
- "Average Score": 0.083457058,
2207
- "Standard Deviation": 0.02520989111,
2208
- "Rank": 49
2209
- },
2210
- "Algebra": {
2211
- "Average Score": 0.070883882,
2212
- "Standard Deviation": 0.007315853253,
2213
- "Rank": 50
2214
- },
2215
- "Probability": {
2216
- "Average Score": 0.080987673,
2217
- "Standard Deviation": 0.005474288861,
2218
- "Rank": 51
2219
- },
2220
- "Logical": {
2221
- "Average Score": 0.100065588,
2222
- "Standard Deviation": 0.003561886452,
2223
- "Rank": 48
2224
- },
2225
- "Social": {
2226
- "Average Score": 0.111076414,
2227
- "Standard Deviation": 0.004805626512,
2228
- "Rank": 50
2229
- },
2230
- "Chemistry": {
2231
- "Average Score": 14.255194156624162,
2232
- "Standard Deviation": null,
2233
- "Rank": 47
2234
- }
2235
- }
2236
- },
2237
- {
2238
- "config": {
2239
- "model_name": "koala-13b",
2240
- "organization": "UC Berkeley",
2241
- "license": "Non-commercial",
2242
- "knowledge_cutoff": "2023/04"
2243
- },
2244
- "results": {
2245
- "OVERALL": {
2246
- "Average Score": 0.09387188,
2247
- "Standard Deviation": 0.02642167489,
2248
- "Rank": 51
2249
- },
2250
- "Geometry": {
2251
- "Average Score": 0.017374001,
2252
- "Standard Deviation": 0.01747053557,
2253
- "Rank": 51
2254
- },
2255
- "Algebra": {
2256
- "Average Score": 0.018129197,
2257
- "Standard Deviation": 0.01054371383,
2258
- "Rank": 51
2259
- },
2260
- "Probability": {
2261
- "Average Score": 0.043654362,
2262
- "Standard Deviation": 0.004288231886,
2263
- "Rank": 53
2264
- },
2265
- "Logical": {
2266
- "Average Score": 0.074694053,
2267
- "Standard Deviation": 0.002674646998,
2268
- "Rank": 50
2269
- },
2270
- "Social": {
2271
- "Average Score": 0.096983835,
2272
- "Standard Deviation": 0.007847059783,
2273
- "Rank": 51
2274
- },
2275
- "Chemistry": {
2276
- "Average Score": 6.36433272373514,
2277
- "Standard Deviation": null,
2278
- "Rank": 50
2279
- }
2280
- }
2281
- },
2282
- {
2283
- "config": {
2284
- "model_name": "openassistant-pythia-12b",
2285
- "organization": "OpenAssistant",
2286
- "license": "Non-commercial",
2287
- "knowledge_cutoff": "2023/04"
2288
- },
2289
- "results": {
2290
- "OVERALL": {
2291
- "Average Score": 0.0,
2292
- "Standard Deviation": 0.0,
2293
- "Rank": 52
2294
- },
2295
- "Geometry": {
2296
- "Average Score": 0.0,
2297
- "Standard Deviation": 0.0,
2298
- "Rank": 52
2299
- },
2300
- "Algebra": {
2301
- "Average Score": 0.0,
2302
- "Standard Deviation": 0.0,
2303
- "Rank": 52
2304
- },
2305
- "Probability": {
2306
- "Average Score": 0.0,
2307
- "Standard Deviation": 0.0,
2308
- "Rank": 54
2309
- },
2310
- "Logical": {
2311
- "Average Score": 0.0,
2312
- "Standard Deviation": 0.0,
2313
- "Rank": 53
2314
- },
2315
- "Social": {
2316
- "Average Score": 0.030792528,
2317
- "Standard Deviation": 0.007518796391,
2318
- "Rank": 53
2319
- },
2320
- "Chemistry": {
2321
- "Average Score": 0.0,
2322
- "Standard Deviation": null,
2323
- "Rank": 51
2324
- }
2325
- }
2326
- }
2327
- ]
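
Both deleted snapshots share one schema: a list of entries, each with a "config" block (model_name, organization, license, knowledge_cutoff) and a "results" block keyed by dimension, where every dimension carries an Average Score, a Standard Deviation, and a Rank. Two quirks are visible in the data itself: Chemistry scores sit on a 0-100 scale with a null Standard Deviation while the other dimensions use 0-1 scores, and dimensions a model was not run on are encoded as the string "N/A". A minimal sketch of how such a file could be flattened for display follows; load_results is a hypothetical helper, not the repo's actual get_model_leaderboard_df, which may differ.

import json
import pandas as pd

def load_results(path: str) -> pd.DataFrame:
    # Hypothetical flattener for the results schema above.
    with open(path) as f:
        entries = json.load(f)
    rows = []
    for entry in entries:
        row = dict(entry["config"])  # model_name, organization, license, knowledge_cutoff
        for dim, stats in entry["results"].items():
            # "N/A" marks dimensions a model was not evaluated on;
            # map them to missing values instead of fake scores.
            score = stats["Average Score"]
            rank = stats["Rank"]
            row[f"{dim} Score"] = None if score == "N/A" else score
            row[f"{dim} Rank"] = None if rank == "N/A" else rank
        rows.append(row)
    return pd.DataFrame(rows)

# Example usage (against one of the retained snapshots):
# df = load_results("./src/results/models_2024-10-09-05:17:38.810960.json")
# df.sort_values("OVERALL Rank").head()
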
src/results/models_2024-10-09-06:22:21.122422.json DELETED
@@ -1,2372 +0,0 @@
- [
- {
- "config": {
- "model_name": "ChatGPT-4o-latest (2024-09-03)",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/10"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.974329609,
- "Standard Deviation": 0.005024959031,
- "Rank": 2
- },
- "Geometry": {
- "Average Score": 0.976028578,
- "Standard Deviation": 0.01507912373,
- "Rank": 3
- },
- "Algebra": {
- "Average Score": 0.951199453,
- "Standard Deviation": 0.08452452108,
- "Rank": 3
- },
- "Probability": {
- "Average Score": 0.842116641,
- "Standard Deviation": 0.006267759054,
- "Rank": 3
- },
- "Logical": {
- "Average Score": 0.828490728,
- "Standard Deviation": 0.009134213144,
- "Rank": 3
- },
- "Social": {
- "Average Score": 0.815902987,
- "Standard Deviation": 0.0196254222,
- "Rank": 3
- },
- "Chemistry": {
- "Average Score": 100.0,
- "Standard Deviation": null,
- "Rank": 1
- }
- }
- },
- {
- "config": {
- "model_name": "meta-llama-3.1-70b-instruct",
- "organization": "Meta",
- "license": "Llama 3.1 Community",
- "knowledge_cutoff": "2023/12"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.708874896,
- "Standard Deviation": 0.1315111956,
- "Rank": 13
- },
- "Geometry": {
- "Average Score": 0.76184398,
- "Standard Deviation": 0.01790377984,
- "Rank": 10
- },
- "Algebra": {
- "Average Score": 0.732041699,
- "Standard Deviation": 0.02621439062,
- "Rank": 9
- },
- "Probability": {
- "Average Score": 0.676208383,
- "Standard Deviation": 0.05131201636,
- "Rank": 10
- },
- "Logical": {
- "Average Score": 0.620018631,
- "Standard Deviation": 0.02518873821,
- "Rank": 14
- },
- "Social": {
- "Average Score": 0.45872939,
- "Standard Deviation": 0.05347039576,
- "Rank": 20
- },
- "Chemistry": {
- "Average Score": 84.36815192532764,
- "Standard Deviation": null,
- "Rank": 4
- }
- }
- },
- {
- "config": {
- "model_name": "gpt-4o-2024-08-06",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/10"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.846571548,
- "Standard Deviation": 0.03394056554,
- "Rank": 6
- },
- "Geometry": {
- "Average Score": 0.99773096,
- "Standard Deviation": 0.002835555172,
- "Rank": 1
- },
- "Algebra": {
- "Average Score": 1.0,
- "Standard Deviation": 0.0,
- "Rank": 1
- },
- "Probability": {
- "Average Score": 0.78855795,
- "Standard Deviation": 0.008188675452,
- "Rank": 6
- },
- "Logical": {
- "Average Score": 0.668635768,
- "Standard Deviation": 0.03466314094,
- "Rank": 11
- },
- "Social": {
- "Average Score": 0.680417314,
- "Standard Deviation": 0.00656867063,
- "Rank": 8
- },
- "Chemistry": {
- "Average Score": 92.43090226400756,
- "Standard Deviation": null,
- "Rank": 2
- }
- }
- },
- {
- "config": {
- "model_name": "gpt-4o-2024-05-13",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/10"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.846334477,
- "Standard Deviation": 0.09377911572,
- "Rank": 7
- },
- "Geometry": {
- "Average Score": 0.972472377,
- "Standard Deviation": 0.01648274205,
- "Rank": 4
- },
- "Algebra": {
- "Average Score": 0.995511298,
- "Standard Deviation": 0.004097802515,
- "Rank": 2
- },
- "Probability": {
- "Average Score": 0.812149974,
- "Standard Deviation": 0.007669585485,
- "Rank": 4
- },
- "Logical": {
- "Average Score": 0.755019692,
- "Standard Deviation": 0.008149588572,
- "Rank": 6
- },
- "Social": {
- "Average Score": 0.609875087,
- "Standard Deviation": 0.038729239,
- "Rank": 13
- },
- "Chemistry": {
- "Average Score": 79.1592634699295,
- "Standard Deviation": null,
- "Rank": 6
- }
- }
- },
- {
- "config": {
- "model_name": "gpt-4-turbo-2024-04-09",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/12"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.855357972,
- "Standard Deviation": 0.1016986368,
- "Rank": 4
- },
- "Geometry": {
- "Average Score": 0.95374588,
- "Standard Deviation": 0.03109307166,
- "Rank": 5
- },
- "Algebra": {
- "Average Score": 0.930945223,
- "Standard Deviation": 0.06705136813,
- "Rank": 4
- },
- "Probability": {
- "Average Score": 0.750705448,
- "Standard Deviation": 0.05944483103,
- "Rank": 8
- },
- "Logical": {
- "Average Score": 0.77906699,
- "Standard Deviation": 0.007406734161,
- "Rank": 4
- },
- "Social": {
- "Average Score": 0.715935163,
- "Standard Deviation": 0.1209141409,
- "Rank": 6
- },
- "Chemistry": {
- "Average Score": 70.73143363230263,
- "Standard Deviation": null,
- "Rank": 11
- }
- }
- },
- {
- "config": {
- "model_name": "gemini-1.5-pro-001",
- "organization": "Google",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/11"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.797187842,
- "Standard Deviation": 0.0272375249,
- "Rank": 10
- },
- "Geometry": {
- "Average Score": 0.9947169,
- "Standard Deviation": 0.009150597621,
- "Rank": 2
- },
- "Algebra": {
- "Average Score": 0.857464301,
- "Standard Deviation": 0.05014285338,
- "Rank": 5
- },
- "Probability": {
- "Average Score": 0.651781767,
- "Standard Deviation": 0.04156998547,
- "Rank": 11
- },
- "Logical": {
- "Average Score": 0.739745471,
- "Standard Deviation": 0.01631532019,
- "Rank": 7
- },
- "Social": {
- "Average Score": 0.649601885,
- "Standard Deviation": 0.104854889,
- "Rank": 11
- }
- }
- },
- {
- "config": {
- "model_name": "qwen2-72b-instruct",
- "organization": "Alibaba",
- "license": "Qianwen LICENSE",
- "knowledge_cutoff": "2024/09"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.737918558,
- "Standard Deviation": 0.09069077339,
- "Rank": 11
- },
- "Geometry": {
- "Average Score": 0.796870305,
- "Standard Deviation": 0.0509025346,
- "Rank": 9
- },
- "Algebra": {
- "Average Score": 0.836194231,
- "Standard Deviation": 0.04517093028,
- "Rank": 6
- },
- "Probability": {
- "Average Score": 0.788068004,
- "Standard Deviation": 0.007288989044,
- "Rank": 7
- },
- "Logical": {
- "Average Score": 0.619300904,
- "Standard Deviation": 0.06377931612,
- "Rank": 15
- },
- "Social": {
- "Average Score": 0.652578786,
- "Standard Deviation": 0.04259293171,
- "Rank": 10
- },
- "Chemistry": {
- "Average Score": 73.54037778797029,
- "Standard Deviation": null,
- "Rank": 8
- }
- }
- },
- {
- "config": {
- "model_name": "gpt-4o-mini-2024-07-18",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/10"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.847694133,
- "Standard Deviation": 0.02164304402,
- "Rank": 5
- },
- "Geometry": {
- "Average Score": 0.946650435,
- "Standard Deviation": 0.01831236482,
- "Rank": 7
- },
- "Algebra": {
- "Average Score": 0.796243022,
- "Standard Deviation": 0.05537539202,
- "Rank": 7
- },
- "Probability": {
- "Average Score": 0.798402685,
- "Standard Deviation": 0.009404491967,
- "Rank": 5
- },
- "Logical": {
- "Average Score": 0.727009735,
- "Standard Deviation": 0.02628110141,
- "Rank": 8
- },
- "Social": {
- "Average Score": 0.691949855,
- "Standard Deviation": 0.02072934333,
- "Rank": 7
- },
- "Chemistry": {
- "Average Score": 88.3877070580296,
- "Standard Deviation": null,
- "Rank": 3
- }
- }
- },
- {
- "config": {
- "model_name": "claude-3.5-sonnet",
- "organization": "Anthropic",
- "license": "Proprietary",
- "knowledge_cutoff": "2024/04"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.839004422,
- "Standard Deviation": 0.1461079564,
- "Rank": 8
- },
- "Geometry": {
- "Average Score": 0.95316419,
- "Standard Deviation": 0.02081192856,
- "Rank": 6
- },
- "Algebra": {
- "Average Score": 0.759789952,
- "Standard Deviation": 0.02611765096,
- "Rank": 8
- },
- "Probability": {
- "Average Score": 0.707730127,
- "Standard Deviation": 0.0394436664,
- "Rank": 9
- },
- "Logical": {
- "Average Score": 0.77342666,
- "Standard Deviation": 0.002892426458,
- "Rank": 5
- },
- "Social": {
- "Average Score": 0.790002247,
- "Standard Deviation": 0.1007410022,
- "Rank": 4
- },
- "Chemistry": {
- "Average Score": 82.37734076815008,
- "Standard Deviation": null,
- "Rank": 5
- }
- }
- },
- {
- "config": {
- "model_name": "o1-mini",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/10"
- },
- "results": {
- "OVERALL": {
- "Average Score": 1.0,
- "Standard Deviation": 0.0,
- "Rank": 1
- },
- "Geometry": {
- "Average Score": "N/A",
- "Standard Deviation": "N/A",
- "Rank": "N/A"
- },
- "Algebra": {
- "Average Score": "N/A",
- "Standard Deviation": "N/A",
- "Rank": "N/A"
- },
- "Probability": {
- "Average Score": 1.0,
- "Standard Deviation": 0.0,
- "Rank": 1
- },
- "Logical": {
- "Average Score": 1.0,
- "Standard Deviation": 0.0,
- "Rank": 1
- },
- "Social": {
- "Average Score": 0.993974241,
- "Standard Deviation": 0.001996882328,
- "Rank": 2
- }
- }
- },
- {
- "config": {
- "model_name": "o1-preview",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/10"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.945884589,
- "Standard Deviation": 0.01059250762,
- "Rank": 3
- },
- "Geometry": {
- "Average Score": "N/A",
- "Standard Deviation": "N/A",
- "Rank": "N/A"
- },
- "Algebra": {
- "Average Score": "N/A",
- "Standard Deviation": "N/A",
- "Rank": "N/A"
- },
- "Probability": {
- "Average Score": 0.964666392,
- "Standard Deviation": 0.003139983398,
- "Rank": 2
- },
- "Logical": {
- "Average Score": 0.987950057,
- "Standard Deviation": 0.004881220327,
- "Rank": 2
- },
- "Social": {
- "Average Score": 1.0,
- "Standard Deviation": 0.0,
- "Rank": 1
- }
- }
- },
- {
- "config": {
- "model_name": "gemini-1.5-flash-001",
- "organization": "Google",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/11"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.726493401,
- "Standard Deviation": 0.01113913725,
- "Rank": 12
- },
- "Geometry": {
- "Average Score": 0.804144103,
- "Standard Deviation": 0.1327142178,
- "Rank": 8
- },
- "Algebra": {
- "Average Score": 0.731776765,
- "Standard Deviation": 0.02594657111,
- "Rank": 10
- },
- "Probability": {
- "Average Score": 0.614461891,
- "Standard Deviation": 0.04690131826,
- "Rank": 14
- },
- "Logical": {
- "Average Score": 0.630805991,
- "Standard Deviation": 0.04871350612,
- "Rank": 13
- },
- "Social": {
- "Average Score": 0.555933822,
- "Standard Deviation": 0.1029934524,
- "Rank": 15
- },
- "Chemistry": {
- "Average Score": 72.1127762005651,
- "Standard Deviation": null,
- "Rank": 10
- }
- }
- },
- {
- "config": {
- "model_name": "gpt4-1106",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2024/04"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.816347784,
- "Standard Deviation": 0.1566815755,
- "Rank": 9
- },
- "Geometry": {
- "Average Score": 0.71843088,
- "Standard Deviation": 0.04778038294,
- "Rank": 12
- },
- "Algebra": {
- "Average Score": 0.712910417,
- "Standard Deviation": 0.02581828898,
- "Rank": 11
- },
- "Probability": {
- "Average Score": 0.623947619,
- "Standard Deviation": 0.03502982933,
- "Rank": 13
- },
- "Logical": {
- "Average Score": 0.637482274,
- "Standard Deviation": 0.04158809888,
- "Rank": 12
- },
- "Social": {
- "Average Score": 0.450609816,
- "Standard Deviation": 0.05208655446,
- "Rank": 22
- },
- "Chemistry": {
- "Average Score": 69.11824072252848,
- "Standard Deviation": null,
- "Rank": 12
- }
- }
- },
- {
- "config": {
- "model_name": "gemma-2-27b-it",
- "organization": "Google",
- "license": "Gemma License",
- "knowledge_cutoff": "2024/06"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.624169623,
- "Standard Deviation": 0.1048365121,
- "Rank": 15
- },
- "Geometry": {
- "Average Score": 0.60112744,
- "Standard Deviation": 0.0469109952,
- "Rank": 18
- },
- "Algebra": {
- "Average Score": 0.687955914,
- "Standard Deviation": 0.01959958192,
- "Rank": 12
- },
- "Probability": {
- "Average Score": 0.589524771,
- "Standard Deviation": 0.03112689325,
- "Rank": 15
- },
- "Logical": {
- "Average Score": 0.614978944,
- "Standard Deviation": 0.05710657859,
- "Rank": 16
- },
- "Social": {
- "Average Score": 0.487844257,
- "Standard Deviation": 0.05857760809,
- "Rank": 18
- },
- "Chemistry": {
- "Average Score": 63.28920072143611,
- "Standard Deviation": null,
- "Rank": 14
- }
- }
- },
- {
- "config": {
- "model_name": "claude-3-opus",
- "organization": "Anthropic",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/08"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.650636271,
- "Standard Deviation": 0.1197773541,
- "Rank": 14
- },
- "Geometry": {
- "Average Score": 0.7215743,
- "Standard Deviation": 0.04712598358,
- "Rank": 11
- },
- "Algebra": {
- "Average Score": 0.68777327,
- "Standard Deviation": 0.02382683713,
- "Rank": 13
- },
- "Probability": {
- "Average Score": 0.626471421,
- "Standard Deviation": 0.02911817976,
- "Rank": 12
- },
- "Logical": {
- "Average Score": 0.692346381,
- "Standard Deviation": 0.03617185198,
- "Rank": 10
- },
- "Social": {
- "Average Score": 0.663410854,
- "Standard Deviation": 0.09540220876,
- "Rank": 9
- },
- "Chemistry": {
- "Average Score": 73.5404403567132,
- "Standard Deviation": null,
- "Rank": 7
- }
- }
- },
- {
- "config": {
- "model_name": "gemma-2-9b-it-simpo",
- "organization": "Google",
- "license": "Gemma License",
- "knowledge_cutoff": "2024/07"
- },
- "results": {
- "OVERALL": {
- "Average Score": "N/A",
- "Standard Deviation": "N/A",
- "Rank": "N/A"
- },
- "Geometry": {
- "Average Score": 0.582787508,
- "Standard Deviation": 0.03965204074,
- "Rank": 19
- },
- "Algebra": {
- "Average Score": 0.658648133,
- "Standard Deviation": 0.02565919856,
- "Rank": 14
- },
- "Probability": {
- "Average Score": 0.547861265,
- "Standard Deviation": 0.02885209131,
- "Rank": 18
- },
- "Logical": {
- "Average Score": 0.540720893,
- "Standard Deviation": 0.01970134508,
- "Rank": 20
- },
- "Social": {
- "Average Score": 0.635266187,
- "Standard Deviation": 0.03620021751,
- "Rank": 12
- },
- "Chemistry": {
- "Average Score": 73.43757596214863,
- "Standard Deviation": null,
- "Rank": 9
- }
- }
- },
- {
- "config": {
- "model_name": "qwen1.5-72b-chat",
- "organization": "Alibaba",
- "license": "Qianwen LICENSE",
- "knowledge_cutoff": "2024/03"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.519549796,
- "Standard Deviation": 0.00903634343,
- "Rank": 18
- },
- "Geometry": {
- "Average Score": 0.543139301,
- "Standard Deviation": 0.03425202326,
- "Rank": 23
- },
- "Algebra": {
- "Average Score": 0.635228729,
- "Standard Deviation": 0.01944043425,
- "Rank": 15
- },
- "Probability": {
- "Average Score": 0.486948658,
- "Standard Deviation": 0.06064655315,
- "Rank": 22
- },
- "Logical": {
- "Average Score": 0.284069394,
- "Standard Deviation": 0.02686608506,
- "Rank": 33
- },
- "Social": {
- "Average Score": 0.415007627,
- "Standard Deviation": 0.03920053159,
- "Rank": 23
- },
- "Chemistry": {
- "Average Score": 48.69302376665551,
- "Standard Deviation": null,
- "Rank": 20
- }
- }
- },
- {
- "config": {
- "model_name": "qwen1.5-32b-chat",
- "organization": "Alibaba",
- "license": "Qianwen LICENSE",
- "knowledge_cutoff": "2024/03"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.393789407,
- "Standard Deviation": 0.05413770095,
- "Rank": 29
- },
- "Geometry": {
- "Average Score": 0.51086835,
- "Standard Deviation": 0.04052471998,
- "Rank": 26
- },
- "Algebra": {
- "Average Score": 0.609003168,
- "Standard Deviation": 0.04874143541,
- "Rank": 16
- },
- "Probability": {
- "Average Score": 0.476300002,
- "Standard Deviation": 0.05322403912,
- "Rank": 23
- },
- "Logical": {
- "Average Score": 0.331781014,
- "Standard Deviation": 0.004938997686,
- "Rank": 30
- },
- "Social": {
- "Average Score": 0.380987334,
- "Standard Deviation": 0.03762251776,
- "Rank": 25
- },
- "Chemistry": {
- "Average Score": 45.14284028264288,
- "Standard Deviation": null,
- "Rank": 24
- }
- }
- },
- {
- "config": {
- "model_name": "google-gemma-2-9b-it",
- "organization": "Google",
- "license": "Proprietary",
- "knowledge_cutoff": "2024/06"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.489663449,
- "Standard Deviation": 0.002595702019,
- "Rank": 21
- },
- "Geometry": {
- "Average Score": 0.575371308,
- "Standard Deviation": 0.03556220251,
- "Rank": 21
- },
- "Algebra": {
- "Average Score": 0.597045661,
- "Standard Deviation": 0.0313828123,
- "Rank": 17
- },
- "Probability": {
- "Average Score": 0.589221807,
- "Standard Deviation": 0.03110811656,
- "Rank": 17
- },
- "Logical": {
- "Average Score": 0.587579897,
- "Standard Deviation": 0.05512716783,
- "Rank": 18
- },
- "Social": {
- "Average Score": 0.768337958,
- "Standard Deviation": 0.04078610476,
- "Rank": 5
- },
- "Chemistry": {
- "Average Score": 54.03167523687635,
- "Standard Deviation": null,
- "Rank": 17
- }
- }
- },
- {
- "config": {
- "model_name": "yi-1.5-34b-chat",
- "organization": "01 AI",
- "license": "Proprietary",
- "knowledge_cutoff": "2024/05"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.607812897,
- "Standard Deviation": 0.1440881293,
- "Rank": 16
- },
- "Geometry": {
- "Average Score": 0.566666724,
- "Standard Deviation": 0.04001381658,
- "Rank": 22
- },
- "Algebra": {
- "Average Score": 0.590997292,
- "Standard Deviation": 0.03594087315,
- "Rank": 18
- },
- "Probability": {
- "Average Score": 0.589524589,
- "Standard Deviation": 0.03112618772,
- "Rank": 16
- },
- "Logical": {
- "Average Score": 0.574105508,
- "Standard Deviation": 0.03441737941,
- "Rank": 19
- },
- "Social": {
- "Average Score": 0.516980832,
- "Standard Deviation": 0.03369347985,
- "Rank": 17
- },
- "Chemistry": {
- "Average Score": 52.148798061768964,
- "Standard Deviation": null,
- "Rank": 18
- }
- }
- },
- {
- "config": {
- "model_name": "meta-llama-3.1-8b-instruct",
- "organization": "Meta",
- "license": "Llama 3.1 Community",
- "knowledge_cutoff": "2023/12"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.505936324,
- "Standard Deviation": 0.05286756493,
- "Rank": 19
- },
- "Geometry": {
- "Average Score": 0.522442162,
- "Standard Deviation": 0.03908236317,
- "Rank": 24
- },
- "Algebra": {
- "Average Score": 0.582702645,
- "Standard Deviation": 0.05002277711,
- "Rank": 19
- },
- "Probability": {
- "Average Score": 0.495001149,
- "Standard Deviation": 0.05244587037,
- "Rank": 21
- },
- "Logical": {
- "Average Score": 0.443030561,
- "Standard Deviation": 0.01343820628,
- "Rank": 25
- },
- "Social": {
- "Average Score": 0.329195941,
- "Standard Deviation": 0.03925019528,
- "Rank": 29
- },
- "Chemistry": {
- "Average Score": 44.41846841004584,
- "Standard Deviation": null,
- "Rank": 26
- }
- }
- },
- {
- "config": {
- "model_name": "gpt3.5-turbo-0125",
- "organization": "OpenAI",
- "license": "Proprietary",
- "knowledge_cutoff": "2021/09"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.313398088,
- "Standard Deviation": 0.09322528606,
- "Rank": 40
- },
- "Geometry": {
- "Average Score": 0.678714519,
- "Standard Deviation": 0.05926546762,
- "Rank": 13
- },
- "Algebra": {
- "Average Score": 0.569296173,
- "Standard Deviation": 0.05277281097,
- "Rank": 20
- },
- "Probability": {
- "Average Score": 0.448460767,
- "Standard Deviation": 0.05768095196,
- "Rank": 25
- },
- "Logical": {
- "Average Score": 0.148521348,
- "Standard Deviation": 0.04033712907,
- "Rank": 45
- },
- "Social": {
- "Average Score": 0.235071541,
- "Standard Deviation": 0.02632892457,
- "Rank": 38
- },
- "Chemistry": {
- "Average Score": 40.46958736582551,
- "Standard Deviation": null,
- "Rank": 29
- }
- }
- },
- {
- "config": {
- "model_name": "llama-3-70b-instruct",
- "organization": "Meta",
- "license": "Llama 3 Community",
- "knowledge_cutoff": "2023/12"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.456689885,
- "Standard Deviation": 0.01385989995,
- "Rank": 23
- },
- "Geometry": {
- "Average Score": 0.516865529,
- "Standard Deviation": 0.03858112564,
- "Rank": 25
- },
- "Algebra": {
- "Average Score": 0.566756531,
- "Standard Deviation": 0.03369826926,
- "Rank": 21
- },
- "Probability": {
- "Average Score": 0.513857306,
- "Standard Deviation": 0.05453699062,
- "Rank": 20
- },
- "Logical": {
- "Average Score": 0.713796415,
- "Standard Deviation": 0.02031215107,
- "Rank": 9
- },
- "Social": {
- "Average Score": 0.45872939,
- "Standard Deviation": 0.05347039576,
- "Rank": 21
- },
- "Chemistry": {
- "Average Score": 65.32140697218945,
- "Standard Deviation": null,
- "Rank": 13
- }
- }
- },
- {
- "config": {
- "model_name": "claude-3-sonnet",
- "organization": "Anthropic",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/08"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.520010833,
- "Standard Deviation": 0.005030563799,
- "Rank": 17
- },
- "Geometry": {
- "Average Score": 0.675613638,
- "Standard Deviation": 0.05275594408,
- "Rank": 14
- },
- "Algebra": {
- "Average Score": 0.552025728,
- "Standard Deviation": 0.04122192409,
- "Rank": 22
- },
- "Probability": {
- "Average Score": 0.516192848,
- "Standard Deviation": 0.04152293217,
- "Rank": 19
- },
- "Logical": {
- "Average Score": 0.588545747,
- "Standard Deviation": 0.06068211943,
- "Rank": 17
- },
- "Social": {
- "Average Score": 0.570437582,
- "Standard Deviation": 0.08607040862,
- "Rank": 14
- },
- "Chemistry": {
- "Average Score": 61.33538592327427,
- "Standard Deviation": null,
- "Rank": 15
- }
- }
- },
- {
- "config": {
- "model_name": "qwen1.5-14b-chat",
- "organization": "Alibaba",
- "license": "Qianwen LICENSE",
- "knowledge_cutoff": "2024/02"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.415328996,
- "Standard Deviation": 0.0743938717,
- "Rank": 28
- },
- "Geometry": {
- "Average Score": 0.452504016,
- "Standard Deviation": 0.04225594393,
- "Rank": 27
- },
- "Algebra": {
- "Average Score": 0.538655725,
- "Standard Deviation": 0.03721542594,
- "Rank": 23
- },
- "Probability": {
- "Average Score": 0.397185975,
- "Standard Deviation": 0.05607695946,
- "Rank": 29
- },
- "Logical": {
- "Average Score": 0.264573129,
- "Standard Deviation": 0.03936133174,
- "Rank": 35
- },
- "Social": {
- "Average Score": 0.287370142,
- "Standard Deviation": 0.04264085315,
- "Rank": 31
- },
- "Chemistry": {
- "Average Score": 38.552779976347026,
- "Standard Deviation": null,
- "Rank": 31
- }
- }
- },
- {
- "config": {
- "model_name": "claude-3-haiku",
- "organization": "Anthropic",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/08"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.453901163,
- "Standard Deviation": 0.003604084261,
- "Rank": 24
- },
- "Geometry": {
- "Average Score": 0.607993912,
- "Standard Deviation": 0.05793460748,
- "Rank": 16
- },
- "Algebra": {
- "Average Score": 0.520054055,
- "Standard Deviation": 0.03333544511,
- "Rank": 24
- },
- "Probability": {
- "Average Score": 0.474460688,
- "Standard Deviation": 0.0446501933,
- "Rank": 24
- },
- "Logical": {
- "Average Score": 0.512815976,
- "Standard Deviation": 0.0163264281,
- "Rank": 21
- },
- "Social": {
- "Average Score": 0.551083976,
- "Standard Deviation": 0.05374722539,
- "Rank": 16
- },
- "Chemistry": {
- "Average Score": 56.40200048817984,
- "Standard Deviation": null,
- "Rank": 16
- }
- }
- },
- {
- "config": {
- "model_name": "claude-2.1",
- "organization": "Anthropic",
- "license": "Proprietary",
- "knowledge_cutoff": "Unknown"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.35814708,
- "Standard Deviation": 0.09168134168,
- "Rank": 36
- },
- "Geometry": {
- "Average Score": 0.62752395,
- "Standard Deviation": 0.07232659398,
- "Rank": 15
- },
- "Algebra": {
- "Average Score": 0.508849609,
- "Standard Deviation": 0.0346897465,
- "Rank": 25
- },
- "Probability": {
- "Average Score": 0.41477086,
- "Standard Deviation": 0.05964060239,
- "Rank": 28
- },
- "Logical": {
- "Average Score": 0.482923674,
- "Standard Deviation": 0.01989147048,
- "Rank": 22
- },
- "Social": {
- "Average Score": 0.333804568,
- "Standard Deviation": 0.03775548253,
- "Rank": 28
- },
- "Chemistry": {
- "Average Score": 47.23672563994903,
- "Standard Deviation": null,
- "Rank": 21
- }
- }
- },
- {
- "config": {
- "model_name": "mistral-8x7b-instruct-v0.1",
- "organization": "Mistral",
- "license": "Apache 2.0",
- "knowledge_cutoff": "2023/12"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.382659161,
- "Standard Deviation": 0.07594496929,
- "Rank": 31
- },
- "Geometry": {
- "Average Score": 0.432216097,
- "Standard Deviation": 0.04747949254,
- "Rank": 30
- },
- "Algebra": {
- "Average Score": 0.478314888,
- "Standard Deviation": 0.01998797419,
- "Rank": 26
- },
- "Probability": {
- "Average Score": 0.427144725,
- "Standard Deviation": 0.0590923329,
- "Rank": 27
- },
- "Logical": {
- "Average Score": 0.340041983,
- "Standard Deviation": 0.008397574592,
- "Rank": 28
- },
- "Social": {
- "Average Score": 0.251949622,
- "Standard Deviation": 0.03346674405,
- "Rank": 36
- },
- "Chemistry": {
- "Average Score": 44.533118241976666,
- "Standard Deviation": null,
- "Rank": 25
- }
- }
- },
- {
- "config": {
- "model_name": "claude-2.0",
- "organization": "Anthropic",
- "license": "Proprietary",
- "knowledge_cutoff": "Unknown"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.322718057,
- "Standard Deviation": 0.08369883584,
- "Rank": 38
- },
- "Geometry": {
- "Average Score": 0.604141967,
- "Standard Deviation": 0.05116441826,
- "Rank": 17
- },
- "Algebra": {
- "Average Score": 0.474350734,
- "Standard Deviation": 0.01510393066,
- "Rank": 27
- },
- "Probability": {
- "Average Score": 0.437950412,
- "Standard Deviation": 0.05985594317,
- "Rank": 26
- },
- "Logical": {
- "Average Score": 0.445620646,
- "Standard Deviation": 0.01812614805,
- "Rank": 24
- },
- "Social": {
- "Average Score": 0.469422836,
- "Standard Deviation": 0.05999901796,
- "Rank": 19
- },
- "Chemistry": {
- "Average Score": 50.773143448036464,
- "Standard Deviation": null,
- "Rank": 19
- }
- }
- },
- {
- "config": {
- "model_name": "starling-lm-7b-beta",
- "organization": "Nexusflow",
- "license": "Apache-2.0",
- "knowledge_cutoff": "2024/03"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.479391856,
- "Standard Deviation": 0.04199990887,
- "Rank": 22
- },
- "Geometry": {
- "Average Score": 0.446654388,
- "Standard Deviation": 0.05637864999,
- "Rank": 29
- },
- "Algebra": {
- "Average Score": 0.473952749,
- "Standard Deviation": 0.01584301288,
- "Rank": 28
- },
- "Probability": {
- "Average Score": 0.395197837,
- "Standard Deviation": 0.05814798892,
- "Rank": 30
- },
- "Logical": {
- "Average Score": 0.39927199,
- "Standard Deviation": 0.02125277518,
- "Rank": 26
- },
- "Social": {
- "Average Score": 0.380021662,
- "Standard Deviation": 0.04622452748,
- "Rank": 26
- },
- "Chemistry": {
- "Average Score": 38.27587102395908,
- "Standard Deviation": null,
- "Rank": 32
- }
- }
- },
- {
- "config": {
- "model_name": "gemini-1.0-pro-001",
- "organization": "Google",
- "license": "Proprietary",
- "knowledge_cutoff": "2023/04"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.449040654,
- "Standard Deviation": 0.0450610177,
- "Rank": 25
- },
- "Geometry": {
- "Average Score": 0.578347959,
- "Standard Deviation": 0.04242873607,
- "Rank": 20
- },
- "Algebra": {
- "Average Score": 0.462417786,
- "Standard Deviation": 0.01668313635,
- "Rank": 29
- },
- "Probability": {
- "Average Score": 0.289836324,
- "Standard Deviation": 0.05739831115,
- "Rank": 38
- },
- "Logical": {
- "Average Score": 0.191140355,
- "Standard Deviation": 0.03394652499,
- "Rank": 41
- },
- "Social": {
- "Average Score": 0.130790863,
- "Standard Deviation": 0.02800188173,
- "Rank": 46
- },
- "Chemistry": {
- "Average Score": 45.22204471452975,
- "Standard Deviation": null,
- "Rank": 23
- }
- }
- },
- {
- "config": {
- "model_name": "openchat-3.5-0106",
- "organization": "OpenChat",
- "license": "Apache-2.0",
- "knowledge_cutoff": "2024/01"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.363929888,
- "Standard Deviation": 0.08602347145,
- "Rank": 34
- },
- "Geometry": {
- "Average Score": 0.38715246,
- "Standard Deviation": 0.03701851946,
- "Rank": 33
- },
- "Algebra": {
- "Average Score": 0.441233712,
- "Standard Deviation": 0.01135753754,
- "Rank": 30
- },
- "Probability": {
- "Average Score": 0.38802618,
- "Standard Deviation": 0.05663879714,
- "Rank": 31
- },
- "Logical": {
- "Average Score": 0.336754383,
- "Standard Deviation": 0.01608478079,
- "Rank": 29
- },
- "Social": {
- "Average Score": 0.250891608,
- "Standard Deviation": 0.03253769914,
- "Rank": 37
- },
- "Chemistry": {
- "Average Score": 33.70639271807677,
- "Standard Deviation": null,
- "Rank": 33
- }
- }
- },
- {
- "config": {
- "model_name": "openchat-3.5",
- "organization": "OpenChat",
- "license": "Apache-2.0",
- "knowledge_cutoff": "2023/11"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.361341296,
- "Standard Deviation": 0.09034869493,
- "Rank": 35
- },
- "Geometry": {
- "Average Score": 0.401699069,
- "Standard Deviation": 0.03410726557,
- "Rank": 31
- },
- "Algebra": {
- "Average Score": 0.414095336,
- "Standard Deviation": 0.01881964261,
- "Rank": 32
- },
- "Probability": {
- "Average Score": 0.349601002,
- "Standard Deviation": 0.05077455539,
- "Rank": 33
- },
- "Logical": {
- "Average Score": 0.331069242,
- "Standard Deviation": 0.02180827173,
- "Rank": 31
- },
- "Social": {
- "Average Score": 0.319991655,
- "Standard Deviation": 0.04502478724,
- "Rank": 30
- },
- "Chemistry": {
- "Average Score": 33.020911255646965,
- "Standard Deviation": null,
- "Rank": 34
- }
- }
- },
- {
- "config": {
- "model_name": "command-r-(08-2024)",
- "organization": "Cohere",
- "license": "CC-BY-NC-4.0",
- "knowledge_cutoff": "2024/08"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.427605298,
- "Standard Deviation": 0.01747449163,
- "Rank": 26
- },
- "Geometry": {
- "Average Score": 0.448300727,
- "Standard Deviation": 0.04996362328,
- "Rank": 28
- },
- "Algebra": {
- "Average Score": 0.417519167,
- "Standard Deviation": 0.01822196902,
- "Rank": 31
- },
- "Probability": {
- "Average Score": 0.366336281,
- "Standard Deviation": 0.04716826942,
- "Rank": 32
- },
- "Logical": {
- "Average Score": 0.214657906,
- "Standard Deviation": 0.03003579835,
- "Rank": 38
- },
- "Social": {
- "Average Score": 0.276088379,
- "Standard Deviation": 0.03295234688,
- "Rank": 33
- },
- "Chemistry": {
- "Average Score": 39.61492485677676,
- "Standard Deviation": null,
- "Rank": 30
- }
- }
- },
- {
- "config": {
- "model_name": "gemma-1.1-7b-it",
- "organization": "Google",
- "license": "Gemma License",
- "knowledge_cutoff": "2024/02"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.339506922,
- "Standard Deviation": 0.1066279108,
- "Rank": 37
- },
- "Geometry": {
- "Average Score": 0.324170977,
- "Standard Deviation": 0.04668553765,
- "Rank": 36
- },
- "Algebra": {
- "Average Score": 0.398684697,
- "Standard Deviation": 0.01982398259,
- "Rank": 33
- },
- "Probability": {
- "Average Score": 0.293253175,
- "Standard Deviation": 0.05126192191,
- "Rank": 37
- },
- "Logical": {
- "Average Score": 0.317750796,
- "Standard Deviation": 0.01101933543,
- "Rank": 32
- },
- "Social": {
- "Average Score": 0.179073276,
- "Standard Deviation": 0.02009658805,
- "Rank": 42
- },
- "Chemistry": {
- "Average Score": 42.666504105798204,
- "Standard Deviation": null,
- "Rank": 27
- }
- }
- },
- {
- "config": {
- "model_name": "llama3-8b-instruct",
- "organization": "Meta",
- "license": "Llama 3 Community",
- "knowledge_cutoff": "2023/03"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.367722676,
- "Standard Deviation": 0.1071368221,
- "Rank": 32
- },
- "Geometry": {
- "Average Score": 0.367143758,
- "Standard Deviation": 0.04363680358,
- "Rank": 34
- },
- "Algebra": {
- "Average Score": 0.391480973,
- "Standard Deviation": 0.02757445266,
- "Rank": 34
- },
- "Probability": {
- "Average Score": 0.317616445,
- "Standard Deviation": 0.04300430361,
- "Rank": 36
- },
- "Logical": {
- "Average Score": 0.461607495,
- "Standard Deviation": 0.02185028842,
- "Rank": 23
- },
- "Social": {
- "Average Score": 0.336373622,
- "Standard Deviation": 0.05762408512,
- "Rank": 27
- },
- "Chemistry": {
- "Average Score": 45.35392139264795,
- "Standard Deviation": null,
- "Rank": 22
- }
- }
- },
- {
- "config": {
- "model_name": "gemma-2-2b-it",
- "organization": "Google",
- "license": "Gemma License",
- "knowledge_cutoff": "2024/07"
- },
- "results": {
- "OVERALL": {
- "Average Score": 0.502167612,
1617
- "Standard Deviation": 0.04389786763,
1618
- "Rank": 20
1619
- },
1620
- "Geometry": {
1621
- "Average Score": 0.395006676,
1622
- "Standard Deviation": 0.05882607713,
1623
- "Rank": 32
1624
- },
1625
- "Algebra": {
1626
- "Average Score": 0.379391887,
1627
- "Standard Deviation": 0.01722410785,
1628
- "Rank": 35
1629
- },
1630
- "Probability": {
1631
- "Average Score": 0.331231097,
1632
- "Standard Deviation": 0.05392499987,
1633
- "Rank": 35
1634
- },
1635
- "Logical": {
1636
- "Average Score": 0.367687789,
1637
- "Standard Deviation": 0.02547968808,
1638
- "Rank": 27
1639
- },
1640
- "Social": {
1641
- "Average Score": 0.393482094,
1642
- "Standard Deviation": 0.06450214024,
1643
- "Rank": 24
1644
- },
1645
- "Chemistry": {
1646
- "Average Score": 30.53406933106768,
1647
- "Standard Deviation": null,
1648
- "Rank": 36
1649
- }
1650
- }
1651
- },
1652
- {
1653
- "config": {
1654
- "model_name": "starling-lm-7b-alpha",
1655
- "organization": "Nexusflow",
1656
- "license": "Apache-2.0",
1657
- "knowledge_cutoff": "2023/11"
1658
- },
1659
- "results": {
1660
- "OVERALL": {
1661
- "Average Score": 0.366628765,
1662
- "Standard Deviation": 0.08405492929,
1663
- "Rank": 33
1664
- },
1665
- "Geometry": {
1666
- "Average Score": 0.336782578,
1667
- "Standard Deviation": 0.04069449132,
1668
- "Rank": 35
1669
- },
1670
- "Algebra": {
1671
- "Average Score": 0.371551932,
1672
- "Standard Deviation": 0.03367241745,
1673
- "Rank": 36
1674
- },
1675
- "Probability": {
1676
- "Average Score": 0.331472505,
1677
- "Standard Deviation": 0.04833324282,
1678
- "Rank": 34
1679
- },
1680
- "Logical": {
1681
- "Average Score": 0.260869624,
1682
- "Standard Deviation": 0.03562735237,
1683
- "Rank": 36
1684
- },
1685
- "Social": {
1686
- "Average Score": 0.271975534,
1687
- "Standard Deviation": 0.04266753408,
1688
- "Rank": 34
1689
- },
1690
- "Chemistry": {
1691
- "Average Score": 30.07926487356878,
1692
- "Standard Deviation": null,
1693
- "Rank": 37
1694
- }
1695
- }
1696
- },
1697
- {
1698
- "config": {
1699
- "model_name": "qwen1.5-4b-chat",
1700
- "organization": "Alibaba",
1701
- "license": "Qianwen LICENSE",
1702
- "knowledge_cutoff": "2024/02"
1703
- },
1704
- "results": {
1705
- "OVERALL": {
1706
- "Average Score": 0.111876411,
1707
- "Standard Deviation": 0.04241022785,
1708
- "Rank": 49
1709
- },
1710
- "Geometry": {
1711
- "Average Score": 0.215834522,
1712
- "Standard Deviation": 0.0363766363,
1713
- "Rank": 40
1714
- },
1715
- "Algebra": {
1716
- "Average Score": 0.305589811,
1717
- "Standard Deviation": 0.02354198912,
1718
- "Rank": 37
1719
- },
1720
- "Probability": {
1721
- "Average Score": 0.149365327,
1722
- "Standard Deviation": 0.03489672675,
1723
- "Rank": 44
1724
- },
1725
- "Logical": {
1726
- "Average Score": 0.116210168,
1727
- "Standard Deviation": 0.005927966496,
1728
- "Rank": 47
1729
- },
1730
- "Social": {
1731
- "Average Score": 0.18195615,
1732
- "Standard Deviation": 0.02269805277,
1733
- "Rank": 41
1734
- },
1735
- "Chemistry": {
1736
- "Average Score": 13.21208067122554,
1737
- "Standard Deviation": null,
1738
- "Rank": 47
1739
- }
1740
- }
1741
- },
1742
- {
1743
- "config": {
1744
- "model_name": "command-r-(04-2024)",
1745
- "organization": "Cohere",
1746
- "license": "CC-BY-NC-4.0",
1747
- "knowledge_cutoff": "2024/04"
1748
- },
1749
- "results": {
1750
- "OVERALL": {
1751
- "Average Score": 0.388783887,
1752
- "Standard Deviation": 0.07417186783,
1753
- "Rank": 30
1754
- },
1755
- "Geometry": {
1756
- "Average Score": 0.300416698,
1757
- "Standard Deviation": 0.03485612736,
1758
- "Rank": 37
1759
- },
1760
- "Algebra": {
1761
- "Average Score": 0.293120231,
1762
- "Standard Deviation": 0.032926484,
1763
- "Rank": 38
1764
- },
1765
- "Probability": {
1766
- "Average Score": 0.281271304,
1767
- "Standard Deviation": 0.05697149867,
1768
- "Rank": 39
1769
- },
1770
- "Logical": {
1771
- "Average Score": 0.276189906,
1772
- "Standard Deviation": 0.03562914754,
1773
- "Rank": 34
1774
- },
1775
- "Social": {
1776
- "Average Score": 0.283882949,
1777
- "Standard Deviation": 0.03336901148,
1778
- "Rank": 32
1779
- },
1780
- "Chemistry": {
1781
- "Average Score": 41.346336503003236,
1782
- "Standard Deviation": null,
1783
- "Rank": 28
1784
- }
1785
- }
1786
- },
1787
- {
1788
- "config": {
1789
- "model_name": "vicuna-33b",
1790
- "organization": "LMSYS",
1791
- "license": "Non-commercial",
1792
- "knowledge_cutoff": "2023/08"
1793
- },
1794
- "results": {
1795
- "OVERALL": {
1796
- "Average Score": 0.316543555,
1797
- "Standard Deviation": 0.08922095647,
1798
- "Rank": 39
1799
- },
1800
- "Geometry": {
1801
- "Average Score": 0.208284679,
1802
- "Standard Deviation": 0.03937771461,
1803
- "Rank": 41
1804
- },
1805
- "Algebra": {
1806
- "Average Score": 0.248994048,
1807
- "Standard Deviation": 0.02668175054,
1808
- "Rank": 40
1809
- },
1810
- "Probability": {
1811
- "Average Score": 0.222313995,
1812
- "Standard Deviation": 0.03978859759,
1813
- "Rank": 42
1814
- },
1815
- "Logical": {
1816
- "Average Score": 0.180291222,
1817
- "Standard Deviation": 0.021886267,
1818
- "Rank": 42
1819
- },
1820
- "Social": {
1821
- "Average Score": 0.257623798,
1822
- "Standard Deviation": 0.02653724437,
1823
- "Rank": 35
1824
- },
1825
- "Chemistry": {
1826
- "Average Score": 28.01838653090379,
1827
- "Standard Deviation": null,
1828
- "Rank": 38
1829
- }
1830
- }
1831
- },
1832
- {
1833
- "config": {
1834
- "model_name": "gemma-7b-it",
1835
- "organization": "Google",
1836
- "license": "Gemma License",
1837
- "knowledge_cutoff": "2024/02"
1838
- },
1839
- "results": {
1840
- "OVERALL": {
1841
- "Average Score": 0.285077558,
1842
- "Standard Deviation": 0.08871758453,
1843
- "Rank": 41
1844
- },
1845
- "Geometry": {
1846
- "Average Score": 0.244791417,
1847
- "Standard Deviation": 0.0289612078,
1848
- "Rank": 38
1849
- },
1850
- "Algebra": {
1851
- "Average Score": 0.250614794,
1852
- "Standard Deviation": 0.01991678295,
1853
- "Rank": 39
1854
- },
1855
- "Probability": {
1856
- "Average Score": 0.174313053,
1857
- "Standard Deviation": 0.03765424728,
1858
- "Rank": 43
1859
- },
1860
- "Logical": {
1861
- "Average Score": 0.197505536,
1862
- "Standard Deviation": 0.02050298885,
1863
- "Rank": 39
1864
- },
1865
- "Social": {
1866
- "Average Score": 0.202138025,
1867
- "Standard Deviation": 0.02098346639,
1868
- "Rank": 40
1869
- },
1870
- "Chemistry": {
1871
- "Average Score": 28.014658234926813,
1872
- "Standard Deviation": null,
1873
- "Rank": 39
1874
- }
1875
- }
1876
- },
1877
- {
1878
- "config": {
1879
- "model_name": "mistral-7b-instruct-2",
1880
- "organization": "Mistral",
1881
- "license": "Apache 2.0",
1882
- "knowledge_cutoff": "2023/12"
1883
- },
1884
- "results": {
1885
- "OVERALL": {
1886
- "Average Score": 0.427513868,
1887
- "Standard Deviation": 0.05553921135,
1888
- "Rank": 27
1889
- },
1890
- "Geometry": {
1891
- "Average Score": 0.216402626,
1892
- "Standard Deviation": 0.03338414918,
1893
- "Rank": 39
1894
- },
1895
- "Algebra": {
1896
- "Average Score": 0.233777838,
1897
- "Standard Deviation": 0.0155226054,
1898
- "Rank": 41
1899
- },
1900
- "Probability": {
1901
- "Average Score": 0.25118175,
1902
- "Standard Deviation": 0.04065514593,
1903
- "Rank": 40
1904
- },
1905
- "Logical": {
1906
- "Average Score": 0.224469136,
1907
- "Standard Deviation": 0.03404706752,
1908
- "Rank": 37
1909
- },
1910
- "Social": {
1911
- "Average Score": 0.209386782,
1912
- "Standard Deviation": 0.02738569921,
1913
- "Rank": 39
1914
- },
1915
- "Chemistry": {
1916
- "Average Score": 31.382959631870822,
1917
- "Standard Deviation": null,
1918
- "Rank": 35
1919
- }
1920
- }
1921
- },
1922
- {
1923
- "config": {
1924
- "model_name": "mistral-7b-instruct-1",
1925
- "organization": "Mistral",
1926
- "license": "Apache 2.0",
1927
- "knowledge_cutoff": "2023/12"
1928
- },
1929
- "results": {
1930
- "OVERALL": {
1931
- "Average Score": 0.23016314,
1932
- "Standard Deviation": 0.07137625271,
1933
- "Rank": 46
1934
- },
1935
- "Geometry": {
1936
- "Average Score": 0.161799938,
1937
- "Standard Deviation": 0.03595278559,
1938
- "Rank": 45
1939
- },
1940
- "Algebra": {
1941
- "Average Score": 0.210341624,
1942
- "Standard Deviation": 0.01736539119,
1943
- "Rank": 42
1944
- },
1945
- "Probability": {
1946
- "Average Score": 0.238417922,
1947
- "Standard Deviation": 0.03744211933,
1948
- "Rank": 41
1949
- },
1950
- "Logical": {
1951
- "Average Score": 0.142636601,
1952
- "Standard Deviation": 0.02080406365,
1953
- "Rank": 46
1954
- },
1955
- "Social": {
1956
- "Average Score": 0.117646827,
1957
- "Standard Deviation": 0.009321202779,
1958
- "Rank": 48
1959
- },
1960
- "Chemistry": {
1961
- "Average Score": 18.929093202755805,
1962
- "Standard Deviation": null,
1963
- "Rank": 42
1964
- }
1965
- }
1966
- },
1967
- {
1968
- "config": {
1969
- "model_name": "vicuna-13b",
1970
- "organization": "LMSYS",
1971
- "license": "Non-commercial",
1972
- "knowledge_cutoff": "2023/07"
1973
- },
1974
- "results": {
1975
- "OVERALL": {
1976
- "Average Score": 0.201892849,
1977
- "Standard Deviation": 0.06021749802,
1978
- "Rank": 47
1979
- },
1980
- "Geometry": {
1981
- "Average Score": 0.200941928,
1982
- "Standard Deviation": 0.03366817781,
1983
- "Rank": 42
1984
- },
1985
- "Algebra": {
1986
- "Average Score": 0.196123323,
1987
- "Standard Deviation": 0.0135715643,
1988
- "Rank": 43
1989
- },
1990
- "Probability": {
1991
- "Average Score": 0.141214079,
1992
- "Standard Deviation": 0.02721328211,
1993
- "Rank": 45
1994
- },
1995
- "Logical": {
1996
- "Average Score": 0.148598631,
1997
- "Standard Deviation": 0.02241523892,
1998
- "Rank": 44
1999
- },
2000
- "Social": {
2001
- "Average Score": 0.124655135,
2002
- "Standard Deviation": 0.01122382671,
2003
- "Rank": 47
2004
- },
2005
- "Chemistry": {
2006
- "Average Score": 21.840013221590294,
2007
- "Standard Deviation": null,
2008
- "Rank": 40
2009
- }
2010
- }
2011
- },
2012
- {
2013
- "config": {
2014
- "model_name": "zephyr-7b-beta",
2015
- "organization": "HuggingFace",
2016
- "license": "MIT",
2017
- "knowledge_cutoff": "2023/10"
2018
- },
2019
- "results": {
2020
- "OVERALL": {
2021
- "Average Score": 0.102705119,
2022
- "Standard Deviation": 0.03683757312,
2023
- "Rank": 50
2024
- },
2025
- "Geometry": {
2026
- "Average Score": 0.114005544,
2027
- "Standard Deviation": 0.03144354365,
2028
- "Rank": 46
2029
- },
2030
- "Algebra": {
2031
- "Average Score": 0.141766633,
2032
- "Standard Deviation": 0.03179520129,
2033
- "Rank": 44
2034
- },
2035
- "Probability": {
2036
- "Average Score": 0.089050714,
2037
- "Standard Deviation": 0.002136754266,
2038
- "Rank": 48
2039
- },
2040
- "Logical": {
2041
- "Average Score": 0.069520789,
2042
- "Standard Deviation": 0.004477840857,
2043
- "Rank": 51
2044
- },
2045
- "Social": {
2046
- "Average Score": 0.0,
2047
- "Standard Deviation": 0.0,
2048
- "Rank": 53
2049
- },
2050
- "Chemistry": {
2051
- "Average Score": 18.92902220864132,
2052
- "Standard Deviation": null,
2053
- "Rank": 43
2054
- }
2055
- }
2056
- },
2057
- {
2058
- "config": {
2059
- "model_name": "gemma-1.1-2b-it",
2060
- "organization": "Google",
2061
- "license": "Gemma License",
2062
- "knowledge_cutoff": "2024/02"
2063
- },
2064
- "results": {
2065
- "OVERALL": {
2066
- "Average Score": 0.257700845,
2067
- "Standard Deviation": 0.07369021445,
2068
- "Rank": 44
2069
- },
2070
- "Geometry": {
2071
- "Average Score": 0.183974034,
2072
- "Standard Deviation": 0.0215548886,
2073
- "Rank": 44
2074
- },
2075
- "Algebra": {
2076
- "Average Score": 0.13422252,
2077
- "Standard Deviation": 0.01922819511,
2078
- "Rank": 45
2079
- },
2080
- "Probability": {
2081
- "Average Score": 0.095628657,
2082
- "Standard Deviation": 0.007536076456,
2083
- "Rank": 47
2084
- },
2085
- "Logical": {
2086
- "Average Score": 0.094965074,
2087
- "Standard Deviation": 0.005019175487,
2088
- "Rank": 49
2089
- },
2090
- "Social": {
2091
- "Average Score": 0.167796727,
2092
- "Standard Deviation": 0.01666541942,
2093
- "Rank": 43
2094
- },
2095
- "Chemistry": {
2096
- "Average Score": 20.724691953843916,
2097
- "Standard Deviation": null,
2098
- "Rank": 41
2099
- }
2100
- }
2101
- },
2102
- {
2103
- "config": {
2104
- "model_name": "llama2-7b-chat",
2105
- "organization": "Meta",
2106
- "license": "Llama 2 Community",
2107
- "knowledge_cutoff": "2023/07"
2108
- },
2109
- "results": {
2110
- "OVERALL": {
2111
- "Average Score": 0.260189428,
2112
- "Standard Deviation": 0.08019299364,
2113
- "Rank": 43
2114
- },
2115
- "Geometry": {
2116
- "Average Score": 0.087067276,
2117
- "Standard Deviation": 0.04274343402,
2118
- "Rank": 47
2119
- },
2120
- "Algebra": {
2121
- "Average Score": 0.12308805,
2122
- "Standard Deviation": 0.01856053622,
2123
- "Rank": 46
2124
- },
2125
- "Probability": {
2126
- "Average Score": 0.087515438,
2127
- "Standard Deviation": 0.006315053573,
2128
- "Rank": 49
2129
- },
2130
- "Logical": {
2131
- "Average Score": 0.17312827,
2132
- "Standard Deviation": 0.01867044092,
2133
- "Rank": 43
2134
- },
2135
- "Social": {
2136
- "Average Score": 0.152905272,
2137
- "Standard Deviation": 0.007166957097,
2138
- "Rank": 44
2139
- },
2140
- "Chemistry": {
2141
- "Average Score": 15.730513733660898,
2142
- "Standard Deviation": null,
2143
- "Rank": 45
2144
- }
2145
- }
2146
- },
2147
- {
2148
- "config": {
2149
- "model_name": "gemma-2b-it",
2150
- "organization": "Google",
2151
- "license": "Gemma License",
2152
- "knowledge_cutoff": "2024/02"
2153
- },
2154
- "results": {
2155
- "OVERALL": {
2156
- "Average Score": 0.234172069,
2157
- "Standard Deviation": 0.06522685718,
2158
- "Rank": 45
2159
- },
2160
- "Geometry": {
2161
- "Average Score": 0.198571153,
2162
- "Standard Deviation": 0.01699161031,
2163
- "Rank": 43
2164
- },
2165
- "Algebra": {
2166
- "Average Score": 0.109883009,
2167
- "Standard Deviation": 0.01520005833,
2168
- "Rank": 47
2169
- },
2170
- "Probability": {
2171
- "Average Score": 0.06467432,
2172
- "Standard Deviation": 0.002117497231,
2173
- "Rank": 51
2174
- },
2175
- "Logical": {
2176
- "Average Score": 0.039624492,
2177
- "Standard Deviation": 0.007606972686,
2178
- "Rank": 52
2179
- },
2180
- "Social": {
2181
- "Average Score": 0.087452913,
2182
- "Standard Deviation": 0.008170146562,
2183
- "Rank": 51
2184
- },
2185
- "Chemistry": {
2186
- "Average Score": 17.2715657115764,
2187
- "Standard Deviation": null,
2188
- "Rank": 44
2189
- }
2190
- }
2191
- },
2192
- {
2193
- "config": {
2194
- "model_name": "llama2-13b-chat",
2195
- "organization": "Meta",
2196
- "license": "Llama 2 Community",
2197
- "knowledge_cutoff": "2023/07"
2198
- },
2199
- "results": {
2200
- "OVERALL": {
2201
- "Average Score": 0.263305684,
2202
- "Standard Deviation": 0.07283640689,
2203
- "Rank": 42
2204
- },
2205
- "Geometry": {
2206
- "Average Score": 0.072729954,
2207
- "Standard Deviation": 0.02315988261,
2208
- "Rank": 49
2209
- },
2210
- "Algebra": {
2211
- "Average Score": 0.080371692,
2212
- "Standard Deviation": 0.01277569453,
2213
- "Rank": 48
2214
- },
2215
- "Probability": {
2216
- "Average Score": 0.117757344,
2217
- "Standard Deviation": 0.02418619619,
2218
- "Rank": 46
2219
- },
2220
- "Logical": {
2221
- "Average Score": 0.193149889,
2222
- "Standard Deviation": 0.01776690764,
2223
- "Rank": 40
2224
- },
2225
- "Social": {
2226
- "Average Score": 0.149125922,
2227
- "Standard Deviation": 0.01157416827,
2228
- "Rank": 45
2229
- },
2230
- "Chemistry": {
2231
- "Average Score": 13.17258252933903,
2232
- "Standard Deviation": null,
2233
- "Rank": 48
2234
- }
2235
- }
2236
- },
2237
- {
2238
- "config": {
2239
- "model_name": "vicuna-7b",
2240
- "organization": "LMSYS",
2241
- "license": "Non-commercial",
2242
- "knowledge_cutoff": "2023/07"
2243
- },
2244
- "results": {
2245
- "OVERALL": {
2246
- "Average Score": 0.198839786,
2247
- "Standard Deviation": 0.05725381576,
2248
- "Rank": 48
2249
- },
2250
- "Geometry": {
2251
- "Average Score": 0.083457058,
2252
- "Standard Deviation": 0.02520989111,
2253
- "Rank": 48
2254
- },
2255
- "Algebra": {
2256
- "Average Score": 0.070883882,
2257
- "Standard Deviation": 0.007315853253,
2258
- "Rank": 49
2259
- },
2260
- "Probability": {
2261
- "Average Score": 0.080987673,
2262
- "Standard Deviation": 0.005474288861,
2263
- "Rank": 50
2264
- },
2265
- "Logical": {
2266
- "Average Score": 0.100065588,
2267
- "Standard Deviation": 0.003561886452,
2268
- "Rank": 48
2269
- },
2270
- "Social": {
2271
- "Average Score": 0.111076414,
2272
- "Standard Deviation": 0.004805626512,
2273
- "Rank": 49
2274
- },
2275
- "Chemistry": {
2276
- "Average Score": 14.255194156624162,
2277
- "Standard Deviation": null,
2278
- "Rank": 46
2279
- }
2280
- }
2281
- },
2282
- {
2283
- "config": {
2284
- "model_name": "koala-13b",
2285
- "organization": "UC Berkeley",
2286
- "license": "Non-commercial",
2287
- "knowledge_cutoff": "2023/04"
2288
- },
2289
- "results": {
2290
- "OVERALL": {
2291
- "Average Score": 0.09387188,
2292
- "Standard Deviation": 0.02642167489,
2293
- "Rank": 51
2294
- },
2295
- "Geometry": {
2296
- "Average Score": 0.017374001,
2297
- "Standard Deviation": 0.01747053557,
2298
- "Rank": 50
2299
- },
2300
- "Algebra": {
2301
- "Average Score": 0.018129197,
2302
- "Standard Deviation": 0.01054371383,
2303
- "Rank": 50
2304
- },
2305
- "Probability": {
2306
- "Average Score": 0.043654362,
2307
- "Standard Deviation": 0.004288231886,
2308
- "Rank": 52
2309
- },
2310
- "Logical": {
2311
- "Average Score": 0.074694053,
2312
- "Standard Deviation": 0.002674646998,
2313
- "Rank": 50
2314
- },
2315
- "Social": {
2316
- "Average Score": 0.096983835,
2317
- "Standard Deviation": 0.007847059783,
2318
- "Rank": 50
2319
- },
2320
- "Chemistry": {
2321
- "Average Score": 6.36433272373514,
2322
- "Standard Deviation": null,
2323
- "Rank": 49
2324
- }
2325
- }
2326
- },
2327
- {
2328
- "config": {
2329
- "model_name": "openassistant-pythia-12b",
2330
- "organization": "OpenAssistant",
2331
- "license": "Non-commercial",
2332
- "knowledge_cutoff": "2023/04"
2333
- },
2334
- "results": {
2335
- "OVERALL": {
2336
- "Average Score": 0.0,
2337
- "Standard Deviation": 0.0,
2338
- "Rank": 52
2339
- },
2340
- "Geometry": {
2341
- "Average Score": 0.0,
2342
- "Standard Deviation": 0.0,
2343
- "Rank": 51
2344
- },
2345
- "Algebra": {
2346
- "Average Score": 0.0,
2347
- "Standard Deviation": 0.0,
2348
- "Rank": 51
2349
- },
2350
- "Probability": {
2351
- "Average Score": 0.0,
2352
- "Standard Deviation": 0.0,
2353
- "Rank": 53
2354
- },
2355
- "Logical": {
2356
- "Average Score": 0.0,
2357
- "Standard Deviation": 0.0,
2358
- "Rank": 53
2359
- },
2360
- "Social": {
2361
- "Average Score": 0.030792528,
2362
- "Standard Deviation": 0.007518796391,
2363
- "Rank": 52
2364
- },
2365
- "Chemistry": {
2366
- "Average Score": 0.0,
2367
- "Standard Deviation": null,
2368
- "Rank": 50
2369
- }
2370
- }
2371
- }
2372
- ]
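
Note: each deleted entry pairs a "config" block (model metadata: `model_name`, `organization`, `license`, `knowledge_cutoff`) with a "results" block keyed by evaluation dimension ("OVERALL", "Geometry", "Algebra", "Probability", "Logical", "Social", "Chemistry"), each dimension holding an "Average Score", a "Standard Deviation" (null where only a point estimate exists, as in the "Chemistry" rows), and a "Rank". Below is a minimal loading sketch under that schema; the helper name `load_model_results` is illustrative only and is not the app's actual loader (`get_model_leaderboard_df`):

```python
import json

def load_model_results(path):
    """Flatten results-file entries into one dict per model.

    Assumes a JSON array of {"config": {...}, "results": {...}} objects,
    where each dimension maps to {"Average Score", "Standard Deviation",
    "Rank"}. "Standard Deviation" may be null (None after parsing).
    """
    with open(path) as f:
        entries = json.load(f)

    rows = []
    for entry in entries:
        row = {
            "model": entry["config"]["model_name"],
            "organization": entry["config"]["organization"],
            "license": entry["config"]["license"],
        }
        for dim, stats in entry["results"].items():
            key = dim.lower()  # e.g. "OVERALL" -> "overall"
            row[f"score_{key}"] = stats["Average Score"]
            row[f"rank_{key}"] = stats["Rank"]
        rows.append(row)

    # Sort by overall rank so the best-ranked model comes first.
    return sorted(rows, key=lambda r: r.get("rank_overall", float("inf")))
```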