Hynek Kydlicek committed on
Commit
acc5a5f
•
1 Parent(s): 2b8570b

new models

Browse files
Files changed (4) hide show
  1. app.py +58 -19
  2. leaderboard/klokan.csv +11 -4
  3. leaderboard/table.csv +11 -4
  4. leaderboard/tsp.csv +11 -4
app.py CHANGED
@@ -12,6 +12,23 @@ import plotly.graph_objects as go
12
  import pandas as pd
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def make_default_md():
16
 
17
  leaderboard_md = f"""
@@ -34,7 +51,7 @@ def make_arena_leaderboard_md(arena_df):
34
  total_models = len(arena_df)
35
 
36
  leaderboard_md = f"""
37
- Total #models: **{total_models}**. Last updated: Feb 15, 2024.
38
  """
39
  return leaderboard_md
40
 
@@ -59,16 +76,30 @@ def plot_spider(df, title):
59
  categories[0],
60
  ] # Ensure the graph is circular by appending the start to the end
61
  colors = [
62
- "#1f77b4", # muted blue
63
- "#ff7f0e", # safety orange
64
- "#2ca02c", # cooked asparagus green
65
- "#d62728", # brick red
66
- "#9467bd", # muted purple
67
- "#8c564b", # chestnut brown
68
- "#e377c2", # raspberry yogurt pink
69
- "#7f7f7f", # middle gray
70
- "#bcbd22", # curry yellow-green
71
- "#17becf", # blue-teal
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  ]
73
 
74
  # Setting for 1000x1000
@@ -95,7 +126,7 @@ def plot_spider(df, title):
95
 
96
  fig_1000.update_layout(
97
  width=600,
98
- height=628,
99
  polar=dict(
100
  angularaxis=dict(
101
  gridwidth=2, # Increase line width for better visibility
@@ -143,14 +174,23 @@ def get_full_table(model_table_df):
143
  model_table_df.sort_values(by="average", ascending=False, inplace=True)
144
  model_table_df.insert(0, "rank", np.arange(1, len(model_table_df) + 1))
145
 
 
 
 
 
 
146
  # Add link
147
  model_table_df["model_name"] = model_table_df["model_name"].apply(
148
  lambda x: openrouter_hyperlink(x)
149
  )
150
 
 
 
 
151
  model_table_df.rename(
152
  columns={
153
  "model_name": "🤖 Model",
 
154
  "klokan": "🧮 Klokan-QA",
155
  "culture": "🌍 TSP-Culture",
156
  "analytical": "🔍 TSP-Analytical",
@@ -161,6 +201,7 @@ def get_full_table(model_table_df):
161
  inplace=True,
162
  )
163
 
 
164
  return model_table_df
165
 
166
 
@@ -195,17 +236,15 @@ def build_leaderboard_tab(leaderboard_table_file, klokan_table_file, tsp_table_f
195
  elem_id="arena_leaderboard_dataframe",
196
  height=700,
197
  column_widths=[
198
- 50,
199
  200,
 
 
 
 
200
  120,
201
  100,
202
  100,
203
- 150,
204
- 150,
205
- 100,
206
- 150,
207
- 150,
208
- 150,
209
  ],
210
  wrap=True,
211
  )
 
12
  import pandas as pd
13
 
14
 
15
+
16
+ MODEL_NAME_COST = {
17
+ "anthropic/claude-2.1": 8,
18
+ "anthropic/claude-3-haiku": 0.25,
19
+ "anthropic/claude-3-opus": 15,
20
+ "anthropic/claude-3-sonnet": 3,
21
+ "cohere/command-r": 0.5,
22
+ "google/gemini-pro": 0.12,
23
+ "google/gemma-7b-it": 0.1,
24
+ "mistralai/mistral-large": 8,
25
+ "mistralai/mistral-medium": 2.7,
26
+ "mistralai/mixtral-8x7b-instruct": 0.7,
27
+ "openai/gpt-3.5-turbo": 0.5,
28
+ "openai/gpt-4-1106-preview": 10,
29
+ }
30
+
31
+
32
  def make_default_md():
33
 
34
  leaderboard_md = f"""
 
51
  total_models = len(arena_df)
52
 
53
  leaderboard_md = f"""
54
+ Total #models: **{total_models}**. Last updated: Mar 17, 2024.
55
  """
56
  return leaderboard_md
57
 
 
76
  categories[0],
77
  ] # Ensure the graph is circular by appending the start to the end
78
  colors = [
79
+ '#1f77b4', # muted blue
80
+ '#ff7f0e', # safety orange
81
+ '#2ca02c', # cooked asparagus green
82
+ '#d62728', # brick red
83
+ '#9467bd', # muted purple
84
+ '#8c564b', # chestnut brown
85
+ '#e377c2', # raspberry yogurt pink
86
+ '#7f7f7f', # middle gray
87
+ '#bcbd22', # curry yellow-green
88
+ '#17becf', # blue-teal
89
+ '#f7b6d2', # pastel pink
90
+ '#bcbd22', # faded green
91
+ '#dbdb8d', # light olive
92
+ '#17becf', # soft blue
93
+ '#9edae5', # light blue
94
+ '#c5b0d5', # soft purple
95
+ '#c49c94', # dusty rose
96
+ '#f7b6d2', # pastel pink
97
+ '#bcbd22', # faded green
98
+ '#dbdb8d', # light olive
99
+ '#17becf', # soft blue
100
+ '#9edae5', # light blue
101
+ '#c5b0d5', # soft purple
102
+ '#c49c94', # dusty rose
103
  ]
104
 
105
  # Setting for 1000x1000
 
126
 
127
  fig_1000.update_layout(
128
  width=600,
129
+ height=950,
130
  polar=dict(
131
  angularaxis=dict(
132
  gridwidth=2, # Increase line width for better visibility
 
174
  model_table_df.sort_values(by="average", ascending=False, inplace=True)
175
  model_table_df.insert(0, "rank", np.arange(1, len(model_table_df) + 1))
176
 
177
+ # Add cost
178
+ model_table_df["completion_price"] = model_table_df["model_name"].apply(
179
+ lambda x: f"{MODEL_NAME_COST.get(x, "N/A")}$"
180
+ )
181
+
182
  # Add link
183
  model_table_df["model_name"] = model_table_df["model_name"].apply(
184
  lambda x: openrouter_hyperlink(x)
185
  )
186
 
187
+ # Ensure the dataframe is in the correct order before renaming
188
+ model_table_df = model_table_df[["rank", "model_name", "completion_price", "klokan", "culture", "analytical", "critical", "verbal", "average"]]
189
+
190
  model_table_df.rename(
191
  columns={
192
  "model_name": "๐Ÿค– Model",
193
+ "completion_price": "๐Ÿ’ฐ Cost (1M-Tokens)",
194
  "klokan": "๐Ÿงฎ Klokan-QA",
195
  "culture": "๐ŸŒ TSP-Culture",
196
  "analytical": "๐Ÿ” TSP-Analytical",
 
201
  inplace=True,
202
  )
203
 
204
+
205
  return model_table_df
206
 
207
 
 
236
  elem_id="arena_leaderboard_dataframe",
237
  height=700,
238
  column_widths=[
239
+ 70,
240
  200,
241
+ 110,
242
+ 120,
243
+ 120,
244
+ 120,
245
  120,
246
  100,
247
  100,
 
 
 
 
 
 
248
  ],
249
  wrap=True,
250
  )
leaderboard/klokan.csv CHANGED
@@ -1,6 +1,13 @@
1
  ,Elementary 2-3,Elementary 4-5,Elementary 6-7,Elementary 8-9,High School 1-2,High School 3-4
2
  anthropic/claude-2.1,43.96551724137931,50.35971223021583,39.87730061349693,39.75155279503105,33.33333333333333,14.772727272727273
3
- google/gemini-pro,25.0,28.05755395683453,22.699386503067483,20.496894409937887,24.691358024691358,19.318181818181817
4
- mistralai/mixtral-8x7b-instruct,34.48275862068966,25.899280575539567,25.766871165644172,25.465838509316768,20.98765432098765,19.318181818181817
5
- openai/gpt-3.5-turbo,37.06896551724138,41.007194244604314,33.74233128834356,29.81366459627329,26.543209876543212,17.045454545454543
6
- openai/gpt-4-1106-preview,66.37931034482759,62.589928057553955,50.306748466257666,40.993788819875775,32.71604938271605,36.36363636363637
 
 
 
 
 
 
 
 
1
  ,Elementary 2-3,Elementary 4-5,Elementary 6-7,Elementary 8-9,High School 1-2,High School 3-4
2
  anthropic/claude-2.1,43.96551724137931,50.35971223021583,39.87730061349693,39.75155279503105,33.33333333333333,14.772727272727273
3
+ anthropic/claude-3-haiku,37.5,38.405797101449274,27.160493827160494,31.25,32.70440251572327,25.609756097560975
4
+ anthropic/claude-3-opus,78.57142857142857,65.94202898550725,61.111111111111114,50.0,45.28301886792453,48.78048780487805
5
+ anthropic/claude-3-sonnet,59.82142857142857,60.86956521739131,38.2716049382716,45.0,33.9622641509434,25.609756097560975
6
+ cohere/command-r,24.107142857142858,28.26086956521739,29.01234567901235,21.875,24.528301886792452,18.29268292682927
7
+ google/gemini-pro,33.92857142857143,28.985507246376812,26.543209876543212,28.125,30.18867924528302,23.170731707317074
8
+ google/gemma-7b-it,16.964285714285715,15.942028985507244,16.666666666666664,12.5,20.125786163522015,21.951219512195124
9
+ mistralai/mistral-large,51.78571428571429,53.62318840579711,41.358024691358025,37.5,35.22012578616352,23.170731707317074
10
+ mistralai/mistral-medium,39.285714285714285,34.78260869565217,28.39506172839506,24.375,27.67295597484277,21.951219512195124
11
+ mistralai/mixtral-8x7b-instruct,30.357142857142854,34.05797101449276,28.39506172839506,22.5,22.0125786163522,23.170731707317074
12
+ openai/gpt-3.5-turbo,40.17857142857143,39.85507246376812,33.95061728395062,31.874999999999996,26.41509433962264,19.51219512195122
13
+ openai/gpt-4-1106-preview,71.42857142857143,69.56521739130434,59.876543209876544,52.5,50.314465408805034,42.68292682926829
leaderboard/table.csv CHANGED
@@ -1,6 +1,13 @@
1
  model_name,analytical,critical,culture,verbal,klokan
2
  anthropic/claude-2.1,0.3804034582132565,0.6449912126537786,0.7981770833333334,0.6336336336336337,0.3823884197828709
3
- google/gemini-pro,0.2680115273775216,0.5992970123022847,0.7825520833333334,0.5765765765765766,0.23522316043425814
4
- mistralai/mixtral-8x7b-instruct,0.24495677233429394,0.4833040421792619,0.6432291666666666,0.36936936936936937,0.25331724969843183
5
- openai/gpt-3.5-turbo,0.27761767531219983,0.46572934973637964,0.6822916666666666,0.4084084084084084,0.3148371531966224
6
- openai/gpt-4-1106-preview,0.4793467819404419,0.7662565905096661,0.9166666666666666,0.7207207207207207,0.47889022919179736
 
 
 
 
 
 
 
 
1
  model_name,analytical,critical,culture,verbal,klokan
2
  anthropic/claude-2.1,0.3804034582132565,0.6449912126537786,0.7981770833333334,0.6336336336336337,0.3823884197828709
3
+ anthropic/claude-3-haiku,0.3323727185398655,0.6045694200351494,0.81640625,0.6246246246246246,0.32226322263222634
4
+ anthropic/claude-3-opus,0.47262247838616717,0.7644991212653779,0.9244791666666666,0.8018018018018018,0.5781057810578106
5
+ anthropic/claude-3-sonnet,0.37848222862632086,0.6889279437609842,0.8346354166666666,0.6126126126126126,0.44280442804428044
6
+ cohere/command-r,0.27857829010566765,0.5342706502636204,0.7044270833333334,0.4444444444444444,0.24846248462484624
7
+ google/gemini-pro,0.28914505283381364,0.6098418277680141,0.8072916666666666,0.6096096096096096,0.2865928659286593
8
+ google/gemma-7b-it,0.2219020172910663,0.27943760984182775,0.22916666666666666,0.22822822822822822,0.16974169741697417
9
+ mistralai/mistral-large,0.3852065321805956,0.6678383128295254,0.859375,0.6276276276276276,0.4108241082410824
10
+ mistralai/mistral-medium,0.3121998078770413,0.5957820738137083,0.7734375,0.5045045045045045,0.2939729397293973
11
+ mistralai/mixtral-8x7b-instruct,0.2526416906820365,0.5114235500878734,0.7122395833333334,0.43543543543543545,0.26691266912669126
12
+ openai/gpt-3.5-turbo,0.3045148895292987,0.4991212653778559,0.7213541666666666,0.44744744744744747,0.3247232472324723
13
+ openai/gpt-4-1106-preview,0.515850144092219,0.7065026362038664,0.90234375,0.7267267267267268,0.5805658056580566
leaderboard/tsp.csv CHANGED
@@ -1,6 +1,13 @@
1
  ,Analytical,Critical,Cultural,Verbal
2
  anthropic/claude-2.1,38.04034582132565,64.49912126537785,79.81770833333334,63.36336336336337
3
- google/gemini-pro,26.801152737752158,59.929701230228474,78.25520833333334,57.65765765765766
4
- mistralai/mixtral-8x7b-instruct,24.495677233429394,48.33040421792619,64.32291666666666,36.93693693693694
5
- openai/gpt-3.5-turbo,27.761767531219984,46.57293497363796,68.22916666666666,40.84084084084084
6
- openai/gpt-4-1106-preview,47.93467819404419,76.6256590509666,91.66666666666666,72.07207207207207
 
 
 
 
 
 
 
 
1
  ,Analytical,Critical,Cultural,Verbal
2
  anthropic/claude-2.1,38.04034582132565,64.49912126537785,79.81770833333334,63.36336336336337
3
+ anthropic/claude-3-haiku,33.23727185398655,60.45694200351493,81.640625,62.46246246246246
4
+ anthropic/claude-3-opus,47.262247838616716,76.44991212653778,92.44791666666666,80.18018018018019
5
+ anthropic/claude-3-sonnet,37.848222862632085,68.89279437609842,83.46354166666666,61.261261261261254
6
+ cohere/command-r,27.857829010566764,53.427065026362044,70.44270833333334,44.44444444444444
7
+ google/gemini-pro,28.914505283381363,60.98418277680141,80.72916666666666,60.96096096096096
8
+ google/gemma-7b-it,22.19020172910663,27.943760984182774,22.916666666666664,22.822822822822822
9
+ mistralai/mistral-large,38.52065321805956,66.78383128295255,85.9375,62.76276276276276
10
+ mistralai/mistral-medium,31.21998078770413,59.57820738137083,77.34375,50.45045045045045
11
+ mistralai/mixtral-8x7b-instruct,25.26416906820365,51.14235500878734,71.22395833333334,43.54354354354354
12
+ openai/gpt-3.5-turbo,30.45148895292987,49.91212653778559,72.13541666666666,44.74474474474475
13
+ openai/gpt-4-1106-preview,51.5850144092219,70.65026362038664,90.234375,72.67267267267268