rusticluftig committed
Commit 838067a · 1 Parent(s): 9b87de8

Update leaderboard for multi evals

Files changed (2):
  1. app.py +13 -10
  2. utils.py +3 -11
app.py CHANGED
@@ -79,11 +79,7 @@ def main():
     gr.HTML(EVALUATION_HEADER)
     show_stale = gr.Checkbox(label="Show Stale", interactive=True)
     competition_leaderboards = []
-    comp_2 = competitions.COMPETITION_DETAILS[2]
-    # Covert the losses into % of correct answers.
-    losses_2["losses"] = losses_2["losses"].apply(
-        lambda x: 1 - x if x else None
-    )
+    comp_2 = competitions.COMPETITION_DETAILS[2]
     with gr.Accordion(f"{comp_2.name} Competition"):
         gr.HTML(comp_2.html_description)
         competition_leaderboards.append(
@@ -94,7 +90,7 @@ def main():
             headers=[
                 "Name",
                 "Win Rate",
-                "MC Score",
+                "Score",
                 "Weight",
                 "UID",
                 "Block",
@@ -117,18 +113,25 @@ def main():
         x="timestamp",
         x_title="Date",
         y="losses",
-        y_title="MC Score",
+        y_title="Score",
         interactive=True,
         visible=True,
         width=1024,
-        title="Best MC Score Over Time",
+        title="Best Score Over Time",
+    )
+    gr.HTML(
+        """
+        The definition of score changes over time as new evaluation tasks are added in releases.
+        <ul>
+        <li><b>Start-Oct 27</b>: % of wrong answers on synthetic MMLU</li>
+        <li><b>Oct 27-Now</b>: + word sorting eval</li>
+        """
     )
     gr.HTML(
         """
    <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
    <li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
-   <li><b>Average Loss:</b> the last loss value on the evaluation data for the model as calculated by the OTF validator (lower is better)</li>
-   <li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
+   <li><b>Score:</b> the combined model score as calculated by the OTF validator (lower is better)</li>
    <li><b>UID:</b> the Bittensor UID of the miner</li>
    <li><b>Weight:</b> the bittensor weight set for this model</li>
    <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
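The net effect in app.py: the UI no longer converts losses into a "% correct" MC score; the combined (lower-is-better) score is plotted and tabulated directly, and its evolving definition is documented in the new HTML block. Below is a minimal sketch of the renamed plot, assuming a hypothetical `score_data` DataFrame in place of the one the real app builds from validator runs (the diff only shows that it exposes `timestamp` and `losses` columns):

```python
import gradio as gr
import pandas as pd

# Hypothetical stand-in for the DataFrame the real app derives from validator runs.
score_data = pd.DataFrame(
    {
        "timestamp": pd.to_datetime(["2024-10-20", "2024-10-27", "2024-11-03"]),
        "losses": [0.42, 0.38, 0.35],  # combined score; lower is better
    }
)

with gr.Blocks() as demo:
    # Same plot arguments as the diff, with the labels renamed to "Score".
    gr.LinePlot(
        score_data,
        x="timestamp",
        x_title="Date",
        y="losses",
        y_title="Score",
        title="Best Score Over Time",
        interactive=True,
        visible=True,
        width=1024,
    )

# demo.launch()  # uncomment to serve the sketch locally
```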
utils.py CHANGED
@@ -184,13 +184,6 @@ def get_scores(
        uids (List[int]): List of UIDs to get scores for.
        wandb_runs (List): List of validator runs from Wandb. Requires the runs are provided in descending order.
    """
-
-    def _maybe_convert_loss(loss: float, comp_id: int) -> float:
-        """Converts loss to score for competitions that require it."""
-        if comp_id == 2:
-            return 1 - loss if loss else None
-        return loss
-
    result = {}
    previous_timestamp = None
    seen_competitions = set()
@@ -216,9 +209,7 @@ def get_scores(
            # Only the most recent run per competition is fresh.
            is_fresh = comp_id not in seen_competitions
            result[uid] = {
-                "avg_loss": _maybe_convert_loss(
-                    uid_data.get("average_loss", None), comp_id
-                ),
+                "avg_loss": uid_data.get("average_loss", None),
                "win_rate": uid_data.get("win_rate", None),
                "win_total": uid_data.get("win_total", None),
                "weight": uid_data.get("weight", None),
@@ -283,7 +274,8 @@ def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
            if c_id is None or c_id != competition_id:
                continue

-            if loss < best_loss:
+            # Filter out issue caused by wandb unavailability.
+            if loss < 0.99 and loss < best_loss:
                best_loss = loss
                should_add_datapoint = True
        # Now that we've processed the run's most recent steps, check if we should add a datapoint.
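The behavioral change in utils.py is twofold: `get_scores` now reports the raw `average_loss` without the old `1 - loss` conversion, and `get_losses_over_time` ignores losses at or above 0.99, which the added comment attributes to placeholder values logged when wandb is unavailable. Here is a self-contained sketch of that filter, using a hypothetical `best_datapoints` helper (not part of the repo) to mirror the `loss < 0.99 and loss < best_loss` guard:

```python
import math
from typing import Iterable, List, Tuple


def best_datapoints(points: Iterable[Tuple[str, float]]) -> List[Tuple[str, float]]:
    """Return the running-best (timestamp, loss) pairs, skipping placeholder losses."""
    best_loss = math.inf
    kept = []
    for timestamp, loss in points:
        # Skip ~1.0 placeholder losses (wandb unavailability) and only record
        # genuine improvements over the best loss seen so far.
        if loss < 0.99 and loss < best_loss:
            best_loss = loss
            kept.append((timestamp, loss))
    return kept


# Placeholder 1.0 entries are ignored; only real improvements become datapoints.
print(best_datapoints([("t1", 1.0), ("t2", 0.45), ("t3", 1.0), ("t4", 0.41)]))
# [('t2', 0.45), ('t4', 0.41)]
```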