Commit 838067a
rusticluftig committed · 1 Parent(s): 9b87de8

Update leaderboard for multi evals
app.py CHANGED

@@ -79,11 +79,7 @@ def main():
         gr.HTML(EVALUATION_HEADER)
         show_stale = gr.Checkbox(label="Show Stale", interactive=True)
         competition_leaderboards = []
-        comp_2 = competitions.COMPETITION_DETAILS[2]
-        # Covert the losses into % of correct answers.
-        losses_2["losses"] = losses_2["losses"].apply(
-            lambda x: 1 - x if x else None
-        )
+        comp_2 = competitions.COMPETITION_DETAILS[2]
         with gr.Accordion(f"{comp_2.name} Competition"):
             gr.HTML(comp_2.html_description)
             competition_leaderboards.append(
@@ -94,7 +90,7 @@ def main():
                 headers=[
                     "Name",
                     "Win Rate",
-                    "
+                    "Score",
                     "Weight",
                     "UID",
                     "Block",
@@ -117,18 +113,25 @@ def main():
                 x="timestamp",
                 x_title="Date",
                 y="losses",
-                y_title="
+                y_title="Score",
                 interactive=True,
                 visible=True,
                 width=1024,
-                title="Best
+                title="Best Score Over Time",
+            )
+            gr.HTML(
+                """
+                The definition of score changes over time as new evaluation tasks are added in releases.
+                <ul>
+                <li><b>Start-Oct 27</b>: % of wrong answers on synthetic MMLU</li>
+                <li><b>Oct 27-Now</b>: + word sorting eval</li>
+                """
             )
             gr.HTML(
                 """
                 <ul><li><b>Name:</b> the 🤗 Hugging Face repo (click to go to the model card)</li>
                 <li><b>Win Rate:</b> % of head-to-head evals won vs. other eval'd models, given an epsilon advantage or disadvantage</li>
-                <li><b>
-                <li><b>MC Score:</b> the % of correct multiple choice answers given by the model as calculated by the OTF validator (higher is better)</li>
+                <li><b>Score:</b> the combined model score as calculated by the OTF validator (lower is better)</li>
                 <li><b>UID:</b> the Bittensor UID of the miner</li>
                 <li><b>Weight:</b> the bittensor weight set for this model</li>
                 <li><b>Block:</b> the Bittensor block that the model was submitted in</li></ul><br/>More stats on <a href="https://taostats.io/subnets/netuid-37/" target="_blank">taostats</a>."""
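For orientation, here is a minimal, hypothetical sketch (not code from this repo) of what the display change amounts to: the old UI converted competition-2 losses into an accuracy-style "MC Score" via 1 - loss, while the updated UI surfaces the validator's combined loss directly as a "Score" column where lower is better. The DataFrame and model names below are invented for illustration.

import pandas as pd

# Hypothetical rows; the real Space builds these from Wandb validator runs.
leaderboard = pd.DataFrame(
    {
        "Name": ["org/model-a", "org/model-b"],
        "Win Rate": [0.55, 0.71],
        "losses": [0.42, 0.37],
    }
)

# Old behavior (removed in this commit): convert losses into % of correct answers.
leaderboard["MC Score"] = leaderboard["losses"].apply(lambda x: 1 - x if x else None)

# New behavior: show the combined loss directly as "Score" (lower is better).
leaderboard["Score"] = leaderboard["losses"]

print(leaderboard[["Name", "Win Rate", "Score"]])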
utils.py CHANGED

@@ -184,13 +184,6 @@ def get_scores(
        uids (List[int]): List of UIDs to get scores for.
        wandb_runs (List): List of validator runs from Wandb. Requires the runs are provided in descending order.
    """
-
-    def _maybe_convert_loss(loss: float, comp_id: int) -> float:
-        """Converts loss to score for competitions that require it."""
-        if comp_id == 2:
-            return 1 - loss if loss else None
-        return loss
-
    result = {}
    previous_timestamp = None
    seen_competitions = set()
@@ -216,9 +209,7 @@ def get_scores(
        # Only the most recent run per competition is fresh.
        is_fresh = comp_id not in seen_competitions
        result[uid] = {
-            "avg_loss":
-                uid_data.get("average_loss", None), comp_id
-            ),
+            "avg_loss": uid_data.get("average_loss", None),
            "win_rate": uid_data.get("win_rate", None),
            "win_total": uid_data.get("win_total", None),
            "weight": uid_data.get("weight", None),
@@ -283,7 +274,8 @@ def get_losses_over_time(wandb_runs: List, competition_id: int) -> pd.DataFrame:
            if c_id is None or c_id != competition_id:
                continue

-
+            # Filter out issue caused by wandb unavailability.
+            if loss < 0.99 and loss < best_loss:
                best_loss = loss
                should_add_datapoint = True
        # Now that we've processed the run's most recent steps, check if we should add a datapoint.
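The get_losses_over_time change is easier to see in isolation. The sketch below is a hypothetical standalone helper, not the repo's actual function; it mirrors the new check, which skips losses at or above 0.99 because, per the commit's comment, such values come from Wandb being unavailable rather than from a real evaluation.

from typing import List, Tuple

def best_loss_datapoints(losses: List[float]) -> List[Tuple[int, float]]:
    """Collect (step, best_loss) datapoints, mirroring the filter added in this
    commit: losses >= 0.99 are treated as Wandb-unavailability artifacts and
    never become the tracked best. Illustration only."""
    best_loss = float("inf")
    datapoints = []
    for step, loss in enumerate(losses):
        # New check from this commit: ignore placeholder-looking losses.
        if loss < 0.99 and loss < best_loss:
            best_loss = loss
            datapoints.append((step, best_loss))
    return datapoints

# A leading 1.0 reading (a Wandb glitch) no longer shows up as a bogus "best" point.
print(best_loss_datapoints([1.0, 0.95, 0.90, 0.88]))
# [(1, 0.95), (2, 0.9), (3, 0.88)]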