Spaces:
Running
Running
Leaderboard, decimal places
Browse files
app.py
CHANGED
@@ -28,11 +28,9 @@ from common import (
|
|
28 |
)
|
29 |
from example_metrics import EXAMPLE_METRICS
|
30 |
|
31 |
-
import hashlib
|
32 |
-
|
33 |
|
34 |
# Model and ELO score data
|
35 |
-
DEFAULT_ELO =
|
36 |
K_FACTOR = 32 # Standard chess K-factor, adjust as needed
|
37 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
38 |
vote_counts = defaultdict(int)
|
@@ -210,7 +208,7 @@ def get_current_votes():
|
|
210 |
return get_votes(db)
|
211 |
|
212 |
|
213 |
-
def get_leaderboard():
|
214 |
"""Generate leaderboard data using fresh votes from MongoDB."""
|
215 |
# Get fresh voting data
|
216 |
voting_data = get_current_votes()
|
@@ -263,12 +261,16 @@ def get_leaderboard():
|
|
263 |
leaderboard = []
|
264 |
for model in model_data.keys():
|
265 |
votes = matches[model]
|
|
|
|
|
|
|
|
|
266 |
elo = ratings[model]
|
267 |
ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
|
268 |
data = {
|
269 |
"Model": model,
|
270 |
-
"ELO Score": f"{elo
|
271 |
-
"95% CI": f"±{ci
|
272 |
"# Votes": votes,
|
273 |
"Organization": model_data[model]["organization"],
|
274 |
"License": model_data[model]["license"],
|
@@ -532,12 +534,52 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
532 |
gr.Markdown(ACKNOWLEDGEMENTS)
|
533 |
|
534 |
with gr.TabItem("Leaderboard"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
535 |
stats_display = gr.Markdown()
|
536 |
leaderboard_table = gr.Dataframe(
|
537 |
headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
|
538 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
539 |
)
|
540 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
541 |
with gr.TabItem("Policy"):
|
542 |
gr.Markdown(POLICY_CONTENT)
|
543 |
|
@@ -758,29 +800,6 @@ with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
|
|
758 |
outputs=[send_btn, regenerate_button],
|
759 |
)
|
760 |
|
761 |
-
# Update the leaderboard
|
762 |
-
def refresh_leaderboard():
|
763 |
-
"""Refresh the leaderboard data and stats."""
|
764 |
-
leaderboard = get_leaderboard()
|
765 |
-
data = [
|
766 |
-
[
|
767 |
-
entry["Model"],
|
768 |
-
float(entry["ELO Score"]),
|
769 |
-
entry["95% CI"],
|
770 |
-
entry["# Votes"],
|
771 |
-
entry["Organization"],
|
772 |
-
entry["License"],
|
773 |
-
]
|
774 |
-
for entry in leaderboard
|
775 |
-
]
|
776 |
-
stats = get_leaderboard_stats()
|
777 |
-
return [gr.update(value=data), gr.update(value=stats)]
|
778 |
-
|
779 |
-
# Add the load event at the very end, just before demo.launch()
|
780 |
-
demo.load(
|
781 |
-
fn=refresh_leaderboard, inputs=None, outputs=[leaderboard_table, stats_display]
|
782 |
-
)
|
783 |
-
|
784 |
# Add click handlers for metric buttons
|
785 |
outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
|
786 |
|
|
|
28 |
)
|
29 |
from example_metrics import EXAMPLE_METRICS
|
30 |
|
|
|
|
|
31 |
|
32 |
# Model and ELO score data
|
33 |
+
DEFAULT_ELO = 1200 # Starting ELO for new models
|
34 |
K_FACTOR = 32 # Standard chess K-factor, adjust as needed
|
35 |
elo_scores = defaultdict(lambda: DEFAULT_ELO)
|
36 |
vote_counts = defaultdict(int)
|
|
|
208 |
return get_votes(db)
|
209 |
|
210 |
|
211 |
+
def get_leaderboard(show_preliminary=True):
|
212 |
"""Generate leaderboard data using fresh votes from MongoDB."""
|
213 |
# Get fresh voting data
|
214 |
voting_data = get_current_votes()
|
|
|
261 |
leaderboard = []
|
262 |
for model in model_data.keys():
|
263 |
votes = matches[model]
|
264 |
+
# Skip models with < 500 votes if show_preliminary is False
|
265 |
+
if not show_preliminary and votes < 500:
|
266 |
+
continue
|
267 |
+
|
268 |
elo = ratings[model]
|
269 |
ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
|
270 |
data = {
|
271 |
"Model": model,
|
272 |
+
"ELO Score": f"{int(elo)}",
|
273 |
+
"95% CI": f"±{int(ci)}",
|
274 |
"# Votes": votes,
|
275 |
"Organization": model_data[model]["organization"],
|
276 |
"License": model_data[model]["license"],
|
|
|
534 |
gr.Markdown(ACKNOWLEDGEMENTS)
|
535 |
|
536 |
with gr.TabItem("Leaderboard"):
|
537 |
+
with gr.Row():
|
538 |
+
with gr.Column(scale=1):
|
539 |
+
show_preliminary = gr.Checkbox(
|
540 |
+
label="Reveal preliminary results",
|
541 |
+
value=True, # Checked by default
|
542 |
+
info="Show all models, including models with less few human ratings (< 500 votes)",
|
543 |
+
interactive=True
|
544 |
+
)
|
545 |
stats_display = gr.Markdown()
|
546 |
leaderboard_table = gr.Dataframe(
|
547 |
headers=["Model", "ELO", "95% CI", "Matches", "Organization", "License"],
|
548 |
datatype=["str", "number", "str", "number", "str", "str", "str"],
|
549 |
)
|
550 |
|
551 |
+
# Update refresh_leaderboard to use the checkbox value
|
552 |
+
def refresh_leaderboard(show_preliminary):
|
553 |
+
"""Refresh the leaderboard data and stats."""
|
554 |
+
leaderboard = get_leaderboard(show_preliminary)
|
555 |
+
data = [
|
556 |
+
[
|
557 |
+
entry["Model"],
|
558 |
+
float(entry["ELO Score"]),
|
559 |
+
entry["95% CI"],
|
560 |
+
entry["# Votes"],
|
561 |
+
entry["Organization"],
|
562 |
+
entry["License"],
|
563 |
+
]
|
564 |
+
for entry in leaderboard
|
565 |
+
]
|
566 |
+
stats = get_leaderboard_stats()
|
567 |
+
return [gr.update(value=data), gr.update(value=stats)]
|
568 |
+
|
569 |
+
# Add change handler for checkbox
|
570 |
+
show_preliminary.change(
|
571 |
+
fn=refresh_leaderboard,
|
572 |
+
inputs=[show_preliminary],
|
573 |
+
outputs=[leaderboard_table, stats_display]
|
574 |
+
)
|
575 |
+
|
576 |
+
# Update the load event
|
577 |
+
demo.load(
|
578 |
+
fn=refresh_leaderboard,
|
579 |
+
inputs=[show_preliminary],
|
580 |
+
outputs=[leaderboard_table, stats_display]
|
581 |
+
)
|
582 |
+
|
583 |
with gr.TabItem("Policy"):
|
584 |
gr.Markdown(POLICY_CONTENT)
|
585 |
|
|
|
800 |
outputs=[send_btn, regenerate_button],
|
801 |
)
|
802 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
803 |
# Add click handlers for metric buttons
|
804 |
outputs_list = [eval_prompt] + [var_input for _, var_input in variable_rows]
|
805 |
|