Use confidence interval and seed to have reproducible scoring
- app.py +24 -19
- db.py +10 -2
- rating_systems.py +1 -1
app.py

```diff
@@ -49,30 +49,36 @@ commit_scheduler = CommitScheduler(
 def fetch_elo_scores():
     """Fetch and log Elo scores."""
     try:
-
+        median_elo_scores, model_rating_q025, model_rating_q975, variance = compute_elo_scores()
         logging.info("Elo scores successfully computed.")
-        return
+        return median_elo_scores, model_rating_q025, model_rating_q975, variance
     except Exception as e:
         logging.error("Error computing Elo scores: %s", str(e))
         return None

 def update_rankings_table():
-    """Update and return the rankings table based on Elo scores."""
-
-
+    """Update and return the rankings table based on Elo scores and vote counts."""
+    median_elo_scores, model_rating_q025, model_rating_q975, variance = fetch_elo_scores() or {}
+    model_vote_counts = compute_votes_per_model()
     try:
-
-
-
-
-
-
+        # Create a list of models to iterate over
+        models = ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]
+        rankings = []
+
+        for model in models:
+            elo_score = int(median_elo_scores.get(model, 0))
+            model_variance = int(variance.get(model, 0))
+            ci_95 = f"{int(model_rating_q025.get(model, 0))} - {int(model_rating_q975.get(model, 0))}"
+            vote_count = model_vote_counts.get(model, 0)
+            rankings.append([model, elo_score, model_variance, ci_95, vote_count])
+
+        # Sort rankings by Elo score in descending order
+        rankings.sort(key=lambda x: x[1], reverse=True)
     except KeyError as e:
         logging.error("Missing score for model: %s", str(e))
         return []
     return rankings

-
 def select_new_image():
     """Select a new image and its segmented versions."""
     max_attempts = 10
@@ -354,10 +360,10 @@ def gradio_interface():

     with gr.Tab("🏆 Leaderboard", id=1) as leaderboard_tab:
         rankings_table = gr.Dataframe(
-            headers=["Model", "Elo score", "
+            headers=["Model", "Elo score", "Variance", "95% CI", "Selections"],
             value=update_rankings_table(),
             label="Current Model Rankings",
-            column_widths=[180, 60, 60, 60],
+            column_widths=[180, 60, 60, 60, 60],
             row_count=4
         )

@@ -368,12 +374,11 @@ def gradio_interface():

         # Explanation of Bootstrapped Elo Score
         explanation_text = """
-        The
-
-
-        variability and confidence in the model's ranking.
+        The Elo score was calculated using bootstrapping with num_rounds=1000. This method provides a
+        distribution of Elo scores by repeatedly sampling the data, which helps in
+        understanding the variability and confidence in the model's ranking.

-        We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py).
+        We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py#L153).
         """
         gr.Markdown(explanation_text)
```
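To see how the widened leaderboard table fits together: each row appended by update_rankings_table() now carries five fields, matching the five headers and the five column_widths entries. A minimal sketch with invented numbers (not real leaderboard data):

```python
headers = ["Model", "Elo score", "Variance", "95% CI", "Selections"]
column_widths = [180, 60, 60, 60, 60]

# Invented values, shaped like one row produced by update_rankings_table().
example_row = ["Photoroom", 1052, 212, "1024 - 1079", 317]

assert len(example_row) == len(headers) == len(column_widths) == 5
```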
db.py

```diff
@@ -5,6 +5,7 @@ from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker, Session
 from datetime import datetime
 import pandas as pd
+import numpy as np
 from datasets import load_dataset
 from rating_systems import compute_elo, compute_bootstrap_elo, get_median_elo_from_bootstrap

@@ -115,10 +116,17 @@ def compute_elo_scores():
     logging.info("Initial votes count: %d", init_size)
     logging.info("Votes count after validation: %d", df.shape[0])

-
+    # Seed the random number generator for reproducibility
+    np.random.seed(42)
+
     bootstrap_elo_scores = compute_bootstrap_elo(df)
     median_elo_scores = get_median_elo_from_bootstrap(bootstrap_elo_scores)
-
+
+    model_rating_q025 = bootstrap_elo_scores.quantile(0.025)
+    model_rating_q975 = bootstrap_elo_scores.quantile(0.975)
+    variance = bootstrap_elo_scores.var()
+
+    return median_elo_scores, model_rating_q025, model_rating_q975, variance

 # Function to compute the number of votes for each model
 def compute_votes_per_model():
```
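The quantile and variance calls assume bootstrap_elo_scores behaves like a pandas DataFrame with one row per bootstrap round and one column per model; the exact return type of compute_bootstrap_elo is not shown in this commit, so the following is a sketch under that assumption, with synthetic scores standing in for real bootstrap output:

```python
import numpy as np
import pandas as pd

# Synthetic stand-in for bootstrap_elo_scores: 1000 bootstrap rounds x 3 models.
rng = np.random.default_rng(42)
bootstrap_elo_scores = pd.DataFrame({
    "Photoroom": rng.normal(1050, 15, size=1000),
    "RemoveBG": rng.normal(1000, 15, size=1000),
    "BRIA RMBG 2.0": rng.normal(950, 15, size=1000),
})

# Column-wise statistics; the real code uses get_median_elo_from_bootstrap for the point estimate.
median_elo_scores = bootstrap_elo_scores.median()
model_rating_q025 = bootstrap_elo_scores.quantile(0.025)  # lower end of the 95% CI
model_rating_q975 = bootstrap_elo_scores.quantile(0.975)  # upper end of the 95% CI
variance = bootstrap_elo_scores.var()

# Each statistic is a pandas Series indexed by model name, so .get(model, 0) works downstream.
print(int(model_rating_q025["Photoroom"]), "-", int(model_rating_q975["Photoroom"]))
```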
rating_systems.py

```diff
@@ -44,7 +44,7 @@ def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0):


 def compute_bootstrap_elo(
-    df, num_round=
+    df, num_round=1000, k=4.0, base=10.0, init_rating=1000.0, scale=400.0
 ):
     matchups, outcomes, models = preprocess_for_elo(df)
     sample_indices = np.random.randint(low=0, high=len(df), size=(len(df), num_round))
```