include Elo bootstrapping and vote counts
- app.py +29 -13
- db.py +43 -2
- rating_systems.py +45 -16
- requirements.txt +2 -1
app.py
CHANGED

@@ -18,7 +18,8 @@ from db import (
     get_all_votes,
     add_vote,
     is_running_in_space,
-    fill_database_once
+    fill_database_once,
+    compute_votes_per_model
 )
 
 # Load environment variables
@@ -48,23 +49,27 @@ commit_scheduler = CommitScheduler(
 def fetch_elo_scores():
     """Fetch and log Elo scores."""
     try:
-        elo_scores = compute_elo_scores()
+        elo_scores, bootstrap_elo_scores = compute_elo_scores()
         logging.info("Elo scores successfully computed.")
-        return elo_scores
+        return elo_scores, bootstrap_elo_scores
     except Exception as e:
         logging.error("Error computing Elo scores: %s", str(e))
         return None
 
 def update_rankings_table():
     """Update and return the rankings table based on Elo scores."""
-    elo_scores = fetch_elo_scores() or {}
-    …
+    elo_scores, bootstrap_elo_scores = fetch_elo_scores() or {}
+    votes_per_model = compute_votes_per_model()
+    try:
+        rankings = [
+            ["Photoroom", int(elo_scores["Photoroom"]), int(bootstrap_elo_scores["Photoroom"]), votes_per_model.get("Photoroom", 0)],
+            ["RemoveBG", int(elo_scores["RemoveBG"]), int(bootstrap_elo_scores["RemoveBG"]), votes_per_model.get("RemoveBG", 0)],
+            ["BRIA RMBG 2.0", int(elo_scores["BRIA RMBG 2.0"]), int(bootstrap_elo_scores["BRIA RMBG 2.0"]), votes_per_model.get("BRIA RMBG 2.0", 0)],
+        ]
+    except KeyError as e:
+        logging.error("Missing score for model: %s", str(e))
+        return []
+    rankings.sort(key=lambda x: x[2], reverse=True)
     return rankings
 
 
@@ -349,10 +354,10 @@ def gradio_interface():
 
         with gr.Tab("🏆 Leaderboard", id=1) as leaderboard_tab:
             rankings_table = gr.Dataframe(
-                headers=["Model", "
+                headers=["Model", "Elo score", "Bootstrapped Elo score", "Selections"],
                 value=update_rankings_table(),
                 label="Current Model Rankings",
-                column_widths=[180, 60],
+                column_widths=[180, 60, 60, 60],
                 row_count=4
             )
 
@@ -361,6 +366,17 @@ def gradio_interface():
                 outputs=rankings_table
             )
 
+            # Explanation of Bootstrapped Elo Score
+            explanation_text = """
+            The Bootstrapped Elo score is a more robust estimate of the model's performance,
+            calculated using bootstrapping techniques. This method provides a distribution of
+            Elo scores by repeatedly sampling the data, which helps in understanding the
+            variability and confidence in the model's ranking.
+
+            We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py).
+            """
+            gr.Markdown(explanation_text)
+
         with gr.Tab("📊 Vote Data", id=2) as vote_data_tab:
             def update_vote_data():
                 votes = get_all_votes()
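For reference, each row produced by update_rankings_table() lines up one-to-one with the four Dataframe headers, and the table is sorted by the bootstrapped Elo column. A minimal sketch of the expected shape (the scores and counts below are made-up values, not real leaderboard numbers):

    # Illustrative only: real rows come from compute_elo_scores() and compute_votes_per_model().
    rankings = [
        # [Model, Elo score, bootstrapped Elo score, selections]
        ["Photoroom", 1050, 1047, 120],
        ["RemoveBG", 1012, 1010, 95],
        ["BRIA RMBG 2.0", 962, 965, 80],
    ]
    rankings.sort(key=lambda x: x[2], reverse=True)  # sort by bootstrapped Elo, as in the diff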
db.py
CHANGED

@@ -6,7 +6,7 @@ from sqlalchemy.orm import sessionmaker, Session
 from datetime import datetime
 import pandas as pd
 from datasets import load_dataset
-from rating_systems import compute_elo
+from rating_systems import compute_elo, compute_bootstrap_elo, get_median_elo_from_bootstrap
 
 def is_running_in_space():
     return "SPACE_ID" in os.environ
@@ -88,6 +88,8 @@ def get_all_votes():
 
 # Function to compute Elo scores
 def compute_elo_scores():
+    valid_models = ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]
+
     with SessionLocal() as db:
         votes = db.query(Vote).all()
         data = {
@@ -96,5 +98,44 @@ def compute_elo_scores():
             "winner": [vote.winner for vote in votes]
         }
         df = pd.DataFrame(data)
+        init_size = df.shape[0]
+
+        # Remove votes missing model_a, model_b or winner info
+        df.dropna(subset=["model_a", "model_b", "winner"], inplace=True)
+
+        # Validate models and winner
+        def is_valid_vote(row):
+            if row["model_a"] not in valid_models or row["model_b"] not in valid_models:
+                return False
+            if row["winner"] not in ["model_a", "model_b", "tie"]:
+                return False
+            return True
+
+        df = df[df.apply(is_valid_vote, axis=1)]
+        logging.info("Initial votes count: %d", init_size)
+        logging.info("Votes count after validation: %d", df.shape[0])
+
         elo_scores = compute_elo(df)
-        …
+        bootstrap_elo_scores = compute_bootstrap_elo(df)
+        median_elo_scores = get_median_elo_from_bootstrap(bootstrap_elo_scores)
+        return elo_scores, median_elo_scores
+
+# Function to compute the number of votes for each model
+def compute_votes_per_model():
+    with SessionLocal() as db:
+        votes = db.query(Vote).all()
+        model_vote_count = {}
+
+        for vote in votes:
+            if vote.winner == "model_a":
+                model = vote.model_a
+            elif vote.winner == "model_b":
+                model = vote.model_b
+            else:
+                continue
+
+            if model not in model_vote_count:
+                model_vote_count[model] = 0
+            model_vote_count[model] += 1
+
+        return model_vote_count
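The validation added to compute_elo_scores() can be exercised in isolation. A minimal, self-contained sketch using made-up votes (model names taken from the diff; the data is illustrative):

    import pandas as pd

    valid_models = ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]
    df = pd.DataFrame({
        "model_a": ["Photoroom", "RemoveBG", None, "UnknownModel"],
        "model_b": ["RemoveBG", "BRIA RMBG 2.0", "Photoroom", "Photoroom"],
        "winner": ["model_a", "tie", "model_b", "model_a"],
    })

    # Drop votes missing any of the three fields, then keep only known models
    # and recognized winner values, mirroring the diff's is_valid_vote().
    df.dropna(subset=["model_a", "model_b", "winner"], inplace=True)

    def is_valid_vote(row):
        return (row["model_a"] in valid_models
                and row["model_b"] in valid_models
                and row["winner"] in ["model_a", "model_b", "tie"])

    df = df[df.apply(is_valid_vote, axis=1)]
    print(df)  # only the first two, fully valid rows survive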
rating_systems.py
CHANGED

@@ -6,6 +6,7 @@ import pandas as pd
 import numpy as np
 from sqlalchemy.orm import Session
 import pandas as pd
+from scipy.special import expit
 
 def get_matchups_models(df):
     n_rows = len(df)
@@ -41,19 +42,47 @@ def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0):
     return {model: ratings[idx] for idx, model in enumerate(models)}
 
 
-…
+
+def compute_bootstrap_elo(
+    df, num_round=100, k=4.0, base=10.0, init_rating=1000.0, scale=400.0
+):
+    matchups, outcomes, models = preprocess_for_elo(df)
+    sample_indices = np.random.randint(low=0, high=len(df), size=(len(df), num_round))
+    ratings = fit_vectorized_elo(
+        matchups, outcomes, sample_indices, len(models), k, base, init_rating, scale
+    )
+    df = pd.DataFrame(data=ratings, columns=models)
+    return df[df.median().sort_values(ascending=False).index]
+
+def fit_vectorized_elo(
+    matchups,
+    outcomes,
+    sample_indices,
+    num_models,
+    k=4.0,
+    base=10.0,
+    init_rating=1000.0,
+    scale=400.0,
+):
+    """Fit multiple sets of Elo ratings on different samples of the data at the same time."""
+    alpha = math.log(base) / scale
+    num_samples = sample_indices.shape[1]
+    ratings = np.zeros(shape=(num_samples, num_models), dtype=np.float64)
+    # Iterate over the rows of sample_indices; each column is an index into a match in the input arrays.
+    sample_range = np.arange(num_samples)
+    for matchup_indices in sample_indices:
+        model_a_indices = matchups[matchup_indices, 0]
+        model_b_indices = matchups[matchup_indices, 1]
+        model_a_ratings = ratings[sample_range, model_a_indices]
+        model_b_ratings = ratings[sample_range, model_b_indices]
+        sample_outcomes = outcomes[matchup_indices]
+        probs = expit(alpha * (model_a_ratings - model_b_ratings))
+        updates = k * (sample_outcomes - probs)
+        ratings[sample_range, model_a_indices] += updates
+        ratings[sample_range, model_b_indices] -= updates
+    return ratings + init_rating
+
+def get_median_elo_from_bootstrap(bootstrap_df):
+    median = dict(bootstrap_df.quantile(0.5))
+    median = {k: int(v + 0.5) for k, v in median.items()}
+    return median
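At its core, fit_vectorized_elo applies the standard online Elo update to every bootstrap sample at once. A standalone sketch of a single update for one match, assuming outcomes are encoded as 1.0 (model_a wins), 0.5 (tie) and 0.0 (model_b wins), as in the FastChat code the app links to; the ratings are illustrative:

    import math
    from scipy.special import expit

    k, base, scale = 4.0, 10.0, 400.0
    alpha = math.log(base) / scale

    rating_a, rating_b = 1000.0, 1000.0
    outcome = 1.0  # model_a won this match

    # Expected score for model_a given the current rating gap (0.5 for equal ratings).
    prob_a = expit(alpha * (rating_a - rating_b))
    update = k * (outcome - prob_a)

    rating_a += update
    rating_b -= update
    print(rating_a, rating_b)  # 1002.0 998.0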
requirements.txt
CHANGED

@@ -10,4 +10,5 @@ SQLAlchemy==2.0.36
 uvicorn==0.30.1
 py-spy
 pandas
-datasets
+datasets
+scipy