tdurbor committed on
Commit
5fef682
·
1 Parent(s): 7f0069a

include Elo bootstrapping and counts

Browse files
Files changed (4) hide show
  1. app.py +29 -13
  2. db.py +43 -2
  3. rating_systems.py +45 -16
  4. requirements.txt +2 -1
app.py CHANGED
@@ -18,7 +18,8 @@ from db import (
18
  get_all_votes,
19
  add_vote,
20
  is_running_in_space,
21
- fill_database_once
 
22
  )
23
 
24
  # Load environment variables
@@ -48,23 +49,27 @@ commit_scheduler = CommitScheduler(
48
  def fetch_elo_scores():
49
  """Fetch and log Elo scores."""
50
  try:
51
- elo_scores = compute_elo_scores()
52
  logging.info("Elo scores successfully computed.")
53
- return elo_scores
54
  except Exception as e:
55
  logging.error("Error computing Elo scores: %s", str(e))
56
  return None
57
 
58
  def update_rankings_table():
59
  """Update and return the rankings table based on Elo scores."""
60
- elo_scores = fetch_elo_scores() or {}
61
- default_score = 1000
62
- rankings = [
63
- ["Photoroom", int(elo_scores.get("Photoroom", default_score))],
64
- ["RemoveBG", int(elo_scores.get("RemoveBG", default_score))],
65
- ["BRIA RMBG 2.0", int(elo_scores.get("BRIA RMBG 2.0", default_score))],
66
- ]
67
- rankings.sort(key=lambda x: x[1], reverse=True)
 
 
 
 
68
  return rankings
69
 
70
 
@@ -349,10 +354,10 @@ def gradio_interface():
349
 
350
  with gr.Tab("🏆 Leaderboard", id=1) as leaderboard_tab:
351
  rankings_table = gr.Dataframe(
352
- headers=["Model", "Ranking"],
353
  value=update_rankings_table(),
354
  label="Current Model Rankings",
355
- column_widths=[180, 60],
356
  row_count=4
357
  )
358
 
@@ -361,6 +366,17 @@ def gradio_interface():
361
  outputs=rankings_table
362
  )
363
 
 
 
 
 
 
 
 
 
 
 
 
364
  with gr.Tab("📊 Vote Data", id=2) as vote_data_tab:
365
  def update_vote_data():
366
  votes = get_all_votes()
 
18
  get_all_votes,
19
  add_vote,
20
  is_running_in_space,
21
+ fill_database_once,
22
+ compute_votes_per_model
23
  )
24
 
25
  # Load environment variables
 
def fetch_elo_scores():
    """Compute both Elo leaderboards and log the outcome.

    Returns:
        The ``(elo_scores, bootstrap_elo_scores)`` pair unpacked from
        ``compute_elo_scores()``, or ``None`` when that call (or the
        unpacking of its result) raises.
    """
    try:
        plain_scores, bootstrapped_scores = compute_elo_scores()
        logging.info("Elo scores successfully computed.")
    except Exception as exc:
        # Best-effort: the failure is logged and reported to the caller as None.
        logging.error("Error computing Elo scores: %s", str(exc))
        return None
    return plain_scores, bootstrapped_scores
58
 
def update_rankings_table():
    """Build the leaderboard rows from the current Elo scores.

    Returns:
        A list of ``[model, elo, bootstrapped_elo, selections]`` rows sorted
        by bootstrapped Elo, highest first. An empty list is returned when
        the scores could not be computed or a model has no score.
    """
    scores = fetch_elo_scores()
    if not scores:
        # fetch_elo_scores() returns None on failure (and logs the error
        # itself); None cannot be unpacked into the two score mappings, so
        # fall back to an empty table instead of raising ValueError.
        return []
    elo_scores, bootstrap_elo_scores = scores
    votes_per_model = compute_votes_per_model()

    try:
        rankings = [
            [
                model,
                int(elo_scores[model]),
                int(bootstrap_elo_scores[model]),
                votes_per_model.get(model, 0),
            ]
            for model in ("Photoroom", "RemoveBG", "BRIA RMBG 2.0")
        ]
    except KeyError as e:
        logging.error("Missing score for model: %s", str(e))
        return []

    # Rank on the bootstrapped (median) Elo, i.e. column index 2.
    rankings.sort(key=lambda row: row[2], reverse=True)
    return rankings
74
 
75
 
 
354
 
355
  with gr.Tab("🏆 Leaderboard", id=1) as leaderboard_tab:
356
  rankings_table = gr.Dataframe(
357
+ headers=["Model", "Elo score", "Bootstrapped Elo score", "Selections"],
358
  value=update_rankings_table(),
359
  label="Current Model Rankings",
360
+ column_widths=[180, 60, 60, 60],
361
  row_count=4
362
  )
363
 
 
366
  outputs=rankings_table
367
  )
368
 
369
+ # Explanation of Bootstrapped Elo Score
370
+ explanation_text = """
371
+ The Bootstrapped Elo score is a more robust estimate of the model's performance,
372
+ calculated using bootstrapping techniques. This method provides a distribution of
373
+ Elo scores by repeatedly sampling the data, which helps in understanding the
374
+ variability and confidence in the model's ranking.
375
+
376
+ We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py).
377
+ """
378
+ gr.Markdown(explanation_text)
379
+
380
  with gr.Tab("📊 Vote Data", id=2) as vote_data_tab:
381
  def update_vote_data():
382
  votes = get_all_votes()
db.py CHANGED
@@ -6,7 +6,7 @@ from sqlalchemy.orm import sessionmaker, Session
6
  from datetime import datetime
7
  import pandas as pd
8
  from datasets import load_dataset
9
- from rating_systems import compute_elo
10
 
11
  def is_running_in_space():
12
  return "SPACE_ID" in os.environ
@@ -88,6 +88,8 @@ def get_all_votes():
88
 
89
  # Function to compute Elo scores
90
  def compute_elo_scores():
 
 
91
  with SessionLocal() as db:
92
  votes = db.query(Vote).all()
93
  data = {
@@ -96,5 +98,44 @@ def compute_elo_scores():
96
  "winner": [vote.winner for vote in votes]
97
  }
98
  df = pd.DataFrame(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  elo_scores = compute_elo(df)
100
- return elo_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  from datetime import datetime
7
  import pandas as pd
8
  from datasets import load_dataset
9
+ from rating_systems import compute_elo, compute_bootstrap_elo, get_median_elo_from_bootstrap
10
 
11
  def is_running_in_space():
12
  return "SPACE_ID" in os.environ
 
88
 
89
  # Function to compute Elo scores
90
  def compute_elo_scores():
91
+ valid_models = ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]
92
+
93
  with SessionLocal() as db:
94
  votes = db.query(Vote).all()
95
  data = {
 
98
  "winner": [vote.winner for vote in votes]
99
  }
100
  df = pd.DataFrame(data)
101
+ init_size = df.shape[0]
102
+
103
+ # Remove votes missing model_a, model_b or winner info
104
+ df.dropna(subset=["model_a", "model_b", "winner"], inplace=True)
105
+
106
+ # Validate models and winner
107
+ def is_valid_vote(row):
108
+ if row["model_a"] not in valid_models or row["model_b"] not in valid_models:
109
+ return False
110
+ if row["winner"] not in ["model_a", "model_b", "tie"]:
111
+ return False
112
+ return True
113
+
114
+ df = df[df.apply(is_valid_vote, axis=1)]
115
+ logging.info("Initial votes count: %d", init_size)
116
+ logging.info("Votes count after validation: %d", df.shape[0])
117
+
118
  elo_scores = compute_elo(df)
119
+ bootstrap_elo_scores = compute_bootstrap_elo(df)
120
+ median_elo_scores = get_median_elo_from_bootstrap(bootstrap_elo_scores)
121
+ return elo_scores, median_elo_scores
122
+
def compute_votes_per_model():
    """Count how many times each model was picked as the winner.

    Every stored vote whose ``winner`` is ``"model_a"`` or ``"model_b"``
    adds one to the model named in the matching column; any other outcome
    is skipped.

    Returns:
        dict mapping model name to its number of winning votes.
    """
    with SessionLocal() as db:
        tally = {}
        for vote in db.query(Vote).all():
            if vote.winner not in ("model_a", "model_b"):
                continue
            chosen = vote.model_a if vote.winner == "model_a" else vote.model_b
            tally[chosen] = tally.get(chosen, 0) + 1
        return tally
rating_systems.py CHANGED
@@ -6,6 +6,7 @@ import pandas as pd
6
  import numpy as np
7
  from sqlalchemy.orm import Session
8
  import pandas as pd
 
9
 
10
  def get_matchups_models(df):
11
  n_rows = len(df)
@@ -41,19 +42,47 @@ def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0):
41
  return {model: ratings[idx] for idx, model in enumerate(models)}
42
 
43
 
44
- def compute_elo_from_votes(db: Session):
45
- # Retrieve all votes from the database
46
- votes = db.query(Vote).all()
47
-
48
- # Convert votes to a DataFrame
49
- data = {
50
- "model_a": [vote.model_a for vote in votes],
51
- "model_b": [vote.model_b for vote in votes],
52
- "winner": [vote.winner for vote in votes]
53
- }
54
- df = pd.DataFrame(data)
55
-
56
- # Compute Elo scores using the existing function
57
- elo_scores = compute_elo(df)
58
-
59
- return elo_scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import numpy as np
7
  from sqlalchemy.orm import Session
8
  import pandas as pd
9
+ from scipy.special import expit
10
 
11
  def get_matchups_models(df):
12
  n_rows = len(df)
 
42
  return {model: ratings[idx] for idx, model in enumerate(models)}
43
 
44
 
45
+
def compute_bootstrap_elo(
    df, num_round=100, k=4.0, base=10.0, init_rating=1000.0, scale=400.0, *, seed=None
):
    """Fit Elo ratings on ``num_round`` bootstrap resamples of the votes.

    Row indices in ``[0, len(df))`` are drawn with replacement into an array
    of shape ``(len(df), num_round)`` and handed to ``fit_vectorized_elo``
    together with the arrays produced by ``preprocess_for_elo(df)``.

    Args:
        df: vote table accepted by ``preprocess_for_elo``.
        num_round: number of bootstrap resamples.
        k, base, init_rating, scale: Elo parameters forwarded unchanged to
            ``fit_vectorized_elo``.
        seed: optional seed for a private ``numpy.random.RandomState``.
            When ``None`` (the default) the global ``numpy.random`` state is
            used, exactly as before, so results differ between calls.

    Returns:
        pandas.DataFrame of the fitted ratings with one column per model,
        columns ordered by descending median rating.
    """
    matchups, outcomes, models = preprocess_for_elo(df)
    num_matches = len(df)

    # A dedicated RandomState keeps seeded runs reproducible without
    # touching the global generator other code may rely on.
    sampler = np.random if seed is None else np.random.RandomState(seed)
    sample_indices = sampler.randint(
        low=0, high=num_matches, size=(num_matches, num_round)
    )

    ratings = fit_vectorized_elo(
        matchups, outcomes, sample_indices, len(models), k, base, init_rating, scale
    )
    bootstrap_df = pd.DataFrame(data=ratings, columns=models)
    column_order = bootstrap_df.median().sort_values(ascending=False).index
    return bootstrap_df[column_order]
56
+
def fit_vectorized_elo(
    matchups,
    outcomes,
    sample_indices,
    num_models,
    k=4.0,
    base=10.0,
    init_rating=1000.0,
    scale=400.0,
):
    """Fit several Elo rating sets at once, one per resample of the matches.

    Args:
        matchups: integer array indexed as ``matchups[i, 0]`` /
            ``matchups[i, 1]`` for the model indices of match ``i``.
        outcomes: array indexed by match; the value is compared against the
            predicted win probability of the first model.
        sample_indices: integer array whose columns are the resamples and
            whose rows are the successive match indices each resample plays.
        num_models: number of rating columns to allocate.
        k: step size of each rating update.
        base, scale: define the logistic slope ``log(base) / scale``.
        init_rating: offset added to every fitted rating at the end.

    Returns:
        ``float64`` array of shape ``(sample_indices.shape[1], num_models)``.
    """
    slope = math.log(base) / scale
    n_resamples = sample_indices.shape[1]
    # Ratings are tracked as offsets from zero; init_rating is added on return.
    offsets = np.zeros(shape=(n_resamples, num_models), dtype=np.float64)
    rows = np.arange(n_resamples)

    # Each row of sample_indices holds, per resample, the match played at
    # this step, so a single pass updates every resample simultaneously.
    for picks in sample_indices:
        first_cols = matchups[picks, 0]
        second_cols = matchups[picks, 1]
        first_now = offsets[rows, first_cols]
        second_now = offsets[rows, second_cols]
        expected = expit(slope * (first_now - second_now))
        step = k * (outcomes[picks] - expected)
        offsets[rows, first_cols] += step
        offsets[rows, second_cols] -= step

    return offsets + init_rating
84
+
def get_median_elo_from_bootstrap(bootstrap_df):
    """Return each model's median bootstrap rating, rounded to an integer.

    Args:
        bootstrap_df: DataFrame with one column per model and one row per
            bootstrap round.

    Returns:
        dict mapping column label to the column median rounded half up.

    Raises:
        ValueError: if a column's median is NaN (e.g. the frame has no rows).
    """
    medians = bootstrap_df.quantile(0.5)
    # floor(v + 0.5) rounds half up for every real v; the bare int(v + 0.5)
    # truncates toward zero and is therefore wrong for values below -0.5.
    return {model: int(np.floor(value + 0.5)) for model, value in medians.items()}
requirements.txt CHANGED
@@ -10,4 +10,5 @@ SQLAlchemy==2.0.36
10
  uvicorn==0.30.1
11
  py-spy
12
  pandas
13
- datasets
 
 
10
  uvicorn==0.30.1
11
  py-spy
12
  pandas
13
+ datasets
14
+ scipy