tdurbor committed on
Commit
892f774
·
1 Parent(s): c5aa08d

Use confidence interval and seed to have reproducible scoring

Browse files
Files changed (3) hide show
  1. app.py +24 -19
  2. db.py +10 -2
  3. rating_systems.py +1 -1
app.py CHANGED
@@ -49,30 +49,36 @@ commit_scheduler = CommitScheduler(
49
  def fetch_elo_scores():
50
  """Fetch and log Elo scores."""
51
  try:
52
- elo_scores, bootstrap_elo_scores = compute_elo_scores()
53
  logging.info("Elo scores successfully computed.")
54
- return elo_scores, bootstrap_elo_scores
55
  except Exception as e:
56
  logging.error("Error computing Elo scores: %s", str(e))
57
  return None
58
 
59
  def update_rankings_table():
60
- """Update and return the rankings table based on Elo scores."""
61
- elo_scores, bootstrap_elo_scores = fetch_elo_scores() or {}
62
- votes_per_model = compute_votes_per_model()
63
  try:
64
- rankings = [
65
- ["Photoroom", int(elo_scores["Photoroom"]), int(bootstrap_elo_scores["Photoroom"]), votes_per_model.get("Photoroom", 0)],
66
- ["RemoveBG", int(elo_scores["RemoveBG"]), int(bootstrap_elo_scores["RemoveBG"]), votes_per_model.get("RemoveBG", 0)],
67
- ["BRIA RMBG 2.0", int(elo_scores["BRIA RMBG 2.0"]), int(bootstrap_elo_scores["BRIA RMBG 2.0"]), votes_per_model.get("BRIA RMBG 2.0", 0)],
68
- ]
69
- rankings.sort(key=lambda x: x[2], reverse=True)
 
 
 
 
 
 
 
70
  except KeyError as e:
71
  logging.error("Missing score for model: %s", str(e))
72
  return []
73
  return rankings
74
 
75
-
76
  def select_new_image():
77
  """Select a new image and its segmented versions."""
78
  max_attempts = 10
@@ -354,10 +360,10 @@ def gradio_interface():
354
 
355
  with gr.Tab("🏆 Leaderboard", id=1) as leaderboard_tab:
356
  rankings_table = gr.Dataframe(
357
- headers=["Model", "Elo score", "Bootstrapped Elo score", "Selections"],
358
  value=update_rankings_table(),
359
  label="Current Model Rankings",
360
- column_widths=[180, 60, 60, 60],
361
  row_count=4
362
  )
363
 
@@ -368,12 +374,11 @@ def gradio_interface():
368
 
369
  # Explanation of Bootstrapped Elo Score
370
  explanation_text = """
371
- The Bootstrapped Elo score is a more robust estimate of the model's performance,
372
- calculated using bootstrapping techniques. This method provides a distribution of
373
- Elo scores by repeatedly sampling the data, which helps in understanding the
374
- variability and confidence in the model's ranking.
375
 
376
- We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py).
377
  """
378
  gr.Markdown(explanation_text)
379
 
 
49
  def fetch_elo_scores():
50
  """Fetch and log Elo scores."""
51
  try:
52
+ median_elo_scores, model_rating_q025, model_rating_q975, variance = compute_elo_scores()
53
  logging.info("Elo scores successfully computed.")
54
+ return median_elo_scores, model_rating_q025, model_rating_q975, variance
55
  except Exception as e:
56
  logging.error("Error computing Elo scores: %s", str(e))
57
  return None
58
 
59
  def update_rankings_table():
60
+ """Update and return the rankings table based on Elo scores and vote counts."""
61
+ median_elo_scores, model_rating_q025, model_rating_q975, variance = fetch_elo_scores() or {}
62
+ model_vote_counts = compute_votes_per_model()
63
  try:
64
+ # Create a list of models to iterate over
65
+ models = ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]
66
+ rankings = []
67
+
68
+ for model in models:
69
+ elo_score = int(median_elo_scores.get(model, 0))
70
+ model_variance = int(variance.get(model, 0))
71
+ ci_95 = f"{int(model_rating_q025.get(model, 0))} - {int(model_rating_q975.get(model, 0))}"
72
+ vote_count = model_vote_counts.get(model, 0)
73
+ rankings.append([model, elo_score, model_variance, ci_95, vote_count])
74
+
75
+ # Sort rankings by Elo score in descending order
76
+ rankings.sort(key=lambda x: x[1], reverse=True)
77
  except KeyError as e:
78
  logging.error("Missing score for model: %s", str(e))
79
  return []
80
  return rankings
81
 
 
82
  def select_new_image():
83
  """Select a new image and its segmented versions."""
84
  max_attempts = 10
 
360
 
361
  with gr.Tab("🏆 Leaderboard", id=1) as leaderboard_tab:
362
  rankings_table = gr.Dataframe(
363
+ headers=["Model", "Elo score", "Variance", "95% CI", "Selections"],
364
  value=update_rankings_table(),
365
  label="Current Model Rankings",
366
+ column_widths=[180, 60, 60, 60, 60],
367
  row_count=4
368
  )
369
 
 
374
 
375
  # Explanation of Bootstrapped Elo Score
376
  explanation_text = """
377
+ The Elo score was calculated using bootstrapping with num_rounds=1000. This method provides a
378
+ distribution of Elo scores by repeatedly sampling the data, which helps in
379
+ understanding the variability and confidence in the model's ranking.
 
380
 
381
+ We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py#L153).
382
  """
383
  gr.Markdown(explanation_text)
384
 
db.py CHANGED
@@ -5,6 +5,7 @@ from sqlalchemy.ext.declarative import declarative_base
5
  from sqlalchemy.orm import sessionmaker, Session
6
  from datetime import datetime
7
  import pandas as pd
 
8
  from datasets import load_dataset
9
  from rating_systems import compute_elo, compute_bootstrap_elo, get_median_elo_from_bootstrap
10
 
@@ -115,10 +116,17 @@ def compute_elo_scores():
115
  logging.info("Initial votes count: %d", init_size)
116
  logging.info("Votes count after validation: %d", df.shape[0])
117
 
118
- elo_scores = compute_elo(df)
 
 
119
  bootstrap_elo_scores = compute_bootstrap_elo(df)
120
  median_elo_scores = get_median_elo_from_bootstrap(bootstrap_elo_scores)
121
- return elo_scores, median_elo_scores
 
 
 
 
 
122
 
123
  # Function to compute the number of votes for each model
124
  def compute_votes_per_model():
 
5
  from sqlalchemy.orm import sessionmaker, Session
6
  from datetime import datetime
7
  import pandas as pd
8
+ import numpy as np
9
  from datasets import load_dataset
10
  from rating_systems import compute_elo, compute_bootstrap_elo, get_median_elo_from_bootstrap
11
 
 
116
  logging.info("Initial votes count: %d", init_size)
117
  logging.info("Votes count after validation: %d", df.shape[0])
118
 
119
+ # Seed the random number generator for reproducibility
120
+ np.random.seed(42)
121
+
122
  bootstrap_elo_scores = compute_bootstrap_elo(df)
123
  median_elo_scores = get_median_elo_from_bootstrap(bootstrap_elo_scores)
124
+
125
+ model_rating_q025 = bootstrap_elo_scores.quantile(0.025)
126
+ model_rating_q975 = bootstrap_elo_scores.quantile(0.975)
127
+ variance = bootstrap_elo_scores.var()
128
+
129
+ return median_elo_scores, model_rating_q025, model_rating_q975, variance
130
 
131
  # Function to compute the number of votes for each model
132
  def compute_votes_per_model():
rating_systems.py CHANGED
@@ -44,7 +44,7 @@ def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0):
44
 
45
 
46
  def compute_bootstrap_elo(
47
- df, num_round=100, k=4.0, base=10.0, init_rating=1000.0, scale=400.0
48
  ):
49
  matchups, outcomes, models = preprocess_for_elo(df)
50
  sample_indices = np.random.randint(low=0, high=len(df), size=(len(df), num_round))
 
44
 
45
 
46
  def compute_bootstrap_elo(
47
+ df, num_round=1000, k=4.0, base=10.0, init_rating=1000.0, scale=400.0
48
  ):
49
  matchups, outcomes, models = preprocess_for_elo(df)
50
  sample_indices = np.random.randint(low=0, high=len(df), size=(len(df), num_round))