include Elo bootstrapping and vote counts
- app.py +29 -13
- db.py +43 -2
- rating_systems.py +45 -16
- requirements.txt +2 -1
app.py
CHANGED

@@ -18,7 +18,8 @@ from db import (
     get_all_votes,
     add_vote,
     is_running_in_space,
-    fill_database_once
+    fill_database_once,
+    compute_votes_per_model
 )
 
 # Load environment variables
@@ -48,23 +49,27 @@ commit_scheduler = CommitScheduler(
 def fetch_elo_scores():
     """Fetch and log Elo scores."""
     try:
-        elo_scores = compute_elo_scores()
+        elo_scores, bootstrap_elo_scores = compute_elo_scores()
         logging.info("Elo scores successfully computed.")
-        return elo_scores
+        return elo_scores, bootstrap_elo_scores
     except Exception as e:
         logging.error("Error computing Elo scores: %s", str(e))
         return None
 
 def update_rankings_table():
     """Update and return the rankings table based on Elo scores."""
-    elo_scores = fetch_elo_scores() or {}
-    …
+    elo_scores, bootstrap_elo_scores = fetch_elo_scores() or {}
+    votes_per_model = compute_votes_per_model()
+    try:
+        rankings = [
+            ["Photoroom", int(elo_scores["Photoroom"]), int(bootstrap_elo_scores["Photoroom"]), votes_per_model.get("Photoroom", 0)],
+            ["RemoveBG", int(elo_scores["RemoveBG"]), int(bootstrap_elo_scores["RemoveBG"]), votes_per_model.get("RemoveBG", 0)],
+            ["BRIA RMBG 2.0", int(elo_scores["BRIA RMBG 2.0"]), int(bootstrap_elo_scores["BRIA RMBG 2.0"]), votes_per_model.get("BRIA RMBG 2.0", 0)],
+        ]
+    except KeyError as e:
+        logging.error("Missing score for model: %s", str(e))
+        return []
+    rankings.sort(key=lambda x: x[2], reverse=True)
     return rankings
 
 
@@ -349,10 +354,10 @@ def gradio_interface():
 
         with gr.Tab("🏆 Leaderboard", id=1) as leaderboard_tab:
             rankings_table = gr.Dataframe(
-                headers=["Model", "
+                headers=["Model", "Elo score", "Bootstrapped Elo score", "Selections"],
                 value=update_rankings_table(),
                 label="Current Model Rankings",
-                column_widths=[180, 60],
+                column_widths=[180, 60, 60, 60],
                 row_count=4
             )
 
@@ -361,6 +366,17 @@ def gradio_interface():
                 outputs=rankings_table
             )
 
+            # Explanation of Bootstrapped Elo Score
+            explanation_text = """
+            The Bootstrapped Elo score is a more robust estimate of the model's performance,
+            calculated using bootstrapping techniques. This method provides a distribution of
+            Elo scores by repeatedly sampling the data, which helps in understanding the
+            variability and confidence in the model's ranking.
+
+            We used the approach from the Chatbot Arena [rating system code](https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/monitor/rating_systems.py).
+            """
+            gr.Markdown(explanation_text)
+
         with gr.Tab("📊 Vote Data", id=2) as vote_data_tab:
             def update_vote_data():
                 votes = get_all_votes()
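For reference, each row produced by update_rankings_table() lines up one-to-one with the four Dataframe headers, and the table is sorted by the bootstrapped Elo column. A minimal sketch of the expected shape (the scores and counts below are made-up values, not real leaderboard numbers):

    # Illustrative only: real rows come from compute_elo_scores() and compute_votes_per_model().
    rankings = [
        # [Model, Elo score, bootstrapped Elo score, selections]
        ["Photoroom", 1050, 1047, 120],
        ["RemoveBG", 1012, 1010, 95],
        ["BRIA RMBG 2.0", 962, 965, 80],
    ]
    rankings.sort(key=lambda x: x[2], reverse=True)  # sort by bootstrapped Elo, as in the diff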
db.py
CHANGED

@@ -6,7 +6,7 @@ from sqlalchemy.orm import sessionmaker, Session
 from datetime import datetime
 import pandas as pd
 from datasets import load_dataset
-from rating_systems import compute_elo
+from rating_systems import compute_elo, compute_bootstrap_elo, get_median_elo_from_bootstrap
 
 def is_running_in_space():
     return "SPACE_ID" in os.environ
@@ -88,6 +88,8 @@ def get_all_votes():
 
 # Function to compute Elo scores
 def compute_elo_scores():
+    valid_models = ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]
+
     with SessionLocal() as db:
         votes = db.query(Vote).all()
         data = {
@@ -96,5 +98,44 @@ def compute_elo_scores():
             "winner": [vote.winner for vote in votes]
         }
         df = pd.DataFrame(data)
+        init_size = df.shape[0]
+
+        # Remove votes missing model_a, model_b or winner info
+        df.dropna(subset=["model_a", "model_b", "winner"], inplace=True)
+
+        # Validate models and winner
+        def is_valid_vote(row):
+            if row["model_a"] not in valid_models or row["model_b"] not in valid_models:
+                return False
+            if row["winner"] not in ["model_a", "model_b", "tie"]:
+                return False
+            return True
+
+        df = df[df.apply(is_valid_vote, axis=1)]
+        logging.info("Initial votes count: %d", init_size)
+        logging.info("Votes count after validation: %d", df.shape[0])
+
         elo_scores = compute_elo(df)
-        …
+        bootstrap_elo_scores = compute_bootstrap_elo(df)
+        median_elo_scores = get_median_elo_from_bootstrap(bootstrap_elo_scores)
+        return elo_scores, median_elo_scores
+
+# Function to compute the number of votes for each model
+def compute_votes_per_model():
+    with SessionLocal() as db:
+        votes = db.query(Vote).all()
+        model_vote_count = {}
+
+        for vote in votes:
+            if vote.winner == "model_a":
+                model = vote.model_a
+            elif vote.winner == "model_b":
+                model = vote.model_b
+            else:
+                continue
+
+            if model not in model_vote_count:
+                model_vote_count[model] = 0
+            model_vote_count[model] += 1
+
+        return model_vote_count
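The validation added to compute_elo_scores() can be exercised in isolation. A minimal, self-contained sketch using made-up votes (model names taken from the diff; the data is illustrative):

    import pandas as pd

    valid_models = ["Photoroom", "RemoveBG", "BRIA RMBG 2.0"]
    df = pd.DataFrame({
        "model_a": ["Photoroom", "RemoveBG", None, "UnknownModel"],
        "model_b": ["RemoveBG", "BRIA RMBG 2.0", "Photoroom", "Photoroom"],
        "winner": ["model_a", "tie", "model_b", "model_a"],
    })

    # Drop votes missing any of the three fields, then keep only known models
    # and recognized winner values, mirroring the diff's is_valid_vote().
    df.dropna(subset=["model_a", "model_b", "winner"], inplace=True)

    def is_valid_vote(row):
        return (row["model_a"] in valid_models
                and row["model_b"] in valid_models
                and row["winner"] in ["model_a", "model_b", "tie"])

    df = df[df.apply(is_valid_vote, axis=1)]
    print(df)  # only the first two, fully valid rows survive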
rating_systems.py
CHANGED

@@ -6,6 +6,7 @@ import pandas as pd
 import numpy as np
 from sqlalchemy.orm import Session
 import pandas as pd
+from scipy.special import expit
 
 def get_matchups_models(df):
     n_rows = len(df)
@@ -41,19 +42,47 @@ def compute_elo(df, k=4.0, base=10.0, init_rating=1000.0, scale=400.0):
     return {model: ratings[idx] for idx, model in enumerate(models)}
 
 
-…
+
+def compute_bootstrap_elo(
+    df, num_round=100, k=4.0, base=10.0, init_rating=1000.0, scale=400.0
+):
+    matchups, outcomes, models = preprocess_for_elo(df)
+    sample_indices = np.random.randint(low=0, high=len(df), size=(len(df), num_round))
+    ratings = fit_vectorized_elo(
+        matchups, outcomes, sample_indices, len(models), k, base, init_rating, scale
+    )
+    df = pd.DataFrame(data=ratings, columns=models)
+    return df[df.median().sort_values(ascending=False).index]
+
+def fit_vectorized_elo(
+    matchups,
+    outcomes,
+    sample_indices,
+    num_models,
+    k=4.0,
+    base=10.0,
+    init_rating=1000.0,
+    scale=400.0,
+):
+    """Fit multiple sets of Elo ratings on different samples of the data at the same time."""
+    alpha = math.log(base) / scale
+    num_samples = sample_indices.shape[1]
+    ratings = np.zeros(shape=(num_samples, num_models), dtype=np.float64)
+    # Iterate over the rows of sample_indices; each column is an index into a match in the input arrays.
+    sample_range = np.arange(num_samples)
+    for matchup_indices in sample_indices:
+        model_a_indices = matchups[matchup_indices, 0]
+        model_b_indices = matchups[matchup_indices, 1]
+        model_a_ratings = ratings[sample_range, model_a_indices]
+        model_b_ratings = ratings[sample_range, model_b_indices]
+        sample_outcomes = outcomes[matchup_indices]
+        probs = expit(alpha * (model_a_ratings - model_b_ratings))
+        updates = k * (sample_outcomes - probs)
+        ratings[sample_range, model_a_indices] += updates
+        ratings[sample_range, model_b_indices] -= updates
+    return ratings + init_rating
+
+def get_median_elo_from_bootstrap(bootstrap_df):
+    median = dict(bootstrap_df.quantile(0.5))
+    median = {k: int(v + 0.5) for k, v in median.items()}
+    return median
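At its core, fit_vectorized_elo applies the standard online Elo update to every bootstrap sample at once. A standalone sketch of a single update for one match, assuming outcomes are encoded as 1.0 (model_a wins), 0.5 (tie) and 0.0 (model_b wins), as in the FastChat code the app links to; the ratings are illustrative:

    import math
    from scipy.special import expit

    k, base, scale = 4.0, 10.0, 400.0
    alpha = math.log(base) / scale

    rating_a, rating_b = 1000.0, 1000.0
    outcome = 1.0  # model_a won this match

    # Expected score for model_a given the current rating gap (0.5 for equal ratings).
    prob_a = expit(alpha * (rating_a - rating_b))
    update = k * (outcome - prob_a)

    rating_a += update
    rating_b -= update
    print(rating_a, rating_b)  # 1002.0 998.0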
requirements.txt
CHANGED

@@ -10,4 +10,5 @@ SQLAlchemy==2.0.36
 uvicorn==0.30.1
 py-spy
 pandas
-datasets
+datasets
+scipy