Spaces:

AtlaAI
/

judge-arena

Running

File size: 3,830 Bytes

from collections import defaultdict
from datetime import datetime, timezone
from typing import Dict, List

# Constants
DEFAULT_ELO = 1200  # Starting ELO for new models
K_FACTOR = 32  # Standard chess K-factor

def get_leaderboard(model_data: Dict, voting_data: List, show_preliminary=True):
    """Generate leaderboard data using votes from MongoDB."""
    # Initialize dictionaries for tracking
    ratings = defaultdict(lambda: DEFAULT_ELO)
    matches = defaultdict(int)

    # Process each vote
    for vote in voting_data:
        try:
            model_a = vote.get("model_a")
            model_b = vote.get("model_b")
            winner = vote.get("winner")

            # Skip if models aren't in current model_data
            if (
                not all([model_a, model_b, winner])
                or model_a not in model_data
                or model_b not in model_data
            ):
                continue

            # Update match counts
            matches[model_a] += 1
            matches[model_b] += 1

            # Calculate ELO changes
            elo_a = ratings[model_a]
            elo_b = ratings[model_b]

            # Expected scores
            expected_a = 1 / (1 + 10 ** ((elo_b - elo_a) / 400))
            expected_b = 1 - expected_a

            # Actual scores
            score_a = 1 if winner == "A" else 0 if winner == "B" else 0.5
            score_b = 1 - score_a

            # Update ratings
            ratings[model_a] += K_FACTOR * (score_a - expected_a)
            ratings[model_b] += K_FACTOR * (score_b - expected_b)

        except Exception as e:
            print(f"Error processing vote: {e}")
            continue

    # Generate leaderboard data
    leaderboard = []
    for model in model_data.keys():
        votes = matches[model]
        # Skip models with < 300 votes if show_preliminary is False
        if not show_preliminary and votes < 300:
            continue
            
        elo = ratings[model]
        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
        data = {
            "Model": model,
            "ELO Score": f"{int(elo)}",
            "95% CI": f"±{int(ci)}",
            "# Votes": votes,
            "Organization": model_data[model]["organization"],
            "License": model_data[model]["license"],
        }
        leaderboard.append(data)

    # Sort leaderboard by ELO score in descending order
    leaderboard.sort(key=lambda x: float(x["ELO Score"]), reverse=True)

    return leaderboard

def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
    """Get summary statistics for the leaderboard."""
    now = datetime.now(timezone.utc)
    total_votes = len(voting_data)
    total_models = len(model_data)
    # last_updated = now.strftime("%B %d, %Y at %H:%M:%S UTC")

    last_updated = now.replace(minute=0, second=0, microsecond=0).strftime(
        "%B %d, %Y at %H:00 UTC"
    )

    return f"""
### Leaderboard Stats
- **Total Models**: {total_models}
- **Total Votes**: {total_votes}
- **Last Updated**: {last_updated}
"""

def calculate_elo_change(rating_a: float, rating_b: float, winner: str) -> tuple[float, float]:
    """Calculate ELO rating changes for both players."""
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    expected_b = 1 - expected_a

    if winner == "A":
        score_a, score_b = 1, 0
    elif winner == "B":
        score_a, score_b = 0, 1
    else:  # Handle ties
        score_a, score_b = 0.5, 0.5

    change_a = K_FACTOR * (score_a - expected_a)
    change_b = K_FACTOR * (score_b - expected_b)

    return change_a, change_b

def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
    """Get current rankings of all models from leaderboard data."""
    return {entry["Model"]: idx + 1 for idx, entry in enumerate(leaderboard)}