open_pt_llm_leaderboard

Running on CPU Upgrade

File size: 8,546 Bytes

f2bc0a5
b1a1395
f2bc0a5
 
df66f6e
2a5f9fb
b1a1395
 
f2bc0a5
 
 
b1a1395
f2bc0a5
b1a1395
f2bc0a5
b1a1395
 
f2bc0a5
b1a1395
ec3a730
 
 
 
b1a1395
 
f2bc0a5
 
b1a1395
f2bc0a5
 
b1a1395
 
 
 
 
 
dbb8b5d
 
 
f2bc0a5
 
b1a1395
 
 
 
5639a81
 
b1a1395
 
 
 
 
 
 
 
 
f2bc0a5
b1a1395
ec3a730
f2bc0a5
b1a1395
 
f2bc0a5
 
 
b1a1395
f2bc0a5
 
 
 
 
 
b1a1395
 
 
f2bc0a5
 
 
 
 
b1a1395
 
f2bc0a5
 
 
 
 
b1a1395
f2bc0a5
 
 
 
 
 
 
 
 
 
 
 
 
 
b1a1395
f2bc0a5
 
c0fa950
f2bc0a5
 
 
 
b1a1395
 
 
f2bc0a5
b1a1395
f2bc0a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359d8a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e84464
 
 
 
359d8a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2bc0a5
 
 
2a5f9fb

import pandas as pd
import numpy as np
import plotly.express as px
from plotly.graph_objs import Figure

from src.leaderboard.filter_models import FLAGGED_MODELS
from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS
from src.leaderboard.read_evals import EvalResult



def create_scores_df(raw_data: list[EvalResult]) -> dict[str, pd.DataFrame]:
    """
    Build, for every metric, the history of record-breaking scores over time.

    Results are walked in ascending ``date`` order. Each time a model beats the
    best score seen so far for a metric, a ``model``/``date``/``score`` entry is
    recorded. When several records fall on the same date, only the latest
    (and therefore highest) one is kept for that date.

    Rows are skipped entirely when the model is no longer on the hub, is
    flagged (either on the row or through ``FLAGGED_MODELS``), or its status is
    not ``"FINISHED"``.

    :param raw_data: Evaluation results to scan. May be empty.
    :return: A dict mapping each name in ``BENCHMARK_COLS`` plus the average
             column name to a DataFrame with the columns ``model``, ``date``
             and ``score``.
    """
    # Build the frame with the EvalResult dataclass columns so that an empty
    # raw_data still produces the expected columns.
    results_df = pd.DataFrame(raw_data, columns=EvalResult.__dataclass_fields__.keys())

    # Step 1: Order the results chronologically. The dates are sorted as
    # stored; no datetime conversion is applied here.
    results_df.sort_values(by="date", inplace=True)

    # Eligibility does not depend on the task, so decide it once instead of
    # re-checking every row for every task.
    eligible_rows = [
        row
        for _, row in results_df.iterrows()
        if row["still_on_hub"]
        and not row["flagged"]
        and row["full_model"] not in FLAGGED_MODELS
        and row["status"] == "FINISHED"
    ]

    # Step 2: Initialize the scores dictionary
    scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]}

    # Step 3: For each task, walk the eligible rows and record every new maximum
    tasks = [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]
    for task in tasks:
        current_max = 0
        last_date = ""
        column = task.col_name
        for row in eligible_rows:
            current_model = row["full_model"]
            current_date = row["date"]
            results = row["results"]

            if task.benchmark == "Average":
                # The average is taken over whatever results the row has.
                current_score = np.mean(list(results.values()))
            elif task.benchmark in results:
                current_score = results[task.benchmark]
            else:
                # This model was not evaluated on this benchmark.
                continue

            if current_score > current_max:
                record = {"model": current_model, "date": current_date, "score": current_score}
                if current_date == last_date and len(scores[column]) > 0:
                    # Same date as the previous record: keep only the better one.
                    scores[column][-1] = record
                else:
                    scores[column].append(record)
                current_max = current_score
                last_date = current_date

    # Step 4: Return all record lists as DataFrames
    return {k: pd.DataFrame(v, columns=["model", "date", "score"]) for k, v in scores.items()}


def create_plot_df(scores_df: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """
    Stack the per-metric score frames into one long DataFrame for plotting.

    :param scores_df: A dict mapping column names to DataFrames. It must
                      contain an entry for every name in ``BENCHMARK_COLS``
                      and for the average column name; each frame needs a
                      ``date`` column.
    :return: A single DataFrame holding the rows of all those frames, with an
             added ``task`` column naming the frame each row came from, sorted
             by ``date`` and re-indexed from 0.
    """
    columns = BENCHMARK_COLS + [AutoEvalColumn.average.name]

    # assign() returns a new frame, so the caller's frames are left untouched.
    frames = [scores_df[col].assign(task=col) for col in columns]

    # Concatenate, then order chronologically with a fresh positional index.
    concat_df = pd.concat(frames, ignore_index=True)
    return concat_df.sort_values(by="date").reset_index(drop=True)


def create_metric_plot_obj(
    df: pd.DataFrame, metrics: list[str], title: str
) -> Figure:
    """
    Draw the score-over-time line chart for the selected metrics.

    One line (with markers) is drawn per metric, and a dotted horizontal line
    is added for every selected metric that has a non-None entry in
    ``HUMAN_BASELINE``. The y-axis is fixed to the range 0-100.

    :param df: DataFrame with at least the columns ``task``, ``date``,
               ``score`` and ``model``.
    :param metrics: Names of the metrics (values of the ``task`` column) to
                    include in the plot.
    :param title: Title of the plot.
    :return: The resulting Plotly figure.
    """
    # Keep only the rows belonging to the requested metrics
    selected = df[df["task"].isin(metrics)]

    # Lines shown in the hover box, joined with HTML line breaks
    hover_lines = [
        "Model Name: %{customdata[2]}",
        "Metric Name: %{customdata[0]}",
        "Date: %{x}",
        "Metric Value: %{y}",
    ]

    fig = px.line(
        selected,
        x="date",
        y="score",
        color="task",
        markers=True,
        custom_data=["task", "score", "model"],
        title=title,
    )
    fig.update_traces(hovertemplate="<br>".join(hover_lines))
    fig.update_layout(yaxis_range=[0, 100])

    # Colour plotly picked for each metric's line, keyed by trace name
    trace_colors = {trace.name: trace.line.color for trace in fig.data}

    for metric, value in HUMAN_BASELINE.items():
        # Only baselines for requested metrics that actually have a value
        if metric not in metrics or value is None:
            continue

        # Reuse the metric's line colour; fall back to blue if it has no trace
        color = trace_colors.get(metric, "blue")
        if metric == "HellaSwag":
            location = "top left"
        else:
            location = "bottom left"

        fig.add_hline(
            y=value,
            line_dash="dot",
            annotation_text=f"{metric} human baseline",
            annotation_position=location,
            annotation_font_size=10,
            annotation_font_color=color,
            line_color=color,
        )

    return fig

def create_lat_score_mem_plot_obj(leaderboard_df):
    """
    Build a scatter plot of evaluation time against average score.

    Rows whose dummy column is ``"baseline"`` or ``"human_baseline"`` are
    excluded. Marker size is a bucketed version of the params column and
    marker colour follows the architecture column. The x-axis is logarithmic.

    :param leaderboard_df: Leaderboard DataFrame containing the dummy, average,
                           params, architecture and eval-time columns named by
                           ``AutoEvalColumn``. It is not modified.
    :return: The resulting Plotly figure.
    """
    # (lower bound inclusive, upper bound exclusive, marker size)
    size_bands = (
        (0, 0.8, 0.5),
        (0.8, 2, 0.8),
        (2, 5, 1.5),
        (5, 10, 2.0),
        (10, 35, 3.0),
        (35, 60, 4.0),
        (60, 90, 6.0),
    )

    def _marker_size(params):
        # Map a params value to its marker size; values matching no band
        # (e.g. negative or NaN) are returned unchanged.
        for lower, upper, marker in size_bands:
            if lower <= params < upper:
                return marker
        if params >= 90:
            return 8.0
        return params

    average_col = AutoEvalColumn.average.name
    params_col = AutoEvalColumn.params.name
    architecture_col = AutoEvalColumn.architecture.name
    dummy_col = AutoEvalColumn.dummy.name

    copy_df = leaderboard_df.copy()
    is_baseline = copy_df[dummy_col].isin(["baseline", "human_baseline"])
    copy_df = copy_df[~is_baseline]

    # Columns exposed in the hover box, in display order
    hover_columns = [
        dummy_col,
        average_col,
        params_col,
        architecture_col,
        "Evaluation Time (min)"
    ]

    copy_df["LLM Average Score"] = copy_df[average_col]
    # NOTE(review): dividing by 60 suggests eval_time is stored in seconds - confirm
    copy_df["Evaluation Time (min)"] = copy_df[AutoEvalColumn.eval_time.name] / 60
    copy_df["size"] = copy_df[params_col].apply(_marker_size)

    fig = px.scatter(
        copy_df,
        x="Evaluation Time (min)",
        y="LLM Average Score",
        size="size",
        color=architecture_col,
        custom_data=hover_columns,
        color_discrete_sequence=px.colors.qualitative.Light24,
        log_x=True
    )

    hover_template = "<br>".join(
        f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(hover_columns)
    )
    fig.update_traces(hovertemplate=hover_template)

    title_spec = {
        "text": "Eval Time vs. Score vs. #Params",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
    fig.update_layout(
        title=title_spec,
        xaxis_title="Time To Evaluate (min)",
        yaxis_title="LLM Average Score",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )

    return fig

# Example Usage:
# Human baselines are read from the module-level HUMAN_BASELINE mapping, so they are not passed in.
# chart = create_metric_plot_obj(plot_df, ["ARC", "HellaSwag", "MMLU", "TruthfulQA"], "Graph Title")