|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__author__ = "Dmitry Ustalov" |
|
__license__ = "Apache 2.0" |
|
|
|
from collections.abc import Callable |
|
from typing import BinaryIO, cast |
|
|
|
import evalica |
|
import gradio as gr |
|
import networkx as nx |
|
import numpy as np |
|
import pandas as pd |
|
import plotly.express as px |
|
from evalica import Winner |
|
from plotly.graph_objects import Figure |
|
|
|
TOLERANCE, LIMIT = 1e-6, 100 |
|
|
|
|
|
def visualize(df_pairwise: pd.DataFrame) -> Figure: |
|
fig = px.imshow(df_pairwise, color_continuous_scale="RdBu", text_auto=".2f") |
|
|
|
fig.update_layout(xaxis_title="Loser", yaxis_title="Winner", xaxis_side="top") |
|
|
|
fig.update_traces(hovertemplate="Winner: %{y}<br>Loser: %{x}<br>Fraction of Wins: %{z}<extra></extra>") |
|
|
|
return fig |
|
|
|
|
|
def counting(xs: "pd.Series[str]", ys: "pd.Series[str]", |
|
ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": |
|
result = evalica.counting(xs, ys, ws, index=index) |
|
return result.scores |
|
|
|
|
|
def average_win_rate(xs: "pd.Series[str]", ys: "pd.Series[str]", |
|
ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": |
|
result = evalica.counting(xs, ys, ws, index=index) |
|
return result.scores |
|
|
|
|
|
def bradley_terry(xs: "pd.Series[str]", ys: "pd.Series[str]", |
|
ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": |
|
result = evalica.bradley_terry(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT) |
|
return result.scores |
|
|
|
|
|
def elo(xs: "pd.Series[str]", ys: "pd.Series[str]", |
|
ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": |
|
result = evalica.elo(xs, ys, ws, index=index) |
|
return result.scores |
|
|
|
|
|
def eigen(xs: "pd.Series[str]", ys: "pd.Series[str]", |
|
ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": |
|
result = evalica.eigen(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT) |
|
return result.scores |
|
|
|
|
|
def pagerank(xs: "pd.Series[str]", ys: "pd.Series[str]", |
|
ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": |
|
result = evalica.pagerank(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT) |
|
return result.scores |
|
|
|
|
|
def newman(xs: "pd.Series[str]", ys: "pd.Series[str]", |
|
ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]": |
|
result = evalica.newman(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT) |
|
return result.scores |
|
|
|
|
|
ALGORITHMS = { |
|
"Counting": counting, |
|
"Average Win Rate": average_win_rate, |
|
"Bradley-Terry (1952)": bradley_terry, |
|
"Elo (1960)": elo, |
|
"Eigenvector (1987)": eigen, |
|
"PageRank (1998)": pagerank, |
|
"Newman (2023)": newman, |
|
} |
|
|
|
|
|
def largest_strongly_connected_component(df_pairs: pd.DataFrame) -> set[str]: |
|
G = nx.from_pandas_edgelist(df_pairs, source="left", target="right", create_using=nx.DiGraph) |
|
H = nx.from_pandas_edgelist(df_pairs[df_pairs["winner"] == "tie"], source="right", target="left", |
|
create_using=nx.DiGraph) |
|
F = nx.compose(G, H) |
|
largest = max(nx.strongly_connected_components(F), key=len) |
|
return cast(set[str], largest) |
|
|
|
|
|
def estimate(df_pairs: pd.DataFrame, |
|
algorithm: Callable[[ |
|
"pd.Series[str]", "pd.Series[str]", "pd.Series[Winner]", dict[str, int]], |
|
"pd.Series[float]", |
|
], |
|
index: dict[str, int]) -> pd.DataFrame: |
|
scores = algorithm(df_pairs["left"], df_pairs["right"], df_pairs["winner"], index) |
|
|
|
df_result = pd.DataFrame(data={"score": scores}, index=index) |
|
df_result.index.name = "item" |
|
|
|
return df_result |
|
|
|
|
|
def bootstrap(df_pairs: pd.DataFrame, |
|
algorithm: Callable[[ |
|
"pd.Series[str]", "pd.Series[str]", "pd.Series[Winner]", dict[str, int]], |
|
"pd.Series[float]", |
|
], |
|
index: dict[str, int], |
|
rounds: int) -> pd.DataFrame: |
|
scores: list[pd.Series[float]] = [] |
|
|
|
for r in range(rounds): |
|
df_sample = df_pairs.sample(frac=1.0, replace=True, random_state=r) |
|
|
|
sample_scores = algorithm(df_sample["left"], df_sample["right"], df_sample["winner"], index) |
|
|
|
scores.append(sample_scores) |
|
|
|
df_bootstrap = pd.DataFrame(scores, columns=index) |
|
|
|
ratings = df_bootstrap.quantile(.5) |
|
|
|
ci = df_bootstrap.apply(lambda row: ( |
|
row.quantile(.025).item(), row.quantile(.975).item(), |
|
), axis=0, result_type="reduce") |
|
|
|
df_result = pd.DataFrame({"score": ratings, "ci": ci}) |
|
df_result.index.name = "item" |
|
|
|
return df_result |
|
|
|
|
|
def handler( |
|
file: BinaryIO, |
|
algorithm: str, |
|
filtered: bool, |
|
truncated: bool, |
|
rounds: int, |
|
) -> tuple[pd.DataFrame, Figure]: |
|
if file is None: |
|
raise gr.Error("File must be uploaded") |
|
|
|
if algorithm not in ALGORITHMS: |
|
raise gr.Error(f"Unknown algorithm: {algorithm}") |
|
|
|
try: |
|
df_pairs = pd.read_csv(file.name, dtype=str) |
|
except ValueError as e: |
|
raise gr.Error(f"Parsing error: {e}") from e |
|
|
|
if not pd.Series(["left", "right", "winner"]).isin(df_pairs.columns).all(): |
|
raise gr.Error("Columns must exist: left, right, winner") |
|
|
|
if not df_pairs["winner"].isin(pd.Series(["left", "right", "tie"])).all(): |
|
raise gr.Error("Allowed winner values: left, right, tie") |
|
|
|
df_pairs = df_pairs[["left", "right", "winner"]] |
|
df_pairs["winner"] = df_pairs["winner"].map( |
|
{"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw}, |
|
) |
|
|
|
df_pairs = df_pairs.dropna(axis=0) |
|
|
|
if filtered: |
|
largest = largest_strongly_connected_component(df_pairs) |
|
|
|
df_pairs = df_pairs.drop(df_pairs[~(df_pairs["left"].isin(largest) & df_pairs["right"].isin(largest))].index) |
|
|
|
*_, index = evalica.indexing(xs=df_pairs["left"], ys=df_pairs["right"]) |
|
|
|
if rounds: |
|
df_result = bootstrap(df_pairs, ALGORITHMS[algorithm], index, rounds) |
|
else: |
|
df_result = estimate(df_pairs, ALGORITHMS[algorithm], index) |
|
|
|
df_result["pairs"] = pd.Series(0, dtype=int, index=index).add( |
|
df_pairs.groupby("left")["left"].count(), fill_value=0, |
|
).add( |
|
df_pairs.groupby("right")["right"].count(), fill_value=0, |
|
).astype(int) |
|
|
|
df_result["rank"] = df_result["score"].rank(na_option="bottom", ascending=False).astype(int) |
|
|
|
df_result = df_result.fillna(-np.inf) |
|
df_result = df_result.sort_values(by=["rank", "score"], ascending=[True, False]) |
|
df_result = df_result.reset_index() |
|
|
|
if truncated: |
|
df_result = pd.concat((df_result.head(5), df_result.tail(5)), copy=False) |
|
df_result = df_result[~df_result.index.duplicated(keep="last")] |
|
|
|
pairwise = evalica.pairwise_scores(df_result["score"].to_numpy()) |
|
|
|
df_pairwise = pd.DataFrame(data=pairwise, index=df_result["item"], columns=df_result["item"]) |
|
|
|
fig = visualize(df_pairwise) |
|
|
|
if "ci" in df_result.columns: |
|
df_result["ci"] = df_result.apply( |
|
lambda row: f"({row['score'] - row['ci'][0]:.03f}; {row['ci'][1] - row['score']:.03f})", |
|
axis=1, |
|
) |
|
|
|
df_result["score"] = df_result["score"].apply(lambda x: f"{x:.03f}") |
|
|
|
return df_result, fig |
|
|
|
|
|
def main() -> None: |
|
iface = gr.Interface( |
|
fn=handler, |
|
inputs=[ |
|
gr.File( |
|
file_types=[".tsv", ".csv"], |
|
label="Comparisons", |
|
), |
|
gr.Dropdown( |
|
choices=cast(list[str], ALGORITHMS), |
|
value="Bradley-Terry (1952)", |
|
label="Algorithm", |
|
), |
|
gr.Checkbox( |
|
value=False, |
|
label="Largest SCC", |
|
info="Bradley-Terry, Eigenvector, and Newman algorithms require the comparison graph " |
|
"to be strongly-connected. " |
|
"This option keeps only the largest strongly-connected component (SCC) of the input graph. " |
|
"Some items might be missing as a result of this filtering.", |
|
), |
|
gr.Checkbox( |
|
value=False, |
|
label="Truncate Output", |
|
info="Perform the entire computation but output only five head and five tail items, " |
|
"avoiding overlap.", |
|
), |
|
gr.Number( |
|
value=0, |
|
minimum=0, |
|
maximum=10000, |
|
label="Bootstrap Rounds", |
|
info="Number of bootstrap rounds to perform for estimating the confidence interval.", |
|
), |
|
], |
|
outputs=[ |
|
gr.Dataframe( |
|
headers=["item", "score", "ci", "pairs", "rank"], |
|
label="Ranking", |
|
), |
|
gr.Plot( |
|
label="Pairwise Chances of Winning the Comparison", |
|
), |
|
], |
|
examples=[ |
|
["food.csv", "Counting", False, False, 0], |
|
["food.csv", "Bradley-Terry (1952)", False, False, 1000], |
|
["food.csv", "Eigenvector (1987)", False, False, 1000], |
|
["food.csv", "PageRank (1998)", False, False, 1000], |
|
["food.csv", "Newman (2023)", False, False, 1000], |
|
["llmfao.csv", "Average Win Rate", False, True, 100], |
|
["llmfao.csv", "Bradley-Terry (1952)", False, True, 100], |
|
["llmfao.csv", "Elo (1960)", False, True, 100], |
|
], |
|
title="Evalica: Turn Your Side-by-Side Comparisons into Ranking!", |
|
description=""" |
|
""".strip(), |
|
article=""" |
|
This easy-to-use tool transforms pairwise comparisons (*aka* side-by-side) to a meaningful ranking of items. |
|
|
|
As an input, it expects a comma-separated (CSV) file with a header containing the following columns: |
|
|
|
- `left`: the first compared item |
|
- `right`: the second compared item |
|
- `winner`: the label indicating the winning item |
|
|
|
Possible values for `winner` are `left`, `right`, or `tie`. The provided examples might be a good starting point. |
|
|
|
As the output, this tool provides a table with items, their estimated scores, and ranks. |
|
|
|
**More Evalica:** |
|
|
|
- Paper: TBD ([arXiv](https://arxiv.org/abs/2412.11314)) |
|
- GitHub: <https://github.com/dustalov/evalica> |
|
- PyPI: <https://pypi.org/project/evalica/> |
|
- conda-forge: <https://anaconda.org/conda-forge/evalica> |
|
- LLMFAO: <https://evalovernite.substack.com/p/llmfao-human-ranking> |
|
""".strip(), |
|
flagging_mode="never", |
|
) |
|
|
|
iface.launch() |
|
|
|
|
|
if __name__ == "__main__": |
|
main() |
|
|