dustalov committed on
Commit da54bd5 · verified · 1 Parent(s): 702877d

Compute CIs

Files changed (3)
  1. README.md +3 -3
  2. app.py +96 -32
  3. requirements.txt +1 -1
README.md CHANGED
@@ -1,17 +1,17 @@
 ---
-title: Pair2Rank
+title: Evalica
 emoji: 💞
 colorFrom: green
 colorTo: purple
 sdk: gradio
 python_version: 3.11
-sdk_version: 5.9.1
+sdk_version: 5.12.0
 app_file: app.py
 pinned: true
 license: apache-2.0
 ---
 
-# Pair2Rank
+# Evalica
 
 [Evalica](https://github.com/dustalov/evalica) is a library for pairwise comparisons as described in paper
 Reliable, Reproducible, and Really Fast Leaderboards with Evalica
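
For orientation, here is a minimal sketch of the pairwise-comparison workflow the README describes, restricted to calls that also appear in the app.py diff below (`evalica.elo`, the `Winner` enum, and the `.scores` attribute of the result); the three toy comparisons are illustrative, not taken from the repository.

```python
import evalica
import pandas as pd
from evalica import Winner

# Three toy head-to-head comparisons: "a" beats "b", "a" ties "c", "c" beats "b".
xs = pd.Series(["a", "a", "b"])                    # left-hand items
ys = pd.Series(["b", "c", "c"])                    # right-hand items
ws = pd.Series([Winner.X, Winner.Draw, Winner.Y])  # outcome of each pair

result = evalica.elo(xs, ys, ws)                   # same call the app makes
print(result.scores.sort_values(ascending=False))  # higher score = stronger item
```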
app.py CHANGED
@@ -17,6 +17,7 @@
 __author__ = "Dmitry Ustalov"
 __license__ = "Apache 2.0"
 
+from collections.abc import Callable
 from typing import BinaryIO, cast
 
 import evalica
@@ -42,45 +43,45 @@ def visualize(df_pairwise: pd.DataFrame) -> Figure:
 
 
 def counting(xs: "pd.Series[str]", ys: "pd.Series[str]",
-             ws: "pd.Series[Winner]") -> tuple["pd.Series[str]", dict[str, int]]:  # type: ignore[type-var]
-    result = evalica.counting(xs, ys, ws)
-    return result.scores, result.index
+             ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]":  # type: ignore[type-var]
+    result = evalica.counting(xs, ys, ws, index=index)
+    return result.scores
 
 
 def average_win_rate(xs: "pd.Series[str]", ys: "pd.Series[str]",
-                     ws: "pd.Series[Winner]") -> tuple["pd.Series[str]", dict[str, int]]:  # type: ignore[type-var]
-    result = evalica.counting(xs, ys, ws)
-    return result.scores, result.index
+                     ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]":  # type: ignore[type-var]
+    result = evalica.counting(xs, ys, ws, index=index)
+    return result.scores
 
 
 def bradley_terry(xs: "pd.Series[str]", ys: "pd.Series[str]",
-                  ws: "pd.Series[Winner]") -> tuple["pd.Series[str]", dict[str, int]]:  # type: ignore[type-var]
-    result = evalica.bradley_terry(xs, ys, ws, tolerance=TOLERANCE, limit=LIMIT)
-    return result.scores, result.index
+                  ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]":  # type: ignore[type-var]
+    result = evalica.bradley_terry(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT)
+    return result.scores
 
 
 def elo(xs: "pd.Series[str]", ys: "pd.Series[str]",
-        ws: "pd.Series[Winner]") -> tuple["pd.Series[str]", dict[str, int]]:  # type: ignore[type-var]
-    result = evalica.elo(xs, ys, ws)
-    return result.scores, result.index
+        ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]":  # type: ignore[type-var]
+    result = evalica.elo(xs, ys, ws, index=index)
+    return result.scores
 
 
 def eigen(xs: "pd.Series[str]", ys: "pd.Series[str]",
-          ws: "pd.Series[Winner]") -> tuple["pd.Series[str]", dict[str, int]]:  # type: ignore[type-var]
-    result = evalica.eigen(xs, ys, ws, tolerance=TOLERANCE, limit=LIMIT)
-    return result.scores, result.index
+          ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]":  # type: ignore[type-var]
+    result = evalica.eigen(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT)
+    return result.scores
 
 
 def pagerank(xs: "pd.Series[str]", ys: "pd.Series[str]",
-             ws: "pd.Series[Winner]") -> tuple["pd.Series[str]", dict[str, int]]:  # type: ignore[type-var]
-    result = evalica.pagerank(xs, ys, ws, tolerance=TOLERANCE, limit=LIMIT)
-    return result.scores, result.index
+             ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]":  # type: ignore[type-var]
+    result = evalica.pagerank(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT)
+    return result.scores
 
 
 def newman(xs: "pd.Series[str]", ys: "pd.Series[str]",
-           ws: "pd.Series[Winner]") -> tuple["pd.Series[str]", dict[str, int]]:  # type: ignore[type-var]
-    result = evalica.newman(xs, ys, ws, tolerance=TOLERANCE, limit=LIMIT)
-    return result.scores, result.index
+           ws: "pd.Series[Winner]", index: dict[str, int]) -> "pd.Series[float]":  # type: ignore[type-var]
+    result = evalica.newman(xs, ys, ws, index=index, tolerance=TOLERANCE, limit=LIMIT)
+    return result.scores
 
 
 ALGORITHMS = {
@@ -103,11 +104,56 @@ def largest_strongly_connected_component(df_pairs: pd.DataFrame) -> set[str]:
     return cast(set[str], largest)
 
 
+def estimate(df_pairs: pd.DataFrame,
+             algorithm: Callable[[  # type: ignore[type-var]
+                 "pd.Series[str]", "pd.Series[str]", "pd.Series[Winner]", dict[str, int]],
+                 "pd.Series[float]",
+             ],
+             index: dict[str, int]) -> pd.DataFrame:
+    scores = algorithm(df_pairs["left"], df_pairs["right"], df_pairs["winner"], index)
+
+    df_result = pd.DataFrame(data={"score": scores}, index=index)
+    df_result.index.name = "item"
+
+    return df_result
+
+
+def bootstrap(df_pairs: pd.DataFrame,
+              algorithm: Callable[[  # type: ignore[type-var]
+                  "pd.Series[str]", "pd.Series[str]", "pd.Series[Winner]", dict[str, int]],
+                  "pd.Series[float]",
+              ],
+              index: dict[str, int],
+              rounds: int) -> pd.DataFrame:
+    scores: list[pd.Series[float]] = []  # assuming model names are strings
+
+    for r in range(rounds):
+        df_sample = df_pairs.sample(frac=1.0, replace=True, random_state=r)
+
+        sample_scores = algorithm(df_sample["left"], df_sample["right"], df_sample["winner"], index)
+
+        scores.append(sample_scores)
+
+    df_bootstrap = pd.DataFrame(scores, columns=index)
+
+    ratings = df_bootstrap.quantile(.5)
+
+    ci = df_bootstrap.apply(lambda row: (
+        row.quantile(.025).item(), row.quantile(.975).item(),
+    ), axis=0, result_type="reduce")
+
+    df_result = pd.DataFrame({"score": ratings, "ci": ci})
+    df_result.index.name = "item"
+
+    return df_result
+
+
 def handler(
     file: BinaryIO,
     algorithm: str,
     filtered: bool,
     truncated: bool,
+    rounds: int,
 ) -> tuple[pd.DataFrame, Figure]:
     if file is None:
         raise gr.Error("File must be uploaded")
@@ -127,6 +173,9 @@ def handler(
         raise gr.Error("Allowed winner values: left, right, tie")
 
     df_pairs = df_pairs[["left", "right", "winner"]]
+    df_pairs["winner"] = df_pairs["winner"].map(
+        {"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw},
+    )
 
     df_pairs = df_pairs.dropna(axis=0)
 
@@ -135,13 +184,12 @@ def handler(
 
     df_pairs = df_pairs.drop(df_pairs[~(df_pairs["left"].isin(largest) & df_pairs["right"].isin(largest))].index)
 
-    xs, ys = df_pairs["left"], df_pairs["right"]
-    ws = df_pairs["winner"].map({"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw})
+    *_, index = evalica.indexing(xs=df_pairs["left"], ys=df_pairs["right"])
 
-    scores, index = ALGORITHMS[algorithm](xs, ys, ws)
-
-    df_result = pd.DataFrame(data={"score": scores}, index=index)
-    df_result.index.name = "item"
+    if rounds:
+        df_result = bootstrap(df_pairs, ALGORITHMS[algorithm], index, rounds)
+    else:
+        df_result = estimate(df_pairs, ALGORITHMS[algorithm], index)
 
     df_result["pairs"] = pd.Series(0, dtype=int, index=index).add(
         df_pairs.groupby("left")["left"].count(), fill_value=0,
@@ -165,6 +213,14 @@ def handler(
 
     fig = visualize(df_pairwise)
 
+    df_result["score"] = df_result["score"].apply(lambda x: f"{x:.03f}")
+
+    if "ci" in df_result.columns:
+        df_result["ci"] = df_result.apply(
+            lambda row: f"({row['score'] - row['ci'][0]:.03f}; {row['ci'][1] - row['score']:.03f})",
+            axis=1,
+        )
+
     return df_result, fig
 
 
@@ -195,10 +251,17 @@ def main() -> None:
                 info="Perform the entire computation but output only five head and five tail items, "
                      "avoiding overlap.",
             ),
+            gr.Number(
+                value=0,
+                minimum=0,
+                maximum=10000,
+                label="Bootstrap Rounds",
+                info="Number of bootstrap rounds to perform for estimating the confidence interval.",
+            ),
         ],
         outputs=[
             gr.Dataframe(
-                headers=["item", "score", "pairs", "rank"],
+                headers=["item", "score", "ci", "pairs", "rank"],
                 label="Ranking",
             ),
             gr.Plot(
@@ -215,9 +278,11 @@ def main() -> None:
             ["llmfao.csv", "Bradley-Terry (1952)", False, True],
             ["llmfao.csv", "Elo (1960)", False, True],
         ],
-        title="Pair2Rank: Turn Your Side-by-Side Comparisons into Ranking!",
+        title="Evalica: Turn Your Side-by-Side Comparisons into Ranking!",
         description="""
-This easy-to-use tool transforms pairwise comparisons (aka side-by-side) to a meaningful ranking of items.
+""".strip(),
+        article="""
+This easy-to-use tool transforms pairwise comparisons (*aka* side-by-side) to a meaningful ranking of items.
 
 As an input, it expects a comma-separated (CSV) file with a header containing the following columns:
 
@@ -228,8 +293,7 @@ As an input, it expects a comma-separated (CSV) file with a header containing th
 Possible values for `winner` are `left`, `right`, or `tie`. The provided examples might be a good starting point.
 
 As the output, this tool provides a table with items, their estimated scores, and ranks.
-""".strip(),
-        article="""
+
 **More Evalica:**
 
 - Paper: TBD ([arXiv](https://arxiv.org/abs/2412.11314))
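
The `bootstrap` helper added above implements a percentile bootstrap: resample the comparison table with replacement, re-score each resample, and report the median as the point estimate with the 2.5th and 97.5th percentiles as the confidence interval. The sketch below replays that procedure in a self-contained form; the toy table, the choice of `evalica.counting` as the scorer, and the 200 rounds are illustrative assumptions rather than part of the commit.

```python
import evalica
import pandas as pd
from evalica import Winner

# Toy comparison table in the left/right/winner format the app expects.
df_pairs = pd.DataFrame({
    "left":   ["a", "a", "b", "c", "a", "b"],
    "right":  ["b", "c", "c", "a", "b", "a"],
    "winner": ["left", "left", "right", "tie", "left", "right"],
})
df_pairs["winner"] = df_pairs["winner"].map(
    {"left": Winner.X, "right": Winner.Y, "tie": Winner.Draw},
)

ROUNDS = 200  # illustrative; the app exposes this as the "Bootstrap Rounds" input

scores = []
for r in range(ROUNDS):
    # Resample the comparisons with replacement and re-score the items.
    df_sample = df_pairs.sample(frac=1.0, replace=True, random_state=r)
    result = evalica.counting(df_sample["left"], df_sample["right"], df_sample["winner"])
    scores.append(result.scores)

df_bootstrap = pd.DataFrame(scores)  # one row per round, one column per item

# Median as the point estimate; 2.5th and 97.5th percentiles as the 95% interval.
summary = pd.DataFrame({
    "score": df_bootstrap.quantile(0.5),
    "lower": df_bootstrap.quantile(0.025),
    "upper": df_bootstrap.quantile(0.975),
})
summary.index.name = "item"
print(summary)
```

The handler then renders each interval as offsets below and above the score, formatted to three decimal places.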
requirements.txt CHANGED
@@ -1,3 +1,3 @@
-evalica
+evalica[gradio]
 networkx
 plotly