"""
Baseline: based on most-common answer
"""
import pandas as pd
import numpy as np
from tqdm import tqdm
from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks
import logging
from typing import List, Callable, Optional
from rouge_score import rouge_scorer as rs
from collections import Counter
import random
logger = logging.getLogger(__name__)
tol = 0.001

class MCARank:
    """
    Baseline method: rank models by agreement with the most common answer.
    """

    def __init__(
        self,
        MODELS: List,
        evaluator: Callable,
        true_ranking: Optional[List] = None,
        show_progress: Optional[bool] = False,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = evaluator
        self.true_ranking = true_ranking
        self.show_progress = show_progress

    def fit(self, df: pd.DataFrame, measure: Optional[str] = 'equality', p: float = 0):
        """
        df: DataFrame in which each row is a benchmark instance and each
            column holds one model's output for that instance.
        measure: how the most common answer is determined
            ('equality', 'noisy_equality' or 'rouge').
        p: noise level to apply (only used for 'noisy_equality').
        """
        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
        if measure == 'equality':
            # Select the most common answer per question
            mca = df.mode(axis=1).iloc[:, 0]
            # Count how often each model gave the most common answer
            wins = df.eq(mca, axis=0).astype(int)
            self.ranking = wins.sum().sort_values(ascending=False).index.to_list()
        elif measure == 'noisy_equality':
            # Most common answer per question
            mca = df.mode(axis=1).iloc[:, 0]
            # Flip an agreement indicator with probability p
            perturb = lambda x: not x if (random.random() <= p) else x

            def __noisy_equality(x, mca):
                wins = (x == mca).apply(perturb)
                return wins

            # axis='rows' applies the function column-by-column (one column per model)
            wins = df.apply(__noisy_equality, axis='rows', args=(mca,))
            self.ranking = wins.sum().sort_values(ascending=False).index.to_list()
        elif measure == 'rouge':
            MODELS = df.columns.to_list()
            SIZE = 256

            def __mca(x):
                """ Most common answer, as the top-SIZE bigrams across all outputs """
                cs = [rs._create_ngrams(x[m], n=2) for m in MODELS]
                c = sum(cs, Counter())
                return Counter(dict(c.most_common(SIZE)))

            def __score_mca(x):
                """ ROUGE score computed relative to the most common answer """
                res = {}
                for m in MODELS:
                    p_n = rs._create_ngrams(x[m], n=2)
                    res[m] = rs._score_ngrams(x.mca, p_n).fmeasure
                return pd.Series(res)

            df['mca'] = df.apply(__mca, axis=1)
            # Winning model based on the best ROUGE score for each question
            win_rates = df.apply(__score_mca, axis=1).idxmax(axis=1).value_counts()
            win_rate_rank = win_rates.index.tolist()
            # Append models with no wins at the bottom of the ranking
            no_wins = list(set(MODELS) - set(win_rate_rank))
            self.ranking = win_rate_rank + no_wins
        else:
            raise ValueError(f"Measure {measure} not understood.")
        logger.info(f"Estimated ranks (best to worst): {self.ranking}")
        if self.true_ranking is not None:
            logger.info(f"True ranking: {self.true_ranking}")
            logger.info(f"RBO measure: {self.measure()}")
        return self.ranking  # Best to worst

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report a metric comparing the estimated ranking to the true ranking.
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"Requested k={k} exceeds the true ranking length; MAP@k is effectively computed for k={len(self.true_ranking)}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")

    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') and (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
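

# Illustrative usage sketch only (not part of the module): the model names,
# the toy answers, and the "true" ranking below are made up for demonstration.
# The evaluator argument appears unused by the methods in this file, so None is
# passed here. Because of the relative imports, run this as a module, e.g.
# `python -m <package>.<this_module>`, rather than as a standalone script.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    DEMO_MODELS = ["model_a", "model_b", "model_c"]  # hypothetical model names
    demo_df = pd.DataFrame(
        {
            "model_a": ["4", "paris", "yes", "blue"],
            "model_b": ["4", "london", "yes", "blue"],
            "model_c": ["5", "paris", "no", "red"],
        }
    )

    ranker = MCARank(
        DEMO_MODELS,
        evaluator=None,               # not used by MCARank in this file
        true_ranking=DEMO_MODELS,     # assumed ground truth, for the demo only
    )
    estimated = ranker.fit(demo_df, measure="equality")
    print("Estimated ranking (best to worst):", estimated)
    print("RBO vs. assumed true ranking:", ranker.measure(metric="rbo"))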