"""
Baseline: based on most-common answer
"""

import pandas as pd
import numpy as np
from tqdm import tqdm
from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks
import logging
from typing import List, Callable, Optional
from rouge_score import rouge_scorer as rs
from collections import Counter
import random

logger = logging.getLogger(__name__)
tol = 0.001


class MCARank:
    """
    Baseline method: based on most common answer
    """

    def __init__(
        self,
        MODELS: List,
        evaluator: Callable,
        true_ranking: Optional[List] = None,
        show_progress: bool = False,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = evaluator
        self.true_ranking = true_ranking
        self.show_progress = show_progress


    def fit(self, df: pd.DataFrame, measure: str = 'equality', p: float = 0.0):
        """
        Estimate a ranking of the models from their benchmark outputs.

        df: DataFrame where each row is a benchmark instance and each column
            holds one model's output for that instance.
        measure: how agreement with the most common answer is computed;
            one of 'equality', 'noisy_equality', or 'rouge'.
        p: probability of flipping an agreement indicator (only used for
            'noisy_equality').
        """

        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."

        if measure == 'equality':

            # Select the most common answer per question
            mca = df.mode(axis=1).iloc[:, 0]

            # Count all the times each model answered the most common one
            wins = df.eq(mca, axis=0).astype(int)

            self.ranking = wins.sum().sort_values(ascending=False).index.to_list()
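            # Example: if a question's answers are {m1: "4", m2: "4", m3: "5"},
            # the MCA is "4"; m1 and m2 each record a win for that question, and
            # models are ranked by their total number of wins.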
        
        elif measure == 'noisy_equality':

            # Most common answer
            mca = df.mode(axis=1).iloc[:, 0]

            perturb = lambda x: not x if (random.random() <= p) else x

            def __noisy_equality(x, mca):
                # Agreement with the MCA, with each indicator flipped
                # independently with probability p
                return (x == mca).apply(perturb)

            # Apply per model, i.e. column-wise
            wins = df.apply(__noisy_equality, axis=0, args=(mca,))

            self.ranking = wins.sum().sort_values(ascending=False).index.to_list()
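            # With p = 0 this effectively reduces to the 'equality' measure; as p
            # approaches 0.5 the flipped indicators carry no signal and the
            # resulting ranking becomes essentially random.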

        elif measure == 'rouge':

            MODELS = df.columns.to_list()
            SIZE = 256

            def __mca(x):
                """ Most Common Answer: the SIZE most frequent bigrams pooled across all model outputs """

                cs = [rs._create_ngrams(x[m], n=2) for m in MODELS]
                c = sum(cs, Counter())
                return Counter(dict(c.most_common(SIZE)))

            def __score_mca(x):
                """ Rouge score computed relative to most-common-answer """

                res = {}
                for m in MODELS:
                    p_n = rs._create_ngrams(x[m], n=2)
                    res[m] = rs._score_ngrams(x.mca, p_n).fmeasure
                return pd.Series(res)
                
            # Work on a copy so the caller's DataFrame is not mutated
            df = df.copy()
            df['mca'] = df.apply(__mca, axis=1)

            # Winning model based on best ROUGE score for each question
            win_rates = df.apply(__score_mca, axis=1).idxmax(axis=1).value_counts()
            win_rate_rank = win_rates.index.tolist()

            # Include models with no wins at the bottom
            no_wins = list(set(MODELS) - set(win_rate_rank))

            self.ranking = win_rate_rank + no_wins
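            # Models are ranked by the number of questions they "win", i.e. where
            # they achieve the best ROUGE-2 F-measure against the pooled bigram MCA.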
        
        
        else:
            raise ValueError(f"Measure {measure} not understood.")


        logger.info(f"Estimated ranks (best to worst): {self.ranking}")
        logger.info(f"True ranking: {self.true_ranking}")
        logger.info(f"RBO measure: {self.measure()}")
        return self.ranking # Best to worst


    def measure(self, metric: str = 'rbo', k: int = 5, p: float = 0.95) -> float:
        """
        Compare the estimated ranking against the true ranking.

        metric: 'rbo' (rank-biased overlap) or 'mapk' (mean average precision at k).
        k: cutoff used for 'mapk'.
        p: persistence parameter used for 'rbo'.

        Requires `fit` to have been run and a true ranking to be available.
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")


    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') and (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
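

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the model names, toy answers, placeholder
# evaluator, and "true" ranking below are made up for demonstration; `fit`
# does not use the evaluator in this baseline. Run in the package context
# (e.g. `python -m <package>.<this module>`) so the relative imports resolve.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Toy benchmark: rows are questions, columns are model outputs
    toy = pd.DataFrame(
        {
            "model_a": ["4", "Paris", "blue"],
            "model_b": ["4", "Paris", "red"],
            "model_c": ["5", "Rome", "red"],
        }
    )

    ranker = MCARank(
        MODELS=["model_a", "model_b", "model_c"],
        evaluator=lambda x, y: x == y,  # placeholder; unused by `fit`
        true_ranking=["model_b", "model_a", "model_c"],  # hypothetical ground truth
    )

    print(ranker.fit(toy, measure="equality"))   # estimated ranking, best to worst
    print(ranker.measure(metric="rbo"))          # rank-biased overlap vs. the true ranking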