"""
Baseline: based on most-common answer
"""

import pandas as pd
import numpy as np
from tqdm import tqdm
from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks
import logging
from typing import List, Callable, Optional
from rouge_score import rouge_scorer as rs
from collections import Counter
import random

logger = logging.getLogger(__name__)
tol = 0.001


class MCARank:
    """
    Baseline method: based on most common answer
    """

    def __init__(
        self,
        MODELS: List,
        evaluator: Callable,
        true_ranking: Optional[List] = None,
        show_progress: bool = False,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = evaluator
        self.true_ranking = true_ranking
        self.show_progress = show_progress


    def fit(self, df: pd.DataFrame, measure: str = 'equality', p: float = 0.0):
        """
        Estimate a ranking of the models from their benchmark outputs.

        df: DataFrame where each row is a benchmark instance and each column
            holds one model's output for that instance.
        measure: how agreement with the most common answer is computed;
            one of 'equality', 'noisy_equality', or 'rouge'.
        p: probability of flipping an agreement indicator (only used for
            'noisy_equality').
        """

        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."

        if measure == 'equality':

            # Select the most common answer per question
            mca = df.mode(axis=1).iloc[:, 0]

            # Count all the times each model answered the most common one
            wins = df.eq(mca, axis=0).astype(int)

            self.ranking = wins.sum().sort_values(ascending=False).index.to_list()
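            # Example: if a question's answers are {m1: "4", m2: "4", m3: "5"},
            # the MCA is "4"; m1 and m2 each record a win for that question, and
            # models are ranked by their total number of wins.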
        
        elif measure == 'noisy_equality':

            # Most common answer
            mca = df.mode(axis=1).iloc[:, 0]

            perturb = lambda x: not x if (random.random() <= p) else x

            def __noisy_equality(x, mca):
                # Agreement with the MCA, with each indicator flipped
                # independently with probability p
                return (x == mca).apply(perturb)

            # Apply per model, i.e. column-wise
            wins = df.apply(__noisy_equality, axis=0, args=(mca,))

            self.ranking = wins.sum().sort_values(ascending=False).index.to_list()
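            # With p = 0 this effectively reduces to the 'equality' measure; as p
            # approaches 0.5 the flipped indicators carry no signal and the
            # resulting ranking becomes essentially random.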

        elif measure == 'rouge':

            MODELS = df.columns.to_list()
            SIZE = 256

            def __mca(x):
                """ Most Common Answer: the SIZE most frequent bigrams pooled across all model outputs """

                cs = [rs._create_ngrams(x[m], n=2) for m in MODELS]
                c = sum(cs, Counter())
                return Counter(dict(c.most_common(SIZE)))

            def __score_mca(x):
                """ Rouge score computed relative to most-common-answer """

                res = {}
                for m in MODELS:
                    p_n = rs._create_ngrams(x[m], n=2)
                    res[m] = rs._score_ngrams(x.mca, p_n).fmeasure
                return pd.Series(res)
                
            # Work on a copy so the caller's DataFrame is not mutated
            df = df.copy()
            df['mca'] = df.apply(__mca, axis=1)

            # Winning model based on best ROUGE score for each question
            win_rates = df.apply(__score_mca, axis=1).idxmax(axis=1).value_counts()
            win_rate_rank = win_rates.index.tolist()

            # Include models with no wins at the bottom
            no_wins = list(set(MODELS) - set(win_rate_rank))

            self.ranking = win_rate_rank + no_wins
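            # Models are ranked by the number of questions they "win", i.e. where
            # they achieve the best ROUGE-2 F-measure against the pooled bigram MCA.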
        
        
        else:
            raise ValueError(f"Measure {measure} not understood.")


        logger.info(f"Estimated ranks (best to worst): {self.ranking}")
        logger.info(f"True ranking: {self.true_ranking}")
        logger.info(f"RBO measure: {self.measure()}")
        return self.ranking # Best to worst


    def measure(self, metric: str = 'rbo', k: int = 5, p: float = 0.95) -> float:
        """
        Compare the estimated ranking against the true ranking.

        metric: 'rbo' (rank-biased overlap) or 'mapk' (mean average precision at k).
        k: cutoff used for 'mapk'.
        p: persistence parameter used for 'rbo'.

        Requires `fit` to have been run and a true ranking to be available.
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")


    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') and (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
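

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the model names, toy answers, placeholder
# evaluator, and "true" ranking below are made up for demonstration; `fit`
# does not use the evaluator in this baseline. Run in the package context
# (e.g. `python -m <package>.<this module>`) so the relative imports resolve.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Toy benchmark: rows are questions, columns are model outputs
    toy = pd.DataFrame(
        {
            "model_a": ["4", "Paris", "blue"],
            "model_b": ["4", "Paris", "red"],
            "model_c": ["5", "Rome", "red"],
        }
    )

    ranker = MCARank(
        MODELS=["model_a", "model_b", "model_c"],
        evaluator=lambda x, y: x == y,  # placeholder; unused by `fit`
        true_ranking=["model_b", "model_a", "model_c"],  # hypothetical ground truth
    )

    print(ranker.fit(toy, measure="equality"))   # estimated ranking, best to worst
    print(ranker.measure(metric="rbo"))          # rank-biased overlap vs. the true ranking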