import pandas as pd
import numpy as np
from rouge_score import rouge_scorer
from joblib import Parallel, delayed
from selfrank.algos.greedy import SelfRankGreedy
from selfrank.algos.iterative import SelfRank
from selfrank.algos.baseline import MCARank
from selfrank.algos.triplet import equality, rouge, noisy_equality
import matplotlib.pyplot as plt
from itertools import zip_longest
from uuid import uuid4
import csv
import os
from functools import partial
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def generate_data(
    max_acc, min_acc, nmodels, nanswers, nquestions
) -> tuple[pd.DataFrame, list]:
    """Generate synthetic model answers, with model accuracies (in percent) spread between min_acc and max_acc."""
    np.random.seed(42)

    # Spread model accuracies between max and min, so M1 is the most accurate model
    model_acc = np.linspace(max_acc, min_acc, nmodels)

    # Array to store the ground truth (column 0) and one answer column per model
    gt_and_model_ans = np.zeros((nquestions, nmodels + 1), dtype=int)

    # Create the ground-truth answers, i.e. the first column
    for i in range(nquestions):
        gt_and_model_ans[i][0] = np.random.randint(nanswers)

    for i in range(nmodels):
        # Number of questions this model answers correctly
        n_correct = np.ceil(model_acc[i] / 100 * nquestions).astype(int)
        offsets_to_match = np.random.permutation(nquestions)[:n_correct]

        for j in range(nquestions):
            if j in offsets_to_match:
                # Copy the ground-truth answer
                gt_and_model_ans[j][i + 1] = gt_and_model_ans[j][0]
            else:
                # Pick a wrong answer uniformly from the remaining choices
                lst_wo_gt = list(range(nanswers))
                lst_wo_gt.remove(gt_and_model_ans[j][0])
                gt_and_model_ans[j][i + 1] = lst_wo_gt[np.random.randint(nanswers - 1)]

    # Build the dataframe directly instead of round-tripping through a temporary CSV file
    fields = ["GT"] + [f"M{i + 1}" for i in range(nmodels)]
    df = pd.DataFrame(gt_and_model_ans, columns=fields)

    # Models are generated from most to least accurate, so the true ranking is M1, ..., Mn
    true_ranking = [f"M{i}" for i in range(1, nmodels + 1)]
    return df, true_ranking


def synth_executor(
    acc_range: tuple[float, float], nmodels, nanswers, nquestions, noise, method
) -> tuple[str, dict]:
    min_acc, max_acc = acc_range
    logger.info(
        f"Synth experiment: min_acc: {min_acc}, max_acc: {max_acc}, nmodels: {nmodels}, "
        f"nanswers: {nanswers}, nquestions: {nquestions}, noise: {noise}, method: {method}."
    )

    df, true_ranking = generate_data(max_acc, min_acc, nmodels, nanswers, nquestions)

    # Use exact equality as the pairwise comparison when there is no noise,
    # otherwise use a comparison whose outcome is perturbed with probability `noise`
    if noise == 0.0:
        comp = equality
    else:
        comp = partial(noisy_equality, p=noise)

    df = df.drop(columns=["GT"])
    MODELS = df.columns.tolist()

    if method == "Full":
        ranker = SelfRank(MODELS, comp, true_ranking)
        ranker.fit(df)
    elif method == "Greedy":
        ranker = SelfRankGreedy(MODELS, comp, true_ranking)
        ranker.fit(df)
    elif method == "MCA":
        ranker = MCARank(MODELS, comp, true_ranking)
        ranker.fit(df, measure="noisy_equality", p=noise)
    else:
        raise ValueError(f"{method} not understood.")

    # Outputs of interest
    out = {
        "true_ranking": true_ranking,
        "estimated_ranking": ranker.ranking,
        "rbo": ranker.measure(metric="rbo"),
        "map-1": ranker.measure(metric="mapk", k=1),
        "map-3": ranker.measure(metric="mapk", k=3),
        "map-5": ranker.measure(metric="mapk", k=5),
        "map-10": ranker.measure(metric="mapk", k=10),
    }

    eval_metrics = ( f"