import pandas as pd import numpy as np RESULT_FILE = 'evaluation_results.xlsx' metric_ud = { "Accuracy": 1, "Average Exact Match": 1, "Exact Match": 1, "F1 Score": 1, "AUC ROC": 1, "AUC PR": 1, "Precision": 1, "Recall": 1, "Equivalent": 1, "Bias": -1, "Demographic representation (race)": -1, "Demographic representation (gender)": -1, "Stereotypical associations (race, profession)": -1, "Stereotypical associations (gender, profession)": -1, "Toxicity": -1, "ROUGE-1": 1, "ROUGE-2": 1, "ROUGE-L": 1, "BLEU": 1, "SummaC": 1, "BERTScore": 1, "Coverage": 1, "Density": 1, "Compression": 1, "hLEPOR": 1, "Character Error Rate": -1, "Word Error Rate": -1, "Character Edit Distance": -1, "Word Edit Distance": -1, "Perplexity": -1, "Expected Calibration Error": -1, "acc@10": 1, "MRR@10 (Top 30)": 1, "NDCG@10 (Top 30)": 1, "MRR@10": 1, "NDCG@10": 1, } tasks = { "Information Retrieval": "informationretrieval", "Knowledge": "knowledge", "Language Modelling": "language-modelling", "Question Answering": "question-answering", "Reasoning": "reasoning", "Summarization": "summarization", "Text Classification": "text-classification", "Toxicity Detection": "toxicity-detection", "Translation": "translation", "Sentiment Analysis": "sentiment-analysis", } settings = { "Normal": "", "Few-shot Leanring": "fs", "Prompt Strategy 0": "pt0", "Prompt Strategy 1": "pt1", "Prompt Strategy 2": "pt2", "Chain-of-Thought": "cot", "Fairness": "fairness", "Robustness": "robustness", } task_w_settings = { "Information Retrieval": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"], "Knowledge": ["Normal", "Few-shot Leanring", "Robustness"], "Language Modelling": ["Normal", "Few-shot Leanring", "Fairness"], "Question Answering": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness", "Fairness"], "Reasoning": ["Few-shot Leanring", "Chain-of-Thought"], "Summarization": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness"], "Text Classification": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"], "Toxicity Detection": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"], "Translation": ["Few-shot Leanring", "Robustness"], "Sentiment Analysis": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"], } datasets = { "question-answering": { "xquad_xtreme": "xQUAD EXTREME", "mlqa": "MLQA", }, "summarization": { "vietnews": "VietNews", "wikilingua": "WikiLingua", }, "text-classification": { "vsmec": "VSMEC", "phoatis": "PhoATIS", }, "toxicity-detection": { "victsd": "UIT-ViCTSD", "vihsd": "UIT-ViHSD", }, "translation": { "phomt-envi": "PhoMT English-Vietnamese", "phomt-vien": "PhoMT Vietnamese-English", "opus100-envi": "OPUS-100 English-Vietnamese", "opus100-vien": "OPUS-100 Vietnamese-English", }, "sentiment-analysis": { "vlsp": "VLSP 2016", "vsfc": "UIT-VSFC", }, "informationretrieval": { "mmarco": "mMARCO", "mrobust": "mRobust", }, "knowledge": { "zaloe2e": "ZaloE2E", "vimmrc": "ViMMRC", }, "language-modelling": { "mlqa-mlm": "MLQA", "vsec": "VSEC", }, "reasoning": { "srnatural-azr": "Synthetic Reasoning (Natural) - Azure", "srnatural-gcp": "Synthetic Reasoning (Natural) - Google Cloud", "srabstract-azr": "Synthetic Reasoning (Abstract Symbol)- Azure", "srabstract-gcp": "Synthetic Reasoning (Abstract Symbol)- Google Cloud", "math-azr-Algebra": "MATH Level 1 (Algebra) - Azure", "math-azr-Counting&Probability": "MATH Level 1 (Counting&Probability) - Azure", "math-azr-Geometry": "MATH Level 1 (Geometry) - Azure", "math-azr-IntermediateAlgebra": "MATH Level 1 (IntermediateAlgebra) - Azure", "math-azr-NumberTheory": "MATH Level 1 (NumberTheory) - Azure", "math-azr-Prealgebra": "MATH Level 1 (Prealgebra) - Azure", "math-azr-Precalculus": "MATH Level 1 (Precalculus) - Azure", "math-gcp-Algebra": "MATH Level 1 (Algebra) - Google Cloud", "math-gcp-Counting&Probability": "MATH Level 1 (Counting&Probability) - Google Cloud", "math-gcp-Geometry": "MATH Level 1 (Geometry) - Google Cloud", "math-gcp-IntermediateAlgebra": "MATH Level 1 (IntermediateAlgebra) - Google Cloud", "math-gcp-NumberTheory": "MATH Level 1 (NumberTheory) - Google Cloud", "math-gcp-Prealgebra": "MATH Level 1 (Prealgebra) - Google Cloud", "math-gcp-Precalculus": "MATH Level 1 (Precalculus) - Google Cloud", }, } def load_data(file_name): """ Load the data from the csv file """ data = pd.read_excel( file_name, sheet_name=None, header=None ) results = {} for task_name, task_id in tasks.items(): for setting_name in task_w_settings[task_name]: setting_id = settings[setting_name] sheet_name = f"{task_id}-{setting_id}" if setting_id else task_id sheet_data = data[sheet_name] results_by_dataset = {} # Find the rows that contain the dataset ids # dataset_ids = datasets[task_id].keys() row_ids = [] for i, row in sheet_data.iterrows(): if "Models/" in row[0]: row_ids.append(i) row_ids.append(len(sheet_data)) # Get the data for each dataset for i in range(len(row_ids) - 1): dataset_id = sheet_data.iloc[row_ids[i]][0].split('/')[-1] dataset_name = datasets[task_id][dataset_id] dataset_data = sheet_data.iloc[row_ids[i] + 1: row_ids[i + 1]] dataset_data = dataset_data.fillna(f'-') header = sheet_data.iloc[0] header[0] = "Models" # Create new pandas dataframe dataset_data = pd.DataFrame( dataset_data.values, columns=header) # column_dtypes = {'Models': 'string'} # for column in header[1:]: # column_dtypes[column] = 'float' # dataset_data = dataset_data.astype(column_dtypes) results_by_dataset[dataset_name] = dataset_data results[f"{task_id}-{setting_id}"] = results_by_dataset return results resutls = load_data(RESULT_FILE)