martinakaduc committed on
Commit
4af8ee7
1 Parent(s): ef06837

Update code

Browse files
Logo BK.png ADDED
Logo Stanford.png ADDED
Logo VNU-HCM.png ADDED
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from data_loader import (
3
+ resutls,
4
+ metric_ud,
5
+ tasks,
6
+ settings,
7
+ task_w_settings,
8
+ datasets
9
+ )
10
+
11
if __name__ == "__main__":
    # Basic page chrome for the dashboard.
    st.set_page_config(
        page_title="URA-LLaMa Evaluation Dashboard",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Institutional logos, followed by the page header and blurb.
    logos = ["Logo BK.png", "Logo VNU-HCM.png", "Logo Stanford.png"]
    st.image(logos, width=120)
    st.title("URA-LLaMa Evaluation Dashboard")
    st.write(
        "This dashboard is used to visualize the results of the URA-LLaMa evaluation.")

    # Sidebar selectors, narrowing task -> setting -> dataset.
    task_name = st.sidebar.selectbox("Select Task", list(tasks.keys()))
    setting_name = st.sidebar.selectbox(
        "Select Setting", task_w_settings[task_name])

    task_key = tasks[task_name]
    dataset_name = st.sidebar.selectbox(
        "Select Dataset", list(datasets[task_key].values()))

    # Result sheets are pre-loaded by data_loader and keyed by
    # "<task_id>-<setting_id>", then by human-readable dataset name.
    sheet_key = f"{task_key}-{settings[setting_name]}"
    chosen_sheet = resutls[sheet_key][dataset_name]

    # Render the selected result table (a pandas DataFrame).
    st.dataframe(chosen_sheet)
data_loader.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
# Workbook produced by the evaluation pipeline; one sheet per task/setting.
RESULT_FILE = 'evaluation_results.xlsx'

# Direction of each metric: 1 means higher is better, -1 means lower is better.
metric_ud = {
    "Accuracy": 1,
    "Average Exact Match": 1,
    "Exact Match": 1,
    "F1 Score": 1,
    "AUC ROC": 1,
    "AUC PR": 1,
    "Precision": 1,
    "Recall": 1,
    "Equivalent": 1,
    "Bias": -1,
    "Toxicity": -1,
    "ROUGE-1": 1,
    "ROUGE-2": 1,
    "ROUGE-L": 1,
    "BLEU": 1,
    "SummaC": 1,
    "BERTScore": 1,
    "Coverage": 1,
    "Density": 1,
    "Compression": 1,
    "hLEPOR": 1,
    "Character Error Rate": -1,
    "Word Error Rate": -1,
    "Character Edit Distance": -1,
    "Word Edit Distance": -1,
    "Perplexity": -1,
    "Expected Calibration Error": -1,
    "acc@10": 1,
    "MRR@10 (Top 30)": 1,
    "NDCG@10 (Top 30)": 1,
    "MRR@10": 1,
    "NDCG@10": 1,
}

# Display name -> task id used in workbook sheet names and `datasets` keys.
tasks = {
    "Information Retrieval": "informationretrieval",
    "Knowledge": "knowledge",
    "Language Modelling": "language-modelling",
    "Question Answering": "question-answering",
    "Reasoning": "reasoning",
    "Summarization": "summarization",
    "Text Classification": "text-classification",
    "Toxicity Detection": "toxicity-detection",
    "Translation": "translation",
    "Sentiment Analysis": "sentiment-analysis",
}

# Display name -> setting id suffix of the sheet name ("" means no suffix).
settings = {
    "Normal": "",
    "Few-shot Learning": "fs",
    # Deprecated misspelled alias kept for backward compatibility with any
    # caller still using the old key.
    "Few-shot Leanring": "fs",
    "Prompt Strategy 0": "pt0",
    "Prompt Strategy 1": "pt1",
    "Prompt Strategy 2": "pt2",
    "Chain-of-Thought": "cot",
    "Fairness": "fairness",
    "Robustness": "robustness",
}

# Settings actually evaluated for each task (display names from `settings`).
task_w_settings = {
    "Information Retrieval": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Knowledge": ["Normal", "Few-shot Learning", "Robustness"],
    "Language Modelling": ["Normal", "Few-shot Learning", "Fairness"],
    "Question Answering": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness", "Fairness"],
    "Reasoning": ["Few-shot Learning", "Chain-of-Thought"],
    "Summarization": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness"],
    "Text Classification": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Toxicity Detection": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Translation": ["Few-shot Learning", "Robustness"],
    "Sentiment Analysis": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
}

# Per-task: dataset id (as it appears after "Models/" in the sheets) ->
# human-readable dataset name shown in the dashboard.
datasets = {
    "question-answering": {
        "xquad_xtreme": "xQUAD EXTREME",
        "mlqa": "MLQA",
    },
    "summarization": {
        "vietnews": "VietNews",
        "wikilingua": "WikiLingua",
    },
    "text-classification": {
        "vsmec": "VSMEC",
        "phoatis": "PhoATIS",
    },
    "toxicity-detection": {
        "victsd": "UIT-ViCTSD",
        "vihsd": "UIT-ViHSD",
    },
    "translation": {
        "phomt-envi": "PhoMT English-Vietnamese",
        "phomt-vien": "PhoMT Vietnamese-English",
        "opus100-envi": "OPUS-100 English-Vietnamese",
        "opus100-vien": "OPUS-100 Vietnamese-English",
    },
    "sentiment-analysis": {
        "vlsp": "VLSP 2016",
        "vsfc": "UIT-VSFC",
    },
    "informationretrieval": {
        "mmarco": "mMARCO",
        "mrobust": "mRobust",
    },
    "knowledge": {
        "zaloe2e": "ZaloE2E",
        "vimmrc": "ViMMRC",
    },
    "language-modelling": {
        "mlqa": "MLQA",
        "vsec": "VSEC",
    },
    "reasoning": {
        "srnatural-azr": "Synthetic Reasoning (Natural) - Azure",
        "srnatural-gcp": "Synthetic Reasoning (Natural) - Google Cloud",
        "srabstract-azr": "Synthetic Reasoning (Abstract Symbol) - Azure",
        "srabstract-gcp": "Synthetic Reasoning (Abstract Symbol) - Google Cloud",
        "math-azr": "MATH Level 1 - Azure",
        "math-gcp": "MATH Level 1 - Google Cloud",
    },
}
126
+
127
+
128
def load_data(file_name):
    """
    Load every result sheet from the Excel workbook at *file_name*.

    Returns a dict mapping "<task_id>-<setting_id>" to a dict that maps a
    human-readable dataset name to a pandas DataFrame of scores for that
    dataset (one row per model, columns taken from the sheet's first row).
    """
    # sheet_name=None loads every sheet into a {name: DataFrame} dict;
    # header=None keeps row 0 as data so the "Models/<dataset>" marker rows
    # can be located below.
    data = pd.read_excel(
        file_name,
        sheet_name=None,
        header=None
    )
    results = {}
    for task_name, task_id in tasks.items():
        for setting_name in task_w_settings[task_name]:
            setting_id = settings[setting_name]
            # The "Normal" setting has an empty id; its sheet is named after
            # the task alone.
            sheet_name = f"{task_id}-{setting_id}" if setting_id else task_id
            sheet_data = data[sheet_name]
            results_by_dataset = {}

            # Rows whose first cell contains "Models/<dataset_id>" delimit
            # the per-dataset sections of the sheet.  The isinstance guard
            # skips NaN (float) cells, which would make `in` raise TypeError.
            row_ids = [
                i for i, row in sheet_data.iterrows()
                if isinstance(row[0], str) and "Models/" in row[0]
            ]
            row_ids.append(len(sheet_data))  # sentinel end for the last section

            # The header row is shared by every section.  Copy it before
            # renaming the first cell so the mutation cannot write back into
            # sheet_data (chained-assignment hazard); it is loop-invariant,
            # so compute it once.
            header = sheet_data.iloc[0].copy()
            header[0] = "Models"

            # Slice out each dataset section (marker row excluded).
            for start, end in zip(row_ids[:-1], row_ids[1:]):
                dataset_id = sheet_data.iloc[start][0].split('/')[-1]
                dataset_name = datasets[task_id][dataset_id]

                dataset_data = sheet_data.iloc[start + 1: end].fillna('')

                # Re-wrap with the named header so the dashboard shows
                # proper column labels.
                results_by_dataset[dataset_name] = pd.DataFrame(
                    dataset_data.values, columns=header)

            results[f"{task_id}-{setting_id}"] = results_by_dataset

    return results
173
+
174
+
175
# Load every sheet once at import time so the Streamlit app can index the
# results directly.
resutls = load_data(RESULT_FILE)
# Correctly spelled alias; `resutls` (sic) is kept because existing callers
# import it by that name.
results = resutls
evaluation_results.xlsx ADDED
Binary file (141 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openpyxl