martinakaduc committed on
Commit
4af8ee7
1 Parent(s): ef06837

Update code

Browse files
Logo BK.png ADDED
Logo Stanford.png ADDED
Logo VNU-HCM.png ADDED
app.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from data_loader import (
3
+ resutls,
4
+ metric_ud,
5
+ tasks,
6
+ settings,
7
+ task_w_settings,
8
+ datasets
9
+ )
10
+
11
if __name__ == "__main__":
    # Basic page chrome for the dashboard.
    st.set_page_config(
        page_title="URA-LLaMa Evaluation Dashboard",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Institutional logos, followed by the page header and blurb.
    logos = ["Logo BK.png", "Logo VNU-HCM.png", "Logo Stanford.png"]
    st.image(logos, width=120)
    st.title("URA-LLaMa Evaluation Dashboard")
    st.write(
        "This dashboard is used to visualize the results of the URA-LLaMa evaluation.")

    # Sidebar selectors, narrowing task -> setting -> dataset.
    task_name = st.sidebar.selectbox("Select Task", list(tasks.keys()))
    setting_name = st.sidebar.selectbox(
        "Select Setting", task_w_settings[task_name])

    task_key = tasks[task_name]
    dataset_name = st.sidebar.selectbox(
        "Select Dataset", list(datasets[task_key].values()))

    # Result sheets are pre-loaded by data_loader and keyed by
    # "<task_id>-<setting_id>", then by human-readable dataset name.
    sheet_key = f"{task_key}-{settings[setting_name]}"
    chosen_sheet = resutls[sheet_key][dataset_name]

    # Render the selected result table (a pandas DataFrame).
    st.dataframe(chosen_sheet)
data_loader.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
# Workbook produced by the evaluation pipeline; one sheet per task/setting.
RESULT_FILE = 'evaluation_results.xlsx'

# Direction of each metric: 1 means higher is better, -1 means lower is better.
metric_ud = {
    "Accuracy": 1,
    "Average Exact Match": 1,
    "Exact Match": 1,
    "F1 Score": 1,
    "AUC ROC": 1,
    "AUC PR": 1,
    "Precision": 1,
    "Recall": 1,
    "Equivalent": 1,
    "Bias": -1,
    "Toxicity": -1,
    "ROUGE-1": 1,
    "ROUGE-2": 1,
    "ROUGE-L": 1,
    "BLEU": 1,
    "SummaC": 1,
    "BERTScore": 1,
    "Coverage": 1,
    "Density": 1,
    "Compression": 1,
    "hLEPOR": 1,
    "Character Error Rate": -1,
    "Word Error Rate": -1,
    "Character Edit Distance": -1,
    "Word Edit Distance": -1,
    "Perplexity": -1,
    "Expected Calibration Error": -1,
    "acc@10": 1,
    "MRR@10 (Top 30)": 1,
    "NDCG@10 (Top 30)": 1,
    "MRR@10": 1,
    "NDCG@10": 1,
}

# Display name -> task id used in workbook sheet names and `datasets` keys.
tasks = {
    "Information Retrieval": "informationretrieval",
    "Knowledge": "knowledge",
    "Language Modelling": "language-modelling",
    "Question Answering": "question-answering",
    "Reasoning": "reasoning",
    "Summarization": "summarization",
    "Text Classification": "text-classification",
    "Toxicity Detection": "toxicity-detection",
    "Translation": "translation",
    "Sentiment Analysis": "sentiment-analysis",
}

# Display name -> setting id suffix of the sheet name ("" means no suffix).
settings = {
    "Normal": "",
    "Few-shot Learning": "fs",
    # Deprecated misspelled alias kept for backward compatibility with any
    # caller still using the old key.
    "Few-shot Leanring": "fs",
    "Prompt Strategy 0": "pt0",
    "Prompt Strategy 1": "pt1",
    "Prompt Strategy 2": "pt2",
    "Chain-of-Thought": "cot",
    "Fairness": "fairness",
    "Robustness": "robustness",
}

# Settings actually evaluated for each task (display names from `settings`).
task_w_settings = {
    "Information Retrieval": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Knowledge": ["Normal", "Few-shot Learning", "Robustness"],
    "Language Modelling": ["Normal", "Few-shot Learning", "Fairness"],
    "Question Answering": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness", "Fairness"],
    "Reasoning": ["Few-shot Learning", "Chain-of-Thought"],
    "Summarization": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness"],
    "Text Classification": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Toxicity Detection": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Translation": ["Few-shot Learning", "Robustness"],
    "Sentiment Analysis": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
}

# Per-task: dataset id (as it appears after "Models/" in the sheets) ->
# human-readable dataset name shown in the dashboard.
datasets = {
    "question-answering": {
        "xquad_xtreme": "xQUAD EXTREME",
        "mlqa": "MLQA",
    },
    "summarization": {
        "vietnews": "VietNews",
        "wikilingua": "WikiLingua",
    },
    "text-classification": {
        "vsmec": "VSMEC",
        "phoatis": "PhoATIS",
    },
    "toxicity-detection": {
        "victsd": "UIT-ViCTSD",
        "vihsd": "UIT-ViHSD",
    },
    "translation": {
        "phomt-envi": "PhoMT English-Vietnamese",
        "phomt-vien": "PhoMT Vietnamese-English",
        "opus100-envi": "OPUS-100 English-Vietnamese",
        "opus100-vien": "OPUS-100 Vietnamese-English",
    },
    "sentiment-analysis": {
        "vlsp": "VLSP 2016",
        "vsfc": "UIT-VSFC",
    },
    "informationretrieval": {
        "mmarco": "mMARCO",
        "mrobust": "mRobust",
    },
    "knowledge": {
        "zaloe2e": "ZaloE2E",
        "vimmrc": "ViMMRC",
    },
    "language-modelling": {
        "mlqa": "MLQA",
        "vsec": "VSEC",
    },
    "reasoning": {
        "srnatural-azr": "Synthetic Reasoning (Natural) - Azure",
        "srnatural-gcp": "Synthetic Reasoning (Natural) - Google Cloud",
        "srabstract-azr": "Synthetic Reasoning (Abstract Symbol) - Azure",
        "srabstract-gcp": "Synthetic Reasoning (Abstract Symbol) - Google Cloud",
        "math-azr": "MATH Level 1 - Azure",
        "math-gcp": "MATH Level 1 - Google Cloud",
    },
}
126
+
127
+
128
def load_data(file_name):
    """
    Load every result sheet from the Excel workbook at *file_name*.

    Returns a dict mapping "<task_id>-<setting_id>" to a dict that maps a
    human-readable dataset name to a pandas DataFrame of scores for that
    dataset (one row per model, columns taken from the sheet's first row).
    """
    # sheet_name=None loads every sheet into a {name: DataFrame} dict;
    # header=None keeps row 0 as data so the "Models/<dataset>" marker rows
    # can be located below.
    data = pd.read_excel(
        file_name,
        sheet_name=None,
        header=None
    )
    results = {}
    for task_name, task_id in tasks.items():
        for setting_name in task_w_settings[task_name]:
            setting_id = settings[setting_name]
            # The "Normal" setting has an empty id; its sheet is named after
            # the task alone.
            sheet_name = f"{task_id}-{setting_id}" if setting_id else task_id
            sheet_data = data[sheet_name]
            results_by_dataset = {}

            # Rows whose first cell contains "Models/<dataset_id>" delimit
            # the per-dataset sections of the sheet.  The isinstance guard
            # skips NaN (float) cells, which would make `in` raise TypeError.
            row_ids = [
                i for i, row in sheet_data.iterrows()
                if isinstance(row[0], str) and "Models/" in row[0]
            ]
            row_ids.append(len(sheet_data))  # sentinel end for the last section

            # The header row is shared by every section.  Copy it before
            # renaming the first cell so the mutation cannot write back into
            # sheet_data (chained-assignment hazard); it is loop-invariant,
            # so compute it once.
            header = sheet_data.iloc[0].copy()
            header[0] = "Models"

            # Slice out each dataset section (marker row excluded).
            for start, end in zip(row_ids[:-1], row_ids[1:]):
                dataset_id = sheet_data.iloc[start][0].split('/')[-1]
                dataset_name = datasets[task_id][dataset_id]

                dataset_data = sheet_data.iloc[start + 1: end].fillna('')

                # Re-wrap with the named header so the dashboard shows
                # proper column labels.
                results_by_dataset[dataset_name] = pd.DataFrame(
                    dataset_data.values, columns=header)

            results[f"{task_id}-{setting_id}"] = results_by_dataset

    return results
173
+
174
+
175
# Load every sheet once at import time so the Streamlit app can index the
# results directly.
resutls = load_data(RESULT_FILE)
# Correctly spelled alias; `resutls` (sic) is kept because existing callers
# import it by that name.
results = resutls
evaluation_results.xlsx ADDED
Binary file (141 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ openpyxl