Spaces:
Running
Running
Ranks
Browse files- app.py +128 -31
- compare_significance.py +231 -0
- model_compare.py +62 -0
- requirements.txt +4 -1
app.py
CHANGED
@@ -18,13 +18,14 @@ import gradio as gr
|
|
18 |
|
19 |
from huggingface_hub import HfApi, snapshot_download
|
20 |
|
|
|
|
|
21 |
|
22 |
JSON_DATASET_DIR = Path("../json_dataset")
|
23 |
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
24 |
|
25 |
JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"
|
26 |
|
27 |
-
|
28 |
api = HfApi()
|
29 |
|
30 |
ORG= "CZLC"
|
@@ -38,28 +39,81 @@ DATASET_VERSIONS = ['dev-set-1', 'dev-set-2']
|
|
38 |
|
39 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
40 |
|
41 |
-
|
42 |
-
|
43 |
class LeaderboardServer:
|
44 |
def __init__(self, server_address):
|
45 |
self.server_address = server_address
|
46 |
self.repo_type = "dataset"
|
47 |
-
self.local_leaderboard = snapshot_download(self.server_address,repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
|
48 |
-
|
|
|
49 |
def on_submit(self):
|
50 |
self.local_leaderboard = snapshot_download(self.server_address,repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
|
51 |
|
52 |
def get_leaderboard(self):
|
53 |
results = []
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
for submission in glob.glob(os.path.join(self.local_leaderboard, "data") + "/*.json"):
|
56 |
data = json.load(open(submission))
|
57 |
submission_id = data["metadata"]["model_description"]
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
local_results["submission_id"] = submission_id
|
60 |
results.append(local_results)
|
61 |
dataframe = pd.DataFrame.from_records(results)
|
|
|
|
|
|
|
62 |
return dataframe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
def save_json(self,file, submission_name) -> None:
|
65 |
filename = os.path.basename(file)
|
@@ -72,11 +126,7 @@ class LeaderboardServer:
|
|
72 |
)
|
73 |
|
74 |
|
75 |
-
|
76 |
-
|
77 |
leaderboard_server = LeaderboardServer(REPO)
|
78 |
-
|
79 |
-
|
80 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
81 |
|
82 |
|
@@ -86,6 +136,8 @@ MAX_SUBMISSIONS_PER_24H = 2
|
|
86 |
# CHALLENGE_NAME = 'NOTSOFAR1'
|
87 |
|
88 |
|
|
|
|
|
89 |
# if __name__ == '__main__':
|
90 |
with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
|
91 |
app_state = gr.State({})
|
@@ -136,9 +188,9 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility
|
|
136 |
return gr.Tabs(selected=first_tab_name), populate_leaderboard(first_tab_name, None)
|
137 |
|
138 |
|
139 |
-
with gr.Tab('
|
140 |
-
with gr.Row():
|
141 |
-
|
142 |
# with gr.Row():
|
143 |
# with gr.Column():
|
144 |
# dataset_version_drop = gr.Dropdown(choices=DATASET_VERSIONS, multiselect=False,
|
@@ -150,17 +202,35 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility
|
|
150 |
# gr.Markdown('') # Empty column for spacing
|
151 |
# with gr.Column():
|
152 |
# gr.Markdown('') # Empty column for spacing
|
153 |
-
with gr.Row():
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
# dataset_version_drop.select(fn=on_dropdown_change, inputs=[dataset_version_drop],
|
161 |
# outputs=[leaderboards_tabs, leaderboard_tables_list[0]])
|
162 |
|
|
|
163 |
|
|
|
164 |
# Submission Tab #
|
165 |
##################
|
166 |
with gr.Tab('Submission'):
|
@@ -217,30 +287,57 @@ with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility
|
|
217 |
# leaderboard_tab.render()
|
218 |
return gr.update(value='Submit', interactive=True)
|
219 |
|
220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
submission_team_name_tb = gr.Textbox(label='Team Name')
|
222 |
-
|
223 |
-
|
|
|
|
|
|
|
224 |
with gr.Row():
|
225 |
hf_token_tb = gr.Textbox(label='Token', type='password')
|
226 |
submissions_24h_txt = gr.Textbox(label='Submissions 24h', value='')
|
227 |
-
|
228 |
-
|
229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
submission_btn.click(
|
231 |
fn=on_submit_pressed,
|
232 |
outputs=[submission_btn]
|
233 |
).then(
|
234 |
fn=process_submission,
|
235 |
-
inputs=[submission_team_name_tb, submission_file_path,
|
236 |
-
submission_type_radio, description_tb, app_state]
|
237 |
).then(
|
238 |
fn=on_submit_done,
|
239 |
outputs=[submission_btn]
|
240 |
-
).then(
|
241 |
-
fn=on_dropdown_change,
|
242 |
-
outputs=[leaderboards_tabs, leaderboard_tables_list[0]]
|
243 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
244 |
|
245 |
# # My Submissions Tab #
|
246 |
# ######################
|
|
|
18 |
|
19 |
from huggingface_hub import HfApi, snapshot_download
|
20 |
|
21 |
+
from compare_significance import check_significance, SUPPORTED_METRICS
|
22 |
+
from model_compare import ModelCompare
|
23 |
|
24 |
JSON_DATASET_DIR = Path("../json_dataset")
|
25 |
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
26 |
|
27 |
JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"
|
28 |
|
|
|
29 |
api = HfApi()
|
30 |
|
31 |
ORG= "CZLC"
|
|
|
39 |
|
40 |
HF_TOKEN = os.environ.get("HF_TOKEN")
|
41 |
|
|
|
|
|
42 |
class LeaderboardServer:
|
43 |
def __init__(self, server_address):
    """Remember the leaderboard repo and download a local snapshot of it.

    Args:
        server_address: Repository id passed to ``snapshot_download``.
    """
    self.server_address = server_address
    self.repo_type = "dataset"
    # Map submission ids to file paths
    self.submisssion_id_to_file = {}
    # Local path of the downloaded snapshot; the data is placed in "./".
    self.local_leaderboard = snapshot_download(
        self.server_address,
        repo_type=self.repo_type,
        token=HF_TOKEN,
        local_dir="./",
    )
|
48 |
+
|
49 |
def on_submit(self):
    """Download the leaderboard snapshot again and store its local path."""
    self.local_leaderboard = snapshot_download(
        self.server_address,
        repo_type=self.repo_type,
        token=HF_TOKEN,
        local_dir="./",
    )
|
51 |
|
52 |
def get_leaderboard(self):
    """Build the leaderboard table holding each submission's rank per task.

    Reads the pre-computed pairwise results from ``metadata/ranks.json`` in
    the local snapshot, lets ``ModelCompare`` order the models per task, and
    then walks every ``data/*.json`` submission file. As a side effect the
    ``submisssion_id_to_file`` mapping is refreshed with the file path of
    every submission seen.

    Returns:
        A ``pandas.DataFrame`` with a leading ``submission_id`` column and
        one column per task holding the 1-based position of the submission
        in that task's ordering. A submission that is not part of
        ``ranks.json`` yet gets ``None`` for every task instead of aborting
        the whole table.
    """
    # Pre-computed pairwise results -> one ordered list of ids per task.
    ranks_path = os.path.join(self.local_leaderboard, "metadata", "ranks.json")
    with open(ranks_path) as ranks_file:
        ranks = json.load(ranks_file)
    task_orders = ModelCompare().get_tasks_ranks(ranks)

    # Build the 1-based position lookup once per task instead of scanning
    # the ordered list for every submission.
    task_positions = {}
    for task, ordered_ids in task_orders.items():
        positions = {}
        for position, ranked_id in enumerate(ordered_ids, start=1):
            positions.setdefault(ranked_id, position)
        task_positions[task] = positions

    results = []
    seen_ids = set()
    data_glob = os.path.join(self.local_leaderboard, "data") + "/*.json"
    for submission in glob.glob(data_glob):
        with open(submission) as submission_file:
            data = json.load(submission_file)
        submission_id = data["metadata"]["model_description"]

        # Only the first file found for a given description is used.
        if submission_id in seen_ids:
            continue
        seen_ids.add(submission_id)

        self.submisssion_id_to_file[submission_id] = submission

        # ``.get`` keeps not-yet-ranked submissions in the table; without it
        # compute_ranks(), which calls this method first, could never run
        # for a new submission.
        local_results = {
            task: positions.get(submission_id)
            for task, positions in task_positions.items()
        }
        local_results["submission_id"] = submission_id
        results.append(local_results)

    if not results:
        # No submissions: return an empty table that still has the columns
        # callers index by.
        return pd.DataFrame(columns=["submission_id", *task_positions])

    dataframe = pd.DataFrame.from_records(results)
    # Reorder to have the id (model description) first
    df_order = ["submission_id"] + [col for col in dataframe.columns if col != "submission_id"]
    return dataframe[df_order]
|
84 |
+
|
85 |
+
def compute_ranks(self):
    """Compute rankings on every submit.

    Calls ``get_leaderboard`` first (which fills ``submisssion_id_to_file``)
    and then compares every unordered pair of submissions once.

    Returns:
        ``{model_a: {model_b: {task: bool}}}``. For a compared pair (A, B)
        the A->B entry holds the ``"significant"`` flag of each task and the
        B->A entry holds its negation; no separate B-vs-A test is run.
    """
    self.get_leaderboard()

    submission_ids = list(self.submisssion_id_to_file.keys())
    rankings = {submission_id: {} for submission_id in submission_ids}

    for a_idx, modelA_id in enumerate(submission_ids):
        # Only pairs with b after a, so each pair is compared exactly once.
        for modelB_id in submission_ids[a_idx + 1:]:
            res = self.compare_models(modelA_id, modelB_id)
            forward = {}
            backward = {}
            for task, data in res.items():
                forward[task] = data["significant"]
                backward[task] = not data["significant"]
            rankings[modelA_id][modelB_id] = forward
            rankings[modelB_id][modelA_id] = backward

    return rankings
|
106 |
+
|
107 |
+
|
108 |
+
def compare_models(self, modelA, modelB):
    """Run ``check_significance`` on the result files of two submissions.

    Args:
        modelA: Submission id of the first model.
        modelB: Submission id of the second model.

    Returns:
        Whatever ``check_significance`` returns for the two file paths. An
        id missing from ``submisssion_id_to_file`` is passed on as ``None``.
    """
    id_to_file = self.submisssion_id_to_file
    return check_significance(id_to_file.get(modelA), id_to_file.get(modelB))
|
112 |
+
|
113 |
+
|
114 |
+
def get_rankings(self):
    """Placeholder; currently does nothing and returns ``None``."""
    # TODO retrieve saved rankings for models on tasks
    return None
|
117 |
|
118 |
def save_json(self,file, submission_name) -> None:
|
119 |
filename = os.path.basename(file)
|
|
|
126 |
)
|
127 |
|
128 |
|
|
|
|
|
129 |
leaderboard_server = LeaderboardServer(REPO)
|
|
|
|
|
130 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
131 |
|
132 |
|
|
|
136 |
# CHALLENGE_NAME = 'NOTSOFAR1'
|
137 |
|
138 |
|
139 |
+
|
140 |
+
|
141 |
# if __name__ == '__main__':
|
142 |
with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
|
143 |
app_state = gr.State({})
|
|
|
188 |
return gr.Tabs(selected=first_tab_name), populate_leaderboard(first_tab_name, None)
|
189 |
|
190 |
|
191 |
+
with gr.Tab('Leaderboard') as leaderboards_tab:
|
192 |
+
# with gr.Row():
|
193 |
+
# gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
|
194 |
# with gr.Row():
|
195 |
# with gr.Column():
|
196 |
# dataset_version_drop = gr.Dropdown(choices=DATASET_VERSIONS, multiselect=False,
|
|
|
202 |
# gr.Markdown('') # Empty column for spacing
|
203 |
# with gr.Column():
|
204 |
# gr.Markdown('') # Empty column for spacing
|
205 |
+
# with gr.Row():
|
206 |
+
# with gr.Tabs() as leaderboards_tabs:
|
207 |
+
# leaderboard_tables_list = []
|
208 |
+
# for leaderboard_idx, leaderboard_type in enumerate(LEADERBOARD_TYPES):
|
209 |
+
# l_tab = create_leaderboard_tab(leaderboard_type, leaderboard_idx, None)
|
210 |
+
# leaderboard_tables_list.append(l_tab)
|
211 |
+
|
212 |
+
# change the table based on the selected model
|
213 |
+
def on_dropdown_change(model_detail):
    """Return the leaderboard rows whose ``submission_id`` equals *model_detail*."""
    leaderboard = leaderboard_server.get_leaderboard()
    selected_rows = leaderboard["submission_id"] == model_detail
    return leaderboard[selected_rows]
|
216 |
+
|
217 |
+
results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None, visible=True)
|
218 |
+
model_detail = gr.Dropdown(choices=list(leaderboard_server.get_leaderboard()["submission_id"]), label="Select model", interactive=True)
|
219 |
+
model_detail_button = gr.Button("Show model detail", interactive=True)
|
220 |
+
model_detail_button.click(
|
221 |
+
fn=on_dropdown_change,
|
222 |
+
inputs=[model_detail],
|
223 |
+
outputs=[results_table]
|
224 |
+
)
|
225 |
+
|
226 |
+
# results_table.select(fn=on_dropdown_change, inputs=[model_detail], outputs=[results_table])
|
227 |
|
228 |
# dataset_version_drop.select(fn=on_dropdown_change, inputs=[dataset_version_drop],
|
229 |
# outputs=[leaderboards_tabs, leaderboard_tables_list[0]])
|
230 |
|
231 |
+
|
232 |
|
233 |
+
##################
|
234 |
# Submission Tab #
|
235 |
##################
|
236 |
with gr.Tab('Submission'):
|
|
|
287 |
# leaderboard_tab.render()
|
288 |
return gr.update(value='Submit', interactive=True)
|
289 |
|
290 |
+
def show_leaderboard():
    """Show an info toast and return the current leaderboard table."""
    # Fixed the misspelled user-facing message ("Loding").
    gr.Info("Loading leaderboard...")
    return leaderboard_server.get_leaderboard()
|
293 |
+
|
294 |
+
gr.Markdown(
|
295 |
+
"""
|
296 |
+
# Model submission
|
297 |
+
Model can be compared with other models and submitted\n
|
298 |
+
Click **Compare results** to compare your model with other models in the leaderboard\n
|
299 |
+
Click **Submit results** to submit your model to the leaderboard
|
300 |
+
(Comparison by itself is not a submission)
|
301 |
+
"""
|
302 |
+
)
|
303 |
+
|
304 |
submission_team_name_tb = gr.Textbox(label='Team Name')
|
305 |
+
# submission_type_radio = gr.Radio(label='Submission Track', choices=LEADERBOARD_TYPES)
|
306 |
+
with gr.Row():
|
307 |
+
description_tb = gr.Textbox(label='Description', type='text')
|
308 |
+
link_to_model_tb = gr.Textbox(label='Link to model', type='text')
|
309 |
+
|
310 |
with gr.Row():
|
311 |
hf_token_tb = gr.Textbox(label='Token', type='password')
|
312 |
submissions_24h_txt = gr.Textbox(label='Submissions 24h', value='')
|
313 |
+
|
314 |
+
submission_file_path = gr.File(label='Upload your results', type='filepath')
|
315 |
+
compare_results_button = gr.DataFrame(show_leaderboard(), interactive=False, label=None, visible=True)
|
316 |
+
|
317 |
+
# Button that shows the current leaderboard when clicked
|
318 |
+
show_results_button = gr.Button("Compare results", interactive=True)
|
319 |
+
show_results_button.click(
|
320 |
+
fn=show_leaderboard,
|
321 |
+
outputs=[compare_results_button]
|
322 |
+
)
|
323 |
+
|
324 |
+
submission_btn = gr.Button(value='Submit results', interactive=True)
|
325 |
submission_btn.click(
|
326 |
fn=on_submit_pressed,
|
327 |
outputs=[submission_btn]
|
328 |
).then(
|
329 |
fn=process_submission,
|
330 |
+
inputs=[submission_team_name_tb, submission_file_path, description_tb, app_state]
|
|
|
331 |
).then(
|
332 |
fn=on_submit_done,
|
333 |
outputs=[submission_btn]
|
|
|
|
|
|
|
334 |
)
|
335 |
+
|
336 |
+
# .then(
|
337 |
+
# fn=on_dropdown_change,
|
338 |
+
# outputs=[leaderboards_tabs, leaderboard_tables_list[0]]
|
339 |
+
# )
|
340 |
+
|
341 |
|
342 |
# # My Submissions Tab #
|
343 |
# ######################
|
compare_significance.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
from collections import defaultdict
|
4 |
+
from typing import Sequence
|
5 |
+
|
6 |
+
import numpy
|
7 |
+
import numpy as np
|
8 |
+
from scipy.stats import ttest_ind, ttest_rel
|
9 |
+
from sklearn.metrics import roc_curve, auc
|
10 |
+
from tqdm import tqdm
|
11 |
+
|
12 |
+
# from leaderboard import SUPPORTED_METRICS
|
13 |
+
|
14 |
+
SUPPORTED_METRICS = [
|
15 |
+
"avg_mcauroc", # for classification tasks
|
16 |
+
"em", # for QA tasks
|
17 |
+
"acc", # for multichoice tasks
|
18 |
+
"rouge", # for summarization tasks
|
19 |
+
"ppl", # for language modeling tasks
|
20 |
+
]
|
21 |
+
|
22 |
+
|
23 |
+
def _get_CMs(i, probabilities, references, thresholds):
    """Count one-vs-rest confusion matrices for class *i* at each threshold.

    Args:
        i: Index of the class treated as positive.
        probabilities: Per-example score rows; ``probabilities[j][i]`` is
            the score of class *i* for example *j*.
        references: Gold class per example, indexed like *probabilities*.
        thresholds: Mapping/sequence whose item *i* is the iterable of
            thresholds to evaluate.

    Returns:
        One dict per threshold with the keys ``TP``, ``FP``, ``TN``, ``FN``,
        ``threshold`` and ``class``.
    """
    confusion_matrices = []
    for threshold in thresholds[i]:
        counts = {"TP": 0, "FP": 0, "TN": 0, "FN": 0}
        for j, row in enumerate(probabilities):
            predicted_positive = row[i] >= threshold
            actual_positive = references[j] == i
            if predicted_positive:
                counts["TP" if actual_positive else "FP"] += 1
            else:
                counts["FN" if actual_positive else "TN"] += 1
        confusion_matrices.append({
            "TP": counts["TP"],
            "FP": counts["FP"],
            "TN": counts["TN"],
            "FN": counts["FN"],
            "threshold": threshold,
            "class": i,
        })

    return confusion_matrices
|
45 |
+
|
46 |
+
|
47 |
+
def compute_significance_accuracy(predsA, referencesA, predsB, referencesB):
    """One-tailed paired t-test of whether model A is more accurate than B.

    Each prediction is scored 1 when it equals its reference and 0
    otherwise; the two score vectors are compared with ``ttest_rel``.

    Args:
        predsA: Predictions of model A.
        referencesA: References aligned with *predsA*.
        predsB: Predictions of model B.
        referencesB: References aligned with *predsB*.

    Returns:
        ``(p_value, delta)`` where *p_value* is the one-sided p-value for
        "A is better than B" and *delta* is ``mean(scores_A) - mean(scores_B)``.
    """
    # following https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L164C32-L165C24
    scores_A = [1 if pred == ref else 0 for pred, ref in zip(predsA, referencesA)]
    scores_B = [1 if pred == ref else 0 for pred, ref in zip(predsB, referencesB)]
    t, p = ttest_rel(scores_A, scores_B)

    # Convert the two-sided p-value into the one-sided one for "A > B".
    # Halving is only valid when the statistic points in the tested
    # direction; otherwise the one-sided p-value is the complement.
    if np.isnan(t):
        # NaN statistic (e.g. all per-example differences are zero):
        # there is no evidence that A is better.
        p_value = 1.0
    elif t > 0:
        p_value = p / 2
    else:
        p_value = 1 - p / 2

    delta = np.mean(scores_A) - np.mean(scores_B)
    return p_value, delta
|
57 |
+
|
58 |
+
def compute_significance_em(predsA, referencesA, predsB, referencesB):
    """Placeholder for the exact-match test; does nothing and returns ``None``."""
    return None
|
60 |
+
def compute_significance_rouge(predsA, referencesA, predsB, referencesB):
    """Placeholder for the ROUGE test; does nothing and returns ``None``."""
    # TODO: MDocekal
    # Use bootstrapping
    # https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L89
    return None
|
65 |
+
def compute_significance_ppl(predsA, referencesA, predsB, referencesB):
    """Placeholder for the perplexity test; does nothing and returns ``None``."""
    # TODO: MDocekal
    # Use bootstrapping
    # https://github.com/rtmdrr/testSignificanceNLP/blob/c7302d015538944364b622eb860dd9fbee6d50ec/testSignificance.py#L89
    return None
|
70 |
+
def compute_significance_avg_mcauroc(probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
                                     probsB: Sequence[Sequence[float]], referencesB: Sequence[int]):
    """Compare two models by sampled multi-class AUC scores.

    Draws 1,000 averaged-AUC samples per model with ``get_mc_auc_samples``
    and compares every A sample with every B sample.

    Returns:
        ``(p_value, delta)`` where *p_value* is the fraction of (A, B)
        sample pairs in which the A sample is less than or equal to the B
        sample, and *delta* is the difference of the two sample means.
    """
    # sampled MC-AUC scores for both models
    model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=1_000)
    model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=1_000)

    # one-tailed test: all-pairs comparison via broadcasting
    a_not_better = model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]
    n_pairs = len(model_A_scores) * len(model_B_scores)
    p_value = a_not_better.sum() / n_pairs

    delta = np.mean(model_A_scores) - np.mean(model_B_scores)
    return p_value, delta
|
82 |
+
|
83 |
+
|
84 |
+
def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
    """Sample class-averaged AUC scores for one model.

    For every class, a one-vs-rest ROC curve is computed with ``roc_curve``,
    a confusion matrix is counted at each returned threshold, and *Nsamples*
    true-positive-rate variates are drawn per threshold from
    ``Beta(TP + prior, FN + prior)``. Each set of variates (one per
    threshold) is integrated against the curve's FPR values with ``auc``.

    Args:
        probs: Per-example rows of class scores; the number of classes is
            taken from the length of the first row.
        references: Gold class index per example.
        Nsamples: Number of AUC samples to draw per class.

    Returns:
        Array of *Nsamples* values, each the mean over classes of the
        sampled AUCs.
    """
    n_classes = list(range(len(probs[0])))
    fpr = dict()
    thresholds = dict()
    prior = 1.0  # <- Flat prior (0.5 would be Jeffrey's prior)

    # compute AUC samples for every class
    auc_scores_per_class = []
    for i in range(len(n_classes)):
        # for i-th class vs all others
        y_true = [1 if x == n_classes[i] else 0 for x in references]
        y_score = [prob[i] for prob in probs]
        fpr[i], _, thresholds[i] = roc_curve(y_true=y_true, y_score=y_score)

        # one confusion matrix per threshold, in threshold order
        confusion_matrices = _get_CMs(i, probs, references, thresholds)

        # sample variates for every threshold -> shape: fprs x tpr_variates
        tpr_variates_for_each_fpr = np.array([
            np.random.beta(cm["TP"] + prior, cm["FN"] + prior, Nsamples)
            for cm in confusion_matrices
        ])

        # now pick 1 variate for each fpr, and compute AUC
        # (AUC values are stored as returned; NaNs are not replaced)
        progress = tqdm(tpr_variates_for_each_fpr.T,
                        desc=f"Computing AUCs for class {i + 1}/{len(n_classes)}")
        auc_scores = [auc(fpr[i], tpr_variates) for tpr_variates in progress]
        auc_scores_per_class.append(auc_scores)

    auc_scores_per_class = np.array(auc_scores_per_class)
    return np.mean(auc_scores_per_class, axis=0)
|
122 |
+
|
123 |
+
|
124 |
+
def read_json(file_path):
    """Load one results file and group its predictions by task.

    Args:
        file_path: Path of a JSON file with ``"predictions"`` (task ->
            list of records) and ``"metadata"`` entries.

    Returns:
        ``(data, metadata)`` where *data* is a ``defaultdict(list)`` mapping
        task -> ``((golds, probs), metric)`` and *metadata* is the file's
        ``"metadata"`` value.

    Raises:
        ValueError: If the first record of a task contains none of the
            ``SUPPORTED_METRICS`` keys.
    """
    with open(file_path, "r") as f:
        fc = json.load(f)

    data = defaultdict(list)
    for task, results in fc["predictions"].items():
        # determine the metric from the first record of the task
        metric = next((key for key in SUPPORTED_METRICS if key in results[0]), None)
        if metric is None:
            raise ValueError(f"Unsupported metric in {file_path}")

        if metric == "avg_mcauroc":
            # presumably each record stores a (gold, probabilities) pair
            # under the metric key -- TODO confirm against the harness output
            columns = list(zip(*(line[metric] for line in results)))
            golds, probs = columns[0], columns[1]
            # NOTE(review): the assignment is kept inside this branch, so
            # tasks with any other metric are not loaded -- confirm intended.
            data[task] = (golds, probs), metric
    return data, fc["metadata"]
|
145 |
+
|
146 |
+
|
147 |
+
def _compare_task(dataA, dataB, task, significance_level):
    """Return the significance decision for one task loaded from both files."""
    if task not in dataA or task not in dataB:
        raise ValueError(f"Task {task} is not available for both models")

    (referencesA, outputsA), metricA = dataA[task]
    (referencesB, outputsB), metricB = dataB[task]
    if metricA != metricB:
        raise ValueError(f"Metric mismatch for task {task}: {metricA} vs {metricB}")

    if metricA == "avg_mcauroc":
        p_value, delta = compute_significance_avg_mcauroc(probsA=outputsA, referencesA=referencesA,
                                                          probsB=outputsB, referencesB=referencesB)
    elif metricA == "acc":
        # The accuracy test is paired, so both models need the same examples.
        if len(referencesA) != len(referencesB):
            raise ValueError(f"Different number of examples for task {task}")
        p_value, delta = compute_significance_accuracy(predsA=outputsA, referencesA=referencesA,
                                                       predsB=outputsB, referencesB=referencesB)
    elif metricA == "em":
        raise NotImplementedError("Exact match is not supported yet.")
    elif metricA == "rouge":
        raise NotImplementedError("Rouge is not supported yet.")
    elif metricA == "ppl":
        raise NotImplementedError("Perplexity is not supported yet.")
    else:
        raise ValueError(f"Unsupported metric {metricA}")

    return {
        # A direct comparison (not ``not (p > level)``) keeps a NaN p-value
        # from being reported as significant; bool() keeps the flag a plain
        # Python bool so the result stays JSON-serializable.
        "significant": bool(p_value <= significance_level),
        "p_value": p_value,
        "delta": delta,
    }


def check_significance_task(fileA, fileB, task, significance_level=0.05):
    """Test a single task for a significant improvement of model A over B.

    Args:
        fileA: Path of model A's results file.
        fileB: Path of model B's results file.
        task: Name of the task to compare.
        significance_level: Threshold the p-value is compared against.

    Returns:
        ``{task: {"significant": bool, "p_value": ..., "delta": ...}}``.

    Raises:
        ValueError: If the task is missing for a model, the metrics differ,
            or the metric is unknown.
        NotImplementedError: For metrics without an implemented test.
    """
    dataA, metadataA = read_json(fileA)
    dataB, metadataB = read_json(fileB)
    return {task: _compare_task(dataA, dataB, task, significance_level)}


def check_significance(fileA, fileB, significance_level=0.05):
    """Test every task loaded from model A's file against model B.

    Args:
        fileA: Path of model A's results file.
        fileB: Path of model B's results file.
        significance_level: Threshold the p-value is compared against.

    Returns:
        Mapping task -> ``{"significant": bool, "p_value": ..., "delta": ...}``
        for every task read from *fileA*.

    Raises:
        ValueError: If a task is missing for model B, the metrics differ,
            or the metric is unknown.
        NotImplementedError: For metrics without an implemented test.
    """
    dataA, metadataA = read_json(fileA)
    dataB, metadataB = read_json(fileB)

    return {
        task: _compare_task(dataA, dataB, task, significance_level)
        for task in list(dataA)
    }
|
215 |
+
|
216 |
+
|
217 |
+
def main():
    """Parse the command line, compare the two models and print the result as JSON."""
    parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
    for model_name in ("A", "B"):
        parser.add_argument(f"--model{model_name}", help=f"Model{model_name} JSONL file from lm harness.")
    parser.add_argument("--significance_level", type=float, default=0.05, help="Significance level (e.g., 0.05)")
    args = parser.parse_args()

    result = check_significance(args.modelA, args.modelB, args.significance_level)
    print(json.dumps(result, indent=2))
|
226 |
+
|
227 |
+
# harness already returns stderr estimate for sampling distribution
|
228 |
+
# see https://github.com/EleutherAI/lm-evaluation-harness/blob/6433bd3fe3033d302b22cdcd53af237e9039ef29/lm_eval/api/metrics.py#L213
|
229 |
+
|
230 |
+
if __name__ == "__main__":
|
231 |
+
main()
|
model_compare.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from functools import cmp_to_key
|
3 |
+
from compare_significance import check_significance
|
4 |
+
|
5 |
+
class ModelCompare():
    """Order models per task from pre-computed pairwise comparison results.

    ``ranks`` is expected in the form
    ``ranks[model_a][model_b][task] -> bool``.
    """

    # Tasks for which an ordering is produced by get_tasks_ranks().
    TASKS = ["propaganda_demonizace",
             "propaganda_vina",
             "propaganda_relativizace",
             "propaganda_argumentace",
             "propaganda_lokace",
             "propaganda_nazor",
             "propaganda_emoce",
             "propaganda_fabulace",
             "propaganda_nalepkovani",
             "propaganda_zamereni",
             "propaganda_zanr",
             "propaganda_rusko",
             "propaganda_strach",
             "benczechmark_sentiment"]

    def __init__(self, ranks: dict = None):
        self.ranks = ranks
        # Task used by compare_models(); set by get_tasks_ranks(). It is
        # initialised here so that compare_models() never hits a missing
        # attribute.
        self.current_task = None

    def compare_models(self, modelA_id, modelB_id):
        """Comparator for ``cmp_to_key``: 1 if A-vs-B is True for the current task, else -1.

        Raises:
            Exception: If no rankings have been provided.
        """
        if not self.ranks:
            raise Exception("Missing model rankings")

        res = self.ranks[modelA_id][modelB_id][self.current_task]
        # Equality (not truthiness) is kept on purpose so that any value
        # other than True keeps sorting as -1, exactly as before.
        return 1 if res == True else -1  # noqa: E712

    def get_tasks_ranks(self, ranks: dict) -> dict:
        """Order models based on the significance of their improvement.

        Args:
            ranks: Pairwise results, ``ranks[model_a][model_b][task]``.

        Returns:
            Mapping task -> list of model ids sorted with ``compare_models``
            for every task in ``TASKS``.
        """
        self.ranks = ranks

        tasks_ranks = {}

        models = ranks.keys()
        for task in self.TASKS:
            # compare_models() reads the task from the instance.
            self.current_task = task
            tasks_ranks[task] = sorted(models, key=cmp_to_key(self.compare_models))
        return tasks_ranks
|
50 |
+
|
51 |
+
|
52 |
+
# models = {
|
53 |
+
# model1 : {
|
54 |
+
# task1 : order_idx
|
55 |
+
# task2 : order_idx
|
56 |
+
# task3 : order_idx
|
57 |
+
# }
|
58 |
+
# }
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
|
requirements.txt
CHANGED
@@ -4,4 +4,7 @@ azure-cosmos
|
|
4 |
huggingface_hub
|
5 |
requests
|
6 |
Pyarrow
|
7 |
-
tabulate
|
|
|
|
|
|
|
|
4 |
huggingface_hub
|
5 |
requests
|
6 |
Pyarrow
|
7 |
+
tabulate
|
8 |
+
scipy
|
9 |
+
numpy
|
10 |
+
scikit-learn
|