pminervini committed · Commit 5ca644e · Parent(s): 1e82ba8

update
Browse files:
- cli/analysis-cli.py (+31, -29)
- cli/eval-cli.py (+3, -2)
- cli/fever-upload-cli.py (+73, -0)
- src/backend/envs.py (+1, -0)
- src/backend/tasks/fever/fever11.yaml (+17, -0)
cli/analysis-cli.py CHANGED

@@ -111,8 +111,6 @@ if data_map is None:
     for dataset_name, results_dict in data["results"].items():
         for metric_name, value in results_dict.items():
 
-            # print(model_name, dataset_name, metric_name, value)
-
             if ',' in metric_name and '_stderr' not in metric_name \
                and 'f1' not in metric_name \
                and model_name_to_model_map[model_name]["likes"] > 128:

@@ -160,9 +158,8 @@ if data_map is None:
             if 'fever' in dataset_name:
                 to_add = False
 
-            if 'xsum' in dataset_name:
-
-                pass
+            if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' in dataset_name:
+                to_add = False
 
             if 'rouge' in metric_name:
                 value /= 100.0

@@ -186,8 +183,10 @@ if data_map is None:
 
 model_name_lst = [m for m in data_map.keys()]
 
+nb_max_metrics = max(len(data_map[model_name]) for model_name in model_name_lst)
+
 for model_name in model_name_lst:
-    if len(data_map[model_name]) <
+    if len(data_map[model_name]) < nb_max_metrics - 5:
         del data_map[model_name]
 
 plot_type_lst = ['all', 'summ', 'qa', 'instr', 'detect', 'rc']

@@ -293,27 +292,30 @@ for plot_type in plot_type_lst:
 
     print('figsize', (fig_width, fig_height))
 
-    print(f'Generating clustermap for {plot_type}')
-
-    # fig = sns.clustermap(df, method='average', metric='cosine', cmap='coolwarm', figsize=(16, 12), annot=True)
-    fig = sns.clustermap(df,
-                         method='ward',
-                         metric='euclidean',
-                         cmap='coolwarm',
-                         figsize=(fig_width, fig_height),  # figsize=(24, 16),
-                         annot=True,
-                         mask=o_df.isnull(),
-                         dendrogram_ratio=dendrogram_ratio,
-                         fmt='.2f',
-                         col_cluster=col_cluster,
-                         row_cluster=row_cluster)
-
-    # Adjust the size of the cells (less wide)
-    plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
-    plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
-
-    # Save the clustermap to file
-    fig.savefig(f'plots/clustermap_{plot_type}.pdf')
-    fig.savefig(f'plots/clustermap_{plot_type}.png')
-
     o_df.to_json(f'plots/clustermap_{plot_type}.json', orient='split')
+
+    print(f'Generating the clustermaps for {plot_type}')
+
+    for cmap in [None, 'coolwarm', 'viridis']:
+        fig = sns.clustermap(df,
+                             method='ward',
+                             metric='euclidean',
+                             cmap=cmap,
+                             figsize=(fig_width, fig_height),  # figsize=(24, 16),
+                             annot=True,
+                             mask=o_df.isnull(),
+                             dendrogram_ratio=dendrogram_ratio,
+                             fmt='.2f',
+                             col_cluster=col_cluster,
+                             row_cluster=row_cluster)
+
+        # Adjust the size of the cells (less wide)
+        plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
+        plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
+
+        cmap_suffix = '' if cmap is None else f'_{cmap}'
+
+        # Save the clustermap to file
+        fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}.pdf')
+        fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}.png')
+        fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")
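For reference, a minimal self-contained sketch of the pattern the new loop uses: one seaborn clustermap per colormap, with missing cells masked out and a transparent PNG variant saved alongside the opaque one. The toy data frame, metric names, and output filenames are placeholders, not part of the Space.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Toy model-by-metric score matrix standing in for the leaderboard `df`.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.uniform(size=(6, 4)),
                  index=[f'model-{i}' for i in range(6)],
                  columns=['nq', 'triviaqa', 'xsum', 'fever'])
df.iloc[1, 2] = np.nan  # one missing metric, masked out below

for cmap in [None, 'coolwarm', 'viridis']:
    grid = sns.clustermap(df.fillna(0.0),    # Ward clustering cannot handle NaNs
                          method='ward',
                          metric='euclidean',
                          cmap=cmap,
                          annot=True,
                          fmt='.2f',
                          mask=df.isnull())  # hide the missing cells in the heatmap
    plt.setp(grid.ax_heatmap.get_yticklabels(), rotation=0)
    plt.setp(grid.ax_heatmap.get_xticklabels(), rotation=90)

    suffix = '' if cmap is None else f'_{cmap}'
    grid.savefig(f'clustermap_demo{suffix}.png')
    # Transparent variant, e.g. for embedding in slides:
    grid.savefig(f'clustermap_demo{suffix}_t.png', transparent=True, facecolor='none')
    plt.close('all')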
cli/eval-cli.py CHANGED

@@ -36,7 +36,8 @@ def main():
     # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
     # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
     # my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
-    my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
+    # my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
+    my_task = Task("fever11", "acc", "FEVER", 8)
 
     eval_logger = utils.eval_logger
     import logging

@@ -59,7 +60,7 @@ def main():
 
     # breakpoint()
     results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
-                                        batch_size=1, device="mps", use_cache=None, limit=
+                                        batch_size=1, device="mps", use_cache=None, limit=1000, write_out=True)
     print('AAA', results["results"])
 
     breakpoint()
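The harness call above can be reproduced standalone; a sketch assuming the fever11 YAML is discoverable on the harness's task path and using a placeholder model (the lm-evaluation-harness API varies slightly between versions):

from lm_eval import evaluator

# `limit` caps the number of evaluated instances and `write_out` dumps the
# rendered few-shot prompts for inspection; `pretrained=gpt2` is a placeholder.
results = evaluator.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",
    tasks=["fever11"],
    num_fewshot=8,
    batch_size=1,
    device="cpu",
    use_cache=None,
    limit=1000,
    write_out=True,
)
print(results["results"]["fever11"])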
cli/fever-upload-cli.py ADDED

@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+import glob
+import os
+
+import random
+from tqdm import tqdm
+
+from datasets import Dataset, DatasetDict, load_dataset
+
+
+def convert(list_of_dicts):
+    res = {}
+    for d in list_of_dicts:
+        for k, v in d.items():
+            res.setdefault(k, []).append(v)
+    return res
+
+
+v10 = load_dataset("fever", "v1.0")
+name_lst = ['train', 'labelled_dev']
+
+old_to_new_label_map = {
+    'SUPPORTS': 'supported',
+    'REFUTES': 'refuted'
+}
+
+data_map = {}
+
+for name in name_lst:
+    instance_lst = []
+
+    for entry in tqdm(v10[name]):
+        id_ = entry['id']
+        label = entry['label']
+        claim = entry['claim']
+
+        evidence_id = entry['evidence_id']
+        evidence_wiki_url = entry['evidence_wiki_url']
+
+        if evidence_id != -1:
+            assert label in {'SUPPORTS', 'REFUTES'}
+
+            instance = {'id': id_, 'label': old_to_new_label_map[label], 'claim': claim}
+            instance_lst.append(instance)
+
+    key = 'dev' if name in {'labelled_dev'} else name
+
+    instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d['claim'])
+
+    label_to_instance_lst = {}
+    for e in instance_lst:
+        if e['label'] not in label_to_instance_lst:
+            label_to_instance_lst[e['label']] = []
+        label_to_instance_lst[e['label']].append(e)
+
+    min_len = min(len(v) for k, v in label_to_instance_lst.items())
+
+    new_instance_lst = []
+    for k in sorted(label_to_instance_lst.keys()):
+        new_instance_lst += label_to_instance_lst[k][:min_len]
+
+    random.Random(42).shuffle(new_instance_lst)
+    data_map[key] = new_instance_lst
+
+ds_path = 'pminervini/hl-fever'
+
+task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
+ds_dict = DatasetDict(task_to_ds_map)
+
+ds_dict.push_to_hub(ds_path, "v1.0")
+
+# breakpoint()
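Two of the steps above are easy to miss: the one-liner that deduplicates instances by converting each dict to a hashable tuple, and the per-label truncation that balances the classes before shuffling. A toy illustration of both, on made-up records:

instances = [
    {'id': 1, 'label': 'supported', 'claim': 'A'},
    {'id': 1, 'label': 'supported', 'claim': 'A'},  # exact duplicate, dropped
    {'id': 2, 'label': 'refuted', 'claim': 'B'},
    {'id': 3, 'label': 'supported', 'claim': 'C'},
]

# Deduplicate via hashable tuples, then sort by claim for determinism.
unique = sorted([dict(t) for t in {tuple(d.items()) for d in instances}],
                key=lambda d: d['claim'])

# Group by label and truncate every class to the size of the smallest one,
# yielding a label-balanced split (here: one 'supported', one 'refuted').
by_label = {}
for e in unique:
    by_label.setdefault(e['label'], []).append(e)
min_len = min(len(v) for v in by_label.values())
balanced = [e for k in sorted(by_label) for e in by_label[k][:min_len]]
print(balanced)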
src/backend/envs.py CHANGED

@@ -46,6 +46,7 @@ class Tasks(Enum):
     task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
 
     # task15 = Task("fever10", "acc", "FEVER", 16)
+    task15_1 = Task("fever11", "acc", "FEVER", 8)
 
     task16 = Task("squadv2", "exact", "SQuADv2", 4)
 
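The `Task` constructor itself is not part of this diff; judging from how cli/eval-cli.py consumes it (`task.benchmark`, `task.num_fewshot`), it is a 4-field record along these lines. The two middle field names are guesses:

from collections import namedtuple

# Hypothetical reconstruction: (harness task name, headline metric,
# leaderboard column name, number of few-shot examples).
Task = namedtuple('Task', ['benchmark', 'metric', 'col_name', 'num_fewshot'])

fever11 = Task("fever11", "acc", "FEVER", 8)
assert fever11.benchmark == "fever11" and fever11.num_fewshot == 8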
src/backend/tasks/fever/fever11.yaml ADDED

@@ -0,0 +1,17 @@
+group: fever
+task: fever11
+dataset_path: pminervini/hl-fever
+dataset_name: v1.0
+output_type: multiple_choice
+training_split: train
+validation_split: dev
+test_split: null
+doc_to_text: "Claim: {{claim}}\nLabel:"
+doc_to_choice: ["supported", "refuted"]
+doc_to_target: label
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
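To make the template concrete, here is how the fields above turn one hl-fever record into a multiple-choice instance. This is a hand-rolled illustration of the YAML's semantics, not harness code, and the claim text is made up:

doc = {'id': 42, 'label': 'supported', 'claim': 'The Eiffel Tower is in Paris.'}

prompt = f"Claim: {doc['claim']}\nLabel:"  # doc_to_text
choices = ["supported", "refuted"]         # doc_to_choice
target = choices.index(doc['label'])       # doc_to_target, resolved against the choices

# The harness scores the continuations "supported" vs. "refuted" after the
# prompt and counts the prediction as correct (acc) when the argmax matches.
print(prompt)
print('gold:', choices[target])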