pminervini committed on
Commit
5ca644e
·
1 Parent(s): 1e82ba8
cli/analysis-cli.py CHANGED
@@ -111,8 +111,6 @@ if data_map is None:
111
  for dataset_name, results_dict in data["results"].items():
112
  for metric_name, value in results_dict.items():
113
 
114
- # print(model_name, dataset_name, metric_name, value)
115
-
116
  if ',' in metric_name and '_stderr' not in metric_name \
117
  and 'f1' not in metric_name \
118
  and model_name_to_model_map[model_name]["likes"] > 128:
@@ -160,9 +158,8 @@ if data_map is None:
160
  if 'fever' in dataset_name:
161
  to_add = False
162
 
163
- if 'xsum' in dataset_name:
164
- # to_add = False
165
- pass
166
 
167
  if 'rouge' in metric_name:
168
  value /= 100.0
@@ -186,8 +183,10 @@ if data_map is None:
186
 
187
  model_name_lst = [m for m in data_map.keys()]
188
 
 
 
189
  for model_name in model_name_lst:
190
- if len(data_map[model_name]) < 14:
191
  del data_map[model_name]
192
 
193
  plot_type_lst = ['all', 'summ', 'qa', 'instr', 'detect', 'rc']
@@ -293,27 +292,30 @@ for plot_type in plot_type_lst:
293
 
294
  print('figsize', (fig_width, fig_height))
295
 
296
- print(f'Generating clustermap for {plot_type}')
297
-
298
- # fig = sns.clustermap(df, method='average', metric='cosine', cmap='coolwarm', figsize=(16, 12), annot=True)
299
- fig = sns.clustermap(df,
300
- method='ward',
301
- metric='euclidean',
302
- cmap='coolwarm',
303
- figsize=(fig_width, fig_height), # figsize=(24, 16),
304
- annot=True,
305
- mask=o_df.isnull(),
306
- dendrogram_ratio=dendrogram_ratio,
307
- fmt='.2f',
308
- col_cluster=col_cluster,
309
- row_cluster=row_cluster)
310
-
311
- # Adjust the size of the cells (less wide)
312
- plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
313
- plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
314
-
315
- # Save the clustermap to file
316
- fig.savefig(f'plots/clustermap_{plot_type}.pdf')
317
- fig.savefig(f'plots/clustermap_{plot_type}.png')
318
-
319
  o_df.to_json(f'plots/clustermap_{plot_type}.json', orient='split')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  for dataset_name, results_dict in data["results"].items():
112
  for metric_name, value in results_dict.items():
113
 
 
 
114
  if ',' in metric_name and '_stderr' not in metric_name \
115
  and 'f1' not in metric_name \
116
  and model_name_to_model_map[model_name]["likes"] > 128:
 
158
  if 'fever' in dataset_name:
159
  to_add = False
160
 
161
+ if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' in dataset_name:
162
+ to_add = False
 
163
 
164
  if 'rouge' in metric_name:
165
  value /= 100.0
 
183
 
184
  model_name_lst = [m for m in data_map.keys()]
185
 
186
+ nb_max_metrics = max(len(data_map[model_name]) for model_name in model_name_lst)
187
+
188
  for model_name in model_name_lst:
189
+ if len(data_map[model_name]) < nb_max_metrics - 5:
190
  del data_map[model_name]
191
 
192
  plot_type_lst = ['all', 'summ', 'qa', 'instr', 'detect', 'rc']
 
292
 
293
  print('figsize', (fig_width, fig_height))
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  o_df.to_json(f'plots/clustermap_{plot_type}.json', orient='split')
296
+
297
+ print(f'Generating the clustermaps for {plot_type}')
298
+
299
+ for cmap in [None, 'coolwarm', 'viridis']:
300
+ fig = sns.clustermap(df,
301
+ method='ward',
302
+ metric='euclidean',
303
+ cmap=cmap,
304
+ figsize=(fig_width, fig_height), # figsize=(24, 16),
305
+ annot=True,
306
+ mask=o_df.isnull(),
307
+ dendrogram_ratio=dendrogram_ratio,
308
+ fmt='.2f',
309
+ col_cluster=col_cluster,
310
+ row_cluster=row_cluster)
311
+
312
+ # Keep the row tick labels horizontal and rotate the column tick labels by 90 degrees
313
+ plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
314
+ plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
315
+
316
+ cmap_suffix = '' if cmap is None else f'_{cmap}'
317
+
318
+ # Save the clustermap to file
319
+ fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}.pdf')
320
+ fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}.png')
321
+ fig.savefig(f'plots/clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")
cli/eval-cli.py CHANGED
@@ -36,7 +36,8 @@ def main():
36
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
37
  # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
38
  # my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
39
- my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
 
40
 
41
  eval_logger = utils.eval_logger
42
  import logging
@@ -59,7 +60,7 @@ def main():
59
 
60
  # breakpoint()
61
  results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
62
- batch_size=1, device="mps", use_cache=None, limit=10, write_out=True)
63
  print('AAA', results["results"])
64
 
65
  breakpoint()
 
36
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
37
  # my_task = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)
38
  # my_task = Task("truefalse_cieacf", "acc", "TrueFalse", 5)
39
+ # my_task = Task("faithdial_hallu", "acc", "FaithDIAL", 2)
40
+ my_task = Task("fever11", "acc", "FEVER", 8)
41
 
42
  eval_logger = utils.eval_logger
43
  import logging
 
60
 
61
  # breakpoint()
62
  results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
63
+ batch_size=1, device="mps", use_cache=None, limit=1000, write_out=True)
64
  print('AAA', results["results"])
65
 
66
  breakpoint()
cli/fever-upload-cli.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import glob
4
+ import os
5
+
6
+ import random
7
+ from tqdm import tqdm
8
+
9
+ from datasets import Dataset, DatasetDict, load_dataset
10
+
11
+
12
+ def convert(list_of_dicts):
13
+ res = {}
14
+ for d in list_of_dicts:
15
+ for k, v in d.items():
16
+ res.setdefault(k, []).append(v)
17
+ return res
18
+
19
+
20
+ v10 = load_dataset("fever", "v1.0")
21
+ name_lst = ['train', 'labelled_dev']
22
+
23
+ old_to_new_label_map = {
24
+ 'SUPPORTS': 'supported',
25
+ 'REFUTES': 'refuted'
26
+ }
27
+
28
+ data_map = {}
29
+
30
+ for name in name_lst:
31
+ instance_lst = []
32
+
33
+ for entry in tqdm(v10[name]):
34
+ id_ = entry['id']
35
+ label = entry['label']
36
+ claim = entry['claim']
37
+
38
+ evidence_id = entry['evidence_id']
39
+ evidence_wiki_url = entry['evidence_wiki_url']
40
+
41
+ if evidence_id != -1:
42
+ assert label in {'SUPPORTS', 'REFUTES'}
43
+
44
+ instance = {'id': id_, 'label': old_to_new_label_map[label], 'claim': claim}
45
+ instance_lst.append(instance)
46
+
47
+ key = 'dev' if name in {'labelled_dev'} else name
48
+
49
+ instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d['claim'])
50
+
51
+ label_to_instance_lst = {}
52
+ for e in instance_lst:
53
+ if e['label'] not in label_to_instance_lst:
54
+ label_to_instance_lst[e['label']] = []
55
+ label_to_instance_lst[e['label']].append(e)
56
+
57
+ min_len = min(len(v) for k, v in label_to_instance_lst.items())
58
+
59
+ new_instance_lst = []
60
+ for k in sorted(label_to_instance_lst.keys()):
61
+ new_instance_lst += label_to_instance_lst[k][:min_len]
62
+
63
+ random.Random(42).shuffle(new_instance_lst)
64
+ data_map[key] = new_instance_lst
65
+
66
+ ds_path = 'pminervini/hl-fever'
67
+
68
+ task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
69
+ ds_dict = DatasetDict(task_to_ds_map)
70
+
71
+ ds_dict.push_to_hub(ds_path, "v1.0")
72
+
73
+ # breakpoint()
src/backend/envs.py CHANGED
@@ -46,6 +46,7 @@ class Tasks(Enum):
46
  task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
47
 
48
  # task15 = Task("fever10", "acc", "FEVER", 16)
 
49
 
50
  task16 = Task("squadv2", "exact", "SQuADv2", 4)
51
 
 
46
  task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)
47
 
48
  # task15 = Task("fever10", "acc", "FEVER", 16)
49
+ task15_1 = Task("fever11", "acc", "FEVER", 8)
50
 
51
  task16 = Task("squadv2", "exact", "SQuADv2", 4)
52
 
src/backend/tasks/fever/fever11.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ group: fever
2
+ task: fever11
3
+ dataset_path: pminervini/hl-fever
4
+ dataset_name: v1.0
5
+ output_type: multiple_choice
6
+ training_split: train
7
+ validation_split: dev
8
+ test_split: null
9
+ doc_to_text: "Claim: {{claim}}\nLabel:"
10
+ doc_to_choice: ["supported", "refuted"]
11
+ doc_to_target: label
12
+ metric_list:
13
+ - metric: acc
14
+ aggregation: mean
15
+ higher_is_better: true
16
+ metadata:
17
+ version: 0.0