pminervini committed
Commit fd975b0
1 Parent(s): 19d09c1
src/backend/tasks/cnndm/task.py CHANGED
@@ -2,8 +2,61 @@ from lm_eval.api.task import Task
 from lm_eval.api.instance import Instance
 from lm_eval.api.registry import register_task
 from lm_eval.api.metrics import mean
-import datasets
-from src.backend.tasks.cnndm import utils
+
+import torch
+import sacrebleu
+from rouge_score import rouge_scorer, scoring
+
+
+def bleu(refs, preds):
+    """
+    Returns `t5` style BLEU scores. See the related implementation:
+    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41
+
+    :param refs:
+        A `list` of `list` of reference `str`s.
+    :param preds:
+        A `list` of predicted `str`s.
+    """
+    score = sacrebleu.corpus_bleu(
+        preds,
+        refs,
+        smooth_method="exp",
+        smooth_value=0.0,
+        force=False,
+        lowercase=False,
+        tokenize="intl",
+        use_effective_order=False,
+    ).score
+    return score
+
+
+def rouge(refs, preds):
+    """
+    Returns `t5` style ROUGE scores. See the related implementation:
+    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
+
+    :param refs:
+        A `list` of reference `strs`.
+    :param preds:
+        A `list` of predicted `strs`.
+    """
+    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
+    scorer = rouge_scorer.RougeScorer(rouge_types)
+    # Add newlines between sentences to correctly compute `rougeLsum`.
+
+    def _prepare_summary(summary):
+        summary = summary.replace(" . ", ".\n")
+        return summary
+
+    # Accumulate confidence intervals.
+    aggregator = scoring.BootstrapAggregator()
+    for ref, pred in zip(refs, preds):
+        ref = _prepare_summary(ref)
+        pred = _prepare_summary(pred)
+        aggregator.add_scores(scorer.score(ref, pred))
+    result = aggregator.aggregate()
+    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
 
 
 @register_task("cnndm")
@@ -14,7 +67,14 @@ class CnnDm(Task):
 
     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
         super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
-        print('XXX CNNDM!')
+        self.factkb_tokenizer = None
+        self.factkb_model = None
+
+    def maybe_init_factkb(self):
+        if self.factkb_tokenizer is None or self.factkb_model is None:
+            from transformers import AutoTokenizer, AutoModelForSequenceClassification
+            self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
+            self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
 
     def has_training_docs(self):
         return True
@@ -63,14 +123,44 @@ class CnnDm(Task):
             Instance(
                 request_type="generate_until",
                 doc=doc,
-                arguments=(ctx, {"until": ["\n", "."]}),
+                arguments=(ctx, {"until": ["\n"]}),
                 idx=0,
                 **kwargs
             )
         ]
 
     def process_results(self, doc, results):
-        return utils.process_results(doc, results)
+        completion = results[0]
+        # true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
+        # all_refs = true_refs + false_refs
+
+        document = doc["article"]
+        true_refs = [doc["highlights"]]
+        all_refs = true_refs
+
+        # ROUGE-N
+        rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
+        # ROUGE-1
+        rouge1_scores = [score["rouge1"] for score in rouge_scores]
+        # ROUGE-2
+        rouge2_scores = [score["rouge2"] for score in rouge_scores]
+        # ROUGE-L
+        rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
+
+        self.maybe_init_factkb()
+        input_factkb = [[completion, document]]
+        factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
+        factkb_logits = self.factkb_model(**factkb_tokens).logits
+        factkb_res = torch.softmax(factkb_logits, dim=1)
+
+        res = {
+            "rouge1": rouge1_scores[0],
+            "rouge2": rouge2_scores[0],
+            "rougeL": rougeL_scores[0],
+            "factKB": float(factkb_res[0][1])
+        }
+
+        return res
 
     def aggregation(self):
        """
src/backend/tasks/cnndm/utils.py DELETED
@@ -1,89 +0,0 @@
-import sacrebleu
-import numpy as np
-
-from rouge_score import rouge_scorer, scoring
-
-
-def process_results(doc, results):
-    # (Pdb) doc.keys()
-    # dict_keys(['document', 'summary', 'id'])
-    # (Pdb++) results
-    # [' The Welsh Government has announced
-
-    # breakpoint()
-
-    completion = results[0]
-    # true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
-    # all_refs = true_refs + false_refs
-
-    document = doc["article"]
-    true_refs = [doc["highlights"]]
-    all_refs = true_refs
-
-    # ROUGE-N
-    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
-    # ROUGE-1
-    rouge1_scores = [score["rouge1"] for score in rouge_scores]
-    # ROUGE-2
-    rouge2_scores = [score["rouge2"] for score in rouge_scores]
-    # ROUGE-L
-    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
-
-    res = {
-        "rouge1": rouge1_scores[0],
-        "rouge2": rouge2_scores[0],
-        "rougeL": rougeL_scores[0],
-    }
-
-    return res
-
-
-def bleu(refs, preds):
-    """
-    Returns `t5` style BLEU scores. See the related implementation:
-    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41
-
-    :param refs:
-        A `list` of `list` of reference `str`s.
-    :param preds:
-        A `list` of predicted `str`s.
-    """
-    score = sacrebleu.corpus_bleu(
-        preds,
-        refs,
-        smooth_method="exp",
-        smooth_value=0.0,
-        force=False,
-        lowercase=False,
-        tokenize="intl",
-        use_effective_order=False,
-    ).score
-    return score
-
-
-def rouge(refs, preds):
-    """
-    Returns `t5` style ROUGE scores. See the related implementation:
-    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68
-
-    :param refs:
-        A `list` of reference `strs`.
-    :param preds:
-        A `list` of predicted `strs`.
-    """
-    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
-    scorer = rouge_scorer.RougeScorer(rouge_types)
-    # Add newlines between sentences to correctly compute `rougeLsum`.
-
-    def _prepare_summary(summary):
-        summary = summary.replace(" . ", ".\n")
-        return summary
-
-    # Accumulate confidence intervals.
-    aggregator = scoring.BootstrapAggregator()
-    for ref, pred in zip(refs, preds):
-        ref = _prepare_summary(ref)
-        pred = _prepare_summary(pred)
-        aggregator.add_scores(scorer.score(ref, pred))
-    result = aggregator.aggregate()
-    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
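
The deleted helpers are not lost: bleu and rouge now live in task.py with the same signatures, and only the old process_results wrapper is gone. As a quick sanity check (illustrative only, with made-up strings, and again assuming the repo root is on PYTHONPATH), they can be exercised directly:

    from src.backend.tasks.cnndm.task import bleu, rouge  # assumed import path

    preds = ["A cat sat on the mat ."]
    refs = ["The cat was sitting on the mat ."]

    print(rouge(refs, preds))   # {"rouge1": ..., "rouge2": ..., "rougeLsum": ...}, F-measure * 100
    print(bleu([refs], preds))  # corpus-level sacrebleu score as a single float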