pminervini committed on
Commit
7a6df75
·
1 Parent(s): f69cf8f
src/backend/tasks/xsum/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # XSum
2
+
3
+ ### Paper
4
+
5
+ Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD`
6
+ Abstract: https://arxiv.org/abs/1806.03822
7
+
8
+ Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
9
+ consisting of questions posed by crowdworkers on a set of Wikipedia articles,
10
+ where the answer to every question is a segment of text, or span, from the
11
+ corresponding reading passage, or the question might be unanswerable.
12
+ SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
13
+ questions written adversarially by crowdworkers to look similar to answerable ones.
14
+ To do well on SQuAD2.0, systems must not only answer questions when possible, but
15
+ also determine when no answer is supported by the paragraph and abstain from answering.
16
+
17
+ Homepage: https://rajpurkar.github.io/SQuAD-explorer/
18
+
19
+
20
+ ### Citation
21
+
22
+ ```
23
+ @misc{rajpurkar2018know,
24
+ title={Know What You Don't Know: Unanswerable Questions for SQuAD},
25
+ author={Pranav Rajpurkar and Robin Jia and Percy Liang},
26
+ year={2018},
27
+ eprint={1806.03822},
28
+ archivePrefix={arXiv},
29
+ primaryClass={cs.CL}
30
+ }
31
+ ```
32
+
33
+ ### Groups and Tasks
34
+
35
+ #### Groups
36
+
37
+ * Not part of a group yet
38
+
39
+ #### Tasks
40
+
41
+ * `xsum`: `Default xsum task`
42
+
43
+ ### Checklist
44
+
45
+ For adding novel benchmarks/datasets to the library:
46
+ * [ ] Is the task an existing benchmark in the literature?
47
+ * [ ] Have you referenced the original paper that introduced the task?
48
+ * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
49
+
50
+
51
+ If other tasks on this dataset are already supported:
52
+ * [ ] Is the "Main" variant of this task clearly denoted?
53
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
54
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
src/backend/tasks/xsum/task.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lm_eval.api.task import Task
2
+ from lm_eval.api.instance import Instance
3
+ from lm_eval.api.registry import register_task
4
+ from lm_eval.api.metrics import mean
5
+
6
+ from src.backend.tasks.xsum import utils
7
+
8
+
9
@register_task("xsum")
class XSum(Task):
    """Summarisation task backed by the ``EdinburghNLP/xsum`` dataset.

    Each document is rendered as ``Document: <document>\\nSummary:`` and the
    model is asked to generate the continuation.  Scoring is delegated to
    ``utils.process_results``; the per-document scores are averaged for the
    ``rouge1``, ``rouge2`` and ``rougeL`` sub-metrics.
    """

    VERSION = 0
    DATASET_PATH = "EdinburghNLP/xsum"
    DATASET_NAME = None

    # Sub-metric names shared by `aggregation` and `higher_is_better`.
    # Presumably these match the keys returned by `utils.process_results`
    # -- TODO confirm against that helper.
    _METRIC_NAMES = ("rouge1", "rouge2", "rougeL")

    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
        """Initialise the task by forwarding every argument to ``Task.__init__``."""
        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
        # Function-scope import so this block does not depend on the
        # module-level import list.
        import logging

        # Library code should not write to stdout; keep the trace at debug level.
        logging.getLogger(__name__).debug("Initialised XSum task")

    def has_training_docs(self):
        """Return True: the ``train`` split is used by `training_docs`."""
        return True

    def has_validation_docs(self):
        """Return True: the ``validation`` split is used by `validation_docs`."""
        return True

    def has_test_docs(self):
        """Return True: the ``test`` split is used by `test_docs`."""
        return True

    def training_docs(self):
        """Return the ``train`` split of the loaded dataset."""
        return self.dataset["train"]

    def validation_docs(self):
        """Return the ``validation`` split of the loaded dataset."""
        return self.dataset["validation"]

    def test_docs(self):
        """Return the ``test`` split of the loaded dataset."""
        return self.dataset["test"]

    def doc_to_text(self, doc):
        """Build the prompt: the source document followed by a ``Summary:`` cue."""
        return f'Document: {doc["document"]}\nSummary:'

    @staticmethod
    def should_decontaminate():
        """Return True: this task provides a decontamination query."""
        return True

    def doc_to_decontamination_query(self, doc):
        """Return the raw source document as the decontamination query."""
        return doc["document"]

    def doc_to_target(self, doc):
        """Return the reference summary for ``doc``."""
        # NOTE(review): no leading space is added, so if the prompt and target
        # are concatenated directly the result reads "Summary:<summary>".
        # Confirm whether a separating space is wanted.
        return doc["summary"]

    def construct_requests(self, doc, ctx, **kwargs):
        """Build the single generation request sent to the LM for ``doc``.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :returns: list
            A one-element list holding a ``generate_until`` ``Instance``.
        """
        # The "until" entries are "\n" and "." -- presumably stop sequences
        # that end generation at the first newline or period; confirm
        # against the LM backend.
        generation_kwargs = {"until": ["\n", "."]}
        request = Instance(
            request_type="generate_until",
            doc=doc,
            arguments=(ctx, generation_kwargs),
            idx=0,
            **kwargs,
        )
        return [request]

    def process_results(self, doc, results):
        """Score the model output for ``doc`` by delegating to ``utils.process_results``."""
        return utils.process_results(doc, results)

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {name: mean for name in self._METRIC_NAMES}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {name: True for name in self._METRIC_NAMES}
90
+
src/backend/tasks/xsum/xsum.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ task:
2
+ - xsum