Vallp committed on
Commit
b35923b
1 Parent(s): 955bf73

Create ter.py

Files changed (1)
  1. ter.py +203 -0
ter.py ADDED
@@ -0,0 +1,203 @@
+ # Copyright 2021 The HuggingFace Evaluate Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ TER metric as available in sacrebleu. """
+ import datasets
+ import sacrebleu as scb
+ from packaging import version
+ from sacrebleu import TER
+
+ import evaluate
+
+
+ _CITATION = """\
+ @inproceedings{snover-etal-2006-study,
+     title = "A Study of Translation Edit Rate with Targeted Human Annotation",
+     author = "Snover, Matthew and
+       Dorr, Bonnie and
+       Schwartz, Rich and
+       Micciulla, Linnea and
+       Makhoul, John",
+     booktitle = "Proceedings of the 7th Conference of the Association for Machine Translation in the Americas: Technical Papers",
+     month = aug # " 8-12",
+     year = "2006",
+     address = "Cambridge, Massachusetts, USA",
+     publisher = "Association for Machine Translation in the Americas",
+     url = "https://aclanthology.org/2006.amta-papers.25",
+     pages = "223--231",
+ }
+ @inproceedings{post-2018-call,
+     title = "A Call for Clarity in Reporting {BLEU} Scores",
+     author = "Post, Matt",
+     booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+     month = oct,
+     year = "2018",
+     address = "Belgium, Brussels",
+     publisher = "Association for Computational Linguistics",
+     url = "https://www.aclweb.org/anthology/W18-6319",
+     pages = "186--191",
+ }
+ """
+
+ _DESCRIPTION = """\
+ TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a
+ hypothesis requires to match a reference translation. We use the implementation that is already present in sacrebleu
+ (https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the TERCOM implementation, which can be found
+ here: https://github.com/jhclark/tercom.
+ The implementation here is slightly different from sacrebleu in terms of the required input format. The lengths of
+ the references and hypotheses lists need to be the same, so you may need to transpose your references compared to
+ sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
+ See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information.
+ """
+
+ _KWARGS_DESCRIPTION = """
+ Produces TER scores alongside the number of edits and reference length.
+ Args:
+     predictions (list of str): The system stream (a sequence of segments).
+     references (list of list of str): A list of one or more reference streams (each a sequence of segments).
+     normalized (boolean): If `True`, applies basic tokenization and normalization to sentences. Defaults to `False`.
+     ignore_punct (boolean): If `True`, removes punctuation from predictions and references before scoring. Defaults to `False`.
+     support_zh_ja_chars (boolean): If `True`, tokenization/normalization supports processing of Chinese characters,
+                                    as well as Japanese Kanji, Hiragana, Katakana, and Phonetic Extensions of Katakana.
+                                    Only applies if `normalized = True`. Defaults to `False`.
+     case_sensitive (boolean): If `False`, makes all predictions and references lowercase to ignore differences in case. Defaults to `False`.
+ Returns:
+     'score' (float): TER score (num_edits / sum_ref_lengths * 100)
+     'num_edits' (int): The cumulative number of edits
+     'ref_length' (float): The cumulative average reference length
+ Examples:
+     Example 1:
+         >>> predictions = ["does this sentence match??",
+         ...                "what about this sentence?",
+         ...                "What did the TER metric user say to the developer?"]
+         >>> references = [["does this sentence match", "does this sentence match!?!"],
+         ...               ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
+         ...               ["Your jokes are...", "...TERrible"]]
+         >>> ter = evaluate.load("ter")
+         >>> results = ter.compute(predictions=predictions,
+         ...                       references=references,
+         ...                       case_sensitive=True)
+         >>> print(results)
+         {'score': 150.0, 'num_edits': 15, 'ref_length': 10.0}
+     Example 2:
+         >>> predictions = ["does this sentence match??",
+         ...                "what about this sentence?"]
+         >>> references = [["does this sentence match", "does this sentence match!?!"],
+         ...               ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+         >>> ter = evaluate.load("ter")
+         >>> results = ter.compute(predictions=predictions,
+         ...                       references=references,
+         ...                       case_sensitive=True)
+         >>> print(results)
+         {'score': 62.5, 'num_edits': 5, 'ref_length': 8.0}
+     Example 3:
+         >>> predictions = ["does this sentence match??",
+         ...                "what about this sentence?"]
+         >>> references = [["does this sentence match", "does this sentence match!?!"],
+         ...               ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+         >>> ter = evaluate.load("ter")
+         >>> results = ter.compute(predictions=predictions,
+         ...                       references=references,
+         ...                       normalized=True,
+         ...                       case_sensitive=True)
+         >>> print(results)
+         {'score': 57.14285714285714, 'num_edits': 6, 'ref_length': 10.5}
+     Example 4:
+         >>> predictions = ["does this sentence match??",
+         ...                "what about this sentence?"]
+         >>> references = [["does this sentence match", "does this sentence match!?!"],
+         ...               ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+         >>> ter = evaluate.load("ter")
+         >>> results = ter.compute(predictions=predictions,
+         ...                       references=references,
+         ...                       ignore_punct=True,
+         ...                       case_sensitive=False)
+         >>> print(results)
+         {'score': 0.0, 'num_edits': 0, 'ref_length': 8.0}
+     Example 5:
+         >>> predictions = ["does this sentence match??",
+         ...                "what about this sentence?",
+         ...                "What did the TER metric user say to the developer?"]
+         >>> references = [["does this sentence match", "does this sentence match!?!"],
+         ...               ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
+         ...               ["Your jokes are...", "...TERrible"]]
+         >>> ter = evaluate.load("ter")
+         >>> results = ter.compute(predictions=predictions,
+         ...                       references=references,
+         ...                       ignore_punct=True,
+         ...                       case_sensitive=False)
+         >>> print(results)
+         {'score': 100.0, 'num_edits': 10, 'ref_length': 10.0}
+ """
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class Ter(evaluate.Metric):
+     def _info(self):
+         if version.parse(scb.__version__) < version.parse("1.4.12"):
+             raise ImportWarning(
+                 "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
+                 'You can install it with `pip install "sacrebleu>=1.4.12"`.'
+             )
+         return evaluate.MetricInfo(
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             homepage="http://www.cs.umd.edu/~snover/tercom/",
+             inputs_description=_KWARGS_DESCRIPTION,
+             features=[
+                 datasets.Features(
+                     {
+                         "predictions": datasets.Value("string", id="sequence"),
+                         "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+                     }
+                 ),
+                 datasets.Features(
+                     {
+                         "predictions": datasets.Value("string", id="sequence"),
+                         "references": datasets.Value("string", id="sequence"),
+                     }
+                 ),
+             ],
+             codebase_urls=["https://github.com/mjpost/sacreBLEU#ter"],
+             reference_urls=[
+                 "https://github.com/jhclark/tercom",
+             ],
+         )
+
+     def _compute(
+         self,
+         predictions,
+         references,
+         normalized: bool = False,
+         ignore_punct: bool = False,
+         support_zh_ja_chars: bool = False,
+         case_sensitive: bool = False,
+     ):
+         # if only one reference is provided make sure we still use list of lists
+         if isinstance(references[0], str):
+             references = [[ref] for ref in references]
+
+         references_per_prediction = len(references[0])
+         if any(len(refs) != references_per_prediction for refs in references):
+             raise ValueError("Sacrebleu requires the same number of references for each prediction")
+         transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
+
+         sb_ter = TER(
+             normalized=normalized,
+             no_punct=ignore_punct,
+             asian_support=support_zh_ja_chars,
+             case_sensitive=case_sensitive,
+         )
+         output = sb_ter.corpus_score(predictions, transformed_references)
+
+         return {"score": output.score, "num_edits": output.num_edits, "ref_length": output.ref_length}
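A note on the input-format point from `_DESCRIPTION`: this module takes one list of candidate references per prediction and `_compute` transposes that into sacrebleu's layout of one reference stream per position before calling `TER.corpus_score`. A minimal usage sketch, mirroring Example 2 from the docstring above (assumes `evaluate` and `sacrebleu>=1.4.12` are installed):

import evaluate

# Two predictions, each paired with its own list of two candidate references
# (the per-prediction layout this module expects).
predictions = ["does this sentence match??", "what about this sentence?"]
references = [
    ["does this sentence match", "does this sentence match!?!"],
    ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
]

ter = evaluate.load("ter")
print(ter.compute(predictions=predictions, references=references, case_sensitive=True))
# {'score': 62.5, 'num_edits': 5, 'ref_length': 8.0}

# Internally, `_compute` transposes `references` into sacrebleu's layout of one
# stream per reference position, each stream aligned with `predictions`:
# [["does this sentence match", "wHaT aBoUt ThIs SeNtEnCe?"],
#  ["does this sentence match!?!", "wHaT aBoUt ThIs SeNtEnCe?"]]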