File size: 6,999 Bytes
3e9388e
 
 
 
 
 
 
 
 
e3f9170
 
 
 
 
 
 
 
 
 
 
 
 
 
3e9388e
 
e3f9170
 
 
3e9388e
 
e3f9170
3e9388e
e3f9170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e9388e
e3f9170
3e9388e
e3f9170
 
 
 
 
 
 
3e9388e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""iBleu metric."""

import datasets
import sacrebleu as scb
from packaging import version

import evaluate


_CITATION = """\
@inproceedings{sun-zhou-2012-joint,
    title = "Joint Learning of a Dual {SMT} System for Paraphrase Generation",
    author = "Sun, Hong  and
      Zhou, Ming",
    booktitle = "Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
    month = jul,
    year = "2012",
    address = "Jeju Island, Korea",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/P12-2008",
    pages = "38--42",
}

"""

_DESCRIPTION = """\
iBLEU measures the adequacy and dissimilarity of generated paraphrases.
"""

_KWARGS_DESCRIPTION = """
Produces iBLEU score from an input and a prediction against one or more references.
Args:
    inputs (`list` of `str`): list of model inputs. Each input should be tokenized into a list of tokens.
    predictions (`list` of `str`): list of translations to score. Each translation should be tokenized into a list of tokens.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    alpha (`float`): parameter for balancing between adequacy and dissimilarity; smaller α value indicates larger punishment on self-paraphrase.
    smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are:
        - `'none'`: no smoothing
        - `'floor'`: increment zero counts
        - `'add-k'`: increment num/denom by k for n>1
        - `'exp'`: exponential decay
    smooth_value (`float`): The smoothing value. Only valid when `smooth_method='floor'` (in which case `smooth_value` defaults to `0.1`) or `smooth_method='add-k'` (in which case `smooth_value` defaults to `1`).
    tokenize (`str`): Tokenization method to use for iBLEU. If not provided, defaults to `'zh'` for Chinese, `'ja-mecab'` for Japanese and `'13a'` (mteval) otherwise. Possible values are:
        - `'none'`: No tokenization.
        - `'zh'`: Chinese tokenization.
        - `'13a'`: mimics the `mteval-v13a` script from Moses.
        - `'intl'`: International tokenization, mimics the `mteval-v14` script from Moses
        - `'char'`: Language-agnostic character-level tokenization.
        - `'ja-mecab'`: Japanese tokenization. Uses the [MeCab tokenizer](https://pypi.org/project/mecab-python3).
    lowercase (`bool`): If `True`, lowercases the input, enabling case-insensitivity. Defaults to `False`.
    force (`bool`): If `True`, insists that your tokenized input is actually detokenized. Defaults to `False`.
    use_effective_order (`bool`): If `True`, stops including n-gram orders for which precision is 0. This should be `True`, if sentence-level BLEU will be computed. Defaults to `False`.
Returns:
    'score': iBLEU score,
Examples:
    >>> inputs = ["greetings general kenobi", "foo  foo bar bar"]
    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
    >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]]
    >>> ibleu = evaluate.load("rahular/ibleu")
    >>> results = ibleu.compute(inputs=inputs, predictions=predictions, references=references)
    >>> print(results)
    {'score': 60.41585343630594}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ibleu(evaluate.Metric):
    def _info(self):
        if version.parse(scb.__version__) < version.parse("1.4.12"):
            raise ImportWarning(
                "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
                'You can install it with `pip install "sacrebleu>=1.4.12"`.'
            )
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                datasets.Features(
                    {
                        "inputs": datasets.Value("string", id="sequence"),
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(
                            datasets.Value("string", id="sequence"), id="references"
                        ),
                    }
                ),
                datasets.Features(
                    {
                        "inputs": datasets.Value("string", id="sequence"),
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            reference_urls=[
                "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"
            ],
        )

    def _compute(
        self,
        inputs,
        predictions,
        references,
        alpha=0.7,
        smooth_method="exp",
        smooth_value=None,
        force=False,
        lowercase=False,
        tokenize=None,
        use_effective_order=False,
    ):
        # if only one reference is provided make sure we still use list of lists
        if isinstance(references[0], str):
            references = [[ref] for ref in references]
        # we need to do the same for inputs
        if isinstance(inputs[0], str):
            inputs = [[inp] for inp in inputs]
        else:
            raise ValueError("There can be only one input string")
        
        references_per_prediction = len(references[0])
        if any(len(refs) != references_per_prediction for refs in references):
            raise ValueError("Sacrebleu requires the same number of references for each prediction")
        transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]

        tgt_bleu = scb.corpus_bleu(
            predictions,
            transformed_references,
            smooth_method=smooth_method,
            smooth_value=smooth_value,
            force=force,
            lowercase=lowercase,
            use_effective_order=use_effective_order,
            **(dict(tokenize=tokenize) if tokenize else {}),
        ).score
        self_bleu = scb.corpus_bleu(
            predictions,
            inputs,
            smooth_method=smooth_method,
            smooth_value=smooth_value,
            force=force,
            lowercase=lowercase,
            use_effective_order=use_effective_order,
            **(dict(tokenize=tokenize) if tokenize else {}),
        ).score
        output_dict = {
            "score": alpha * tgt_bleu - (1 - alpha) * self_bleu
        }
        return output_dict