initial commit
- .gitignore +1 -0
- README.md +1 -1
- app.py +11 -0
- ibleu.py +148 -0
- requirements.txt +2 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+.vscode/
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: iBleu
 emoji: π
 colorFrom: red
 colorTo: indigo
app.py
ADDED
@@ -0,0 +1,11 @@
+import sys
+
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+sys.path = [p for p in sys.path if p != "/home/user/app"]
+module = evaluate.load("rahular/ibleu")
+sys.path = ["/home/user/app"] + sys.path
+
+launch_gradio_widget(module)
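The sys.path shuffling above appears to keep the Space's local ibleu.py from shadowing the module that evaluate.load("rahular/ibleu") fetches from the Hub; the path is restored before the Gradio widget launches. As a rough headless sanity check outside the widget (a sketch only: the sentences are made up and the Hub download needs network access):

    import evaluate

    # Load the metric the same way app.py does, then score one example directly.
    ibleu = evaluate.load("rahular/ibleu")
    result = ibleu.compute(
        inputs=["how do i reset my password"],
        predictions=["how can i reset my password"],
        references=[["how can i reset my password"]],
    )
    print(result)  # {'score': ...}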
ibleu.py
ADDED
@@ -0,0 +1,148 @@
+"""iBleu metric."""
+
+import datasets
+import sacrebleu as scb
+from packaging import version
+
+import evaluate
+
+
+_DESCRIPTION = """
+iBLEU scores generated text (for example, paraphrases) by rewarding n-gram overlap with the
+references while penalizing n-gram overlap with the input, so that simply copying the input
+does not receive a high score. It is computed from two corpus-level sacrebleu scores as:
+    iBLEU = alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs)
+Where:
+    alpha: weight of the reference BLEU term; the input (self) BLEU term is weighted by 1 - alpha
+"""
+
+
+_KWARGS_DESCRIPTION = """
+Args:
+    inputs (`list` of `str`): Input (source) sentences, one per prediction.
+    predictions (`list` of `str`): Generated sentences to score.
+    references (`list` of `str` or `list` of `list` of `str`): Reference sentences. Every
+        prediction must have the same number of references.
+    alpha (`float`): Weight of the reference BLEU term; the input BLEU term is weighted by
+        `1 - alpha`. Defaults to 0.7.
+    smooth_method (`str`): Smoothing method passed to sacrebleu ('none', 'floor', 'add-k' or 'exp').
+        Defaults to 'exp'.
+    smooth_value (`float`): Smoothing value for the 'floor' and 'add-k' methods. Defaults to None.
+    force (`boolean`): If set to True, skips sacrebleu's check for already-tokenized input.
+        Defaults to False.
+    lowercase (`boolean`): If set to True, lowercases the text before scoring. Defaults to False.
+    tokenize (`str`): Tokenizer to use with sacrebleu; None uses sacrebleu's default. Defaults to None.
+    use_effective_order (`boolean`): If set to True, only n-gram orders that actually occur
+        contribute to the score. Defaults to False.
+Returns:
+    score (`float`): alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs).
+        Higher means the predictions are closer to the references and further from the inputs.
+"""
+
+
+_CITATION = """
+@article{scikit-learn,
+  title={Scikit-learn: Machine Learning in {P}ython},
+  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+  journal={Journal of Machine Learning Research},
+  volume={12},
+  pages={2825--2830},
+  year={2011}
+}
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class ibleu(evaluate.Metric):
+    def _info(self):
+        if version.parse(scb.__version__) < version.parse("1.4.12"):
+            raise ImportWarning(
+                "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
+                'You can install it with `pip install "sacrebleu>=1.4.12"`.'
+            )
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=[
+                datasets.Features(
+                    {
+                        "inputs": datasets.Value("string", id="sequence"),
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Sequence(
+                            datasets.Value("string", id="sequence"), id="references"
+                        ),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "inputs": datasets.Value("string", id="sequence"),
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+            ],
+            reference_urls=[
+                "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"
+            ],
+        )
+
+    def _compute(
+        self,
+        inputs,
+        predictions,
+        references,
+        alpha=0.7,
+        smooth_method="exp",
+        smooth_value=None,
+        force=False,
+        lowercase=False,
+        tokenize=None,
+        use_effective_order=False,
+    ):
+        # if only one reference is provided make sure we still use list of lists
+        if isinstance(references[0], str):
+            references = [[ref] for ref in references]
+        # inputs are one source sentence per prediction; sacrebleu expects reference
+        # streams, so wrap the whole list as a single stream
+        if isinstance(inputs[0], str):
+            inputs = [inputs]
+        else:
+            raise ValueError("Each prediction must have exactly one input string")
+
+        references_per_prediction = len(references[0])
+        if any(len(refs) != references_per_prediction for refs in references):
+            raise ValueError("Sacrebleu requires the same number of references for each prediction")
+        transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
+
+        tgt_bleu = scb.corpus_bleu(
+            predictions,
+            transformed_references,
+            smooth_method=smooth_method,
+            smooth_value=smooth_value,
+            force=force,
+            lowercase=lowercase,
+            use_effective_order=use_effective_order,
+            **(dict(tokenize=tokenize) if tokenize else {}),
+        ).score
+        self_bleu = scb.corpus_bleu(
+            predictions,
+            inputs,
+            smooth_method=smooth_method,
+            smooth_value=smooth_value,
+            force=force,
+            lowercase=lowercase,
+            use_effective_order=use_effective_order,
+            **(dict(tokenize=tokenize) if tokenize else {}),
+        ).score
+        output_dict = {
+            "score": alpha * tgt_bleu - (1 - alpha) * self_bleu
+        }
+        return output_dict
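Because the metric declares two Features layouts, references can be passed either as one string per prediction or as a list of strings per prediction; _compute normalizes the single-string form into a list of lists. The returned score is alpha * tgt_bleu - (1 - alpha) * self_bleu, where tgt_bleu scores predictions against the references and self_bleu scores them against the inputs. A small sketch of both calling conventions, with illustrative sentences only:

    import evaluate

    ibleu = evaluate.load("rahular/ibleu")

    # Single reference per prediction, passed as a plain string.
    print(ibleu.compute(
        inputs=["the meeting was cancelled"],
        predictions=["the meeting got called off"],
        references=["the meeting got called off"],
    ))

    # Multiple references per prediction; a larger alpha weights reference overlap more heavily.
    print(ibleu.compute(
        inputs=["the meeting was cancelled"],
        predictions=["the meeting got called off"],
        references=[["the meeting got called off", "they called off the meeting"]],
        alpha=0.8,
    ))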
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+git+https://github.com/huggingface/evaluate@6abb0d53b82b1e5efea5d683b91d7990a653c78d
+sacrebleu
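requirements.txt pins evaluate to a specific commit and pulls in sacrebleu, which ibleu.py requires at version 1.4.12 or newer; gradio itself is presumably supplied by the Space runtime rather than listed here. A quick import check after installing (a sketch, nothing more):

    import evaluate
    import sacrebleu

    # Confirm the pinned evaluate revision and a sacrebleu >= 1.4.12 are importable.
    print(evaluate.__version__, sacrebleu.__version__)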