rahular committed

Commit 3e9388e • 1 Parent(s): 534a3f8

initial commit
Files changed (5):
  1. .gitignore +1 -0
  2. README.md +1 -1
  3. app.py +11 -0
  4. ibleu.py +148 -0
  5. requirements.txt +2 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ .vscode/
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: Ibleu
+ title: iBleu
  emoji: 📊
  colorFrom: red
  colorTo: indigo
app.py ADDED
@@ -0,0 +1,11 @@
+ import sys
+
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+
+
+ sys.path = [p for p in sys.path if p != "/home/user/app"]
+ module = evaluate.load("rahular/ibleu")
+ sys.path = ["/home/user/app"] + sys.path
+
+ launch_gradio_widget(module)
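
Note on app.py: the sys.path shuffle is presumably there so that the Space's local ibleu.py does not shadow the copy of the module that evaluate.load fetches from the Hub; the path is restored afterwards. Once loaded, the module can also be called directly instead of through the Gradio widget; a minimal sketch, with made-up sentences:

    import evaluate

    # load the metric from the Hub (the same call app.py makes)
    module = evaluate.load("rahular/ibleu")
    results = module.compute(
        inputs=["the cat sat on the mat"],
        predictions=["a cat was sitting on the mat"],
        references=[["a cat was sitting on the mat"]],
    )
    print(results["score"])
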
ibleu.py ADDED
@@ -0,0 +1,148 @@
+ """iBLEU metric."""
+
+ import datasets
+ import sacrebleu as scb
+ from packaging import version
+
+ import evaluate
+
+
+ _DESCRIPTION = """
+ iBLEU is a metric for paraphrase generation that rewards similarity to the
+ references while penalizing similarity to the input. It is computed as:
+     iBLEU = alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs)
+ where a higher alpha puts more weight on adequacy (matching the references)
+ and less on novelty (diverging from the input).
+ """
+
+
+ _KWARGS_DESCRIPTION = """
+ Args:
+     inputs (`list` of `str`): Source sentences, one per prediction.
+     predictions (`list` of `str`): Predicted paraphrases.
+     references (`list` of `str` or `list` of `list` of `str`): Reference paraphrases, the same number per prediction.
+     alpha (`float`): Weight of the reference BLEU term; `1 - alpha` weighs the penalty for copying the input. Defaults to 0.7.
+     smooth_method (`str`): Smoothing method passed to sacrebleu. Defaults to "exp".
+     smooth_value (`float`): Smoothing value for the "floor" and "add-k" methods. Defaults to None.
+     force (`bool`): If True, score even when the input looks already tokenized. Defaults to False.
+     lowercase (`bool`): If True, lowercase the data before scoring. Defaults to False.
+     tokenize (`str`): Tokenizer to use; None keeps sacrebleu's default. Defaults to None.
+     use_effective_order (`bool`): If True, only use n-gram orders that have matches when computing BLEU. Defaults to False.
+ Returns:
+     score (`float`): iBLEU score, alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs).
+ Examples:
+     >>> ibleu_metric = evaluate.load("rahular/ibleu")
+     >>> results = ibleu_metric.compute(inputs=["the cat sat on the mat"],
+     ...                                predictions=["a cat was sitting on the mat"],
+     ...                                references=[["a cat was sitting on the mat"]])
+     >>> print(results)  # doctest: +SKIP
+     {'score': ...}
+ """
+
+
+ _CITATION = """
+ @inproceedings{sun-zhou-2012-joint,
+     title = "Joint Learning of a Dual {SMT} System for Paraphrase Generation",
+     author = "Sun, Hong and Zhou, Ming",
+     booktitle = "Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
+     year = "2012",
+ }
+ """
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class ibleu(evaluate.Metric):
+     def _info(self):
+         if version.parse(scb.__version__) < version.parse("1.4.12"):
+             raise ImportWarning(
+                 "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
+                 'You can install it with `pip install "sacrebleu>=1.4.12"`.'
+             )
+         return evaluate.MetricInfo(
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             features=[
+                 datasets.Features(
+                     {
+                         "inputs": datasets.Value("string", id="sequence"),
+                         "predictions": datasets.Value("string", id="sequence"),
+                         "references": datasets.Sequence(
+                             datasets.Value("string", id="sequence"), id="references"
+                         ),
+                     }
+                 ),
+                 datasets.Features(
+                     {
+                         "inputs": datasets.Value("string", id="sequence"),
+                         "predictions": datasets.Value("string", id="sequence"),
+                         "references": datasets.Value("string", id="sequence"),
+                     }
+                 ),
+             ],
+             reference_urls=["https://github.com/mjpost/sacrebleu"],
+         )
+
+     def _compute(
+         self,
+         inputs,
+         predictions,
+         references,
+         alpha=0.7,
+         smooth_method="exp",
+         smooth_value=None,
+         force=False,
+         lowercase=False,
+         tokenize=None,
+         use_effective_order=False,
+     ):
+         # if only one reference is provided make sure we still use list of lists
+         if isinstance(references[0], str):
+             references = [[ref] for ref in references]
+         # sacrebleu expects one stream per reference position, so the single
+         # input per prediction becomes one stream holding all the inputs
+         if isinstance(inputs[0], str):
+             inputs = [inputs]
+         else:
+             raise ValueError("Each prediction must have exactly one input string")
+
+         references_per_prediction = len(references[0])
+         if any(len(refs) != references_per_prediction for refs in references):
+             raise ValueError("Sacrebleu requires the same number of references for each prediction")
+         # transpose: one stream per reference position, each covering all predictions
+         transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
+
+         # BLEU of the predictions against the references (adequacy term)
+         tgt_bleu = scb.corpus_bleu(
+             predictions,
+             transformed_references,
+             smooth_method=smooth_method,
+             smooth_value=smooth_value,
+             force=force,
+             lowercase=lowercase,
+             use_effective_order=use_effective_order,
+             **(dict(tokenize=tokenize) if tokenize else {}),
+         ).score
+         # BLEU of the predictions against the inputs (copying penalty)
+         self_bleu = scb.corpus_bleu(
+             predictions,
+             inputs,
+             smooth_method=smooth_method,
+             smooth_value=smooth_value,
+             force=force,
+             lowercase=lowercase,
+             use_effective_order=use_effective_order,
+             **(dict(tokenize=tokenize) if tokenize else {}),
+         ).score
+         # iBLEU = alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs)
+         output_dict = {"score": alpha * tgt_bleu - (1 - alpha) * self_bleu}
+         return output_dict
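
Note on ibleu.py: the returned score is alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs), so it rewards overlap with the references while penalizing predictions that merely copy the input. With the metric's default settings, the same number can be reproduced with sacrebleu directly; a minimal sketch with toy sentences, assuming sacrebleu >= 1.4.12 as the metric requires:

    import sacrebleu as scb

    alpha = 0.7
    inputs = ["the cat sat on the mat"]              # source sentences
    predictions = ["a cat was sitting on the mat"]   # candidate paraphrases
    references = [["a cat was sitting on the mat"]]  # reference list per prediction

    # corpus_bleu takes one stream per reference position:
    # ref_streams[i][j] is the i-th reference for the j-th prediction
    n_refs = len(references[0])
    ref_streams = [[refs[i] for refs in references] for i in range(n_refs)]

    tgt_bleu = scb.corpus_bleu(predictions, ref_streams).score  # adequacy term
    self_bleu = scb.corpus_bleu(predictions, [inputs]).score    # copying penalty
    print(alpha * tgt_bleu - (1 - alpha) * self_bleu)           # iBLEU
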
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ git+https://github.com/huggingface/evaluate@6abb0d53b82b1e5efea5d683b91d7990a653c78d
+ sacrebleu