Spaces:
Sleeping
Sleeping
update output format.
Browse files - lingo_judge_metric.py +13 -2
lingo_judge_metric.py
CHANGED
@@ -30,7 +30,10 @@ Args:
|
|
30 |
`references` (list of list of str): Multiple references per question.
|
31 |
|
32 |
Returns:
|
33 |
-
`
|
|
|
|
|
|
|
34 |
|
35 |
Examples:
|
36 |
>>> metric = evaluate.load("maysonma/lingo_judge_metric")
|
@@ -74,4 +77,12 @@ class LingoJudgeMetric(evaluate.Metric):
|
|
74 |
def _compute(self, questions, predictions, references):
|
75 |
"""Returns the scores"""
|
76 |
scores = self.scorer.compute(questions, references, predictions)
|
77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
`references` (list of list of str): Multiple references per question.
|
31 |
|
32 |
Returns:
|
33 |
+
`score` (list of float): Lingo-Judge score.
|
34 |
+
`probability` (list of float): Probability of the prediction being correct.
|
35 |
+
`correct` (list of bool): Whether the prediction is correct.
|
36 |
+
`benchmark_score` (float): Benchmark score.
|
37 |
|
38 |
Examples:
|
39 |
>>> metric = evaluate.load("maysonma/lingo_judge_metric")
|
|
|
77 |
def _compute(self, questions, predictions, references):
|
78 |
"""Returns the scores"""
|
79 |
scores = self.scorer.compute(questions, references, predictions)
|
80 |
+
probability = torch.sigmoid(scores)
|
81 |
+
correct = scores > 0.0
|
82 |
+
benchmark_score = float(torch.sum(correct).item() / len(correct))
|
83 |
+
return {
|
84 |
+
"score": scores.cpu().tolist(),
|
85 |
+
"probability": probability.cpu().tolist(),
|
86 |
+
"correct": correct.cpu().tolist(),
|
87 |
+
"benchmark_score": benchmark_score,
|
88 |
+
}
|