Spaces:

maysonma
/

lingo_judge_metric

Runtime error

maysonma committed on May 16, 2024

Commit

264b095

1 Parent(s): abee4d2

update output format.

Files changed (1) hide show

lingo_judge_metric.py CHANGED Viewed

@@ -30,7 +30,10 @@ Args:
     `references` (list of list of str): Multiple references per question.
 Returns:
-    `scores` (list of float): Score indicating truthfulness.
 Examples:
     >>> metric = evaluate.load("maysonma/lingo_judge_metric")
@@ -74,4 +77,12 @@ class LingoJudgeMetric(evaluate.Metric):
     def _compute(self, questions, predictions, references):
         """Returns the scores"""
         scores = self.scorer.compute(questions, references, predictions)
-        return scores.cpu().tolist()

     `references` (list of list of str): Multiple references per question.
 Returns:
+    `score` (list of float): Lingo-Judge score.
+    `probability` (list of float): Probability of the prediction being correct.
+    `correct` (list of bool): Whether the prediction is correct.
+    `benchmark_score` (float): Benchmark score.
 Examples:
     >>> metric = evaluate.load("maysonma/lingo_judge_metric")
     def _compute(self, questions, predictions, references):
         """Returns the scores"""
         scores = self.scorer.compute(questions, references, predictions)
+        probability = torch.sigmoid(scores)
+        correct = scores > 0.0
+        benchmark_score = float(torch.sum(correct).item() / len(correct))
+        return {
+            "score": scores.cpu().tolist(),
+            "probability": probability.cpu().tolist(),
+            "correct": correct.cpu().tolist(),
+            "benchmark_score": benchmark_score,
+        }