maysonma committed on
Commit
264b095
1 Parent(s): abee4d2

Update output format.

Browse files
Files changed (1) hide show
  1. lingo_judge_metric.py +13 -2
lingo_judge_metric.py CHANGED
@@ -30,7 +30,10 @@ Args:
30
  `references` (list of list of str): Multiple references per question.
31
 
32
  Returns:
33
- `scores` (list of float): Score indicating truthfulness.
 
 
 
34
 
35
  Examples:
36
  >>> metric = evaluate.load("maysonma/lingo_judge_metric")
@@ -74,4 +77,12 @@ class LingoJudgeMetric(evaluate.Metric):
74
  def _compute(self, questions, predictions, references):
75
  """Returns the scores"""
76
  scores = self.scorer.compute(questions, references, predictions)
77
- return scores.cpu().tolist()
 
 
 
 
 
 
 
 
 
30
  `references` (list of list of str): Multiple references per question.
31
 
32
  Returns:
33
+ `score` (list of float): Lingo-Judge score.
34
+ `probability` (list of float): Probability of the prediction being correct.
35
+ `correct` (list of bool): Whether the prediction is correct.
36
+ `benchmark_score` (float): Benchmark score.
37
 
38
  Examples:
39
  >>> metric = evaluate.load("maysonma/lingo_judge_metric")
 
77
  def _compute(self, questions, predictions, references):
78
  """Returns the scores"""
79
  scores = self.scorer.compute(questions, references, predictions)
80
+ probability = torch.sigmoid(scores)
81
+ correct = scores > 0.0
82
+ benchmark_score = float(torch.sum(correct).item() / len(correct))
83
+ return {
84
+ "score": scores.cpu().tolist(),
85
+ "probability": probability.cpu().tolist(),
86
+ "correct": correct.cpu().tolist(),
87
+ "benchmark_score": benchmark_score,
88
+ }