pinnnn committed
Commit 1b862fc · 1 Parent(s): a03ab93

fix: Evaluator

Former-commit-id: 8a115c21d0415a5965307307803ac37dd6180cc9

evaluation/evaluation.py CHANGED
@@ -23,6 +23,8 @@ class Evaluator:
             print("gt_s.source_text: ", gt_s.translation)
 
             scores_dict = scorer.get_scores(pred_s.source_text, pred_s.translation, gt_s.translation)
+            print("scores_dict: ", scores_dict)
+
             scores_dict['Source'] = pred_s.source_text
             scores_dict['Prediction'] = pred_s.translation
             scores_dict['Ground Truth'] = gt_s.translation
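
The new print surfaces the full scores_dict before the raw texts are attached. A minimal sketch of the dict shape at that point, assuming scorer is the multi_scores instance defined in scores/multi_scores.py below; the numeric values are illustrative only:

# Hypothetical contents of scores_dict right after scorer.get_scores(...),
# based on the keys returned by multi_scores.get_scores; values are made up.
scores_dict = {
    'bleu_score': 12.3,
    'comet_score': 0.78,
    'llm_score': 80,
    'llm_explanation': 'The predicted answer is partially correct.',
}
# The Evaluator then adds 'Source', 'Prediction' and 'Ground Truth' on top of these keys.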
evaluation/scores/LLM_eval.py CHANGED
@@ -4,6 +4,7 @@
 # Written by Jiaen LIU, 2023/09/18
 
 # Import the necessary packages
+import re
 from langchain.evaluation import load_evaluator, EvaluatorType
 from langchain.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
@@ -46,13 +47,26 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", mod
 # prase the output of the evaluation
 # example :
 # 'value': 'Accuracy: 80. The predicted answer is partially correct. The sentence "这是一个测试句子" translates to "This is a test sentence" in English. However, the original sentence is "This is an test sentences" which is grammatically incorrect in English. The correct translation should be "这是一个测试句子" if we correct the English sentence to "This is a test sentence". Therefore, the predicted answer is not entirely wrong, but it does not match the original sentence exactly due to the grammatical error in the original sentence.'
+# def parse_eval_result(eval_result):
+#     # score = eval_result.score
+#     value = eval_result["value"]
+#     value = value.split("Accuracy: ")[1].split(".")
+#     # combine the rest of the string into the whole explanation
+#     explanation = ".".join(value[1:])
+#     return int(value[0]), explanation
+
 def parse_eval_result(eval_result):
-    # score = eval_result.score
-    value = eval_result["value"]
-    value = value.split("Accuracy: ")[1].split(".")
-    # combine the rest of the string into the whole explanation
-    explanation = ".".join(value[1:])
-    return int(value[0]), explanation
+    # Extract the 'Accuracy' score using a regular expression from the 'reasoning' key
+    accuracy_match = re.search(r'Accuracy: (\d+)', eval_result['reasoning'])
+    if accuracy_match:
+        accuracy = int(accuracy_match.group(1))
+    else:
+        accuracy = 0
+
+    # Directly get the 'Explanation' value from the 'value' key
+    explanation = eval_result['value']
+
+    return accuracy, explanation
 
 def evaluate_prediction(input, reference, prediction, evaluator):
     eval_result = evaluator.evaluate_strings(
@@ -65,5 +79,6 @@ def evaluate_prediction(input, reference, prediction, evaluator):
 if __name__ == "__main__":
     evaluator = init_evaluator()
     # For no input english sentence, just put "" in the input
-    eval_result = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
-    print(eval_result)
+    accuracy, explanation = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
+    print("Accuracy:", accuracy)
+    print("Explanation:", explanation)
evaluation/scores/multi_scores.py CHANGED
@@ -1,6 +1,6 @@
 from comet import download_model, load_from_checkpoint
 from sacrebleu.metrics import BLEU, CHRF, TER
-import LLM_eval
+from scores import LLM_eval
 
 class multi_scores:
     def __init__(self, source_lang="English", target_lang="Chinese", domain="starcraft 2") -> None:
@@ -15,8 +15,8 @@ class multi_scores:
     def get_scores(self, src:str, mt:str, ref:str) -> dict:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
         bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
-        LLM_score = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
-        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':LLM_score[0], 'llm_explanation':LLM_score[1]}
+        llm_score, llm_explanation = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':llm_score, 'llm_explanation': llm_explanation}
 
 if __name__ == "__main__":
     src = "this is an test sentences"