fix: Evaluator
Former-commit-id: 8a115c21d0415a5965307307803ac37dd6180cc9
evaluation/evaluation.py
CHANGED
@@ -23,6 +23,8 @@ class Evaluator:
         print("gt_s.source_text: ", gt_s.translation)
 
         scores_dict = scorer.get_scores(pred_s.source_text, pred_s.translation, gt_s.translation)
+        print("scores_dict: ", scores_dict)
+
         scores_dict['Source'] = pred_s.source_text
         scores_dict['Prediction'] = pred_s.translation
         scores_dict['Ground Truth'] = gt_s.translation
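To make the new debug print concrete, here is a hedged sketch of the dictionary it would emit once get_scores returns and the three bookkeeping keys are attached. The key names come from the diffs in this commit; every value below is an invented placeholder, not real metric output.

# Illustrative only: the values are made up, not real metric output.
scores_dict = {
    'bleu_score': 25.3,            # corpus-level BLEU from sacrebleu
    'comet_score': 0.82,           # segment score from the COMET model
    'llm_score': 80,               # accuracy parsed from the LLM evaluator's reasoning
    'llm_explanation': '...',      # free-text explanation from the LLM evaluator
    'Source': 'this is an test sentences',
    'Prediction': '这是一个测试句子。',
    'Ground Truth': '这不是一个测试语句。',
}
print("scores_dict: ", scores_dict)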
evaluation/scores/LLM_eval.py
CHANGED
@@ -4,6 +4,7 @@
 # Written by Jiaen LIU, 2023/09/18
 
 # Import the necessary packages
+import re
 from langchain.evaluation import load_evaluator, EvaluatorType
 from langchain.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
@@ -46,13 +47,26 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", mod
 # prase the output of the evaluation
 # example :
 # 'value': 'Accuracy: 80. The predicted answer is partially correct. The sentence "这是一个测试句子" translates to "This is a test sentence" in English. However, the original sentence is "This is an test sentences" which is grammatically incorrect in English. The correct translation should be "这是一个测试句子" if we correct the English sentence to "This is a test sentence". Therefore, the predicted answer is not entirely wrong, but it does not match the original sentence exactly due to the grammatical error in the original sentence.'
+# def parse_eval_result(eval_result):
+#     # score = eval_result.score
+#     value = eval_result["value"]
+#     value = value.split("Accuracy: ")[1].split(".")
+#     # combine the rest of the string into the whole explanation
+#     explanation = ".".join(value[1:])
+#     return int(value[0]), explanation
+
 def parse_eval_result(eval_result):
-    # score = eval_result.score
-    value = eval_result["value"]
-    value = value.split("Accuracy: ")[1].split(".")
-    # combine the rest of the string into the whole explanation
-    explanation = ".".join(value[1:])
-    return int(value[0]), explanation
+    # Extract the 'Accuracy' score using a regular expression from the 'reasoning' key
+    accuracy_match = re.search(r'Accuracy: (\d+)', eval_result['reasoning'])
+    if accuracy_match:
+        accuracy = int(accuracy_match.group(1))
+    else:
+        accuracy = 0
+
+    # Directly get the 'Explanation' value from the 'value' key
+    explanation = eval_result['value']
+
+    return accuracy, explanation
 
 def evaluate_prediction(input, reference, prediction, evaluator):
     eval_result = evaluator.evaluate_strings(
@@ -65,5 +79,6 @@ def evaluate_prediction(input, reference, prediction, evaluator):
 if __name__ == "__main__":
     evaluator = init_evaluator()
     # For no input english sentence, just put "" in the input
-
-    print(
+    accuracy, explanation = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
+    print("Accuracy:", accuracy)
+    print("Explanation:", explanation)
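As a sanity check on the new parser, the snippet below feeds parse_eval_result a hand-built dict shaped like the example in the comments above; the 'reasoning' and 'value' strings are shortened placeholders, not real evaluator output.

# Hypothetical eval_result carrying the two keys parse_eval_result reads.
sample = {
    'reasoning': 'Accuracy: 80. The predicted answer is partially correct.',
    'value': 'The predicted answer is partially correct.',
}
accuracy, explanation = parse_eval_result(sample)
print(accuracy)     # 80
print(explanation)  # The predicted answer is partially correct.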
evaluation/scores/multi_scores.py
CHANGED
@@ -1,6 +1,6 @@
 from comet import download_model, load_from_checkpoint
 from sacrebleu.metrics import BLEU, CHRF, TER
-import LLM_eval
+from scores import LLM_eval
 
 class multi_scores:
     def __init__(self, source_lang="English", target_lang="Chinese", domain="starcraft 2") -> None:
@@ -15,8 +15,8 @@ class multi_scores:
     def get_scores(self, src:str, mt:str, ref:str) -> dict:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
         bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
-
-        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':
+        llm_score, llm_explanation = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':llm_score, 'llm_explanation': llm_explanation}
 
 if __name__ == "__main__":
     src = "this is an test sentences"
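For reference, a minimal usage sketch of the combined scorer after this change; the import path and the mt/ref strings are assumptions borrowed from the surrounding files, and actually running it needs the COMET checkpoint download plus OpenAI credentials for the LLM evaluator.

# Assumed import path: run from the evaluation/ directory, mirroring "from scores import LLM_eval".
from scores.multi_scores import multi_scores

scorer = multi_scores(source_lang="English", target_lang="Chinese", domain="starcraft 2")
scores_dict = scorer.get_scores(
    src="this is an test sentences",  # source sentence from the __main__ block above
    mt="这是一个测试句子。",            # candidate translation to grade (placeholder)
    ref="这不是一个测试语句。",          # reference translation (placeholder)
)
# Expected keys after this commit: bleu_score, comet_score, llm_score, llm_explanation
print(scores_dict)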