fix: Evaluator
Former-commit-id: 8a115c21d0415a5965307307803ac37dd6180cc9
evaluation/evaluation.py
CHANGED
@@ -23,6 +23,8 @@ class Evaluator:
         print("gt_s.source_text: ", gt_s.translation)
 
         scores_dict = scorer.get_scores(pred_s.source_text, pred_s.translation, gt_s.translation)
+        print("scores_dict: ", scores_dict)
+
         scores_dict['Source'] = pred_s.source_text
         scores_dict['Prediction'] = pred_s.translation
         scores_dict['Ground Truth'] = gt_s.translation
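To make the new debug print concrete, here is a hedged sketch of the dictionary it would emit once get_scores returns and the three bookkeeping keys are attached. The key names come from the diffs in this commit; every value below is an invented placeholder, not real metric output.

# Illustrative only: the values are made up, not real metric output.
scores_dict = {
    'bleu_score': 25.3,            # corpus-level BLEU from sacrebleu
    'comet_score': 0.82,           # segment score from the COMET model
    'llm_score': 80,               # accuracy parsed from the LLM evaluator's reasoning
    'llm_explanation': '...',      # free-text explanation from the LLM evaluator
    'Source': 'this is an test sentences',
    'Prediction': '这是一个测试句子。',
    'Ground Truth': '这不是一个测试语句。',
}
print("scores_dict: ", scores_dict)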
evaluation/scores/LLM_eval.py
CHANGED
@@ -4,6 +4,7 @@
 # Written by Jiaen LIU, 2023/09/18
 
 # Import the necessary packages
+import re
 from langchain.evaluation import load_evaluator, EvaluatorType
 from langchain.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
@@ -46,13 +47,26 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", mod
 # prase the output of the evaluation
 # example :
 # 'value': 'Accuracy: 80. The predicted answer is partially correct. The sentence "这是一个测试句子" translates to "This is a test sentence" in English. However, the original sentence is "This is an test sentences" which is grammatically incorrect in English. The correct translation should be "这是一个测试句子" if we correct the English sentence to "This is a test sentence". Therefore, the predicted answer is not entirely wrong, but it does not match the original sentence exactly due to the grammatical error in the original sentence.'
+# def parse_eval_result(eval_result):
+#     # score = eval_result.score
+#     value = eval_result["value"]
+#     value = value.split("Accuracy: ")[1].split(".")
+#     # combine the rest of the string into the whole explanation
+#     explanation = ".".join(value[1:])
+#     return int(value[0]), explanation
+
 def parse_eval_result(eval_result):
-    # score = eval_result.score
-    value = eval_result["value"]
-    value = value.split("Accuracy: ")[1].split(".")
-    # combine the rest of the string into the whole explanation
-    explanation = ".".join(value[1:])
-    return int(value[0]), explanation
+    # Extract the 'Accuracy' score using a regular expression from the 'reasoning' key
+    accuracy_match = re.search(r'Accuracy: (\d+)', eval_result['reasoning'])
+    if accuracy_match:
+        accuracy = int(accuracy_match.group(1))
+    else:
+        accuracy = 0
+
+    # Directly get the 'Explanation' value from the 'value' key
+    explanation = eval_result['value']
+
+    return accuracy, explanation
 
 def evaluate_prediction(input, reference, prediction, evaluator):
     eval_result = evaluator.evaluate_strings(
@@ -65,5 +79,6 @@ def evaluate_prediction(input, reference, prediction, evaluator):
 if __name__ == "__main__":
     evaluator = init_evaluator()
     # For no input english sentence, just put "" in the input
-
-    print(
+    accuracy, explanation = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
+    print("Accuracy:", accuracy)
+    print("Explanation:", explanation)
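As a sanity check on the new parser, the snippet below feeds parse_eval_result a hand-built dict shaped like the example in the comments above; the 'reasoning' and 'value' strings are shortened placeholders, not real evaluator output.

# Hypothetical eval_result carrying the two keys parse_eval_result reads.
sample = {
    'reasoning': 'Accuracy: 80. The predicted answer is partially correct.',
    'value': 'The predicted answer is partially correct.',
}
accuracy, explanation = parse_eval_result(sample)
print(accuracy)     # 80
print(explanation)  # The predicted answer is partially correct.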
evaluation/scores/multi_scores.py
CHANGED
@@ -1,6 +1,6 @@
 from comet import download_model, load_from_checkpoint
 from sacrebleu.metrics import BLEU, CHRF, TER
-import LLM_eval
+from scores import LLM_eval
 
 class multi_scores:
     def __init__(self, source_lang="English", target_lang="Chinese", domain="starcraft 2") -> None:
@@ -15,8 +15,8 @@ class multi_scores:
     def get_scores(self, src:str, mt:str, ref:str) -> dict:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
         bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
-
-        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':
+        llm_score, llm_explanation = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score, 'comet_score':comet_score, 'llm_score':llm_score, 'llm_explanation': llm_explanation}
 
 if __name__ == "__main__":
     src = "this is an test sentences"
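For reference, a minimal usage sketch of the combined scorer after this change; the import path and the mt/ref strings are assumptions borrowed from the surrounding files, and actually running it needs the COMET checkpoint download plus OpenAI credentials for the LLM evaluator.

# Assumed import path: run from the evaluation/ directory, mirroring "from scores import LLM_eval".
from scores.multi_scores import multi_scores

scorer = multi_scores(source_lang="English", target_lang="Chinese", domain="starcraft 2")
scores_dict = scorer.get_scores(
    src="this is an test sentences",  # source sentence from the __main__ block above
    mt="这是一个测试句子。",            # candidate translation to grade (placeholder)
    ref="这不是一个测试语句。",          # reference translation (placeholder)
)
# Expected keys after this commit: bleu_score, comet_score, llm_score, llm_explanation
print(scores_dict)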