JiaenLiu committed · Commit 11ff02b · Parent(s): dca0a7c

some updates for scores.

Former-commit-id: 5d6b5d325b183e15512755c8f6accc4909b7f8a1
- evaluation/scores/LLM_eval.py +65 -28
- evaluation/scores/multi_scores.py +25 -5
evaluation/scores/LLM_eval.py
CHANGED
@@ -13,35 +13,46 @@ from langchain.chat_models import ChatOpenAI
 # Load the evaluator
 
 def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", model="gpt-4-0613"):
-    llm = ChatOpenAI(temperature=0, model=model)
 
-
+    # map the language code to the language name
+    language_map = {
+        "en": "English",
+        "zh": "Chinese",
+    }
+
+    llm = ChatOpenAI(temperature=0, model=model)
 
+    # Completeness is the percentage of the input that is translated
+    # Accuracy is the percentage of the translation that is correct
     fstring = """
-You are grading the following
+You are grading the translation based on following input:
 {input}
-
+if the input is "", that means there is no input sentence.
+you should grade the translation based on the reference translation:
+Here is the real answer(reference):
 {reference}
-You are grading the following
+You are grading the following translation:
 {output}
 based on the following criteria:
 {criteria}
-Give
+Give two grades, accuracy and completeness rate them from a scale of 0 to 100, where 0 is the lowest (very low accuracy/completeness) and 100 is the highest (very high accuracy/completeness)?
 Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
 numerically incorrect this also includes values that have the $ in front
 Please give the completeness score first followed by the accuracy score.
-For example:
+For example:
+Accuracy: 40. Explanation here
+Completeness: 80. Explanation here
 Do not differ from the format ever
 """
-
-
+
+    if source_lang in language_map and target_lang in language_map:
+        lang_str = f"You are an expert {language_map[source_lang]} to {language_map[target_lang]} translator specialized in {domain}."
+        prompt = PromptTemplate.from_template(lang_str+fstring, template_format="f-string")
+
+    else:
+        print("The language code is not supported, please check the language code.")
+        prompt = PromptTemplate.from_template(fstring, template_format="f-string")
 
-    # Give two grades, one for completness and another for accuracy and rate them from a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completness/accuracy)?
-    # Do not base the two scores off each other give them the scores independently. Give explanations for every single one and if the answer if partially correct that is acceptable. However punish the scores for answers that are
-    # numerically incorrect this also includes values that have the $ in front
-    # Please give the completeness score first followed by the accuracy score.
-    # For example: Completeness: 70. Accuracy: 40. Explanation here
-    # Do not differ from the format ever
     return load_evaluator("labeled_criteria", llm=llm, prompt=prompt, criteria="correctness")
 
 # prase the output of the evaluation
@@ -55,18 +66,41 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", model="gpt-4-0613"):
 # explanation = ".".join(value[1:])
 # return int(value[0]), explanation
 
-def parse_eval_result(eval_result):
-
-
-
-
-
-
+# def parse_eval_result(eval_result):
+#     # Extract the 'Accuracy' score using a regular expression from the 'reasoning' key
+#     accuracy_match = re.search(r'Accuracy: (\d+)', eval_result['value'])
+#     print(accuracy_match)
+#     if accuracy_match:
+#         accuracy = int(accuracy_match.group(1))
+#     else:
+#         # try to get the accuracy from the 'value' key
+#         accuracy = 0
+
+#     # Directly get the 'Explanation' value from the 'value' key
+#     explanation = eval_result['value']
+
+#     return accuracy, explanation
+
+def parse_eval_result(data):
+    # Extract the value string
+    value_str = data.get('value', '')
+    reasoning_str = data.get('reasoning', '')
+
+    # Use regex to extract accuracy value and explanation
+    accuracy_match = re.search(r'Accuracy: (\d+)', value_str)
+    acc_explanation_match = re.search(r'Accuracy: \d+\. (.+)', value_str)
+
+    # Use regex to extract completeness value and explanation
+    completeness_match = re.search(r'Completeness: (\d+)', reasoning_str)
+    completeness_explanation_match = re.search(r'Completeness: \d+\. (.+)', reasoning_str)
 
-#
-
+    # Extract the matched groups
+    completeness = int(completeness_match.group(1)) if completeness_match else None
+    completeness_explanation = completeness_explanation_match.group(1) if completeness_explanation_match else None
+    accuracy = int(accuracy_match.group(1)) if accuracy_match else None
+    acc_explanation = acc_explanation_match.group(1) if acc_explanation_match else None
 
-    return accuracy,
+    return (accuracy, acc_explanation), (completeness, completeness_explanation)
 
 def evaluate_prediction(input, reference, prediction, evaluator):
     eval_result = evaluator.evaluate_strings(
@@ -74,11 +108,14 @@ def evaluate_prediction(input, reference, prediction, evaluator):
         input=input,
         reference=reference,
     )
+    # print(eval_result)
     return parse_eval_result(eval_result)
 
 if __name__ == "__main__":
     evaluator = init_evaluator()
     # For no input english sentence, just put "" in the input
-    accuracy,
-    print("Accuracy:", accuracy)
-    print("
+    accuracy, completeness = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
+    print("Accuracy:", accuracy[0])
+    print("Acc_Explanation:", accuracy[1])
+    print("Completeness:", completeness[0])
+    print("Comp_Explanation:", completeness[1])
evaluation/scores/multi_scores.py
CHANGED
@@ -1,26 +1,46 @@
 from comet import download_model, load_from_checkpoint
 from sacrebleu.metrics import BLEU, CHRF, TER
-from scores import LLM_eval
+# from scores import LLM_eval
+import LLM_eval
 
 class multi_scores:
-    def __init__(self, source_lang="
+    def __init__(self, source_lang="en", target_lang="zh", domain="starcraft 2") -> None:
         self.comet_model = load_from_checkpoint(download_model("Unbabel/wmt22-comet-da"))
-        self.bleu_model = BLEU(tokenize=
+        self.bleu_model = BLEU(tokenize=target_lang)
         self.LLM_model = LLM_eval.init_evaluator(source_lang=source_lang, target_lang=target_lang, domain=domain)
+        # self.score = {}
 
     # The function to get the scores
     # src: orginal sentence
     # mt: machine translation
     # ref: reference translation
+    def calculate_comet_llm(self, src:str, mt:str, ref:str) -> dict:
+        comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
+        # bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
+        llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1]}
+        # self.score['bleu_score'] = bleu_score
+        # self.score['comet_score'] = comet_score
+        # self.score['llm_score'] = llm_score
+        # self.score['llm_explanation'] = llm_explanation
+
+    def calculate_bleu(self, mts:list, refs:list) -> dict:
+        # mt and ref are list of sentences
+        bleu_score = self.bleu_model.corpus_score(mts, refs).score
+        return {'bleu_score':bleu_score}
+
     def get_scores(self, src:str, mt:str, ref:str) -> dict:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
         bleu_score = self.bleu_model.corpus_score([mt], [ref]).score
-
-        return {'bleu_score':bleu_score,
+        llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score ,'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1]}
+
 
 if __name__ == "__main__":
     src = "this is an test sentences"
     mt = "这是一个测试句子。"
     ref = "这不是一个测试语句。"
     print(multi_scores().get_scores(src, mt, ref))
+    # print(multi_scores().calculate_comet_llm(src, mt, ref))
+    # print(multi_scores().calculate_bleu([mt], [ref]))
 
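One note on the new BLEU(tokenize=target_lang) line: sacrebleu interprets tokenize as a tokenizer name, and "zh" happens to be a valid one, but most other language codes (for example "en") are not, so constructing the class with a different target_lang would likely fail. A purely illustrative guard, not part of this commit; the mapping and helper name are assumptions:

from sacrebleu.metrics import BLEU

# Illustrative mapping from target-language code to a sacrebleu tokenizer name;
# codes without a dedicated tokenizer fall back to sacrebleu's default "13a".
TOKENIZER_BY_LANG = {"zh": "zh"}

def make_bleu(target_lang: str) -> BLEU:
    return BLEU(tokenize=TOKENIZER_BY_LANG.get(target_lang, "13a"))

bleu = make_bleu("zh")
# corpus_score takes a list of hypotheses and a list of reference streams.
print(bleu.corpus_score(["这是一个测试句子。"], [["这不是一个测试语句。"]]).score)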