Spaces:

StarPigeon
/

ViDove

Sleeping

App Files Files Community

JiaenLiu committed on Oct 30, 2023

Commit

fb908a6

•

1 Parent(s): 6723c13

parse fix

Browse files

Former-commit-id: 713a1c3babc77b7865e7aae1fbe0edd72ffaf381

Files changed (2) hide show

evaluation/scores/LLM_eval.py +12 -9
evaluation/scores/multi_scores.py +1 -1

evaluation/scores/LLM_eval.py CHANGED Viewed

@@ -5,6 +5,7 @@
 # Import the necessary packages
 import re
 from langchain.evaluation import load_evaluator, EvaluatorType
 from langchain.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
@@ -22,7 +23,7 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", mod
     llm = ChatOpenAI(temperature=0, model=model)
-    # Completeness is the percentage of the input that is translated
     # Accuracy is the percentage of the translation that is correct
     fstring = """
             You are grading the translation based on following input:
@@ -83,16 +84,18 @@ def init_evaluator(source_lang="en", target_lang="zh", domain="startcraft2", mod
 def parse_eval_result(data):
     # Extract the value string
-    value_str = data.get('value', '')
-    reasoning_str = data.get('reasoning', '')
     # Use regex to extract accuracy value and explanation
-    accuracy_match = re.search(r'Accuracy: (\d+)', value_str)
-    acc_explanation_match = re.search(r'Accuracy: \d+\. (.+)', value_str)
     # Use regex to extract completeness value and explanation
-    completeness_match = re.search(r'Completeness: (\d+)', reasoning_str)
-    completeness_explanation_match = re.search(r'Completeness: \d+\. (.+)', reasoning_str)
     # Extract the matched groups
     completeness = int(completeness_match.group(1)) if completeness_match else None
@@ -108,13 +111,13 @@ def evaluate_prediction(input, reference, prediction, evaluator):
         input=input,
         reference=reference,
     )
-    # print(eval_result)
     return parse_eval_result(eval_result)
 if __name__ == "__main__":
     evaluator = init_evaluator()
     # If there is no English input sentence, pass "" as the input
-    accuracy, completeness = evaluate_prediction("this is an test sentences", "这不是一个测试语句。", "这是一个测试句子。", evaluator)
     print("Accuracy:", accuracy[0])
     print("Acc_Explanation:", accuracy[1])
     print("Completeness:", completeness[0])

 # Import the necessary packages
 import re
 from langchain.evaluation import load_evaluator, EvaluatorType
 from langchain.prompts import PromptTemplate
 from langchain.chat_models import ChatOpenAI
     llm = ChatOpenAI(temperature=0, model=model)
+    # Completeness is the percentage of the input that is translated, to test if there is any missing information
     # Accuracy is the percentage of the translation that is correct
     fstring = """
             You are grading the translation based on following input:
 def parse_eval_result(data):
     # Extract the value string
+    value_str = data.get('value', '').lower()
+    reasoning_str = data.get('reasoning', '').lower()
+    response = value_str + reasoning_str
     # Use regex to extract accuracy value and explanation
+    accuracy_match = re.search(r'accuracy: (\d+)', response)
+    acc_explanation_match = re.search(r'accuracy: \d+\. (.+)', response)
     # Use regex to extract completeness value and explanation
+    completeness_match = re.search(r'completeness: (\d+)', response)
+    completeness_explanation_match = re.search(r'completeness: \d+\. (.+)', response)
     # Extract the matched groups
     completeness = int(completeness_match.group(1)) if completeness_match else None
         input=input,
         reference=reference,
     )
+    print(eval_result)
     return parse_eval_result(eval_result)
 if __name__ == "__main__":
     evaluator = init_evaluator()
     # If there is no English input sentence, pass "" as the input
+    accuracy, completeness = evaluate_prediction("it's obviously going to be 神族 trying to go for a 野炮台", " 每当我们看到BF开", " 每当我们看到BF开", evaluator)
     print("Accuracy:", accuracy[0])
     print("Acc_Explanation:", accuracy[1])
     print("Completeness:", completeness[0])

evaluation/scores/multi_scores.py CHANGED Viewed

@@ -50,7 +50,7 @@ class multi_scores:
         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
         bleu_score = self.bleu_model.corpus_score([mt], [[ref]]).score
         llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
-        return {'bleu_score':bleu_score ,'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1]}
 if __name__ == "__main__":

         comet_score = self.comet_model.predict([{"src":src, "mt":mt, "ref":ref}], batch_size=8, gpus=0).scores[0]
         bleu_score = self.bleu_model.corpus_score([mt], [[ref]]).score
         llm_acc, llm_completeness = LLM_eval.evaluate_prediction(src, ref, mt, self.LLM_model)
+        return {'bleu_score':bleu_score ,'comet_score':comet_score, 'llm_score':llm_acc[0], 'llm_explanation': llm_acc[1], 'llm_completeness':llm_completeness[0], 'llm_completeness_explanation':llm_completeness[1]}
 if __name__ == "__main__":