JiaenLiu committed
Commit · b0198e8
1 Parent(s): 61ca873
Evaluation structure
Former-commit-id: 956e1085199af0c47b3fc01f395f1c3be195ece4
- evaluation/alignment.py +0 -0
- evaluation/evaluation.py +0 -0
- evaluation/readme.md +24 -0
- evaluation/scores/LLM_eval.py +76 -0
- evaluation/scores/scores.py +0 -0
evaluation/alignment.py
ADDED
File without changes
evaluation/evaluation.py
ADDED
File without changes
evaluation/readme.md
ADDED
@@ -0,0 +1,24 @@
+Evaluation:
+BLEU (https://github.com/mjpost/sacrebleu)
+COMET (https://github.com/Unbabel/COMET)
+LLM eval
+Eval time stamp
+
+Sep 18 - Sep 25
+Proj-t
+  src
+    evaluation
+      - scores
+        - LLM_eval.py (jiaen)
+        - scores.py (wizard)
+        - comet
+        - sacrebleu
+      - alignment.py (david)
+      - evaluation.py (not assigned)
+      - results
+        - mmddyy-HMS-results.csv
+      - logs
+
+entry:
+python3 evaluation/evaluation.py --pred path/to/pred --gt path/to/gt
+
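The entry command in the readme implies a CLI for evaluation.py, which is committed empty in this change. Below is a minimal sketch of what that entry point could look like, assuming --pred and --gt point at plain-text files and that the scoring modules under evaluation/scores/ will expose callable functions; all names in the sketch are illustrative and not part of this commit.

# Illustrative sketch only; evaluation/evaluation.py is empty in this commit.
import argparse
from pathlib import Path

def main() -> None:
    parser = argparse.ArgumentParser(
        description="Evaluate predicted translations against ground truth.")
    parser.add_argument("--pred", type=Path, required=True, help="path to predicted output")
    parser.add_argument("--gt", type=Path, required=True, help="path to ground-truth reference")
    args = parser.parse_args()

    pred_lines = args.pred.read_text(encoding="utf-8").splitlines()
    gt_lines = args.gt.read_text(encoding="utf-8").splitlines()

    # Scoring would be delegated to evaluation/scores/scores.py (BLEU/COMET)
    # and evaluation/scores/LLM_eval.py (LLM-based grading).
    print(f"Loaded {len(pred_lines)} predicted and {len(gt_lines)} reference lines.")

if __name__ == "__main__":
    main()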
evaluation/scores/LLM_eval.py
ADDED
@@ -0,0 +1,76 @@
+# This script is used to evaluate the performance of the Pigeon AI Video Translation system using a Large Language Model.
+
+# Written by Jiaen LIU, 2023/09/18
+
+# Import the necessary packages
+from langchain.evaluation import load_evaluator, EvaluatorType
+from langchain.prompts import PromptTemplate
+from langchain.chat_models import ChatAnthropic
+
+import re
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+import time
+import os
+import argparse
+from pathlib import Path
+from src.srt_util.srt import SrtScript
+
+class PiegonLLMEvaluator():
+    """
+    input :
+        - predicted sentences
+        - ground truth sentences
+    output :
+        - scores
+        - explanations
+    """
+
+    def __init__(self, output_dir: str, data_dir: str) -> None:
+        # self.__eval_chain = self.__initialize_QAEvalChain()
+        self.__data_dir = data_dir
+        self.__result_dir = output_dir
+        # self.__chatbot = evaluater
+        self.__result_df = pd.DataFrame()
+        self.__initialize_df()
+        self.running_cost = 0
+        self.last_cost = 0
+        pass
+
+    def __init_llm_evaluater(self):
+        # llm = ChatOpenAI(temperature=0, model="gpt-4-0613")
+        # search = SerpAPIWrapper()
+        # tools = [
+        #     Tool(
+        #         name="Search",
+        #         func=search.run,
+        #         coroutine=search.arun,
+        #         description="Useful when you need to answer questions about current events. You should ask targeted questions.",
+        #     ),
+        # ]
+        # agents = [
+        #     initialize_agent(tools, llm, agent=AgentType.OPENAI_MULTI_FUNCTIONS, verbose=False),
+        #     initialize_agent(tools, llm, agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION, verbose=False)
+        # ]
+
+        llm = ChatAnthropic(temperature=0)
+
+        fstring = """You are an expert English to Chinese translator specialized in StarCraft 2.
+You are grading the following question:
+{query}
+Here is the real answer:
+{answer}
+You are grading the following predicted answer:
+{result}
+Give two grades, one for completeness and another for accuracy, and rate them on a scale of 0 to 100, where 0 is the lowest (very low completeness/accuracy) and 100 is the highest (very high completeness/accuracy).
+Do not base the two scores off each other; give the scores independently. Give an explanation for every single one, and if the answer is partially correct that is acceptable. However, punish the scores for answers that are
+numerically incorrect; this also includes values that have the $ in front.
+Please give the completeness score first followed by the accuracy score.
+For example: Completeness: 70. Accuracy: 40. Explanation here
+Do not differ from the format ever.
+"""
+        prompt = PromptTemplate.from_template(fstring)
+
+        self.__llm_evaluator = load_evaluator("criteria", llm=llm, criteria="conciseness", prompt=prompt)
+
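The prompt above forces replies of the form "Completeness: 70. Accuracy: 40. Explanation here", and the file already imports re, but no parsing code is committed yet. One possible helper for pulling the two scores out of such a reply (hypothetical, not part of the committed LLM_eval.py):

# Hypothetical helper, not part of the committed LLM_eval.py.
import re

def parse_scores(reply: str) -> tuple[int, int]:
    # Expects the format enforced by the prompt: "Completeness: NN. Accuracy: NN. ..."
    completeness = re.search(r"Completeness:\s*(\d+)", reply)
    accuracy = re.search(r"Accuracy:\s*(\d+)", reply)
    if completeness is None or accuracy is None:
        raise ValueError(f"Unexpected evaluator reply: {reply!r}")
    return int(completeness.group(1)), int(accuracy.group(1))

# Example: parse_scores("Completeness: 70. Accuracy: 40. One clause is dropped.") returns (70, 40).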
evaluation/scores/scores.py
ADDED
File without changes
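scores.py is added empty here; the readme assigns it the BLEU and COMET scores. A minimal sketch of the BLEU half, assuming sacrebleu is installed and that predictions and references arrive as parallel lists of strings (the COMET score would be added alongside via Unbabel's comet package):

# Illustrative sketch for evaluation/scores/scores.py; the committed file is empty.
import sacrebleu

def bleu_score(hypotheses: list[str], references: list[str]) -> float:
    """Corpus-level BLEU with a single reference per hypothesis."""
    result = sacrebleu.corpus_bleu(hypotheses, [references])
    return result.score

if __name__ == "__main__":
    # Toy usage example.
    print(bleu_score(["the cat sat on the mat"], ["the cat is on the mat"]))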