|
import os

from fastapi import FastAPI
from pydantic import BaseModel

from langchain import HuggingFaceHub, PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain

# Set the Hugging Face Hub API token. The value below is a placeholder: load your own
# token from a secure location rather than committing a real credential to source control.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "<your_huggingfacehub_api_token>"

app = FastAPI(docs_url="/")


# Request body schema for the /evaluate/ endpoint.
class ModelOutputEvaluate(BaseModel):
    question: str
    answer: str
    domain: str
    context: str


class BasePromptContext:
    def __init__(self):
        self.variables_list = ["question", "answer", "context"]
        self.base_template = """Please act as an impartial judge and evaluate the quality of the provided answer, which attempts to answer the provided question based on a provided context.

You'll need to submit your grading for the correctness, comprehensiveness and readability of the answer, using JSON format with the two items in parentheses:
("score": [your score number for the correctness of the answer], "reasoning": [your one line step by step reasoning about the correctness of the answer])

Below is your grading rubric:

- Correctness: whether the answer correctly answers the question. Details for the different scores:
    - Score 0: the answer is completely incorrect, doesn't mention anything about the question or is completely contrary to the correct answer.
        - For example, when asked "How to terminate a databricks cluster", the answer is an empty string, content that's completely irrelevant, or "sorry, I don't know the answer".
    - Score 4: the answer provides some relevance to the question and answers one aspect of the question correctly.
        - Example:
            - Question: How to terminate a databricks cluster
            - Answer: Databricks cluster is a cloud-based computing environment that allows users to process big data and run distributed data processing tasks efficiently.
            - Or answer: In the Databricks workspace, navigate to the "Clusters" tab. And then this is a hard question that I need to think more about it
    - Score 7: the answer mostly answers the question but is missing or hallucinating on one critical aspect.
        - Example:
            - Question: How to terminate a databricks cluster
            - Answer: In the Databricks workspace, navigate to the "Clusters" tab.
              Find the cluster you want to terminate from the list of active clusters.
              And then you'll find a button to terminate all clusters at once
    - Score 10: the answer correctly answers the question and is not missing any major aspect.
        - Example:
            - Question: How to terminate a databricks cluster
            - Answer: In the Databricks workspace, navigate to the "Clusters" tab.
              Find the cluster you want to terminate from the list of active clusters.
              Click on the down-arrow next to the cluster name to open the cluster details.
              Click on the "Terminate" button. A confirmation dialog will appear. Click "Terminate" again to confirm the action.

Provided question:
{question}

Provided answer:
{answer}

Provided context:
{context}

Please provide your grading for the correctness"""


class Evaluater:
    def __init__(self, item: ModelOutputEvaluate):
        self.question = item.question
        self.answer = item.answer
        self.domain = item.domain
        self.context = item.context
        # LLM used as the judge, served through the Hugging Face Hub Inference API.
        self.llm = HuggingFaceHub(
            repo_id="google/flan-t5-xxl",
            model_kwargs={"temperature": 1, "max_length": 1000000},
        )

    def get_prompt_template(self):
        prompt = BasePromptContext()
        template = prompt.base_template
        variables = prompt.variables_list
        eval_template = PromptTemplate(input_variables=variables, template=template)
        return eval_template

    def evaluate(self):
        prompt = self.get_prompt_template().format(
            question=self.question, answer=self.answer, context=self.context
        )
        score = self.llm(prompt)
        return score
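
# A minimal, commented-out sketch of exercising Evaluater directly (outside the API),
# assuming a valid HUGGINGFACEHUB_API_TOKEN is set; the sample field values below are
# purely hypothetical.
#
# sample = ModelOutputEvaluate(
#     question="How to terminate a databricks cluster",
#     answer='Navigate to the "Clusters" tab, open the cluster details and click "Terminate".',
#     domain="databricks",
#     context="Documentation describing Databricks cluster lifecycle management.",
# )
# print(Evaluater(sample).evaluate())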
|
|
|
|
|
@app.post("/evaluate/")
async def create_evaluation_scenario(item: ModelOutputEvaluate):
    output = {
        "input": item,
        "score": Evaluater(item).evaluate()
    }
    return output
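
# A minimal sketch for serving and calling the endpoint locally. It assumes uvicorn is
# installed (it is not used elsewhere in this file) and that this module is run directly;
# adjust host/port to your setup.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request once the server is running (hypothetical payload values):
#   curl -X POST http://localhost:8000/evaluate/ \
#     -H "Content-Type: application/json" \
#     -d '{"question": "How to terminate a databricks cluster", "answer": "Navigate to the Clusters tab and click Terminate.", "domain": "databricks", "context": "Databricks cluster management documentation."}'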