# Phoenix Evaluation
import os
from getpass import getpass

import nest_asyncio

nest_asyncio.apply()

import matplotlib.pyplot as plt
import openai
import pandas as pd
from pycm import ConfusionMatrix
from sklearn.metrics import classification_report

import phoenix.evals.default_templates as templates
from phoenix.evals import (
    CODE_READABILITY_PROMPT_RAILS_MAP,
    CODE_READABILITY_PROMPT_TEMPLATE,
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    SQL_GEN_EVAL_PROMPT_RAILS_MAP,
    SQL_GEN_EVAL_PROMPT_TEMPLATE,
    TOXICITY_PROMPT_RAILS_MAP,
    TOXICITY_PROMPT_TEMPLATE,
    USER_FRUSTRATION_PROMPT_RAILS_MAP,
    USER_FRUSTRATION_PROMPT_TEMPLATE,
    OpenAIModel,
    download_benchmark_dataset,
    llm_classify,
    llm_generate,
)
from phoenix.evals.default_templates import (
    REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP,
    REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE,
)
def phoenix_eval(metrics, openai_api_key, df):
    """Run Phoenix LLM-as-a-judge evaluations on a Q&A dataframe.

    Expects columns "question", "answer", and "cleaned_context"; adds one
    label column per requested metric and returns the dataframe.
    """
    os.environ["OPENAI_API_KEY"] = openai_api_key
    model = OpenAIModel(model="gpt-3.5-turbo", temperature=0.25)

    # Rename columns to the input names expected by the evaluation templates
    df.rename(
        columns={"question": "input", "answer": "output", "cleaned_context": "reference"},
        inplace=True,
    )

    # Map each supported metric to its prompt template, rails map, and output column
    metric_mappings = {
        "hallucination": (HALLUCINATION_PROMPT_TEMPLATE, HALLUCINATION_PROMPT_RAILS_MAP, "Hallucination"),
        "toxicity": (TOXICITY_PROMPT_TEMPLATE, TOXICITY_PROMPT_RAILS_MAP, "Toxicity"),
        "relevance": (RAG_RELEVANCY_PROMPT_TEMPLATE, RAG_RELEVANCY_PROMPT_RAILS_MAP, "Relevancy"),
        "Q&A": (QA_PROMPT_TEMPLATE, QA_PROMPT_RAILS_MAP, "Q&A_eval"),
    }

    # Run each requested metric and store its labels in a new column
    for metric in metrics:
        if metric in metric_mappings:
            template, rails_map, column_name = metric_mappings[metric]
            rails = list(rails_map.values())
            classifications = llm_classify(
                dataframe=df,
                template=template,
                model=model,
                rails=rails,
                concurrency=20,
            )["label"].tolist()
            df[column_name] = classifications
        else:
            print(f"Warning: Metric '{metric}' is not supported.")

    # Restore readable column names; the reference column is returned as "context"
    df.rename(columns={"input": "question", "output": "answer", "reference": "context"}, inplace=True)
    return df
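

# Example usage (a minimal sketch, not part of the original module): the rows
# below are placeholder data, and the metric list / API key lookup are
# assumptions for illustration only. Running it requires a valid OpenAI key.
if __name__ == "__main__":
    sample_df = pd.DataFrame(
        {
            "question": ["What is Phoenix?"],
            "answer": ["Phoenix is an open-source library for LLM observability."],
            "cleaned_context": ["Phoenix is an open-source library for LLM tracing and evaluation."],
        }
    )
    results = phoenix_eval(
        metrics=["hallucination", "toxicity", "relevance", "Q&A"],
        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
        df=sample_df,
    )
    print(results[["question", "Hallucination", "Toxicity", "Relevancy", "Q&A_eval"]])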