# Phoenix Evaluation: run LLM-as-a-judge evaluations (hallucination, toxicity,
# relevance, Q&A correctness) over a RAG results dataframe using phoenix.evals.
import os
from getpass import getpass
import nest_asyncio
nest_asyncio.apply()

import matplotlib.pyplot as plt
import openai
import pandas as pd
from pycm import ConfusionMatrix
from sklearn.metrics import classification_report

from phoenix.evals import (
    CODE_READABILITY_PROMPT_RAILS_MAP,
    CODE_READABILITY_PROMPT_TEMPLATE,
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    QA_PROMPT_RAILS_MAP,
    QA_PROMPT_TEMPLATE,
    RAG_RELEVANCY_PROMPT_RAILS_MAP,
    RAG_RELEVANCY_PROMPT_TEMPLATE,
    SQL_GEN_EVAL_PROMPT_RAILS_MAP,
    SQL_GEN_EVAL_PROMPT_TEMPLATE,
    TOXICITY_PROMPT_RAILS_MAP,
    TOXICITY_PROMPT_TEMPLATE,
    USER_FRUSTRATION_PROMPT_RAILS_MAP,
    USER_FRUSTRATION_PROMPT_TEMPLATE,
    OpenAIModel,
    download_benchmark_dataset,
    llm_classify,
    llm_generate,
)
import phoenix.evals.default_templates as templates
from phoenix.evals.default_templates import (
    REFERENCE_LINK_CORRECTNESS_PROMPT_RAILS_MAP,
    REFERENCE_LINK_CORRECTNESS_PROMPT_TEMPLATE,
)


def phoenix_eval(metrics, openai_api_key, df):
    """Run the requested Phoenix LLM evaluations and append one label column per metric."""
    os.environ["OPENAI_API_KEY"] = openai_api_key
    model = OpenAIModel(model="gpt-3.5-turbo", temperature=0.25)

    # Rename columns to match expected input names for evaluation
    df.rename(columns={"question": "input", "answer": "output", "cleaned_context": "reference"}, inplace=True)

    # Define a dictionary of metric configurations
    metric_mappings = {
        "hallucination": (HALLUCINATION_PROMPT_TEMPLATE, HALLUCINATION_PROMPT_RAILS_MAP, "Hallucination"),
        "toxicity": (TOXICITY_PROMPT_TEMPLATE, TOXICITY_PROMPT_RAILS_MAP, "Toxicity"),
        "relevance": (RAG_RELEVANCY_PROMPT_TEMPLATE, RAG_RELEVANCY_PROMPT_RAILS_MAP, "Relevancy"),
        "Q&A": (QA_PROMPT_TEMPLATE, QA_PROMPT_RAILS_MAP, "Q&A_eval"),
    }

    # Loop over each metric in the provided metrics list
    for metric in metrics:
        if metric in metric_mappings:
            template, rails_map, column_name = metric_mappings[metric]
            rails = list(rails_map.values())

            # Perform classification and add the labels as a new column for the current metric
            classifications = llm_classify(
                dataframe=df,
                template=template,
                model=model,
                rails=rails,
                concurrency=20,
            )["label"].tolist()
            df[column_name] = classifications
        else:
            print(f"Warning: Metric '{metric}' is not supported.")

    # Restore human-readable column names (the reference column is returned as "context")
    df.rename(columns={"input": "question", "output": "answer", "reference": "context"}, inplace=True)

    return df
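

# Example usage (a minimal sketch, not part of the original script): the sample rows,
# the chosen metrics, and reading the API key from the environment or getpass are
# illustrative assumptions; the column names match what phoenix_eval expects.
if __name__ == "__main__":
    sample_df = pd.DataFrame(
        {
            "question": ["What does Phoenix provide?"],
            "answer": ["Phoenix provides LLM observability and evaluation tooling."],
            "cleaned_context": [
                "Arize Phoenix is an open-source library for LLM observability and evaluation."
            ],
        }
    )
    api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
    evaluated = phoenix_eval(
        metrics=["hallucination", "Q&A"],
        openai_api_key=api_key,
        df=sample_df,
    )
    print(evaluated[["question", "Hallucination", "Q&A_eval"]])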