|
import asyncio
import json
import re
from typing import Iterable, Tuple
|
|
|
|
|
# Create a brand-new asyncio event loop and install it as the current loop
# before the remaining imports execute.
# NOTE(review): presumably required because the script runs in a thread that
# has no default event loop -- confirm against the hosting runtime.
_event_loop = asyncio.new_event_loop()
asyncio.set_event_loop(_event_loop)
|
|
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import pandas as pd |
|
from st_aggrid import AgGrid |
|
from st_aggrid.grid_options_builder import GridOptionsBuilder |
|
from st_aggrid.shared import GridUpdateMode |
|
from st_aggrid.shared import JsCode |
|
import streamlit as st |
|
from ux.add_logo import add_logo_and_style_overrides |
|
from ux.styles import CATEGORY |
|
|
|
from trulens_eval import Tru |
|
from trulens_eval.app import Agent |
|
from trulens_eval.app import ComponentView |
|
from trulens_eval.app import instrumented_component_views |
|
from trulens_eval.app import LLM |
|
from trulens_eval.app import Other |
|
from trulens_eval.app import Prompt |
|
from trulens_eval.app import Tool |
|
from trulens_eval.db import MULTI_CALL_NAME_DELIMITER |
|
from trulens_eval.react_components.record_viewer import record_viewer |
|
from trulens_eval.schema import Record |
|
from trulens_eval.schema import Select |
|
from trulens_eval.utils.json import jsonify_for_ui |
|
from trulens_eval.utils.serial import Lens |
|
from trulens_eval.ux.components import draw_agent_info |
|
from trulens_eval.ux.components import draw_call |
|
from trulens_eval.ux.components import draw_llm_info |
|
from trulens_eval.ux.components import draw_metadata |
|
from trulens_eval.ux.components import draw_prompt_info |
|
from trulens_eval.ux.components import draw_tool_info |
|
from trulens_eval.ux.components import render_selector_markdown |
|
from trulens_eval.ux.components import write_or_json |
|
from trulens_eval.ux.styles import cellstyle_jscode |
|
|
|
# --- Page chrome -----------------------------------------------------------
# `set_page_config` is issued before any other Streamlit command on the page.
st.set_page_config(layout="wide", page_title="Evaluations")
st.title("Evaluations")

st.runtime.legacy_caching.clear_cache()

add_logo_and_style_overrides()

# --- Data source -----------------------------------------------------------
tru = Tru(database_file="./models/trulens_eval.sqlite")
lms = tru.db

# All records (empty app filter) plus the names of the feedback columns.
df_results, feedback_cols = lms.get_records_and_feedback([])


def _load_feedback_directions(feedback_defs):
    """Map each feedback name to "HIGHER_IS_BETTER" or "LOWER_IS_BETTER".

    Args:
        feedback_defs: Frame whose rows expose a `feedback_json` mapping.

    Returns:
        dict keyed by the feedback's `supplied_name` (falling back to the
        implementation name when the supplied name is missing or empty).
        A definition without a `higher_is_better` entry counts as
        higher-is-better.
    """
    directions = {}
    for _, feedback_def in feedback_defs.iterrows():
        spec = feedback_def.feedback_json
        name = spec.get("supplied_name", "") or spec["implementation"]["name"]
        if spec.get("higher_is_better", True):
            directions[name] = "HIGHER_IS_BETTER"
        else:
            directions[name] = "LOWER_IS_BETTER"
    return directions


feedback_directions = _load_feedback_directions(lms.get_feedback_defs())

# Direction assumed for feedback columns that have no stored definition.
default_direction = "HIGHER_IS_BETTER"
|
|
|
|
|
def render_component(query, component, header=True):
    """Render a single app component on the page.

    Writes an optional heading, the component's Python class (and its base
    class when the two differ), then the type-specific details.

    Args:
        query: Selector path of the component; used for the heading via
            `Select.for_app` and forwarded to the draw helpers.
        component: Component view; its `cls` and `json` attributes are read.
        header: When true, emit a "Component <selector>" heading first.
    """
    if header:
        selector_md = render_selector_markdown(Select.for_app(query))
        st.markdown(f"##### Component {selector_md}")

    # Class label, extended with the base class only when it is different.
    component_cls = component.cls
    parent_cls = component_cls.base_class()
    class_label = f"__{repr(component_cls)}__"
    if str(parent_cls) != str(component_cls):
        class_label = f"{class_label} < __{repr(parent_cls)}__"
    st.write("Python class: " + class_label)

    # Ordered dispatch: the first matching view type wins, which keeps the
    # precedence LLM > Prompt > Agent > Tool.
    drawers = (
        (LLM, draw_llm_info),
        (Prompt, draw_prompt_info),
        (Agent, draw_agent_info),
        (Tool, draw_tool_info),
    )
    for view_type, draw in drawers:
        if isinstance(component, view_type):
            draw(component=component, query=query)
            return

    # Anything else is dumped as JSON; only the expander title differs.
    if isinstance(component, Other):
        expander_title = "Uncategorized Component Details:"
    else:
        expander_title = "Unhandled Component Details:"
    with st.expander(expander_title):
        st.json(jsonify_for_ui(component.json))
|
|
|
|
|
|
|
def render_record_metrics(app_df: pd.DataFrame, selected_rows: pd.DataFrame):
    """Show token, cost and latency metrics for the selected record.

    Cost and latency are displayed with a delta against the mean of all
    records in `app_df` that belong to the same app as the selected record.

    Args:
        app_df: Records currently in view; must contain the `app_id`,
            `total_cost` and `latency` columns.
        selected_rows: Selected grid rows; only the first row is used. Must
            contain `app_id`, `total_tokens`, `total_cost` and `latency`.
    """

    def first(column):
        # Positional access: a plain `[0]` is a label lookup and only works
        # when the frame happens to carry a default 0-based index.
        return selected_rows[column].iloc[0]

    app_specific_df = app_df[app_df["app_id"] == first("app_id")]

    token_col, cost_col, latency_col = st.columns(3)

    token_col.metric(label="Total tokens (#)", value=first("total_tokens"))

    cost = first("total_cost")
    average_cost = app_specific_df["total_cost"].mean()
    cost_col.metric(
        label="Total cost (USD)",
        value=cost,
        delta="{:.3g}".format(cost - average_cost),
        # "inverse": an increase over the average is rendered as bad.
        delta_color="inverse",
    )

    latency = first("latency")
    average_latency = app_specific_df["latency"].mean()
    latency_col.metric(
        label="Latency (s)",
        value=latency,
        delta="{:.3g}s".format(latency - average_latency),
        delta_color="inverse",
    )
|
|
|
|
|
# Backslash escape sequences that `_decode_escapes` turns back into the
# characters they stand for.
_ESCAPE_SEQUENCE = re.compile(
    r"""\\(?:
        [\\'"abfnrtv]                # single-character escapes
        | [0-7]{1,3}                 # octal
        | x[0-9a-fA-F]{2}            # two-digit hex
        | u[0-9a-fA-F]{4}            # four-digit unicode
        | U[0-9a-fA-F]{8}            # eight-digit unicode
        | N\{[A-Za-z0-9 \-]+\}       # named character
    )""",
    re.VERBOSE,
)


def _decode_escapes(text):
    """Decode backslash escapes in `text` without touching other characters.

    Decoding the whole string with the `unicode-escape` codec reads the
    UTF-8 bytes as Latin-1 and therefore garbles every non-ASCII character,
    so only the matched escape sequences (always pure ASCII) go through the
    codec. Non-string values and escapes the codec rejects are returned
    unchanged.
    """
    if not isinstance(text, str):
        return text

    def _replace(found):
        sequence = found.group(0)
        try:
            return sequence.encode("ascii").decode("unicode-escape")
        except UnicodeDecodeError:
            # e.g. an unknown \N{...} name or an out-of-range \U value
            return sequence

    return _ESCAPE_SEQUENCE.sub(_replace, text)


def _display_feedback_calls(calls, feedback_name, base_name):
    """Render the per-call table of one feedback function.

    Args:
        calls: Sequence of call dicts with `args`, `ret` and `meta` entries,
            or None / empty when nothing was recorded.
        feedback_name: Displayed feedback column name; names containing
            "distance" are coloured as unknown instead of being scored.
        base_name: Feedback name used to look up the scoring direction.
    """
    if calls is None or len(calls) == 0:
        st.text("No feedback details.")
        return

    higher_is_better = (
        feedback_directions.get(base_name, default_direction) ==
        default_direction
    )

    def highlight(call_row):
        # One background colour for every cell of the row.
        if "distance" in feedback_name:
            return [f"background-color: {CATEGORY.UNKNOWN.color}"
                   ] * len(call_row)
        cat = CATEGORY.of_score(
            call_row.result, higher_is_better=higher_is_better
        )
        return [f"background-color: {cat.color}"] * len(call_row)

    df = pd.DataFrame.from_records([call["args"] for call in calls])
    # A call without a return value is shown with the placeholder score -1.
    df["result"] = pd.Series(
        [
            float(call["ret"]) if call["ret"] is not None else -1
            for call in calls
        ]
    )
    df["meta"] = pd.Series([call["meta"] for call in calls])
    # Expand the meta mapping into one column per key.
    df = df.join(df.meta.apply(lambda m: pd.Series(m))).drop(columns="meta")

    st.dataframe(
        df.style.apply(highlight, axis=1).format("{:.2}", subset=["result"])
    )


if df_results.empty:
    st.write("No records yet...")

else:
    apps = list(df_results.app_id.unique())
    if "app" in st.session_state:
        app = st.session_state.app
    else:
        app = apps

    st.experimental_set_query_params(app=app)

    options = st.multiselect("Filter Applications", apps, default=app)

    if len(options) == 0:
        st.header("All Applications")
        app_df = df_results
    else:
        if len(options) == 1:
            st.header(options[0])
        else:
            st.header("Multiple Applications Selected")
        app_df = df_results[df_results.app_id.isin(options)]

    tab1, tab2 = st.tabs(["Records", "Feedback Functions"])

    with tab1:
        # Work on a copy so that decoding the text columns neither mutates
        # `df_results` nor writes into a filtered slice of it.
        evaluations_df = app_df.copy()
        evaluations_df["input"] = evaluations_df["input"].map(_decode_escapes)
        evaluations_df["output"] = evaluations_df["output"].map(
            _decode_escapes
        )

        gb = GridOptionsBuilder.from_dataframe(evaluations_df)

        gb.configure_column("type", header_name="App Type")
        gb.configure_column("record_json", header_name="Record JSON", hide=True)
        gb.configure_column("app_json", header_name="App JSON", hide=True)
        gb.configure_column("cost_json", header_name="Cost JSON", hide=True)
        gb.configure_column("perf_json", header_name="Perf. JSON", hide=True)

        gb.configure_column("record_id", header_name="Record ID", hide=True)
        gb.configure_column("app_id", header_name="App ID")

        gb.configure_column("feedback_id", header_name="Feedback ID", hide=True)
        gb.configure_column("input", header_name="User Input")
        gb.configure_column(
            "output",
            header_name="Response",
        )
        gb.configure_column("total_tokens", header_name="Total Tokens (#)")
        gb.configure_column("total_cost", header_name="Total Cost (USD)")
        gb.configure_column("latency", header_name="Latency (Seconds)")
        gb.configure_column("tags", header_name="Tags")
        gb.configure_column("ts", header_name="Time Stamp", sort="desc")

        non_feedback_cols = {
            "app_id",
            "type",
            "ts",
            "total_tokens",
            "total_cost",
            "record_json",
            "latency",
            "record_id",
            "cost_json",
            "app_json",
            "input",
            "output",
            "perf_json",
        }

        # Every remaining column is treated as feedback output. Membership
        # filtering (rather than `Index.drop`) tolerates frames that lack
        # some of the columns listed above.
        for feedback_col in evaluations_df.columns:
            if feedback_col in non_feedback_cols:
                continue

            column_kwargs = {"hide": feedback_col.endswith("_calls")}
            if "distance" not in feedback_col:
                # Colour cells according to the feedback's direction.
                column_kwargs["cellStyle"] = JsCode(
                    cellstyle_jscode[feedback_directions.get(
                        feedback_col, default_direction
                    )]
                )
            gb.configure_column(feedback_col, **column_kwargs)

        gb.configure_pagination()
        gb.configure_side_bar()
        gb.configure_selection(selection_mode="single", use_checkbox=False)

        gridOptions = gb.build()
        data = AgGrid(
            evaluations_df,
            gridOptions=gridOptions,
            update_mode=GridUpdateMode.SELECTION_CHANGED,
            allow_unsafe_jscode=True,
        )

        selected_rows = pd.DataFrame(data["selected_rows"])

        if len(selected_rows) == 0:
            st.write("Hint: select a row to display details of a record")

        else:
            # Single-selection grid: only the first selected row is shown.
            row = selected_rows.iloc[0]

            st.divider()

            st.caption(f"{row['app_id']} / {row['record_id']}")
            st.header(f"{row['record_id']}")

            render_record_metrics(app_df, selected_rows)

            st.markdown("")

            prompt = row["input"]
            response = row["output"]
            app_json = json.loads(row["app_json"])

            input_col, response_col = st.columns(2)

            (input_tab,) = input_col.tabs(["Input"])
            with input_tab:
                with st.expander(
                        f"Input {render_selector_markdown(Select.RecordInput)}",
                        expanded=True):
                    write_or_json(st, obj=prompt)

            (response_tab,) = response_col.tabs(["Response"])
            with response_tab:
                with st.expander(
                        f"Response {render_selector_markdown(Select.RecordOutput)}",
                        expanded=True):
                    write_or_json(st, obj=response)

            feedback_tab, metadata_tab = st.tabs(["Feedback", "Metadata"])

            with metadata_tab:
                metadata = app_json.get("metadata")
                if metadata:
                    with st.expander("Metadata"):
                        st.markdown(draw_metadata(metadata))
                else:
                    st.write("No metadata found")

            with feedback_tab:
                if len(feedback_cols) == 0:
                    st.write("No feedback details")

                for feedback_name in feedback_cols:
                    feedback_result = row[feedback_name]

                    # Multi-output feedback columns share the call log of
                    # the name in front of the delimiter.
                    base_name = feedback_name.split(MULTI_CALL_NAME_DELIMITER
                                                   )[0]
                    feedback_calls = row[f"{base_name}_calls"]

                    with st.expander(f"{feedback_name} = {feedback_result}",
                                     expanded=True):
                        _display_feedback_calls(
                            feedback_calls, feedback_name, base_name
                        )

            record_json = json.loads(row["record_json"])
            record = Record.model_validate(record_json)

            classes: Iterable[Tuple[Lens, ComponentView]
                             ] = list(instrumented_component_views(app_json))
            classes_map = dict(classes)

            st.markdown("")
            st.subheader("Timeline")
            # The viewer returns "" until a call is picked; otherwise the
            # value is compared with each call's ISO-formatted start time.
            val = record_viewer(record_json, app_json)
            st.markdown("")

            match_query = None

            if val != "":
                matched_call = next(
                    (
                        call for call in record.calls
                        if call.perf.start_time.isoformat() == val
                    ),
                    None,
                )

                if matched_call is not None:
                    app_call = matched_call.stack[-1]

                    match_query = matched_call.top().path

                    st.subheader(
                        f"{app_call.method.obj.cls.name} {render_selector_markdown(Select.for_app(match_query))}"
                    )

                    draw_call(matched_call)

                    view = classes_map.get(match_query)
                    if view is not None:
                        render_component(
                            query=match_query, component=view, header=False
                        )
                    else:
                        st.write(
                            f"Call by `{match_query}` was not associated with any instrumented"
                            " component."
                        )

                        # Best effort: show whatever the app json holds at
                        # that path even though it is not instrumented.
                        # The lookup's failure modes are not visible here,
                        # hence the deliberately broad except.
                        try:
                            app_component_json = list(
                                match_query.get(app_json)
                            )[0]
                        except Exception:
                            st.write(
                                f"Recorded invocation by component `{match_query}` but cannot find this component in the app json."
                            )
                        else:
                            if app_component_json is not None:
                                with st.expander(
                                        "Uninstrumented app component details."
                                ):
                                    st.json(app_component_json)

                else:
                    st.text("No match found")
            else:
                st.subheader(f"App {render_selector_markdown(Select.App)}")
                with st.expander("App Details:"):
                    st.json(jsonify_for_ui(app_json))

            if match_query is not None:
                # Placeholder so the heading lands above the subcomponents
                # and is only filled in when at least one was rendered.
                container = st.empty()

                has_subcomponents = False
                for query, component in classes:
                    if not match_query.is_immediate_prefix_of(query):
                        continue

                    if len(query.path) == 0:
                        # Skip the app itself (empty path).
                        continue

                    has_subcomponents = True
                    render_component(query, component)

                if has_subcomponents:
                    container.markdown("#### Subcomponents:")

            st.header("More options:")

            if st.button("Display full app json"):
                st.write(jsonify_for_ui(app_json))

            if st.button("Display full record json"):
                st.write(jsonify_for_ui(record_json))

    with tab2:
        cols = 4
        bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]

        # One histogram per feedback column, laid out `cols` per row.
        for row_start in range(0, len(feedback_cols), cols):
            row_feedback = feedback_cols[row_start:row_start + cols]
            with st.container():
                columns = st.columns(cols)
                for column, feedback_name in zip(columns, row_feedback):
                    with column:
                        fig, ax = plt.subplots()
                        ax.hist(
                            app_df[feedback_name],
                            bins=bins,
                            edgecolor="black",
                            color="#2D736D"
                        )
                        ax.set_xlabel("Feedback Value")
                        ax.set_ylabel("Frequency")
                        ax.set_title(feedback_name, loc="center")
                        st.pyplot(fig)
                        # pyplot keeps every figure alive until closed, so
                        # without this they pile up on each rerun.
                        plt.close(fig)
|
|