import argparse import asyncio import json import math import sys # https://github.com/jerryjliu/llama_index/issues/7244: asyncio.set_event_loop(asyncio.new_event_loop()) from millify import millify import numpy as np import streamlit as st from streamlit_extras.switch_page_button import switch_page from trulens_eval.db_migration import MIGRATION_UNKNOWN_STR from trulens_eval.ux.styles import CATEGORY st.runtime.legacy_caching.clear_cache() from trulens_eval import Tru from trulens_eval.ux import styles from trulens_eval.ux.components import draw_metadata st.set_page_config(page_title="Leaderboard", layout="wide") from trulens_eval.ux.add_logo import add_logo_and_style_overrides add_logo_and_style_overrides() database_url = None def streamlit_app(): tru = Tru(database_file="./models/trulens_eval.sqlite") lms = tru.db # Set the title and subtitle of the app st.title("App Leaderboard") st.write( "Average feedback values displayed in the range from 0 (worst) to 1 (best)." ) df, feedback_col_names = lms.get_records_and_feedback([]) feedback_defs = lms.get_feedback_defs() feedback_directions = { ( row.feedback_json.get("supplied_name", "") or row.feedback_json["implementation"]["name"] ): row.feedback_json.get("higher_is_better", True) for _, row in feedback_defs.iterrows() } if df.empty: st.write("No records yet...") return df = df.sort_values(by="app_id") if df.empty: st.write("No records yet...") apps = list(df.app_id.unique()) st.markdown("""---""") for app in apps: app_df = df.loc[df.app_id == app] if app_df.empty: continue app_str = app_df["app_json"].iloc[0] app_json = json.loads(app_str) metadata = app_json.get("metadata") # st.text('Metadata' + str(metadata)) st.header(app, help=draw_metadata(metadata)) app_feedback_col_names = [ col_name for col_name in feedback_col_names if not app_df[col_name].isna().all() ] col1, col2, col3, col4, *feedback_cols, col99 = st.columns( 5 + len(app_feedback_col_names) ) latency_mean = ( app_df["latency"]. apply(lambda td: td if td != MIGRATION_UNKNOWN_STR else None).mean() ) # app_df_feedback = df.loc[df.app_id == app] col1.metric("Records", len(app_df)) col2.metric( "Average Latency (Seconds)", ( f"{millify(round(latency_mean, 5), precision=2)}" if not math.isnan(latency_mean) else "nan" ), ) col3.metric( "Total Cost (USD)", f"${millify(round(sum(cost for cost in app_df.total_cost if cost is not None), 5), precision = 2)}", ) col4.metric( "Total Tokens", millify( sum( tokens for tokens in app_df.total_tokens if tokens is not None ), precision=2 ), ) for i, col_name in enumerate(app_feedback_col_names): mean = app_df[col_name].mean() st.write( styles.stmetricdelta_hidearrow, unsafe_allow_html=True, ) higher_is_better = feedback_directions.get(col_name, True) if "distance" in col_name: feedback_cols[i].metric( label=col_name, value=f"{round(mean, 2)}", delta_color="normal" ) else: cat = CATEGORY.of_score(mean, higher_is_better=higher_is_better) feedback_cols[i].metric( label=col_name, value=f"{round(mean, 2)}", delta=f"{cat.icon} {cat.adjective}", delta_color=( "normal" if cat.compare( mean, CATEGORY.PASS[cat.direction].threshold ) else "inverse" ), ) with col99: if st.button("Select App", key=f"app-selector-{app}"): st.session_state.app = app switch_page("Evaluations") # with st.expander("Model metadata"): # st.markdown(draw_metadata(metadata)) st.markdown("""---""") # Define the main function to run the app def main(): streamlit_app() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--database-url", default=None) try: args = parser.parse_args() except SystemExit as e: # This exception will be raised if --help or invalid command line arguments # are used. Currently, streamlit prevents the program from exiting normally, # so we have to do a hard exit. sys.exit(e.code) database_url = args.database_url main()