"""Streamlit dashboard plotting AlpacaEval win rates against mean output length.

Run with ``streamlit run <this file>``. The page shows one tab per win-rate
metric; each tab holds a scatter plot (with OLS trendlines), a rankings table
and the trendline R² values.
"""

from typing import Sequence, Tuple, Union

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm

# Set the layout to wide
st.set_page_config(layout="wide")

# Preset name -> case-insensitive regex matched against the model name.
_PRESET_PATTERNS = {
    "gpt": "openai|gpt",
    "claude": "claude",
    "moa": "moa",
    "llama": "llama",
}

# Y-axis titles for the known metric columns; any other column falls back to
# _DEFAULT_Y_AXIS_TITLE.
_Y_AXIS_TITLES = {
    "win_rate": "Win rate",
    "length_controlled_winrate": "LC Win Rate",
}
_DEFAULT_Y_AXIS_TITLE = "Discrete Win Rate"

# (tab label, metric column, chart title), in display order.
_CHARTS = (
    ("LC Win Rate", "length_controlled_winrate", "Length-Controlled Win Rate"),
    ("Win Rate", "win_rate", "Win Rate"),
    ("Discrete Win Rate", "discrete_win_rate", "Discrete Win Rate"),
)


def prep_rankings_table(df: pd.DataFrame, y_column: str) -> pd.DataFrame:
    """Build a rankings table sorted by ``y_column`` in descending order.

    Args:
        df: Frame containing ``model_name``, ``num_words_mean`` and
            ``y_column``. It is not modified.
        y_column: Name of the metric column to rank by.

    Returns:
        A new frame with the columns ``rank`` (1-based position after
        sorting), ``model_name``, ``y_column`` (rounded to 2 decimals) and
        ``num_words_mean``.
    """
    # Column selection and sort_values both return new objects, so the
    # caller's frame is left untouched without an explicit copy.
    ranked = (
        df[["model_name", y_column, "num_words_mean"]]
        .sort_values(y_column, ascending=False)
        .reset_index(drop=True)
    )
    ranked["rank"] = ranked.index + 1
    ranked[y_column] = ranked[y_column].round(2)
    return ranked[["rank", "model_name", y_column, "num_words_mean"]]


def _add_markers(
    fig: go.Figure,
    df: pd.DataFrame,
    x_column: str,
    y_column: str,
    *,
    name: str,
    size: int,
    color: str,
    visible: Union[bool, str] = True,
) -> None:
    """Add one marker trace of ``x_column`` vs ``y_column`` to ``fig``."""
    fig.add_trace(
        go.Scatter(
            x=df[x_column],
            y=df[y_column],
            mode="markers",
            name=name,
            text=df["model_name"],  # hover label
            marker=dict(size=size, color=color),
            showlegend=True,
            visible=visible,
        )
    )


def _add_trendline(
    fig: go.Figure,
    df: pd.DataFrame,
    x: str,
    y: str,
    name: str,
    color: str,
    visibility: Union[bool, str] = "legendonly",
) -> float:
    """Fit ``y ~ const + x`` by OLS, draw the fitted line, and return its R²."""
    # NOTE(review): the fit uses statsmodels' default missing-value handling;
    # confirm the input columns never contain NaN.
    design = sm.add_constant(df[x])
    fitted = sm.OLS(df[y], design).fit()
    fig.add_trace(
        go.Scatter(
            x=df[x],
            y=fitted.predict(design),
            mode="lines",
            name=f"{name} trendline",
            line=dict(color=color, width=2),
            visible=visibility,  # Control the initial visibility
        )
    )
    return fitted.rsquared


def create_scatter_plot(
    df: pd.DataFrame,
    y_column: str,
    selected_models: Sequence[str],
    title: str,
) -> Tuple[go.Figure, float, float]:
    """Plot ``y_column`` against mean length in words and in tokens.

    Args:
        df: Frame with ``model_name``, ``num_words_mean``,
            ``num_tokens_mean`` and ``y_column``.
        y_column: Metric column plotted on the y axis.
        selected_models: Model names to highlight with larger markers; an
            empty selection adds no highlight traces.
        title: Figure title.

    Returns:
        ``(figure, r_squared_words, r_squared_tokens)`` where the R² values
        come from the words and tokens trendline fits respectively.
    """
    fig = go.Figure()

    # The word-based traces start hidden ("legendonly"); the token-based
    # traces are shown by default.
    _add_markers(
        fig, df, "num_words_mean", y_column,
        name="words", size=5, color="skyblue", visible="legendonly",
    )
    _add_markers(
        fig, df, "num_tokens_mean", y_column,
        name="tokens", size=5, color="orange",
    )

    # Highlight selected models
    if selected_models:
        selected_data = df[df["model_name"].isin(selected_models)]
        _add_markers(
            fig, selected_data, "num_words_mean", y_column,
            name="selected words", size=10, color="blue", visible="legendonly",
        )
        _add_markers(
            fig, selected_data, "num_tokens_mean", y_column,
            name="selected tokens", size=10, color="orangered",
        )

    # Trendlines are always fitted on the full frame, not on the selection.
    r_squared_words = _add_trendline(
        fig, df, "num_words_mean", y_column, "words", "blue"
    )
    r_squared_tokens = _add_trendline(
        fig, df, "num_tokens_mean", y_column, "tokens", "orangered",
        visibility=True,
    )

    fig.update_layout(
        xaxis_title="Mean length",
        yaxis_title=_Y_AXIS_TITLES.get(y_column, _DEFAULT_Y_AXIS_TITLE),
        title=title,
        legend_title="Legend",
    )
    return fig, r_squared_words, r_squared_tokens


def _render_tab(
    df: pd.DataFrame,
    y_column: str,
    selected_models: Sequence[str],
    title: str,
) -> None:
    """Render the chart, rankings table and R² expander for one metric."""
    fig, r_squared_words, r_squared_tokens = create_scatter_plot(
        df, y_column, selected_models, title
    )

    chart_col, table_col = st.columns([3, 2])
    chart_col.plotly_chart(fig)
    table_col.markdown("#### Rankings")
    table_col.dataframe(prep_rankings_table(df, y_column), hide_index=True)

    with st.expander("Trendline R²"):
        st.markdown(
            f"- R² (Words vs {y_column}): {r_squared_words:.2f}\n"
            f"- R² (Tokens vs {y_column}): {r_squared_tokens:.2f}"
        )


def app() -> None:
    """Render the AlpacaEval visualizations page."""
    st.title("AlpacaEval Visualizations")
    st.markdown("## Win rate vs. overall mean length")

    # Load the data
    df = pd.read_json("data/model_win_rates.json")

    # Add a model name column for hover labels
    df["model_name"] = df.index.astype(str)

    # Define the preset groups: every model whose name matches the pattern.
    model_names = df["model_name"]
    presets = {
        group: model_names[model_names.str.contains(pattern, case=False)].tolist()
        for group, pattern in _PRESET_PATTERNS.items()
    }
    presets["custom"] = []

    # Add radio button for preset groups
    preset_selection = st.radio(
        "Select a preset group of models or choose 'custom' to select manually",
        options=["custom", "gpt", "claude", "moa", "llama"],
    )

    # Add multiselect for custom model selection
    if preset_selection == "custom":
        selected_models = st.multiselect(
            "Select models to highlight", options=model_names.unique()
        )
    else:
        selected_models = presets[preset_selection]

    # Create tabs for each chart
    tabs = st.tabs([label for label, _, _ in _CHARTS])
    for tab, (_, y_column, title) in zip(tabs, _CHARTS):
        with tab:
            _render_tab(df, y_column, selected_models, title)

    # NOTE(review): placed below the tabs; confirm it was not meant to live
    # inside the last tab.
    with st.expander("Raw data"):
        st.dataframe(df)


if __name__ == "__main__":
    app()