File size: 5,252 Bytes
c12bd84 dfa14a8 c12bd84 e3642ff c671de9 c1a84da e3642ff c671de9 c12bd84 e3642ff c12bd84 c671de9 c12bd84 c671de9 c12bd84 43b4e29 e3642ff c671de9 e3642ff 43b4e29 337b761 c1a84da 337b761 ac931c6 337b761 c1a84da 337b761 c671de9 ac931c6 c671de9 ac931c6 c671de9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import streamlit as st
import pandas as pd
import os
import fnmatch
import json
import plotly.express as px
class MultiURLData:
    """Load per-model benchmark results from ``results/`` into one table.

    On construction every ``results*.json`` file found under the ``results``
    directory is read and merged into ``self.data``: a DataFrame with one row
    per model, a ``'Model Name'`` column first, an ``'MMLU_average'`` column
    second, and then one column per benchmark task holding that task's
    ``acc`` value.
    """

    def __init__(self):
        self.data = self.process_data()

    def process_data(self):
        """Read all result files and return the combined leaderboard table.

        Each file must contain a top-level ``'results'`` mapping of
        task name -> metrics, where the metrics include an ``'acc'`` entry.
        The model name is taken from the third component of the file path
        (``results/<x>/<model name>/results*.json``).

        Returns:
            pandas.DataFrame indexed by model name, with columns
            ``'Model Name'``, ``'MMLU_average'`` and one column per task.

        Note:
            ``pd.concat`` raises ``ValueError`` when no result file is found.
        """
        def find_files(directory, pattern):
            # Recursively yield paths under `directory` whose basename
            # matches the shell-style `pattern`.
            for root, _dirs, files in os.walk(directory):
                for basename in files:
                    if fnmatch.fnmatch(basename, pattern):
                        yield os.path.join(root, basename)

        dataframes = []
        for filename in find_files('results', 'results*.json'):
            # Split on the platform separator: os.walk joins with os.sep, so
            # splitting on a hard-coded '/' breaks on Windows.
            model_name = os.path.normpath(filename).split(os.sep)[2]
            with open(filename, encoding='utf-8') as f:
                raw = json.load(f)

            # Rows become tasks, columns become metrics.
            df = pd.DataFrame(raw['results']).T
            df = df.rename(columns={'acc': model_name})

            # Tidy the task names with plain (non-regex) replacements:
            # 'hendrycksTest-' -> 'MMLU_', drop 'harness|', drop '|5'.
            task_names = df.index.str.replace('hendrycksTest-', 'MMLU_', regex=False)
            task_names = task_names.str.replace('harness|', '', regex=False)
            task_names = task_names.str.replace('|5', '', regex=False)
            df.index = task_names

            dataframes.append(df[[model_name]])

        # One column per model -> transpose to one row per model.
        data = pd.concat(dataframes, axis=1).transpose()

        # Mean over every column whose name contains 'MMLU'; computed before
        # the non-numeric 'Model Name' column is added.
        mmlu_average = data.filter(regex='MMLU').mean(axis=1)
        data.insert(0, 'Model Name', data.index)
        data.insert(1, 'MMLU_average', mmlu_average)
        return data

    def get_data(self, selected_models):
        """Return the rows of ``self.data`` whose 'Model Name' is in ``selected_models``."""
        return self.data[self.data['Model Name'].isin(selected_models)]
# Load every model's results once; the widgets, table and plots all read from it.
data_provider = MultiURLData()
st.title('Leaderboard')

all_columns = data_provider.data.columns.tolist()
all_models = data_provider.data['Model Name'].tolist()

# The model and column pickers stay hidden until the user ticks this box;
# while hidden, nothing is filtered out.
filters = st.checkbox('Add filters')
if filters:
    selected_columns = st.multiselect(
        'Select Columns',
        all_columns,
        default=all_columns,
    )
    selected_models = st.multiselect(
        'Select Models',
        all_models,
        default=all_models,
    )
else:
    selected_columns = all_columns
    selected_models = all_models

# Get the filtered data and display it in a table
st.header('Sortable table')
filtered_data = data_provider.get_data(selected_models)
# Only the table honours the column selection: `filtered_data` keeps every
# column because the plots further down look up specific benchmark columns.
st.dataframe(filtered_data[selected_columns])
def create_plot(df, model_column, arc_column, moral_column, models=None):
    """Build a scatter plot with an OLS trendline comparing two score columns.

    Despite their names, ``arc_column`` and ``moral_column`` are simply the
    columns plotted on the x- and y-axis respectively.

    Args:
        df: DataFrame containing ``model_column``, ``arc_column`` and
            ``moral_column``.
        model_column: Name of the column holding model names (shown on hover).
        arc_column: Name of the column plotted on the x-axis.
        moral_column: Name of the column plotted on the y-axis.
        models: Optional collection of model names; when given, only rows
            whose ``model_column`` value is in it are plotted.

    Returns:
        The Plotly figure produced by ``px.scatter``.
    """
    # Filter the dataframe if specific models are provided
    if models is not None:
        df = df[df[model_column].isin(models)]

    # Copy just the three needed columns into a fresh frame.
    plot_data = pd.DataFrame({
        'Model': list(df[model_column]),
        arc_column: list(df[arc_column]),
        moral_column: list(df[moral_column]),
    })

    # Every point gets the same literal colour.
    plot_data['color'] = 'purple'

    # 'identity' makes Plotly use the values of the 'color' column as actual
    # colours; without it 'purple' is treated as a category label and drawn
    # in the default palette colour.
    fig = px.scatter(
        plot_data,
        x=arc_column,
        y=moral_column,
        color='color',
        color_discrete_map='identity',
        hover_data=['Model'],
        trendline='ols',
    )
    fig.update_layout(
        showlegend=False,  # hide legend
        xaxis_title=arc_column,
        yaxis_title=moral_column,
    )
    return fig
# Example of restricting a plot to specific models:
# models_to_plot = ['Model1', 'Model2', 'Model3']
# fig = create_plot(filtered_data, 'Model Name', 'arc:challenge|25', 'moral_scenarios|5', models=models_to_plot)

# Pairwise comparisons between the headline benchmarks (x column, y column).
st.header('Overall benchmark comparison')
for x_column, y_column in (
    ('arc:challenge|25', 'hellaswag|10'),
    ('arc:challenge|25', 'MMLU_average'),
    ('hellaswag|10', 'MMLU_average'),
):
    fig = create_plot(filtered_data, 'Model Name', x_column, y_column)
    st.plotly_chart(fig)

# Moral Scenarios section: the moral-scenarios score against other measures.
st.header('Moral Scenarios')
for x_column in ('arc:challenge|25', 'MMLU_moral_disputes', 'MMLU_average'):
    fig = create_plot(filtered_data, 'Model Name', x_column, 'MMLU_moral_scenarios')
    st.plotly_chart(fig)

# Histograms of the moral scenarios and moral disputes scores.
for score_column in ("MMLU_moral_scenarios", "MMLU_moral_disputes"):
    fig = px.histogram(
        filtered_data,
        x=score_column,
        marginal="rug",
        hover_data=filtered_data.columns,
    )
    st.plotly_chart(fig)