import streamlit as st
import pandas as pd
from huggingface_hub import HfApi, ModelCard
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
import re
from io import StringIO
from yall import create_yall
import plotly.graph_objs as go
def calculate_pages(df, items_per_page):
    """Calculate the number of pages needed for pagination."""
    return -(-len(df) // items_per_page)  # Equivalent to math.ceil(len(df) / items_per_page)


@st.cache_data
def cached_model_info(_api, model):
"""Fetch model information from the Hugging Face API and cache the result."""
    try:
        return _api.model_info(repo_id=str(model))
    except (RepositoryNotFoundError, RevisionNotFoundError):
        return None


@st.cache_data
def get_model_info(df):
"""Get model information and update the DataFrame with likes and tags."""
    api = HfApi()
    with st.spinner("Fetching model information..."):
        for index, row in df.iterrows():
            model_info = cached_model_info(api, row['Model'].strip())
            if model_info:
                df.loc[index, 'Likes'] = model_info.likes
                df.loc[index, 'Tags'] = ', '.join(model_info.tags)
            else:
                df.loc[index, 'Likes'] = -1
                df.loc[index, 'Tags'] = ''
    return df


def convert_markdown_table_to_dataframe(md_content):
"""Convert a markdown table to a pandas DataFrame."""
    cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
    df = pd.read_csv(StringIO(cleaned_content), sep=r'\|', engine='python')
    df = df.drop(0, axis=0)
    df.columns = df.columns.str.strip()
    model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
    df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
    df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
    return df


def create_bar_chart(df, category):
"""Create a horizontal bar chart for the specified category."""
    st.write(f"### {category} Scores")
    sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
    fig = go.Figure(go.Bar(
        x=sorted_df[category],
        y=sorted_df['Model'],
        orientation='h',
        marker=dict(color=sorted_df[category], colorscale='Viridis'),
        hoverinfo='x+y',
        text=sorted_df[category],
        textposition='auto'
    ))
    fig.update_layout(
        margin=dict(l=20, r=20, t=20, b=20),
        title=f"Leaderboard for {category} Scores"
    )
    st.plotly_chart(fig, use_container_width=True, height=len(df) * 35)


def fetch_merge_configs(df):
"""Fetch and save merge configurations for the top models."""
    df_sorted = df.sort_values(by='Average', ascending=False)
    try:
        with open('/tmp/configurations.txt', 'a') as file:
            for index, row in df_sorted.head(20).iterrows():
                model_name = row['Model'].rstrip()
                try:
                    card = ModelCard.load(model_name)
                    file.write(f'Model Name: {model_name}\n')
                    file.write(f'Scores: {row["Average"]}\n')
                    file.write(f'AGIEval: {row["AGIEval"]}\n')
                    file.write(f'GPT4All: {row["GPT4All"]}\n')
                    file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
                    file.write(f'Bigbench: {row["Bigbench"]}\n')
                    file.write(f'Model Card: {card}\n')
                except Exception as e:
                    st.error(f"Error loading model card for {model_name}: {str(e)}")
        with open('/tmp/configurations.txt', 'r') as file:
            content = file.read()
        matches = re.findall(r'yaml(.*?)```', content, re.DOTALL)
        with open('/tmp/configurations2.txt', 'w') as file:
            for row, match in zip(df_sorted[['Model', 'Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']].head(20).values, matches):
                file.write(f'Model Name: {row[0]}\n')
                file.write(f'Scores: {row[1]}\n')
                file.write(f'AGIEval: {row[2]}\n')
                file.write(f'GPT4All: {row[3]}\n')
                file.write(f'TruthfulQA: {row[4]}\n')
                file.write(f'Bigbench: {row[5]}\n')
                file.write('yaml' + match + '```\n')
    except Exception as e:
        st.error(f"Error while fetching merge configs: {str(e)}")


def main():
"""Main function to set up the Streamlit app and display the leaderboard."""
    st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
    st.title("🏆 YALL - Yet Another LLM Leaderboard")
    st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using the [Nous](https://huggingface.co/NousResearch) benchmark suite.")
    content = create_yall()
    tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])

    with tab1:
        if content:
            try:
                score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
                full_df = convert_markdown_table_to_dataframe(content)
                for col in score_columns:
                    full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')
                full_df = get_model_info(full_df)
                full_df['Tags'] = full_df['Tags'].fillna('')
                df = pd.DataFrame(columns=full_df.columns)
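                # Bucket models by their Hub tags so each model family can be toggled on or off.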
                show_phi = st.checkbox("Phi (2.8B)", value=True)
                show_mistral = st.checkbox("Mistral (7B)", value=True)
                show_other = st.checkbox("Other", value=True)

                dfs_to_concat = []
                if show_phi:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
                if show_mistral:
                    dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('mistral,')])
                if show_other:
                    other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
                    dfs_to_concat.append(other_df)
                if dfs_to_concat:
                    df = pd.concat(dfs_to_concat, ignore_index=True)

                search_query = st.text_input("Search models", "")
                if search_query:
                    df = df[df['Model'].str.contains(search_query, case=False)]

                items_per_page = 50
                pages = calculate_pages(df, items_per_page)
                page = st.selectbox("Page", list(range(1, pages + 1)))

                df = df.sort_values(by='Average', ascending=False)
                start = (page - 1) * items_per_page
                end = start + items_per_page
                df = df[start:end]

                selected_benchmarks = st.multiselect('Select benchmarks to include in the average', score_columns, default=score_columns)
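                # 'Filtered Average' is the unweighted mean of whichever benchmarks are selected;
                # the table below is re-sorted on it.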
                if selected_benchmarks:
                    df['Filtered Average'] = df[selected_benchmarks].mean(axis=1)
                    df = df.sort_values(by='Filtered Average', ascending=False)
                    st.dataframe(
                        df[['Model'] + selected_benchmarks + ['Filtered Average', 'Likes', 'URL']],
                        use_container_width=True,
                        column_config={
                            "Likes": st.column_config.NumberColumn(
                                "Likes",
                                help="Number of likes on Hugging Face",
                                format="%d ❤️",
                            ),
                            "URL": st.column_config.LinkColumn("URL"),
                        },
                        hide_index=True,
                        height=len(df) * 37,
                    )

                selected_models = st.multiselect('Select models to compare', df['Model'].unique())
                comparison_df = df[df['Model'].isin(selected_models)]
                st.dataframe(comparison_df)

                if st.button("Export to CSV"):
                    csv_data = df.to_csv(index=False)
                    st.download_button(
                        label="Download CSV",
                        data=csv_data,
                        file_name="leaderboard.csv",
                        key="download-csv",
                        help="Click to download the CSV file",
                    )
                if st.button("Fetch Merge-Configs"):
                    fetch_merge_configs(full_df)
                    st.success("Merge configurations have been fetched and saved.")

                create_bar_chart(df, 'Filtered Average')

                col1, col2 = st.columns(2)
                with col1:
                    create_bar_chart(df, score_columns[1])
                with col2:
                    create_bar_chart(df, score_columns[2])

                col3, col4 = st.columns(2)
                with col3:
                    create_bar_chart(df, score_columns[3])
                with col4:
                    create_bar_chart(df, score_columns[4])

            except Exception as e:
                st.error("An error occurred while processing the markdown table.")
                st.error(str(e))
        else:
            st.error("Failed to download the content from the URL provided.")
    with tab2:
        st.markdown('''
### Nous benchmark suite

Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:

* [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
* **GPT4ALL** (0-shot): `hellaswag,openbookqa,winogrande,arc_easy,arc_challenge,boolq,piqa`
* [**TruthfulQA**](https://arxiv.org/abs/2109.07958) (0-shot): `truthfulqa_mc`
* [**Bigbench**](https://arxiv.org/abs/2206.04615) (0-shot): `bigbench_causal_judgement,bigbench_date_understanding,bigbench_disambiguation_qa,bigbench_geometric_shapes,bigbench_logical_deduction_five_objects,bigbench_logical_deduction_seven_objects,bigbench_logical_deduction_three_objects,bigbench_movie_recommendation,bigbench_navigate,bigbench_reasoning_about_colored_objects,bigbench_ruin_names,bigbench_salient_translation_error_detection,bigbench_snarks,bigbench_sports_understanding,bigbench_temporal_sequences,bigbench_tracking_shuffled_objects_five_objects,bigbench_tracking_shuffled_objects_seven_objects,bigbench_tracking_shuffled_objects_three_objects`

### Reproducibility

You can easily reproduce these results using 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval/tree/master), a colab notebook that automates the evaluation process (benchmark: `nous`). This will upload the results to GitHub as gists. You can find the entire table with the links to the detailed results [here](https://gist.github.com/mlabonne/90294929a2dbcb8877f9696f28105fdf).

### Clone this space

You can create your own leaderboard with your LLM AutoEval results on GitHub Gist. You just need to clone this space and specify two variables:

* Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
* Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))

A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
''')


if __name__ == "__main__":
    main()