Spaces:

devingulliver
/

subquadratic-llm-leaderboard

Running

File size: 6,691 Bytes

b414398
38210ee
b414398
0fade42
0dcfe74
 
9fc3aec
b414398
 
91e32b4
cd7f1fc
91e32b4
42e2af3
ea82c60
42e2af3
c31cd39
62088f1
91e32b4
 
00e3be0
38210ee
356fe31
aabd0b6
363a6cb
 
00e3be0
356fe31
06e8c2c
 
5ce3551
91e32b4
ec51f9d
91e32b4
 
 
38210ee
 
b414398
7c67606
0fade42
8300932
7c67606
0fade42
7c67606
 
 
59d0aa9
 
 
7c67606
 
 
 
 
59d0aa9
 
 
387cdcd
b414398
 
883e413
7285c62
8c47a7a
b414398
 
 
83d1755
356fe31
cc9ebdf
9127c44
339ab1a
356fe31
6ab6442
00e3be0
e98b562
 
e2002de
38210ee
e2002de
38210ee
e98b562
 
00e3be0
97d8622
00e3be0
 
20cfd29
03af7e5
20cfd29
 
b414398
 
 
4fd57b2
8c47a7a
4fd57b2
8c47a7a
4fd57b2
8c47a7a
4fd57b2
e13f0c8
b414398
 
 
 
 
cc9ebdf
b414398
 
 
38210ee
b414398
 
eb5a1b2

import os
import pandas as pd
import requests
import huggingface_hub
import gradio as gr

# Leaderboard rows, loaded once at import time. dtype="str" keeps every column
# as text, so values are displayed exactly as written in data.csv.
data = pd.read_csv("data.csv", dtype="str")
# POST target used by submit_model to queue a submission; os.environ.get
# returns None when WEBHOOK_URL is not set.
webhook_url = os.environ.get("WEBHOOK_URL")

# Architecture name -> reference link (paper or announcement post).
# filter_table looks up each row's "Architecture" value here to build the
# hyperlink shown in the table.
archlinks = {
    "H3": "https://arxiv.org/abs/2212.14052",
    "Mamba": "https://arxiv.org/abs/2312.00752",
    "Jamba": "https://arxiv.org/abs/2403.19887",
    "Based": "https://arxiv.org/abs/2402.18668",
    "RWKV-4": "https://arxiv.org/abs/2305.13048",
    "RWKV-5": "https://substack.recursal.ai/p/rwkv-v5-15b-achieves-sota-multi-lingual", # paper soon!
    "StripedHyena": "https://www.together.ai/blog/stripedhyena-7b", # no paper?
}

def filter_table(cols, name, type, arch, size):
    """Return the leaderboard rows matching the filters, formatted for display.

    Args:
        cols: Column labels to drop from the result (the "hide columns" list).
        name: Text to search for in the "Name" column. Matched literally and
            case-insensitively; an empty string matches every row.
        type: Allowed values of the "Type" column.
        arch: Allowed values of the "Architecture" column.
        size: Allowed values of the "Model Size" column.

    Returns:
        A new DataFrame (the module-level ``data`` is not modified) in which
        "Type" is shortened to its first character and renamed "T", and
        "Name", "Architecture" and "Base Model" are rendered as HTML links.

    Note:
        The parameter name ``type`` shadows the builtin; it is kept so the
        signature stays unchanged for existing callers.
    """
    link_style = "color:var(--link-text-color);text-decoration:underline;text-decoration-style:dotted"

    def link(href, text):
        # One place for the anchor markup shared by all three linked columns.
        return f'<a target="_blank" href="{href}" style="{link_style}">{text}</a>'

    # filter
    # regex=False: the search box holds user text, not a pattern, so input
    # such as "(" or "c++" must not raise re.error.
    # na=False: rows with a missing name simply do not match.
    mask = (
        data["Name"].str.contains(name or "", case=False, regex=False, na=False)
        & data["Type"].isin(type)
        & data["Architecture"].isin(arch)
        & data["Model Size"].isin(size)
    )
    # Explicit copy so the column assignments below never write through to
    # (or warn about) the shared module-level frame.
    tmp = data[mask].copy()

    # prettify
    tmp["Type"] = tmp["Type"].apply(lambda x: x[:1])
    tmp = tmp.rename({"Type": "T"}, axis=1)
    tmp["Name"] = tmp["Name"].apply(lambda x: link(f"https://huggingface.co/{x}", x))
    # Architectures without a known reference link are shown as plain text
    # instead of raising KeyError.
    tmp["Architecture"] = tmp["Architecture"].apply(
        lambda x: link(archlinks[x], x) if x in archlinks else x
    )
    # "base" marks a model with no parent; a missing value is treated the same
    # way rather than being linked as "nan".
    tmp["Base Model"] = tmp["Base Model"].apply(
        lambda x: link(f"https://huggingface.co/{x}", x) if isinstance(x, str) and x != "base" else ""
    )

    # show/hide
    tmp = tmp.drop(cols, axis=1)
    # done!
    return tmp

def submit_model(name):
    """Validate a Hugging Face repo id and push it to the evaluation queue.

    The repo is checked by downloading its ``config.json`` from the Hub; if
    that succeeds the repo id is POSTed to ``webhook_url`` as
    ``{"content": name}``.

    Args:
        name: Repo id typed by the user. Surrounding whitespace is ignored.

    Returns:
        A Markdown status string (an error or success message). Failures are
        reported through the return value; no exception is raised to the
        caller.
    """
    name = (name or "").strip()
    if not name:
        return "# ERROR: Please enter a model name."

    try:
        huggingface_hub.hf_hub_download(repo_id=name, filename="config.json") # sanity check input
    # Use the public re-exports from huggingface_hub.utils instead of the
    # private `_errors` module, which is not a stable import path.
    except huggingface_hub.utils.EntryNotFoundError:
        return "# ERROR: Model does not have a config.json file!"
    except huggingface_hub.utils.RepositoryNotFoundError:
        return "# ERROR: Model could not be found on the Hugging Face Hub!"
    except requests.exceptions.RequestException:
        # Any other HTTP/connection/timeout failure while talking to the Hub.
        return "# ERROR: Network error while validating model. Please try again later."
    except Exception as e:
        print(e)
        return "ERROR: Unexpected error. Please try again later."

    try:
        # The timeout keeps a stalled webhook from hanging the request forever.
        response = requests.post(webhook_url, json={"content": name}, timeout=10)
        # requests only raises HTTPError when asked to; without this a 4xx/5xx
        # reply from the webhook would be reported as success.
        response.raise_for_status()
    except (
        requests.exceptions.HTTPError,
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
    ):
        return "# ERROR: Network error while contacting queue. Please try again in a few minutes."
    except Exception as e:
        # Includes a missing/invalid webhook_url (configuration problem).
        print(e)
        return "ERROR: Unexpected error. Please try again later."

    return "# SUCCESS: Please wait up to 24 hours for your model to be added to the queue."

# Filter options and their defaults, computed once so the widgets and the
# initially rendered table are guaranteed to agree.
all_types = list(data["Type"].unique())
default_types = [n for n in all_types if n not in ["⏳ Pending"]]  # pending rows start hidden
all_archs = list(data["Architecture"].unique())
all_sizes = list(data["Model Size"].unique())
default_hidden_cols = ["Architecture","Model Size","Base Model"]

# Build the UI. Components appear on the page in the order they are created
# inside each layout context.
with gr.Blocks(css=".gradio-container{max-width:95%!important} .tab-buttons button{font-size:1.3em}") as demo:
    gr.HTML('<h1 style="text-align:center"><span style="font-size:1.3em">Subquadratic LLM Leaderboard</span></h1>')
    gr.Markdown("**REMEMBER:** If you don't see an eligible model here, make sure to submit it! We hope to incentivize subquadratic/attention-free LLM development through friendly competition.")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Tab("🏅 LLM Benchmark"):
            with gr.Row():
                with gr.Column():
                    namefilter = gr.Textbox(max_lines=1, placeholder="Search by model name and hit Enter...", show_label=False)
                    typefilter = gr.CheckboxGroup(show_label=False, choices=all_types, value=list(default_types))

                with gr.Column():
                    archfilter = gr.CheckboxGroup(label="Filter by model architecture", choices=all_archs, value=list(all_archs))
                    sizefilter = gr.CheckboxGroup(label="Filter by model size", choices=all_sizes, value=list(all_sizes))

                with gr.Column():
                    # The first two columns of data.csv are never offered for hiding.
                    colfilter = gr.CheckboxGroup(label="Hide columns", choices=list(data.columns)[2:], value=list(default_hidden_cols))

            # Initial table: the same defaults the widgets above start with.
            table = gr.Dataframe(filter_table(default_hidden_cols, "", default_types, all_archs, all_sizes), datatype="markdown")

            # actions
            # Every event passes the full widget state to filter_table, in its
            # parameter order, and re-renders the table.
            filter_inputs = [colfilter,namefilter,typefilter,archfilter,sizefilter]

            # The search box refreshes on Enter.
            namefilter.submit(filter_table, filter_inputs, table)

            # The checkbox groups refresh on every user edit. The loop variable
            # is deliberately not called `filter`, which would rebind the
            # builtin at module scope.
            for control in (colfilter, typefilter, archfilter, sizefilter):
                control.input(filter_table, filter_inputs, table)

        with gr.Tab("⚖️ Comparison"):
            gr.Markdown("This table is whitelisted to one model per architecture, specifically 1.5B models trained on The Pile for 1 epoch, for a direct comparison of architectures.")
            gr.Dataframe(data[data["Name"].isin(["RWKV/rwkv-4-1b5-pile","state-spaces/mamba-1.4b","danfu09/H3-1.3B"])], datatype="markdown")

        with gr.Tab("📝 About"):
            gr.Markdown("""
                The **Subquadratic LLM Leaderboard** evaluates LLMs with subquadratic/attention-free architectures (i.e. RWKV & Mamba) with the goal of providing open
                evaluation results while the architectures themselves are pending inclusion/release in the 🤗 Transformers library.  
                
                The metrics are the same as the Open LLM Leaderboard: ARC 25-shot, HellaSwag 10-shot, MMLU 5-shot, TruthfulQA zeroshot, Winogrande 5-shot, and GSM8K 5-shot.  
                
                This leaderboard is maintained by Devin Gulliver and is perpetually under construction, check back regularly for further improvements!  
                
                Compute for evaluating RWKV models is generously provided by [Recursal AI](https://recursal.ai).
                """)

        with gr.Tab("🚀 Submit here!"):
            with gr.Group():
                with gr.Row():
                    model_name = gr.Textbox(max_lines=1, placeholder="Enter model name...", show_label=False, scale=4)
                    submit = gr.Button("Submit", variant="primary", scale=0)

            output = gr.Markdown("Enter a public HF repo id, then hit Submit to add it to the evaluation queue.")

            # submit_model returns a Markdown status string shown in `output`.
            submit.click(fn=submit_model, inputs=model_name, outputs=output)

demo.launch(show_api=False, allowed_paths=["data.csv"])