# some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main import json import os from datetime import datetime, timezone import gradio as gr import pandas as pd import requests from huggingface_hub import HfApi from src.css_html import custom_css from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL from src.utils import ( AutoEvalColumn, fields, is_model_on_hub, make_clickable_names, plot_elo_mle, plot_solve_rate, styled_error, styled_message, ) from datasets import load_dataset TOKEN = os.environ.get("TOKEN", None) api = HfApi(TOKEN) df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values(["complete", "instruct"], ascending=False) task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="task_no_tie").to_pandas() bench_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="benchmark_tie").to_pandas() complete_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="complete").to_pandas() instruct_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="instruct").to_pandas() QUEUE_REPO = "bigcode/bigcodebench-requests" EVAL_REQUESTS_PATH = "eval-queue" COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden] COLS_LITE = [ c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden ] TYPES_LITE = [ c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden ] def add_new_eval( model: str, revision: str, model_type: str, ): current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") if model_type is None or model_type == "": return styled_error("Please select a model type.") # check the model actually exists before adding the eval if revision == "": revision = "main" model_on_hub, error = is_model_on_hub(model, revision) if not model_on_hub: return styled_error(f'Model "{model}" {error}') print("adding new eval") eval_entry = { "model": model, "revision": revision, "status": "PENDING", "submitted_time": current_time, "model_type": model_type.split(" ")[1], } user_name = "" model_path = model if "/" in model: user_name = model.split("/")[0] model_path = model.split("/")[1] OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}" os.makedirs(OUT_DIR, exist_ok=True) out_path = f"{OUT_DIR}/{model_path}_eval_request.json" print(f"Saving eval request to {out_path}") with open(out_path, "w") as f: f.write(json.dumps(eval_entry)) api.upload_file( path_or_fileobj=out_path, path_in_repo=out_path.split("eval-queue/")[1], repo_id=QUEUE_REPO, repo_type="dataset", commit_message=f"Add {model} to eval queue", ) # remove the local file os.remove(out_path) return styled_message("Your request has been submitted to the evaluation queue!\n") def select_columns(df, columns): always_here_cols = [ AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name, ] # We use COLS to maintain sorting filtered_df = df[ always_here_cols + [c for c in COLS if c in df.columns and c in columns] ] return filtered_df def filter_types(df, leaderboard_table, query): if query == "all": return df[leaderboard_table.columns] else: query = query[0] filtered_df = df[df["type"].str.contains(query, na=False)] return filtered_df[leaderboard_table.columns] def filter_direct_complete(df, leaderboard_table, query): if query == "all": return df[leaderboard_table.columns] if query == "chat template": return df[~df["direct_complete"]][leaderboard_table.columns] else: return df[df["direct_complete"]][leaderboard_table.columns] def search_table(df, leaderboard_table, query): filtered_df = df[(df["model"].str.contains("|".join(q.strip() for q in query.split("|")), case=False))] return filtered_df[leaderboard_table.columns] df = make_clickable_names(df) demo = gr.Blocks(css=custom_css) with demo: with gr.Row(): gr.Markdown( """
Inspired from the ๐ค Open LLM Leaderboard and โญ Big Code Models Leaderboard, we compare performance of LLMs on BigCodeBench benchmark.
To get started, please check out our GitHub repository.
""", elem_classes="markdown-text", ) with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.Column(): with gr.Tabs(elem_classes="A100-tabs") as A100_tabs: with gr.TabItem("๐ Evaluation Table", id=0): with gr.Column(): with gr.Accordion("โก๏ธ See All Columns", open=False): shown_columns = gr.CheckboxGroup( choices=[ c for c in COLS if c not in [ AutoEvalColumn.dummy.name, AutoEvalColumn.model.name, AutoEvalColumn.model_type_symbol.name, ] ], value=[ c for c in COLS_LITE if c not in [ AutoEvalColumn.dummy.name, AutoEvalColumn.model.name, AutoEvalColumn.model_type_symbol.name, ] ], label="", elem_id="column-select", interactive=True, ) # with gr.Column(min_width=780): with gr.Row(): search_bar = gr.Textbox( placeholder="๐ Separate multiple queries with '|'", show_label=False, elem_id="search-bar", ) filter_types_columns = gr.Radio( label="โ Filter model types", choices=["all", "๐ข base", "๐ถ instruction-tuned"], #, "EXT external-evaluation"], value="all", elem_id="filter-columns", ) filter_prompting_columns = gr.Radio( label="โ Filter prompting", choices=["all", "chat template", "direct complete"], value="all", elem_id="filter-direct-complete", ) leaderboard_df = gr.components.Dataframe( value=df[ [ AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name, ] + shown_columns.value ], headers=[ AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name, ] + shown_columns.value, datatype=TYPES, elem_id="leaderboard-table", interactive=False, ) hidden_leaderboard_df = gr.components.Dataframe( value=df, headers=COLS, datatype=["str" for _ in range(len(COLS))], visible=False, ) search_bar.submit( search_table, [hidden_leaderboard_df, leaderboard_df, search_bar], leaderboard_df, ) filter_types_columns.change( filter_types, [hidden_leaderboard_df, leaderboard_df, filter_types_columns], leaderboard_df, ) filter_prompting_columns.change( filter_direct_complete, [hidden_leaderboard_df, leaderboard_df, filter_prompting_columns], leaderboard_df, ) shown_columns.change( select_columns, [hidden_leaderboard_df, shown_columns], leaderboard_df, ) gr.Markdown( """ **Notes:** - _Complete_ vs _Instruct_: - Complete: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding. - Instruct (๐ฅVibe Check๐ฅ): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code. - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants. - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is boostrapped 500 times. - `size` is the amount of activated model weight during inference. - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination. - For more details check the ๐ About section. """, elem_classes="markdown-text", ) with gr.TabItem("๐ Elo Rating", id=1): with gr.Column(): with gr.Group(): gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_") task_elo_map = gr.Plot() demo.load(plot_elo_mle, [gr.Dataframe(task_elo_mle_df, visible=False)], task_elo_map) with gr.Group(): gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)") model_elo_map = gr.Plot() demo.load(plot_elo_mle, [gr.Dataframe(bench_elo_mle_df, visible=False)], model_elo_map) with gr.TabItem("๐งฉ Solve Rate", id=2): with gr.Column(): complete_map = gr.Plot() demo.load(plot_solve_rate, [gr.Dataframe(complete_solve_rate, visible=False), gr.Textbox("Complete", visible=False), ], complete_map) instruct_map = gr.Plot() demo.load(plot_solve_rate, [gr.Dataframe(instruct_solve_rate, visible=False), gr.Textbox("Instruct", visible=False), ], instruct_map) with gr.TabItem("๐ About", id=3): gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text") with gr.TabItem("Submit/Request Results ๐", id=4): gr.Markdown(SUBMISSION_TEXT_3) with gr.Row(): with gr.Accordion("๐ Citation", open=False): citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True, ) demo.launch()