"""Gradio Space for the HuggingFaceH4 Elo leaderboard.

Pulls human- and GPT-4-based pairwise evaluation results from the Hub,
renders them as two Elo leaderboard tables (scores of 4/5 treated as ties
or not), and periodically restarts the Space so freshly pushed eval data
is reloaded.
"""

# --- stdlib ---
import json  # noqa: F401 -- unused here; kept in case star-imported helpers rely on it
import os
from datetime import datetime, timezone  # noqa: F401 -- unused here; kept intentionally

# --- third-party ---
import gradio as gr
import numpy as np  # noqa: F401 -- unused here; kept intentionally
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# --- project-local ---
from src.assets.css_html_js import custom_css, get_window_url_params  # left in case you need them
from src.assets.text_content import *
from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
from src.init import load_all_info_from_hub
from src.utils_display import EloEvalColumn, fields, styled_error, styled_warning, styled_message

# clone / pull the lmeh eval data
H4_TOKEN = os.environ.get("H4_TOKEN", None)

HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"

# FIX: env vars are strings, so the old `bool(os.environ.get("IS_PUBLIC", True))`
# returned True for any non-empty value, including "False"/"0". Parse the string
# explicitly; when the variable is unset we still default to public.
IS_PUBLIC = str(os.environ.get("IS_PUBLIC", True)).strip().lower() in ("1", "true", "yes")
ADD_PLOTS = False  # toggle for the (currently disabled) visualization section

EVAL_REQUESTS_PATH = "auto_evals/eval_requests"

api = HfApi()


def restart_space():
    """Restart this Space on the Hub so the eval data is re-cloned on boot.

    Scheduled hourly below; requires ``H4_TOKEN`` to have write access.
    """
    api.restart_space(
        repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
    )


# Clone (or reuse) the two eval-data repos locally; returns git handles.
human_eval_repo, gpt_4_eval_repo = load_all_info_from_hub(HUMAN_EVAL_REPO, GPT_4_EVAL_REPO)

ELO_COLS = [c.name for c in fields(EloEvalColumn)]
ELO_TYPES = [c.type for c in fields(EloEvalColumn)]
ELO_SORT_COL = EloEvalColumn.gpt4.name  # leaderboard is ranked by the GPT-4 Elo column


def has_no_nan_values(df, columns):
    """Return a boolean Series: True for rows where *columns* contain no NaN."""
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    """Return a boolean Series: True for rows where any of *columns* is NaN."""
    return df[columns].isna().any(axis=1)


def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
    """Build the Elo leaderboard DataFrame, sorted by GPT-4 Elo (descending).

    Args:
        df_instruct: raw pairwise eval records without code prompts.
        df_code_instruct: raw pairwise eval records with code prompts.
        tie_allowed: when True, scores of 4 and 5 are treated as ties.

    Returns:
        A DataFrame restricted to ``ELO_COLS``, sorted by ``ELO_SORT_COL``.
    """
    # Refresh the local clone so the latest pushed annotations are used.
    if human_eval_repo:
        print("Pulling human_eval_repo changes")
        human_eval_repo.git_pull()

    all_data = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
    dataframe = pd.DataFrame.from_records(all_data)
    dataframe = dataframe.sort_values(by=ELO_SORT_COL, ascending=False)
    dataframe = dataframe[ELO_COLS]
    return dataframe


def get_elo_elements():
    """Load eval data from disk and compute everything the UI needs.

    Returns:
        Tuple of (leaderboard without ties, leaderboard with ties allowed,
        plot_1, plot_2, plot_3, plot_4).
    """
    df_instruct = pd.read_json("human_evals/without_code.json")
    df_code_instruct = pd.read_json("human_evals/with_code.json")

    elo_leaderboard = get_elo_leaderboard(
        df_instruct, df_code_instruct, tie_allowed=False
    )
    elo_leaderboard_with_tie_allowed = get_elo_leaderboard(
        df_instruct, df_code_instruct, tie_allowed=True
    )
    plot_1, plot_2, plot_3, plot_4 = get_elo_plots(
        df_instruct, df_code_instruct, tie_allowed=False
    )

    return (
        elo_leaderboard,
        elo_leaderboard_with_tie_allowed,
        plot_1,
        plot_2,
        plot_3,
        plot_4,
    )


# Compute all tables/plots once at startup; the hourly restart refreshes them.
(
    elo_leaderboard,
    elo_leaderboard_with_tie_allowed,
    plot_1,
    plot_2,
    plot_3,
    plot_4,
) = get_elo_elements()

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    with gr.Row():
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Column():
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
            with gr.Column(scale=1):
                gr.Image(
                    "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
                )
        gr.Markdown("## No tie allowed")
        elo_leaderboard_table = gr.components.Dataframe(
            value=elo_leaderboard,
            headers=ELO_COLS,
            datatype=ELO_TYPES,
            max_rows=5,
        )
        gr.Markdown("## Tie allowed*")
        elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
            value=elo_leaderboard_with_tie_allowed,
            headers=ELO_COLS,
            datatype=ELO_TYPES,
            max_rows=5,
        )
        gr.Markdown(
            "\* Results when the scores of 4 and 5 were treated as ties.",
            elem_classes="markdown-text",
        )
        gr.Markdown(
            "Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
            elem_id="models-to-add-text",
        )

    # The plot section is currently disabled (ADD_PLOTS = False above).
    if ADD_PLOTS:
        with gr.Box():
            visualization_title = gr.HTML(VISUALIZATION_TITLE)
            with gr.Row():
                with gr.Column():
                    gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
                    plot_1 = gr.Plot(plot_1, show_label=False)
                with gr.Column():
                    gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
                    plot_2 = gr.Plot(plot_2, show_label=False)
            with gr.Row():
                with gr.Column():
                    gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
                    plot_3 = gr.Plot(plot_3, show_label=False)
                with gr.Column():
                    gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
                    plot_4 = gr.Plot(plot_4, show_label=False)

    with gr.Row():
        with gr.Column():
            with gr.Accordion("📙 Citation", open=False):
                # NOTE: .style() is the pre-4.x Gradio API; keep it to match the
                # Gradio version this Space is pinned to.
                citation_button = gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id="citation-button",
                ).style(show_copy_button=True)
        with gr.Column():
            with gr.Accordion("✨ CHANGELOG", open=False):
                changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")

# Restart the Space every hour so newly pushed eval data is picked up.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

demo.queue(concurrency_count=40).launch()