Spaces:

Salesforce
/

GIFT-Eval

Running

File size: 12,384 Bytes

aada8de
f5303bc
aada8de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5303bc
aada8de
 
 
 
 
 
f5303bc
aada8de
409ae36
c29a860
aada8de
 
 
 
 
409ae36
aada8de
89d6f25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aada8de
409ae36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5303bc
 
 
 
 
 
 
 
 
 
aada8de
8d950ab
 
 
 
 
aada8de
409ae36
f5303bc
 
aada8de
f5303bc
 
 
 
 
 
 
 
 
aada8de
f5303bc
 
aada8de
f5303bc
 
 
 
 
 
 
89d6f25
aada8de
f5303bc
409ae36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f5303bc
 
 
 
aada8de
 
 
 
 
 
 
 
 
 
26602a7
f5303bc
26602a7
409ae36
 
f5303bc
409ae36
 
 
f5303bc
409ae36
 
 
f5303bc
409ae36
 
aada8de
 
8d950ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409ae36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aada8de
 
 
 
 
 
 
 
 
 
 
 
af47d97
aada8de

import gradio as gr
import ipdb
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    EVAL_COLS,
    EVAL_TYPES,
    ModelInfoColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_info_df, get_merged_df
from src.submission.submit import add_new_eval
from src.utils import norm_sNavie, pivot_df
# import ipdb


def restart_space():
    """Ask the Hub API client to restart the Space identified by ``REPO_ID``."""
    space_id = REPO_ID
    API.restart_space(repo_id=space_id)


### Space initialisation
# try:
#     print(EVAL_REQUESTS_PATH)
#     snapshot_download(
#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
#         token=TOKEN
#     )
# except Exception:
#     restart_space()
# try:
#     print(EVAL_RESULTS_PATH)
#     snapshot_download(
#         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
#         token=TOKEN
#     )
# except Exception:
#     restart_space()

# # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
# df = pd.read_csv('LOTSAv2_EvalBenchmark(Long).csv')
# # Step 2: Pivot the DataFrame
# LEADERBOARD_DF = df.pivot_table(index='model',
#                              columns='dataset',
#                              values='eval_metrics/MAE[0.5]',
#                              aggfunc='first')
# LEADERBOARD_DF.drop(columns=['ALL'], inplace=True)
#
# # Reset the index if you want the model column to be part of the DataFrame
# LEADERBOARD_DF.reset_index(inplace=True)
# # Step 3: normalize the values
# # ipdb.set_trace()
# LEADERBOARD_DF = norm_sNavie(LEADERBOARD_DF)
#
# # LEADERBOARD_DF['Average'] = LEADERBOARD_DF.mean(axis=1)
# # LEADERBOARD_DF.insert(1, 'Average', LEADERBOARD_DF.pop('Average'))
# # LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by=['Average'], ascending=True)
# print(f"The leaderboard is {LEADERBOARD_DF}")
# print(f'Columns: ', LEADERBOARD_DF.columns)

# LEADERBOARD_DF = pd.read_csv('pivoted_df.csv')
# Load one results table per leaderboard tab. Each table comes from a grouped
# CSV under `results/` and is passed through `pivot_df` with the matching
# `tab_name`; the resulting frame is printed to stdout for debugging.
# NOTE(review): `pivot_df` is defined in src.utils and not visible here —
# presumably it reshapes the grouped CSV into one row per model; confirm there.
domain_df = pivot_df('results/grouped_results_by_domain.csv', tab_name='domain')
print(f'Domain dataframe is {domain_df}')
freq_df = pivot_df('results/grouped_results_by_frequency.csv', tab_name='frequency')
print(f'Freq dataframe is {freq_df}')
term_length_df = pivot_df('results/grouped_results_by_term_length.csv', tab_name='term_length')
print(f'Term length dataframe is {term_length_df}')
variate_type_df = pivot_df('results/grouped_results_by_univariate.csv', tab_name='univariate')
print(f'Variate type dataframe is {variate_type_df}')
# Model metadata, later merged into each results table by `init_leaderboard`.
model_info_df = get_model_info_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)

# domain_df = get_merged_df(domain_df, model_info_df)
# print('Merged domain df: ', domain_df)
# freq_df = get_merged_df(freq_df, model_info_df)
# print('Merged freq df: ', freq_df)
# term_length_df = get_merged_df(term_length_df, model_info_df)
# print('Merged term length df: ', term_length_df)
# variate_type_df = get_merged_df(variate_type_df, model_info_df)
# print('Merged variate type df: ', variate_type_df)

# (
#     finished_eval_queue_df,
#     running_eval_queue_df,
#     pending_eval_queue_df,
# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard(ori_dataframe, model_info_df):
    """Build the ``Leaderboard`` component for one results table.

    Args:
        ori_dataframe: Results table for one grouping. All of its columns are
            part of the default column selection.
        model_info_df: Model metadata, combined with ``ori_dataframe`` through
            ``get_merged_df``.

    Returns:
        A non-interactive ``Leaderboard`` over the merged table, with the
        ``'T'`` column moved to the front, searchable on ``'model'`` and
        filterable by model type.

    Raises:
        ValueError: If ``ori_dataframe`` is ``None`` or empty.
    """
    if ori_dataframe is None or ori_dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    info_fields = tuple(fields(ModelInfoColumn))

    # Model-info columns that stay out of the default selection even when the
    # field itself is flagged as displayed by default.
    excluded_names = ('#Params (B)', 'available_on_hub', 'hub', 'Model sha', 'Hub License')
    info_defaults = []
    for info_field in info_fields:
        if info_field.displayed_by_default and info_field.name not in excluded_names:
            info_defaults.append(info_field.name)

    default_selection_list = [*ori_dataframe.columns, *info_defaults]
    print('default_selection_list: ', default_selection_list)

    # Merge in the model metadata, then reorder so that 'T' is the first column.
    merged_df = get_merged_df(ori_dataframe, model_info_df)
    remaining_cols = [name for name in merged_df.columns if name != 'T']
    merged_df = merged_df[['T', *remaining_cols]]
    print('Merged df: ', merged_df)

    column_picker = SelectColumns(
        default_selection=default_selection_list,
        cant_deselect=[info_field.name for info_field in info_fields if info_field.never_hidden],
        label="Select Columns to Display:",
    )
    model_type_filter = ColumnFilter(
        ModelInfoColumn.model_type.name,
        type="checkboxgroup",
        label="Model types",
    )

    return Leaderboard(
        value=merged_df,
        select_columns=column_picker,
        hide_columns=[info_field.name for info_field in info_fields if info_field.hidden],
        search_columns=['model'],
        filter_columns=[model_type_filter],
        interactive=False,
    )


# Build the Gradio UI: a title/introduction header, a tab set with one
# leaderboard per grouping plus an "About" tab, and a citation accordion.
# Components are instantiated inside the `with` contexts below.
demo = gr.Blocks(css=custom_css)
with demo:
    # Page header: title HTML followed by the introduction markdown.
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # One tab per grouping. Each tab builds its own Leaderboard from the
        # matching module-level dataframe and logs that dataframe to stdout.
        # `leaderboard` is rebound in every tab and is not read again in this file.
        # NOTE(review): every TabItem uses the same elem_id
        # ("llm-benchmark-tab-table") — confirm the CSS relies on this.
        with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(domain_df, model_info_df)
            print(f"FINAL Domain LEADERBOARD 1 {domain_df}")

        with gr.TabItem("🏅 By Frequency", elem_id="llm-benchmark-tab-table", id=1):
            leaderboard = init_leaderboard(freq_df, model_info_df)
            print(f"FINAL Frequency LEADERBOARD 1 {freq_df}")

        with gr.TabItem("🏅 By term length", elem_id="llm-benchmark-tab-table", id=2):
            leaderboard = init_leaderboard(term_length_df, model_info_df)
            print(f"FINAL term length LEADERBOARD 1 {term_length_df}")

        with gr.TabItem("🏅 By variate type", elem_id="llm-benchmark-tab-table", id=3):
            leaderboard = init_leaderboard(variate_type_df, model_info_df)
            print(f"FINAL LEADERBOARD 1 {variate_type_df}")
        # Static documentation tab.
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        # Disabled submission tab (kept for reference; depends on the
        # evaluation-queue dataframes that are also commented out above).
        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
        #     with gr.Column():
        #         with gr.Row():
        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
        #
        #         with gr.Column():
        #             with gr.Accordion(
        #                     f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
        #                     open=False,
        #             ):
        #                 with gr.Row():
        #                     finished_eval_table = gr.components.Dataframe(
        #                         value=finished_eval_queue_df,
        #                         headers=EVAL_COLS,
        #                         datatype=EVAL_TYPES,
        #                         row_count=5,
        #                     )
        #             with gr.Accordion(
        #                     f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
        #                     open=False,
        #             ):
        #                 with gr.Row():
        #                     running_eval_table = gr.components.Dataframe(
        #                         value=running_eval_queue_df,
        #                         headers=EVAL_COLS,
        #                         datatype=EVAL_TYPES,
        #                         row_count=5,
        #                     )
        #
        #             with gr.Accordion(
        #                     f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
        #                     open=False,
        #             ):
        #                 with gr.Row():
        #                     pending_eval_table = gr.components.Dataframe(
        #                         value=pending_eval_queue_df,
        #                         headers=EVAL_COLS,
        #                         datatype=EVAL_TYPES,
        #                         row_count=5,
        #                     )
        #     with gr.Row():
        #         gr.Markdown("# ✉️✨ Submit your model outputs !", elem_classes="markdown-text")
        #         gr.Markdown(
        #             "Send your model outputs for all the models using the ContextualBench code and email them to us at xnguyen@salesforce.com ",
        #             elem_classes="markdown-text")

            # with gr.Row():
            #     with gr.Column():
            #         model_name_textbox = gr.Textbox(label="Model name")
            #         revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
            #         model_type = gr.Dropdown(
            #             choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
            #             label="Model type",
            #             multiselect=False,
            #             value=None,
            #             interactive=True,
            #         )

            #     with gr.Column():
            #         precision = gr.Dropdown(
            #             choices=[i.value.name for i in Precision if i != Precision.Unknown],
            #             label="Precision",
            #             multiselect=False,
            #             value="float16",
            #             interactive=True,
            #         )
            #         weight_type = gr.Dropdown(
            #             choices=[i.value.name for i in WeightType],
            #             label="Weights type",
            #             multiselect=False,
            #             value="Original",
            #             interactive=True,
            #         )
            #         base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            # submit_button = gr.Button("Submit Eval")
            # submission_result = gr.Markdown()
            # submit_button.click(
            #     add_new_eval,
            #     [
            #         model_name_textbox,
            #         base_model_name_textbox,
            #         revision_name_textbox,
            #         precision,
            #         weight_type,
            #         model_type,
            #     ],
            #     submission_result,
            # )

    # Collapsed-by-default citation box with a copy button, placed below the tabs.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Start the background scheduler. The periodic restart job below is disabled,
# so no jobs are registered on it in this file.
scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

# Configure the request queue with default_concurrency_limit=40, then launch
# whatever `queue()` returns.
queued_demo = demo.queue(default_concurrency_limit=40)
queued_demo.launch()