|
import os |
|
import logging |
|
import time |
|
import datetime |
|
import gradio as gr |
|
from threading import Thread |
|
import datasets |
|
from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard |
|
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns |
|
from apscheduler.schedulers.background import BackgroundScheduler |
|
|
|
|
|
from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci |
|
|
|
from src.display.about import ( |
|
CITATION_BUTTON_LABEL, |
|
CITATION_BUTTON_TEXT, |
|
|
|
TITLE, |
|
ABOUT_TEXT, |
|
SUBMISSION_TEXT_3, |
|
) |
|
from src.display.css_html_js import custom_css |
|
from src.display.utils import ( |
|
COLS, |
|
EVAL_COLS, |
|
EVAL_TYPES, |
|
AutoEvalColumn, |
|
fields, |
|
EvalQueueColumn |
|
) |
|
from src.envs import ( |
|
API, |
|
EVAL_REQUESTS_PATH, |
|
RESULT_REPO, |
|
DATA_VERSION, |
|
DATA_REPO, |
|
HARD_RESULT_REPO, |
|
ELO_REPO, |
|
HARD_ELO_REPO, |
|
SOLVE_REPO, |
|
HARD_SOLVE_REPO, |
|
HF_TOKEN, |
|
QUEUE_REPO, |
|
REPO_ID, |
|
VOTES_REPO, |
|
VOTES_PATH, |
|
HF_HOME, |
|
) |
|
from src.populate import get_evaluation_queue_df, get_leaderboard_df |
|
from src.execute import generate_command, is_running, default_command, stream_logs, find_result_file |
|
from src.tools.plots import plot_elo_mle, plot_solve_rate |
|
|
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") |
|
|
|
|
|
from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci |
|
|
|
|
|
|
|
# Module-level state shared by the data loaders, the UI and the webhook handler.

# NOTE(review): not read anywhere in the visible part of this file — confirm it is still needed.
DO_FULL_INIT = True
# When True, get_latest_data_leaderboard() reloads every dataset and then
# resets the flag; update_leaderboard() sets it again on dataset updates.
NEW_DATA_ON_LEADERBOARD = True
# Placeholders for the dataframes; they are filled in by
# get_latest_data_leaderboard() / init_space().
LEADERBOARD_DF = None
HARD_LEADERBOARD_DF = None
ELO_TASK_DF = None
ELO_BENCH_DF = None
HARD_ELO_TASK_DF = None
HARD_ELO_BENCH_DF = None
COMPLETE_SOLVE_DF = None
INSTRUCT_SOLVE_DF = None
HARD_COMPLETE_SOLVE_DF = None
HARD_INSTRUCT_SOLVE_DF = None

# Task dataset browsed by the "Data Viewer" tab; loaded once at import time.
DATA = datasets.load_dataset(DATA_REPO, "default", cache_dir=HF_HOME, split=DATA_VERSION,
                             verification_mode="no_checks")
|
|
|
|
|
def filter_data(data, keyword):
    """Return the items whose ``complete_prompt`` contains ``keyword``.

    The match is case-insensitive. When ``keyword`` is empty or ``None`` the
    input ``data`` object itself is returned unchanged (no copy is made);
    otherwise a new list of the matching items is returned.

    Args:
        data: Iterable of mappings that each expose a ``'complete_prompt'`` string.
        keyword: Substring to search for, or a falsy value to disable filtering.
    """
    if not keyword:
        return data
    # Lower-case the needle once instead of once per item.
    needle = keyword.lower()
    return [item for item in data if needle in item['complete_prompt'].lower()]
|
|
|
|
|
def update_display(search_keyword, index, show_test):
    """Build the values shown in the "Data Viewer" tab for one task.

    Filters ``DATA`` by ``search_keyword`` and picks the item at ``index``
    (clamped to the valid range of the filtered result).

    Args:
        search_keyword: Text to search for in the prompts; falsy means no filter.
        index: Requested position inside the filtered items.
        show_test: When true, the task's test code is included in the output.

    Returns:
        A list of exactly six values, one per output component the handler is
        registered with: task id, completion prompt, instruction prompt,
        test code (or ``""``), number of filtered items, and a slider update.
    """
    filtered_data = filter_data(DATA, search_keyword)

    if not filtered_data:
        # Message + three empty code fields + count + slider update = six
        # values. The previous ``[""] * 4`` produced seven, which shifted the
        # count and slider values onto the wrong components.
        return ["No data available. Check the search criteria."] + [""] * 3 + [0, gr.update(maximum=0, value=0)]

    max_index = len(filtered_data) - 1
    # Clamp so a stale slider position cannot index past the filtered result.
    index = min(max(0, index), max_index)

    selected = filtered_data[index]
    task_id = selected['task_id']
    snippet1 = selected['complete_prompt']
    snippet2 = selected['instruct_prompt']
    snippet4 = selected['test'] if show_test else ""

    return [
        task_id,
        snippet1,
        snippet2,
        snippet4,
        len(filtered_data),
        gr.update(maximum=max_index, value=index)
    ]
|
|
|
def restart_space():
    """Request a restart of the Space ``REPO_ID`` through the Hub API client."""
    API.restart_space(
        token=HF_TOKEN,
        repo_id=REPO_ID,
    )
|
|
|
|
|
def time_diff_wrapper(func):
    """Decorator that logs how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged. The
    elapsed wall-clock time is logged at INFO level after the call returns;
    nothing is logged when ``func`` raises.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same signature, name and docstring as ``func``.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    import functools

    @functools.wraps(func)  # preserve __name__/__doc__ of the wrapped callable
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        diff = time.time() - start_time
        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
        return result

    return wrapper
|
|
|
|
|
@time_diff_wrapper
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
    """Download a Hub repository snapshot, retrying with exponential backoff.

    Args:
        repo_id: Identifier of the repository to download.
        local_dir: Directory the snapshot is written to.
        repo_type: Repository type forwarded to ``snapshot_download``.
        max_attempts: Maximum number of download attempts.
        backoff_factor: Base of the exponential wait; the wait before retry
            number ``k`` (0-based) is ``backoff_factor ** k`` seconds.

    Raises:
        Exception: When every attempt failed. The last underlying error is
            attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            logging.info(f"Downloading {repo_id} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type=repo_type,
                tqdm_class=None,
                etag_timeout=30,
                max_workers=8,
            )
            logging.info("Download successful")
            return
        except Exception as e:
            last_error = e
            if attempt + 1 >= max_attempts:
                # No retry follows the final attempt, so do not announce one
                # and do not sleep before failing.
                logging.error(f"Error downloading {repo_id}: {e}, giving up")
                break
            wait_time = backoff_factor**attempt
            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
            time.sleep(wait_time)
    # Chain the last failure so the root cause is visible in the traceback.
    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts") from last_error
|
|
|
def _load_cached_split(repo_id, split):
    """Load one split of a Hub dataset's "default" config, reusing the local cache."""
    return datasets.load_dataset(
        repo_id,
        "default",
        split=split,
        cache_dir=HF_HOME,
        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
        verification_mode="no_checks",
    )


def get_latest_data_leaderboard(
    leaderboard_initial_df=None,
    hard_leaderboard_initial_df=None,
    elo_task_df=None,
    elo_bench_df=None,
    hard_elo_task_df=None,
    hard_elo_bench_df=None,
    complete_solve_df=None,
    instruct_solve_df=None,
    hard_complete_solve_df=None,
    hard_instruct_solve_df=None,
):
    """Return the ten dataframes backing the UI, reloading them when flagged.

    When ``NEW_DATA_ON_LEADERBOARD`` is set, every dataset is loaded again
    (from the local cache when available), the module-level dataframes are
    replaced, and the flag is cleared. Otherwise some of the module-level
    dataframes are overwritten with the values passed in and the rest are
    left as they are.

    Args:
        leaderboard_initial_df .. hard_instruct_solve_df: Current values of
            the corresponding dataframes as supplied by the caller (the page
            load handler passes the UI components' values). Only consulted
            when no reload is pending.

    Returns:
        Tuple ``(LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF,
        ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)``.
    """
    global NEW_DATA_ON_LEADERBOARD
    global LEADERBOARD_DF
    global HARD_LEADERBOARD_DF
    global ELO_TASK_DF
    global ELO_BENCH_DF
    global HARD_ELO_TASK_DF
    global HARD_ELO_BENCH_DF
    global COMPLETE_SOLVE_DF
    global INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF
    global HARD_INSTRUCT_SOLVE_DF

    if NEW_DATA_ON_LEADERBOARD:
        print("Leaderboard updated at reload!")

        # Result tables go through get_leaderboard_df to build the displayed columns.
        LEADERBOARD_DF = get_leaderboard_df(
            leaderboard_dataset=_load_cached_split(RESULT_REPO, "train"),
            cols=COLS,
        )
        HARD_LEADERBOARD_DF = get_leaderboard_df(
            leaderboard_dataset=_load_cached_split(HARD_RESULT_REPO, "train"),
            cols=COLS,
        )

        # Each pair is loaded completely before its globals are rebound, so a
        # failing second load leaves the pair's previous values in place.
        ELO_TASK_DF, ELO_BENCH_DF = (
            _load_cached_split(ELO_REPO, "task_no_tie").to_pandas(),
            _load_cached_split(ELO_REPO, "benchmark_tie").to_pandas(),
        )
        HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF = (
            _load_cached_split(HARD_ELO_REPO, "task_no_tie").to_pandas(),
            _load_cached_split(HARD_ELO_REPO, "benchmark_tie").to_pandas(),
        )
        COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF = (
            _load_cached_split(SOLVE_REPO, "complete").to_pandas(),
            _load_cached_split(SOLVE_REPO, "instruct").to_pandas(),
        )
        HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF = (
            _load_cached_split(HARD_SOLVE_REPO, "complete").to_pandas(),
            _load_cached_split(HARD_SOLVE_REPO, "instruct").to_pandas(),
        )

        NEW_DATA_ON_LEADERBOARD = False
    else:
        # NOTE(review): only five of the ten globals are refreshed from the
        # arguments here; the other five keep their previous values. This
        # mirrors the existing behaviour — confirm whether it is intentional.
        LEADERBOARD_DF = leaderboard_initial_df
        ELO_TASK_DF = elo_task_df
        HARD_ELO_BENCH_DF = hard_elo_bench_df
        COMPLETE_SOLVE_DF = complete_solve_df
        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df

    return (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    )
|
|
|
|
|
|
|
def init_space():
    """Fill the module-level dataframes and return them as a 10-tuple.

    Calls ``get_latest_data_leaderboard()`` without arguments, binds each
    element of its result to the matching module-level name, and returns the
    same ten values in the same order.
    """
    global LEADERBOARD_DF, HARD_LEADERBOARD_DF
    global ELO_TASK_DF, ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF
    global COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF

    (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    ) = get_latest_data_leaderboard()

    return (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load every dataframe once at import time; the UI below is built from them.
(
    LEADERBOARD_DF,
    HARD_LEADERBOARD_DF,
    ELO_TASK_DF,
    ELO_BENCH_DF,
    HARD_ELO_TASK_DF,
    HARD_ELO_BENCH_DF,
    COMPLETE_SOLVE_DF,
    INSTRUCT_SOLVE_DF,
    HARD_COMPLETE_SOLVE_DF,
    HARD_INSTRUCT_SOLVE_DF,
) = init_space()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_login(profile: gr.OAuthProfile | None) -> bool:
    """Return True when an OAuth profile was supplied, False when it is None."""
    return profile is not None
|
|
|
def init_leaderboard(dataframe):
    """Create the ``Leaderboard`` component for ``dataframe``.

    Column types, default selection, hidden columns and filters are derived
    from the fields of ``AutoEvalColumn``.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    usable = dataframe is not None and not dataframe.empty
    if not usable:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    columns = list(fields(AutoEvalColumn))

    column_picker = SelectColumns(
        default_selection=[col.name for col in columns if col.displayed_by_default],
        cant_deselect=[col.name for col in columns if col.never_hidden or col.dummy],
        label="Select Columns to Display:",
    )
    filters = [
        ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
        ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
        ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
        ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
    ]

    return Leaderboard(
        value=dataframe,
        datatype=[col.type for col in columns],
        select_columns=column_picker,
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=[col.name for col in columns if col.hidden],
        filter_columns=filters,
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
|
|
|
|
|
def init_others(dataframe):
    """Wrap ``dataframe`` in a hidden ``gr.Dataframe`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    usable = dataframe is not None and not dataframe.empty
    if not usable:
        raise ValueError("Gradio DataFrame is empty or None.")
    return gr.Dataframe(dataframe, visible=False)
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: header, Hard/Full leaderboards, About, Data Viewer, Request and
# Execute tabs, followed by the citation box and the page-load refresh hook.
# ---------------------------------------------------------------------------
main_block = gr.Blocks(css=custom_css)
with main_block as demo:
    with gr.Row(elem_id="header-row"):
        gr.HTML(TITLE + "<p>Total models: " + str(len(HARD_LEADERBOARD_DF))+ "</p>")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.Tab("💎 Hard Set") as hard_tabs:
            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
                hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
                gr.Markdown(
                    """
                    **Notes:**
                    - For the efficiency reasons, we only display the Hard Set leaderboard.
                    - _Hard Set_ vs _Full Set_:
                        - <u>Hard Set</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
                        - <u>Full Set</u>: The full set of 1140 BigCodeBench tasks.
                    - _Complete_ vs _Instruct_:
                        - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
                        - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
                    - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
                    - `Average` is the average of `Complete` and `Instruct` when both are available.
                    - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the Complete + Instruct splits. The rating starts from 1000 and is bootstrapped 500 times. We only consider the models having both `Complete` and `Instruct` scores.
                    - `#Act Params (B)` is the number of activated model parameters during inference.
                    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
                    - For more details check the 📝 About section.
                    """,
                    elem_classes="markdown-text",
                )

            with gr.TabItem("📊 Elo Rating", id="hard_elo"):
                with gr.Column():
                    with gr.Group():
                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
                        hard_task_elo_map = gr.Plot()
                        # Hidden dataframe component feeds the plot on page load.
                        hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
                        demo.load(plot_elo_mle, [hard_elo_task_gr],
                                  hard_task_elo_map)
                    with gr.Group():
                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
                        hard_bench_elo_map = gr.Plot()
                        hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
                        demo.load(plot_elo_mle, [hard_elo_bench_gr],
                                  hard_bench_elo_map)

            with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
                with gr.Column():
                    hard_complete_map = gr.Plot()
                    hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
                    # Hidden Textbox/Number components supply constant arguments.
                    demo.load(plot_solve_rate, [hard_complete_solve_gr,
                                                gr.Textbox("Complete", visible=False),
                                                gr.Number(10, visible=False),
                                                gr.Number(16, visible=False),
                                                ], hard_complete_map)
                    hard_instruct_map = gr.Plot()
                    hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
                    demo.load(plot_solve_rate, [hard_instruct_solve_gr,
                                                gr.Textbox("Instruct", visible=False),
                                                gr.Number(10, visible=False),
                                                gr.Number(16, visible=False),
                                                ], hard_instruct_map)

        with gr.Tab("🎯 Full Set") as full_tabs:
            with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
                leaderboard = init_leaderboard(LEADERBOARD_DF)
                gr.Markdown(
                    """
                    **Notes:**
                    - _Complete_ vs _Instruct_:
                        - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
                        - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
                    - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
                    - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
                    - `size` is the amount of activated model weight during inference.
                    - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
                    - For more details check the 📝 About section.
                    """,
                    elem_classes="markdown-text",
                )

            with gr.TabItem("📊 Elo Rating", id="full_elo"):
                with gr.Column():
                    with gr.Group():
                        gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
                        task_elo_map = gr.Plot()
                        elo_task_gr = init_others(ELO_TASK_DF)
                        demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
                    with gr.Group():
                        gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
                        bench_elo_map = gr.Plot()
                        elo_bench_gr = init_others(ELO_BENCH_DF)
                        demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)

            with gr.TabItem("🧩 Solve Rate", id="full_solve"):
                with gr.Column():
                    complete_map = gr.Plot()
                    complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
                    demo.load(plot_solve_rate, [complete_solve_gr,
                                                gr.Textbox("Complete", visible=False),
                                                ], complete_map)
                    instruct_map = gr.Plot()
                    instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
                    demo.load(plot_solve_rate, [instruct_solve_gr,
                                                gr.Textbox("Instruct", visible=False),
                                                ], instruct_map)

        with gr.TabItem("📝 About", id=3):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🔎 Data Viewer", id="viewer"):
            search_input = gr.Textbox(label="Search by keyword")
            count_output = gr.Number(label="Number of filtered items")
            index_slider = gr.Slider(minimum=0, maximum=len(DATA)-1, step=1, label="Select Index")

            show_test = gr.Checkbox(label="Show Test Cases")
            update_button = gr.Button("Update")

            task_id_output = gr.Textbox(label="Task ID")
            code_completion = gr.Code(language="python", label="Code Completion")
            nl_instruction = gr.Code(language="markdown", label="Natural Language Instruction")

            test_cases = gr.Code(language="python", label="Test Cases")

            # update_display must return one value per component listed here.
            update_button.click(
                update_display,
                inputs=[search_input, index_slider, show_test],
                outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
            )

            # Populate the viewer once when the page is loaded.
            demo.load(
                update_display,
                inputs=[search_input, index_slider, show_test],
                outputs=[task_id_output, code_completion, nl_instruction, test_cases, count_output, index_slider]
            )

        with gr.TabItem("🚀 Request", id=4):
            gr.Markdown(SUBMISSION_TEXT_3)

        with gr.TabItem("🛠️ Execute", id=5):
            gr.Markdown("# BigCodeBench Evaluator")

            with gr.Row():
                jsonl_file = gr.File(label="Upload JSONL file", file_types=[".jsonl"])
                split = gr.Dropdown(choices=["complete", "instruct"], label="Split", value="complete")
                subset = gr.Dropdown(choices=["hard", "full"], label="Subset", value="hard")

            with gr.Row():
                parallel = gr.Number(label="Parallel (optional)", precision=0)
                min_time_limit = gr.Number(label="Min Time Limit", value=1, precision=1)
                max_as_limit = gr.Number(label="Max AS Limit", value=25*1024, precision=0)

            with gr.Row():
                max_data_limit = gr.Number(label="Max Data Limit", value=25*1024, precision=0)
                max_stack_limit = gr.Number(label="Max Stack Limit", value=10, precision=0)
                check_gt_only = gr.Checkbox(label="Check GT Only")
                no_gt = gr.Checkbox(label="No GT")

            command_output = gr.Textbox(label="Command", value=default_command, interactive=False)
            with gr.Row():
                submit_btn = gr.Button("Run Evaluation")
                download_btn = gr.DownloadButton(label="Download Result")
            log_output = gr.Textbox(label="Execution Logs", lines=20)

            input_components = [
                jsonl_file, split, subset, parallel,
                min_time_limit, max_as_limit, max_data_limit, max_stack_limit,
                check_gt_only, no_gt
            ]

            # Regenerate the displayed command whenever any option changes.
            for component in input_components:
                component.change(generate_command, inputs=input_components, outputs=command_output)

            def start_evaluation(command, jsonl_file, subset, split):
                """Run the evaluation command and stream its logs to the UI.

                This is a generator event handler: every yielded pair is
                ``(log_output update, download_btn update)``, matching the two
                output components it is registered with below.
                """
                extra = subset + "_" if subset != "full" else ""
                if jsonl_file is not None:
                    result_path = os.path.basename(jsonl_file.name).replace(".jsonl", f"_{extra}eval_results.json")
                else:
                    result_path = None

                for log in stream_logs(command, jsonl_file):
                    if jsonl_file is not None:
                        yield log, gr.update(value=result_path, label=result_path)
                    else:
                        yield log, gr.update()

                # A ``return value`` inside a generator is never delivered to
                # the UI, so the final state has to be yielded as well.
                result_file = find_result_file()
                if result_file:
                    yield gr.update(label="Evaluation completed. Result file found."), gr.update(value=result_file)
                else:
                    yield gr.update(label="Evaluation completed. No result file found."), gr.update(value=result_path)

            submit_btn.click(start_evaluation,
                             inputs=[command_output, jsonl_file, subset, split],
                             outputs=[log_output, download_btn])

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

    # On every page load, refresh the components from the latest dataframes;
    # inputs and outputs are listed in the order of the function's parameters
    # and return tuple.
    main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])

main_block.queue(default_concurrency_limit=100)
|
|
|
|
|
def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
    """Return a webhook server for ``ui``, enabling Space CI when applicable.

    Space CI is skipped when the app is not running in a Space
    (``SPACE_ID is None``) or runs in an ephemeral Space; in both cases a
    plain ``WebhooksServer`` wrapping ``ui`` is returned. Otherwise the
    ``space_ci`` section of the Space's repo card is read and passed to
    ``configure_space_ci``.

    Args:
        ui: The Gradio Blocks app the server should serve.

    Returns:
        The ``WebhooksServer`` to register webhooks on and launch.
    """
    if SPACE_ID is None:
        print("Not in a Space: Space CI disabled.")
        # Use the argument rather than the module-level ``main_block`` so the
        # function honours whatever UI the caller passes in.
        return WebhooksServer(ui=ui)

    if IS_EPHEMERAL_SPACE:
        print("In an ephemeral Space: Space CI disabled.")
        return WebhooksServer(ui=ui)

    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
    config = card.data.get("space_ci", {})
    print(f"Enabling Space CI with config from README: {config}")

    return configure_space_ci(
        blocks=ui,
        trusted_authors=config.get("trusted_authors"),
        private=config.get("private", "auto"),
        variables=config.get("variables", "auto"),
        secrets=config.get("secrets"),
        hardware=config.get("hardware"),
        storage=config.get("storage"),
    )
|
|
|
|
|
# Server that wraps the UI; webhooks are registered on it before it is launched.
webhooks_server = enable_space_ci_and_return_server(ui=main_block)
|
|
|
|
|
@webhooks_server.add_webhook
def update_leaderboard(payload: WebhookPayload) -> None:
    """Force a re-download of all result datasets after a dataset update.

    Only payloads for a repo of type ``"dataset"`` with event action
    ``"update"`` are acted on. If ``NEW_DATA_ON_LEADERBOARD`` is already set
    the call returns without downloading; otherwise the flag is set and each
    result, Elo and solve-rate repo is loaded with ``FORCE_REDOWNLOAD``.
    """
    global NEW_DATA_ON_LEADERBOARD

    is_dataset_update = payload.repo.type == "dataset" and payload.event.action == "update"
    if not is_dataset_update:
        return
    if NEW_DATA_ON_LEADERBOARD:
        # A refresh is already pending; nothing more to do.
        return

    NEW_DATA_ON_LEADERBOARD = True

    repos_to_refresh = [RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO]
    for repo_id in repos_to_refresh:
        datasets.load_dataset(
            repo_id,
            "default",
            cache_dir=HF_HOME,
            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
            verification_mode="no_checks",
        )
|
|
|
|
|
|
|
# Start the periodic restart job BEFORE launching the server:
# WebhooksServer.launch() blocks the calling thread by default, so any
# statement placed after it would not run while the app is serving.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", hours=3)
scheduler.start()

webhooks_server.launch()