import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT

# Hub token read from the WTOKEN environment variable (None when unset).
BALM_TOKEN = os.environ.get("WTOKEN", None)

OWNER = "gaia-benchmark"
DATA_DATASET = f"{OWNER}/GAIA"
INTERNAL_DATA_DATASET = f"{OWNER}/GAIA_internal"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
RESULTS_DATASET = f"{OWNER}/results"
LEADERBOARD_PATH = f"{OWNER}/leaderboard"
api = HfApi()

YEAR_VERSION = "2023"

# Local directory where scored submissions are written before upload.
os.makedirs("scored", exist_ok=True)

# Display the results
eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=BALM_TOKEN)
# The "mail" column is dropped so contact addresses never reach the UI tables.
eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))

# Gold answers, indexed as gold_results[split][task_id] -> dataset row.
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", use_auth_token=BALM_TOKEN)
gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}


def restart_space():
    """Restart the leaderboard Space through the Hub API."""
    api.restart_space(repo_id=LEADERBOARD_PATH, token=BALM_TOKEN)


COLS = ["Model", "Score ⬆️", "Organisation"]
TYPES = ["str", "number", "str",]


def _safe_ratio(total, count):
    """Return ``total / count``, or 0.0 when ``count`` is zero."""
    return total / count if count else 0.0


def _score_submission(file_path, gold):
    """Score every answer of a JSON-lines submission against the gold rows.

    Args:
        file_path: Path of the submitted ``.jsonl`` file. Each non-blank line
            must be a JSON object with ``task_id`` and ``model_answer`` keys.
        gold: Mapping ``task_id -> gold row`` for the chosen split; each row
            is read through its ``"Level"`` and ``"Final answer"`` fields.

    Returns:
        A tuple ``(rows, scores, num_questions)`` where ``rows`` is the list
        of per-question dicts to write to the scored file, and ``scores`` /
        ``num_questions`` are totals keyed by ``"all"`` and by level (1, 2, 3).

    Raises:
        ValueError: If a line is not valid JSON, lacks a required key, or
            refers to a ``task_id`` that is not in ``gold``.
    """
    rows = []
    scores = {"all": 0, 1: 0, 2: 0, 3: 0}
    num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}

    with open(file_path, "r", encoding="utf-8") as submitted:
        for line_number, line in enumerate(submitted, start=1):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            try:
                task = json.loads(line)
            except json.JSONDecodeError as err:
                raise ValueError(
                    f"Line {line_number} of the submitted file is not valid JSON."
                ) from err

            if not isinstance(task, dict) or "model_answer" not in task:
                raise ValueError("No model_answer key in the file provided")
            if "task_id" not in task:
                raise ValueError(f"No task_id key on line {line_number} of the file provided")

            task_id = task["task_id"]
            try:
                gold_row = gold[task_id]
            except (KeyError, TypeError) as err:
                # TypeError covers an unhashable task_id such as a list.
                raise ValueError(
                    f"Unknown task_id {task_id!r} on line {line_number} for the selected split."
                ) from err

            model_answer = task["model_answer"]
            level = int(gold_row["Level"])
            score = question_scorer(model_answer, gold_row["Final answer"])

            rows.append({
                "id": task_id,
                "model_answer": model_answer,
                "score": score,
                "level": level
            })
            scores["all"] += score
            scores[level] += score
            num_questions["all"] += 1
            num_questions[level] += 1

    return rows, scores, num_questions


def add_new_eval(
    val_or_test: str,
    model: str,
    path_to_file,
    organisation: str,
    mail: str,
):
    """Validate, archive, score and register a new leaderboard submission.

    Args:
        val_or_test: Split being submitted to; used as a key of
            ``eval_results`` and ``gold_results``.
        model: Model name to display.
        path_to_file: Uploaded file object exposing a ``.name`` path, or
            ``None`` when nothing was attached.
        organisation: Organisation name to display.
        mail: Contact email; stored with the entry.

    Returns:
        The result of ``format_warning`` when the submission is rejected, or
        of ``format_log`` when it has been scored and pushed to the Hub.
    """
    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")

    print("Adding new eval")
    # Reject only when this exact (model, organisation) pair already exists.
    # Checking each field on its own would also reject a new pair whose model
    # name and organisation name merely both appear somewhere in the table.
    split_results = eval_results[val_or_test]
    existing_pairs = {
        (str(known_model).lower(), str(known_org).lower())
        for known_model, known_org in zip(split_results["model"], split_results["organisation"])
    }
    if (model.lower(), organisation.lower()) in existing_pairs:
        return format_warning("This model has been already submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Save submitted file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=BALM_TOKEN
    )

    # Compute score
    file_path = path_to_file.name
    try:
        scored_rows, scores, num_questions = _score_submission(file_path, gold_results[val_or_test])
    except ValueError as err:
        return format_warning(str(err))

    if num_questions["all"] == 0:
        return format_warning("The submitted file does not contain any answer.")

    # Path separators in user-supplied names (e.g. "org/model") would make the
    # local path point outside, or into a missing sub-directory of, "scored/".
    local_stem = f"{organisation}_{model}".replace("/", "_").replace("\\", "_")
    scored_path = f"scored/{local_stem}.jsonl"
    with open(scored_path, "w", encoding="utf-8") as scored_file:
        scored_file.writelines(json.dumps(row) + "\n" for row in scored_rows)

    # Save scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=BALM_TOKEN
    )

    # Actual submission
    # A level with no submitted answer scores 0.0 instead of dividing by zero.
    eval_entry = {
        "model": model,
        "organisation": organisation,
        "mail": mail,
        "score": _safe_ratio(scores["all"], num_questions["all"]),
        "score_level1": _safe_ratio(scores[1], num_questions[1]),
        "score_level2": _safe_ratio(scores[2], num_questions[2]),
        "score_level3": _safe_ratio(scores[3], num_questions[3]),
    }
    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=BALM_TOKEN)

    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait for up to an hour to see the score displayed")


def refresh():
    """Re-download the results dataset and return fresh leaderboard frames.

    The names assigned here are local: the module-level ``eval_results`` and
    dataframes are not updated by this function.

    Returns:
        A tuple ``(validation_dataframe, test_dataframe)`` with the ``mail``
        column removed.
    """
    eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=BALM_TOKEN, download_mode="force_redownload")
    eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
    eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))
    return eval_dataframe_val, eval_dataframe_test


def upload_file(files):
    """Return the ``.name`` path of every file object in ``files``."""
    file_paths = [file.name for file in files]
    return file_paths


# NOTE(review): the nesting of the layout below was reconstructed from the
# statement order; confirm it against the deployed Space.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            ).style(show_copy_button=True)

    with gr.Tab("Results: Validation"):
        leaderboard_table_val = gr.components.Dataframe(
            value=eval_dataframe_val, headers=COLS, datatype=TYPES, interactive=False,
        )
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, headers=COLS, datatype=TYPES, interactive=False,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val,
            leaderboard_table_test,
        ],
    )
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                model_name_textbox = gr.Textbox(label="Model name")
                file_output = gr.File()
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email")

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                file_output,
                organisation,
                mail
            ],
            submission_result,
        )

# Restart the Space every hour.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch()