import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

from scorer import question_scorer
from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CHANGELOG_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT

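# BALM_TOKEN is assumed to be a Hugging Face token with read/write access to the private
# BALM datasets; it is read from the environment (e.g. a Space secret).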
BALM_TOKEN = os.environ.get("BALM_TOKEN", None)

OWNER = "balm"
SUBMISSION_DATASET = f"{OWNER}/submissions"

SPLIT = "validation"
api = HfApi()

os.makedirs("scored", exist_ok=True)

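# Current leaderboard entries, one results dataset per difficulty level.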
eval_results = {}
for level in range(1, 4):
    eval_results[level] = load_dataset(f"{OWNER}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split=SPLIT)

eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))

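# Gold answers keyed by task_id, used to score incoming submissions.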
gold_results = {}
for level in range(1, 4):
    level_dataset = load_dataset(f"{OWNER}/BALM", f"2023_level{level}", split=SPLIT, token=BALM_TOKEN)
    gold_results[level] = {row["task_id"]: row["ground_truth"] for row in level_dataset}


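# Restarting the Space re-runs this module, which reloads the datasets above; the scheduler
# at the bottom of the file calls this hourly.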
def restart_space():
    api.restart_space(repo_id=f"{OWNER}/BALM_Leaderboard", token=BALM_TOKEN)


COLS = ["Model", "Score ⬆️", "Organisation"]
TYPES = ["str", "number", "str"]


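# Submissions are expected as JSONL, one record per task. An illustrative line (field values
# are hypothetical) looks like:
#   {"task_id": "task_001", "model_answer": "42"}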
def add_new_eval(
    level_of_dev: str,
    model: str,
    path_to_file,
    organisation: str,
    mail: str,
):
    level = int(level_of_dev.split(" ")[-1])

    # Basic sanity check on the contact email.
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    print("Adding new eval")

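    # Refuse duplicate submissions of the same model/organisation pair.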
    if model.lower() in set(m.lower() for m in eval_results[level]["model"]) and organisation.lower() in set(o.lower() for o in eval_results[level]["organisation"]):
        return format_warning("This model has already been submitted.")

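    # Keep a copy of the raw submission file in the submissions dataset.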
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organisation}/{model}/level{level}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=BALM_TOKEN,
    )

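    # Score every line of the submitted JSONL file against the gold answers.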
    file_path = path_to_file.name
    total_score = 0
    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
        with open(file_path, "r") as f:
            for line in f:
                task = json.loads(line)

                if "model_answer" not in task:
                    raise Exception("No model_answer key in the file provided")
                answer = task["model_answer"]
                task_id = task["task_id"]

                score = question_scorer(task["model_answer"], gold_results[level][task_id])

                scored_file.write(
                    json.dumps({
                        "id": task_id,
                        "model_answer": answer,
                        "score": score,
                    }) + "\n"
                )

                total_score += score

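    # Upload the scored copy next to the raw submission.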
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
        path_in_repo=f"{organisation}/{model}/level{level}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=BALM_TOKEN,
    )

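    # Append the new entry to the in-memory results and push the updated split back to the Hub.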
    eval_entry = {
        "model": model,
        "score": total_score,
        "organisation": organisation,
        "mail": mail,
    }
    eval_results[level] = eval_results[level].add_item(eval_entry)

    eval_results[level].push_to_hub(f"{OWNER}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split=SPLIT)

    return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait up to an hour for the score to be displayed.")


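# Reload the leaderboard tables from the Hub; wired to the "Refresh" button below.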
def refresh():
    eval_results = {}
    for level in range(1, 4):
        eval_results[level] = load_dataset(f"{OWNER}/BALM_ResultsLevel{level}", token=BALM_TOKEN, split=SPLIT)
    eval_dataframe_1 = pd.DataFrame(eval_results[1].remove_columns("mail"))
    eval_dataframe_2 = pd.DataFrame(eval_results[2].remove_columns("mail"))
    eval_dataframe_3 = pd.DataFrame(eval_results[3].remove_columns("mail"))
    return eval_dataframe_1, eval_dataframe_2, eval_dataframe_3


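# Helper that returns the local paths of uploaded files (not wired to a component below).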
def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


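# Gradio UI: intro text, citation/changelog accordions, one results tab per level,
# a refresh button, and the submission form.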
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Column():
            with gr.Accordion("📙 Citation", open=False):
                citation_button = gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id="citation-button",
                ).style(show_copy_button=True)
        with gr.Column():
            with gr.Accordion("✨ CHANGELOG", open=False):
                changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")

    with gr.Tab("Results: Level 1"):
        leaderboard_table_1 = gr.components.Dataframe(
            value=eval_dataframe_1, headers=COLS, datatype=TYPES, interactive=False,
        )
    with gr.Tab("Results: Level 2"):
        leaderboard_table_2 = gr.components.Dataframe(
            value=eval_dataframe_2, headers=COLS, datatype=TYPES, interactive=False,
        )
    with gr.Tab("Results: Level 3"):
        leaderboard_table_3 = gr.components.Dataframe(
            value=eval_dataframe_3, headers=COLS, datatype=TYPES, interactive=False,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_1,
            leaderboard_table_2,
            leaderboard_table_3,
        ],
    )

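    # Submission form: a predictions file plus model metadata, scored on submit by add_new_eval.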
with gr.Accordion("Submit a new model for evaluation"): |
|
with gr.Row(): |
|
with gr.Column(): |
|
level_of_test = gr.Radio(["Level 1", "Level 2", "Level 3"], value="Level 1", label="{split} set level") |
|
model_name_textbox = gr.Textbox(label="Model name") |
|
file_output = gr.File() |
|
with gr.Column(): |
|
organisation = gr.Textbox(label="Organisation") |
|
mail = gr.Textbox(label="Contact email") |
|
|
|
submit_button = gr.Button("Submit Eval") |
|
submission_result = gr.Markdown() |
|
submit_button.click( |
|
add_new_eval, |
|
[ |
|
level_of_test, |
|
model_name_textbox, |
|
file_output, |
|
organisation, |
|
mail |
|
], |
|
submission_result, |
|
) |
|
|
|
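# Restart the Space every hour so the leaderboard data loaded at startup is refreshed.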
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch()
|