Ori committed on
Commit c1ec713 • 1 Parent(s): 4a825f1

Update app.py

Files changed (1)
  1. app.py +38 -13
app.py CHANGED
@@ -3,7 +3,6 @@ import json
 import datetime
 from email.utils import parseaddr
 
-
 import gradio as gr
 import pandas as pd
 from datasets import load_dataset
@@ -11,7 +10,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi
 from content import format_error, format_warning, format_log, TITLE
 
-
 # Placeholder for the question_scorer function
 def question_scorer(prediction, gold_answer):
     return 1 if prediction == gold_answer else 0
@@ -34,7 +32,9 @@ os.makedirs("scored", exist_ok=True)
 eval_results = load_dataset(RESULTS_DATASET, token=TOKEN, download_mode="force_redownload",
                             ignore_verifications=True, trust_remote_code=True)
 gold_results = load_dataset(DATA_DATASET, token=TOKEN, trust_remote_code=True)
+
 gold_answers = {split: {row["id"]: row["answer"] for row in gold_results[split]} for split in ["test"]}
+gold_difficulties = {split: {row["id"]: row["difficulty"] for row in gold_results[split]} for split in ["test"]}
 
 
 # Function to get dataframe from results
@@ -46,8 +46,18 @@ def get_dataframe_from_results(eval_results, split):
     df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
     return df
 
+# Helper to format the leaderboard dataframe for display
+def format_dataframe(df):
+    df["Accuracy"] = df["Accuracy"].apply(lambda x: f"**{x:.2f}**")
+    if "URL" in df.columns:
+        df["Model Name"] = df.apply(lambda row: f"[{row['Model Name']}]({row['URL']})", axis=1)
+        df = df.drop(columns=["URL"])
+    df = df.rename(columns={"Model Family": "Base Model"})
+    df = df[["Model Name", "Accuracy", "Accuracy (easy)", "Accuracy (medium)", "Accuracy (hard)", "Answer rate", "Precision", "EM", "Base Model", "Organization"]]
+    return df
 
 eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+eval_dataframe_test = format_dataframe(eval_dataframe_test)
 
 
 # Function to restart the space
@@ -55,7 +65,7 @@ def restart_space():
     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
 
-TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]
+TYPES = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "str", "str"]
 
 
 # Function to add a new evaluation
@@ -92,6 +102,10 @@ def add_new_eval(
     file_path = path_to_file.name
     scores = 0
     num_questions = 0
+
+    difficulty_scores = {"Easy": 0, "Medium": 0, "Hard": 0}
+    difficulty_counts = {"Easy": 0, "Medium": 0, "Hard": 0}
+
     with open(f"scored/{organization}_{model_name}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             for ix, line in enumerate(f):
@@ -111,6 +125,8 @@ def add_new_eval(
                         f"{task_id} not found in test set. Are you sure you submitted the correct file?")
 
                 score = question_scorer(task['answer'], gold_answers["test"][task_id])
+                difficulty = gold_difficulties["test"][task_id]
+
                 scored_file.write(
                     json.dumps({
                         "id": task_id,
@@ -118,8 +134,15 @@ def add_new_eval(
                         "score": score
                     }) + "\n"
                 )
+
                 scores += score
                 num_questions += 1
+                difficulty_scores[difficulty] += score
+                difficulty_counts[difficulty] += 1
+
+    accuracy_easy = difficulty_scores["Easy"] / difficulty_counts["Easy"] if difficulty_counts["Easy"] > 0 else 0
+    accuracy_medium = difficulty_scores["Medium"] / difficulty_counts["Medium"] if difficulty_counts["Medium"] > 0 else 0
+    accuracy_hard = difficulty_scores["Hard"] / difficulty_counts["Hard"] if difficulty_counts["Hard"] > 0 else 0
 
     api.upload_file(
         repo_id=SUBMISSION_DATASET,
@@ -131,14 +154,16 @@ def add_new_eval(
 
     eval_entry = {
         "Model Name": model_name,
-        "Model Family": model_family,
+        "Base Model": model_family,
         "URL": url,
         "Organization": organization,
         "Accuracy": scores / num_questions if num_questions > 0 else 0,
+        "Accuracy (easy)": accuracy_easy,
+        "Accuracy (medium)": accuracy_medium,
+        "Accuracy (hard)": accuracy_hard,
         "Answer rate": scores / num_questions if num_questions > 0 else 0,
         "Precision": scores / num_questions if num_questions > 0 else 0,
-        "EM": scores if num_questions > 0 else 0,
-        "Cost": 0,  # Placeholder for cost, update with actual value if needed
+        "EM": scores if num_questions > 0 else 0
     }
     eval_results["test"] = eval_results["test"].add_item(eval_entry)
     eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
@@ -152,6 +177,7 @@ def refresh():
     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload",
                                 ignore_verifications=True, trust_remote_code=True)
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+    eval_dataframe_test = format_dataframe(eval_dataframe_test)
     return eval_dataframe_test
 
 
@@ -185,17 +211,16 @@ with demo:
     with gr.Accordion("Submit a new model for evaluation"):
         with gr.Row():
             gr.Markdown("""
-            To make a new submission, upload a predictions file. We support JSONL files with the following format:
+            To make a new submission, upload a predictions file. Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py). We support JSONL files with the following format:
             ```
             {"id": "task_id_1", "answer": "Answer 1 from your model"}
            {"id": "task_id_2", "answer": "Answer 2 from your model"}
             ```
-            Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py).
             """)
         with gr.Row():
             with gr.Column():
                 model_name_textbox = gr.Textbox(label="Model Name")
-                model_family_textbox = gr.Textbox(label="Model Family")
+                model_family_textbox = gr.Textbox(label="Base Model")
                 url_textbox = gr.Textbox(label="URL to Model Information")
             with gr.Column():
                 organization = gr.Textbox(label="Organization")
@@ -220,11 +245,11 @@ with demo:
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
-            citation_text = """@article{yoran-etal-2023-assistantbench,
+            citation_text = """@article{yoran-etal-2024-assistantbench,
             title={AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?},
             author={Ori Yoran and Samuel Amouyal and Chaitanya Malaviya and Ben Bogin and Ofir Press and Jonathan Berant},
             year={2024},
-            eprint={TODO},
+            eprint={?},
             archivePrefix={arXiv},
             primaryClass={cs.CL}
             }"""
@@ -237,9 +262,9 @@ with demo:
     )
 
     gr.HTML(
-        "<p>We would like to thank the GAIA team on which this leaderboard is based on their template and HuggingFace for hosting the leaderboard.</p>")
+        "<p>We would like to thank the GAIA team for sharing the source code for their leaderboard which we used as a template and HuggingFace for hosting the leaderboard.</p>")
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
-demo.launch(debug=True)
+demo.launch(debug=True)
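
The substance of this commit is per-difficulty accuracy tracking (the new `gold_difficulties` map plus the Easy/Medium/Hard counters) and a `format_dataframe` helper that prepares the leaderboard for display. Below is a minimal, self-contained sketch of that logic on toy data; the task ids, answers, difficulty labels, and model metadata are invented for illustration, and the formatting steps are restated here rather than imported from app.py.

```python
# Illustrative sketch only: toy data, not the real AssistantBench gold set.
import pandas as pd

# Hypothetical gold answers and difficulty labels keyed by task id.
gold_answers = {"t1": "42", "t2": "Paris", "t3": "blue"}
gold_difficulties = {"t1": "Easy", "t2": "Medium", "t3": "Hard"}

# Hypothetical model predictions, mirroring the JSONL submission format.
predictions = [
    {"id": "t1", "answer": "42"},
    {"id": "t2", "answer": "London"},
    {"id": "t3", "answer": "blue"},
]

def question_scorer(prediction, gold_answer):
    # Same placeholder exact-match scorer as in app.py.
    return 1 if prediction == gold_answer else 0

# Per-difficulty aggregation, following the counters added in this commit.
scores, num_questions = 0, 0
difficulty_scores = {"Easy": 0, "Medium": 0, "Hard": 0}
difficulty_counts = {"Easy": 0, "Medium": 0, "Hard": 0}
for task in predictions:
    score = question_scorer(task["answer"], gold_answers[task["id"]])
    difficulty = gold_difficulties[task["id"]]
    scores += score
    num_questions += 1
    difficulty_scores[difficulty] += score
    difficulty_counts[difficulty] += 1

accuracy = scores / num_questions if num_questions > 0 else 0
per_difficulty = {
    d: difficulty_scores[d] / difficulty_counts[d] if difficulty_counts[d] > 0 else 0
    for d in ("Easy", "Medium", "Hard")
}
print(accuracy, per_difficulty)  # 0.666..., {'Easy': 1.0, 'Medium': 0.0, 'Hard': 1.0}

# Display formatting along the lines of the new format_dataframe helper:
# bold the accuracy, turn the model name into a markdown link, drop the URL column.
df = pd.DataFrame([{
    "Model Name": "ToyModel", "URL": "https://example.com", "Model Family": "ToyBase",
    "Accuracy": accuracy * 100, "Accuracy (easy)": per_difficulty["Easy"] * 100,
    "Accuracy (medium)": per_difficulty["Medium"] * 100, "Accuracy (hard)": per_difficulty["Hard"] * 100,
    "Answer rate": accuracy * 100, "Precision": accuracy * 100, "EM": scores, "Organization": "ToyOrg",
}])
df["Accuracy"] = df["Accuracy"].apply(lambda x: f"**{x:.2f}**")
if "URL" in df.columns:
    df["Model Name"] = df.apply(lambda row: f"[{row['Model Name']}]({row['URL']})", axis=1)
    df = df.drop(columns=["URL"])
df = df.rename(columns={"Model Family": "Base Model"})
print(df.to_string(index=False))
```

Bolding and linkifying are done as markdown strings because the updated TYPES list now declares the first two leaderboard columns ("Model Name" and "Accuracy") as "markdown".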