Clémentine committed
Commit 5f9d165
1 Parent(s): d0c2655

small visu fixes

Files changed (2)
  1. app.py +40 -17
  2. content.py +16 -7
app.py CHANGED
@@ -13,7 +13,7 @@ from huggingface_hub import HfApi
 
 # InfoStrings
 from scorer import question_scorer
- from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
+ from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
 
 TOKEN = os.environ.get("TOKEN", None)
 
@@ -21,7 +21,7 @@ OWNER="gaia-benchmark"
 DATA_DATASET = f"{OWNER}/GAIA"
 INTERNAL_DATA_DATASET = f"{OWNER}/GAIA_internal"
 SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
- RESULTS_DATASET = f"{OWNER}/results"
+ RESULTS_DATASET = f"{OWNER}/results_public"
 LEADERBOARD_PATH = f"{OWNER}/leaderboard"
 api = HfApi()
 
@@ -30,27 +30,40 @@ YEAR_VERSION = "2023"
 os.makedirs("scored", exist_ok=True)
 
 # Display the results
- eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=TOKEN)
- eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
- eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))
+ eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload")
+ def get_dataframe_from_results(eval_results, split):
+     local_df = eval_results[split]
+     local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
+     local_df = local_df.remove_columns(["mail", "system_prompt", "url"])
+     local_df = local_df.rename_column("model", "Model name")
+     local_df = local_df.rename_column("model_family", "Model family")
+     local_df = local_df.rename_column("score", "Average score (%)")
+     for i in [1, 2, 3]:
+         local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
+     df = pd.DataFrame(local_df)
+     return df
+
+ eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
+ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
 
 # Gold answers
 gold_results = {}
- gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", use_auth_token=TOKEN)
+ gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", token=TOKEN)
 gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}
 
 
 def restart_space():
     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
-
- COLS = ["Model", "Score ⬆️", "Organisation"]
- TYPES = ["str", "number", "str",]
+ TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]
 
 def add_new_eval(
     val_or_test: str,
     model: str,
-     path_to_file,
+     model_family: str,
+     system_prompt: str,
+     url: str,
+     path_to_file: str,
     organisation: str,
     mail: str,
 ):
@@ -120,6 +133,9 @@ def add_new_eval(
     # Actual submission
     eval_entry = {
         "model": model,
+         "model_family": model_family,
+         "system_prompt": system_prompt,
+         "url": url,
         "organisation": organisation,
         "mail": mail,
         "score": scores["all"]/num_questions["all"],
@@ -131,13 +147,13 @@
     print(eval_results)
     eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
 
-     return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait for up to an hour to see the score displayed")
+     return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
 
 
 def refresh():
-     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=TOKEN, download_mode="force_redownload")
-     eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
-     eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))
+     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload")
+     eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
+     eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
     return eval_dataframe_val, eval_dataframe_test
 
 def upload_file(files):
@@ -160,11 +176,11 @@ with demo:
 
     with gr.Tab("Results: Validation"):
         leaderboard_table_val = gr.components.Dataframe(
-             value=eval_dataframe_val, headers=COLS, datatype=TYPES, interactive=False,
+             value=eval_dataframe_val, datatype=TYPES, interactive=False,
         )
     with gr.Tab("Results: Test"):
         leaderboard_table_test = gr.components.Dataframe(
-             value=eval_dataframe_test, headers=COLS, datatype=TYPES, interactive=False,
+             value=eval_dataframe_test, datatype=TYPES, interactive=False,
         )
 
     refresh_button = gr.Button("Refresh")
@@ -181,10 +197,14 @@
             with gr.Column():
                 level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                 model_name_textbox = gr.Textbox(label="Model name")
-                 file_output = gr.File()
+                 model_family_textbox = gr.Textbox(label="Model family")
+                 system_prompt_textbox = gr.Textbox(label="System prompt example")
+                 url_textbox = gr.Textbox(label="Url to model information")
             with gr.Column():
                 organisation = gr.Textbox(label="Organisation")
                 mail = gr.Textbox(label="Contact email")
+                 file_output = gr.File()
+
 
         submit_button = gr.Button("Submit Eval")
         submission_result = gr.Markdown()
@@ -193,6 +213,9 @@
             [
                 level_of_test,
                 model_name_textbox,
+                 model_family_textbox,
+                 system_prompt_textbox,
+                 url_textbox,
                 file_output,
                 organisation,
                 mail
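
For reference, the new `get_dataframe_from_results` helper only reshapes the raw results dataset for display. Below is a minimal sketch (not part of this commit) of the same transformation applied to a toy in-memory dataset; the column set, values, and URL are assumptions based on the fields the diff reads and renames.

```python
# Sketch only: mimics the column mapping done in app.py on a hand-built toy dataset.
import pandas as pd
from datasets import Dataset

def model_hyperlink(link, model_name):
    # Simplified stand-in for the helper added to content.py in this commit.
    return f'<a target="_blank" href="{link}">{model_name}</a>'

# Hypothetical single-row results split with the columns app.py expects.
toy_split = Dataset.from_dict({
    "model": ["my-agent"],
    "model_family": ["GPT-4"],
    "url": ["https://example.com/my-agent"],
    "system_prompt": ["You are a general AI assistant."],
    "mail": ["contact@example.com"],
    "organisation": ["Example Org"],
    "score": [0.42],
    "score_level1": [0.60],
    "score_level2": [0.35],
    "score_level3": [0.10],
})

def get_dataframe_from_results(split_dataset):
    # Render the model name as a link, drop private columns, rename the rest for display.
    local_df = split_dataset.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["mail", "system_prompt", "url"])
    local_df = local_df.rename_column("model", "Model name")
    local_df = local_df.rename_column("model_family", "Model family")
    local_df = local_df.rename_column("score", "Average score (%)")
    for i in [1, 2, 3]:
        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
    return pd.DataFrame(local_df)

print(get_dataframe_from_results(toy_split))
```

Because the model column now holds an HTML link, the first entry of the new `TYPES` list is `"markdown"`, which is presumably what lets `gr.components.Dataframe` render the link instead of showing raw HTML.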
content.py CHANGED
@@ -2,18 +2,25 @@ TITLE = """<h1 align="center" id="space-title">GAIA Leaderboard</h1>"""
 
 CANARY_STRING = "" # TODO
 
- INTRODUCTION_TEXT = f"""
+ INTRODUCTION_TEXT = """
 Large language models have seen their potential capabilities increased by several orders of magnitude with the introduction of augmentations, from simple prompting adjustment to actual external tooling (calculators, vision models, ...) or online web retrieval.
-
 To evaluate the next generation of LLMs, we argue for a new kind of benchmark, simple and yet effective to measure actual progress on augmented capabilities.
 We therefore present GAIA.
 
 GAIA is made of 3 evaluation levels, depending on the added level of tooling and autonomy the model needs.
 We expect level 1 to be breakable by very good LLMs, and level 3 to indicate a strong jump in model capabilities.
+ Each of these levels is divided into two sets: a fully public dev set, on which people can test their models, and a test set with private answers and metadata. Results can be submitted for both validation and test.
+
+ We expect submissions to be JSON Lines files with the following format:
+ ```
+ {"task_id": "task_id_1", "model_answer": "Answer 1 from your model", "reasoning_trace": "The different steps by which your model reached answer 1"}
+ {"task_id": "task_id_2", "model_answer": "Answer 2 from your model", "reasoning_trace": "The different steps by which your model reached answer 2"}
+ ...
+ ```
 
- Each of these levels is divided into two sets: a public dev set, on which people can self report their results, and a private test set, which will be unlocked once public performance passes a threshold on the dev set.
+ Scores are expressed as the percentage of correct answers for a given split.
 
- Please do not repost the public dev set, nor use it in training data for your models. Its canary string is """ + CANARY_STRING + """ and files containing this string should be removed from training data.
+ Please do not repost the public dev set, nor use it in training data for your models.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -21,8 +28,6 @@ CITATION_BUTTON_TEXT = r"""@misc{gaia, # TODO
 author = {tbd},
 title = {General AI Assistant benchmark},
 year = {2023},
- #publisher = {Hugging Face},
- #howpublished = "\url{https://huggingface.co/spaces/gaia-benchmark/}"
 }"""
 
 
@@ -30,4 +35,8 @@ def format_warning(msg):
     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
 
 def format_log(msg):
-     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
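
The updated `INTRODUCTION_TEXT` above documents the expected submission format. As a minimal sketch (not part of this commit), a submission file could be produced like this; the task ids, answers, and the `submission.jsonl` file name are placeholders:

```python
# Sketch only: write predictions in the JSON Lines format described in INTRODUCTION_TEXT.
import json

predictions = [
    {"task_id": "task_id_1", "model_answer": "Answer 1 from your model",
     "reasoning_trace": "The different steps by which your model reached answer 1"},
    {"task_id": "task_id_2", "model_answer": "Answer 2 from your model",
     "reasoning_trace": "The different steps by which your model reached answer 2"},
]

with open("submission.jsonl", "w") as f:
    for row in predictions:
        f.write(json.dumps(row) + "\n")  # one JSON object per line
```

Each line holds one JSON object per task; scores are then reported as the percentage of correct answers for the chosen split, as the text states.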