Spaces:
Running
Running
Aaron Mueller
committed on
Commit
·
a5487ef
1
Parent(s):
e7750ca
update interface/error messages
Browse files- app.py +3 -2
- src/display/utils.py +135 -1
- src/submission/check_validity.py +68 -0
- src/submission/submit.py +11 -4
app.py
CHANGED
@@ -97,7 +97,7 @@ def process_json(temp_file):
|
|
97 |
except Exception as e:
|
98 |
raise gr.Error(f"Error processing file: {str(e)}")
|
99 |
|
100 |
-
return data
|
101 |
|
102 |
|
103 |
demo = gr.Blocks(css=custom_css)
|
@@ -174,10 +174,11 @@ with demo:
|
|
174 |
|
175 |
predictions_data = gr.State()
|
176 |
upload_button = gr.UploadButton(label="Upload predictions", file_types=[".json", ".gz"], file_count="single")
|
|
|
177 |
upload_button.upload(
|
178 |
fn=process_json,
|
179 |
inputs=upload_button,
|
180 |
-
outputs=predictions_data,
|
181 |
api_name="upload_json"
|
182 |
)
|
183 |
|
|
|
97 |
except Exception as e:
|
98 |
raise gr.Error(f"Error processing file: {str(e)}")
|
99 |
|
100 |
+
return data, data # two: one for state (passed to data handler), one for display
|
101 |
|
102 |
|
103 |
demo = gr.Blocks(css=custom_css)
|
|
|
174 |
|
175 |
predictions_data = gr.State()
|
176 |
upload_button = gr.UploadButton(label="Upload predictions", file_types=[".json", ".gz"], file_count="single")
|
177 |
+
json_display = gr.JSON()
|
178 |
upload_button.upload(
|
179 |
fn=process_json,
|
180 |
inputs=upload_button,
|
181 |
+
outputs=(predictions_data, json_display),
|
182 |
api_name="upload_json"
|
183 |
)
|
184 |
|
src/display/utils.py
CHANGED
@@ -74,4 +74,138 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
|
74 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
75 |
|
76 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
77 |
-
BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
75 |
|
76 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
77 |
+
BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
|
78 |
+
|
79 |
+
TEXT_TASKS = {
|
80 |
+
"glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
|
81 |
+
"boolq", "multirc", "wsc"],
|
82 |
+
# Lots of BLiMP tasks – use verifier function below to see if you've included everything.
|
83 |
+
"blimp": [taskname.split(".jsonl")[0] for taskname in os.listdir("evaluation_data/blimp_filtered/")],
|
84 |
+
"blimp_supplement": ["hypernym", "qa_congruence_easy", "qa_congruence_tricky",
|
85 |
+
"subject_aux_inversion", "turn_taking"],
|
86 |
+
"ewok": ["agent-properties", "material-dynamics", "material-properties", "physical-dynamics",
|
87 |
+
"physical-interactions", "physical-relations", "quantitative-properties",
|
88 |
+
"social-interactions", "social-properties", "social-relations", "spatial-relations"]
|
89 |
+
}
|
90 |
+
|
91 |
+
VISION_TASKS = {
|
92 |
+
"vqa": ["vqa"],
|
93 |
+
"winoground": ["winoground"],
|
94 |
+
"devbench": ["lex-viz_vocab", "gram-trog", "sem-things"]
|
95 |
+
}
|
96 |
+
|
97 |
+
NUM_EXPECTED_EXAMPLES = {
|
98 |
+
"glue": {
|
99 |
+
"cola": 522,
|
100 |
+
"sst2": 436,
|
101 |
+
"mrpc": 204,
|
102 |
+
"qqp": 20215,
|
103 |
+
"mnli": 4908,
|
104 |
+
"mnli-mm": 4916,
|
105 |
+
"qnli": 2732,
|
106 |
+
"rte": 139,
|
107 |
+
"boolq": 1635,
|
108 |
+
"multirc": 2424,
|
109 |
+
"wsc": 52
|
110 |
+
},
|
111 |
+
"blimp": {
|
112 |
+
"adjunct_island": 928,
|
113 |
+
"anaphor_gender_agreement": 971,
|
114 |
+
"anaphor_number_agreement": 931,
|
115 |
+
"animate_subject_passive": 895,
|
116 |
+
"animate_subject_trans": 923,
|
117 |
+
"causative": 818,
|
118 |
+
"complex_NP_island": 846,
|
119 |
+
"coordinate_structure_constraint_complex_left_branch": 906,
|
120 |
+
"coordinate_structure_constraint_object_extraction": 949,
|
121 |
+
"determiner_noun_agreement_1": 929,
|
122 |
+
"determiner_noun_agreement_2": 931,
|
123 |
+
"determiner_noun_agreement_irregular_1": 681,
|
124 |
+
"determiner_noun_agreement_irregular_2": 820,
|
125 |
+
"determiner_noun_agreement_with_adjective_1": 933,
|
126 |
+
"determiner_noun_agreement_with_adj_2": 941,
|
127 |
+
"determiner_noun_agreement_with_adj_irregular_1": 718,
|
128 |
+
"determiner_noun_agreement_with_adj_irregular_2": 840,
|
129 |
+
"distractor_agreement_relational_noun": 788,
|
130 |
+
"distractor_agreement_relative_clause": 871,
|
131 |
+
"drop_argument": 920,
|
132 |
+
"ellipsis_n_bar_1": 802,
|
133 |
+
"ellipsis_n_bar_2": 828,
|
134 |
+
"existential_there_object_raising": 812,
|
135 |
+
"existential_there_quantifiers_1": 930,
|
136 |
+
"existential_there_quantifiers_2": 911,
|
137 |
+
"existential_there_subject_raising": 924,
|
138 |
+
"expletive_it_object_raising": 759,
|
139 |
+
"inchoative": 855,
|
140 |
+
"intransitive": 868,
|
141 |
+
"irregular_past_participle_adjectives": 961,
|
142 |
+
"irregular_past_participle_verbs": 942,
|
143 |
+
"irregular_plural_subject_verb_agreement_1": 804,
|
144 |
+
"irregular_plural_subject_verb_agreement_2": 892,
|
145 |
+
"left_branch_island_echo_question": 947,
|
146 |
+
"left_branch_island_simple_question": 951,
|
147 |
+
"matrix_question_npi_licensor_present": 929,
|
148 |
+
"npi_present_1": 909,
|
149 |
+
"npi_present_2": 914,
|
150 |
+
"only_npi_licensor_present": 882,
|
151 |
+
"only_npi_scope": 837,
|
152 |
+
"passive_1": 840,
|
153 |
+
"passive_2": 903,
|
154 |
+
"principle_A_case_1": 912,
|
155 |
+
"principle_A_case_2": 915,
|
156 |
+
"principle_A_c_command": 946,
|
157 |
+
"principle_A_domain_1": 914,
|
158 |
+
"principle_A_domain_2": 915,
|
159 |
+
"principle_A_domain_3": 941,
|
160 |
+
"principle_A_reconstruction": 967,
|
161 |
+
"regular_plural_subject_verb_agreement_1": 890,
|
162 |
+
"regular_plural_subject_verb_agreement_2": 945,
|
163 |
+
"sentential_negation_npi_licensor_present": 919,
|
164 |
+
"sentential_negation_npi_scope": 871,
|
165 |
+
"sentential_subject_island": 961,
|
166 |
+
"superlative_quantifiers_1": 979,
|
167 |
+
"superlative_quantifiers_2": 986,
|
168 |
+
"tough_vs_raising_1": 948,
|
169 |
+
"tough_vs_raising_2": 920,
|
170 |
+
"transitive": 868,
|
171 |
+
"wh_island": 960,
|
172 |
+
"wh_questions_object_gap": 859,
|
173 |
+
"wh_questions_subject_gap": 898,
|
174 |
+
"wh_questions_subject_gap_long_distance": 857,
|
175 |
+
"wh_vs_that_no_gap": 861,
|
176 |
+
"wh_vs_that_no_gap_long_distance": 875,
|
177 |
+
"wh_vs_that_with_gap": 919,
|
178 |
+
"wh_vs_that_with_gap_long_distance": 910
|
179 |
+
},
|
180 |
+
"blimp_supplement": {
|
181 |
+
"hypernym": 842,
|
182 |
+
"qa_congruence_easy": 64,
|
183 |
+
"qa_congruence_tricky": 165,
|
184 |
+
"subject_aux_inversion": 3867,
|
185 |
+
"turn_taking": 280
|
186 |
+
},
|
187 |
+
"ewok": {
|
188 |
+
"agent-properties": 2210,
|
189 |
+
"material-dynamics": 770,
|
190 |
+
"material-properties": 170,
|
191 |
+
"physical-dynamics": 120,
|
192 |
+
"physical-interactions": 556,
|
193 |
+
"physical-relations": 818,
|
194 |
+
"quantitative-properties": 314,
|
195 |
+
"social-interactions": 294,
|
196 |
+
"social-properties": 328,
|
197 |
+
"social-relations": 1548,
|
198 |
+
"spatial-relations": 490
|
199 |
+
},
|
200 |
+
"vqa": {
|
201 |
+
"vqa": 25230
|
202 |
+
},
|
203 |
+
"winoground": {
|
204 |
+
"winoground": 746
|
205 |
+
},
|
206 |
+
"devbench": {
|
207 |
+
"lex-viz_vocab": 119,
|
208 |
+
"gram-trog": 76,
|
209 |
+
"sem-things": 1854
|
210 |
+
}
|
211 |
+
}
|
src/submission/check_validity.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
import re
|
|
|
4 |
from collections import defaultdict
|
5 |
from datetime import datetime, timedelta, timezone
|
6 |
|
@@ -10,6 +11,8 @@ from huggingface_hub.hf_api import ModelInfo
|
|
10 |
from transformers import AutoConfig
|
11 |
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
12 |
|
|
|
|
|
13 |
def check_model_card(repo_id: str) -> tuple[bool, str]:
|
14 |
"""Checks if the model card and license exist and have been filled"""
|
15 |
try:
|
@@ -97,3 +100,68 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
|
|
97 |
users_to_submission_dates[organisation].append(info["submitted_time"])
|
98 |
|
99 |
return set(file_names), users_to_submission_dates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
import re
|
4 |
+
import numpy as np
|
5 |
from collections import defaultdict
|
6 |
from datetime import datetime, timedelta, timezone
|
7 |
|
|
|
11 |
from transformers import AutoConfig
|
12 |
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
13 |
|
14 |
+
from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
|
15 |
+
|
16 |
def check_model_card(repo_id: str) -> tuple[bool, str]:
|
17 |
"""Checks if the model card and license exist and have been filled"""
|
18 |
try:
|
|
|
100 |
users_to_submission_dates[organisation].append(info["submitted_time"])
|
101 |
|
102 |
return set(file_names), users_to_submission_dates
|
103 |
+
|
104 |
+
def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
|
105 |
+
out_msg = ""
|
106 |
+
for task in TEXT_TASKS:
|
107 |
+
if task not in predictions:
|
108 |
+
out_msg = f"Error: {task} not present"
|
109 |
+
break
|
110 |
+
for subtask in TEXT_TASKS[task]:
|
111 |
+
if subtask not in predictions[task]:
|
112 |
+
out_msg = f"Error: {subtask} not present under {task}"
|
113 |
+
break
|
114 |
+
if out_msg != "":
|
115 |
+
break
|
116 |
+
if "vqa" in predictions or "winoground" in predictions or "devbench" in predictions:
|
117 |
+
for task in VISION_TASKS:
|
118 |
+
if task not in predictions:
|
119 |
+
out_msg = f"Error: {task} not present"
|
120 |
+
break
|
121 |
+
for subtask in VISION_TASKS[task]:
|
122 |
+
if subtask not in predictions[task]:
|
123 |
+
out_msg = f"Error: {subtask} not present under {task}"
|
124 |
+
break
|
125 |
+
if out_msg != "":
|
126 |
+
break
|
127 |
+
|
128 |
+
# Make sure all examples have predictions, and that predictions are the correct type
|
129 |
+
for task in predictions:
|
130 |
+
for subtask in predictions[task]:
|
131 |
+
if task == "devbench":
|
132 |
+
a = np.array(predictions[task][subtask]["predictions"])
|
133 |
+
if subtask == "sem-things":
|
134 |
+
required_shape = (1854, 1854)
|
135 |
+
elif subtask == "gram-trog":
|
136 |
+
required_shape = (76, 4, 1)
|
137 |
+
elif subtask == "lex-viz_vocab":
|
138 |
+
required_shape = (119, 4, 1)
|
139 |
+
if a.shape[0] != required_shape[0] or a.shape[1] != required_shape[1]:
|
140 |
+
out_msg = f"Error: Wrong shape for results for `{subtask}` in `{task}`."
|
141 |
+
break
|
142 |
+
if not str(a.dtype).startswith("float"):
|
143 |
+
out_msg = f"Error: Results for `{subtask}` ({task}) \
|
144 |
+
should be floats but aren't."
|
145 |
+
break
|
146 |
+
continue
|
147 |
+
|
148 |
+
num_expected_examples = NUM_EXPECTED_EXAMPLES[task][subtask]
|
149 |
+
if len(predictions[task][subtask]["predictions"]) != num_expected_examples:
|
150 |
+
out_msg = f"Error: {subtask} has the wrong number of examples."
|
151 |
+
break
|
152 |
+
|
153 |
+
if task == "glue":
|
154 |
+
if type(predictions[task][subtask]["predictions"][0]["pred"]) != int:
|
155 |
+
out_msg = f"Error: results for `{subtask}` (`{task}`) should be integers but aren't."
|
156 |
+
break
|
157 |
+
else:
|
158 |
+
if type(predictions[task][subtask]["predictions"][0]["pred"]) != str:
|
159 |
+
out_msg = f"Error: results for `{subtask}` (`{task}`) should be strings but aren't."
|
160 |
+
break
|
161 |
+
|
162 |
+
if out_msg != "":
|
163 |
+
break
|
164 |
+
|
165 |
+
if out_msg != "":
|
166 |
+
return False, out_msg
|
167 |
+
return True, "Upload successful."
|
src/submission/submit.py
CHANGED
@@ -9,6 +9,7 @@ from src.submission.check_validity import (
|
|
9 |
check_model_card,
|
10 |
get_model_size,
|
11 |
is_model_on_hub,
|
|
|
12 |
)
|
13 |
|
14 |
REQUESTED_MODELS = None
|
@@ -43,16 +44,22 @@ def add_new_eval(
|
|
43 |
# Does the model actually exist?
|
44 |
if revision == "":
|
45 |
revision = "main"
|
|
|
|
|
46 |
|
47 |
# Is the model info correctly filled?
|
48 |
try:
|
49 |
model_info = API.model_info(repo_id=model_id, revision=revision)
|
50 |
except Exception:
|
51 |
-
|
52 |
|
53 |
modelcard_OK, error_msg = check_model_card(model_name)
|
54 |
if not modelcard_OK:
|
55 |
-
|
|
|
|
|
|
|
|
|
56 |
|
57 |
# Seems good, creating the eval
|
58 |
print("Adding new eval")
|
@@ -70,12 +77,12 @@ def add_new_eval(
|
|
70 |
|
71 |
# Check for duplicate submission
|
72 |
if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
|
73 |
-
return
|
74 |
|
75 |
print("Creating eval file")
|
76 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
77 |
os.makedirs(OUT_DIR, exist_ok=True)
|
78 |
-
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{track}.json"
|
79 |
|
80 |
with open(out_path, "w") as f:
|
81 |
f.write(json.dumps(eval_entry))
|
|
|
9 |
check_model_card,
|
10 |
get_model_size,
|
11 |
is_model_on_hub,
|
12 |
+
is_valid_predictions,
|
13 |
)
|
14 |
|
15 |
REQUESTED_MODELS = None
|
|
|
44 |
# Does the model actually exist?
|
45 |
if revision == "":
|
46 |
revision = "main"
|
47 |
+
|
48 |
+
out_message = ""
|
49 |
|
50 |
# Is the model info correctly filled?
|
51 |
try:
|
52 |
model_info = API.model_info(repo_id=model_id, revision=revision)
|
53 |
except Exception:
|
54 |
+
out_message += styled_warning("Could not get your model information. The leaderboard entry will not have a link to its HF repo.") + "<br>"
|
55 |
|
56 |
modelcard_OK, error_msg = check_model_card(model_name)
|
57 |
if not modelcard_OK:
|
58 |
+
out_message += styled_warning(error_msg) + "<br>"
|
59 |
+
|
60 |
+
predictions_OK, error_msg = is_valid_predictions(predictions)
|
61 |
+
if not predictions_OK:
|
62 |
+
return styled_error(error_msg) + "<br>"
|
63 |
|
64 |
# Seems good, creating the eval
|
65 |
print("Adding new eval")
|
|
|
77 |
|
78 |
# Check for duplicate submission
|
79 |
if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
|
80 |
+
return styled_error("A model with this name has been already submitted.")
|
81 |
|
82 |
print("Creating eval file")
|
83 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
84 |
os.makedirs(OUT_DIR, exist_ok=True)
|
85 |
+
out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"
|
86 |
|
87 |
with open(out_path, "w") as f:
|
88 |
f.write(json.dumps(eval_entry))
|