Spaces:
Running
Running
Aaron Mueller
committed on
Commit
·
a5487ef
1
Parent(s):
e7750ca
update interface/error messages
Browse files- app.py +3 -2
- src/display/utils.py +135 -1
- src/submission/check_validity.py +68 -0
- src/submission/submit.py +11 -4
app.py
CHANGED
@@ -97,7 +97,7 @@ def process_json(temp_file):
|
|
97 |
except Exception as e:
|
98 |
raise gr.Error(f"Error processing file: {str(e)}")
|
99 |
|
100 |
-
return data
|
101 |
|
102 |
|
103 |
demo = gr.Blocks(css=custom_css)
|
@@ -174,10 +174,11 @@ with demo:
|
|
174 |
|
175 |
predictions_data = gr.State()
|
176 |
upload_button = gr.UploadButton(label="Upload predictions", file_types=[".json", ".gz"], file_count="single")
|
|
|
177 |
upload_button.upload(
|
178 |
fn=process_json,
|
179 |
inputs=upload_button,
|
180 |
-
outputs=predictions_data,
|
181 |
api_name="upload_json"
|
182 |
)
|
183 |
|
|
|
97 |
except Exception as e:
|
98 |
raise gr.Error(f"Error processing file: {str(e)}")
|
99 |
|
100 |
+
return data, data # two: one for state (passed to data handler), one for display
|
101 |
|
102 |
|
103 |
demo = gr.Blocks(css=custom_css)
|
|
|
174 |
|
175 |
predictions_data = gr.State()
|
176 |
upload_button = gr.UploadButton(label="Upload predictions", file_types=[".json", ".gz"], file_count="single")
|
177 |
+
json_display = gr.JSON()
|
178 |
upload_button.upload(
|
179 |
fn=process_json,
|
180 |
inputs=upload_button,
|
181 |
+
outputs=(predictions_data, json_display),
|
182 |
api_name="upload_json"
|
183 |
)
|
184 |
|
src/display/utils.py
CHANGED
@@ -74,4 +74,138 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
|
74 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
75 |
|
76 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
77 |
-
BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
75 |
|
76 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
77 |
+
BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
|
78 |
+
|
79 |
+
TEXT_TASKS = {
|
80 |
+
"glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
|
81 |
+
"boolq", "multirc", "wsc"],
|
82 |
+
# Lots of BLiMP tasks – use verifier function below to see if you've included everything.
|
83 |
+
"blimp": [taskname.split(".jsonl")[0] for taskname in os.listdir("evaluation_data/blimp_filtered/")],
|
84 |
+
"blimp_supplement": ["hypernym", "qa_congruence_easy", "qa_congruence_tricky",
|
85 |
+
"subject_aux_inversion", "turn_taking"],
|
86 |
+
"ewok": ["agent-properties", "material-dynamics", "material-properties", "physical-dynamics",
|
87 |
+
"physical-interactions", "physical-relations", "quantitative-properties",
|
88 |
+
"social-interactions", "social-properties", "social-relations", "spatial-relations"]
|
89 |
+
}
|
90 |
+
|
91 |
+
VISION_TASKS = {
|
92 |
+
"vqa": ["vqa"],
|
93 |
+
"winoground": ["winoground"],
|
94 |
+
"devbench": ["lex-viz_vocab", "gram-trog", "sem-things"]
|
95 |
+
}
|
96 |
+
|
97 |
+
NUM_EXPECTED_EXAMPLES = {
|
98 |
+
"glue": {
|
99 |
+
"cola": 522,
|
100 |
+
"sst2": 436,
|
101 |
+
"mrpc": 204,
|
102 |
+
"qqp": 20215,
|
103 |
+
"mnli": 4908,
|
104 |
+
"mnli-mm": 4916,
|
105 |
+
"qnli": 2732,
|
106 |
+
"rte": 139,
|
107 |
+
"boolq": 1635,
|
108 |
+
"multirc": 2424,
|
109 |
+
"wsc": 52
|
110 |
+
},
|
111 |
+
"blimp": {
|
112 |
+
"adjunct_island": 928,
|
113 |
+
"anaphor_gender_agreement": 971,
|
114 |
+
"anaphor_number_agreement": 931,
|
115 |
+
"animate_subject_passive": 895,
|
116 |
+
"animate_subject_trans": 923,
|
117 |
+
"causative": 818,
|
118 |
+
"complex_NP_island": 846,
|
119 |
+
"coordinate_structure_constraint_complex_left_branch": 906,
|
120 |
+
"coordinate_structure_constraint_object_extraction": 949,
|
121 |
+
"determiner_noun_agreement_1": 929,
|
122 |
+
"determiner_noun_agreement_2": 931,
|
123 |
+
"determiner_noun_agreement_irregular_1": 681,
|
124 |
+
"determiner_noun_agreement_irregular_2": 820,
|
125 |
+
"determiner_noun_agreement_with_adjective_1": 933,
|
126 |
+
"determiner_noun_agreement_with_adj_2": 941,
|
127 |
+
"determiner_noun_agreement_with_adj_irregular_1": 718,
|
128 |
+
"determiner_noun_agreement_with_adj_irregular_2": 840,
|
129 |
+
"distractor_agreement_relational_noun": 788,
|
130 |
+
"distractor_agreement_relative_clause": 871,
|
131 |
+
"drop_argument": 920,
|
132 |
+
"ellipsis_n_bar_1": 802,
|
133 |
+
"ellipsis_n_bar_2": 828,
|
134 |
+
"existential_there_object_raising": 812,
|
135 |
+
"existential_there_quantifiers_1": 930,
|
136 |
+
"existential_there_quantifiers_2": 911,
|
137 |
+
"existential_there_subject_raising": 924,
|
138 |
+
"expletive_it_object_raising": 759,
|
139 |
+
"inchoative": 855,
|
140 |
+
"intransitive": 868,
|
141 |
+
"irregular_past_participle_adjectives": 961,
|
142 |
+
"irregular_past_participle_verbs": 942,
|
143 |
+
"irregular_plural_subject_verb_agreement_1": 804,
|
144 |
+
"irregular_plural_subject_verb_agreement_2": 892,
|
145 |
+
"left_branch_island_echo_question": 947,
|
146 |
+
"left_branch_island_simple_question": 951,
|
147 |
+
"matrix_question_npi_licensor_present": 929,
|
148 |
+
"npi_present_1": 909,
|
149 |
+
"npi_present_2": 914,
|
150 |
+
"only_npi_licensor_present": 882,
|
151 |
+
"only_npi_scope": 837,
|
152 |
+
"passive_1": 840,
|
153 |
+
"passive_2": 903,
|
154 |
+
"principle_A_case_1": 912,
|
155 |
+
"principle_A_case_2": 915,
|
156 |
+
"principle_A_c_command": 946,
|
157 |
+
"principle_A_domain_1": 914,
|
158 |
+
"principle_A_domain_2": 915,
|
159 |
+
"principle_A_domain_3": 941,
|
160 |
+
"principle_A_reconstruction": 967,
|
161 |
+
"regular_plural_subject_verb_agreement_1": 890,
|
162 |
+
"regular_plural_subject_verb_agreement_2": 945,
|
163 |
+
"sentential_negation_npi_licensor_present": 919,
|
164 |
+
"sentential_negation_npi_scope": 871,
|
165 |
+
"sentential_subject_island": 961,
|
166 |
+
"superlative_quantifiers_1": 979,
|
167 |
+
"superlative_quantifiers_2": 986,
|
168 |
+
"tough_vs_raising_1": 948,
|
169 |
+
"tough_vs_raising_2": 920,
|
170 |
+
"transitive": 868,
|
171 |
+
"wh_island": 960,
|
172 |
+
"wh_questions_object_gap": 859,
|
173 |
+
"wh_questions_subject_gap": 898,
|
174 |
+
"wh_questions_subject_gap_long_distance": 857,
|
175 |
+
"wh_vs_that_no_gap": 861,
|
176 |
+
"wh_vs_that_no_gap_long_distance": 875,
|
177 |
+
"wh_vs_that_with_gap": 919,
|
178 |
+
"wh_vs_that_with_gap_long_distance": 910
|
179 |
+
},
|
180 |
+
"blimp_supplement": {
|
181 |
+
"hypernym": 842,
|
182 |
+
"qa_congruence_easy": 64,
|
183 |
+
"qa_congruence_tricky": 165,
|
184 |
+
"subject_aux_inversion": 3867,
|
185 |
+
"turn_taking": 280
|
186 |
+
},
|
187 |
+
"ewok": {
|
188 |
+
"agent-properties": 2210,
|
189 |
+
"material-dynamics": 770,
|
190 |
+
"material-properties": 170,
|
191 |
+
"physical-dynamics": 120,
|
192 |
+
"physical-interactions": 556,
|
193 |
+
"physical-relations": 818,
|
194 |
+
"quantitative-properties": 314,
|
195 |
+
"social-interactions": 294,
|
196 |
+
"social-properties": 328,
|
197 |
+
"social-relations": 1548,
|
198 |
+
"spatial-relations": 490
|
199 |
+
},
|
200 |
+
"vqa": {
|
201 |
+
"vqa": 25230
|
202 |
+
},
|
203 |
+
"winoground": {
|
204 |
+
"winoground": 746
|
205 |
+
},
|
206 |
+
"devbench": {
|
207 |
+
"lex-viz_vocab": 119,
|
208 |
+
"gram-trog": 76,
|
209 |
+
"sem-things": 1854
|
210 |
+
}
|
211 |
+
}
|
src/submission/check_validity.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
import re
|
|
|
4 |
from collections import defaultdict
|
5 |
from datetime import datetime, timedelta, timezone
|
6 |
|
@@ -10,6 +11,8 @@ from huggingface_hub.hf_api import ModelInfo
|
|
10 |
from transformers import AutoConfig
|
11 |
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
12 |
|
|
|
|
|
13 |
def check_model_card(repo_id: str) -> tuple[bool, str]:
|
14 |
"""Checks if the model card and license exist and have been filled"""
|
15 |
try:
|
@@ -97,3 +100,68 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
|
|
97 |
users_to_submission_dates[organisation].append(info["submitted_time"])
|
98 |
|
99 |
return set(file_names), users_to_submission_dates
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
import re
|
4 |
+
import numpy as np
|
5 |
from collections import defaultdict
|
6 |
from datetime import datetime, timedelta, timezone
|
7 |
|
|
|
11 |
from transformers import AutoConfig
|
12 |
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
13 |
|
14 |
+
from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
|
15 |
+
|
16 |
def check_model_card(repo_id: str) -> tuple[bool, str]:
|
17 |
"""Checks if the model card and license exist and have been filled"""
|
18 |
try:
|
|
|
100 |
users_to_submission_dates[organisation].append(info["submitted_time"])
|
101 |
|
102 |
return set(file_names), users_to_submission_dates
|
103 |
+
|
104 |
+
def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
|
105 |
+
out_msg = ""
|
106 |
+
for task in TEXT_TASKS:
|
107 |
+
if task not in predictions:
|
108 |
+
out_msg = f"Error: {task} not present"
|
109 |
+
break
|
110 |
+
for subtask in TEXT_TASKS[task]:
|
111 |
+
if subtask not in predictions[task]:
|
112 |
+
out_msg = f"Error: {subtask} not present under {task}"
|
113 |
+
break
|
114 |
+
if out_msg != "":
|
115 |
+
break
|
116 |
+
if "vqa" in predictions or "winoground" in predictions or "devbench" in predictions:
|
117 |
+
for task in VISION_TASKS:
|
118 |
+
if task not in predictions:
|
119 |
+
out_msg = f"Error: {task} not present"
|
120 |
+
break
|
121 |
+
for subtask in VISION_TASKS[task]:
|
122 |
+
if subtask not in predictions[task]:
|
123 |
+
out_msg = f"Error: {subtask} not present under {task}"
|
124 |
+
break
|
125 |
+
if out_msg != "":
|
126 |
+
break
|
127 |
+
|
128 |
+
# Make sure all examples have predictions, and that predictions are the correct type
|
129 |
+
for task in predictions:
|
130 |
+
for subtask in predictions[task]:
|
131 |
+
if task == "devbench":
|
132 |
+
a = np.array(predictions[task][subtask]["predictions"])
|
133 |
+
if subtask == "sem-things":
|
134 |
+
required_shape = (1854, 1854)
|
135 |
+
elif subtask == "gram-trog":
|
136 |
+
required_shape = (76, 4, 1)
|
137 |
+
elif subtask == "lex-viz_vocab":
|
138 |
+
required_shape = (119, 4, 1)
|
139 |
+
if a.shape[0] != required_shape[0] or a.shape[1] != required_shape[1]:
|
140 |
+
out_msg = f"Error: Wrong shape for results for `{subtask}` in `{task}`."
|
141 |
+
break
|
142 |
+
if not str(a.dtype).startswith("float"):
|
143 |
+
out_msg = f"Error: Results for `{subtask}` ({task}) \
|
144 |
+
should be floats but aren't."
|
145 |
+
break
|
146 |
+
continue
|
147 |
+
|
148 |
+
num_expected_examples = NUM_EXPECTED_EXAMPLES[task][subtask]
|
149 |
+
if len(predictions[task][subtask]["predictions"]) != num_expected_examples:
|
150 |
+
out_msg = f"Error: {subtask} has the wrong number of examples."
|
151 |
+
break
|
152 |
+
|
153 |
+
if task == "glue":
|
154 |
+
if type(predictions[task][subtask]["predictions"][0]["pred"]) != int:
|
155 |
+
out_msg = f"Error: results for `{subtask}` (`{task}`) should be integers but aren't."
|
156 |
+
break
|
157 |
+
else:
|
158 |
+
if type(predictions[task][subtask]["predictions"][0]["pred"]) != str:
|
159 |
+
out_msg = f"Error: results for `{subtask}` (`{task}`) should be strings but aren't."
|
160 |
+
break
|
161 |
+
|
162 |
+
if out_msg != "":
|
163 |
+
break
|
164 |
+
|
165 |
+
if out_msg != "":
|
166 |
+
return False, out_msg
|
167 |
+
return True, "Upload successful."
|
src/submission/submit.py
CHANGED
@@ -9,6 +9,7 @@ from src.submission.check_validity import (
|
|
9 |
check_model_card,
|
10 |
get_model_size,
|
11 |
is_model_on_hub,
|
|
|
12 |
)
|
13 |
|
14 |
REQUESTED_MODELS = None
|
@@ -43,16 +44,22 @@ def add_new_eval(
|
|
43 |
# Does the model actually exist?
|
44 |
if revision == "":
|
45 |
revision = "main"
|
|
|
|
|
46 |
|
47 |
# Is the model info correctly filled?
|
48 |
try:
|
49 |
model_info = API.model_info(repo_id=model_id, revision=revision)
|
50 |
except Exception:
|
51 |
-
|
52 |
|
53 |
modelcard_OK, error_msg = check_model_card(model_name)
|
54 |
if not modelcard_OK:
|
55 |
-
|
|
|
|
|
|
|
|
|
56 |
|
57 |
# Seems good, creating the eval
|
58 |
print("Adding new eval")
|
@@ -70,12 +77,12 @@ def add_new_eval(
|
|
70 |
|
71 |
# Check for duplicate submission
|
72 |
if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
|
73 |
-
return
|
74 |
|
75 |
print("Creating eval file")
|
76 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
77 |
os.makedirs(OUT_DIR, exist_ok=True)
|
78 |
-
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{track}.json"
|
79 |
|
80 |
with open(out_path, "w") as f:
|
81 |
f.write(json.dumps(eval_entry))
|
|
|
9 |
check_model_card,
|
10 |
get_model_size,
|
11 |
is_model_on_hub,
|
12 |
+
is_valid_predictions,
|
13 |
)
|
14 |
|
15 |
REQUESTED_MODELS = None
|
|
|
44 |
# Does the model actually exist?
|
45 |
if revision == "":
|
46 |
revision = "main"
|
47 |
+
|
48 |
+
out_message = ""
|
49 |
|
50 |
# Is the model info correctly filled?
|
51 |
try:
|
52 |
model_info = API.model_info(repo_id=model_id, revision=revision)
|
53 |
except Exception:
|
54 |
+
out_message += styled_warning("Could not get your model information. The leaderboard entry will not have a link to its HF repo.") + "<br>"
|
55 |
|
56 |
modelcard_OK, error_msg = check_model_card(model_name)
|
57 |
if not modelcard_OK:
|
58 |
+
out_message += styled_warning(error_msg) + "<br>"
|
59 |
+
|
60 |
+
predictions_OK, error_msg = is_valid_predictions(predictions)
|
61 |
+
if not predictions_OK:
|
62 |
+
return styled_error(error_msg) + "<br>"
|
63 |
|
64 |
# Seems good, creating the eval
|
65 |
print("Adding new eval")
|
|
|
77 |
|
78 |
# Check for duplicate submission
|
79 |
if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
|
80 |
+
return styled_error("A model with this name has been already submitted.")
|
81 |
|
82 |
print("Creating eval file")
|
83 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
84 |
os.makedirs(OUT_DIR, exist_ok=True)
|
85 |
+
out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"
|
86 |
|
87 |
with open(out_path, "w") as f:
|
88 |
f.write(json.dumps(eval_entry))
|