Aaron Mueller committed
Commit a5487ef · 1 Parent(s): e7750ca

update interface/error messages

app.py CHANGED
@@ -97,7 +97,7 @@ def process_json(temp_file):
     except Exception as e:
         raise gr.Error(f"Error processing file: {str(e)}")
 
-    return data
+    return data, data # two: one for state (passed to data handler), one for display
 
 
 demo = gr.Blocks(css=custom_css)
@@ -174,10 +174,11 @@ with demo:
 
     predictions_data = gr.State()
    upload_button = gr.UploadButton(label="Upload predictions", file_types=[".json", ".gz"], file_count="single")
+    json_display = gr.JSON()
     upload_button.upload(
         fn=process_json,
         inputs=upload_button,
-        outputs=predictions_data,
+        outputs=(predictions_data, json_display),
         api_name="upload_json"
     )
 
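Note: with this wiring, process_json must return one value per output component listed in upload_button.upload. A minimal sketch of that contract, assuming temp_file is the file path Gradio passes from gr.UploadButton (the real handler in app.py is not fully shown here and may handle .gz uploads differently):

import gzip
import json
import gradio as gr

def process_json(temp_file):
    try:
        # Assumption: .gz uploads are read with gzip, plain .json with open
        opener = gzip.open if str(temp_file).endswith(".gz") else open
        with opener(temp_file, "rt") as f:
            data = json.load(f)
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}")
    # One return value per output component:
    # the first feeds gr.State (predictions_data), the second gr.JSON (json_display).
    return data, data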
src/display/utils.py CHANGED
@@ -74,4 +74,138 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
+BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
+
+TEXT_TASKS = {
+    "glue": ["cola", "sst2", "mrpc", "qqp", "mnli", "mnli-mm", "qnli", "rte",
+             "boolq", "multirc", "wsc"],
+    # Lots of BLiMP tasks – use verifier function below to see if you've included everything.
+    "blimp": [taskname.split(".jsonl")[0] for taskname in os.listdir("evaluation_data/blimp_filtered/")],
+    "blimp_supplement": ["hypernym", "qa_congruence_easy", "qa_congruence_tricky",
+                         "subject_aux_inversion", "turn_taking"],
+    "ewok": ["agent-properties", "material-dynamics", "material-properties", "physical-dynamics",
+             "physical-interactions", "physical-relations", "quantitative-properties",
+             "social-interactions", "social-properties", "social-relations", "spatial-relations"]
+}
+
+VISION_TASKS = {
+    "vqa": ["vqa"],
+    "winoground": ["winoground"],
+    "devbench": ["lex-viz_vocab", "gram-trog", "sem-things"]
+}
+
+NUM_EXPECTED_EXAMPLES = {
+    "glue": {
+        "cola": 522,
+        "sst2": 436,
+        "mrpc": 204,
+        "qqp": 20215,
+        "mnli": 4908,
+        "mnli-mm": 4916,
+        "qnli": 2732,
+        "rte": 139,
+        "boolq": 1635,
+        "multirc": 2424,
+        "wsc": 52
+    },
+    "blimp": {
+        "adjunct_island": 928,
+        "anaphor_gender_agreement": 971,
+        "anaphor_number_agreement": 931,
+        "animate_subject_passive": 895,
+        "animate_subject_trans": 923,
+        "causative": 818,
+        "complex_NP_island": 846,
+        "coordinate_structure_constraint_complex_left_branch": 906,
+        "coordinate_structure_constraint_object_extraction": 949,
+        "determiner_noun_agreement_1": 929,
+        "determiner_noun_agreement_2": 931,
+        "determiner_noun_agreement_irregular_1": 681,
+        "determiner_noun_agreement_irregular_2": 820,
+        "determiner_noun_agreement_with_adjective_1": 933,
+        "determiner_noun_agreement_with_adj_2": 941,
+        "determiner_noun_agreement_with_adj_irregular_1": 718,
+        "determiner_noun_agreement_with_adj_irregular_2": 840,
+        "distractor_agreement_relational_noun": 788,
+        "distractor_agreement_relative_clause": 871,
+        "drop_argument": 920,
+        "ellipsis_n_bar_1": 802,
+        "ellipsis_n_bar_2": 828,
+        "existential_there_object_raising": 812,
+        "existential_there_quantifiers_1": 930,
+        "existential_there_quantifiers_2": 911,
+        "existential_there_subject_raising": 924,
+        "expletive_it_object_raising": 759,
+        "inchoative": 855,
+        "intransitive": 868,
+        "irregular_past_participle_adjectives": 961,
+        "irregular_past_participle_verbs": 942,
+        "irregular_plural_subject_verb_agreement_1": 804,
+        "irregular_plural_subject_verb_agreement_2": 892,
+        "left_branch_island_echo_question": 947,
+        "left_branch_island_simple_question": 951,
+        "matrix_question_npi_licensor_present": 929,
+        "npi_present_1": 909,
+        "npi_present_2": 914,
+        "only_npi_licensor_present": 882,
+        "only_npi_scope": 837,
+        "passive_1": 840,
+        "passive_2": 903,
+        "principle_A_case_1": 912,
+        "principle_A_case_2": 915,
+        "principle_A_c_command": 946,
+        "principle_A_domain_1": 914,
+        "principle_A_domain_2": 915,
+        "principle_A_domain_3": 941,
+        "principle_A_reconstruction": 967,
+        "regular_plural_subject_verb_agreement_1": 890,
+        "regular_plural_subject_verb_agreement_2": 945,
+        "sentential_negation_npi_licensor_present": 919,
+        "sentential_negation_npi_scope": 871,
+        "sentential_subject_island": 961,
+        "superlative_quantifiers_1": 979,
+        "superlative_quantifiers_2": 986,
+        "tough_vs_raising_1": 948,
+        "tough_vs_raising_2": 920,
+        "transitive": 868,
+        "wh_island": 960,
+        "wh_questions_object_gap": 859,
+        "wh_questions_subject_gap": 898,
+        "wh_questions_subject_gap_long_distance": 857,
+        "wh_vs_that_no_gap": 861,
+        "wh_vs_that_no_gap_long_distance": 875,
+        "wh_vs_that_with_gap": 919,
+        "wh_vs_that_with_gap_long_distance": 910
+    },
+    "blimp_supplement": {
+        "hypernym": 842,
+        "qa_congruence_easy": 64,
+        "qa_congruence_tricky": 165,
+        "subject_aux_inversion": 3867,
+        "turn_taking": 280
+    },
+    "ewok": {
+        "agent-properties": 2210,
+        "material-dynamics": 770,
+        "material-properties": 170,
+        "physical-dynamics": 120,
+        "physical-interactions": 556,
+        "physical-relations": 818,
+        "quantitative-properties": 314,
+        "social-interactions": 294,
+        "social-properties": 328,
+        "social-relations": 1548,
+        "spatial-relations": 490
+    },
+    "vqa": {
+        "vqa": 25230
+    },
+    "winoground": {
+        "winoground": 746
+    },
+    "devbench": {
+        "lex-viz_vocab": 119,
+        "gram-trog": 76,
+        "sem-things": 1854
+    }
+}
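Note: the task lists and the per-subtask expected counts are maintained by hand, so they can drift apart. A hypothetical consistency check (not part of this commit; the helper name _check_task_tables is made up) could compare the two tables:

def _check_task_tables():
    # Every subtask listed in TEXT_TASKS / VISION_TASKS should have an expected count.
    for task_table in (TEXT_TASKS, VISION_TASKS):
        for task, subtasks in task_table.items():
            missing = set(subtasks) - set(NUM_EXPECTED_EXAMPLES.get(task, {}))
            assert not missing, f"No expected example counts for {task}: {sorted(missing)}"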
src/submission/check_validity.py CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 import re
+import numpy as np
 from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 
@@ -10,6 +11,8 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+from src.display.utils import TEXT_TASKS, VISION_TASKS, NUM_EXPECTED_EXAMPLES
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
@@ -97,3 +100,68 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             users_to_submission_dates[organisation].append(info["submitted_time"])
 
     return set(file_names), users_to_submission_dates
+
+def is_valid_predictions(predictions: dict) -> tuple[bool, str]:
+    out_msg = ""
+    for task in TEXT_TASKS:
+        if task not in predictions:
+            out_msg = f"Error: {task} not present"
+            break
+        for subtask in TEXT_TASKS[task]:
+            if subtask not in predictions[task]:
+                out_msg = f"Error: {subtask} not present under {task}"
+                break
+        if out_msg != "":
+            break
+    if "vqa" in predictions or "winoground" in predictions or "devbench" in predictions:
+        for task in VISION_TASKS:
+            if task not in predictions:
+                out_msg = f"Error: {task} not present"
+                break
+            for subtask in VISION_TASKS[task]:
+                if subtask not in predictions[task]:
+                    out_msg = f"Error: {subtask} not present under {task}"
+                    break
+            if out_msg != "":
+                break
+
+    # Make sure all examples have predictions, and that predictions are the correct type
+    for task in predictions:
+        for subtask in predictions[task]:
+            if task == "devbench":
+                a = np.array(predictions[task][subtask]["predictions"])
+                if subtask == "sem-things":
+                    required_shape = (1854, 1854)
+                elif subtask == "gram-trog":
+                    required_shape = (76, 4, 1)
+                elif subtask == "lex-viz_vocab":
+                    required_shape = (119, 4, 1)
+                if a.shape[0] != required_shape[0] or a.shape[1] != required_shape[1]:
+                    out_msg = f"Error: Wrong shape for results for `{subtask}` in `{task}`."
+                    break
+                if not str(a.dtype).startswith("float"):
+                    out_msg = f"Error: Results for `{subtask}` ({task}) \
+                        should be floats but aren't."
+                    break
+                continue
+
+            num_expected_examples = NUM_EXPECTED_EXAMPLES[task][subtask]
+            if len(predictions[task][subtask]["predictions"]) != num_expected_examples:
+                out_msg = f"Error: {subtask} has the wrong number of examples."
+                break
+
+            if task == "glue":
+                if type(predictions[task][subtask]["predictions"][0]["pred"]) != int:
+                    out_msg = f"Error: results for `{subtask}` (`{task}`) should be integers but aren't."
+                    break
+            else:
+                if type(predictions[task][subtask]["predictions"][0]["pred"]) != str:
+                    out_msg = f"Error: results for `{subtask}` (`{task}`) should be strings but aren't."
+                    break
+
+        if out_msg != "":
+            break
+
+    if out_msg != "":
+        return False, out_msg
+    return True, "Upload successful."
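For reference, a sketch of how a caller might exercise the new validator. The dict below is only structurally complete (one placeholder prediction per text subtask; any field other than "pred" and "predictions" would be an assumption and is therefore omitted), so the length check fails by design:

from src.display.utils import TEXT_TASKS
from src.submission.check_validity import is_valid_predictions

# Placeholder predictions: ints for GLUE, strings for the other text tasks.
predictions = {
    task: {subtask: {"predictions": [{"pred": 0 if task == "glue" else "A"}]}
           for subtask in subtasks}
    for task, subtasks in TEXT_TASKS.items()
}
ok, msg = is_valid_predictions(predictions)
print(ok, msg)  # -> False, "Error: cola has the wrong number of examples."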
src/submission/submit.py CHANGED
@@ -9,6 +9,7 @@ from src.submission.check_validity import (
     check_model_card,
     get_model_size,
     is_model_on_hub,
+    is_valid_predictions,
 )
 
 REQUESTED_MODELS = None
@@ -43,16 +44,22 @@ def add_new_eval(
     # Does the model actually exist?
     if revision == "":
         revision = "main"
+
+    out_message = ""
 
     # Is the model info correctly filled?
     try:
         model_info = API.model_info(repo_id=model_id, revision=revision)
     except Exception:
-        return styled_warning("Could not get your model information. Please fill it up properly.")
+        out_message += styled_warning("Could not get your model information. The leaderboard entry will not have a link to its HF repo.") + "<br>"
 
     modelcard_OK, error_msg = check_model_card(model_name)
     if not modelcard_OK:
-        return styled_warning(error_msg)
+        out_message += styled_warning(error_msg) + "<br>"
+
+    predictions_OK, error_msg = is_valid_predictions(predictions)
+    if not predictions_OK:
+        return styled_error(error_msg) + "<br>"
 
     # Seems good, creating the eval
     print("Adding new eval")
@@ -70,12 +77,12 @@ def add_new_eval(
 
     # Check for duplicate submission
     if f"{model_name}_{revision}_{track}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
+        return styled_error("A model with this name has been already submitted.")
 
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{track}.json"
+    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request_False_{track}.json"
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
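Design note: missing model info or an incomplete model card is now accumulated into out_message as a warning instead of aborting the submission, while invalid predictions and duplicate submissions still hard-error. A hedged sketch of how the accumulated warnings might be surfaced at the end of add_new_eval (the actual return statement is not shown in this diff, and styled_message is assumed to exist alongside styled_warning/styled_error):

# Hypothetical tail of add_new_eval(), not part of this commit:
# prepend any accumulated warnings to the success message.
success = styled_message("Your request has been submitted to the evaluation queue!")
return out_message + success if out_message else success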