Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -112,6 +112,7 @@ def add_new_eval(
|
|
112 |
|
113 |
with open(f"scored/{organization}_{model_name}.jsonl", "w") as scored_file:
|
114 |
with open(file_path, 'r') as f:
|
|
|
115 |
for ix, line in enumerate(f):
|
116 |
try:
|
117 |
task = json.loads(line)
|
@@ -141,12 +142,17 @@ def add_new_eval(
|
|
141 |
)
|
142 |
|
143 |
all_scores.append({"score": score, "has_ans": has_ans, "model_answer": answer, 'id': task_id})
|
144 |
-
|
145 |
scores += score
|
146 |
num_questions += 1
|
147 |
difficulty_scores[difficulty] += score
|
148 |
difficulty_counts[difficulty] += 1
|
149 |
|
|
|
|
|
|
|
|
|
|
|
150 |
accuracy_easy = difficulty_scores["Easy"] / difficulty_counts["Easy"] if difficulty_counts["Easy"] > 0 else 0
|
151 |
accuracy_medium = difficulty_scores["Medium"] / difficulty_counts["Medium"] if difficulty_counts["Medium"] > 0 else 0
|
152 |
accuracy_hard = difficulty_scores["Hard"] / difficulty_counts["Hard"] if difficulty_counts["Hard"] > 0 else 0
|
|
|
112 |
|
113 |
with open(f"scored/{organization}_{model_name}.jsonl", "w") as scored_file:
|
114 |
with open(file_path, 'r') as f:
|
115 |
+
submitted_ids = set()
|
116 |
for ix, line in enumerate(f):
|
117 |
try:
|
118 |
task = json.loads(line)
|
|
|
142 |
)
|
143 |
|
144 |
all_scores.append({"score": score, "has_ans": has_ans, "model_answer": answer, 'id': task_id})
|
145 |
+
submitted_ids.add(task["id"])
|
146 |
scores += score
|
147 |
num_questions += 1
|
148 |
difficulty_scores[difficulty] += score
|
149 |
difficulty_counts[difficulty] += 1
|
150 |
|
151 |
+
# Check if all gold answer IDs are present in the submission
|
152 |
+
missing_ids = set(gold_answers["test"].keys()) - submitted_ids
|
153 |
+
if missing_ids:
|
154 |
+
return format_error(f"Submission is missing the following IDs: {', '.join(missing_ids)}")
|
155 |
+
|
156 |
accuracy_easy = difficulty_scores["Easy"] / difficulty_counts["Easy"] if difficulty_counts["Easy"] > 0 else 0
|
157 |
accuracy_medium = difficulty_scores["Medium"] / difficulty_counts["Medium"] if difficulty_counts["Medium"] > 0 else 0
|
158 |
accuracy_hard = difficulty_scores["Hard"] / difficulty_counts["Hard"] if difficulty_counts["Hard"] > 0 else 0
|