samuelam committed on
Commit
90a7ae2
1 Parent(s): c5f30ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -5
app.py CHANGED
@@ -132,7 +132,8 @@ def add_new_eval(
132
  json.dumps({
133
  "id": task_id,
134
  "model_answer": answer,
135
- "score": score
 
136
  }) + "\n"
137
  )
138
 
@@ -153,18 +154,23 @@ def add_new_eval(
153
  token=TOKEN
154
  )
155
 
 
 
 
 
 
156
  eval_entry = {
157
  "Model Name": model_name,
158
  "Base Model": model_family,
159
  "URL": url,
160
  "Organization": organization,
161
- "Accuracy": scores / num_questions if num_questions > 0 else 0,
162
  "Accuracy (easy)": accuracy_easy,
163
  "Accuracy (medium)": accuracy_medium,
164
  "Accuracy (hard)": accuracy_hard,
165
- "Answer rate": scores / num_questions if num_questions > 0 else 0,
166
- "Precision": scores / num_questions if num_questions > 0 else 0,
167
- "EM": scores if num_questions > 0 else 0
168
  }
169
  eval_results["test"] = eval_results["test"].add_item(eval_entry)
170
  eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
 
132
  json.dumps({
133
  "id": task_id,
134
  "model_answer": answer,
135
+ "score": score,
136
+ "has_ans": has_ans
137
  }) + "\n"
138
  )
139
 
 
154
  token=TOKEN
155
  )
156
 
157
+ accuracy = float("{:.1f}".format(np.average([x["acc"] for x in scored_file]) * 100))
158
+ coverage = float("{:.1f}".format(np.average([x["has_ans"] for x in scored_file])))
159
+ em = float("{:.1f}".format(np.average([1 if x["acc"] == 1 else 0 for x in scored_file])))
160
+ precision = float("{:.1f}".format(np.average([x["acc"] for x in scored_file if x["has_ans"] == 1])))
161
+
162
  eval_entry = {
163
  "Model Name": model_name,
164
  "Base Model": model_family,
165
  "URL": url,
166
  "Organization": organization,
167
+ "Accuracy": accuracy,
168
  "Accuracy (easy)": accuracy_easy,
169
  "Accuracy (medium)": accuracy_medium,
170
  "Accuracy (hard)": accuracy_hard,
171
+ "Answer rate": coverage,
172
+ "Precision": precision,
173
+ "EM": em
174
  }
175
  eval_results["test"] = eval_results["test"].add_item(eval_entry)
176
  eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)