Ori committed on
Commit c1ec713 • 1 Parent(s): 4a825f1

Update app.py

Files changed (1)
  1. app.py +38 -13
app.py CHANGED
@@ -3,7 +3,6 @@ import json
 import datetime
 from email.utils import parseaddr
 
-
 import gradio as gr
 import pandas as pd
 from datasets import load_dataset
@@ -11,7 +10,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi
 from content import format_error, format_warning, format_log, TITLE
 
-
 # Placeholder for the question_scorer function
 def question_scorer(prediction, gold_answer):
     return 1 if prediction == gold_answer else 0
@@ -34,7 +32,9 @@ os.makedirs("scored", exist_ok=True)
 eval_results = load_dataset(RESULTS_DATASET, token=TOKEN, download_mode="force_redownload",
                             ignore_verifications=True, trust_remote_code=True)
 gold_results = load_dataset(DATA_DATASET, token=TOKEN, trust_remote_code=True)
+
 gold_answers = {split: {row["id"]: row["answer"] for row in gold_results[split]} for split in ["test"]}
+gold_difficulties = {split: {row["id"]: row["difficulty"] for row in gold_results[split]} for split in ["test"]}
 
 
 # Function to get dataframe from results
@@ -46,8 +46,18 @@ def get_dataframe_from_results(eval_results, split):
     df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
     return df
 
+# Helper to format the leaderboard dataframe for display
+def format_dataframe(df):
+    df["Accuracy"] = df["Accuracy"].apply(lambda x: f"**{x:.2f}**")
+    if "URL" in df.columns:
+        df["Model Name"] = df.apply(lambda row: f"[{row['Model Name']}]({row['URL']})", axis=1)
+        df = df.drop(columns=["URL"])
+    df = df.rename(columns={"Model Family": "Base Model"})
+    df = df[["Model Name", "Accuracy", "Accuracy (easy)", "Accuracy (medium)", "Accuracy (hard)", "Answer rate", "Precision", "EM", "Base Model", "Organization"]]
+    return df
 
 eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+eval_dataframe_test = format_dataframe(eval_dataframe_test)
 
 
 # Function to restart the space
@@ -55,7 +65,7 @@ def restart_space():
     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
 
-TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]
+TYPES = ["markdown", "markdown", "number", "number", "number", "number", "number", "number", "str", "str"]
 
 
 # Function to add a new evaluation
@@ -92,6 +102,10 @@ def add_new_eval(
     file_path = path_to_file.name
     scores = 0
     num_questions = 0
+
+    difficulty_scores = {"Easy": 0, "Medium": 0, "Hard": 0}
+    difficulty_counts = {"Easy": 0, "Medium": 0, "Hard": 0}
+
     with open(f"scored/{organization}_{model_name}.jsonl", "w") as scored_file:
         with open(file_path, 'r') as f:
             for ix, line in enumerate(f):
@@ -111,6 +125,8 @@ def add_new_eval(
                         f"{task_id} not found in test set. Are you sure you submitted the correct file?")
 
                 score = question_scorer(task['answer'], gold_answers["test"][task_id])
+                difficulty = gold_difficulties["test"][task_id]
+
                 scored_file.write(
                     json.dumps({
                         "id": task_id,
@@ -118,8 +134,15 @@ def add_new_eval(
                         "score": score
                     }) + "\n"
                 )
+
                 scores += score
                 num_questions += 1
+                difficulty_scores[difficulty] += score
+                difficulty_counts[difficulty] += 1
+
+    accuracy_easy = difficulty_scores["Easy"] / difficulty_counts["Easy"] if difficulty_counts["Easy"] > 0 else 0
+    accuracy_medium = difficulty_scores["Medium"] / difficulty_counts["Medium"] if difficulty_counts["Medium"] > 0 else 0
+    accuracy_hard = difficulty_scores["Hard"] / difficulty_counts["Hard"] if difficulty_counts["Hard"] > 0 else 0
 
     api.upload_file(
         repo_id=SUBMISSION_DATASET,
@@ -131,14 +154,16 @@ def add_new_eval(
 
     eval_entry = {
         "Model Name": model_name,
-        "Model Family": model_family,
+        "Base Model": model_family,
         "URL": url,
         "Organization": organization,
         "Accuracy": scores / num_questions if num_questions > 0 else 0,
+        "Accuracy (easy)": accuracy_easy,
+        "Accuracy (medium)": accuracy_medium,
+        "Accuracy (hard)": accuracy_hard,
         "Answer rate": scores / num_questions if num_questions > 0 else 0,
         "Precision": scores / num_questions if num_questions > 0 else 0,
-        "EM": scores if num_questions > 0 else 0,
-        "Cost": 0,  # Placeholder for cost, update with actual value if needed
+        "EM": scores if num_questions > 0 else 0
     }
     eval_results["test"] = eval_results["test"].add_item(eval_entry)
     eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)
@@ -152,6 +177,7 @@ def refresh():
     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload",
                                 ignore_verifications=True, trust_remote_code=True)
    eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
+    eval_dataframe_test = format_dataframe(eval_dataframe_test)
     return eval_dataframe_test
 
 
@@ -185,17 +211,16 @@ with demo:
     with gr.Accordion("Submit a new model for evaluation"):
         with gr.Row():
             gr.Markdown("""
-            To make a new submission, upload a predictions file. We support JSONL files with the following format:
+            To make a new submission, upload a predictions file. Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py). We support JSONL files with the following format:
             ```
             {"id": "task_id_1", "answer": "Answer 1 from your model"}
            {"id": "task_id_2", "answer": "Answer 2 from your model"}
             ```
-            Our scoring function can be found [here](https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/scorer.py).
             """)
         with gr.Row():
             with gr.Column():
                 model_name_textbox = gr.Textbox(label="Model Name")
-                model_family_textbox = gr.Textbox(label="Model Family")
+                model_family_textbox = gr.Textbox(label="Base Model")
                 url_textbox = gr.Textbox(label="URL to Model Information")
             with gr.Column():
                 organization = gr.Textbox(label="Organization")
@@ -220,11 +245,11 @@ with demo:
 
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
-            citation_text = """@article{yoran-etal-2023-assistantbench,
+            citation_text = """@article{yoran-etal-2024-assistantbench,
             title={AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?},
             author={Ori Yoran and Samuel Amouyal and Chaitanya Malaviya and Ben Bogin and Ofir Press and Jonathan Berant},
             year={2024},
-            eprint={TODO},
+            eprint={?},
             archivePrefix={arXiv},
             primaryClass={cs.CL}
             }"""
@@ -237,9 +262,9 @@ with demo:
     )
 
     gr.HTML(
-        "<p>We would like to thank the GAIA team on which this leaderboard is based on their template and HuggingFace for hosting the leaderboard.</p>")
+        "<p>We would like to thank the GAIA team for sharing the source code for their leaderboard which we used as a template and HuggingFace for hosting the leaderboard.</p>")
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
-demo.launch(debug=True)
+demo.launch(debug=True)
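
The substance of this commit is per-difficulty accuracy tracking (the new `gold_difficulties` map plus the Easy/Medium/Hard counters) and a `format_dataframe` helper that prepares the leaderboard for display. Below is a minimal, self-contained sketch of that logic on toy data; the task ids, answers, difficulty labels, and model metadata are invented for illustration, and the formatting steps are restated here rather than imported from app.py.

```python
# Illustrative sketch only: toy data, not the real AssistantBench gold set.
import pandas as pd

# Hypothetical gold answers and difficulty labels keyed by task id.
gold_answers = {"t1": "42", "t2": "Paris", "t3": "blue"}
gold_difficulties = {"t1": "Easy", "t2": "Medium", "t3": "Hard"}

# Hypothetical model predictions, mirroring the JSONL submission format.
predictions = [
    {"id": "t1", "answer": "42"},
    {"id": "t2", "answer": "London"},
    {"id": "t3", "answer": "blue"},
]

def question_scorer(prediction, gold_answer):
    # Same placeholder exact-match scorer as in app.py.
    return 1 if prediction == gold_answer else 0

# Per-difficulty aggregation, following the counters added in this commit.
scores, num_questions = 0, 0
difficulty_scores = {"Easy": 0, "Medium": 0, "Hard": 0}
difficulty_counts = {"Easy": 0, "Medium": 0, "Hard": 0}
for task in predictions:
    score = question_scorer(task["answer"], gold_answers[task["id"]])
    difficulty = gold_difficulties[task["id"]]
    scores += score
    num_questions += 1
    difficulty_scores[difficulty] += score
    difficulty_counts[difficulty] += 1

accuracy = scores / num_questions if num_questions > 0 else 0
per_difficulty = {
    d: difficulty_scores[d] / difficulty_counts[d] if difficulty_counts[d] > 0 else 0
    for d in ("Easy", "Medium", "Hard")
}
print(accuracy, per_difficulty)  # 0.666..., {'Easy': 1.0, 'Medium': 0.0, 'Hard': 1.0}

# Display formatting along the lines of the new format_dataframe helper:
# bold the accuracy, turn the model name into a markdown link, drop the URL column.
df = pd.DataFrame([{
    "Model Name": "ToyModel", "URL": "https://example.com", "Model Family": "ToyBase",
    "Accuracy": accuracy * 100, "Accuracy (easy)": per_difficulty["Easy"] * 100,
    "Accuracy (medium)": per_difficulty["Medium"] * 100, "Accuracy (hard)": per_difficulty["Hard"] * 100,
    "Answer rate": accuracy * 100, "Precision": accuracy * 100, "EM": scores, "Organization": "ToyOrg",
}])
df["Accuracy"] = df["Accuracy"].apply(lambda x: f"**{x:.2f}**")
if "URL" in df.columns:
    df["Model Name"] = df.apply(lambda row: f"[{row['Model Name']}]({row['URL']})", axis=1)
    df = df.drop(columns=["URL"])
df = df.rename(columns={"Model Family": "Base Model"})
print(df.to_string(index=False))
```

Bolding and linkifying are done as markdown strings because the updated TYPES list now declares the first two leaderboard columns ("Model Name" and "Accuracy") as "markdown".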