Clémentine committed
Commit 5f9d165
1 Parent(s): d0c2655

small visu fixes

Files changed (2)
  1. app.py +40 -17
  2. content.py +16 -7
app.py CHANGED
@@ -13,7 +13,7 @@ from huggingface_hub import HfApi
 
 # InfoStrings
 from scorer import question_scorer
- from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
+ from content import format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
 
 TOKEN = os.environ.get("TOKEN", None)
 
@@ -21,7 +21,7 @@ OWNER="gaia-benchmark"
 DATA_DATASET = f"{OWNER}/GAIA"
 INTERNAL_DATA_DATASET = f"{OWNER}/GAIA_internal"
 SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
- RESULTS_DATASET = f"{OWNER}/results"
+ RESULTS_DATASET = f"{OWNER}/results_public"
 LEADERBOARD_PATH = f"{OWNER}/leaderboard"
 api = HfApi()
 
@@ -30,27 +30,40 @@ YEAR_VERSION = "2023"
 os.makedirs("scored", exist_ok=True)
 
 # Display the results
- eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=TOKEN)
- eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
- eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))
+ eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload")
+ def get_dataframe_from_results(eval_results, split):
+     local_df = eval_results[split]
+     local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
+     local_df = local_df.remove_columns(["mail", "system_prompt", "url"])
+     local_df = local_df.rename_column("model", "Model name")
+     local_df = local_df.rename_column("model_family", "Model family")
+     local_df = local_df.rename_column("score", "Average score (%)")
+     for i in [1, 2, 3]:
+         local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
+     df = pd.DataFrame(local_df)
+     return df
+
+ eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
+ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
 
 # Gold answers
 gold_results = {}
- gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", use_auth_token=TOKEN)
+ gold_dataset = load_dataset(INTERNAL_DATA_DATASET, f"{YEAR_VERSION}_all", token=TOKEN)
 gold_results = {split: {row["task_id"]: row for row in gold_dataset[split]} for split in ["test", "validation"]}
 
 
 def restart_space():
     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
-
- COLS = ["Model", "Score ⬆️", "Organisation"]
- TYPES = ["str", "number", "str",]
+ TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]
 
 def add_new_eval(
     val_or_test: str,
     model: str,
-     path_to_file,
+     model_family: str,
+     system_prompt: str,
+     url: str,
+     path_to_file: str,
     organisation: str,
     mail: str,
 ):
@@ -120,6 +133,9 @@ def add_new_eval(
     # Actual submission
     eval_entry = {
         "model": model,
+         "model_family": model_family,
+         "system_prompt": system_prompt,
+         "url": url,
         "organisation": organisation,
         "mail": mail,
         "score": scores["all"]/num_questions["all"],
@@ -131,13 +147,13 @@
     print(eval_results)
     eval_results.push_to_hub(RESULTS_DATASET, config_name = YEAR_VERSION, token=TOKEN)
 
-     return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait for up to an hour to see the score displayed")
+     return format_log(f"Model {model} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
 
 
 def refresh():
-     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, use_auth_token=TOKEN, download_mode="force_redownload")
-     eval_dataframe_val = pd.DataFrame(eval_results["validation"].remove_columns("mail"))
-     eval_dataframe_test = pd.DataFrame(eval_results["test"].remove_columns("mail"))
+     eval_results = load_dataset(RESULTS_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload")
+     eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
+     eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
     return eval_dataframe_val, eval_dataframe_test
 
 def upload_file(files):
@@ -160,11 +176,11 @@ with demo:
 
     with gr.Tab("Results: Validation"):
         leaderboard_table_val = gr.components.Dataframe(
-             value=eval_dataframe_val, headers=COLS, datatype=TYPES, interactive=False,
+             value=eval_dataframe_val, datatype=TYPES, interactive=False,
         )
     with gr.Tab("Results: Test"):
         leaderboard_table_test = gr.components.Dataframe(
-             value=eval_dataframe_test, headers=COLS, datatype=TYPES, interactive=False,
+             value=eval_dataframe_test, datatype=TYPES, interactive=False,
         )
 
     refresh_button = gr.Button("Refresh")
@@ -181,10 +197,14 @@
             with gr.Column():
                 level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                 model_name_textbox = gr.Textbox(label="Model name")
-                 file_output = gr.File()
+                 model_family_textbox = gr.Textbox(label="Model family")
+                 system_prompt_textbox = gr.Textbox(label="System prompt example")
+                 url_textbox = gr.Textbox(label="Url to model information")
             with gr.Column():
                 organisation = gr.Textbox(label="Organisation")
                 mail = gr.Textbox(label="Contact email")
+                 file_output = gr.File()
+
 
         submit_button = gr.Button("Submit Eval")
         submission_result = gr.Markdown()
@@ -193,6 +213,9 @@
             [
                 level_of_test,
                 model_name_textbox,
+                 model_family_textbox,
+                 system_prompt_textbox,
+                 url_textbox,
                 file_output,
                 organisation,
                 mail
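
For reference, the new `get_dataframe_from_results` helper only reshapes the raw results dataset for display. Below is a minimal sketch (not part of this commit) of the same transformation applied to a toy in-memory dataset; the column set, values, and URL are assumptions based on the fields the diff reads and renames.

```python
# Sketch only: mimics the column mapping done in app.py on a hand-built toy dataset.
import pandas as pd
from datasets import Dataset

def model_hyperlink(link, model_name):
    # Simplified stand-in for the helper added to content.py in this commit.
    return f'<a target="_blank" href="{link}">{model_name}</a>'

# Hypothetical single-row results split with the columns app.py expects.
toy_split = Dataset.from_dict({
    "model": ["my-agent"],
    "model_family": ["GPT-4"],
    "url": ["https://example.com/my-agent"],
    "system_prompt": ["You are a general AI assistant."],
    "mail": ["contact@example.com"],
    "organisation": ["Example Org"],
    "score": [0.42],
    "score_level1": [0.60],
    "score_level2": [0.35],
    "score_level3": [0.10],
})

def get_dataframe_from_results(split_dataset):
    # Render the model name as a link, drop private columns, rename the rest for display.
    local_df = split_dataset.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["mail", "system_prompt", "url"])
    local_df = local_df.rename_column("model", "Model name")
    local_df = local_df.rename_column("model_family", "Model family")
    local_df = local_df.rename_column("score", "Average score (%)")
    for i in [1, 2, 3]:
        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
    return pd.DataFrame(local_df)

print(get_dataframe_from_results(toy_split))
```

Because the model column now holds an HTML link, the first entry of the new `TYPES` list is `"markdown"`, which is presumably what lets `gr.components.Dataframe` render the link instead of showing raw HTML.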
content.py CHANGED
@@ -2,18 +2,25 @@ TITLE = """<h1 align="center" id="space-title">GAIA Leaderboard</h1>"""
 
 CANARY_STRING = "" # TODO
 
- INTRODUCTION_TEXT = f"""
+ INTRODUCTION_TEXT = """
 Large language models have seen their potential capabilities increased by several orders of magnitude with the introduction of augmentations, from simple prompting adjustment to actual external tooling (calculators, vision models, ...) or online web retrieval.
-
 To evaluate the next generation of LLMs, we argue for a new kind of benchmark, simple and yet effective to measure actual progress on augmented capabilities.
 We therefore present GAIA.
 
 GAIA is made of 3 evaluation levels, depending on the added level of tooling and autonomy the model needs.
 We expect level 1 to be breakable by very good LLMs, and level 3 to indicate a strong jump in model capabilities.
+ Each of these levels is divided into two sets: a fully public dev set, on which people can test their models, and a test set with private answers and metadata. Results can be submitted for both validation and test.
+
+ We expect submissions to be JSON Lines files with the following format:
+ ```
+ {"task_id": "task_id_1", "model_answer": "Answer 1 from your model", "reasoning_trace": "The different steps by which your model reached answer 1"}
+ {"task_id": "task_id_2", "model_answer": "Answer 2 from your model", "reasoning_trace": "The different steps by which your model reached answer 2"}
+ ...
+ ```
 
- Each of these levels is divided into two sets: a public dev set, on which people can self report their results, and a private test set, which will be unlocked once public performance passes a threshold on the dev set.
+ Scores are expressed as the percentage of correct answers for a given split.
 
- Please do not repost the public dev set, nor use it in training data for your models. Its canary string is """ + CANARY_STRING + """ and files containing this string should be removed from training data.
+ Please do not repost the public dev set, nor use it in training data for your models.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -21,8 +28,6 @@ CITATION_BUTTON_TEXT = r"""@misc{gaia, # TODO
 author = {tbd},
 title = {General AI Assistant benchmark},
 year = {2023},
- #publisher = {Hugging Face},
- #howpublished = "\url{https://huggingface.co/spaces/gaia-benchmark/}"
 }"""
 
 
@@ -30,4 +35,8 @@ def format_warning(msg):
     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
 
 def format_log(msg):
-     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+     return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
+
+ def model_hyperlink(link, model_name):
+     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
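
The updated `INTRODUCTION_TEXT` above documents the expected submission format. As a minimal sketch (not part of this commit), a submission file could be produced like this; the task ids, answers, and the `submission.jsonl` file name are placeholders:

```python
# Sketch only: write predictions in the JSON Lines format described in INTRODUCTION_TEXT.
import json

predictions = [
    {"task_id": "task_id_1", "model_answer": "Answer 1 from your model",
     "reasoning_trace": "The different steps by which your model reached answer 1"},
    {"task_id": "task_id_2", "model_answer": "Answer 2 from your model",
     "reasoning_trace": "The different steps by which your model reached answer 2"},
]

with open("submission.jsonl", "w") as f:
    for row in predictions:
        f.write(json.dumps(row) + "\n")  # one JSON object per line
```

Each line holds one JSON object per task; scores are then reported as the percentage of correct answers for the chosen split, as the text states.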