Commit ed33da8 by djstrong (parent: 36e06b6)

add metadata and filters

Files changed (5):
  1. app.py +3 -2
  2. src/about.py +20 -19
  3. src/display/utils.py +9 -7
  4. src/leaderboard/read_evals.py +35 -2
  5. src/populate.py +23 -22
app.py CHANGED
@@ -173,7 +173,7 @@ with demo:
         value=[t.to_str() for t in ModelType],
         interactive=True,
         elem_id="filter-columns-type",
-        visible=False,
+        visible=True,
     )
     filter_columns_precision = gr.CheckboxGroup(
         label="Precision",
@@ -189,7 +189,7 @@ with demo:
         value=list(NUMERIC_INTERVALS.keys()),
         interactive=True,
         elem_id="filter-columns-size",
-        visible=False,
+        visible=True,
     )
     filter_columns_nshot = gr.CheckboxGroup(
         label="N-shot",
@@ -238,6 +238,7 @@ with demo:
         interactive=False,
         visible=True,
         # column_widths=["2%", "33%"]
+        height=900
     )

     # Dummy leaderboard for handling the case when the user uses backspace key
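The app.py changes only touch the Gradio UI: the Type and Size filter CheckboxGroups flip from visible=False to visible=True, and the results Dataframe gets a fixed height of 900 px. A minimal, self-contained sketch of how such a filter is typically wired to the table; the callback name, sample data, and columns below are illustrative assumptions, not taken from this repository:

# Minimal sketch: a visible CheckboxGroup filtering a fixed-height Dataframe.
# `filter_by_type` and SAMPLE are invented for illustration.
import gradio as gr
import pandas as pd

SAMPLE = pd.DataFrame({"Model": ["model-a", "model-b"],
                       "Type": ["pretrained", "instruction-tuned"]})

def filter_by_type(selected_types):
    # Keep only rows whose Type is among the checked values.
    return SAMPLE[SAMPLE["Type"].isin(selected_types)]

with gr.Blocks() as demo:
    filter_columns_type = gr.CheckboxGroup(
        label="Type",
        choices=sorted(SAMPLE["Type"].unique()),
        value=sorted(SAMPLE["Type"].unique()),
        interactive=True,
        elem_id="filter-columns-type",
        visible=True,   # this commit flips the filters from hidden to visible
    )
    table = gr.Dataframe(value=SAMPLE, interactive=False, visible=True, height=900)
    filter_columns_type.change(filter_by_type, inputs=filter_columns_type, outputs=table)

if __name__ == "__main__":
    demo.launch()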
src/about.py CHANGED
@@ -1,35 +1,36 @@
 from dataclasses import dataclass
 from enum import Enum

-@dataclass
+@dataclass(frozen=True)
 class Task:
     benchmark: str
     metric: str
     col_name: str
+    type: str


 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn")
-    task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g")
-    task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2_in_mc")
-    task5 = Task("polemo2_out", "exact_match,score-first", "polemo2_out_g")
-    task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2_out_mc")
-    task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc")
-    task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g")
-    task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g")
-    task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc")
-    task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g")
-    task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc")
-    task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g")
-    task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc")
-    task15 = Task("polish_psc_regex", "f1,score-first", "psc_g")
-    task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc")
-    task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g")
-    task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc")
-    task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g")
+    task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice")
+    task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until")
+    task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice")
+    task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until")
+    task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
+    task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
+    task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
+    task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
+    task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
+    task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
+    task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice")
+    task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until")
+    task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice")
+    task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until")
+    task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice")
+    task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
+    task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
+    task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
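The Task dataclass gains a fourth field, type, which tags every benchmark as either a generation task ("generate_until") or a multiple-choice task ("multiple_choice"), and the class is now frozen, so Task instances are immutable and hashable. A small standalone sketch of the same pattern with invented task names (the real tasks are the Polish benchmarks listed above):

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)   # frozen -> immutable, hashable instances
class Task:
    benchmark: str        # task key in the results json
    metric: str           # metric key in the results json
    col_name: str         # column name shown in the leaderboard
    type: str             # "generate_until" or "multiple_choice"

class Tasks(Enum):
    demo_g = Task("demo_regex", "exact_match,score-first", "demo_g", "generate_until")
    demo_mc = Task("demo_multiple_choice", "acc,none", "demo_mc", "multiple_choice")

# The type tag is what read_evals.py uses below to build per-type averages.
mc_benchmarks = [t.value.benchmark for t in Tasks if t.value.type == "multiple_choice"]
print(mc_benchmarks)  # ['demo_multiple_choice']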
src/display/utils.py CHANGED
@@ -29,15 +29,17 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
+auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", True)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
@@ -67,9 +69,9 @@ class ModelDetails:

 class ModelType(Enum):
     PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    CPT = ModelDetails(name="continuously pretrained", symbol="🟩")
     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    RL = ModelDetails(name="RL-tuned", symbol="💬")
     Unknown = ModelDetails(name="", symbol="?")

     def to_str(self, separator=" "):
@@ -77,11 +79,11 @@ class ModelType(Enum):

     @staticmethod
     def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
+        if "continuously pretrained" in type or "🟩" in type:
+            return ModelType.CPT
         if "pretrained" in type or "🟢" in type:
             return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
+        if "RL-tuned" in type or "💬" in type:
             return ModelType.RL
         if "instruction-tuned" in type or "⭕" in type:
             return ModelType.IFT
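Two things change here: the leaderboard gains "Avg g" / "Avg mc" score columns plus default-visible Type and #Params columns, and the fine-tuned category is replaced by continuously pretrained, with new symbols for CPT and RL. One detail worth noting in from_str is the check order: "continuously pretrained" contains the substring "pretrained", so the CPT branch must run before the PT branch. A self-contained sketch of the enum as it looks after this commit; the ModelDetails fields and the final fallback are simplified assumptions relative to the real file:

from dataclasses import dataclass
from enum import Enum

@dataclass
class ModelDetails:          # simplified: the real class carries a few more fields
    name: str
    symbol: str = ""

class ModelType(Enum):
    PT = ModelDetails(name="pretrained", symbol="🟢")
    CPT = ModelDetails(name="continuously pretrained", symbol="🟩")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="RL-tuned", symbol="💬")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        # "continuously pretrained" must be tested before "pretrained",
        # otherwise the plain pretrained branch would shadow it.
        if "continuously pretrained" in type or "🟩" in type:
            return ModelType.CPT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
        if "RL-tuned" in type or "💬" in type:
            return ModelType.RL
        if "instruction-tuned" in type or "⭕" in type:
            return ModelType.IFT
        return ModelType.Unknown

print(ModelType.from_str("🟩 continuously pretrained"))  # ModelType.CPT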
src/leaderboard/read_evals.py CHANGED
@@ -106,8 +106,22 @@ class EvalResult:
             n_shot=NShotType.from_str(n_shot_num)
         )

+    def update_with_metadata(self, metadata):
+        #print('UPDATE', self.full_model, self.model, self.eval_name)
+        try:
+            meta=metadata[self.full_model]
+            self.model_type = ModelType.from_str(meta.get("type", "?"))
+            self.num_params = meta.get("params", 0)
+            self.license = meta.get("license", "?")
+            # self.lang = meta.get("lang", "?") #TODO
+            #TODO desc name
+        except KeyError:
+            print(f"Could not find metadata for {self.full_model}")
+
+
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
+        return
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

         try:
@@ -125,7 +139,13 @@
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        print('average', average)
+        g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+        mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+
+        average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
+        average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
+
+
         data_dict={}
         # data_dict = {
         #     "eval_name": self.eval_name, # not a column, just a save name,
@@ -202,6 +222,16 @@
         except KeyError:
             print(f"Could not find average")

+        try:
+            data_dict[AutoEvalColumn.average_g.name] = average_g
+        except KeyError:
+            print(f"Could not find average_g")
+
+        try:
+            data_dict[AutoEvalColumn.average_mc.name] = average_mc
+        except KeyError:
+            print(f"Could not find average_mc")
+
         try:
             data_dict[AutoEvalColumn.license.name] = self.license
         except KeyError:
@@ -267,7 +297,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file


-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []

@@ -291,6 +321,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath, n_shot_num=n_shot)
         eval_result.update_with_request_file(requests_path)
+        #update with metadata
+        eval_result.update_with_metadata(metadata)
+

         # Store results of same eval together
         eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
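update_with_metadata() fills the model type, parameter count, and license from a metadata dictionary keyed by the full model name, while update_with_request_file() is short-circuited with an early return, and to_dict() now emits the two per-type averages. Note that, like the existing overall average, average_g and average_mc divide by the total number of tasks of that type rather than by the number of available scores, so a missing (None) result effectively counts as zero. A toy illustration of that grouping logic with invented scores:

# Invented scores for two generation tasks and two multiple-choice tasks;
# only the grouping/averaging mirrors the to_dict() change above.
results = {
    "polemo2_in": 0.60,                   # generate_until
    "polish_cbd_regex": None,             # generate_until, missing score
    "polemo2_in_multiple_choice": 0.80,   # multiple_choice
    "polish_cbd_multiple_choice": 0.70,   # multiple_choice
}
g_tasks = ["polemo2_in", "polish_cbd_regex"]
mc_tasks = ["polemo2_in_multiple_choice", "polish_cbd_multiple_choice"]

average_g = sum(v for t, v in results.items() if v is not None and t in g_tasks) / len(g_tasks)
average_mc = sum(v for t, v in results.items() if v is not None and t in mc_tasks) / len(mc_tasks)
print(average_g, average_mc)  # 0.3 0.75 -- the None score drags average_g down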
src/populate.py CHANGED
@@ -9,7 +9,8 @@ from src.leaderboard.read_evals import get_raw_eval_results


 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    metadata=json.load(open(f"{requests_path}/metadata.json"))
+    raw_data = get_raw_eval_results(results_path, requests_path, metadata)
     all_data_json = [v.to_dict() for v in raw_data]
     print(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
@@ -25,27 +26,27 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []

-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
+    # for entry in entries:
+    #     if ".json" in entry:
+    #         file_path = os.path.join(save_path, entry)
+    #         with open(file_path) as fp:
+    #             data = json.load(fp)
+    #
+    #         data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+    #         data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+    #
+    #         all_evals.append(data)
+    #     elif ".md" not in entry:
+    #         # this is a folder
+    #         sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+    #         for sub_entry in sub_entries:
+    #             file_path = os.path.join(save_path, entry, sub_entry)
+    #             with open(file_path) as fp:
+    #                 data = json.load(fp)
+    #
+    #             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+    #             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+    #             all_evals.append(data)

     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
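get_leaderboard_df() now reads metadata.json from the requests folder and threads it through get_raw_eval_results() into update_with_metadata(); the old evaluation-queue parsing loop is commented out, so the queue tables are currently built from an empty list. Judging from how update_with_metadata() consumes it, metadata.json is presumably a dict keyed by the full model name with "type", "params", and "license" entries; the sample entry and helper below are illustrative assumptions, not taken from the repository:

import json

# Hypothetical shape of <requests_path>/metadata.json implied by update_with_metadata():
# {
#     "some-org/some-model-7b": {
#         "type": "⭕ instruction-tuned",
#         "params": 7.24,
#         "license": "apache-2.0"
#     }
# }

def load_metadata(requests_path: str) -> dict:
    # Same effect as the json.load(open(...)) one-liner in the diff,
    # but closes the file handle explicitly.
    with open(f"{requests_path}/metadata.json") as fp:
        return json.load(fp)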