yzabc007 committed on
Commit: bfb3ae7
Parent(s): f99d80b

Update space

Files changed (5):
  1. app.py +3 -2
  2. src/about.py +16 -1
  3. src/display/utils.py +57 -16
  4. src/leaderboard/read_evals.py +50 -18
  5. src/populate.py +17 -1
app.py CHANGED
@@ -25,7 +25,7 @@ from src.display.utils import (
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_leaderboard_df
 from src.submission.submit import add_new_eval
 
 
@@ -97,6 +97,7 @@ def init_leaderboard(dataframe):
     )
 
 
+model_leaderboard_df = get_model_leaderboard_df()
 
 def overall_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -124,7 +125,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
         with gr.TabItem("🏅 Overview", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = overall_leaderboard(LEADERBOARD_DF)
+            leaderboard = init_leaderboard(LEADERBOARD_DF)
 
 
         with gr.TabItem("🎯 Overall", elem_id="llm-benchmark-tab-table", id=1):
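Note: the new module-level call get_model_leaderboard_df() pairs with the helper added in src/populate.py below, whose signature takes a results path, a requests path, and column lists. A hedged sketch of a call matching that signature; EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS and BENCHMARK_COLS are assumptions carried over from the stock leaderboard template, not values introduced by this commit:

# Sketch only: the imported constants below are assumed to exist as in the
# stock leaderboard template; they are not defined by this commit.
from src.envs import EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH
from src.display.utils import COLS, BENCHMARK_COLS
from src.populate import get_model_leaderboard_df

# Build the per-model dataframe once at startup and reuse it across tabs.
model_leaderboard_df = get_model_leaderboard_df(
    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
)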
src/about.py CHANGED
@@ -1,6 +1,21 @@
 from dataclasses import dataclass
 from enum import Enum
 
+@dataclass
+class Domain:
+    dimension: str
+    metric: str
+    col_name: str
+
+
+class Domains(Enum):
+    # dimension_key in the json file, metric_key in the json file, name to display in the leaderboard
+    dim0 = Domain("overall", "Avg Rank", "Overall")
+    # dim1 = Task("math", "acc", "Math")
+    # dim2 = Task("math_algebra", "acc", "Algebra")
+
+
+
 @dataclass
 class Task:
     benchmark: str
@@ -14,7 +29,7 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("anli_r1", "acc", "ANLI")
     task1 = Task("logiqa", "acc_norm", "LogiQA")
-
+
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
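Note: downstream code iterates Domains exactly like Tasks, reading the dimension key, metric key, and display name from each member. A minimal sketch of that access pattern against the definitions above:

from src.about import Domains

# Each member carries the JSON dimension key, the metric key, and the column title.
for domain in Domains:
    print(domain.name, domain.value.dimension, domain.value.metric, domain.value.col_name)
# prints: dim0 overall Avg Rank Overall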
src/display/utils.py CHANGED
@@ -1,9 +1,9 @@
-from dataclasses import dataclass, make_dataclass
+from dataclasses import dataclass, make_dataclass, field
 from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
+from src.about import Tasks, Domains
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -22,26 +22,67 @@ class ColumnContent:
 
 ## Leaderboard columns
 auto_eval_column_dict = []
+# # Init
+# auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+
+# # new columns
+# for domain in Domains:
+#     auto_eval_column_dict.append([domain.name, ColumnContent, ColumnContent(domain.value.col_name, "number", True)])
+
+# auto_eval_column_dict.append(["organization", ColumnContent, ColumnContent("Organization", "str", False)])
+# auto_eval_column_dict.append(["knowledge_cutoff", ColumnContent, ColumnContent("Knowledge cutoff", "str", False)])
+
+
+# for task in Tasks:
+#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
+# #Scores
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# # Model information
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))])
+auto_eval_column_dict.append(["license", ColumnContent, field(default_factory=lambda: ColumnContent("Hub License", "str", False))])
+
+# new columns
+for domain in Domains:
+    auto_eval_column_dict.append([domain.name, ColumnContent, field(default_factory=lambda: ColumnContent(domain.value.col_name, "number", True))])
+
+auto_eval_column_dict.append(["organization", ColumnContent, field(default_factory=lambda: ColumnContent("Organization", "str", False))])
+auto_eval_column_dict.append(["knowledge_cutoff", ColumnContent, field(default_factory=lambda: ColumnContent("Knowledge cutoff", "str", False))])
+
+
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append([task.name, ColumnContent, field(default_factory=lambda: ColumnContent(task.value.col_name, "number", True))])
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, field(default_factory=lambda: ColumnContent("T", "str", True, never_hidden=True))])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, field(default_factory=lambda: ColumnContent("Average ⬆️", "number", True))])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(["model_type", ColumnContent, field(default_factory=lambda: ColumnContent("Type", "str", False))])
+auto_eval_column_dict.append(["architecture", ColumnContent, field(default_factory=lambda: ColumnContent("Architecture", "str", False))])
+auto_eval_column_dict.append(["weight_type", ColumnContent, field(default_factory=lambda: ColumnContent("Weight type", "str", False, True))])
+auto_eval_column_dict.append(["precision", ColumnContent, field(default_factory=lambda: ColumnContent("Precision", "str", False))])
+auto_eval_column_dict.append(["params", ColumnContent, field(default_factory=lambda: ColumnContent("#Params (B)", "number", False))])
+auto_eval_column_dict.append(["likes", ColumnContent, field(default_factory=lambda: ColumnContent("Hub ❤️", "number", False))])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, field(default_factory=lambda: ColumnContent("Available on the hub", "bool", False))])
+auto_eval_column_dict.append(["revision", ColumnContent, field(default_factory=lambda: ColumnContent("Model sha", "str", False, False))])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+# print all attributes of AutoEvalColumn
+print(AutoEvalColumn.__annotations__.keys())
+# preint precision attribute
+print(AutoEvalColumn.precision)
+
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
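Note: the switch from passing ColumnContent(...) instances directly to field(default_factory=...) matters because recent Python versions (3.11+) reject unhashable dataclass defaults, and a non-frozen dataclass instance is unhashable. A standalone sketch of the pattern; ColumnContent is re-declared here with the stock template's field names so the snippet runs on its own:

from dataclasses import dataclass, field, make_dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# Passing ColumnContent(...) directly as a default raises
# "mutable default ... is not allowed: use default_factory" on Python 3.11+.
columns = [
    ["model", ColumnContent, field(default_factory=lambda: ColumnContent("Model", "markdown", True, never_hidden=True))],
    ["license", ColumnContent, field(default_factory=lambda: ColumnContent("Hub License", "str", False))],
]
AutoEvalColumnSketch = make_dataclass("AutoEvalColumnSketch", columns, frozen=True)

# With default_factory the default values live on instances rather than on the
# class, so the column metadata is read from an instance here.
print(AutoEvalColumnSketch().model.name)    # Model
print(AutoEvalColumnSketch().license.name)  # Hub License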
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, Domains
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -18,34 +18,57 @@ class ModelResult:
     """
     eval_name: str
    full_model: str
+    org: str
+    model: str
+    results: dict
+    license: str = "?"
+    knowledge_cutoff: str = ""
 
     @classmethod
-    def init_from_jsonl_file(self, json_filepath):
-        try:
-            with open(json_filepath) as fp:
-                data = json.load(fp)
-        except:
-            data = eval(open(json_filepath).read()) # a list of dicts
-
+    def init_from_json_dict(self, data):
 
+        config = data.get("config")
+        # Get model and org
+        model = config.get("model_name")
+        org = config.get("org_name")
+        license = config.get("license")
+        knowledge_cutoff = config.get("knowledge_cutoff")
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for domain in Domains:
+            domain = domain.value
+            results[domain.dimension] = data.get("results").get(domain.metric)
 
-
-        return
+        return self(
+            eval_name=f"{org}_{model}",
+            full_model=f"{org}/{model}",
+            org=org,
+            model=model,
+            results=results,
+            license=license,
+            knowledge_cutoff=knowledge_cutoff
+        )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+
+        average = 1 / self.results[Domains.dim0.dimension] if self.results[Domains.dim0.dimension] != 0 else 0
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
        data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.organization.name: self.org,
+            AutoEvalColumn.knowledge_cutoff.name: self.knowledge_cutoff,
+
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
@@ -54,6 +77,9 @@ class ModelResult:
         for task in Tasks:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
+        for domain in Domains:
+            data_dict[domain.value.col_name] = self.results[domain.value.dimension]
+
         return data_dict
 
 @dataclass
@@ -154,6 +180,7 @@ class EvalResult:
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        print(AutoEvalColumn.precision.name, self.precision.value.name)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -242,14 +269,19 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
 def get_raw_model_results(results_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = results_path
+
+    try:
+        with open(results_path) as fp:
+            data = json.load(fp)
+    except:
+        data = eval(open(results_path).read()) # a list of dicts
 
     eval_results = {}
-
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
+    for result in data:
+        # Creation of result
+        eval_result = ModelResult.init_from_json_dict(result)
+
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
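Note: get_raw_model_results now expects the results file to contain a list of per-model entries, each with a "config" block and a "results" block keyed by the Domain metric. An illustrative entry follows; all values are placeholders, only the key names are implied by the reader above:

# Placeholder values; only the key names follow ModelResult.init_from_json_dict.
example_entry = {
    "config": {
        "model_name": "example-model",
        "org_name": "example-org",
        "license": "apache-2.0",
        "knowledge_cutoff": "2023-10",
    },
    "results": {
        "Avg Rank": 1.5,   # looked up via Domain.metric for the "overall" dimension
    },
}

Given such an entry, ModelResult.init_from_json_dict would produce eval_name "example-org_example-model" and full_model "example-org/example-model".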
src/populate.py CHANGED
@@ -5,12 +5,28 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_results
 
 
+def get_model_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_model_results(results_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    # df = df[has_no_nan_values(df, benchmark_cols)]
+    return df
+
+
+
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    # raw_data = get_raw_model_results(results_path)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
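Note: the core of get_model_leaderboard_df is a records-to-dataframe round trip followed by a sort and a column selection. A standalone sketch of the same transformation; the records and values below are made up for illustration:

import pandas as pd

# Made-up records standing in for [v.to_dict() for v in raw_data].
records = [
    {"Model": "model-a", "Average ⬆️": 0.67, "Overall": 1.5},
    {"Model": "model-b", "Average ⬆️": 0.50, "Overall": 2.0},
]
df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["Average ⬆️"], ascending=False)  # best models first
df = df[["Model", "Overall"]].round(decimals=2)          # keep only display columns
print(df)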