juncliu committed on
Commit
f5303bc
·
1 Parent(s): 9087f3e

update code and result files

Browse files
README.md CHANGED
@@ -7,7 +7,7 @@ sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
  license: apache-2.0
10
- short_description: 'GIFT-Eval: A Benchmark for General Time Series Forecasting M'
11
  sdk_version: 4.44.0
12
  ---
13
 
@@ -43,4 +43,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
43
  You'll find
44
  - the main table's column names and properties in `src/display/utils.py`
45
  - the logic to read all results and request files, then convert them into dataframe rows, in `src/leaderboard/read_evals.py` and `src/populate.py`
46
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
7
  app_file: app.py
8
  pinned: true
9
  license: apache-2.0
10
+ short_description: 'GIFT-Eval: A Benchmark for General Time Series Forecasting'
11
  sdk_version: 4.44.0
12
  ---
13
 
 
43
  You'll find
44
  - the main table's column names and properties in `src/display/utils.py`
45
  - the logic to read all results and request files, then convert them into dataframe rows, in `src/leaderboard/read_evals.py` and `src/populate.py`
46
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -15,17 +16,16 @@ from src.about import (
15
  from src.display.css_html_js import custom_css
16
  from src.display.utils import (
17
  BENCHMARK_COLS,
18
- COLS,
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
- AutoEvalColumn,
22
  ModelType,
23
  fields,
24
  WeightType,
25
  Precision
26
  )
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
  from src.utils import norm_sNavie, pivot_df
31
  # import ipdb
@@ -83,6 +83,16 @@ term_length_df = pivot_df('results/grouped_results_by_term_length.csv', tab_name
83
  print(f'Term length dataframe is {term_length_df}')
84
  variate_type_df = pivot_df('results/grouped_results_by_univariate.csv', tab_name='univariate')
85
  print(f'Variate type dataframe is {variate_type_df}')
 
 
 
 
 
 
 
 
 
 
86
 
87
  # (
88
  # finished_eval_queue_df,
@@ -91,20 +101,32 @@ print(f'Variate type dataframe is {variate_type_df}')
91
  # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
92
 
93
 
94
- def init_leaderboard(dataframe):
95
- if dataframe is None or dataframe.empty:
96
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
 
 
 
 
 
 
 
 
97
  return Leaderboard(
98
- value=dataframe,
99
- datatype=[c.type for c in fields(AutoEvalColumn)],
100
  select_columns=SelectColumns(
101
- # default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.name not in ['params', 'available_on_hub', 'hub', 'Model sha','Hub License']],
102
- default_selection=list(dataframe.columns),
103
- cant_deselect=['model'],
104
- label="Select Datasets to Display:",
 
 
 
105
  # How to uncheck??
106
  ),
107
-
108
  search_columns=['model'],
109
  # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
110
  # filter_columns=[
@@ -121,7 +143,10 @@ def init_leaderboard(dataframe):
121
  # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
122
  # ),
123
  # ],
124
- bool_checkboxgroup_label="Hide models",
 
 
 
125
  interactive=False,
126
  )
127
 
@@ -133,19 +158,19 @@ with demo:
133
 
134
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
135
  with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
136
- leaderboard = init_leaderboard(domain_df)
137
  print(f"FINAL Domain LEADERBOARD 1 {domain_df}")
138
 
139
  with gr.TabItem("🏅 By Frequency", elem_id="llm-benchmark-tab-table", id=1):
140
- leaderboard = init_leaderboard(freq_df)
141
  print(f"FINAL Frequency LEADERBOARD 1 {freq_df}")
142
 
143
  with gr.TabItem("🏅 By term length", elem_id="llm-benchmark-tab-table", id=2):
144
- leaderboard = init_leaderboard(term_length_df)
145
  print(f"FINAL term length LEADERBOARD 1 {term_length_df}")
146
 
147
  with gr.TabItem("🏅 By variate type", elem_id="llm-benchmark-tab-table", id=3):
148
- leaderboard = init_leaderboard(variate_type_df)
149
  print(f"FINAL LEADERBOARD 1 {variate_type_df}")
150
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
151
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
1
  import gradio as gr
2
+ import ipdb
3
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
 
16
  from src.display.css_html_js import custom_css
17
  from src.display.utils import (
18
  BENCHMARK_COLS,
 
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
+ ModelInfoColumn,
22
  ModelType,
23
  fields,
24
  WeightType,
25
  Precision
26
  )
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_info_df, get_merged_df
29
  from src.submission.submit import add_new_eval
30
  from src.utils import norm_sNavie, pivot_df
31
  # import ipdb
 
83
  print(f'Term length dataframe is {term_length_df}')
84
  variate_type_df = pivot_df('results/grouped_results_by_univariate.csv', tab_name='univariate')
85
  print(f'Variate type dataframe is {variate_type_df}')
86
+ model_info_df = get_model_info_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
87
+
88
+ # domain_df = get_merged_df(domain_df, model_info_df)
89
+ # print('Merged domain df: ', domain_df)
90
+ # freq_df = get_merged_df(freq_df, model_info_df)
91
+ # print('Merged freq df: ', freq_df)
92
+ # term_length_df = get_merged_df(term_length_df, model_info_df)
93
+ # print('Merged term length df: ', term_length_df)
94
+ # variate_type_df = get_merged_df(variate_type_df, model_info_df)
95
+ # print('Merged variate type df: ', variate_type_df)
96
 
97
  # (
98
  # finished_eval_queue_df,
 
101
  # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
102
 
103
 
104
+ def init_leaderboard(ori_dataframe, model_info_df):
105
+ if ori_dataframe is None or ori_dataframe.empty:
106
  raise ValueError("Leaderboard DataFrame is empty or None.")
107
+ model_info_col_list = [c.name for c in fields(ModelInfoColumn) if c.displayed_by_default if c.name not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
108
+ default_selection_list = list(ori_dataframe.columns) + model_info_col_list
109
+ print('default_selection_list: ', default_selection_list)
110
+ # ipdb.set_trace()
111
+ # default_selection_list = [col for col in default_selection_list if col not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
112
+ merged_df = get_merged_df(ori_dataframe, model_info_df)
113
+ new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
114
+ merged_df = merged_df[new_cols]
115
+ print('Merged df: ', merged_df)
116
  return Leaderboard(
117
+ value=merged_df,
118
+ # datatype=[c.type for c in fields(ModelInfoColumn)],
119
  select_columns=SelectColumns(
120
+ default_selection=default_selection_list,
121
+ # default_selection=[c.name for c in fields(ModelInfoColumn) if
122
+ # c.displayed_by_default and c.name not in ['params', 'available_on_hub', 'hub',
123
+ # 'Model sha', 'Hub License']],
124
+ # default_selection=list(dataframe.columns),
125
+ cant_deselect=[c.name for c in fields(ModelInfoColumn) if c.never_hidden],
126
+ label="Select Columns to Display:",
127
  # How to uncheck??
128
  ),
129
+ hide_columns=[c.name for c in fields(ModelInfoColumn) if c.hidden],
130
  search_columns=['model'],
131
  # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
132
  # filter_columns=[
 
143
  # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
144
  # ),
145
  # ],
146
+ filter_columns=[
147
+ ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
148
+ ],
149
+ # bool_checkboxgroup_label="Hide models",
150
  interactive=False,
151
  )
152
 
 
158
 
159
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
160
  with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
161
+ leaderboard = init_leaderboard(domain_df, model_info_df)
162
  print(f"FINAL Domain LEADERBOARD 1 {domain_df}")
163
 
164
  with gr.TabItem("🏅 By Frequency", elem_id="llm-benchmark-tab-table", id=1):
165
+ leaderboard = init_leaderboard(freq_df, model_info_df)
166
  print(f"FINAL Frequency LEADERBOARD 1 {freq_df}")
167
 
168
  with gr.TabItem("🏅 By term length", elem_id="llm-benchmark-tab-table", id=2):
169
+ leaderboard = init_leaderboard(term_length_df, model_info_df)
170
  print(f"FINAL term length LEADERBOARD 1 {term_length_df}")
171
 
172
  with gr.TabItem("🏅 By variate type", elem_id="llm-benchmark-tab-table", id=3):
173
+ leaderboard = init_leaderboard(variate_type_df, model_info_df)
174
  print(f"FINAL LEADERBOARD 1 {variate_type_df}")
175
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
176
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
results/auto_arima/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "auto_arima",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/auto_ets/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "auto_ets",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/auto_theta/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "auto_theta",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/chronos-small/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "chronos-small",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/chronos_base/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "chronos_base",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/chronos_large/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "chronos_large",
3
+ "model_type": "pretrained",
4
+ "model_dtype": "float32"
5
+ }
results/crossformer/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "crossformer",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/d_linear/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "d_linear",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/deepar/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "deepar",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/i_transformer/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "i_transformer",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/moirai_1.1_R_base_no_leak/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "moirai_1.1_R_base_no_leak",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/moirai_1.1_R_large_no_leak/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "moirai_1.1_R_large_no_leak",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/moirai_1.1_R_small_no_leak/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "moirai_1.1_R_small_no_leak",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/n_beats/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "n_beats",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/naive/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "naive",
3
+ "model_type": "statistical",
4
+ "model_dtype": "float32"
5
+ }
results/patch_tst/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "patch_tst",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/seasonal_naive/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "seasonal_naive",
3
+ "model_type": "statistical",
4
+ "model_dtype": "float32"
5
+ }
results/tft/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "tft",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/tide/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "tide",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/timesfm/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "timesfm",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
results/visionts/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "model": "visionts",
3
+ "model_type": "deep-learning",
4
+ "model_dtype": "float32"
5
+ }
src/display/utils.py CHANGED
@@ -21,27 +21,23 @@ class ColumnContent:
21
  never_hidden: bool = False
22
 
23
  ## Leaderboard columns
24
- auto_eval_column_dict = []
25
- # Init
26
- auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
- #Scores
29
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
- for task in Tasks:
31
- auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
  # Model information
33
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
 
46
  ## For the queue columns in the submission tab
47
  @dataclass(frozen=True)
@@ -62,10 +58,11 @@ class ModelDetails:
62
 
63
 
64
  class ModelType(Enum):
65
- PT = ModelDetails(name="pretrained", symbol="🟢")
66
- FT = ModelDetails(name="fine-tuned", symbol="🔶")
67
- IFT = ModelDetails(name="instruction-tuned", symbol="")
68
- RL = ModelDetails(name="RL-tuned", symbol="🟦")
 
69
  Unknown = ModelDetails(name="", symbol="?")
70
 
71
  def to_str(self, separator=" "):
@@ -77,10 +74,10 @@ class ModelType(Enum):
77
  return ModelType.FT
78
  if "pretrained" in type or "🟢" in type:
79
  return ModelType.PT
80
- if "RL-tuned" in type or "🟦" in type:
81
- return ModelType.RL
82
- if "instruction-tuned" in type or "" in type:
83
- return ModelType.IFT
84
  return ModelType.Unknown
85
 
86
  class WeightType(Enum):
@@ -101,7 +98,7 @@ class Precision(Enum):
101
  return Precision.Unknown
102
 
103
  # Column selection
104
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
105
 
106
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
21
  never_hidden: bool = False
22
 
23
  ## Leaderboard columns
24
# Field specifications for the ModelInfoColumn dataclass. Each entry is
# [attribute_name, attribute_type, default], the shape make_dataclass expects.
# The default is a ColumnContent describing how that column is displayed.
# NOTE(review): the positional ColumnContent arguments are presumably
# (name, type, displayed_by_default, hidden) -- confirm against ColumnContent.
model_info_dict = [
    # Identity columns: flagged never_hidden.
    ["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)],
    # Descriptive model metadata.
    ["model_type", ColumnContent, ColumnContent("Type", "str", False)],
    ["architecture", ColumnContent, ColumnContent("Architecture", "str", False)],
    ["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)],
    ["precision", ColumnContent, ColumnContent("Precision", "str", False)],
    ["license", ColumnContent, ColumnContent("Hub License", "str", False)],
    ["params", ColumnContent, ColumnContent("#Params (B)", "number", False)],
    ["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)],
    ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)],
    ["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)],
]

# Build a frozen dataclass whose attributes are the model-info columns above,
# in declaration order; every attribute defaults to its ColumnContent.
ModelInfoColumn = make_dataclass("ModelInfoColumn", model_info_dict, frozen=True)
41
 
42
  ## For the queue columns in the submission tab
43
  @dataclass(frozen=True)
 
58
 
59
 
60
  class ModelType(Enum):
61
+ PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
62
+ FT = ModelDetails(name="🔶 fine-tuned", symbol="🔶")
63
+ DL = ModelDetails(name="🔷 deep-learning", symbol="🔷")
64
+ ST = ModelDetails(name="🟣 statistical", symbol="🟣")
65
+
66
  Unknown = ModelDetails(name="", symbol="?")
67
 
68
  def to_str(self, separator=" "):
 
74
  return ModelType.FT
75
  if "pretrained" in type or "🟢" in type:
76
  return ModelType.PT
77
+ if "deep-learning" in type or "🟦" in type:
78
+ return ModelType.DL
79
+ if "statistical" in type or "🟣" in type:
80
+ return ModelType.ST
81
  return ModelType.Unknown
82
 
83
  class WeightType(Enum):
 
98
  return Precision.Unknown
99
 
100
  # Column selection
101
+ MODEL_INFO_COLS = [c.name for c in fields(ModelInfoColumn) if not c.hidden]
102
 
103
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
104
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
src/envs.py CHANGED
@@ -18,7 +18,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
18
 
19
  # Local caches
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
22
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
 
18
 
19
  # Local caches
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
+ # EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")
23
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
24
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
25
 
src/leaderboard/read_evals.py CHANGED
@@ -8,10 +8,48 @@ import dateutil
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  @dataclass
16
  class EvalResult:
17
  """Represents one full evaluation. Built from a combination of the result and request file for a given run.
@@ -154,7 +192,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
154
  return request_file
155
 
156
 
157
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
  """From the path of the results folder root, extract all needed info for results"""
159
  model_result_filepaths = []
160
 
@@ -172,6 +210,49 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
172
  for file in files:
173
  model_result_filepaths.append(os.path.join(root, file))
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  eval_results = {}
176
  for model_result_filepath in model_result_filepaths:
177
  # Creation of result
 
8
  import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
+ from src.display.utils import ModelType, Tasks, Precision, WeightType, ModelInfoColumn
12
  from src.submission.check_validity import is_model_on_hub
13
 
14
 
15
@dataclass
class ModelConfig:
    """Static metadata for one model, loaded from a per-model JSON config file.

    Only ``model``, ``model_type`` and ``precision`` are populated by
    ``init_from_json_file``; ``license``, ``likes`` and ``num_params`` keep
    their defaults unless set explicitly by the caller.
    """

    # Model identifier, taken from the "model" key of the config file.
    model: str
    # Model category parsed from the "model_type" key.
    model_type: ModelType = ModelType.Unknown
    # Numeric precision parsed from the "model_dtype" key.
    precision: Precision = Precision.Unknown
    license: str = "?"
    likes: int = 0
    num_params: int = 0

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Build a ModelConfig from the JSON file at ``json_filepath``.

        The file is expected to be a flat JSON object. The keys read are
        ``model``, ``model_type`` and ``model_dtype``; a missing ``model`` or
        ``model_type`` falls back to an empty string.

        Args:
            json_filepath: Path of the JSON config file to read.

        Returns:
            A new instance with ``model``, ``model_type`` and ``precision`` set.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_filepath, encoding="utf-8") as fp:
            data = json.load(fp)

        precision = Precision.from_str(data.get("model_dtype"))
        model_type = ModelType.from_str(data.get("model_type", ""))
        model = data.get("model", "")
        return cls(model=model, model_type=model_type, precision=precision)

    def to_dict(self):
        """Return this model's info as a dict keyed by display column names."""
        data_dict = {
            # Plain model name; presumably the join key used when this info is
            # merged with the results dataframe -- verify against the caller.
            "model": self.model,
            ModelInfoColumn.precision.name: self.precision.value.name,
            ModelInfoColumn.model_type.name: self.model_type.value.name,
            ModelInfoColumn.model_type_symbol.name: self.model_type.value.symbol,
            ModelInfoColumn.license.name: self.license,
            ModelInfoColumn.likes.name: self.likes,
            ModelInfoColumn.params.name: self.num_params,
        }

        return data_dict
52
+
53
  @dataclass
54
  class EvalResult:
55
  """Represents one full evaluation. Built from a combination of the result and request file for a given run.
 
192
  return request_file
193
 
194
 
195
+ def get_model_info(results_path: str, requests_path: str) -> list[ModelConfig]:
196
  """From the path of the results folder root, extract all needed info for results"""
197
  model_result_filepaths = []
198
 
 
210
  for file in files:
211
  model_result_filepaths.append(os.path.join(root, file))
212
 
213
+ model_infos = {}
214
+ for model_result_filepath in model_result_filepaths:
215
+ # Creation of result
216
+ model_info = ModelConfig.init_from_json_file(model_result_filepath)
217
+ # eval_result.update_with_request_file(requests_path)
218
+
219
+ # Store results of same eval together
220
+ model_name = model_info.model
221
+ model_infos[model_name] = model_info
222
+ # if eval_name in eval_results.keys():
223
+ # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
224
+ # else:
225
+ # eval_results[eval_name] = eval_result
226
+
227
+ results = []
228
+ for v in model_infos.values():
229
+ try:
230
+ v.to_dict() # we test if the dict version is complete
231
+ results.append(v)
232
+ except KeyError: # not all eval values present
233
+ continue
234
+
235
+ return results
236
+
237
+
238
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
239
+ """From the path of the results folder root, extract all needed info for results"""
240
+ model_result_filepaths = []
241
+
242
+ for root, _, files in os.walk(results_path):
243
+ # We should only have json files in model results
244
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
245
+ continue
246
+
247
+ # # Sort the files by date
248
+ # try:
249
+ # files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
250
+ # except dateutil.parser._parser.ParserError:
251
+ # files = [files[-1]]
252
+
253
+ for file in files:
254
+ model_result_filepaths.append(os.path.join(root, file))
255
+
256
  eval_results = {}
257
  for model_result_filepath in model_result_filepaths:
258
  # Creation of result
src/populate.py CHANGED
@@ -4,12 +4,27 @@ import os
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
- # import ipdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
 
13
  # raw_data = get_raw_eval_results(results_path, requests_path)
14
  # print('results_path:', results_path)
15
  # all_data_json = [v.to_dict() for v in raw_data]
 
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import EvalQueueColumn
8
+ from src.leaderboard.read_evals import get_model_info
9
+ import ipdb
10
+
11
def get_model_info_df(results_path: str, requests_path: str, cols: "list | None" = None, benchmark_cols: "list | None" = None) -> pd.DataFrame:
    """Create a dataframe with one row of model metadata per model config.

    Args:
        results_path: Root folder passed to ``get_model_info`` to locate the
            model config files.
        requests_path: Requests folder, forwarded unchanged to
            ``get_model_info``.
        cols: Unused; accepted only to keep the signature parallel to
            ``get_leaderboard_df``.
        benchmark_cols: Unused; accepted for the same reason as ``cols``.

    Returns:
        A dataframe built from the ``to_dict()`` output of every model info
        object returned by ``get_model_info``.
    """
    # The defaults are None rather than [] so that no mutable object is shared
    # between calls; neither argument is read below.
    raw_data = get_model_info(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]
    print(f"The raw data is {all_data_json}")
    df = pd.DataFrame.from_records(all_data_json)
    print(f"DF for Model Info ********** {df}")
    return df
19
+
20
def get_merged_df(result_df: pd.DataFrame, model_info_df: pd.DataFrame) -> pd.DataFrame:
    """Inner-join the model info onto the results, matching rows on 'model'.

    The model-info frame is the left operand, so its columns come first in
    the output. Models present in only one of the two frames are dropped.
    """
    return model_info_df.merge(result_df, on='model', how='inner')
24
 
25
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
26
  """Creates a dataframe from all the individual experiment results"""
27
+ raw_data = get_raw_eval_results(results_path, requests_path)
28
  # raw_data = get_raw_eval_results(results_path, requests_path)
29
  # print('results_path:', results_path)
30
  # all_data_json = [v.to_dict() for v in raw_data]
src/utils.py CHANGED
@@ -24,4 +24,5 @@ def pivot_df(file_name, tab_name):
24
  # df_pivot.to_csv('pivoted_df.csv')
25
  # print(df_pivot)
26
  df_pivot = df_pivot.reset_index()
 
27
  return df_pivot
 
24
  # df_pivot.to_csv('pivoted_df.csv')
25
  # print(df_pivot)
26
  df_pivot = df_pivot.reset_index()
27
+ df_pivot = df_pivot.round(3)
28
  return df_pivot