update code and result files
- README.md +2 -2
- app.py +42 -17
- results/auto_arima/config.json +5 -0
- results/auto_ets/config.json +5 -0
- results/auto_theta/config.json +5 -0
- results/chronos-small/config.json +5 -0
- results/chronos_base/config.json +5 -0
- results/chronos_large/config.json +5 -0
- results/crossformer/config.json +5 -0
- results/d_linear/config.json +5 -0
- results/deepar/config.json +5 -0
- results/i_transformer/config.json +5 -0
- results/moirai_1.1_R_base_no_leak/config.json +5 -0
- results/moirai_1.1_R_large_no_leak/config.json +5 -0
- results/moirai_1.1_R_small_no_leak/config.json +5 -0
- results/n_beats/config.json +5 -0
- results/naive/config.json +5 -0
- results/patch_tst/config.json +5 -0
- results/seasonal_naive/config.json +5 -0
- results/tft/config.json +5 -0
- results/tide/config.json +5 -0
- results/timesfm/config.json +5 -0
- results/visionts/config.json +5 -0
- src/display/utils.py +24 -27
- src/envs.py +2 -1
- src/leaderboard/read_evals.py +83 -2
- src/populate.py +18 -3
- src/utils.py +1 -0
README.md
CHANGED
@@ -7,7 +7,7 @@ sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
-short_description: 'GIFT-Eval: A Benchmark for General Time Series Forecasting
+short_description: 'GIFT-Eval: A Benchmark for General Time Series Forecasting'
 sdk_version: 4.44.0
 ---
 
@@ -43,4 +43,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py
CHANGED
@@ -1,4 +1,5 @@
 import gradio as gr
+import ipdb
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -15,17 +16,16 @@ from src.about import (
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
-    COLS,
     EVAL_COLS,
     EVAL_TYPES,
-
+    ModelInfoColumn,
     ModelType,
     fields,
     WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, get_model_info_df, get_merged_df
 from src.submission.submit import add_new_eval
 from src.utils import norm_sNavie, pivot_df
 # import ipdb
@@ -83,6 +83,16 @@ term_length_df = pivot_df('results/grouped_results_by_term_length.csv', tab_name
 print(f'Term length dataframe is {term_length_df}')
 variate_type_df = pivot_df('results/grouped_results_by_univariate.csv', tab_name='univariate')
 print(f'Variate type dataframe is {variate_type_df}')
+model_info_df = get_model_info_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+
+# domain_df = get_merged_df(domain_df, model_info_df)
+# print('Merged domain df: ', domain_df)
+# freq_df = get_merged_df(freq_df, model_info_df)
+# print('Merged freq df: ', freq_df)
+# term_length_df = get_merged_df(term_length_df, model_info_df)
+# print('Merged term length df: ', term_length_df)
+# variate_type_df = get_merged_df(variate_type_df, model_info_df)
+# print('Merged variate type df: ', variate_type_df)
 
 # (
 #     finished_eval_queue_df,
@@ -91,20 +101,32 @@ print(f'Variate type dataframe is {variate_type_df}')
 # ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
-def init_leaderboard(dataframe):
-    if dataframe is None or dataframe.empty:
+def init_leaderboard(ori_dataframe, model_info_df):
+    if ori_dataframe is None or ori_dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    model_info_col_list = [c.name for c in fields(ModelInfoColumn) if c.displayed_by_default if c.name not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
+    default_selection_list = list(ori_dataframe.columns) + model_info_col_list
+    print('default_selection_list: ', default_selection_list)
+    # ipdb.set_trace()
+    # default_selection_list = [col for col in default_selection_list if col not in ['#Params (B)', 'available_on_hub', 'hub', 'Model sha','Hub License']]
+    merged_df = get_merged_df(ori_dataframe, model_info_df)
+    new_cols = ['T'] + [col for col in merged_df.columns if col != 'T']
+    merged_df = merged_df[new_cols]
+    print('Merged df: ', merged_df)
     return Leaderboard(
-        value=dataframe,
-        datatype=[c.type for c in fields(
+        value=merged_df,
+        # datatype=[c.type for c in fields(ModelInfoColumn)],
         select_columns=SelectColumns(
-
-            default_selection=list(dataframe.columns),
-
-
+            default_selection=default_selection_list,
+            # default_selection=[c.name for c in fields(ModelInfoColumn) if
+            #                    c.displayed_by_default and c.name not in ['params', 'available_on_hub', 'hub',
+            #                    'Model sha', 'Hub License']],
+            # default_selection=list(dataframe.columns),
+            cant_deselect=[c.name for c in fields(ModelInfoColumn) if c.never_hidden],
+            label="Select Columns to Display:",
             # How to uncheck??
         ),
-
+        hide_columns=[c.name for c in fields(ModelInfoColumn) if c.hidden],
        search_columns=['model'],
        # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        # filter_columns=[
@@ -121,7 +143,10 @@ def init_leaderboard(dataframe):
        #         AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
        #     ),
        # ],
-
+        filter_columns=[
+            ColumnFilter(ModelInfoColumn.model_type.name, type="checkboxgroup", label="Model types"),
+        ],
+        # bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
 
@@ -133,19 +158,19 @@ with demo:
 
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(domain_df)
+            leaderboard = init_leaderboard(domain_df, model_info_df)
            print(f"FINAL Domain LEADERBOARD 1 {domain_df}")
 
        with gr.TabItem("🏅 By Frequency", elem_id="llm-benchmark-tab-table", id=1):
-            leaderboard = init_leaderboard(freq_df)
+            leaderboard = init_leaderboard(freq_df, model_info_df)
            print(f"FINAL Frequency LEADERBOARD 1 {freq_df}")
 
        with gr.TabItem("🏅 By term length", elem_id="llm-benchmark-tab-table", id=2):
-            leaderboard = init_leaderboard(term_length_df)
+            leaderboard = init_leaderboard(term_length_df, model_info_df)
            print(f"FINAL term length LEADERBOARD 1 {term_length_df}")
 
        with gr.TabItem("🏅 By variate type", elem_id="llm-benchmark-tab-table", id=3):
-            leaderboard = init_leaderboard(variate_type_df)
+            leaderboard = init_leaderboard(variate_type_df, model_info_df)
            print(f"FINAL LEADERBOARD 1 {variate_type_df}")
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
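Net effect of the init_leaderboard rewrite: the merged table puts the type-symbol column 'T' first, and the default column selection is the tab's original metric columns plus the displayed-by-default model-info columns (minus a hand-picked exclusion list). A minimal sketch of just that column handling with a toy pandas frame, where the column names come from the diff above but all rows are made up:

import pandas as pd

# Toy merged frame, as it might look after get_merged_df(ori_dataframe, model_info_df).
merged_df = pd.DataFrame({
    "model": ["naive", "chronos_large"],
    "Average": [1.000, 0.720],
    "T": ["🟣", "🟢"],
    "Type": ["🟣 statistical", "🟢 pretrained"],
})

# Move the type-symbol column to the front, as init_leaderboard does.
new_cols = ["T"] + [col for col in merged_df.columns if col != "T"]
merged_df = merged_df[new_cols]

# Default selection: the tab's own columns plus the kept model-info columns.
ori_columns = ["model", "Average"]      # stand-in for list(ori_dataframe.columns)
model_info_cols = ["T", "Type"]         # stand-in for the filtered ModelInfoColumn names
default_selection_list = ori_columns + model_info_cols

print(merged_df.columns.tolist())       # ['T', 'model', 'Average', 'Type']
print(default_selection_list)           # ['model', 'Average', 'T', 'Type']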
results/auto_arima/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "auto_arima",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/auto_ets/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "auto_ets",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/auto_theta/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "auto_theta",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/chronos-small/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "chronos-small",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/chronos_base/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "chronos_base",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/chronos_large/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "chronos_large",
+  "model_type": "pretrained",
+  "model_dtype": "float32"
+}

results/crossformer/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "crossformer",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/d_linear/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "d_linear",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/deepar/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "deepar",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/i_transformer/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "i_transformer",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/moirai_1.1_R_base_no_leak/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "moirai_1.1_R_base_no_leak",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/moirai_1.1_R_large_no_leak/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "moirai_1.1_R_large_no_leak",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/moirai_1.1_R_small_no_leak/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "moirai_1.1_R_small_no_leak",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/n_beats/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "n_beats",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/naive/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "naive",
+  "model_type": "statistical",
+  "model_dtype": "float32"
+}

results/patch_tst/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "patch_tst",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/seasonal_naive/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "seasonal_naive",
+  "model_type": "statistical",
+  "model_dtype": "float32"
+}

results/tft/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "tft",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/tide/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "tide",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/timesfm/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "timesfm",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}

results/visionts/config.json
ADDED
@@ -0,0 +1,5 @@
+{
+  "model": "visionts",
+  "model_type": "deep-learning",
+  "model_dtype": "float32"
+}
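All of the new config.json files share the same three keys: model, model_type, and model_dtype. A hedged sketch of how one of these files could be loaded and mapped to the display symbol assigned by the ModelType enum in src/display/utils.py below; the helper function and the mapping dict here are illustrative, not part of the commit:

import json

# Symbols as declared by the ModelType enum added in src/display/utils.py.
TYPE_SYMBOLS = {
    "pretrained": "🟢",
    "fine-tuned": "🔶",
    "deep-learning": "🔷",
    "statistical": "🟣",
}

def load_model_config(path):
    """Read one results/<model>/config.json and attach its display symbol."""
    with open(path) as fp:
        cfg = json.load(fp)
    cfg["symbol"] = TYPE_SYMBOLS.get(cfg.get("model_type", ""), "?")
    return cfg

# e.g. load_model_config("results/naive/config.json") would give
# {"model": "naive", "model_type": "statistical", "model_dtype": "float32", "symbol": "🟣"}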
src/display/utils.py
CHANGED
@@ -21,27 +21,23 @@ class ColumnContent:
     never_hidden: bool = False
 
 ## Leaderboard columns
-
-# Init
-
-
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+model_info_dict = []
+# Init column for the model properties
+model_info_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+model_info_dict.append(["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)])
 # Model information
-
-
-
-
-
-
-
-
-
+model_info_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+model_info_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+model_info_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+model_info_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+model_info_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+model_info_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+model_info_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+model_info_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+model_info_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
-
+ModelInfoColumn = make_dataclass("ModelInfoColumn", model_info_dict, frozen=True)
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -62,10 +58,11 @@ class ModelDetails:
 
 
 class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-
-
+    PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
+    FT = ModelDetails(name="🔶 fine-tuned", symbol="🔶")
+    DL = ModelDetails(name="🔷 deep-learning", symbol="🔷")
+    ST = ModelDetails(name="🟣 statistical", symbol="🟣")
+
     Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
@@ -77,10 +74,10 @@ class ModelType(Enum):
            return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
-        if "
-            return ModelType.
-        if "
-            return ModelType.
+        if "deep-learning" in type or "🟦" in type:
+            return ModelType.DL
+        if "statistical" in type or "🟣" in type:
+            return ModelType.ST
        return ModelType.Unknown
 
 class WeightType(Enum):
@@ -101,7 +98,7 @@ class Precision(Enum):
        return Precision.Unknown
 
 # Column selection
-
+MODEL_INFO_COLS = [c.name for c in fields(ModelInfoColumn) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
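The make_dataclass call is the key pattern in this file: the list of [field_name, type, default] triples becomes a frozen ModelInfoColumn dataclass whose field defaults are ColumnContent instances, so column metadata (display name, type, visibility) can be enumerated later. A self-contained sketch of that pattern, using a simplified stand-in for ColumnContent and the standard dataclasses.fields (the repo's own fields helper is not shown in this diff, so this is only an approximation):

from dataclasses import dataclass, fields, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    # Simplified stand-in for the ColumnContent defined earlier in utils.py.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

model_info_dict = [
    ["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)],
    ["model_type", ColumnContent, ColumnContent("Type", "str", False)],
]
ModelInfoColumn = make_dataclass("ModelInfoColumn", model_info_dict, frozen=True)

# dataclasses.fields() exposes one entry per column; the defaults carry the display metadata.
print([f.default.name for f in fields(ModelInfoColumn)])                             # ['T', 'model', 'Type']
print([f.default.name for f in fields(ModelInfoColumn) if f.default.never_hidden])   # ['T', 'model']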
src/envs.py
CHANGED
@@ -18,7 +18,8 @@ CACHE_PATH=os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+# EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
src/leaderboard/read_evals.py
CHANGED
@@ -8,10 +8,48 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import
+from src.display.utils import ModelType, Tasks, Precision, WeightType, ModelInfoColumn
 from src.submission.check_validity import is_model_on_hub
 
 
+@dataclass
+class ModelConfig:
+    """Represents the model configuration of a model"""
+    model: str
+    model_type: ModelType = ModelType.Unknown
+    precision: Precision = Precision.Unknown
+    license: str = "?"
+    likes: int = 0
+    num_params: int = 0
+
+    @classmethod
+    def init_from_json_file(cls, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        # config = data.get("config")
+
+        # Precision
+        precision = Precision.from_str(data.get("model_dtype"))
+        model_type = ModelType.from_str(data.get("model_type", ""))
+        model = data.get("model", "")
+        return cls(model=model, model_type=model_type, precision=precision)
+
+    def to_dict(self):
+        """Converts the model info to a dict compatible with our dataframe display"""
+        data_dict = {
+            "model": self.model,  # not a column, just a save name,
+            ModelInfoColumn.precision.name: self.precision.value.name,
+            ModelInfoColumn.model_type.name: self.model_type.value.name,
+            ModelInfoColumn.model_type_symbol.name: self.model_type.value.symbol,
+            ModelInfoColumn.license.name: self.license,
+            ModelInfoColumn.likes.name: self.likes,
+            ModelInfoColumn.params.name: self.num_params,
+        }
+
+        return data_dict
+
 @dataclass
 class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
@@ -154,7 +192,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_model_info(results_path: str, requests_path: str) -> list[ModelConfig]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -172,6 +210,49 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
        for file in files:
            model_result_filepaths.append(os.path.join(root, file))
 
+    model_infos = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        model_info = ModelConfig.init_from_json_file(model_result_filepath)
+        # eval_result.update_with_request_file(requests_path)
+
+        # Store results of same eval together
+        model_name = model_info.model
+        model_infos[model_name] = model_info
+        # if eval_name in eval_results.keys():
+        #     eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        # else:
+        #     eval_results[eval_name] = eval_result
+
+    results = []
+    for v in model_infos.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results
+
+
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # # Sort the files by date
+        # try:
+        #     files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        # except dateutil.parser._parser.ParserError:
+        #     files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
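get_model_info mirrors get_raw_eval_results: walk the results tree, skip any directory that contains non-JSON files, parse each remaining file, and keep one record per model name. A rough standalone equivalent of that scan, assuming the results/<model>/config.json layout added in this commit:

import json
import os

def collect_model_records(results_path):
    """Walk results/, parse every JSON config, keep the last record seen per model."""
    records = {}
    for root, _, files in os.walk(results_path):
        # Same filter as get_model_info: only look at directories that hold JSON files.
        if len(files) == 0 or any(not f.endswith(".json") for f in files):
            continue
        for name in files:
            with open(os.path.join(root, name)) as fp:
                data = json.load(fp)
            records[data.get("model", name)] = data
    return list(records.values())

# e.g. collect_model_records("results") -> one dict per model directory, such as
# {"model": "tft", "model_type": "deep-learning", "model_dtype": "float32"}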
src/populate.py
CHANGED
@@ -4,12 +4,27 @@ import os
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import
-from src.leaderboard.read_evals import
-
+from src.display.utils import EvalQueueColumn
+from src.leaderboard.read_evals import get_model_info
+import ipdb
+
+def get_model_info_df(results_path: str, requests_path: str, cols: list=[], benchmark_cols: list=[]) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_model_info(results_path, requests_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+    print(f"The raw data is {all_data_json}")
+    df = pd.DataFrame.from_records(all_data_json)
+    print(f"DF for Model Info ********** {df}")
+    return df
+
+def get_merged_df(result_df: pd.DataFrame, model_info_df: pd.DataFrame) -> pd.DataFrame:
+    """Merges the model info dataframe with the results dataframe"""
+    merged_df = pd.merge(model_info_df, result_df, on='model', how='inner')
+    return merged_df
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_eval_results(results_path, requests_path)
     # raw_data = get_raw_eval_results(results_path, requests_path)
     # print('results_path:', results_path)
     # all_data_json = [v.to_dict() for v in raw_data]
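Taken together, get_model_info_df flattens the per-model records into a dataframe with pd.DataFrame.from_records, and get_merged_df inner-joins it with a results table on 'model', so a model with scores but no config.json drops out of the displayed table. A toy illustration of that behaviour; all rows and numbers here are made up:

import pandas as pd

# What get_model_info_df builds: one record per results/<model>/config.json.
model_records = [
    {"model": "naive", "T": "🟣", "Type": "🟣 statistical"},
    {"model": "tft", "T": "🔷", "Type": "🔷 deep-learning"},
]
model_info_df = pd.DataFrame.from_records(model_records)

# A pivoted results table; 'some_new_model' has scores but no config.json yet.
result_df = pd.DataFrame({
    "model": ["naive", "tft", "some_new_model"],
    "Average": [1.00, 0.81, 0.77],
})

# get_merged_df uses an inner join, so the unmatched row disappears.
merged_df = pd.merge(model_info_df, result_df, on="model", how="inner")
print(merged_df["model"].tolist())   # ['naive', 'tft']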
src/utils.py
CHANGED
@@ -24,4 +24,5 @@ def pivot_df(file_name, tab_name):
     # df_pivot.to_csv('pivoted_df.csv')
     # print(df_pivot)
     df_pivot = df_pivot.reset_index()
+    df_pivot = df_pivot.round(3)
     return df_pivot