Spaces:
Running
Running
initial demo
Browse files- app.py +130 -79
- results/grouped_results_by_domain.csv +148 -0
- results/grouped_results_by_frequency.csv +211 -0
- results/grouped_results_by_term_length.csv +64 -0
- results/grouped_results_by_univariate.csv +43 -0
- src/about.py +58 -33
- src/populate.py +24 -6
- src/utils.py +27 -0
app.py
CHANGED
@@ -27,29 +27,62 @@ from src.display.utils import (
|
|
27 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
29 |
from src.submission.submit import add_new_eval
|
|
|
|
|
30 |
|
31 |
|
32 |
def restart_space():
|
33 |
API.restart_space(repo_id=REPO_ID)
|
34 |
|
|
|
35 |
### Space initialisation
|
36 |
try:
|
37 |
print(EVAL_REQUESTS_PATH)
|
38 |
snapshot_download(
|
39 |
-
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
|
|
|
40 |
)
|
41 |
except Exception:
|
42 |
restart_space()
|
43 |
try:
|
44 |
print(EVAL_RESULTS_PATH)
|
45 |
snapshot_download(
|
46 |
-
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
|
|
|
47 |
)
|
48 |
except Exception:
|
49 |
restart_space()
|
50 |
|
51 |
-
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
(
|
55 |
finished_eval_queue_df,
|
@@ -57,6 +90,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
|
|
57 |
pending_eval_queue_df,
|
58 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
59 |
|
|
|
60 |
def init_leaderboard(dataframe):
|
61 |
if dataframe is None or dataframe.empty:
|
62 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
@@ -64,26 +98,28 @@ def init_leaderboard(dataframe):
|
|
64 |
value=dataframe,
|
65 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
66 |
select_columns=SelectColumns(
|
67 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
68 |
-
|
69 |
-
|
|
|
70 |
),
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
87 |
bool_checkboxgroup_label="Hide models",
|
88 |
interactive=False,
|
89 |
)
|
@@ -95,21 +131,33 @@ with demo:
|
|
95 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
96 |
|
97 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
98 |
-
with gr.TabItem("🏅
|
99 |
-
leaderboard = init_leaderboard(
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
103 |
|
104 |
-
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=
|
105 |
with gr.Column():
|
106 |
with gr.Row():
|
107 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
108 |
|
109 |
with gr.Column():
|
110 |
with gr.Accordion(
|
111 |
-
|
112 |
-
|
113 |
):
|
114 |
with gr.Row():
|
115 |
finished_eval_table = gr.components.Dataframe(
|
@@ -119,8 +167,8 @@ with demo:
|
|
119 |
row_count=5,
|
120 |
)
|
121 |
with gr.Accordion(
|
122 |
-
|
123 |
-
|
124 |
):
|
125 |
with gr.Row():
|
126 |
running_eval_table = gr.components.Dataframe(
|
@@ -131,8 +179,8 @@ with demo:
|
|
131 |
)
|
132 |
|
133 |
with gr.Accordion(
|
134 |
-
|
135 |
-
|
136 |
):
|
137 |
with gr.Row():
|
138 |
pending_eval_table = gr.components.Dataframe(
|
@@ -142,51 +190,54 @@ with demo:
|
|
142 |
row_count=5,
|
143 |
)
|
144 |
with gr.Row():
|
145 |
-
gr.Markdown("# ✉️✨ Submit your model
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
|
|
|
|
|
|
190 |
|
191 |
with gr.Row():
|
192 |
with gr.Accordion("📙 Citation", open=False):
|
|
|
27 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
28 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
29 |
from src.submission.submit import add_new_eval
|
30 |
+
from src.utils import norm_sNavie, pivot_df
|
31 |
+
import ipdb
|
32 |
|
33 |
|
34 |
def restart_space():
|
35 |
API.restart_space(repo_id=REPO_ID)
|
36 |
|
37 |
+
|
38 |
### Space initialisation
|
39 |
try:
|
40 |
print(EVAL_REQUESTS_PATH)
|
41 |
snapshot_download(
|
42 |
+
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
|
43 |
+
token=TOKEN
|
44 |
)
|
45 |
except Exception:
|
46 |
restart_space()
|
47 |
try:
|
48 |
print(EVAL_RESULTS_PATH)
|
49 |
snapshot_download(
|
50 |
+
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
|
51 |
+
token=TOKEN
|
52 |
)
|
53 |
except Exception:
|
54 |
restart_space()
|
55 |
|
56 |
+
# # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
|
57 |
+
# df = pd.read_csv('LOTSAv2_EvalBenchmark(Long).csv')
|
58 |
+
# # Step 2: Pivot the DataFrame
|
59 |
+
# LEADERBOARD_DF = df.pivot_table(index='model',
|
60 |
+
# columns='dataset',
|
61 |
+
# values='eval_metrics/MAE[0.5]',
|
62 |
+
# aggfunc='first')
|
63 |
+
# LEADERBOARD_DF.drop(columns=['ALL'], inplace=True)
|
64 |
+
#
|
65 |
+
# # Reset the index if you want the model column to be part of the DataFrame
|
66 |
+
# LEADERBOARD_DF.reset_index(inplace=True)
|
67 |
+
# # Step 3: normalize the values
|
68 |
+
# # ipdb.set_trace()
|
69 |
+
# LEADERBOARD_DF = norm_sNavie(LEADERBOARD_DF)
|
70 |
+
#
|
71 |
+
# # LEADERBOARD_DF['Average'] = LEADERBOARD_DF.mean(axis=1)
|
72 |
+
# # LEADERBOARD_DF.insert(1, 'Average', LEADERBOARD_DF.pop('Average'))
|
73 |
+
# # LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by=['Average'], ascending=True)
|
74 |
+
# print(f"The leaderboard is {LEADERBOARD_DF}")
|
75 |
+
# print(f'Columns: ', LEADERBOARD_DF.columns)
|
76 |
+
|
77 |
+
# LEADERBOARD_DF = pd.read_csv('pivoted_df.csv')
|
78 |
+
domain_df = pivot_df('results/grouped_results_by_domain.csv', tab_name='domain')
|
79 |
+
print(f'Domain dataframe is {domain_df}')
|
80 |
+
freq_df = pivot_df('results/grouped_results_by_frequency.csv', tab_name='frequency')
|
81 |
+
print(f'Freq dataframe is {freq_df}')
|
82 |
+
term_length_df = pivot_df('results/grouped_results_by_term_length.csv', tab_name='term_length')
|
83 |
+
print(f'Term length dataframe is {term_length_df}')
|
84 |
+
variate_type_df = pivot_df('results/grouped_results_by_univariate.csv', tab_name='univariate')
|
85 |
+
print(f'Variate type dataframe is {variate_type_df}')
|
86 |
|
87 |
(
|
88 |
finished_eval_queue_df,
|
|
|
90 |
pending_eval_queue_df,
|
91 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
92 |
|
93 |
+
|
94 |
def init_leaderboard(dataframe):
|
95 |
if dataframe is None or dataframe.empty:
|
96 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
|
|
98 |
value=dataframe,
|
99 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
100 |
select_columns=SelectColumns(
|
101 |
+
# default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.name not in ['params', 'available_on_hub', 'hub', 'Model sha','Hub License']],
|
102 |
+
default_selection=list(dataframe.columns),
|
103 |
+
cant_deselect=['model'],
|
104 |
+
label="Select Datasets to Display:",
|
105 |
),
|
106 |
+
|
107 |
+
search_columns=['model'],
|
108 |
+
# hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
109 |
+
# filter_columns=[
|
110 |
+
# ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
111 |
+
# ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
112 |
+
# ColumnFilter(
|
113 |
+
# AutoEvalColumn.params.name,
|
114 |
+
# type="slider",
|
115 |
+
# min=0.01,
|
116 |
+
# max=500,
|
117 |
+
# label="Select the number of parameters (B)",
|
118 |
+
# ),
|
119 |
+
# ColumnFilter(
|
120 |
+
# AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
|
121 |
+
# ),
|
122 |
+
# ],
|
123 |
bool_checkboxgroup_label="Hide models",
|
124 |
interactive=False,
|
125 |
)
|
|
|
131 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
132 |
|
133 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
134 |
+
with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
|
135 |
+
leaderboard = init_leaderboard(domain_df)
|
136 |
+
print(f"FINAL Domain LEADERBOARD 1 {domain_df}")
|
137 |
+
|
138 |
+
with gr.TabItem("🏅 By Frequency", elem_id="llm-benchmark-tab-table", id=1):
|
139 |
+
leaderboard = init_leaderboard(freq_df)
|
140 |
+
print(f"FINAL Frequency LEADERBOARD 1 {freq_df}")
|
141 |
+
|
142 |
+
with gr.TabItem("🏅 By term length", elem_id="llm-benchmark-tab-table", id=2):
|
143 |
+
leaderboard = init_leaderboard(term_length_df)
|
144 |
+
print(f"FINAL term length LEADERBOARD 1 {term_length_df}")
|
145 |
+
|
146 |
+
with gr.TabItem("🏅 By variate type", elem_id="llm-benchmark-tab-table", id=3):
|
147 |
+
leaderboard = init_leaderboard(variate_type_df)
|
148 |
+
print(f"FINAL LEADERBOARD 1 {variate_type_df}")
|
149 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
|
150 |
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
151 |
|
152 |
+
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
|
153 |
with gr.Column():
|
154 |
with gr.Row():
|
155 |
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
156 |
|
157 |
with gr.Column():
|
158 |
with gr.Accordion(
|
159 |
+
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
|
160 |
+
open=False,
|
161 |
):
|
162 |
with gr.Row():
|
163 |
finished_eval_table = gr.components.Dataframe(
|
|
|
167 |
row_count=5,
|
168 |
)
|
169 |
with gr.Accordion(
|
170 |
+
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
|
171 |
+
open=False,
|
172 |
):
|
173 |
with gr.Row():
|
174 |
running_eval_table = gr.components.Dataframe(
|
|
|
179 |
)
|
180 |
|
181 |
with gr.Accordion(
|
182 |
+
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
|
183 |
+
open=False,
|
184 |
):
|
185 |
with gr.Row():
|
186 |
pending_eval_table = gr.components.Dataframe(
|
|
|
190 |
row_count=5,
|
191 |
)
|
192 |
with gr.Row():
|
193 |
+
gr.Markdown("# ✉️✨ Submit your model outputs !", elem_classes="markdown-text")
|
194 |
+
gr.Markdown(
|
195 |
+
"Send your model outputs for all the models using the ContextualBench code and email them to us at xnguyen@salesforce.com ",
|
196 |
+
elem_classes="markdown-text")
|
197 |
+
|
198 |
+
# with gr.Row():
|
199 |
+
# with gr.Column():
|
200 |
+
# model_name_textbox = gr.Textbox(label="Model name")
|
201 |
+
# revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
202 |
+
# model_type = gr.Dropdown(
|
203 |
+
# choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
204 |
+
# label="Model type",
|
205 |
+
# multiselect=False,
|
206 |
+
# value=None,
|
207 |
+
# interactive=True,
|
208 |
+
# )
|
209 |
+
|
210 |
+
# with gr.Column():
|
211 |
+
# precision = gr.Dropdown(
|
212 |
+
# choices=[i.value.name for i in Precision if i != Precision.Unknown],
|
213 |
+
# label="Precision",
|
214 |
+
# multiselect=False,
|
215 |
+
# value="float16",
|
216 |
+
# interactive=True,
|
217 |
+
# )
|
218 |
+
# weight_type = gr.Dropdown(
|
219 |
+
# choices=[i.value.name for i in WeightType],
|
220 |
+
# label="Weights type",
|
221 |
+
# multiselect=False,
|
222 |
+
# value="Original",
|
223 |
+
# interactive=True,
|
224 |
+
# )
|
225 |
+
# base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
|
226 |
+
|
227 |
+
# submit_button = gr.Button("Submit Eval")
|
228 |
+
# submission_result = gr.Markdown()
|
229 |
+
# submit_button.click(
|
230 |
+
# add_new_eval,
|
231 |
+
# [
|
232 |
+
# model_name_textbox,
|
233 |
+
# base_model_name_textbox,
|
234 |
+
# revision_name_textbox,
|
235 |
+
# precision,
|
236 |
+
# weight_type,
|
237 |
+
# model_type,
|
238 |
+
# ],
|
239 |
+
# submission_result,
|
240 |
+
# )
|
241 |
|
242 |
with gr.Row():
|
243 |
with gr.Accordion("📙 Citation", open=False):
|
results/grouped_results_by_domain.csv
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
domain,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
|
2 |
+
Econ/Fin,auto_arima,0.9365192023498371,0.82547313686996,7.166666666666667
|
3 |
+
Econ/Fin,auto_ets,1.069790707228502,0.9914310347244895,9.0
|
4 |
+
Econ/Fin,auto_theta,1.0054900702449128,0.846318218177465,7.833333333333333
|
5 |
+
Econ/Fin,chronos-small,0.8016337645320165,0.7776047759026133,7.5
|
6 |
+
Econ/Fin,chronos_base,0.8020694855013312,0.7682503560215578,6.333333333333333
|
7 |
+
Econ/Fin,chronos_large,0.7991640460606071,0.7743218919662125,7.0
|
8 |
+
Econ/Fin,crossformer,34.27971965234223,126.76758253255495,21.0
|
9 |
+
Econ/Fin,d_linear,1.1148346887593732,1.1323705430596809,17.5
|
10 |
+
Econ/Fin,deepar,1.2278569268049724,1.4148933569849873,14.333333333333334
|
11 |
+
Econ/Fin,i_transformer,1.0214445148835511,0.8548360574080038,8.333333333333334
|
12 |
+
Econ/Fin,moirai_1.1_R_base_no_leak,1.2693975078273285,1.035652986793995,15.5
|
13 |
+
Econ/Fin,moirai_1.1_R_large_no_leak,0.879988898990186,0.7355518100899686,3.1666666666666665
|
14 |
+
Econ/Fin,moirai_1.1_R_small_no_leak,1.035635272136103,0.8530614545680443,10.833333333333334
|
15 |
+
Econ/Fin,n_beats,0.8629634668807649,0.9855058224964246,12.666666666666666
|
16 |
+
Econ/Fin,naive,1.222637910798122,1.3614416351328862,15.166666666666666
|
17 |
+
Econ/Fin,patch_tst,0.9463918645805333,0.8154506342478772,6.666666666666667
|
18 |
+
Econ/Fin,seasonal_naive,1.0,1.0,14.833333333333334
|
19 |
+
Econ/Fin,tft,1.1360880871257253,0.8476154067835847,7.833333333333333
|
20 |
+
Econ/Fin,tide,1.1466847399238815,1.0894612064116727,17.0
|
21 |
+
Econ/Fin,timesfm,0.840899493300519,0.7288672456979325,4.833333333333333
|
22 |
+
Econ/Fin,visionts,0.9927325904977805,1.056804867374159,16.5
|
23 |
+
Energy,auto_arima,1.020271660057138,0.8634432097816029,12.28125
|
24 |
+
Energy,auto_ets,1.3407403717825905,4.309210526315789e+23,15.8125
|
25 |
+
Energy,auto_theta,1.3500329947825767,2.288546713618502,16.90625
|
26 |
+
Energy,chronos-small,0.9106949318701584,0.684989122019499,8.59375
|
27 |
+
Energy,chronos_base,0.908599317521494,0.6608616701619849,7.375
|
28 |
+
Energy,chronos_large,0.9122256820323856,0.6592412529602467,7.25
|
29 |
+
Energy,crossformer,9236423597.08186,11.594762627902593,14.25
|
30 |
+
Energy,d_linear,1.1889257931784007,0.9161631466345352,14.71875
|
31 |
+
Energy,deepar,1.678068514068864,1.3746230213328996,14.6875
|
32 |
+
Energy,i_transformer,1.2394136898117367,0.8256530209999239,7.0625
|
33 |
+
Energy,moirai_1.1_R_base_no_leak,0.9874963954081143,0.6351616784559907,5.6875
|
34 |
+
Energy,moirai_1.1_R_large_no_leak,0.8909713530536371,0.6258812753967218,5.375
|
35 |
+
Energy,moirai_1.1_R_small_no_leak,0.8991945372018496,0.6974970102561562,7.0
|
36 |
+
Energy,n_beats,1.1689169414175054,0.9804388197933043,15.09375
|
37 |
+
Energy,naive,1.2329496956881505,1.717544651501066,18.21875
|
38 |
+
Energy,patch_tst,1.0002148326675349,0.6519449023593654,5.75
|
39 |
+
Energy,seasonal_naive,1.0,1.0,15.84375
|
40 |
+
Energy,tft,1.094935043145979,0.6698073412229205,7.28125
|
41 |
+
Energy,tide,1.2969242335580595,0.8815970753846475,10.46875
|
42 |
+
Energy,timesfm,0.990249590684815,0.7036315984358718,8.25
|
43 |
+
Energy,visionts,1.1457219960976157,0.8253258612638164,13.09375
|
44 |
+
Healthcare,auto_arima,0.7829904478130371,0.6313761904761905,7.8
|
45 |
+
Healthcare,auto_ets,0.8061744759695987,0.6378749206349206,7.2
|
46 |
+
Healthcare,auto_theta,0.9667122096533645,0.8276838095238095,12.0
|
47 |
+
Healthcare,chronos-small,0.6676511382266025,0.5753475132275132,5.8
|
48 |
+
Healthcare,chronos_base,0.7248913460690712,0.5941960317460317,5.0
|
49 |
+
Healthcare,chronos_large,0.6616148175532721,0.5314045502645502,4.2
|
50 |
+
Healthcare,crossformer,454.59633819128214,70.08870634920635,16.0
|
51 |
+
Healthcare,d_linear,0.8850081956016165,0.9878685185185185,16.0
|
52 |
+
Healthcare,deepar,0.8566987649911045,0.8641066137566138,10.4
|
53 |
+
Healthcare,i_transformer,0.9023075876752837,0.7645924867724868,9.4
|
54 |
+
Healthcare,moirai_1.1_R_base_no_leak,1.1475007332958291,1.073448677248677,15.8
|
55 |
+
Healthcare,moirai_1.1_R_large_no_leak,0.8166086394810568,0.68611,8.0
|
56 |
+
Healthcare,moirai_1.1_R_small_no_leak,1.0612957366806712,0.925962328042328,14.8
|
57 |
+
Healthcare,n_beats,0.7698977996924792,0.8690820634920635,14.0
|
58 |
+
Healthcare,naive,1.2056710696279012,1.2890160846560845,17.8
|
59 |
+
Healthcare,patch_tst,0.7946209932133224,0.6791721164021164,8.6
|
60 |
+
Healthcare,seasonal_naive,1.0,1.0,15.0
|
61 |
+
Healthcare,tft,0.7904105809823141,0.7010021693121693,8.2
|
62 |
+
Healthcare,tide,0.8426731528233347,1.1561685185185187,14.4
|
63 |
+
Healthcare,timesfm,0.791330416522951,0.7994040740740741,7.8
|
64 |
+
Healthcare,visionts,0.8034595649263452,0.7956369841269841,12.8
|
65 |
+
Nature,auto_arima,0.9361489953148219,0.7095217542336816,14.4
|
66 |
+
Nature,auto_ets,1.206307582700855,76254692.36191763,16.933333333333334
|
67 |
+
Nature,auto_theta,5.150553292857,1.0319763336420513,16.4
|
68 |
+
Nature,chronos-small,0.9491555540245159,0.4571809141301187,9.666666666666666
|
69 |
+
Nature,chronos_base,0.8087692327609204,0.43483874046585586,8.733333333333333
|
70 |
+
Nature,chronos_large,0.7215301114853574,0.43339550624240464,8.2
|
71 |
+
Nature,crossformer,3.6857728593414816,1.8222011094109303,12.8
|
72 |
+
Nature,d_linear,1.6637383989524568,0.566411290270835,14.466666666666667
|
73 |
+
Nature,deepar,1.3368617172298543,0.784343921808244,11.866666666666667
|
74 |
+
Nature,i_transformer,1.0245163140037494,0.3923948352004396,6.533333333333333
|
75 |
+
Nature,moirai_1.1_R_base_no_leak,1.0846943426539009,0.42165427104639003,4.466666666666667
|
76 |
+
Nature,moirai_1.1_R_large_no_leak,0.9012043168826274,0.37755828000010155,4.133333333333334
|
77 |
+
Nature,moirai_1.1_R_small_no_leak,0.8636937125921123,0.400148437852242,4.4
|
78 |
+
Nature,n_beats,2.051183579793879,0.5865729148518006,14.533333333333333
|
79 |
+
Nature,naive,1.0153007149015423,1.5216687585771838,19.266666666666666
|
80 |
+
Nature,patch_tst,0.9757662316880771,0.40362241062795473,7.266666666666667
|
81 |
+
Nature,seasonal_naive,1.0,1.0,18.6
|
82 |
+
Nature,tft,1.3479799792947338,0.4024676715316202,7.466666666666667
|
83 |
+
Nature,tide,1.6518355265449745,0.648933595846154,13.466666666666667
|
84 |
+
Nature,timesfm,1.0759825145269837,0.38186899336385965,5.733333333333333
|
85 |
+
Nature,visionts,1.0368902840216354,0.4874755604862317,11.666666666666666
|
86 |
+
Sales,auto_arima,0.7716630938105196,0.4828581089269842,14.25
|
87 |
+
Sales,auto_ets,0.9017684593360312,30.76895095733506,17.25
|
88 |
+
Sales,auto_theta,0.8258637946630958,0.5029359984438486,14.5
|
89 |
+
Sales,chronos-small,0.7186805493171662,0.3848441218930615,9.0
|
90 |
+
Sales,chronos_base,0.7008513669210537,0.3850621220616795,7.25
|
91 |
+
Sales,chronos_large,0.7034956230009589,0.3844707529650858,7.25
|
92 |
+
Sales,crossformer,1.4946326987237475,7.655215975652274,20.75
|
93 |
+
Sales,d_linear,0.7999361690904114,0.5046638267307796,14.25
|
94 |
+
Sales,deepar,0.7388013496334613,0.3684882238444817,6.25
|
95 |
+
Sales,i_transformer,0.7592707077676131,0.37054645387589946,4.75
|
96 |
+
Sales,moirai_1.1_R_base_no_leak,0.667796706791987,0.5158002274792624,9.0
|
97 |
+
Sales,moirai_1.1_R_large_no_leak,0.6706792874796048,0.4063993273240754,5.25
|
98 |
+
Sales,moirai_1.1_R_small_no_leak,0.6717145779320488,0.4624986481003004,8.5
|
99 |
+
Sales,n_beats,0.7261206955984014,0.42619466975098175,11.0
|
100 |
+
Sales,naive,0.9988290398126464,0.9354131622562287,19.0
|
101 |
+
Sales,patch_tst,0.7506252415562384,0.36695813811595074,3.25
|
102 |
+
Sales,seasonal_naive,1.0,1.0,19.25
|
103 |
+
Sales,tft,0.7571398644569189,0.3639182778535524,8.0
|
104 |
+
Sales,tide,1.0042130411120884,0.5031160265435741,14.0
|
105 |
+
Sales,timesfm,0.6834660865486862,0.36525039257779146,2.75
|
106 |
+
Sales,visionts,0.8110052069079339,0.5234893692225551,15.5
|
107 |
+
Transport,auto_arima,1.067553229756302,0.7895352174994626,15.866666666666667
|
108 |
+
Transport,auto_ets,1.2519332616788197,62214211389283.484,18.333333333333332
|
109 |
+
Transport,auto_theta,1.080401746635928,1.484666133944374,18.733333333333334
|
110 |
+
Transport,chronos-small,0.8463413166527496,0.6018945114961274,10.066666666666666
|
111 |
+
Transport,chronos_base,0.8525884818870904,0.5855383296935212,8.066666666666666
|
112 |
+
Transport,chronos_large,0.847275145385676,0.5853558157193545,8.4
|
113 |
+
Transport,crossformer,2.133541126273085,2.824391583266013,10.466666666666667
|
114 |
+
Transport,d_linear,0.9088963832125505,0.7037765247623837,14.733333333333333
|
115 |
+
Transport,deepar,0.8113936242603784,0.5544948055430984,6.466666666666667
|
116 |
+
Transport,i_transformer,0.827077979022359,0.4999616864926626,5.866666666666666
|
117 |
+
Transport,moirai_1.1_R_base_no_leak,0.8561472773934119,0.47760992257555535,6.133333333333334
|
118 |
+
Transport,moirai_1.1_R_large_no_leak,0.9275713341627421,0.5021373535569643,6.666666666666667
|
119 |
+
Transport,moirai_1.1_R_small_no_leak,0.910054545689888,0.5002405082060885,8.333333333333334
|
120 |
+
Transport,n_beats,0.759611430343423,0.6406924635381934,12.6
|
121 |
+
Transport,naive,1.4793208069977917,2.2909473535610148,20.133333333333333
|
122 |
+
Transport,patch_tst,0.8021327551126702,0.5059201467965427,5.8
|
123 |
+
Transport,seasonal_naive,1.0,1.0,17.333333333333332
|
124 |
+
Transport,tft,0.8220948248404197,0.485294449011853,4.8
|
125 |
+
Transport,tide,0.8779217036886292,0.5698046392964627,10.4
|
126 |
+
Transport,timesfm,0.9283663454018408,0.577738758232893,8.133333333333333
|
127 |
+
Transport,visionts,0.8701382141384387,0.6655022406963716,13.666666666666666
|
128 |
+
Web/CloudOps,auto_arima,0.8940721359171526,0.9239632177767032,14.5
|
129 |
+
Web/CloudOps,auto_ets,1.1484024357848706,3541668.1195238987,17.05
|
130 |
+
Web/CloudOps,auto_theta,0.8325449233161077,0.7377937575734188,12.1
|
131 |
+
Web/CloudOps,chronos-small,1.1477382857881004,0.7519221963095372,10.4
|
132 |
+
Web/CloudOps,chronos_base,1.2983512147050473,0.8105248727247287,11.35
|
133 |
+
Web/CloudOps,chronos_large,1.3303495508509569,0.79130505302003,11.65
|
134 |
+
Web/CloudOps,crossformer,3.905809488486181,0.7282280986973914,11.35
|
135 |
+
Web/CloudOps,d_linear,1.679664570939319,0.813864694620387,12.55
|
136 |
+
Web/CloudOps,deepar,0.8567686630861442,0.7806071927900515,11.7
|
137 |
+
Web/CloudOps,i_transformer,0.7194432840929166,0.5224562708709003,4.5
|
138 |
+
Web/CloudOps,moirai_1.1_R_base_no_leak,1.0566434817767107,0.7682523197700815,9.25
|
139 |
+
Web/CloudOps,moirai_1.1_R_large_no_leak,0.7913068650225961,0.7415333306227597,8.35
|
140 |
+
Web/CloudOps,moirai_1.1_R_small_no_leak,0.797099135532333,0.7437898694659932,8.5
|
141 |
+
Web/CloudOps,n_beats,0.6423921434834379,0.6616483361015169,10.3
|
142 |
+
Web/CloudOps,naive,1.1134728329755728,1.1880618871151416,16.7
|
143 |
+
Web/CloudOps,patch_tst,0.6023812811006274,0.517794941208908,3.95
|
144 |
+
Web/CloudOps,seasonal_naive,1.0,1.0,16.35
|
145 |
+
Web/CloudOps,tft,1.3456759309631106,0.6485214709355084,5.95
|
146 |
+
Web/CloudOps,tide,0.957645003291147,0.6729746748245962,9.95
|
147 |
+
Web/CloudOps,timesfm,2.3672130873427584,0.9761625637942284,13.9
|
148 |
+
Web/CloudOps,visionts,0.8379189396040971,0.7244329358471615,10.65
|
results/grouped_results_by_frequency.csv
ADDED
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
frequency,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
|
2 |
+
10S,auto_arima,1.0,1.0,8.5
|
3 |
+
10S,auto_ets,1.7729614789749542,2.9904066969242904,19.5
|
4 |
+
10S,auto_theta,0.6490202004866038,0.4498416650221458,1.0
|
5 |
+
10S,chronos-small,2.4209056102186177,1.0976423357911036,10.833333333333334
|
6 |
+
10S,chronos_base,2.9496999072884034,1.2710971183275626,12.333333333333334
|
7 |
+
10S,chronos_large,3.0233397960335355,1.1850701939432435,11.666666666666666
|
8 |
+
10S,crossformer,10.78297867187441,0.9632338077490566,11.5
|
9 |
+
10S,d_linear,3.464549733595483,1.1289544947882015,11.666666666666666
|
10 |
+
10S,deepar,1.8262604211066729,1.028182807201692,11.166666666666666
|
11 |
+
10S,i_transformer,1.3136663904154584,0.6908224736822052,2.5
|
12 |
+
10S,moirai_1.1_R_base_no_leak,2.3417832113521384,1.4018041619123724,16.5
|
13 |
+
10S,moirai_1.1_R_large_no_leak,1.5520209642771396,1.2388948175006333,14.166666666666666
|
14 |
+
10S,moirai_1.1_R_small_no_leak,1.6476110189439555,1.3097769500283245,14.833333333333334
|
15 |
+
10S,n_beats,0.9967269159720157,0.821329634043785,6.666666666666667
|
16 |
+
10S,naive,1.9717621185326415,1.537476210057875,16.333333333333332
|
17 |
+
10S,patch_tst,1.059144382825294,0.732123599927497,5.0
|
18 |
+
10S,seasonal_naive,1.0,1.0,9.5
|
19 |
+
10S,tft,3.2429931822722793,1.0464406489926144,7.666666666666667
|
20 |
+
10S,tide,1.6709303370483655,0.9849620277425579,10.166666666666666
|
21 |
+
10S,timesfm,6.2343560526627115,1.8234677065697433,19.333333333333332
|
22 |
+
10S,visionts,1.0968489534913595,0.9341805708581102,10.166666666666666
|
23 |
+
10T,auto_arima,1.0,1.0,15.5
|
24 |
+
10T,auto_ets,1.655224521738566,2.151457928700739,16.666666666666668
|
25 |
+
10T,auto_theta,2.629806275955802,3.6738488709022494,19.666666666666668
|
26 |
+
10T,chronos-small,1.5117776833428394,0.6428724843624029,12.166666666666666
|
27 |
+
10T,chronos_base,1.2748386240996392,0.5553606639434913,8.833333333333334
|
28 |
+
10T,chronos_large,1.0346877837456316,0.5513783261338567,8.0
|
29 |
+
10T,crossformer,19934536143.591114,2.1015695650743917,11.5
|
30 |
+
10T,d_linear,1.299502689362975,0.6517937533428791,11.833333333333334
|
31 |
+
10T,deepar,0.6449733779553711,0.5921106678712438,10.666666666666666
|
32 |
+
10T,i_transformer,0.7796030121280376,0.590764568044439,6.833333333333333
|
33 |
+
10T,moirai_1.1_R_base_no_leak,0.9116352990069784,0.4334215808615318,5.0
|
34 |
+
10T,moirai_1.1_R_large_no_leak,1.0458495459083907,0.4946922044906253,7.5
|
35 |
+
10T,moirai_1.1_R_small_no_leak,0.5833924359839285,0.5920693990394471,8.5
|
36 |
+
10T,n_beats,1.2619963794129159,0.7808649076454232,13.166666666666666
|
37 |
+
10T,naive,0.7585251839995307,2.2345586312657204,19.333333333333332
|
38 |
+
10T,patch_tst,0.8811585844239113,0.5583960366038973,7.333333333333333
|
39 |
+
10T,seasonal_naive,1.0,1.0,16.5
|
40 |
+
10T,tft,1.2009513786157717,0.4124334801906368,4.333333333333333
|
41 |
+
10T,tide,1.1631853970897563,0.655064716092509,11.833333333333334
|
42 |
+
10T,timesfm,1.0220126106014014,0.5696636404743637,8.333333333333334
|
43 |
+
10T,visionts,1.0799543876471867,0.5014732753557328,7.5
|
44 |
+
15T,auto_arima,1.1006594936981116,0.9576282754708779,13.916666666666666
|
45 |
+
15T,auto_ets,1.2708218834942537,77767764236603.9,18.166666666666668
|
46 |
+
15T,auto_theta,0.9891316566321592,1.7607790421016025,17.0
|
47 |
+
15T,chronos-small,0.9607408272903095,0.7932548659240278,9.75
|
48 |
+
15T,chronos_base,0.9515313401712095,0.7673882941899328,8.5
|
49 |
+
15T,chronos_large,0.9444257867877283,0.7636894556076325,8.166666666666666
|
50 |
+
15T,crossformer,6.318458905448402,3.106359900469947,13.25
|
51 |
+
15T,d_linear,1.0317429035906032,0.9374198238688525,14.083333333333334
|
52 |
+
15T,deepar,1.9585178196595348,1.6805423348003437,14.5
|
53 |
+
15T,i_transformer,0.8998429174870893,0.658390212420039,3.5
|
54 |
+
15T,moirai_1.1_R_base_no_leak,1.010717233317842,0.7067981834067827,5.583333333333333
|
55 |
+
15T,moirai_1.1_R_large_no_leak,0.939877854260836,0.675873678929174,4.416666666666667
|
56 |
+
15T,moirai_1.1_R_small_no_leak,1.0967135488745525,0.7930862976995009,8.666666666666666
|
57 |
+
15T,n_beats,0.9635718857733048,0.9892577700794088,15.0
|
58 |
+
15T,naive,1.4410754166321356,2.423171378732352,19.75
|
59 |
+
15T,patch_tst,0.874459260826287,0.663243746282057,3.8333333333333335
|
60 |
+
15T,seasonal_naive,1.0,1.0,15.166666666666666
|
61 |
+
15T,tft,1.0582677267954164,0.7318724576230791,6.583333333333333
|
62 |
+
15T,tide,0.9787252364392137,0.8073681981088243,10.083333333333334
|
63 |
+
15T,timesfm,1.0222202532066558,0.7930028114468964,8.25
|
64 |
+
15T,visionts,0.9819059200401176,0.8783919060935471,12.833333333333334
|
65 |
+
5T,auto_arima,1.0,1.0,16.25
|
66 |
+
5T,auto_ets,1.0086175993752609,1.0467883914512524,15.5
|
67 |
+
5T,auto_theta,0.964060428373576,1.0460351344918941,16.666666666666668
|
68 |
+
5T,chronos-small,0.7802675296137581,0.7516150339138812,11.166666666666666
|
69 |
+
5T,chronos_base,0.7630965336585342,0.7494651012783219,11.25
|
70 |
+
5T,chronos_large,0.7718811079792048,0.7552409156866928,11.916666666666666
|
71 |
+
5T,crossformer,1.2734965922255723,1.040860998323226,11.416666666666666
|
72 |
+
5T,d_linear,0.9697247766365901,0.8333404387801792,14.5
|
73 |
+
5T,deepar,0.6722225413838093,0.7971245067076417,13.25
|
74 |
+
5T,i_transformer,0.6604920249872609,0.5433487060646278,5.5
|
75 |
+
5T,moirai_1.1_R_base_no_leak,0.6004862717957414,0.5407632644099643,4.75
|
76 |
+
5T,moirai_1.1_R_large_no_leak,0.5519935614821848,0.5321255655856548,3.25
|
77 |
+
5T,moirai_1.1_R_small_no_leak,0.5295698754870711,0.5360775348348382,4.416666666666667
|
78 |
+
5T,n_beats,0.6343946014070763,0.7257573125150426,12.416666666666666
|
79 |
+
5T,naive,0.8443455210066756,1.40000213208879,17.166666666666668
|
80 |
+
5T,patch_tst,0.5873226620946504,0.5432646554948832,4.75
|
81 |
+
5T,seasonal_naive,1.0,1.0,17.75
|
82 |
+
5T,tft,0.6434051298457072,0.5565798833891061,4.916666666666667
|
83 |
+
5T,tide,0.7950041895659674,0.64766319142376,10.083333333333334
|
84 |
+
5T,timesfm,0.8202436454444181,0.730338811538158,11.916666666666666
|
85 |
+
5T,visionts,0.8757565714382182,0.7336813267313066,12.166666666666666
|
86 |
+
A,auto_arima,1.0171428571428571,0.9420289855072463,10.0
|
87 |
+
A,auto_ets,0.9371428571428573,0.8043478260869564,3.0
|
88 |
+
A,auto_theta,0.9371428571428573,0.8333333333333333,6.0
|
89 |
+
A,chronos-small,1.0,1.0072463768115942,17.0
|
90 |
+
A,chronos_base,0.9771428571428573,0.9782608695652174,13.0
|
91 |
+
A,chronos_large,0.9771428571428573,0.9782608695652174,14.0
|
92 |
+
A,crossformer,6.857142857142858,102.89855072463767,21.0
|
93 |
+
A,d_linear,1.062857142857143,1.2173913043478262,20.0
|
94 |
+
A,deepar,1.0171428571428571,0.8188405797101449,4.0
|
95 |
+
A,i_transformer,1.0342857142857143,0.8478260869565217,7.0
|
96 |
+
A,moirai_1.1_R_base_no_leak,1.2057142857142857,0.9420289855072463,11.0
|
97 |
+
A,moirai_1.1_R_large_no_leak,0.9542857142857144,0.7753623188405796,1.0
|
98 |
+
A,moirai_1.1_R_small_no_leak,0.9771428571428573,0.8260869565217391,5.0
|
99 |
+
A,n_beats,0.9028571428571429,0.9710144927536232,12.0
|
100 |
+
A,naive,1.0,0.9927536231884058,15.0
|
101 |
+
A,patch_tst,1.0057142857142858,0.8478260869565217,8.0
|
102 |
+
A,seasonal_naive,1.0,1.0,16.0
|
103 |
+
A,tft,0.9257142857142858,0.7971014492753623,2.0
|
104 |
+
A,tide,1.2057142857142857,1.1231884057971013,18.0
|
105 |
+
A,timesfm,0.9714285714285715,0.8478260869565217,9.0
|
106 |
+
A,visionts,1.0914285714285714,1.1521739130434783,19.0
|
107 |
+
D,auto_arima,0.8529021613038067,0.4985004151026563,11.0
|
108 |
+
D,auto_ets,0.9328318534943983,9.346363276951774,14.333333333333334
|
109 |
+
D,auto_theta,0.9291332918118479,0.5748243358418598,14.333333333333334
|
110 |
+
D,chronos-small,0.7368243034917711,0.43658660925543036,8.066666666666666
|
111 |
+
D,chronos_base,0.6862749899568191,0.421268869505568,6.733333333333333
|
112 |
+
D,chronos_large,0.6959915152342351,0.4206978284635147,6.666666666666667
|
113 |
+
D,crossformer,154.59928198095446,26.923731402565622,18.4
|
114 |
+
D,d_linear,0.8979716998083668,0.6146338678499457,15.133333333333333
|
115 |
+
D,deepar,0.8010158939552874,0.5938211197218363,11.066666666666666
|
116 |
+
D,i_transformer,0.8178290414664529,0.4945942845202976,8.466666666666667
|
117 |
+
D,moirai_1.1_R_base_no_leak,0.7453788326511405,0.4633929439816138,7.466666666666667
|
118 |
+
D,moirai_1.1_R_large_no_leak,0.6628066770710596,0.39170739564229085,4.333333333333333
|
119 |
+
D,moirai_1.1_R_small_no_leak,0.7086474258839043,0.41290002863504544,6.2
|
120 |
+
D,n_beats,0.7960758852718125,0.571280865579523,14.4
|
121 |
+
D,naive,1.0,0.7975936297433697,17.6
|
122 |
+
D,patch_tst,0.7522257716055208,0.43688965053339623,7.2
|
123 |
+
D,seasonal_naive,1.0,1.0,19.0
|
124 |
+
D,tft,0.7622852957686599,0.4136783788887685,6.666666666666667
|
125 |
+
D,tide,0.9969048876212023,0.7484054326947167,13.533333333333333
|
126 |
+
D,timesfm,0.779898124642687,0.49167132325793367,5.6
|
127 |
+
D,visionts,0.8948792108968546,0.5558947866812461,14.8
|
128 |
+
H,auto_arima,0.9539421964948747,0.7767300171583383,15.483870967741936
|
129 |
+
H,auto_ets,1.328807641048494,4.448217317487267e+23,18.774193548387096
|
130 |
+
H,auto_theta,3.078013295555958,1.851945783529741,18.870967741935484
|
131 |
+
H,chronos-small,0.7327688330840401,0.5144358964770991,8.580645161290322
|
132 |
+
H,chronos_base,0.7385494318069303,0.505942185952322,7.806451612903226
|
133 |
+
H,chronos_large,0.7478851839242808,0.5091430423436873,8.0
|
134 |
+
H,crossformer,5676075428.759688,5.282163959765436,11.0
|
135 |
+
H,d_linear,1.3543590165990786,0.6813103317146129,14.129032258064516
|
136 |
+
H,deepar,1.1775584990695886,0.8933529848405231,10.870967741935484
|
137 |
+
H,i_transformer,0.960319640638691,0.476223391238969,6.129032258064516
|
138 |
+
H,moirai_1.1_R_base_no_leak,0.9074224107057469,0.46296554507116067,5.290322580645161
|
139 |
+
H,moirai_1.1_R_large_no_leak,0.8513557808071407,0.49904989135530137,6.548387096774194
|
140 |
+
H,moirai_1.1_R_small_no_leak,0.8133232904541264,0.4778830172118464,6.32258064516129
|
141 |
+
H,n_beats,1.3440056550191246,0.6646047122901354,13.419354838709678
|
142 |
+
H,naive,1.3316850451655482,1.8366794323255176,19.64516129032258
|
143 |
+
H,patch_tst,0.9278036240195197,0.46244079149616185,4.967741935483871
|
144 |
+
H,seasonal_naive,1.0,1.0,17.64516129032258
|
145 |
+
H,tft,1.1074721197519513,0.48721235955499614,6.645161290322581
|
146 |
+
H,tide,1.2515715821404725,0.5695063335623101,10.580645161290322
|
147 |
+
H,timesfm,0.9815849142972823,0.5102566138085366,8.35483870967742
|
148 |
+
H,visionts,0.989251601317308,0.6105910361801898,11.935483870967742
|
149 |
+
M,auto_arima,0.7897292418553448,0.7664432031389335,6.2
|
150 |
+
M,auto_ets,0.8246966155843344,0.7720428958818089,5.4
|
151 |
+
M,auto_theta,0.9196256618111756,0.8821604346726062,8.4
|
152 |
+
M,chronos-small,0.8622647029230233,0.8311626407190348,8.6
|
153 |
+
M,chronos_base,0.8946217478533949,0.8612725657711913,8.6
|
154 |
+
M,chronos_large,0.8472484866537497,0.81475133159962,8.2
|
155 |
+
M,crossformer,10.544298745396533,67.95761479628871,13.0
|
156 |
+
M,d_linear,1.081950231233934,1.1996315851033383,15.8
|
157 |
+
M,deepar,1.1770718831957356,1.0978451288049407,11.8
|
158 |
+
M,i_transformer,1.0050739577178593,0.8211301131061257,5.6
|
159 |
+
M,moirai_1.1_R_base_no_leak,1.894529314848318,1.4962012401441718,19.2
|
160 |
+
M,moirai_1.1_R_large_no_leak,0.9965918207836234,0.9172428125972104,9.6
|
161 |
+
M,moirai_1.1_R_small_no_leak,1.3843487814465443,1.2213717006565714,17.0
|
162 |
+
M,n_beats,0.9414022389216479,1.007258419666447,11.4
|
163 |
+
M,naive,1.2968159580843228,1.5810738327920664,19.2
|
164 |
+
M,patch_tst,0.9895171231152233,0.8480243667176828,7.6
|
165 |
+
M,seasonal_naive,1.0,1.0,13.4
|
166 |
+
M,tft,1.0540757144083999,0.8748338687935762,8.2
|
167 |
+
M,tide,1.0388749071339713,1.2356707678524768,15.4
|
168 |
+
M,timesfm,0.8627984408713761,0.7375099053093932,3.6
|
169 |
+
M,visionts,0.9379289156090017,1.0376995780191705,14.8
|
170 |
+
Q,auto_arima,0.8591549295774649,0.8225806451612904,5.0
|
171 |
+
Q,auto_ets,0.8591549295774649,0.7983870967741936,4.0
|
172 |
+
Q,auto_theta,0.8380281690140845,0.7973790322580646,2.0
|
173 |
+
Q,chronos-small,0.8239436619718311,0.8457661290322581,11.0
|
174 |
+
Q,chronos_base,0.8098591549295776,0.8397177419354839,8.0
|
175 |
+
Q,chronos_large,0.8098591549295776,0.8397177419354839,9.0
|
176 |
+
Q,crossformer,9.929577464788732,119.95967741935485,21.0
|
177 |
+
Q,d_linear,0.9859154929577466,1.1088709677419355,19.0
|
178 |
+
Q,deepar,0.9436619718309861,0.840725806451613,10.0
|
179 |
+
Q,i_transformer,0.9084507042253522,0.7973790322580646,3.0
|
180 |
+
Q,moirai_1.1_R_base_no_leak,1.4295774647887327,1.1290322580645162,20.0
|
181 |
+
Q,moirai_1.1_R_large_no_leak,0.8873239436619719,0.7883064516129034,1.0
|
182 |
+
Q,moirai_1.1_R_small_no_leak,1.0352112676056338,0.9324596774193549,13.0
|
183 |
+
Q,n_beats,0.8380281690140845,0.9717741935483871,15.0
|
184 |
+
Q,naive,0.9295774647887325,0.9506048387096774,14.0
|
185 |
+
Q,patch_tst,0.9366197183098592,0.8346774193548387,6.0
|
186 |
+
Q,seasonal_naive,1.0,1.0,16.0
|
187 |
+
Q,tft,0.9366197183098592,0.8366935483870969,7.0
|
188 |
+
Q,tide,1.1338028169014085,1.0181451612903227,17.0
|
189 |
+
Q,timesfm,0.8802816901408451,0.8528225806451613,12.0
|
190 |
+
Q,visionts,0.9366197183098592,1.0483870967741935,18.0
|
191 |
+
W,auto_arima,0.9759738266715013,0.748994017923637,9.875
|
192 |
+
W,auto_ets,0.971794800090248,0.7889859594373794,10.375
|
193 |
+
W,auto_theta,1.0807827233498426,0.8086208938554269,12.125
|
194 |
+
W,chronos-small,0.7075714340716913,0.554913070787174,5.25
|
195 |
+
W,chronos_base,0.7288517030693448,0.5619860060584291,4.5
|
196 |
+
W,chronos_large,0.7069908074836505,0.5515544393043641,4.375
|
197 |
+
W,crossformer,7.4085130347732155,49.820115045230985,20.375
|
198 |
+
W,d_linear,1.1283846019672135,0.97517397862518,16.875
|
199 |
+
W,deepar,1.862654170783469,1.3453366562739022,12.75
|
200 |
+
W,i_transformer,1.9015274495154941,1.3308062694365717,12.375
|
201 |
+
W,moirai_1.1_R_base_no_leak,0.9495981825008816,0.7483199869474646,9.75
|
202 |
+
W,moirai_1.1_R_large_no_leak,0.8957713665163969,0.6412758290160822,5.125
|
203 |
+
W,moirai_1.1_R_small_no_leak,0.9814158946324417,0.7581734604096699,8.625
|
204 |
+
W,n_beats,1.4457143774506727,1.0793531840148527,15.25
|
205 |
+
W,naive,1.0,0.875913187952162,13.625
|
206 |
+
W,patch_tst,0.9073153465406218,0.6981022157766977,8.0
|
207 |
+
W,seasonal_naive,1.0,1.0,16.5
|
208 |
+
W,tft,1.0396666971907644,0.7794595632824765,11.25
|
209 |
+
W,tide,1.7669411056805098,1.2225931248216542,13.125
|
210 |
+
W,timesfm,0.8615386867667885,0.6305866856856626,4.875
|
211 |
+
W,visionts,1.1355217147765972,0.9990150254993866,16.0
|
results/grouped_results_by_term_length.csv
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
term_length,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
|
2 |
+
long,auto_arima,1.0407648178370423,0.8433829698076462,15.095238095238095
|
3 |
+
long,auto_ets,1.2759037139699805,6.566416040544638e+23,18.666666666666668
|
4 |
+
long,auto_theta,3.1498850578770834,2.0601505251654686,17.38095238095238
|
5 |
+
long,chronos-small,1.0106686030682668,0.6462623576388796,11.857142857142858
|
6 |
+
long,chronos_base,0.9982206684476166,0.619712893481198,10.714285714285714
|
7 |
+
long,chronos_large,1.002171192668077,0.6208511887480599,11.047619047619047
|
8 |
+
long,crossformer,8164023382.865999,0.5072278949697392,8.285714285714286
|
9 |
+
long,d_linear,1.562322390892946,0.6730209945455377,13.238095238095237
|
10 |
+
long,deepar,1.701880985304897,0.9481973435022004,12.714285714285714
|
11 |
+
long,i_transformer,0.9961649824755919,0.45539997968708557,5.190476190476191
|
12 |
+
long,moirai_1.1_R_base_no_leak,1.0906983483127686,0.5005193850296795,5.428571428571429
|
13 |
+
long,moirai_1.1_R_large_no_leak,1.0073695402863057,0.538721786921065,6.761904761904762
|
14 |
+
long,moirai_1.1_R_small_no_leak,0.9192503918318538,0.5613725349527969,7.238095238095238
|
15 |
+
long,n_beats,1.1138460516712445,0.6641343878789057,11.952380952380953
|
16 |
+
long,naive,1.3505564252932616,2.111806155806968,19.61904761904762
|
17 |
+
long,patch_tst,0.9616382830362623,0.437002893484988,4.428571428571429
|
18 |
+
long,seasonal_naive,1.0,1.0,16.857142857142858
|
19 |
+
long,tft,1.1557571021609558,0.4529431176291461,4.904761904761905
|
20 |
+
long,tide,1.2799667076250163,0.5247494742051094,8.761904761904763
|
21 |
+
long,timesfm,1.796874605878608,0.6535892285331686,11.285714285714286
|
22 |
+
long,visionts,1.0324758074134404,0.5463144403056718,9.571428571428571
|
23 |
+
medium,auto_arima,0.9925217451047474,0.862989330588506,14.571428571428571
|
24 |
+
medium,auto_ets,1.5910566618211812,8307478.842344518,18.333333333333332
|
25 |
+
medium,auto_theta,1.8722187149301004,1.9594402227791685,17.428571428571427
|
26 |
+
medium,chronos-small,1.232070637409101,0.7382117459387273,11.476190476190476
|
27 |
+
medium,chronos_base,1.3793840327174318,0.777089828453266,11.571428571428571
|
28 |
+
medium,chronos_large,1.3264659611435963,0.7511074564157023,10.904761904761905
|
29 |
+
medium,crossformer,2510838525.595493,0.6608294014146207,8.571428571428571
|
30 |
+
medium,d_linear,1.389713789881514,0.766265949645093,13.857142857142858
|
31 |
+
medium,deepar,1.0742968052431434,0.8271972158292309,10.523809523809524
|
32 |
+
medium,i_transformer,1.0152562339844198,0.5288800110916518,4.619047619047619
|
33 |
+
medium,moirai_1.1_R_base_no_leak,1.1063058012973035,0.6371675078184162,6.380952380952381
|
34 |
+
medium,moirai_1.1_R_large_no_leak,0.9602362541309903,0.6248351913119883,6.761904761904762
|
35 |
+
medium,moirai_1.1_R_small_no_leak,0.9306757114627328,0.6605293213864295,7.523809523809524
|
36 |
+
medium,n_beats,1.1513061909700455,0.759822180558971,12.952380952380953
|
37 |
+
medium,naive,1.3041664719836585,2.0849508725128736,19.285714285714285
|
38 |
+
medium,patch_tst,0.9034466134216352,0.5192209987815241,4.0
|
39 |
+
medium,seasonal_naive,1.0,1.0,16.142857142857142
|
40 |
+
medium,tft,1.2614988599396852,0.5286354727281315,4.809523809523809
|
41 |
+
medium,tide,1.193587520563658,0.6109994159383925,9.523809523809524
|
42 |
+
medium,timesfm,1.524928039707925,0.8138067254337588,10.80952380952381
|
43 |
+
medium,visionts,1.0232867065677744,0.6690986273300213,10.952380952380953
|
44 |
+
short,auto_arima,0.9183158404157642,0.7782297371914904,11.872727272727273
|
45 |
+
short,auto_ets,1.0290159244978938,1287882.1620396667,13.781818181818181
|
46 |
+
short,auto_theta,1.1276737692944792,0.9555577026305212,13.527272727272727
|
47 |
+
short,chronos-small,0.7809947822060929,0.5973093408946208,7.254545454545455
|
48 |
+
short,chronos_base,0.7504208829287357,0.5900291013554769,6.163636363636364
|
49 |
+
short,chronos_large,0.7517445876583578,0.5860508463835682,6.2727272727272725
|
50 |
+
short,crossformer,1298062866.890411,28.58975534685244,17.581818181818182
|
51 |
+
short,d_linear,1.1372631367933204,0.8759004934560309,15.145454545454545
|
52 |
+
short,deepar,1.0793364184925522,1.0306040328879058,11.89090909090909
|
53 |
+
short,i_transformer,0.9683901902602449,0.7276300854958668,7.490909090909091
|
54 |
+
short,moirai_1.1_R_base_no_leak,0.9406066725109657,0.7078567359533983,8.836363636363636
|
55 |
+
short,moirai_1.1_R_large_no_leak,0.7726332258609173,0.6016193373717869,5.4
|
56 |
+
short,moirai_1.1_R_small_no_leak,0.8487478588017764,0.6661776706416213,8.145454545454545
|
57 |
+
short,n_beats,1.0323393751593084,0.8195659204638793,13.872727272727273
|
58 |
+
short,naive,1.104608870534287,1.20246170110538,17.21818181818182
|
59 |
+
short,patch_tst,0.80382107933716,0.6279448956983675,6.872727272727273
|
60 |
+
short,seasonal_naive,1.0,1.0,16.745454545454546
|
61 |
+
short,tft,1.0461332316971086,0.6755258572212629,8.218181818181819
|
62 |
+
short,tide,1.1230276648969284,0.9169276487302415,13.418181818181818
|
63 |
+
short,timesfm,0.9286461705272189,0.6445361651154075,6.636363636363637
|
64 |
+
short,visionts,0.9467929371136616,0.8196918268462717,14.654545454545454
|
results/grouped_results_by_univariate.csv
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
univariate,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
|
2 |
+
False,auto_arima,0.9804811342495535,0.8713104754311666,13.906976744186046
|
3 |
+
False,auto_ets,1.3150898838758078,28247779.184341986,17.023255813953487
|
4 |
+
False,auto_theta,2.411812537314736,1.1875727874520958,14.976744186046512
|
5 |
+
False,chronos-small,1.0763507868085402,0.6645935333486652,9.511627906976743
|
6 |
+
False,chronos_base,1.108253824730984,0.6827654193124292,9.44186046511628
|
7 |
+
False,chronos_large,1.0950741555125842,0.6729041022551422,9.55813953488372
|
8 |
+
False,crossformer,2.654080994925351,0.9189271735725226,12.906976744186046
|
9 |
+
False,d_linear,1.6213889770481567,0.7823221048850953,13.488372093023257
|
10 |
+
False,deepar,1.5448649630735394,1.177574529017474,14.0
|
11 |
+
False,i_transformer,1.1492193189093056,0.6224209228728989,5.186046511627907
|
12 |
+
False,moirai_1.1_R_base_no_leak,1.0862615181082176,0.6282749871527136,6.27906976744186
|
13 |
+
False,moirai_1.1_R_large_no_leak,0.9324785498208167,0.6271354928559256,6.674418604651163
|
14 |
+
False,moirai_1.1_R_small_no_leak,0.8841632883529932,0.6261701600960712,6.325581395348837
|
15 |
+
False,n_beats,1.4270416026611565,0.7348091745010178,12.581395348837209
|
16 |
+
False,naive,1.2123354003513602,1.3984175456034713,17.697674418604652
|
17 |
+
False,patch_tst,0.8748906969616712,0.5267294212715652,5.023255813953488
|
18 |
+
False,seasonal_naive,1.0,1.0,16.511627906976745
|
19 |
+
False,tft,1.3981834112189448,0.6093381204940588,6.906976744186046
|
20 |
+
False,tide,1.43858389278792,0.8200701505564333,11.395348837209303
|
21 |
+
False,timesfm,1.714449538660074,0.7498374834291154,10.069767441860465
|
22 |
+
False,visionts,1.0423141526857782,0.7009754803331653,11.534883720930232
|
23 |
+
True,auto_arima,0.9452907457761817,0.7624093224131667,12.555555555555555
|
24 |
+
True,auto_ets,1.1157996061544226,2.553606237989581e+23,14.87037037037037
|
25 |
+
True,auto_theta,1.1810795821409918,1.5907675831680725,15.38888888888889
|
26 |
+
True,chronos-small,0.8105398747887519,0.6175638516729869,8.88888888888889
|
27 |
+
True,chronos_base,0.8064432004095214,0.6004730500877874,7.425925925925926
|
28 |
+
True,chronos_large,0.7992434048928462,0.5946120685660026,7.314814814814815
|
29 |
+
True,crossformer,5473436252.639601,28.841701459950688,14.185185185185185
|
30 |
+
True,d_linear,1.0152315982745468,0.8288833788523304,15.222222222222221
|
31 |
+
True,deepar,0.9487785778969822,0.8024222742045092,10.0
|
32 |
+
True,i_transformer,0.8534236166087696,0.6282487930572502,7.314814814814815
|
33 |
+
True,moirai_1.1_R_base_no_leak,0.9474297935418754,0.6631059403644487,8.592592592592593
|
34 |
+
True,moirai_1.1_R_large_no_leak,0.8095920641262695,0.5658691463988443,5.444444444444445
|
35 |
+
True,moirai_1.1_R_small_no_leak,0.8798251667426911,0.6550813700792939,9.0
|
36 |
+
True,n_beats,0.7960013669407282,0.803378130725278,13.796296296296296
|
37 |
+
True,naive,1.1920786390202338,1.7432469758252465,18.574074074074073
|
38 |
+
True,patch_tst,0.8473452262927366,0.5920056645616604,6.277777777777778
|
39 |
+
True,seasonal_naive,1.0,1.0,16.74074074074074
|
40 |
+
True,tft,0.8921817826859224,0.5845469141375511,6.648148148148148
|
41 |
+
True,tide,0.9602232031743085,0.7225690907268552,11.703703703703704
|
42 |
+
True,timesfm,0.8724456439616861,0.6300331912444614,7.333333333333333
|
43 |
+
True,visionts,0.9337984401734895,0.7493477637512667,13.722222222222221
|
src/about.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
|
|
4 |
@dataclass
|
5 |
class Task:
|
6 |
benchmark: str
|
@@ -12,61 +13,85 @@ class Task:
|
|
12 |
# ---------------------------------------------------
|
13 |
class Tasks(Enum):
|
14 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
15 |
-
task0 = Task("
|
16 |
-
task1 = Task("
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
# ---------------------------------------------------
|
20 |
|
21 |
|
22 |
-
|
23 |
# Your leaderboard name
|
24 |
-
TITLE = """<h1 align="center" id="space-title">
|
25 |
|
26 |
# What does your leaderboard evaluate?
|
27 |
INTRODUCTION_TEXT = """
|
28 |
-
|
|
|
|
|
|
|
29 |
"""
|
30 |
|
31 |
# Which evaluations are you running? how can people reproduce what you have?
|
32 |
LLM_BENCHMARKS_TEXT = f"""
|
33 |
-
|
|
|
34 |
|
35 |
-
|
36 |
-
To reproduce our results, here is the commands you can run:
|
37 |
|
38 |
-
|
|
|
|
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
from transformers import AutoConfig, AutoModel, AutoTokenizer
|
46 |
-
config = AutoConfig.from_pretrained("your model name", revision=revision)
|
47 |
-
model = AutoModel.from_pretrained("your model name", revision=revision)
|
48 |
-
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
|
49 |
```
|
50 |
-
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
|
51 |
|
52 |
-
|
53 |
-
Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
|
54 |
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
|
59 |
-
|
|
|
|
|
60 |
|
61 |
-
|
62 |
-
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
|
63 |
|
64 |
-
## In case of model failure
|
65 |
-
If your model is displayed in the `FAILED` category, its execution stopped.
|
66 |
-
Make sure you have followed the above steps first.
|
67 |
-
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
|
68 |
"""
|
69 |
|
70 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
71 |
CITATION_BUTTON_TEXT = r"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
"""
|
|
|
1 |
from dataclasses import dataclass
|
2 |
from enum import Enum
|
3 |
|
4 |
+
|
5 |
@dataclass
|
6 |
class Task:
|
7 |
benchmark: str
|
|
|
13 |
# ---------------------------------------------------
|
14 |
class Tasks(Enum):
|
15 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
16 |
+
# task0 = Task("boolq", "acc", "BoolQA")
|
17 |
+
task1 = Task("trivia", "EM", "TriviaQA")
|
18 |
+
task2 = Task("truthfulqa", "EM", "TruthfulQA")
|
19 |
+
task3 = Task("popqa", "acc", "PopQA")
|
20 |
+
task4 = Task("hpqa", "EM", "HotpotQA")
|
21 |
+
task5 = Task("nq", "EM", "Natural Questions")
|
22 |
+
task6 = Task("2wiki", "EM", "2WikiMultiHop")
|
23 |
+
task7 = Task("musique", "EM", "MuSiQue")
|
24 |
+
# task0 = Task("anli_r1", "acc", "ANLI")
|
25 |
+
# task1 = Task("logiqa", "acc_norm", "LogiQA")
|
26 |
+
|
27 |
+
|
28 |
+
NUM_FEWSHOT = 0 # Change with your few shot
|
29 |
# ---------------------------------------------------
|
30 |
|
31 |
|
|
|
32 |
# Your leaderboard name
|
33 |
+
TITLE = """<h1 align="center" id="space-title">GIFT-Eval Time Series Forecasting Leaderboard</h1>"""
|
34 |
|
35 |
# What does your leaderboard evaluate?
|
36 |
INTRODUCTION_TEXT = """
|
37 |
+
[Placeholder] We introduce the General TIme Series ForecasTing Model Evaluation, GIFT-Eval,
|
38 |
+
a pioneering benchmark aimed at promoting evaluation across diverse datasets.
|
39 |
+
GIFT-Eval encompasses 28 datasets over 144,000 time series and 177 million data
|
40 |
+
points, spanning seven domains, 10 frequencies, multivariate inputs, and prediction lengths ranging from short to long-term forecasts.
|
41 |
"""
|
42 |
|
43 |
# Which evaluations are you running? how can people reproduce what you have?
|
44 |
LLM_BENCHMARKS_TEXT = f"""
|
45 |
+
How It Works
|
46 |
+
To participate in the ContextualBench leaderboard, follow these steps to evaluate your Large Language Model (LLM) using the ContextualBench framework:
|
47 |
|
48 |
+
Clone the Repository: Start by cloning the ContextualBench GitHub repository to your local machine using the following command:
|
|
|
49 |
|
50 |
+
```bash
|
51 |
+
git clone https://github.com/SalesforceAIResearch/SFR-RAG
|
52 |
+
```
|
53 |
|
54 |
+
Navigate to the Directory: Move into the cloned repository's directory:
|
55 |
+
|
56 |
+
|
57 |
+
```bash
|
58 |
+
cd ContextualBench
|
|
|
|
|
|
|
|
|
59 |
```
|
|
|
60 |
|
61 |
+
Install Dependencies: Install all necessary dependencies by executing:
|
|
|
62 |
|
63 |
+
```bash
|
64 |
+
pip install -r requirements.txt
|
65 |
+
```
|
66 |
+
|
67 |
+
Prepare Your Model and Dataset: Set up your model and dataset according to the guidelines provided in the repository's documentation.
|
68 |
+
Run the Evaluation Script: Execute the evaluation script to generate outputs for your model on the specified dataset:
|
69 |
+
|
70 |
+
|
71 |
+
```bash
|
72 |
+
python run.py [dataset_name]
|
73 |
+
```
|
74 |
+
|
75 |
+
Collect and Format Outputs: Gather the outputs generated for each dataset and format them according to the leaderboard submission guidelines.
|
76 |
+
Submit Your Results: Email the formatted outputs to the author's email address for evaluation. Our team will assess the performance and update the leaderboard accordingly.
|
77 |
|
78 |
+
Reproducibility
|
79 |
+
Ensuring reproducibility is a key aspect of the ContextualBench leaderboard.
|
80 |
+
By following the standardized steps outlined above, participants can consistently reproduce evaluation results. This process not only facilitates fair comparisons across different models but also encourages transparency and reliability in model assessments. Participants are encouraged to adhere strictly to the submission guidelines to ensure their results are accurately reflected on the leaderboard.
|
81 |
+
"""
|
82 |
|
83 |
+
EVALUATION_QUEUE_TEXT = """
|
|
|
84 |
|
|
|
|
|
|
|
|
|
85 |
"""
|
86 |
|
87 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
88 |
CITATION_BUTTON_TEXT = r"""
|
89 |
+
@article{
|
90 |
+
aksu2024gifteval,
|
91 |
+
title={{GIFT}-Eval: A Benchmark for General Time Series Forecasting Model Evaluation},
|
92 |
+
author={Taha Aksu and Gerald Woo and Juncheng Liu and Xu Liu and Chenghao Liu and Silvio Savarese and Caiming Xiong and Doyen Sahoo},
|
93 |
+
booktitle={NeurIPS Workshop on Time Series in the Age of Large Models},
|
94 |
+
year={2024},
|
95 |
+
url={https://openreview.net/forum?id=Z2cMOOANFX}
|
96 |
+
}
|
97 |
"""
|
src/populate.py
CHANGED
@@ -6,16 +6,34 @@ import pandas as pd
|
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
9 |
-
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
-
raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
|
|
|
|
18 |
df = df[cols].round(decimals=2)
|
|
|
19 |
|
20 |
# filter out if any of the benchmarks have not been produced
|
21 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
@@ -39,7 +57,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
|
|
39 |
all_evals.append(data)
|
40 |
elif ".md" not in entry:
|
41 |
# this is a folder
|
42 |
-
sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if
|
43 |
for sub_entry in sub_entries:
|
44 |
file_path = os.path.join(save_path, entry, sub_entry)
|
45 |
with open(file_path) as fp:
|
|
|
6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
7 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
8 |
from src.leaderboard.read_evals import get_raw_eval_results
|
9 |
+
import ipdb
|
10 |
|
11 |
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
12 |
"""Creates a dataframe from all the individual experiment results"""
|
13 |
+
# raw_data = get_raw_eval_results(results_path, requests_path)
|
14 |
+
# print('results_path:', results_path)
|
15 |
+
# all_data_json = [v.to_dict() for v in raw_data]
|
16 |
+
# print(f"The raw data is {all_data_json}")
|
17 |
+
#
|
18 |
+
# df = pd.DataFrame.from_records(all_data_json)
|
19 |
+
df = pd.read_csv(results_path)
|
20 |
+
# df = pd.read_csv('LOTSAv2_EvalBenchmark(Long).csv')
|
21 |
+
# Step 2: Pivot the DataFrame
|
22 |
+
df = df.pivot_table(index='model',
|
23 |
+
columns='dataset',
|
24 |
+
values='eval_metrics/MAE[0.5]',
|
25 |
+
aggfunc='first')
|
26 |
+
df.drop(columns=['ALL'], inplace=True)
|
27 |
+
df['Average'] = df.mean(axis=1)
|
28 |
+
# Reset the index if you want the model column to be part of the DataFrame
|
29 |
+
df.reset_index(inplace=True)
|
30 |
+
print(f"DF at stage 1 ********** {df}")
|
31 |
+
# ipdb.set_trace()
|
32 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
33 |
+
# df = df.sort_values(by=[AutoEvalColumn.__dataclass_fields__['average'].name], ascending=False)
|
34 |
+
print(f"DF at stage 2 ********** {df}")
|
35 |
df = df[cols].round(decimals=2)
|
36 |
+
print(f"DF at stage 3 ********** {df}")
|
37 |
|
38 |
# filter out if any of the benchmarks have not been produced
|
39 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
|
|
57 |
all_evals.append(data)
|
58 |
elif ".md" not in entry:
|
59 |
# this is a folder
|
60 |
+
sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
|
61 |
for sub_entry in sub_entries:
|
62 |
file_path = os.path.join(save_path, entry, sub_entry)
|
63 |
with open(file_path) as fp:
|
src/utils.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
def norm_sNavie(df):
    """Return a copy of *df* with every non-``model`` column divided by the
    ``seasonal_naive`` baseline.

    The baseline is the first row whose ``model`` value equals
    ``'seasonal_naive'``.  Each other column is divided by that row's value
    in the same column, so the baseline row itself becomes 1.0 wherever its
    value is non-zero.  The input frame is left unmodified.

    Args:
        df: DataFrame with a ``model`` column; all remaining columns must
            support division by a scalar.

    Returns:
        A new DataFrame with the same columns and row order as *df*.

    Raises:
        KeyError: if *df* has no ``model`` column.
        ValueError: if *df* contains no ``seasonal_naive`` row.
    """
    baseline_rows = df[df['model'] == 'seasonal_naive']
    if baseline_rows.empty:
        # Fail with an actionable message instead of the bare positional
        # indexer error that ``.iloc[0]`` would raise on an empty selection.
        raise ValueError(
            "Cannot normalize: no row with model == 'seasonal_naive' found."
        )
    seasonal_naive_row = baseline_rows.iloc[0]

    df_normalized = df.copy()
    for column in df.columns:
        if column != 'model':  # the identifier column is not a metric
            df_normalized[column] = df[column] / seasonal_naive_row[column]
    return df_normalized
|
10 |
+
|
11 |
+
def pivot_df(file_name, tab_name):
    """Load a grouped-results CSV and pivot it to one row per model.

    The CSV is expected to contain a ``model`` column, a grouping column
    named *tab_name*, and one or more metric columns.  Every column other
    than those two is treated as a metric.  The result has a ``model``
    column followed by one column per (group value, metric) pair, labelled
    ``"<group value> (<metric>)"``.

    Args:
        file_name: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        tab_name: Name of the grouping column.  The special value
            ``'univariate'`` has its boolean values relabelled to
            ``'univariate'`` / ``'multivariate'`` and the column renamed to
            ``variate_type`` before pivoting.

    Returns:
        A DataFrame indexed 0..n-1 with ``model`` as the first column.
    """
    df = pd.read_csv(file_name)

    if tab_name == 'univariate':
        # Turn the boolean flag into readable labels so the generated column
        # headers read "univariate (...)" / "multivariate (...)".
        df['univariate'] = df['univariate'].replace({True: 'univariate', False: 'multivariate'})
        df = df.rename(columns={'univariate': 'variate_type'})
        tab_name = 'variate_type'

    # Long format: one row per (group, model, metric).
    df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
    # Shorten the raw metric keys to their display names; other metric
    # names (e.g. "rank") pass through unchanged.
    df_melted['metric'] = df_melted['metric'].replace({
        'eval_metrics/MAPE[0.5]': 'MAPE',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS'
    })

    # Wide format: pivot_table's default aggregation (mean) applies if a
    # (model, group, metric) combination occurs more than once.
    df_pivot = df_melted.pivot_table(index='model', columns=[tab_name, 'metric'], values='value')
    # Flatten the two-level column index into "<group> (<metric>)" labels.
    df_pivot.columns = [f'{group} ({metric})' for group, metric in df_pivot.columns]
    return df_pivot.reset_index()
|