juncliu committed
Commit
409ae36
·
1 Parent(s): 1490aed

initial demo

app.py CHANGED
@@ -27,29 +27,62 @@ from src.display.utils import (
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
 
 
30
 
31
 
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
 
35
  ### Space initialisation
36
  try:
37
  print(EVAL_REQUESTS_PATH)
38
  snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
 
40
  )
41
  except Exception:
42
  restart_space()
43
  try:
44
  print(EVAL_RESULTS_PATH)
45
  snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
 
47
  )
48
  except Exception:
49
  restart_space()
50
 
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
 
54
  (
55
  finished_eval_queue_df,
@@ -57,6 +90,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
 
60
  def init_leaderboard(dataframe):
61
  if dataframe is None or dataframe.empty:
62
  raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -64,26 +98,28 @@ def init_leaderboard(dataframe):
64
  value=dataframe,
65
  datatype=[c.type for c in fields(AutoEvalColumn)],
66
  select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
 
70
  ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
 
87
  bool_checkboxgroup_label="Hide models",
88
  interactive=False,
89
  )
@@ -95,21 +131,33 @@ with demo:
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
  with gr.Column():
106
  with gr.Row():
107
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
 
109
  with gr.Column():
110
  with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
  ):
114
  with gr.Row():
115
  finished_eval_table = gr.components.Dataframe(
@@ -119,8 +167,8 @@ with demo:
119
  row_count=5,
120
  )
121
  with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
  ):
125
  with gr.Row():
126
  running_eval_table = gr.components.Dataframe(
@@ -131,8 +179,8 @@ with demo:
131
  )
132
 
133
  with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
  ):
137
  with gr.Row():
138
  pending_eval_table = gr.components.Dataframe(
@@ -142,51 +190,54 @@ with demo:
142
  row_count=5,
143
  )
144
  with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
 
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
+ from src.utils import norm_sNavie, pivot_df
31
+ import ipdb
32
 
33
 
34
  def restart_space():
35
  API.restart_space(repo_id=REPO_ID)
36
 
37
+
38
  ### Space initialisation
39
  try:
40
  print(EVAL_REQUESTS_PATH)
41
  snapshot_download(
42
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
43
+ token=TOKEN
44
  )
45
  except Exception:
46
  restart_space()
47
  try:
48
  print(EVAL_RESULTS_PATH)
49
  snapshot_download(
50
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
51
+ token=TOKEN
52
  )
53
  except Exception:
54
  restart_space()
55
 
56
+ # # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
57
+ # df = pd.read_csv('LOTSAv2_EvalBenchmark(Long).csv')
58
+ # # Step 2: Pivot the DataFrame
59
+ # LEADERBOARD_DF = df.pivot_table(index='model',
60
+ # columns='dataset',
61
+ # values='eval_metrics/MAE[0.5]',
62
+ # aggfunc='first')
63
+ # LEADERBOARD_DF.drop(columns=['ALL'], inplace=True)
64
+ #
65
+ # # Reset the index if you want the model column to be part of the DataFrame
66
+ # LEADERBOARD_DF.reset_index(inplace=True)
67
+ # # Step 3: noramlize the values
68
+ # # ipdb.set_trace()
69
+ # LEADERBOARD_DF = norm_sNavie(LEADERBOARD_DF)
70
+ #
71
+ # # LEADERBOARD_DF['Average'] = LEADERBOARD_DF.mean(axis=1)
72
+ # # LEADERBOARD_DF.insert(1, 'Average', LEADERBOARD_DF.pop('Average'))
73
+ # # LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by=['Average'], ascending=True)
74
+ # print(f"The leaderboard is {LEADERBOARD_DF}")
75
+ # print(f'Columns: ', LEADERBOARD_DF.columns)
76
+
77
+ # LEADERBOARD_DF = pd.read_csv('pivoted_df.csv')
78
+ domain_df = pivot_df('results/grouped_results_by_domain.csv', tab_name='domain')
79
+ print(f'Domain dataframe is {domain_df}')
80
+ freq_df = pivot_df('results/grouped_results_by_frequency.csv', tab_name='frequency')
81
+ print(f'Freq dataframe is {freq_df}')
82
+ term_length_df = pivot_df('results/grouped_results_by_term_length.csv', tab_name='term_length')
83
+ print(f'Term length dataframe is {term_length_df}')
84
+ variate_type_df = pivot_df('results/grouped_results_by_univariate.csv', tab_name='univariate')
85
+ print(f'Variate type dataframe is {variate_type_df}')
86
 
87
  (
88
  finished_eval_queue_df,
 
90
  pending_eval_queue_df,
91
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
92
 
93
+
94
  def init_leaderboard(dataframe):
95
  if dataframe is None or dataframe.empty:
96
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
98
  value=dataframe,
99
  datatype=[c.type for c in fields(AutoEvalColumn)],
100
  select_columns=SelectColumns(
101
+ # default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.name not in ['params', 'available_on_hub', 'hub', 'Model sha','Hub License']],
102
+ default_selection=list(dataframe.columns),
103
+ cant_deselect=['model'],
104
+ label="Select Datasets to Display:",
105
  ),
106
+
107
+ search_columns=['model'],
108
+ # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
109
+ # filter_columns=[
110
+ # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
111
+ # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
112
+ # ColumnFilter(
113
+ # AutoEvalColumn.params.name,
114
+ # type="slider",
115
+ # min=0.01,
116
+ # max=500,
117
+ # label="Select the number of parameters (B)",
118
+ # ),
119
+ # ColumnFilter(
120
+ # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
121
+ # ),
122
+ # ],
123
  bool_checkboxgroup_label="Hide models",
124
  interactive=False,
125
  )
 
131
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
132
 
133
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
134
+ with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
135
+ leaderboard = init_leaderboard(domain_df)
136
+ print(f"FINAL Domain LEADERBOARD 1 {domain_df}")
137
+
138
+ with gr.TabItem("🏅 By Frequency", elem_id="llm-benchmark-tab-table", id=1):
139
+ leaderboard = init_leaderboard(freq_df)
140
+ print(f"FINAL Frequency LEADERBOARD 1 {freq_df}")
141
+
142
+ with gr.TabItem("🏅 By Term Length", elem_id="llm-benchmark-tab-table", id=2):
143
+ leaderboard = init_leaderboard(term_length_df)
144
+ print(f"FINAL term length LEADERBOARD 1 {term_length_df}")
145
+
146
+ with gr.TabItem("🏅 By Variate Type", elem_id="llm-benchmark-tab-table", id=3):
147
+ leaderboard = init_leaderboard(variate_type_df)
148
+ print(f"FINAL LEADERBOARD 1 {variate_type_df}")
149
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
150
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
151
 
152
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
153
  with gr.Column():
154
  with gr.Row():
155
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
156
 
157
  with gr.Column():
158
  with gr.Accordion(
159
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
160
+ open=False,
161
  ):
162
  with gr.Row():
163
  finished_eval_table = gr.components.Dataframe(
 
167
  row_count=5,
168
  )
169
  with gr.Accordion(
170
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
171
+ open=False,
172
  ):
173
  with gr.Row():
174
  running_eval_table = gr.components.Dataframe(
 
179
  )
180
 
181
  with gr.Accordion(
182
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
183
+ open=False,
184
  ):
185
  with gr.Row():
186
  pending_eval_table = gr.components.Dataframe(
 
190
  row_count=5,
191
  )
192
  with gr.Row():
193
+ gr.Markdown("# ✉️✨ Submit your model outputs!", elem_classes="markdown-text")
194
+ gr.Markdown(
195
+ "Generate outputs for all models using the ContextualBench code and email them to us at xnguyen@salesforce.com.",
196
+ elem_classes="markdown-text")
197
+
198
+ # with gr.Row():
199
+ # with gr.Column():
200
+ # model_name_textbox = gr.Textbox(label="Model name")
201
+ # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
202
+ # model_type = gr.Dropdown(
203
+ # choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
204
+ # label="Model type",
205
+ # multiselect=False,
206
+ # value=None,
207
+ # interactive=True,
208
+ # )
209
+
210
+ # with gr.Column():
211
+ # precision = gr.Dropdown(
212
+ # choices=[i.value.name for i in Precision if i != Precision.Unknown],
213
+ # label="Precision",
214
+ # multiselect=False,
215
+ # value="float16",
216
+ # interactive=True,
217
+ # )
218
+ # weight_type = gr.Dropdown(
219
+ # choices=[i.value.name for i in WeightType],
220
+ # label="Weights type",
221
+ # multiselect=False,
222
+ # value="Original",
223
+ # interactive=True,
224
+ # )
225
+ # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
226
+
227
+ # submit_button = gr.Button("Submit Eval")
228
+ # submission_result = gr.Markdown()
229
+ # submit_button.click(
230
+ # add_new_eval,
231
+ # [
232
+ # model_name_textbox,
233
+ # base_model_name_textbox,
234
+ # revision_name_textbox,
235
+ # precision,
236
+ # weight_type,
237
+ # model_type,
238
+ # ],
239
+ # submission_result,
240
+ # )
241
 
242
  with gr.Row():
243
  with gr.Accordion("📙 Citation", open=False):
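For reviewers who want to sanity-check the new data flow without launching the Space, here is a minimal sketch (not part of the commit; it assumes you run it from the repository root so that `src/` and `results/` resolve) that loads the four grouped-results CSVs through `pivot_df` exactly as `app.py` does:

```python
# Hypothetical smoke test for the pivoted leaderboard tables (not in the commit).
from src.utils import pivot_df

files = {
    "domain": "results/grouped_results_by_domain.csv",
    "frequency": "results/grouped_results_by_frequency.csv",
    "term_length": "results/grouped_results_by_term_length.csv",
    "univariate": "results/grouped_results_by_univariate.csv",
}

for tab_name, path in files.items():
    df = pivot_df(path, tab_name=tab_name)
    # Each pivoted table keeps a 'model' column plus one column per
    # (category, metric) pair, e.g. 'Energy (MAPE)' and 'Energy (CRPS)'.
    assert "model" in df.columns and not df.empty
    print(tab_name, df.shape)
```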
results/grouped_results_by_domain.csv ADDED
@@ -0,0 +1,148 @@
1
+ domain,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
2
+ Econ/Fin,auto_arima,0.9365192023498371,0.82547313686996,7.166666666666667
3
+ Econ/Fin,auto_ets,1.069790707228502,0.9914310347244895,9.0
4
+ Econ/Fin,auto_theta,1.0054900702449128,0.846318218177465,7.833333333333333
5
+ Econ/Fin,chronos-small,0.8016337645320165,0.7776047759026133,7.5
6
+ Econ/Fin,chronos_base,0.8020694855013312,0.7682503560215578,6.333333333333333
7
+ Econ/Fin,chronos_large,0.7991640460606071,0.7743218919662125,7.0
8
+ Econ/Fin,crossformer,34.27971965234223,126.76758253255495,21.0
9
+ Econ/Fin,d_linear,1.1148346887593732,1.1323705430596809,17.5
10
+ Econ/Fin,deepar,1.2278569268049724,1.4148933569849873,14.333333333333334
11
+ Econ/Fin,i_transformer,1.0214445148835511,0.8548360574080038,8.333333333333334
12
+ Econ/Fin,moirai_1.1_R_base_no_leak,1.2693975078273285,1.035652986793995,15.5
13
+ Econ/Fin,moirai_1.1_R_large_no_leak,0.879988898990186,0.7355518100899686,3.1666666666666665
14
+ Econ/Fin,moirai_1.1_R_small_no_leak,1.035635272136103,0.8530614545680443,10.833333333333334
15
+ Econ/Fin,n_beats,0.8629634668807649,0.9855058224964246,12.666666666666666
16
+ Econ/Fin,naive,1.222637910798122,1.3614416351328862,15.166666666666666
17
+ Econ/Fin,patch_tst,0.9463918645805333,0.8154506342478772,6.666666666666667
18
+ Econ/Fin,seasonal_naive,1.0,1.0,14.833333333333334
19
+ Econ/Fin,tft,1.1360880871257253,0.8476154067835847,7.833333333333333
20
+ Econ/Fin,tide,1.1466847399238815,1.0894612064116727,17.0
21
+ Econ/Fin,timesfm,0.840899493300519,0.7288672456979325,4.833333333333333
22
+ Econ/Fin,visionts,0.9927325904977805,1.056804867374159,16.5
23
+ Energy,auto_arima,1.020271660057138,0.8634432097816029,12.28125
24
+ Energy,auto_ets,1.3407403717825905,4.309210526315789e+23,15.8125
25
+ Energy,auto_theta,1.3500329947825767,2.288546713618502,16.90625
26
+ Energy,chronos-small,0.9106949318701584,0.684989122019499,8.59375
27
+ Energy,chronos_base,0.908599317521494,0.6608616701619849,7.375
28
+ Energy,chronos_large,0.9122256820323856,0.6592412529602467,7.25
29
+ Energy,crossformer,9236423597.08186,11.594762627902593,14.25
30
+ Energy,d_linear,1.1889257931784007,0.9161631466345352,14.71875
31
+ Energy,deepar,1.678068514068864,1.3746230213328996,14.6875
32
+ Energy,i_transformer,1.2394136898117367,0.8256530209999239,7.0625
33
+ Energy,moirai_1.1_R_base_no_leak,0.9874963954081143,0.6351616784559907,5.6875
34
+ Energy,moirai_1.1_R_large_no_leak,0.8909713530536371,0.6258812753967218,5.375
35
+ Energy,moirai_1.1_R_small_no_leak,0.8991945372018496,0.6974970102561562,7.0
36
+ Energy,n_beats,1.1689169414175054,0.9804388197933043,15.09375
37
+ Energy,naive,1.2329496956881505,1.717544651501066,18.21875
38
+ Energy,patch_tst,1.0002148326675349,0.6519449023593654,5.75
39
+ Energy,seasonal_naive,1.0,1.0,15.84375
40
+ Energy,tft,1.094935043145979,0.6698073412229205,7.28125
41
+ Energy,tide,1.2969242335580595,0.8815970753846475,10.46875
42
+ Energy,timesfm,0.990249590684815,0.7036315984358718,8.25
43
+ Energy,visionts,1.1457219960976157,0.8253258612638164,13.09375
44
+ Healthcare,auto_arima,0.7829904478130371,0.6313761904761905,7.8
45
+ Healthcare,auto_ets,0.8061744759695987,0.6378749206349206,7.2
46
+ Healthcare,auto_theta,0.9667122096533645,0.8276838095238095,12.0
47
+ Healthcare,chronos-small,0.6676511382266025,0.5753475132275132,5.8
48
+ Healthcare,chronos_base,0.7248913460690712,0.5941960317460317,5.0
49
+ Healthcare,chronos_large,0.6616148175532721,0.5314045502645502,4.2
50
+ Healthcare,crossformer,454.59633819128214,70.08870634920635,16.0
51
+ Healthcare,d_linear,0.8850081956016165,0.9878685185185185,16.0
52
+ Healthcare,deepar,0.8566987649911045,0.8641066137566138,10.4
53
+ Healthcare,i_transformer,0.9023075876752837,0.7645924867724868,9.4
54
+ Healthcare,moirai_1.1_R_base_no_leak,1.1475007332958291,1.073448677248677,15.8
55
+ Healthcare,moirai_1.1_R_large_no_leak,0.8166086394810568,0.68611,8.0
56
+ Healthcare,moirai_1.1_R_small_no_leak,1.0612957366806712,0.925962328042328,14.8
57
+ Healthcare,n_beats,0.7698977996924792,0.8690820634920635,14.0
58
+ Healthcare,naive,1.2056710696279012,1.2890160846560845,17.8
59
+ Healthcare,patch_tst,0.7946209932133224,0.6791721164021164,8.6
60
+ Healthcare,seasonal_naive,1.0,1.0,15.0
61
+ Healthcare,tft,0.7904105809823141,0.7010021693121693,8.2
62
+ Healthcare,tide,0.8426731528233347,1.1561685185185187,14.4
63
+ Healthcare,timesfm,0.791330416522951,0.7994040740740741,7.8
64
+ Healthcare,visionts,0.8034595649263452,0.7956369841269841,12.8
65
+ Nature,auto_arima,0.9361489953148219,0.7095217542336816,14.4
66
+ Nature,auto_ets,1.206307582700855,76254692.36191763,16.933333333333334
67
+ Nature,auto_theta,5.150553292857,1.0319763336420513,16.4
68
+ Nature,chronos-small,0.9491555540245159,0.4571809141301187,9.666666666666666
69
+ Nature,chronos_base,0.8087692327609204,0.43483874046585586,8.733333333333333
70
+ Nature,chronos_large,0.7215301114853574,0.43339550624240464,8.2
71
+ Nature,crossformer,3.6857728593414816,1.8222011094109303,12.8
72
+ Nature,d_linear,1.6637383989524568,0.566411290270835,14.466666666666667
73
+ Nature,deepar,1.3368617172298543,0.784343921808244,11.866666666666667
74
+ Nature,i_transformer,1.0245163140037494,0.3923948352004396,6.533333333333333
75
+ Nature,moirai_1.1_R_base_no_leak,1.0846943426539009,0.42165427104639003,4.466666666666667
76
+ Nature,moirai_1.1_R_large_no_leak,0.9012043168826274,0.37755828000010155,4.133333333333334
77
+ Nature,moirai_1.1_R_small_no_leak,0.8636937125921123,0.400148437852242,4.4
78
+ Nature,n_beats,2.051183579793879,0.5865729148518006,14.533333333333333
79
+ Nature,naive,1.0153007149015423,1.5216687585771838,19.266666666666666
80
+ Nature,patch_tst,0.9757662316880771,0.40362241062795473,7.266666666666667
81
+ Nature,seasonal_naive,1.0,1.0,18.6
82
+ Nature,tft,1.3479799792947338,0.4024676715316202,7.466666666666667
83
+ Nature,tide,1.6518355265449745,0.648933595846154,13.466666666666667
84
+ Nature,timesfm,1.0759825145269837,0.38186899336385965,5.733333333333333
85
+ Nature,visionts,1.0368902840216354,0.4874755604862317,11.666666666666666
86
+ Sales,auto_arima,0.7716630938105196,0.4828581089269842,14.25
87
+ Sales,auto_ets,0.9017684593360312,30.76895095733506,17.25
88
+ Sales,auto_theta,0.8258637946630958,0.5029359984438486,14.5
89
+ Sales,chronos-small,0.7186805493171662,0.3848441218930615,9.0
90
+ Sales,chronos_base,0.7008513669210537,0.3850621220616795,7.25
91
+ Sales,chronos_large,0.7034956230009589,0.3844707529650858,7.25
92
+ Sales,crossformer,1.4946326987237475,7.655215975652274,20.75
93
+ Sales,d_linear,0.7999361690904114,0.5046638267307796,14.25
94
+ Sales,deepar,0.7388013496334613,0.3684882238444817,6.25
95
+ Sales,i_transformer,0.7592707077676131,0.37054645387589946,4.75
96
+ Sales,moirai_1.1_R_base_no_leak,0.667796706791987,0.5158002274792624,9.0
97
+ Sales,moirai_1.1_R_large_no_leak,0.6706792874796048,0.4063993273240754,5.25
98
+ Sales,moirai_1.1_R_small_no_leak,0.6717145779320488,0.4624986481003004,8.5
99
+ Sales,n_beats,0.7261206955984014,0.42619466975098175,11.0
100
+ Sales,naive,0.9988290398126464,0.9354131622562287,19.0
101
+ Sales,patch_tst,0.7506252415562384,0.36695813811595074,3.25
102
+ Sales,seasonal_naive,1.0,1.0,19.25
103
+ Sales,tft,0.7571398644569189,0.3639182778535524,8.0
104
+ Sales,tide,1.0042130411120884,0.5031160265435741,14.0
105
+ Sales,timesfm,0.6834660865486862,0.36525039257779146,2.75
106
+ Sales,visionts,0.8110052069079339,0.5234893692225551,15.5
107
+ Transport,auto_arima,1.067553229756302,0.7895352174994626,15.866666666666667
108
+ Transport,auto_ets,1.2519332616788197,62214211389283.484,18.333333333333332
109
+ Transport,auto_theta,1.080401746635928,1.484666133944374,18.733333333333334
110
+ Transport,chronos-small,0.8463413166527496,0.6018945114961274,10.066666666666666
111
+ Transport,chronos_base,0.8525884818870904,0.5855383296935212,8.066666666666666
112
+ Transport,chronos_large,0.847275145385676,0.5853558157193545,8.4
113
+ Transport,crossformer,2.133541126273085,2.824391583266013,10.466666666666667
114
+ Transport,d_linear,0.9088963832125505,0.7037765247623837,14.733333333333333
115
+ Transport,deepar,0.8113936242603784,0.5544948055430984,6.466666666666667
116
+ Transport,i_transformer,0.827077979022359,0.4999616864926626,5.866666666666666
117
+ Transport,moirai_1.1_R_base_no_leak,0.8561472773934119,0.47760992257555535,6.133333333333334
118
+ Transport,moirai_1.1_R_large_no_leak,0.9275713341627421,0.5021373535569643,6.666666666666667
119
+ Transport,moirai_1.1_R_small_no_leak,0.910054545689888,0.5002405082060885,8.333333333333334
120
+ Transport,n_beats,0.759611430343423,0.6406924635381934,12.6
121
+ Transport,naive,1.4793208069977917,2.2909473535610148,20.133333333333333
122
+ Transport,patch_tst,0.8021327551126702,0.5059201467965427,5.8
123
+ Transport,seasonal_naive,1.0,1.0,17.333333333333332
124
+ Transport,tft,0.8220948248404197,0.485294449011853,4.8
125
+ Transport,tide,0.8779217036886292,0.5698046392964627,10.4
126
+ Transport,timesfm,0.9283663454018408,0.577738758232893,8.133333333333333
127
+ Transport,visionts,0.8701382141384387,0.6655022406963716,13.666666666666666
128
+ Web/CloudOps,auto_arima,0.8940721359171526,0.9239632177767032,14.5
129
+ Web/CloudOps,auto_ets,1.1484024357848706,3541668.1195238987,17.05
130
+ Web/CloudOps,auto_theta,0.8325449233161077,0.7377937575734188,12.1
131
+ Web/CloudOps,chronos-small,1.1477382857881004,0.7519221963095372,10.4
132
+ Web/CloudOps,chronos_base,1.2983512147050473,0.8105248727247287,11.35
133
+ Web/CloudOps,chronos_large,1.3303495508509569,0.79130505302003,11.65
134
+ Web/CloudOps,crossformer,3.905809488486181,0.7282280986973914,11.35
135
+ Web/CloudOps,d_linear,1.679664570939319,0.813864694620387,12.55
136
+ Web/CloudOps,deepar,0.8567686630861442,0.7806071927900515,11.7
137
+ Web/CloudOps,i_transformer,0.7194432840929166,0.5224562708709003,4.5
138
+ Web/CloudOps,moirai_1.1_R_base_no_leak,1.0566434817767107,0.7682523197700815,9.25
139
+ Web/CloudOps,moirai_1.1_R_large_no_leak,0.7913068650225961,0.7415333306227597,8.35
140
+ Web/CloudOps,moirai_1.1_R_small_no_leak,0.797099135532333,0.7437898694659932,8.5
141
+ Web/CloudOps,n_beats,0.6423921434834379,0.6616483361015169,10.3
142
+ Web/CloudOps,naive,1.1134728329755728,1.1880618871151416,16.7
143
+ Web/CloudOps,patch_tst,0.6023812811006274,0.517794941208908,3.95
144
+ Web/CloudOps,seasonal_naive,1.0,1.0,16.35
145
+ Web/CloudOps,tft,1.3456759309631106,0.6485214709355084,5.95
146
+ Web/CloudOps,tide,0.957645003291147,0.6729746748245962,9.95
147
+ Web/CloudOps,timesfm,2.3672130873427584,0.9761625637942284,13.9
148
+ Web/CloudOps,visionts,0.8379189396040971,0.7244329358471615,10.65
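As a reference for anyone consuming this file directly, a short sketch (hypothetical, assuming pandas and the repo-relative path) that lists the best-ranked model per domain; lower `rank` is better, and `seasonal_naive` is the 1.0 baseline for the normalized metric columns:

```python
import pandas as pd

# Columns: domain, model, eval_metrics/MAPE[0.5],
#          eval_metrics/mean_weighted_sum_quantile_loss (CRPS), rank.
df = pd.read_csv("results/grouped_results_by_domain.csv")

# Pick the best-ranked model within each domain.
best = df.loc[df.groupby("domain")["rank"].idxmin(), ["domain", "model", "rank"]]
print(best.to_string(index=False))
```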
results/grouped_results_by_frequency.csv ADDED
@@ -0,0 +1,211 @@
1
+ frequency,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
2
+ 10S,auto_arima,1.0,1.0,8.5
3
+ 10S,auto_ets,1.7729614789749542,2.9904066969242904,19.5
4
+ 10S,auto_theta,0.6490202004866038,0.4498416650221458,1.0
5
+ 10S,chronos-small,2.4209056102186177,1.0976423357911036,10.833333333333334
6
+ 10S,chronos_base,2.9496999072884034,1.2710971183275626,12.333333333333334
7
+ 10S,chronos_large,3.0233397960335355,1.1850701939432435,11.666666666666666
8
+ 10S,crossformer,10.78297867187441,0.9632338077490566,11.5
9
+ 10S,d_linear,3.464549733595483,1.1289544947882015,11.666666666666666
10
+ 10S,deepar,1.8262604211066729,1.028182807201692,11.166666666666666
11
+ 10S,i_transformer,1.3136663904154584,0.6908224736822052,2.5
12
+ 10S,moirai_1.1_R_base_no_leak,2.3417832113521384,1.4018041619123724,16.5
13
+ 10S,moirai_1.1_R_large_no_leak,1.5520209642771396,1.2388948175006333,14.166666666666666
14
+ 10S,moirai_1.1_R_small_no_leak,1.6476110189439555,1.3097769500283245,14.833333333333334
15
+ 10S,n_beats,0.9967269159720157,0.821329634043785,6.666666666666667
16
+ 10S,naive,1.9717621185326415,1.537476210057875,16.333333333333332
17
+ 10S,patch_tst,1.059144382825294,0.732123599927497,5.0
18
+ 10S,seasonal_naive,1.0,1.0,9.5
19
+ 10S,tft,3.2429931822722793,1.0464406489926144,7.666666666666667
20
+ 10S,tide,1.6709303370483655,0.9849620277425579,10.166666666666666
21
+ 10S,timesfm,6.2343560526627115,1.8234677065697433,19.333333333333332
22
+ 10S,visionts,1.0968489534913595,0.9341805708581102,10.166666666666666
23
+ 10T,auto_arima,1.0,1.0,15.5
24
+ 10T,auto_ets,1.655224521738566,2.151457928700739,16.666666666666668
25
+ 10T,auto_theta,2.629806275955802,3.6738488709022494,19.666666666666668
26
+ 10T,chronos-small,1.5117776833428394,0.6428724843624029,12.166666666666666
27
+ 10T,chronos_base,1.2748386240996392,0.5553606639434913,8.833333333333334
28
+ 10T,chronos_large,1.0346877837456316,0.5513783261338567,8.0
29
+ 10T,crossformer,19934536143.591114,2.1015695650743917,11.5
30
+ 10T,d_linear,1.299502689362975,0.6517937533428791,11.833333333333334
31
+ 10T,deepar,0.6449733779553711,0.5921106678712438,10.666666666666666
32
+ 10T,i_transformer,0.7796030121280376,0.590764568044439,6.833333333333333
33
+ 10T,moirai_1.1_R_base_no_leak,0.9116352990069784,0.4334215808615318,5.0
34
+ 10T,moirai_1.1_R_large_no_leak,1.0458495459083907,0.4946922044906253,7.5
35
+ 10T,moirai_1.1_R_small_no_leak,0.5833924359839285,0.5920693990394471,8.5
36
+ 10T,n_beats,1.2619963794129159,0.7808649076454232,13.166666666666666
37
+ 10T,naive,0.7585251839995307,2.2345586312657204,19.333333333333332
38
+ 10T,patch_tst,0.8811585844239113,0.5583960366038973,7.333333333333333
39
+ 10T,seasonal_naive,1.0,1.0,16.5
40
+ 10T,tft,1.2009513786157717,0.4124334801906368,4.333333333333333
41
+ 10T,tide,1.1631853970897563,0.655064716092509,11.833333333333334
42
+ 10T,timesfm,1.0220126106014014,0.5696636404743637,8.333333333333334
43
+ 10T,visionts,1.0799543876471867,0.5014732753557328,7.5
44
+ 15T,auto_arima,1.1006594936981116,0.9576282754708779,13.916666666666666
45
+ 15T,auto_ets,1.2708218834942537,77767764236603.9,18.166666666666668
46
+ 15T,auto_theta,0.9891316566321592,1.7607790421016025,17.0
47
+ 15T,chronos-small,0.9607408272903095,0.7932548659240278,9.75
48
+ 15T,chronos_base,0.9515313401712095,0.7673882941899328,8.5
49
+ 15T,chronos_large,0.9444257867877283,0.7636894556076325,8.166666666666666
50
+ 15T,crossformer,6.318458905448402,3.106359900469947,13.25
51
+ 15T,d_linear,1.0317429035906032,0.9374198238688525,14.083333333333334
52
+ 15T,deepar,1.9585178196595348,1.6805423348003437,14.5
53
+ 15T,i_transformer,0.8998429174870893,0.658390212420039,3.5
54
+ 15T,moirai_1.1_R_base_no_leak,1.010717233317842,0.7067981834067827,5.583333333333333
55
+ 15T,moirai_1.1_R_large_no_leak,0.939877854260836,0.675873678929174,4.416666666666667
56
+ 15T,moirai_1.1_R_small_no_leak,1.0967135488745525,0.7930862976995009,8.666666666666666
57
+ 15T,n_beats,0.9635718857733048,0.9892577700794088,15.0
58
+ 15T,naive,1.4410754166321356,2.423171378732352,19.75
59
+ 15T,patch_tst,0.874459260826287,0.663243746282057,3.8333333333333335
60
+ 15T,seasonal_naive,1.0,1.0,15.166666666666666
61
+ 15T,tft,1.0582677267954164,0.7318724576230791,6.583333333333333
62
+ 15T,tide,0.9787252364392137,0.8073681981088243,10.083333333333334
63
+ 15T,timesfm,1.0222202532066558,0.7930028114468964,8.25
64
+ 15T,visionts,0.9819059200401176,0.8783919060935471,12.833333333333334
65
+ 5T,auto_arima,1.0,1.0,16.25
66
+ 5T,auto_ets,1.0086175993752609,1.0467883914512524,15.5
67
+ 5T,auto_theta,0.964060428373576,1.0460351344918941,16.666666666666668
68
+ 5T,chronos-small,0.7802675296137581,0.7516150339138812,11.166666666666666
69
+ 5T,chronos_base,0.7630965336585342,0.7494651012783219,11.25
70
+ 5T,chronos_large,0.7718811079792048,0.7552409156866928,11.916666666666666
71
+ 5T,crossformer,1.2734965922255723,1.040860998323226,11.416666666666666
72
+ 5T,d_linear,0.9697247766365901,0.8333404387801792,14.5
73
+ 5T,deepar,0.6722225413838093,0.7971245067076417,13.25
74
+ 5T,i_transformer,0.6604920249872609,0.5433487060646278,5.5
75
+ 5T,moirai_1.1_R_base_no_leak,0.6004862717957414,0.5407632644099643,4.75
76
+ 5T,moirai_1.1_R_large_no_leak,0.5519935614821848,0.5321255655856548,3.25
77
+ 5T,moirai_1.1_R_small_no_leak,0.5295698754870711,0.5360775348348382,4.416666666666667
78
+ 5T,n_beats,0.6343946014070763,0.7257573125150426,12.416666666666666
79
+ 5T,naive,0.8443455210066756,1.40000213208879,17.166666666666668
80
+ 5T,patch_tst,0.5873226620946504,0.5432646554948832,4.75
81
+ 5T,seasonal_naive,1.0,1.0,17.75
82
+ 5T,tft,0.6434051298457072,0.5565798833891061,4.916666666666667
83
+ 5T,tide,0.7950041895659674,0.64766319142376,10.083333333333334
84
+ 5T,timesfm,0.8202436454444181,0.730338811538158,11.916666666666666
85
+ 5T,visionts,0.8757565714382182,0.7336813267313066,12.166666666666666
86
+ A,auto_arima,1.0171428571428571,0.9420289855072463,10.0
87
+ A,auto_ets,0.9371428571428573,0.8043478260869564,3.0
88
+ A,auto_theta,0.9371428571428573,0.8333333333333333,6.0
89
+ A,chronos-small,1.0,1.0072463768115942,17.0
90
+ A,chronos_base,0.9771428571428573,0.9782608695652174,13.0
91
+ A,chronos_large,0.9771428571428573,0.9782608695652174,14.0
92
+ A,crossformer,6.857142857142858,102.89855072463767,21.0
93
+ A,d_linear,1.062857142857143,1.2173913043478262,20.0
94
+ A,deepar,1.0171428571428571,0.8188405797101449,4.0
95
+ A,i_transformer,1.0342857142857143,0.8478260869565217,7.0
96
+ A,moirai_1.1_R_base_no_leak,1.2057142857142857,0.9420289855072463,11.0
97
+ A,moirai_1.1_R_large_no_leak,0.9542857142857144,0.7753623188405796,1.0
98
+ A,moirai_1.1_R_small_no_leak,0.9771428571428573,0.8260869565217391,5.0
99
+ A,n_beats,0.9028571428571429,0.9710144927536232,12.0
100
+ A,naive,1.0,0.9927536231884058,15.0
101
+ A,patch_tst,1.0057142857142858,0.8478260869565217,8.0
102
+ A,seasonal_naive,1.0,1.0,16.0
103
+ A,tft,0.9257142857142858,0.7971014492753623,2.0
104
+ A,tide,1.2057142857142857,1.1231884057971013,18.0
105
+ A,timesfm,0.9714285714285715,0.8478260869565217,9.0
106
+ A,visionts,1.0914285714285714,1.1521739130434783,19.0
107
+ D,auto_arima,0.8529021613038067,0.4985004151026563,11.0
108
+ D,auto_ets,0.9328318534943983,9.346363276951774,14.333333333333334
109
+ D,auto_theta,0.9291332918118479,0.5748243358418598,14.333333333333334
110
+ D,chronos-small,0.7368243034917711,0.43658660925543036,8.066666666666666
111
+ D,chronos_base,0.6862749899568191,0.421268869505568,6.733333333333333
112
+ D,chronos_large,0.6959915152342351,0.4206978284635147,6.666666666666667
113
+ D,crossformer,154.59928198095446,26.923731402565622,18.4
114
+ D,d_linear,0.8979716998083668,0.6146338678499457,15.133333333333333
115
+ D,deepar,0.8010158939552874,0.5938211197218363,11.066666666666666
116
+ D,i_transformer,0.8178290414664529,0.4945942845202976,8.466666666666667
117
+ D,moirai_1.1_R_base_no_leak,0.7453788326511405,0.4633929439816138,7.466666666666667
118
+ D,moirai_1.1_R_large_no_leak,0.6628066770710596,0.39170739564229085,4.333333333333333
119
+ D,moirai_1.1_R_small_no_leak,0.7086474258839043,0.41290002863504544,6.2
120
+ D,n_beats,0.7960758852718125,0.571280865579523,14.4
121
+ D,naive,1.0,0.7975936297433697,17.6
122
+ D,patch_tst,0.7522257716055208,0.43688965053339623,7.2
123
+ D,seasonal_naive,1.0,1.0,19.0
124
+ D,tft,0.7622852957686599,0.4136783788887685,6.666666666666667
125
+ D,tide,0.9969048876212023,0.7484054326947167,13.533333333333333
126
+ D,timesfm,0.779898124642687,0.49167132325793367,5.6
127
+ D,visionts,0.8948792108968546,0.5558947866812461,14.8
128
+ H,auto_arima,0.9539421964948747,0.7767300171583383,15.483870967741936
129
+ H,auto_ets,1.328807641048494,4.448217317487267e+23,18.774193548387096
130
+ H,auto_theta,3.078013295555958,1.851945783529741,18.870967741935484
131
+ H,chronos-small,0.7327688330840401,0.5144358964770991,8.580645161290322
132
+ H,chronos_base,0.7385494318069303,0.505942185952322,7.806451612903226
133
+ H,chronos_large,0.7478851839242808,0.5091430423436873,8.0
134
+ H,crossformer,5676075428.759688,5.282163959765436,11.0
135
+ H,d_linear,1.3543590165990786,0.6813103317146129,14.129032258064516
136
+ H,deepar,1.1775584990695886,0.8933529848405231,10.870967741935484
137
+ H,i_transformer,0.960319640638691,0.476223391238969,6.129032258064516
138
+ H,moirai_1.1_R_base_no_leak,0.9074224107057469,0.46296554507116067,5.290322580645161
139
+ H,moirai_1.1_R_large_no_leak,0.8513557808071407,0.49904989135530137,6.548387096774194
140
+ H,moirai_1.1_R_small_no_leak,0.8133232904541264,0.4778830172118464,6.32258064516129
141
+ H,n_beats,1.3440056550191246,0.6646047122901354,13.419354838709678
142
+ H,naive,1.3316850451655482,1.8366794323255176,19.64516129032258
143
+ H,patch_tst,0.9278036240195197,0.46244079149616185,4.967741935483871
144
+ H,seasonal_naive,1.0,1.0,17.64516129032258
145
+ H,tft,1.1074721197519513,0.48721235955499614,6.645161290322581
146
+ H,tide,1.2515715821404725,0.5695063335623101,10.580645161290322
147
+ H,timesfm,0.9815849142972823,0.5102566138085366,8.35483870967742
148
+ H,visionts,0.989251601317308,0.6105910361801898,11.935483870967742
149
+ M,auto_arima,0.7897292418553448,0.7664432031389335,6.2
150
+ M,auto_ets,0.8246966155843344,0.7720428958818089,5.4
151
+ M,auto_theta,0.9196256618111756,0.8821604346726062,8.4
152
+ M,chronos-small,0.8622647029230233,0.8311626407190348,8.6
153
+ M,chronos_base,0.8946217478533949,0.8612725657711913,8.6
154
+ M,chronos_large,0.8472484866537497,0.81475133159962,8.2
155
+ M,crossformer,10.544298745396533,67.95761479628871,13.0
156
+ M,d_linear,1.081950231233934,1.1996315851033383,15.8
157
+ M,deepar,1.1770718831957356,1.0978451288049407,11.8
158
+ M,i_transformer,1.0050739577178593,0.8211301131061257,5.6
159
+ M,moirai_1.1_R_base_no_leak,1.894529314848318,1.4962012401441718,19.2
160
+ M,moirai_1.1_R_large_no_leak,0.9965918207836234,0.9172428125972104,9.6
161
+ M,moirai_1.1_R_small_no_leak,1.3843487814465443,1.2213717006565714,17.0
162
+ M,n_beats,0.9414022389216479,1.007258419666447,11.4
163
+ M,naive,1.2968159580843228,1.5810738327920664,19.2
164
+ M,patch_tst,0.9895171231152233,0.8480243667176828,7.6
165
+ M,seasonal_naive,1.0,1.0,13.4
166
+ M,tft,1.0540757144083999,0.8748338687935762,8.2
167
+ M,tide,1.0388749071339713,1.2356707678524768,15.4
168
+ M,timesfm,0.8627984408713761,0.7375099053093932,3.6
169
+ M,visionts,0.9379289156090017,1.0376995780191705,14.8
170
+ Q,auto_arima,0.8591549295774649,0.8225806451612904,5.0
171
+ Q,auto_ets,0.8591549295774649,0.7983870967741936,4.0
172
+ Q,auto_theta,0.8380281690140845,0.7973790322580646,2.0
173
+ Q,chronos-small,0.8239436619718311,0.8457661290322581,11.0
174
+ Q,chronos_base,0.8098591549295776,0.8397177419354839,8.0
175
+ Q,chronos_large,0.8098591549295776,0.8397177419354839,9.0
176
+ Q,crossformer,9.929577464788732,119.95967741935485,21.0
177
+ Q,d_linear,0.9859154929577466,1.1088709677419355,19.0
178
+ Q,deepar,0.9436619718309861,0.840725806451613,10.0
179
+ Q,i_transformer,0.9084507042253522,0.7973790322580646,3.0
180
+ Q,moirai_1.1_R_base_no_leak,1.4295774647887327,1.1290322580645162,20.0
181
+ Q,moirai_1.1_R_large_no_leak,0.8873239436619719,0.7883064516129034,1.0
182
+ Q,moirai_1.1_R_small_no_leak,1.0352112676056338,0.9324596774193549,13.0
183
+ Q,n_beats,0.8380281690140845,0.9717741935483871,15.0
184
+ Q,naive,0.9295774647887325,0.9506048387096774,14.0
185
+ Q,patch_tst,0.9366197183098592,0.8346774193548387,6.0
186
+ Q,seasonal_naive,1.0,1.0,16.0
187
+ Q,tft,0.9366197183098592,0.8366935483870969,7.0
188
+ Q,tide,1.1338028169014085,1.0181451612903227,17.0
189
+ Q,timesfm,0.8802816901408451,0.8528225806451613,12.0
190
+ Q,visionts,0.9366197183098592,1.0483870967741935,18.0
191
+ W,auto_arima,0.9759738266715013,0.748994017923637,9.875
192
+ W,auto_ets,0.971794800090248,0.7889859594373794,10.375
193
+ W,auto_theta,1.0807827233498426,0.8086208938554269,12.125
194
+ W,chronos-small,0.7075714340716913,0.554913070787174,5.25
195
+ W,chronos_base,0.7288517030693448,0.5619860060584291,4.5
196
+ W,chronos_large,0.7069908074836505,0.5515544393043641,4.375
197
+ W,crossformer,7.4085130347732155,49.820115045230985,20.375
198
+ W,d_linear,1.1283846019672135,0.97517397862518,16.875
199
+ W,deepar,1.862654170783469,1.3453366562739022,12.75
200
+ W,i_transformer,1.9015274495154941,1.3308062694365717,12.375
201
+ W,moirai_1.1_R_base_no_leak,0.9495981825008816,0.7483199869474646,9.75
202
+ W,moirai_1.1_R_large_no_leak,0.8957713665163969,0.6412758290160822,5.125
203
+ W,moirai_1.1_R_small_no_leak,0.9814158946324417,0.7581734604096699,8.625
204
+ W,n_beats,1.4457143774506727,1.0793531840148527,15.25
205
+ W,naive,1.0,0.875913187952162,13.625
206
+ W,patch_tst,0.9073153465406218,0.6981022157766977,8.0
207
+ W,seasonal_naive,1.0,1.0,16.5
208
+ W,tft,1.0396666971907644,0.7794595632824765,11.25
209
+ W,tide,1.7669411056805098,1.2225931248216542,13.125
210
+ W,timesfm,0.8615386867667885,0.6305866856856626,4.875
211
+ W,visionts,1.1355217147765972,0.9990150254993866,16.0
results/grouped_results_by_term_length.csv ADDED
@@ -0,0 +1,64 @@
1
+ term_length,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
2
+ long,auto_arima,1.0407648178370423,0.8433829698076462,15.095238095238095
3
+ long,auto_ets,1.2759037139699805,6.566416040544638e+23,18.666666666666668
4
+ long,auto_theta,3.1498850578770834,2.0601505251654686,17.38095238095238
5
+ long,chronos-small,1.0106686030682668,0.6462623576388796,11.857142857142858
6
+ long,chronos_base,0.9982206684476166,0.619712893481198,10.714285714285714
7
+ long,chronos_large,1.002171192668077,0.6208511887480599,11.047619047619047
8
+ long,crossformer,8164023382.865999,0.5072278949697392,8.285714285714286
9
+ long,d_linear,1.562322390892946,0.6730209945455377,13.238095238095237
10
+ long,deepar,1.701880985304897,0.9481973435022004,12.714285714285714
11
+ long,i_transformer,0.9961649824755919,0.45539997968708557,5.190476190476191
12
+ long,moirai_1.1_R_base_no_leak,1.0906983483127686,0.5005193850296795,5.428571428571429
13
+ long,moirai_1.1_R_large_no_leak,1.0073695402863057,0.538721786921065,6.761904761904762
14
+ long,moirai_1.1_R_small_no_leak,0.9192503918318538,0.5613725349527969,7.238095238095238
15
+ long,n_beats,1.1138460516712445,0.6641343878789057,11.952380952380953
16
+ long,naive,1.3505564252932616,2.111806155806968,19.61904761904762
17
+ long,patch_tst,0.9616382830362623,0.437002893484988,4.428571428571429
18
+ long,seasonal_naive,1.0,1.0,16.857142857142858
19
+ long,tft,1.1557571021609558,0.4529431176291461,4.904761904761905
20
+ long,tide,1.2799667076250163,0.5247494742051094,8.761904761904763
21
+ long,timesfm,1.796874605878608,0.6535892285331686,11.285714285714286
22
+ long,visionts,1.0324758074134404,0.5463144403056718,9.571428571428571
23
+ medium,auto_arima,0.9925217451047474,0.862989330588506,14.571428571428571
24
+ medium,auto_ets,1.5910566618211812,8307478.842344518,18.333333333333332
25
+ medium,auto_theta,1.8722187149301004,1.9594402227791685,17.428571428571427
26
+ medium,chronos-small,1.232070637409101,0.7382117459387273,11.476190476190476
27
+ medium,chronos_base,1.3793840327174318,0.777089828453266,11.571428571428571
28
+ medium,chronos_large,1.3264659611435963,0.7511074564157023,10.904761904761905
29
+ medium,crossformer,2510838525.595493,0.6608294014146207,8.571428571428571
30
+ medium,d_linear,1.389713789881514,0.766265949645093,13.857142857142858
31
+ medium,deepar,1.0742968052431434,0.8271972158292309,10.523809523809524
32
+ medium,i_transformer,1.0152562339844198,0.5288800110916518,4.619047619047619
33
+ medium,moirai_1.1_R_base_no_leak,1.1063058012973035,0.6371675078184162,6.380952380952381
34
+ medium,moirai_1.1_R_large_no_leak,0.9602362541309903,0.6248351913119883,6.761904761904762
35
+ medium,moirai_1.1_R_small_no_leak,0.9306757114627328,0.6605293213864295,7.523809523809524
36
+ medium,n_beats,1.1513061909700455,0.759822180558971,12.952380952380953
37
+ medium,naive,1.3041664719836585,2.0849508725128736,19.285714285714285
38
+ medium,patch_tst,0.9034466134216352,0.5192209987815241,4.0
39
+ medium,seasonal_naive,1.0,1.0,16.142857142857142
40
+ medium,tft,1.2614988599396852,0.5286354727281315,4.809523809523809
41
+ medium,tide,1.193587520563658,0.6109994159383925,9.523809523809524
42
+ medium,timesfm,1.524928039707925,0.8138067254337588,10.80952380952381
43
+ medium,visionts,1.0232867065677744,0.6690986273300213,10.952380952380953
44
+ short,auto_arima,0.9183158404157642,0.7782297371914904,11.872727272727273
45
+ short,auto_ets,1.0290159244978938,1287882.1620396667,13.781818181818181
46
+ short,auto_theta,1.1276737692944792,0.9555577026305212,13.527272727272727
47
+ short,chronos-small,0.7809947822060929,0.5973093408946208,7.254545454545455
48
+ short,chronos_base,0.7504208829287357,0.5900291013554769,6.163636363636364
49
+ short,chronos_large,0.7517445876583578,0.5860508463835682,6.2727272727272725
50
+ short,crossformer,1298062866.890411,28.58975534685244,17.581818181818182
51
+ short,d_linear,1.1372631367933204,0.8759004934560309,15.145454545454545
52
+ short,deepar,1.0793364184925522,1.0306040328879058,11.89090909090909
53
+ short,i_transformer,0.9683901902602449,0.7276300854958668,7.490909090909091
54
+ short,moirai_1.1_R_base_no_leak,0.9406066725109657,0.7078567359533983,8.836363636363636
55
+ short,moirai_1.1_R_large_no_leak,0.7726332258609173,0.6016193373717869,5.4
56
+ short,moirai_1.1_R_small_no_leak,0.8487478588017764,0.6661776706416213,8.145454545454545
57
+ short,n_beats,1.0323393751593084,0.8195659204638793,13.872727272727273
58
+ short,naive,1.104608870534287,1.20246170110538,17.21818181818182
59
+ short,patch_tst,0.80382107933716,0.6279448956983675,6.872727272727273
60
+ short,seasonal_naive,1.0,1.0,16.745454545454546
61
+ short,tft,1.0461332316971086,0.6755258572212629,8.218181818181819
62
+ short,tide,1.1230276648969284,0.9169276487302415,13.418181818181818
63
+ short,timesfm,0.9286461705272189,0.6445361651154075,6.636363636363637
64
+ short,visionts,0.9467929371136616,0.8196918268462717,14.654545454545454
results/grouped_results_by_univariate.csv ADDED
@@ -0,0 +1,43 @@
1
+ univariate,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
2
+ False,auto_arima,0.9804811342495535,0.8713104754311666,13.906976744186046
3
+ False,auto_ets,1.3150898838758078,28247779.184341986,17.023255813953487
4
+ False,auto_theta,2.411812537314736,1.1875727874520958,14.976744186046512
5
+ False,chronos-small,1.0763507868085402,0.6645935333486652,9.511627906976743
6
+ False,chronos_base,1.108253824730984,0.6827654193124292,9.44186046511628
7
+ False,chronos_large,1.0950741555125842,0.6729041022551422,9.55813953488372
8
+ False,crossformer,2.654080994925351,0.9189271735725226,12.906976744186046
9
+ False,d_linear,1.6213889770481567,0.7823221048850953,13.488372093023257
10
+ False,deepar,1.5448649630735394,1.177574529017474,14.0
11
+ False,i_transformer,1.1492193189093056,0.6224209228728989,5.186046511627907
12
+ False,moirai_1.1_R_base_no_leak,1.0862615181082176,0.6282749871527136,6.27906976744186
13
+ False,moirai_1.1_R_large_no_leak,0.9324785498208167,0.6271354928559256,6.674418604651163
14
+ False,moirai_1.1_R_small_no_leak,0.8841632883529932,0.6261701600960712,6.325581395348837
15
+ False,n_beats,1.4270416026611565,0.7348091745010178,12.581395348837209
16
+ False,naive,1.2123354003513602,1.3984175456034713,17.697674418604652
17
+ False,patch_tst,0.8748906969616712,0.5267294212715652,5.023255813953488
18
+ False,seasonal_naive,1.0,1.0,16.511627906976745
19
+ False,tft,1.3981834112189448,0.6093381204940588,6.906976744186046
20
+ False,tide,1.43858389278792,0.8200701505564333,11.395348837209303
21
+ False,timesfm,1.714449538660074,0.7498374834291154,10.069767441860465
22
+ False,visionts,1.0423141526857782,0.7009754803331653,11.534883720930232
23
+ True,auto_arima,0.9452907457761817,0.7624093224131667,12.555555555555555
24
+ True,auto_ets,1.1157996061544226,2.553606237989581e+23,14.87037037037037
25
+ True,auto_theta,1.1810795821409918,1.5907675831680725,15.38888888888889
26
+ True,chronos-small,0.8105398747887519,0.6175638516729869,8.88888888888889
27
+ True,chronos_base,0.8064432004095214,0.6004730500877874,7.425925925925926
28
+ True,chronos_large,0.7992434048928462,0.5946120685660026,7.314814814814815
29
+ True,crossformer,5473436252.639601,28.841701459950688,14.185185185185185
30
+ True,d_linear,1.0152315982745468,0.8288833788523304,15.222222222222221
31
+ True,deepar,0.9487785778969822,0.8024222742045092,10.0
32
+ True,i_transformer,0.8534236166087696,0.6282487930572502,7.314814814814815
33
+ True,moirai_1.1_R_base_no_leak,0.9474297935418754,0.6631059403644487,8.592592592592593
34
+ True,moirai_1.1_R_large_no_leak,0.8095920641262695,0.5658691463988443,5.444444444444445
35
+ True,moirai_1.1_R_small_no_leak,0.8798251667426911,0.6550813700792939,9.0
36
+ True,n_beats,0.7960013669407282,0.803378130725278,13.796296296296296
37
+ True,naive,1.1920786390202338,1.7432469758252465,18.574074074074073
38
+ True,patch_tst,0.8473452262927366,0.5920056645616604,6.277777777777778
39
+ True,seasonal_naive,1.0,1.0,16.74074074074074
40
+ True,tft,0.8921817826859224,0.5845469141375511,6.648148148148148
41
+ True,tide,0.9602232031743085,0.7225690907268552,11.703703703703704
42
+ True,timesfm,0.8724456439616861,0.6300331912444614,7.333333333333333
43
+ True,visionts,0.9337984401734895,0.7493477637512667,13.722222222222221
src/about.py CHANGED
@@ -1,6 +1,7 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
 
4
  @dataclass
5
  class Task:
6
  benchmark: str
@@ -12,61 +13,85 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
-
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
20
 
21
 
22
-
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
32
  LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
 
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
 
38
- """
 
 
39
 
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
  ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
 
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
 
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
 
 
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
 CITATION_BUTTON_TEXT = r"""
72
  """
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+
5
  @dataclass
6
  class Task:
7
  benchmark: str
 
13
  # ---------------------------------------------------
14
  class Tasks(Enum):
15
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
+ # task0 = Task("boolq", "acc", "BoolQA")
17
+ task1 = Task("trivia", "EM", "TriviaQA")
18
+ task2 = Task("truthfulqa", "EM", "TruthfulQA")
19
+ task3 = Task("popqa", "acc", "PopQA")
20
+ task4 = Task("hpqa", "EM", "HotpotQA")
21
+ task5 = Task("nq", "EM", "Natural Questions")
22
+ task6 = Task("2wiki", "EM", "2WikiMultiHop")
23
+ task7 = Task("musique", "EM", "MuSiQue")
24
+ # task0 = Task("anli_r1", "acc", "ANLI")
25
+ # task1 = Task("logiqa", "acc_norm", "LogiQA")
26
+
27
+
28
+ NUM_FEWSHOT = 0 # Change with your few shot
29
  # ---------------------------------------------------
30
 
31
 
 
32
  # Your leaderboard name
33
+ TITLE = """<h1 align="center" id="space-title">GIFT-Eval Time Series Forecasting Leaderboard</h1>"""
34
 
35
  # What does your leaderboard evaluate?
36
  INTRODUCTION_TEXT = """
37
+ [Placeholder] We introduce the General TIme Series ForecasTing Model Evaluation, GIFT-Eval,
38
+ a pioneering benchmark aimed at promoting evaluation across diverse datasets.
39
+ GIFT-Eval encompasses 28 datasets over 144,000 time series and 177 million data
40
+ points, spanning seven domains, 10 frequencies, multivariate inputs, and prediction lengths ranging from short to long-term forecasts.
41
  """
42
 
43
  # Which evaluations are you running? how can people reproduce what you have?
44
  LLM_BENCHMARKS_TEXT = f"""
45
+ ## How It Works
46
+ To participate in the ContextualBench leaderboard, follow these steps to evaluate your Large Language Model (LLM) using the ContextualBench framework:
47
 
48
+ Clone the Repository: Start by cloning the ContextualBench GitHub repository to your local machine using the following command:
 
49
 
50
+ ```bash
51
+ git clone https://github.com/SalesforceAIResearch/SFR-RAG
52
+ ```
53
 
54
+ Navigate to the Directory: Move into the cloned repository's directory:
55
+
56
+
57
+ ```bash
58
+ cd ContextualBench
59
  ```
 
60
 
61
+ Install Dependencies: Install all necessary dependencies by executing:
 
62
 
63
+ ```bash
64
+ pip install -r requirements.txt
65
+ ```
66
+
67
+ Prepare Your Model and Dataset: Set up your model and dataset according to the guidelines provided in the repository's documentation.
68
+ Run the Evaluation Script: Execute the evaluation script to generate outputs for your model on the specified dataset:
69
+
70
+
71
+ ```bash
72
+ python run.py [dataset_name]
73
+ ```
74
+
75
+ Collect and Format Outputs: Gather the outputs generated for each dataset and format them according to the leaderboard submission guidelines.
76
+ Submit Your Results: Email the formatted outputs to the author's email address for evaluation. Our team will assess the performance and update the leaderboard accordingly.
77
 
78
+ ## Reproducibility
79
+ Ensuring reproducibility is a key aspect of the ContextualBench leaderboard.
80
+ By following the standardized steps outlined above, participants can consistently reproduce evaluation results. This process not only facilitates fair comparisons across different models but also encourages transparency and reliability in model assessments. Participants are encouraged to adhere strictly to the submission guidelines to ensure their results are accurately reflected on the leaderboard.
81
+ """
82
 
83
+ EVALUATION_QUEUE_TEXT = """
 
84
85
  """
86
 
87
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
88
  CITATION_BUTTON_TEXT = r"""
89
+ @article{
90
+ aksu2024gifteval,
91
+ title={{GIFT}-Eval: A Benchmark for General Time Series Forecasting Model Evaluation},
92
+ author={Taha Aksu and Gerald Woo and Juncheng Liu and Xu Liu and Chenghao Liu and Silvio Savarese and Caiming Xiong and Doyen Sahoo},
93
+ booktitle={NeurIPS Workshop on Time Series in the Age of Large Models},
94
+ year={2024},
95
+ url={https://openreview.net/forum?id=Z2cMOOANFX}
96
+ }
97
  """
src/populate.py CHANGED
@@ -6,16 +6,34 @@ import pandas as pd
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
 
18
  df = df[cols].round(decimals=2)
 
19
 
20
  # filter out if any of the benchmarks have not been produced
21
  df = df[has_no_nan_values(df, benchmark_cols)]
@@ -39,7 +57,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
39
  all_evals.append(data)
40
  elif ".md" not in entry:
41
  # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
  for sub_entry in sub_entries:
44
  file_path = os.path.join(save_path, entry, sub_entry)
45
  with open(file_path) as fp:
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
+ import ipdb
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
+ # raw_data = get_raw_eval_results(results_path, requests_path)
14
+ # print('results_path:', results_path)
15
+ # all_data_json = [v.to_dict() for v in raw_data]
16
+ # print(f"The raw data is {all_data_json}")
17
+ #
18
+ # df = pd.DataFrame.from_records(all_data_json)
19
+ df = pd.read_csv(results_path)
20
+ # df = pd.read_csv('LOTSAv2_EvalBenchmark(Long).csv')
21
+ # Step 2: Pivot the DataFrame
22
+ df = df.pivot_table(index='model',
23
+ columns='dataset',
24
+ values='eval_metrics/MAE[0.5]',
25
+ aggfunc='first')
26
+ df.drop(columns=['ALL'], inplace=True)
27
+ df['Average'] = df.mean(axis=1)
28
+ # Reset the index if you want the model column to be part of the DataFrame
29
+ df.reset_index(inplace=True)
30
+ print(f"DF at stage 1 ********** {df}")
31
+ # ipdb.set_trace()
32
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
33
+ # df = df.sort_values(by=[AutoEvalColumn.__dataclass_fields__['average'].name], ascending=False)
34
+ print(f"DF at stage 2 ********** {df}")
35
  df = df[cols].round(decimals=2)
36
+ print(f"DF at stage 3 ********** {df}")
37
 
38
  # filter out if any of the benchmarks have not been produced
39
  df = df[has_no_nan_values(df, benchmark_cols)]
 
57
  all_evals.append(data)
58
  elif ".md" not in entry:
59
  # this is a folder
60
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
61
  for sub_entry in sub_entries:
62
  file_path = os.path.join(save_path, entry, sub_entry)
63
  with open(file_path) as fp:
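For clarity on the reshape above, here is a toy illustration (not part of the commit) of what the `pivot_table` call in the new `get_leaderboard_df` does: long-format rows of (model, dataset, MAE) become one row per model with one column per dataset, after which an `Average` column is appended.

```python
import pandas as pd

# Toy long-format results using the same column names get_leaderboard_df expects.
long_df = pd.DataFrame({
    "model": ["naive", "naive", "patch_tst", "patch_tst"],
    "dataset": ["etth1", "etth2", "etth1", "etth2"],
    "eval_metrics/MAE[0.5]": [1.0, 1.2, 0.7, 0.8],
})

wide = long_df.pivot_table(index="model",
                           columns="dataset",
                           values="eval_metrics/MAE[0.5]",
                           aggfunc="first")
# (The real results file also contains an 'ALL' pseudo-dataset column,
#  which the commit drops before averaging.)
wide["Average"] = wide.mean(axis=1)
wide = wide.reset_index()
print(wide)  # one row per model: columns model, etth1, etth2, Average
```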
src/utils.py ADDED
@@ -0,0 +1,27 @@
1
+ import pandas as pd
2
+ def norm_sNavie(df):
3
+ df_normalized = df.copy()
4
+ seasonal_naive_row = df[df['model'] == 'seasonal_naive'].iloc[0]
5
+ print('df: ',df)
6
+ for column in df.columns:
7
+ if column != 'model': # We skip normalizing the 'model' column
8
+ df_normalized[column] = df[column] / seasonal_naive_row[column]
9
+ return df_normalized
10
+
11
+ def pivot_df(file_name, tab_name):
12
+ df = pd.read_csv(file_name)
13
+ if tab_name == 'univariate':
14
+ df['univariate'] = df['univariate'].replace({True: 'univariate', False: 'multivariate'})
15
+ df.rename(columns={'univariate': 'variate_type'}, inplace=True)
16
+ tab_name = 'variate_type'
17
+ df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
18
+ df_melted['metric'] = df_melted['metric'].replace({
19
+ 'eval_metrics/MAPE[0.5]': 'MAPE',
20
+ 'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS'
21
+ })
22
+ df_pivot = df_melted.pivot_table(index='model', columns=[tab_name, 'metric'], values='value')
23
+ df_pivot.columns = [f'{tab_name} ({metric})' for tab_name, metric in df_pivot.columns]
24
+ # df_pivot.to_csv('pivoted_df.csv')
25
+ # print(df_pivot)
26
+ df_pivot = df_pivot.reset_index()
27
+ return df_pivot
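To make the normalization helper concrete, a small worked example (hypothetical, assuming `src/` is importable): `norm_sNavie` divides every non-`model` column by the `seasonal_naive` row, so the seasonal-naive baseline becomes 1.0 and every other model is expressed relative to it.

```python
import pandas as pd

from src.utils import norm_sNavie  # assumes the Space's src/ is on the import path

df = pd.DataFrame({
    "model": ["seasonal_naive", "patch_tst"],
    "etth1": [2.0, 1.0],
    "etth2": [4.0, 3.0],
})

normalized = norm_sNavie(df)
# etth1: 2.0/2.0 = 1.0 and 1.0/2.0 = 0.5
# etth2: 4.0/4.0 = 1.0 and 3.0/4.0 = 0.75
print(normalized)
```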