juncliu committed
Commit
409ae36
·
1 Parent(s): 1490aed

initial demo

app.py CHANGED
@@ -27,29 +27,62 @@ from src.display.utils import (
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
 
 
30
 
31
 
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
 
35
  ### Space initialisation
36
  try:
37
  print(EVAL_REQUESTS_PATH)
38
  snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
 
40
  )
41
  except Exception:
42
  restart_space()
43
  try:
44
  print(EVAL_RESULTS_PATH)
45
  snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
 
47
  )
48
  except Exception:
49
  restart_space()
50
 
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
 
54
  (
55
  finished_eval_queue_df,
@@ -57,6 +90,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
57
  pending_eval_queue_df,
58
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
 
 
60
  def init_leaderboard(dataframe):
61
  if dataframe is None or dataframe.empty:
62
  raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -64,26 +98,28 @@ def init_leaderboard(dataframe):
64
  value=dataframe,
65
  datatype=[c.type for c in fields(AutoEvalColumn)],
66
  select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
 
70
  ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
 
87
  bool_checkboxgroup_label="Hide models",
88
  interactive=False,
89
  )
@@ -95,21 +131,33 @@ with demo:
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
  with gr.Column():
106
  with gr.Row():
107
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
 
109
  with gr.Column():
110
  with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
  ):
114
  with gr.Row():
115
  finished_eval_table = gr.components.Dataframe(
@@ -119,8 +167,8 @@ with demo:
119
  row_count=5,
120
  )
121
  with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
  ):
125
  with gr.Row():
126
  running_eval_table = gr.components.Dataframe(
@@ -131,8 +179,8 @@ with demo:
131
  )
132
 
133
  with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
  ):
137
  with gr.Row():
138
  pending_eval_table = gr.components.Dataframe(
@@ -142,51 +190,54 @@ with demo:
142
  row_count=5,
143
  )
144
  with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
 
27
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
  from src.submission.submit import add_new_eval
30
+ from src.utils import norm_sNavie, pivot_df
31
+ import ipdb
32
 
33
 
34
  def restart_space():
35
  API.restart_space(repo_id=REPO_ID)
36
 
37
+
38
  ### Space initialisation
39
  try:
40
  print(EVAL_REQUESTS_PATH)
41
  snapshot_download(
42
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
43
+ token=TOKEN
44
  )
45
  except Exception:
46
  restart_space()
47
  try:
48
  print(EVAL_RESULTS_PATH)
49
  snapshot_download(
50
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
51
+ token=TOKEN
52
  )
53
  except Exception:
54
  restart_space()
55
 
56
+ # # LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
57
+ # df = pd.read_csv('LOTSAv2_EvalBenchmark(Long).csv')
58
+ # # Step 2: Pivot the DataFrame
59
+ # LEADERBOARD_DF = df.pivot_table(index='model',
60
+ # columns='dataset',
61
+ # values='eval_metrics/MAE[0.5]',
62
+ # aggfunc='first')
63
+ # LEADERBOARD_DF.drop(columns=['ALL'], inplace=True)
64
+ #
65
+ # # Reset the index if you want the model column to be part of the DataFrame
66
+ # LEADERBOARD_DF.reset_index(inplace=True)
67
+ # # Step 3: noramlize the values
68
+ # # ipdb.set_trace()
69
+ # LEADERBOARD_DF = norm_sNavie(LEADERBOARD_DF)
70
+ #
71
+ # # LEADERBOARD_DF['Average'] = LEADERBOARD_DF.mean(axis=1)
72
+ # # LEADERBOARD_DF.insert(1, 'Average', LEADERBOARD_DF.pop('Average'))
73
+ # # LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by=['Average'], ascending=True)
74
+ # print(f"The leaderboard is {LEADERBOARD_DF}")
75
+ # print(f'Columns: ', LEADERBOARD_DF.columns)
76
+
77
+ # LEADERBOARD_DF = pd.read_csv('pivoted_df.csv')
78
+ domain_df = pivot_df('results/grouped_results_by_domain.csv', tab_name='domain')
79
+ print(f'Domain dataframe is {domain_df}')
80
+ freq_df = pivot_df('results/grouped_results_by_frequency.csv', tab_name='frequency')
81
+ print(f'Freq dataframe is {freq_df}')
82
+ term_length_df = pivot_df('results/grouped_results_by_term_length.csv', tab_name='term_length')
83
+ print(f'Term length dataframe is {term_length_df}')
84
+ variate_type_df = pivot_df('results/grouped_results_by_univariate.csv', tab_name='univariate')
85
+ print(f'Variate type dataframe is {variate_type_df}')
86
 
87
  (
88
  finished_eval_queue_df,
 
90
  pending_eval_queue_df,
91
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
92
 
93
+
94
  def init_leaderboard(dataframe):
95
  if dataframe is None or dataframe.empty:
96
  raise ValueError("Leaderboard DataFrame is empty or None.")
 
98
  value=dataframe,
99
  datatype=[c.type for c in fields(AutoEvalColumn)],
100
  select_columns=SelectColumns(
101
+ # default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and c.name not in ['params', 'available_on_hub', 'hub', 'Model sha','Hub License']],
102
+ default_selection=list(dataframe.columns),
103
+ cant_deselect=['model'],
104
+ label="Select Datasets to Display:",
105
  ),
106
+
107
+ search_columns=['model'],
108
+ # hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
109
+ # filter_columns=[
110
+ # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
111
+ # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
112
+ # ColumnFilter(
113
+ # AutoEvalColumn.params.name,
114
+ # type="slider",
115
+ # min=0.01,
116
+ # max=500,
117
+ # label="Select the number of parameters (B)",
118
+ # ),
119
+ # ColumnFilter(
120
+ # AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
121
+ # ),
122
+ # ],
123
  bool_checkboxgroup_label="Hide models",
124
  interactive=False,
125
  )
 
131
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
132
 
133
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
134
+ with gr.TabItem("🏅 By Domain", elem_id="llm-benchmark-tab-table", id=0):
135
+ leaderboard = init_leaderboard(domain_df)
136
+ print(f"FINAL Domain LEADERBOARD 1 {domain_df}")
137
+
138
+ with gr.TabItem("🏅 By Frequency", elem_id="llm-benchmark-tab-table", id=1):
139
+ leaderboard = init_leaderboard(freq_df)
140
+ print(f"FINAL Frequency LEADERBOARD 1 {freq_df}")
141
+
142
+ with gr.TabItem("🏅 By Term Length", elem_id="llm-benchmark-tab-table", id=2):
143
+ leaderboard = init_leaderboard(term_length_df)
144
+ print(f"FINAL term length LEADERBOARD 1 {term_length_df}")
145
+
146
+ with gr.TabItem("🏅 By Variate Type", elem_id="llm-benchmark-tab-table", id=3):
147
+ leaderboard = init_leaderboard(variate_type_df)
148
+ print(f"FINAL LEADERBOARD 1 {variate_type_df}")
149
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
150
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
151
 
152
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
153
  with gr.Column():
154
  with gr.Row():
155
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
156
 
157
  with gr.Column():
158
  with gr.Accordion(
159
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
160
+ open=False,
161
  ):
162
  with gr.Row():
163
  finished_eval_table = gr.components.Dataframe(
 
167
  row_count=5,
168
  )
169
  with gr.Accordion(
170
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
171
+ open=False,
172
  ):
173
  with gr.Row():
174
  running_eval_table = gr.components.Dataframe(
 
179
  )
180
 
181
  with gr.Accordion(
182
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
183
+ open=False,
184
  ):
185
  with gr.Row():
186
  pending_eval_table = gr.components.Dataframe(
 
190
  row_count=5,
191
  )
192
  with gr.Row():
193
+ gr.Markdown("# ✉️✨ Submit your model outputs!", elem_classes="markdown-text")
194
+ gr.Markdown(
195
+ "Generate outputs for all models using the ContextualBench code and email them to us at xnguyen@salesforce.com.",
196
+ elem_classes="markdown-text")
197
+
198
+ # with gr.Row():
199
+ # with gr.Column():
200
+ # model_name_textbox = gr.Textbox(label="Model name")
201
+ # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
202
+ # model_type = gr.Dropdown(
203
+ # choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
204
+ # label="Model type",
205
+ # multiselect=False,
206
+ # value=None,
207
+ # interactive=True,
208
+ # )
209
+
210
+ # with gr.Column():
211
+ # precision = gr.Dropdown(
212
+ # choices=[i.value.name for i in Precision if i != Precision.Unknown],
213
+ # label="Precision",
214
+ # multiselect=False,
215
+ # value="float16",
216
+ # interactive=True,
217
+ # )
218
+ # weight_type = gr.Dropdown(
219
+ # choices=[i.value.name for i in WeightType],
220
+ # label="Weights type",
221
+ # multiselect=False,
222
+ # value="Original",
223
+ # interactive=True,
224
+ # )
225
+ # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
226
+
227
+ # submit_button = gr.Button("Submit Eval")
228
+ # submission_result = gr.Markdown()
229
+ # submit_button.click(
230
+ # add_new_eval,
231
+ # [
232
+ # model_name_textbox,
233
+ # base_model_name_textbox,
234
+ # revision_name_textbox,
235
+ # precision,
236
+ # weight_type,
237
+ # model_type,
238
+ # ],
239
+ # submission_result,
240
+ # )
241
 
242
  with gr.Row():
243
  with gr.Accordion("📙 Citation", open=False):
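For reviewers who want to sanity-check the new data flow without launching the Space, here is a minimal sketch (not part of the commit; it assumes you run it from the repository root so that `src/` and `results/` resolve) that loads the four grouped-results CSVs through `pivot_df` exactly as `app.py` does:

```python
# Hypothetical smoke test for the pivoted leaderboard tables (not in the commit).
from src.utils import pivot_df

files = {
    "domain": "results/grouped_results_by_domain.csv",
    "frequency": "results/grouped_results_by_frequency.csv",
    "term_length": "results/grouped_results_by_term_length.csv",
    "univariate": "results/grouped_results_by_univariate.csv",
}

for tab_name, path in files.items():
    df = pivot_df(path, tab_name=tab_name)
    # Each pivoted table keeps a 'model' column plus one column per
    # (category, metric) pair, e.g. 'Energy (MAPE)' and 'Energy (CRPS)'.
    assert "model" in df.columns and not df.empty
    print(tab_name, df.shape)
```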
results/grouped_results_by_domain.csv ADDED
@@ -0,0 +1,148 @@
1
+ domain,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
2
+ Econ/Fin,auto_arima,0.9365192023498371,0.82547313686996,7.166666666666667
3
+ Econ/Fin,auto_ets,1.069790707228502,0.9914310347244895,9.0
4
+ Econ/Fin,auto_theta,1.0054900702449128,0.846318218177465,7.833333333333333
5
+ Econ/Fin,chronos-small,0.8016337645320165,0.7776047759026133,7.5
6
+ Econ/Fin,chronos_base,0.8020694855013312,0.7682503560215578,6.333333333333333
7
+ Econ/Fin,chronos_large,0.7991640460606071,0.7743218919662125,7.0
8
+ Econ/Fin,crossformer,34.27971965234223,126.76758253255495,21.0
9
+ Econ/Fin,d_linear,1.1148346887593732,1.1323705430596809,17.5
10
+ Econ/Fin,deepar,1.2278569268049724,1.4148933569849873,14.333333333333334
11
+ Econ/Fin,i_transformer,1.0214445148835511,0.8548360574080038,8.333333333333334
12
+ Econ/Fin,moirai_1.1_R_base_no_leak,1.2693975078273285,1.035652986793995,15.5
13
+ Econ/Fin,moirai_1.1_R_large_no_leak,0.879988898990186,0.7355518100899686,3.1666666666666665
14
+ Econ/Fin,moirai_1.1_R_small_no_leak,1.035635272136103,0.8530614545680443,10.833333333333334
15
+ Econ/Fin,n_beats,0.8629634668807649,0.9855058224964246,12.666666666666666
16
+ Econ/Fin,naive,1.222637910798122,1.3614416351328862,15.166666666666666
17
+ Econ/Fin,patch_tst,0.9463918645805333,0.8154506342478772,6.666666666666667
18
+ Econ/Fin,seasonal_naive,1.0,1.0,14.833333333333334
19
+ Econ/Fin,tft,1.1360880871257253,0.8476154067835847,7.833333333333333
20
+ Econ/Fin,tide,1.1466847399238815,1.0894612064116727,17.0
21
+ Econ/Fin,timesfm,0.840899493300519,0.7288672456979325,4.833333333333333
22
+ Econ/Fin,visionts,0.9927325904977805,1.056804867374159,16.5
23
+ Energy,auto_arima,1.020271660057138,0.8634432097816029,12.28125
24
+ Energy,auto_ets,1.3407403717825905,4.309210526315789e+23,15.8125
25
+ Energy,auto_theta,1.3500329947825767,2.288546713618502,16.90625
26
+ Energy,chronos-small,0.9106949318701584,0.684989122019499,8.59375
27
+ Energy,chronos_base,0.908599317521494,0.6608616701619849,7.375
28
+ Energy,chronos_large,0.9122256820323856,0.6592412529602467,7.25
29
+ Energy,crossformer,9236423597.08186,11.594762627902593,14.25
30
+ Energy,d_linear,1.1889257931784007,0.9161631466345352,14.71875
31
+ Energy,deepar,1.678068514068864,1.3746230213328996,14.6875
32
+ Energy,i_transformer,1.2394136898117367,0.8256530209999239,7.0625
33
+ Energy,moirai_1.1_R_base_no_leak,0.9874963954081143,0.6351616784559907,5.6875
34
+ Energy,moirai_1.1_R_large_no_leak,0.8909713530536371,0.6258812753967218,5.375
35
+ Energy,moirai_1.1_R_small_no_leak,0.8991945372018496,0.6974970102561562,7.0
36
+ Energy,n_beats,1.1689169414175054,0.9804388197933043,15.09375
37
+ Energy,naive,1.2329496956881505,1.717544651501066,18.21875
38
+ Energy,patch_tst,1.0002148326675349,0.6519449023593654,5.75
39
+ Energy,seasonal_naive,1.0,1.0,15.84375
40
+ Energy,tft,1.094935043145979,0.6698073412229205,7.28125
41
+ Energy,tide,1.2969242335580595,0.8815970753846475,10.46875
42
+ Energy,timesfm,0.990249590684815,0.7036315984358718,8.25
43
+ Energy,visionts,1.1457219960976157,0.8253258612638164,13.09375
44
+ Healthcare,auto_arima,0.7829904478130371,0.6313761904761905,7.8
45
+ Healthcare,auto_ets,0.8061744759695987,0.6378749206349206,7.2
46
+ Healthcare,auto_theta,0.9667122096533645,0.8276838095238095,12.0
47
+ Healthcare,chronos-small,0.6676511382266025,0.5753475132275132,5.8
48
+ Healthcare,chronos_base,0.7248913460690712,0.5941960317460317,5.0
49
+ Healthcare,chronos_large,0.6616148175532721,0.5314045502645502,4.2
50
+ Healthcare,crossformer,454.59633819128214,70.08870634920635,16.0
51
+ Healthcare,d_linear,0.8850081956016165,0.9878685185185185,16.0
52
+ Healthcare,deepar,0.8566987649911045,0.8641066137566138,10.4
53
+ Healthcare,i_transformer,0.9023075876752837,0.7645924867724868,9.4
54
+ Healthcare,moirai_1.1_R_base_no_leak,1.1475007332958291,1.073448677248677,15.8
55
+ Healthcare,moirai_1.1_R_large_no_leak,0.8166086394810568,0.68611,8.0
56
+ Healthcare,moirai_1.1_R_small_no_leak,1.0612957366806712,0.925962328042328,14.8
57
+ Healthcare,n_beats,0.7698977996924792,0.8690820634920635,14.0
58
+ Healthcare,naive,1.2056710696279012,1.2890160846560845,17.8
59
+ Healthcare,patch_tst,0.7946209932133224,0.6791721164021164,8.6
60
+ Healthcare,seasonal_naive,1.0,1.0,15.0
61
+ Healthcare,tft,0.7904105809823141,0.7010021693121693,8.2
62
+ Healthcare,tide,0.8426731528233347,1.1561685185185187,14.4
63
+ Healthcare,timesfm,0.791330416522951,0.7994040740740741,7.8
64
+ Healthcare,visionts,0.8034595649263452,0.7956369841269841,12.8
65
+ Nature,auto_arima,0.9361489953148219,0.7095217542336816,14.4
66
+ Nature,auto_ets,1.206307582700855,76254692.36191763,16.933333333333334
67
+ Nature,auto_theta,5.150553292857,1.0319763336420513,16.4
68
+ Nature,chronos-small,0.9491555540245159,0.4571809141301187,9.666666666666666
69
+ Nature,chronos_base,0.8087692327609204,0.43483874046585586,8.733333333333333
70
+ Nature,chronos_large,0.7215301114853574,0.43339550624240464,8.2
71
+ Nature,crossformer,3.6857728593414816,1.8222011094109303,12.8
72
+ Nature,d_linear,1.6637383989524568,0.566411290270835,14.466666666666667
73
+ Nature,deepar,1.3368617172298543,0.784343921808244,11.866666666666667
74
+ Nature,i_transformer,1.0245163140037494,0.3923948352004396,6.533333333333333
75
+ Nature,moirai_1.1_R_base_no_leak,1.0846943426539009,0.42165427104639003,4.466666666666667
76
+ Nature,moirai_1.1_R_large_no_leak,0.9012043168826274,0.37755828000010155,4.133333333333334
77
+ Nature,moirai_1.1_R_small_no_leak,0.8636937125921123,0.400148437852242,4.4
78
+ Nature,n_beats,2.051183579793879,0.5865729148518006,14.533333333333333
79
+ Nature,naive,1.0153007149015423,1.5216687585771838,19.266666666666666
80
+ Nature,patch_tst,0.9757662316880771,0.40362241062795473,7.266666666666667
81
+ Nature,seasonal_naive,1.0,1.0,18.6
82
+ Nature,tft,1.3479799792947338,0.4024676715316202,7.466666666666667
83
+ Nature,tide,1.6518355265449745,0.648933595846154,13.466666666666667
84
+ Nature,timesfm,1.0759825145269837,0.38186899336385965,5.733333333333333
85
+ Nature,visionts,1.0368902840216354,0.4874755604862317,11.666666666666666
86
+ Sales,auto_arima,0.7716630938105196,0.4828581089269842,14.25
87
+ Sales,auto_ets,0.9017684593360312,30.76895095733506,17.25
88
+ Sales,auto_theta,0.8258637946630958,0.5029359984438486,14.5
89
+ Sales,chronos-small,0.7186805493171662,0.3848441218930615,9.0
90
+ Sales,chronos_base,0.7008513669210537,0.3850621220616795,7.25
91
+ Sales,chronos_large,0.7034956230009589,0.3844707529650858,7.25
92
+ Sales,crossformer,1.4946326987237475,7.655215975652274,20.75
93
+ Sales,d_linear,0.7999361690904114,0.5046638267307796,14.25
94
+ Sales,deepar,0.7388013496334613,0.3684882238444817,6.25
95
+ Sales,i_transformer,0.7592707077676131,0.37054645387589946,4.75
96
+ Sales,moirai_1.1_R_base_no_leak,0.667796706791987,0.5158002274792624,9.0
97
+ Sales,moirai_1.1_R_large_no_leak,0.6706792874796048,0.4063993273240754,5.25
98
+ Sales,moirai_1.1_R_small_no_leak,0.6717145779320488,0.4624986481003004,8.5
99
+ Sales,n_beats,0.7261206955984014,0.42619466975098175,11.0
100
+ Sales,naive,0.9988290398126464,0.9354131622562287,19.0
101
+ Sales,patch_tst,0.7506252415562384,0.36695813811595074,3.25
102
+ Sales,seasonal_naive,1.0,1.0,19.25
103
+ Sales,tft,0.7571398644569189,0.3639182778535524,8.0
104
+ Sales,tide,1.0042130411120884,0.5031160265435741,14.0
105
+ Sales,timesfm,0.6834660865486862,0.36525039257779146,2.75
106
+ Sales,visionts,0.8110052069079339,0.5234893692225551,15.5
107
+ Transport,auto_arima,1.067553229756302,0.7895352174994626,15.866666666666667
108
+ Transport,auto_ets,1.2519332616788197,62214211389283.484,18.333333333333332
109
+ Transport,auto_theta,1.080401746635928,1.484666133944374,18.733333333333334
110
+ Transport,chronos-small,0.8463413166527496,0.6018945114961274,10.066666666666666
111
+ Transport,chronos_base,0.8525884818870904,0.5855383296935212,8.066666666666666
112
+ Transport,chronos_large,0.847275145385676,0.5853558157193545,8.4
113
+ Transport,crossformer,2.133541126273085,2.824391583266013,10.466666666666667
114
+ Transport,d_linear,0.9088963832125505,0.7037765247623837,14.733333333333333
115
+ Transport,deepar,0.8113936242603784,0.5544948055430984,6.466666666666667
116
+ Transport,i_transformer,0.827077979022359,0.4999616864926626,5.866666666666666
117
+ Transport,moirai_1.1_R_base_no_leak,0.8561472773934119,0.47760992257555535,6.133333333333334
118
+ Transport,moirai_1.1_R_large_no_leak,0.9275713341627421,0.5021373535569643,6.666666666666667
119
+ Transport,moirai_1.1_R_small_no_leak,0.910054545689888,0.5002405082060885,8.333333333333334
120
+ Transport,n_beats,0.759611430343423,0.6406924635381934,12.6
121
+ Transport,naive,1.4793208069977917,2.2909473535610148,20.133333333333333
122
+ Transport,patch_tst,0.8021327551126702,0.5059201467965427,5.8
123
+ Transport,seasonal_naive,1.0,1.0,17.333333333333332
124
+ Transport,tft,0.8220948248404197,0.485294449011853,4.8
125
+ Transport,tide,0.8779217036886292,0.5698046392964627,10.4
126
+ Transport,timesfm,0.9283663454018408,0.577738758232893,8.133333333333333
127
+ Transport,visionts,0.8701382141384387,0.6655022406963716,13.666666666666666
128
+ Web/CloudOps,auto_arima,0.8940721359171526,0.9239632177767032,14.5
129
+ Web/CloudOps,auto_ets,1.1484024357848706,3541668.1195238987,17.05
130
+ Web/CloudOps,auto_theta,0.8325449233161077,0.7377937575734188,12.1
131
+ Web/CloudOps,chronos-small,1.1477382857881004,0.7519221963095372,10.4
132
+ Web/CloudOps,chronos_base,1.2983512147050473,0.8105248727247287,11.35
133
+ Web/CloudOps,chronos_large,1.3303495508509569,0.79130505302003,11.65
134
+ Web/CloudOps,crossformer,3.905809488486181,0.7282280986973914,11.35
135
+ Web/CloudOps,d_linear,1.679664570939319,0.813864694620387,12.55
136
+ Web/CloudOps,deepar,0.8567686630861442,0.7806071927900515,11.7
137
+ Web/CloudOps,i_transformer,0.7194432840929166,0.5224562708709003,4.5
138
+ Web/CloudOps,moirai_1.1_R_base_no_leak,1.0566434817767107,0.7682523197700815,9.25
139
+ Web/CloudOps,moirai_1.1_R_large_no_leak,0.7913068650225961,0.7415333306227597,8.35
140
+ Web/CloudOps,moirai_1.1_R_small_no_leak,0.797099135532333,0.7437898694659932,8.5
141
+ Web/CloudOps,n_beats,0.6423921434834379,0.6616483361015169,10.3
142
+ Web/CloudOps,naive,1.1134728329755728,1.1880618871151416,16.7
143
+ Web/CloudOps,patch_tst,0.6023812811006274,0.517794941208908,3.95
144
+ Web/CloudOps,seasonal_naive,1.0,1.0,16.35
145
+ Web/CloudOps,tft,1.3456759309631106,0.6485214709355084,5.95
146
+ Web/CloudOps,tide,0.957645003291147,0.6729746748245962,9.95
147
+ Web/CloudOps,timesfm,2.3672130873427584,0.9761625637942284,13.9
148
+ Web/CloudOps,visionts,0.8379189396040971,0.7244329358471615,10.65
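As a reference for anyone consuming this file directly, a short sketch (hypothetical, assuming pandas and the repo-relative path) that lists the best-ranked model per domain; lower `rank` is better, and `seasonal_naive` is the 1.0 baseline for the normalized metric columns:

```python
import pandas as pd

# Columns: domain, model, eval_metrics/MAPE[0.5],
#          eval_metrics/mean_weighted_sum_quantile_loss (CRPS), rank.
df = pd.read_csv("results/grouped_results_by_domain.csv")

# Pick the best-ranked model within each domain.
best = df.loc[df.groupby("domain")["rank"].idxmin(), ["domain", "model", "rank"]]
print(best.to_string(index=False))
```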
results/grouped_results_by_frequency.csv ADDED
@@ -0,0 +1,211 @@
1
+ frequency,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
2
+ 10S,auto_arima,1.0,1.0,8.5
3
+ 10S,auto_ets,1.7729614789749542,2.9904066969242904,19.5
4
+ 10S,auto_theta,0.6490202004866038,0.4498416650221458,1.0
5
+ 10S,chronos-small,2.4209056102186177,1.0976423357911036,10.833333333333334
6
+ 10S,chronos_base,2.9496999072884034,1.2710971183275626,12.333333333333334
7
+ 10S,chronos_large,3.0233397960335355,1.1850701939432435,11.666666666666666
8
+ 10S,crossformer,10.78297867187441,0.9632338077490566,11.5
9
+ 10S,d_linear,3.464549733595483,1.1289544947882015,11.666666666666666
10
+ 10S,deepar,1.8262604211066729,1.028182807201692,11.166666666666666
11
+ 10S,i_transformer,1.3136663904154584,0.6908224736822052,2.5
12
+ 10S,moirai_1.1_R_base_no_leak,2.3417832113521384,1.4018041619123724,16.5
13
+ 10S,moirai_1.1_R_large_no_leak,1.5520209642771396,1.2388948175006333,14.166666666666666
14
+ 10S,moirai_1.1_R_small_no_leak,1.6476110189439555,1.3097769500283245,14.833333333333334
15
+ 10S,n_beats,0.9967269159720157,0.821329634043785,6.666666666666667
16
+ 10S,naive,1.9717621185326415,1.537476210057875,16.333333333333332
17
+ 10S,patch_tst,1.059144382825294,0.732123599927497,5.0
18
+ 10S,seasonal_naive,1.0,1.0,9.5
19
+ 10S,tft,3.2429931822722793,1.0464406489926144,7.666666666666667
20
+ 10S,tide,1.6709303370483655,0.9849620277425579,10.166666666666666
21
+ 10S,timesfm,6.2343560526627115,1.8234677065697433,19.333333333333332
22
+ 10S,visionts,1.0968489534913595,0.9341805708581102,10.166666666666666
23
+ 10T,auto_arima,1.0,1.0,15.5
24
+ 10T,auto_ets,1.655224521738566,2.151457928700739,16.666666666666668
25
+ 10T,auto_theta,2.629806275955802,3.6738488709022494,19.666666666666668
26
+ 10T,chronos-small,1.5117776833428394,0.6428724843624029,12.166666666666666
27
+ 10T,chronos_base,1.2748386240996392,0.5553606639434913,8.833333333333334
28
+ 10T,chronos_large,1.0346877837456316,0.5513783261338567,8.0
29
+ 10T,crossformer,19934536143.591114,2.1015695650743917,11.5
30
+ 10T,d_linear,1.299502689362975,0.6517937533428791,11.833333333333334
31
+ 10T,deepar,0.6449733779553711,0.5921106678712438,10.666666666666666
32
+ 10T,i_transformer,0.7796030121280376,0.590764568044439,6.833333333333333
33
+ 10T,moirai_1.1_R_base_no_leak,0.9116352990069784,0.4334215808615318,5.0
34
+ 10T,moirai_1.1_R_large_no_leak,1.0458495459083907,0.4946922044906253,7.5
35
+ 10T,moirai_1.1_R_small_no_leak,0.5833924359839285,0.5920693990394471,8.5
36
+ 10T,n_beats,1.2619963794129159,0.7808649076454232,13.166666666666666
37
+ 10T,naive,0.7585251839995307,2.2345586312657204,19.333333333333332
38
+ 10T,patch_tst,0.8811585844239113,0.5583960366038973,7.333333333333333
39
+ 10T,seasonal_naive,1.0,1.0,16.5
40
+ 10T,tft,1.2009513786157717,0.4124334801906368,4.333333333333333
41
+ 10T,tide,1.1631853970897563,0.655064716092509,11.833333333333334
42
+ 10T,timesfm,1.0220126106014014,0.5696636404743637,8.333333333333334
43
+ 10T,visionts,1.0799543876471867,0.5014732753557328,7.5
44
+ 15T,auto_arima,1.1006594936981116,0.9576282754708779,13.916666666666666
45
+ 15T,auto_ets,1.2708218834942537,77767764236603.9,18.166666666666668
46
+ 15T,auto_theta,0.9891316566321592,1.7607790421016025,17.0
47
+ 15T,chronos-small,0.9607408272903095,0.7932548659240278,9.75
48
+ 15T,chronos_base,0.9515313401712095,0.7673882941899328,8.5
49
+ 15T,chronos_large,0.9444257867877283,0.7636894556076325,8.166666666666666
50
+ 15T,crossformer,6.318458905448402,3.106359900469947,13.25
51
+ 15T,d_linear,1.0317429035906032,0.9374198238688525,14.083333333333334
52
+ 15T,deepar,1.9585178196595348,1.6805423348003437,14.5
53
+ 15T,i_transformer,0.8998429174870893,0.658390212420039,3.5
54
+ 15T,moirai_1.1_R_base_no_leak,1.010717233317842,0.7067981834067827,5.583333333333333
55
+ 15T,moirai_1.1_R_large_no_leak,0.939877854260836,0.675873678929174,4.416666666666667
56
+ 15T,moirai_1.1_R_small_no_leak,1.0967135488745525,0.7930862976995009,8.666666666666666
57
+ 15T,n_beats,0.9635718857733048,0.9892577700794088,15.0
58
+ 15T,naive,1.4410754166321356,2.423171378732352,19.75
59
+ 15T,patch_tst,0.874459260826287,0.663243746282057,3.8333333333333335
60
+ 15T,seasonal_naive,1.0,1.0,15.166666666666666
61
+ 15T,tft,1.0582677267954164,0.7318724576230791,6.583333333333333
62
+ 15T,tide,0.9787252364392137,0.8073681981088243,10.083333333333334
63
+ 15T,timesfm,1.0222202532066558,0.7930028114468964,8.25
64
+ 15T,visionts,0.9819059200401176,0.8783919060935471,12.833333333333334
65
+ 5T,auto_arima,1.0,1.0,16.25
66
+ 5T,auto_ets,1.0086175993752609,1.0467883914512524,15.5
67
+ 5T,auto_theta,0.964060428373576,1.0460351344918941,16.666666666666668
68
+ 5T,chronos-small,0.7802675296137581,0.7516150339138812,11.166666666666666
69
+ 5T,chronos_base,0.7630965336585342,0.7494651012783219,11.25
70
+ 5T,chronos_large,0.7718811079792048,0.7552409156866928,11.916666666666666
71
+ 5T,crossformer,1.2734965922255723,1.040860998323226,11.416666666666666
72
+ 5T,d_linear,0.9697247766365901,0.8333404387801792,14.5
73
+ 5T,deepar,0.6722225413838093,0.7971245067076417,13.25
74
+ 5T,i_transformer,0.6604920249872609,0.5433487060646278,5.5
75
+ 5T,moirai_1.1_R_base_no_leak,0.6004862717957414,0.5407632644099643,4.75
76
+ 5T,moirai_1.1_R_large_no_leak,0.5519935614821848,0.5321255655856548,3.25
77
+ 5T,moirai_1.1_R_small_no_leak,0.5295698754870711,0.5360775348348382,4.416666666666667
78
+ 5T,n_beats,0.6343946014070763,0.7257573125150426,12.416666666666666
79
+ 5T,naive,0.8443455210066756,1.40000213208879,17.166666666666668
80
+ 5T,patch_tst,0.5873226620946504,0.5432646554948832,4.75
81
+ 5T,seasonal_naive,1.0,1.0,17.75
82
+ 5T,tft,0.6434051298457072,0.5565798833891061,4.916666666666667
83
+ 5T,tide,0.7950041895659674,0.64766319142376,10.083333333333334
84
+ 5T,timesfm,0.8202436454444181,0.730338811538158,11.916666666666666
85
+ 5T,visionts,0.8757565714382182,0.7336813267313066,12.166666666666666
86
+ A,auto_arima,1.0171428571428571,0.9420289855072463,10.0
87
+ A,auto_ets,0.9371428571428573,0.8043478260869564,3.0
88
+ A,auto_theta,0.9371428571428573,0.8333333333333333,6.0
89
+ A,chronos-small,1.0,1.0072463768115942,17.0
90
+ A,chronos_base,0.9771428571428573,0.9782608695652174,13.0
91
+ A,chronos_large,0.9771428571428573,0.9782608695652174,14.0
92
+ A,crossformer,6.857142857142858,102.89855072463767,21.0
93
+ A,d_linear,1.062857142857143,1.2173913043478262,20.0
94
+ A,deepar,1.0171428571428571,0.8188405797101449,4.0
95
+ A,i_transformer,1.0342857142857143,0.8478260869565217,7.0
96
+ A,moirai_1.1_R_base_no_leak,1.2057142857142857,0.9420289855072463,11.0
97
+ A,moirai_1.1_R_large_no_leak,0.9542857142857144,0.7753623188405796,1.0
98
+ A,moirai_1.1_R_small_no_leak,0.9771428571428573,0.8260869565217391,5.0
99
+ A,n_beats,0.9028571428571429,0.9710144927536232,12.0
100
+ A,naive,1.0,0.9927536231884058,15.0
101
+ A,patch_tst,1.0057142857142858,0.8478260869565217,8.0
102
+ A,seasonal_naive,1.0,1.0,16.0
103
+ A,tft,0.9257142857142858,0.7971014492753623,2.0
104
+ A,tide,1.2057142857142857,1.1231884057971013,18.0
105
+ A,timesfm,0.9714285714285715,0.8478260869565217,9.0
106
+ A,visionts,1.0914285714285714,1.1521739130434783,19.0
107
+ D,auto_arima,0.8529021613038067,0.4985004151026563,11.0
108
+ D,auto_ets,0.9328318534943983,9.346363276951774,14.333333333333334
109
+ D,auto_theta,0.9291332918118479,0.5748243358418598,14.333333333333334
110
+ D,chronos-small,0.7368243034917711,0.43658660925543036,8.066666666666666
111
+ D,chronos_base,0.6862749899568191,0.421268869505568,6.733333333333333
112
+ D,chronos_large,0.6959915152342351,0.4206978284635147,6.666666666666667
113
+ D,crossformer,154.59928198095446,26.923731402565622,18.4
114
+ D,d_linear,0.8979716998083668,0.6146338678499457,15.133333333333333
115
+ D,deepar,0.8010158939552874,0.5938211197218363,11.066666666666666
116
+ D,i_transformer,0.8178290414664529,0.4945942845202976,8.466666666666667
117
+ D,moirai_1.1_R_base_no_leak,0.7453788326511405,0.4633929439816138,7.466666666666667
118
+ D,moirai_1.1_R_large_no_leak,0.6628066770710596,0.39170739564229085,4.333333333333333
119
+ D,moirai_1.1_R_small_no_leak,0.7086474258839043,0.41290002863504544,6.2
120
+ D,n_beats,0.7960758852718125,0.571280865579523,14.4
121
+ D,naive,1.0,0.7975936297433697,17.6
122
+ D,patch_tst,0.7522257716055208,0.43688965053339623,7.2
123
+ D,seasonal_naive,1.0,1.0,19.0
124
+ D,tft,0.7622852957686599,0.4136783788887685,6.666666666666667
125
+ D,tide,0.9969048876212023,0.7484054326947167,13.533333333333333
126
+ D,timesfm,0.779898124642687,0.49167132325793367,5.6
127
+ D,visionts,0.8948792108968546,0.5558947866812461,14.8
128
+ H,auto_arima,0.9539421964948747,0.7767300171583383,15.483870967741936
129
+ H,auto_ets,1.328807641048494,4.448217317487267e+23,18.774193548387096
130
+ H,auto_theta,3.078013295555958,1.851945783529741,18.870967741935484
131
+ H,chronos-small,0.7327688330840401,0.5144358964770991,8.580645161290322
132
+ H,chronos_base,0.7385494318069303,0.505942185952322,7.806451612903226
133
+ H,chronos_large,0.7478851839242808,0.5091430423436873,8.0
134
+ H,crossformer,5676075428.759688,5.282163959765436,11.0
135
+ H,d_linear,1.3543590165990786,0.6813103317146129,14.129032258064516
136
+ H,deepar,1.1775584990695886,0.8933529848405231,10.870967741935484
137
+ H,i_transformer,0.960319640638691,0.476223391238969,6.129032258064516
138
+ H,moirai_1.1_R_base_no_leak,0.9074224107057469,0.46296554507116067,5.290322580645161
139
+ H,moirai_1.1_R_large_no_leak,0.8513557808071407,0.49904989135530137,6.548387096774194
140
+ H,moirai_1.1_R_small_no_leak,0.8133232904541264,0.4778830172118464,6.32258064516129
141
+ H,n_beats,1.3440056550191246,0.6646047122901354,13.419354838709678
142
+ H,naive,1.3316850451655482,1.8366794323255176,19.64516129032258
143
+ H,patch_tst,0.9278036240195197,0.46244079149616185,4.967741935483871
144
+ H,seasonal_naive,1.0,1.0,17.64516129032258
145
+ H,tft,1.1074721197519513,0.48721235955499614,6.645161290322581
146
+ H,tide,1.2515715821404725,0.5695063335623101,10.580645161290322
147
+ H,timesfm,0.9815849142972823,0.5102566138085366,8.35483870967742
148
+ H,visionts,0.989251601317308,0.6105910361801898,11.935483870967742
149
+ M,auto_arima,0.7897292418553448,0.7664432031389335,6.2
150
+ M,auto_ets,0.8246966155843344,0.7720428958818089,5.4
151
+ M,auto_theta,0.9196256618111756,0.8821604346726062,8.4
152
+ M,chronos-small,0.8622647029230233,0.8311626407190348,8.6
153
+ M,chronos_base,0.8946217478533949,0.8612725657711913,8.6
154
+ M,chronos_large,0.8472484866537497,0.81475133159962,8.2
155
+ M,crossformer,10.544298745396533,67.95761479628871,13.0
156
+ M,d_linear,1.081950231233934,1.1996315851033383,15.8
157
+ M,deepar,1.1770718831957356,1.0978451288049407,11.8
158
+ M,i_transformer,1.0050739577178593,0.8211301131061257,5.6
159
+ M,moirai_1.1_R_base_no_leak,1.894529314848318,1.4962012401441718,19.2
160
+ M,moirai_1.1_R_large_no_leak,0.9965918207836234,0.9172428125972104,9.6
161
+ M,moirai_1.1_R_small_no_leak,1.3843487814465443,1.2213717006565714,17.0
162
+ M,n_beats,0.9414022389216479,1.007258419666447,11.4
163
+ M,naive,1.2968159580843228,1.5810738327920664,19.2
164
+ M,patch_tst,0.9895171231152233,0.8480243667176828,7.6
165
+ M,seasonal_naive,1.0,1.0,13.4
166
+ M,tft,1.0540757144083999,0.8748338687935762,8.2
167
+ M,tide,1.0388749071339713,1.2356707678524768,15.4
168
+ M,timesfm,0.8627984408713761,0.7375099053093932,3.6
169
+ M,visionts,0.9379289156090017,1.0376995780191705,14.8
170
+ Q,auto_arima,0.8591549295774649,0.8225806451612904,5.0
171
+ Q,auto_ets,0.8591549295774649,0.7983870967741936,4.0
172
+ Q,auto_theta,0.8380281690140845,0.7973790322580646,2.0
173
+ Q,chronos-small,0.8239436619718311,0.8457661290322581,11.0
174
+ Q,chronos_base,0.8098591549295776,0.8397177419354839,8.0
175
+ Q,chronos_large,0.8098591549295776,0.8397177419354839,9.0
176
+ Q,crossformer,9.929577464788732,119.95967741935485,21.0
177
+ Q,d_linear,0.9859154929577466,1.1088709677419355,19.0
178
+ Q,deepar,0.9436619718309861,0.840725806451613,10.0
179
+ Q,i_transformer,0.9084507042253522,0.7973790322580646,3.0
180
+ Q,moirai_1.1_R_base_no_leak,1.4295774647887327,1.1290322580645162,20.0
181
+ Q,moirai_1.1_R_large_no_leak,0.8873239436619719,0.7883064516129034,1.0
182
+ Q,moirai_1.1_R_small_no_leak,1.0352112676056338,0.9324596774193549,13.0
183
+ Q,n_beats,0.8380281690140845,0.9717741935483871,15.0
184
+ Q,naive,0.9295774647887325,0.9506048387096774,14.0
185
+ Q,patch_tst,0.9366197183098592,0.8346774193548387,6.0
186
+ Q,seasonal_naive,1.0,1.0,16.0
187
+ Q,tft,0.9366197183098592,0.8366935483870969,7.0
188
+ Q,tide,1.1338028169014085,1.0181451612903227,17.0
189
+ Q,timesfm,0.8802816901408451,0.8528225806451613,12.0
190
+ Q,visionts,0.9366197183098592,1.0483870967741935,18.0
191
+ W,auto_arima,0.9759738266715013,0.748994017923637,9.875
192
+ W,auto_ets,0.971794800090248,0.7889859594373794,10.375
193
+ W,auto_theta,1.0807827233498426,0.8086208938554269,12.125
194
+ W,chronos-small,0.7075714340716913,0.554913070787174,5.25
195
+ W,chronos_base,0.7288517030693448,0.5619860060584291,4.5
196
+ W,chronos_large,0.7069908074836505,0.5515544393043641,4.375
197
+ W,crossformer,7.4085130347732155,49.820115045230985,20.375
198
+ W,d_linear,1.1283846019672135,0.97517397862518,16.875
199
+ W,deepar,1.862654170783469,1.3453366562739022,12.75
200
+ W,i_transformer,1.9015274495154941,1.3308062694365717,12.375
201
+ W,moirai_1.1_R_base_no_leak,0.9495981825008816,0.7483199869474646,9.75
202
+ W,moirai_1.1_R_large_no_leak,0.8957713665163969,0.6412758290160822,5.125
203
+ W,moirai_1.1_R_small_no_leak,0.9814158946324417,0.7581734604096699,8.625
204
+ W,n_beats,1.4457143774506727,1.0793531840148527,15.25
205
+ W,naive,1.0,0.875913187952162,13.625
206
+ W,patch_tst,0.9073153465406218,0.6981022157766977,8.0
207
+ W,seasonal_naive,1.0,1.0,16.5
208
+ W,tft,1.0396666971907644,0.7794595632824765,11.25
209
+ W,tide,1.7669411056805098,1.2225931248216542,13.125
210
+ W,timesfm,0.8615386867667885,0.6305866856856626,4.875
211
+ W,visionts,1.1355217147765972,0.9990150254993866,16.0
results/grouped_results_by_term_length.csv ADDED
@@ -0,0 +1,64 @@
1
+ term_length,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
2
+ long,auto_arima,1.0407648178370423,0.8433829698076462,15.095238095238095
3
+ long,auto_ets,1.2759037139699805,6.566416040544638e+23,18.666666666666668
4
+ long,auto_theta,3.1498850578770834,2.0601505251654686,17.38095238095238
5
+ long,chronos-small,1.0106686030682668,0.6462623576388796,11.857142857142858
6
+ long,chronos_base,0.9982206684476166,0.619712893481198,10.714285714285714
7
+ long,chronos_large,1.002171192668077,0.6208511887480599,11.047619047619047
8
+ long,crossformer,8164023382.865999,0.5072278949697392,8.285714285714286
9
+ long,d_linear,1.562322390892946,0.6730209945455377,13.238095238095237
10
+ long,deepar,1.701880985304897,0.9481973435022004,12.714285714285714
11
+ long,i_transformer,0.9961649824755919,0.45539997968708557,5.190476190476191
12
+ long,moirai_1.1_R_base_no_leak,1.0906983483127686,0.5005193850296795,5.428571428571429
13
+ long,moirai_1.1_R_large_no_leak,1.0073695402863057,0.538721786921065,6.761904761904762
14
+ long,moirai_1.1_R_small_no_leak,0.9192503918318538,0.5613725349527969,7.238095238095238
15
+ long,n_beats,1.1138460516712445,0.6641343878789057,11.952380952380953
16
+ long,naive,1.3505564252932616,2.111806155806968,19.61904761904762
17
+ long,patch_tst,0.9616382830362623,0.437002893484988,4.428571428571429
18
+ long,seasonal_naive,1.0,1.0,16.857142857142858
19
+ long,tft,1.1557571021609558,0.4529431176291461,4.904761904761905
20
+ long,tide,1.2799667076250163,0.5247494742051094,8.761904761904763
21
+ long,timesfm,1.796874605878608,0.6535892285331686,11.285714285714286
22
+ long,visionts,1.0324758074134404,0.5463144403056718,9.571428571428571
23
+ medium,auto_arima,0.9925217451047474,0.862989330588506,14.571428571428571
24
+ medium,auto_ets,1.5910566618211812,8307478.842344518,18.333333333333332
25
+ medium,auto_theta,1.8722187149301004,1.9594402227791685,17.428571428571427
26
+ medium,chronos-small,1.232070637409101,0.7382117459387273,11.476190476190476
27
+ medium,chronos_base,1.3793840327174318,0.777089828453266,11.571428571428571
28
+ medium,chronos_large,1.3264659611435963,0.7511074564157023,10.904761904761905
29
+ medium,crossformer,2510838525.595493,0.6608294014146207,8.571428571428571
30
+ medium,d_linear,1.389713789881514,0.766265949645093,13.857142857142858
31
+ medium,deepar,1.0742968052431434,0.8271972158292309,10.523809523809524
32
+ medium,i_transformer,1.0152562339844198,0.5288800110916518,4.619047619047619
33
+ medium,moirai_1.1_R_base_no_leak,1.1063058012973035,0.6371675078184162,6.380952380952381
34
+ medium,moirai_1.1_R_large_no_leak,0.9602362541309903,0.6248351913119883,6.761904761904762
35
+ medium,moirai_1.1_R_small_no_leak,0.9306757114627328,0.6605293213864295,7.523809523809524
36
+ medium,n_beats,1.1513061909700455,0.759822180558971,12.952380952380953
37
+ medium,naive,1.3041664719836585,2.0849508725128736,19.285714285714285
38
+ medium,patch_tst,0.9034466134216352,0.5192209987815241,4.0
39
+ medium,seasonal_naive,1.0,1.0,16.142857142857142
40
+ medium,tft,1.2614988599396852,0.5286354727281315,4.809523809523809
41
+ medium,tide,1.193587520563658,0.6109994159383925,9.523809523809524
42
+ medium,timesfm,1.524928039707925,0.8138067254337588,10.80952380952381
43
+ medium,visionts,1.0232867065677744,0.6690986273300213,10.952380952380953
44
+ short,auto_arima,0.9183158404157642,0.7782297371914904,11.872727272727273
45
+ short,auto_ets,1.0290159244978938,1287882.1620396667,13.781818181818181
46
+ short,auto_theta,1.1276737692944792,0.9555577026305212,13.527272727272727
47
+ short,chronos-small,0.7809947822060929,0.5973093408946208,7.254545454545455
48
+ short,chronos_base,0.7504208829287357,0.5900291013554769,6.163636363636364
49
+ short,chronos_large,0.7517445876583578,0.5860508463835682,6.2727272727272725
50
+ short,crossformer,1298062866.890411,28.58975534685244,17.581818181818182
51
+ short,d_linear,1.1372631367933204,0.8759004934560309,15.145454545454545
52
+ short,deepar,1.0793364184925522,1.0306040328879058,11.89090909090909
53
+ short,i_transformer,0.9683901902602449,0.7276300854958668,7.490909090909091
54
+ short,moirai_1.1_R_base_no_leak,0.9406066725109657,0.7078567359533983,8.836363636363636
55
+ short,moirai_1.1_R_large_no_leak,0.7726332258609173,0.6016193373717869,5.4
56
+ short,moirai_1.1_R_small_no_leak,0.8487478588017764,0.6661776706416213,8.145454545454545
57
+ short,n_beats,1.0323393751593084,0.8195659204638793,13.872727272727273
58
+ short,naive,1.104608870534287,1.20246170110538,17.21818181818182
59
+ short,patch_tst,0.80382107933716,0.6279448956983675,6.872727272727273
60
+ short,seasonal_naive,1.0,1.0,16.745454545454546
61
+ short,tft,1.0461332316971086,0.6755258572212629,8.218181818181819
62
+ short,tide,1.1230276648969284,0.9169276487302415,13.418181818181818
63
+ short,timesfm,0.9286461705272189,0.6445361651154075,6.636363636363637
64
+ short,visionts,0.9467929371136616,0.8196918268462717,14.654545454545454
results/grouped_results_by_univariate.csv ADDED
@@ -0,0 +1,43 @@
1
+ univariate,model,eval_metrics/MAPE[0.5],eval_metrics/mean_weighted_sum_quantile_loss,rank
2
+ False,auto_arima,0.9804811342495535,0.8713104754311666,13.906976744186046
3
+ False,auto_ets,1.3150898838758078,28247779.184341986,17.023255813953487
4
+ False,auto_theta,2.411812537314736,1.1875727874520958,14.976744186046512
5
+ False,chronos-small,1.0763507868085402,0.6645935333486652,9.511627906976743
6
+ False,chronos_base,1.108253824730984,0.6827654193124292,9.44186046511628
7
+ False,chronos_large,1.0950741555125842,0.6729041022551422,9.55813953488372
8
+ False,crossformer,2.654080994925351,0.9189271735725226,12.906976744186046
9
+ False,d_linear,1.6213889770481567,0.7823221048850953,13.488372093023257
10
+ False,deepar,1.5448649630735394,1.177574529017474,14.0
11
+ False,i_transformer,1.1492193189093056,0.6224209228728989,5.186046511627907
12
+ False,moirai_1.1_R_base_no_leak,1.0862615181082176,0.6282749871527136,6.27906976744186
13
+ False,moirai_1.1_R_large_no_leak,0.9324785498208167,0.6271354928559256,6.674418604651163
14
+ False,moirai_1.1_R_small_no_leak,0.8841632883529932,0.6261701600960712,6.325581395348837
15
+ False,n_beats,1.4270416026611565,0.7348091745010178,12.581395348837209
16
+ False,naive,1.2123354003513602,1.3984175456034713,17.697674418604652
17
+ False,patch_tst,0.8748906969616712,0.5267294212715652,5.023255813953488
18
+ False,seasonal_naive,1.0,1.0,16.511627906976745
19
+ False,tft,1.3981834112189448,0.6093381204940588,6.906976744186046
20
+ False,tide,1.43858389278792,0.8200701505564333,11.395348837209303
21
+ False,timesfm,1.714449538660074,0.7498374834291154,10.069767441860465
22
+ False,visionts,1.0423141526857782,0.7009754803331653,11.534883720930232
23
+ True,auto_arima,0.9452907457761817,0.7624093224131667,12.555555555555555
24
+ True,auto_ets,1.1157996061544226,2.553606237989581e+23,14.87037037037037
25
+ True,auto_theta,1.1810795821409918,1.5907675831680725,15.38888888888889
26
+ True,chronos-small,0.8105398747887519,0.6175638516729869,8.88888888888889
27
+ True,chronos_base,0.8064432004095214,0.6004730500877874,7.425925925925926
28
+ True,chronos_large,0.7992434048928462,0.5946120685660026,7.314814814814815
29
+ True,crossformer,5473436252.639601,28.841701459950688,14.185185185185185
30
+ True,d_linear,1.0152315982745468,0.8288833788523304,15.222222222222221
31
+ True,deepar,0.9487785778969822,0.8024222742045092,10.0
32
+ True,i_transformer,0.8534236166087696,0.6282487930572502,7.314814814814815
33
+ True,moirai_1.1_R_base_no_leak,0.9474297935418754,0.6631059403644487,8.592592592592593
34
+ True,moirai_1.1_R_large_no_leak,0.8095920641262695,0.5658691463988443,5.444444444444445
35
+ True,moirai_1.1_R_small_no_leak,0.8798251667426911,0.6550813700792939,9.0
36
+ True,n_beats,0.7960013669407282,0.803378130725278,13.796296296296296
37
+ True,naive,1.1920786390202338,1.7432469758252465,18.574074074074073
38
+ True,patch_tst,0.8473452262927366,0.5920056645616604,6.277777777777778
39
+ True,seasonal_naive,1.0,1.0,16.74074074074074
40
+ True,tft,0.8921817826859224,0.5845469141375511,6.648148148148148
41
+ True,tide,0.9602232031743085,0.7225690907268552,11.703703703703704
42
+ True,timesfm,0.8724456439616861,0.6300331912444614,7.333333333333333
43
+ True,visionts,0.9337984401734895,0.7493477637512667,13.722222222222221
src/about.py CHANGED
@@ -1,6 +1,7 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
 
4
  @dataclass
5
  class Task:
6
  benchmark: str
@@ -12,61 +13,85 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
-
18
- NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
20
 
21
 
22
-
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
32
  LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
 
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
 
38
- """
 
 
39
 
40
- EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
  ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
 
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
 
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
 
 
60
 
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
 
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
 CITATION_BUTTON_TEXT = r"""
72
  """
 
1
  from dataclasses import dataclass
2
  from enum import Enum
3
 
4
+
5
  @dataclass
6
  class Task:
7
  benchmark: str
 
13
  # ---------------------------------------------------
14
  class Tasks(Enum):
15
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
+ # task0 = Task("boolq", "acc", "BoolQA")
17
+ task1 = Task("trivia", "EM", "TriviaQA")
18
+ task2 = Task("truthfulqa", "EM", "TruthfulQA")
19
+ task3 = Task("popqa", "acc", "PopQA")
20
+ task4 = Task("hpqa", "EM", "HotpotQA")
21
+ task5 = Task("nq", "EM", "Natural Questions")
22
+ task6 = Task("2wiki", "EM", "2WikiMultiHop")
23
+ task7 = Task("musique", "EM", "MuSiQue")
24
+ # task0 = Task("anli_r1", "acc", "ANLI")
25
+ # task1 = Task("logiqa", "acc_norm", "LogiQA")
26
+
27
+
28
+ NUM_FEWSHOT = 0 # Change with your few shot
29
  # ---------------------------------------------------
30
 
31
 
 
32
  # Your leaderboard name
33
+ TITLE = """<h1 align="center" id="space-title">GIFT-Eval Time Series Forecasting Leaderboard</h1>"""
34
 
35
  # What does your leaderboard evaluate?
36
  INTRODUCTION_TEXT = """
37
+ [Placeholder] We introduce the General TIme Series ForecasTing Model Evaluation, GIFT-Eval,
38
+ a pioneering benchmark aimed at promoting evaluation across diverse datasets.
39
+ GIFT-Eval encompasses 28 datasets over 144,000 time series and 177 million data
40
+ points, spanning seven domains, 10 frequencies, multivariate inputs, and prediction lengths ranging from short to long-term forecasts.
41
  """
42
 
43
  # Which evaluations are you running? how can people reproduce what you have?
44
  LLM_BENCHMARKS_TEXT = f"""
45
+ ## How It Works
46
+ To participate in the ContextualBench leaderboard, follow these steps to evaluate your Large Language Model (LLM) using the ContextualBench framework:
47
 
48
+ Clone the Repository: Start by cloning the ContextualBench GitHub repository to your local machine using the following command:
 
49
 
50
+ ```bash
51
+ git clone https://github.com/SalesforceAIResearch/SFR-RAG
52
+ ```
53
 
54
+ Navigate to the Directory: Move into the cloned repository's directory:
55
+
56
+
57
+ ```bash
58
+ cd ContextualBench
59
  ```
 
60
 
61
+ Install Dependencies: Install all necessary dependencies by executing:
 
62
 
63
+ ```bash
64
+ pip install -r requirements.txt
65
+ ```
66
+
67
+ Prepare Your Model and Dataset: Set up your model and dataset according to the guidelines provided in the repository's documentation.
68
+ Run the Evaluation Script: Execute the evaluation script to generate outputs for your model on the specified dataset:
69
+
70
+
71
+ ```bash
72
+ python run.py [dataset_name]
73
+ ```
74
+
75
+ Collect and Format Outputs: Gather the outputs generated for each dataset and format them according to the leaderboard submission guidelines.
76
+ Submit Your Results: Email the formatted outputs to the author's email address for evaluation. Our team will assess the performance and update the leaderboard accordingly.
77
 
78
+ ## Reproducibility
79
+ Ensuring reproducibility is a key aspect of the ContextualBench leaderboard.
80
+ By following the standardized steps outlined above, participants can consistently reproduce evaluation results. This process not only facilitates fair comparisons across different models but also encourages transparency and reliability in model assessments. Participants are encouraged to adhere strictly to the submission guidelines to ensure their results are accurately reflected on the leaderboard.
81
+ """
82
 
83
+ EVALUATION_QUEUE_TEXT = """
 
84
85
  """
86
 
87
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
88
  CITATION_BUTTON_TEXT = r"""
89
+ @article{
90
+ aksu2024gifteval,
91
+ title={{GIFT}-Eval: A Benchmark for General Time Series Forecasting Model Evaluation},
92
+ author={Taha Aksu and Gerald Woo and Juncheng Liu and Xu Liu and Chenghao Liu and Silvio Savarese and Caiming Xiong and Doyen Sahoo},
93
+ booktitle={NeurIPS Workshop on Time Series in the Age of Large Models},
94
+ year={2024},
95
+ url={https://openreview.net/forum?id=Z2cMOOANFX}
96
+ }
97
  """
src/populate.py CHANGED
@@ -6,16 +6,34 @@ import pandas as pd
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
 
18
  df = df[cols].round(decimals=2)
 
19
 
20
  # filter out if any of the benchmarks have not been produced
21
  df = df[has_no_nan_values(df, benchmark_cols)]
@@ -39,7 +57,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
39
  all_evals.append(data)
40
  elif ".md" not in entry:
41
  # this is a folder
42
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
  for sub_entry in sub_entries:
44
  file_path = os.path.join(save_path, entry, sub_entry)
45
  with open(file_path) as fp:
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
+ import ipdb
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
+ # raw_data = get_raw_eval_results(results_path, requests_path)
14
+ # print('results_path:', results_path)
15
+ # all_data_json = [v.to_dict() for v in raw_data]
16
+ # print(f"The raw data is {all_data_json}")
17
+ #
18
+ # df = pd.DataFrame.from_records(all_data_json)
19
+ df = pd.read_csv(results_path)
20
+ # df = pd.read_csv('LOTSAv2_EvalBenchmark(Long).csv')
21
+ # Step 2: Pivot the DataFrame
22
+ df = df.pivot_table(index='model',
23
+ columns='dataset',
24
+ values='eval_metrics/MAE[0.5]',
25
+ aggfunc='first')
26
+ df.drop(columns=['ALL'], inplace=True)
27
+ df['Average'] = df.mean(axis=1)
28
+ # Reset the index if you want the model column to be part of the DataFrame
29
+ df.reset_index(inplace=True)
30
+ print(f"DF at stage 1 ********** {df}")
31
+ # ipdb.set_trace()
32
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
33
+ # df = df.sort_values(by=[AutoEvalColumn.__dataclass_fields__['average'].name], ascending=False)
34
+ print(f"DF at stage 2 ********** {df}")
35
  df = df[cols].round(decimals=2)
36
+ print(f"DF at stage 3 ********** {df}")
37
 
38
  # filter out if any of the benchmarks have not been produced
39
  df = df[has_no_nan_values(df, benchmark_cols)]
 
57
  all_evals.append(data)
58
  elif ".md" not in entry:
59
  # this is a folder
60
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
61
  for sub_entry in sub_entries:
62
  file_path = os.path.join(save_path, entry, sub_entry)
63
  with open(file_path) as fp:
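For clarity on the reshape above, here is a toy illustration (not part of the commit) of what the `pivot_table` call in the new `get_leaderboard_df` does: long-format rows of (model, dataset, MAE) become one row per model with one column per dataset, after which an `Average` column is appended.

```python
import pandas as pd

# Toy long-format results using the same column names get_leaderboard_df expects.
long_df = pd.DataFrame({
    "model": ["naive", "naive", "patch_tst", "patch_tst"],
    "dataset": ["etth1", "etth2", "etth1", "etth2"],
    "eval_metrics/MAE[0.5]": [1.0, 1.2, 0.7, 0.8],
})

wide = long_df.pivot_table(index="model",
                           columns="dataset",
                           values="eval_metrics/MAE[0.5]",
                           aggfunc="first")
# (The real results file also contains an 'ALL' pseudo-dataset column,
#  which the commit drops before averaging.)
wide["Average"] = wide.mean(axis=1)
wide = wide.reset_index()
print(wide)  # one row per model: columns model, etth1, etth2, Average
```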
src/utils.py ADDED
@@ -0,0 +1,27 @@
1
+ import pandas as pd
2
+ def norm_sNavie(df):
3
+ df_normalized = df.copy()
4
+ seasonal_naive_row = df[df['model'] == 'seasonal_naive'].iloc[0]
5
+ print('df: ',df)
6
+ for column in df.columns:
7
+ if column != 'model': # We skip normalizing the 'model' column
8
+ df_normalized[column] = df[column] / seasonal_naive_row[column]
9
+ return df_normalized
10
+
11
+ def pivot_df(file_name, tab_name):
12
+ df = pd.read_csv(file_name)
13
+ if tab_name == 'univariate':
14
+ df['univariate'] = df['univariate'].replace({True: 'univariate', False: 'multivariate'})
15
+ df.rename(columns={'univariate': 'variate_type'}, inplace=True)
16
+ tab_name = 'variate_type'
17
+ df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
18
+ df_melted['metric'] = df_melted['metric'].replace({
19
+ 'eval_metrics/MAPE[0.5]': 'MAPE',
20
+ 'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS'
21
+ })
22
+ df_pivot = df_melted.pivot_table(index='model', columns=[tab_name, 'metric'], values='value')
23
+ df_pivot.columns = [f'{tab_name} ({metric})' for tab_name, metric in df_pivot.columns]
24
+ # df_pivot.to_csv('pivoted_df.csv')
25
+ # print(df_pivot)
26
+ df_pivot = df_pivot.reset_index()
27
+ return df_pivot
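To make the normalization helper concrete, a small worked example (hypothetical, assuming `src/` is importable): `norm_sNavie` divides every non-`model` column by the `seasonal_naive` row, so the seasonal-naive baseline becomes 1.0 and every other model is expressed relative to it.

```python
import pandas as pd

from src.utils import norm_sNavie  # assumes the Space's src/ is on the import path

df = pd.DataFrame({
    "model": ["seasonal_naive", "patch_tst"],
    "etth1": [2.0, 1.0],
    "etth2": [4.0, 3.0],
})

normalized = norm_sNavie(df)
# etth1: 2.0/2.0 = 1.0 and 1.0/2.0 = 0.5
# etth2: 4.0/4.0 = 1.0 and 3.0/4.0 = 0.75
print(normalized)
```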