import os
import re

import ipdb
import pandas as pd

# Format a single value for display: scientific notation for large magnitudes,
# fixed-point otherwise
def format_number(num):
    # Only format numeric values
    if isinstance(num, (int, float)):
        if abs(num) >= 10**2:
            return f"{num:.1e}"
        return f"{num:.3f}"
    # Return non-numeric values (e.g. model names) as-is
    return num
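# Illustrative behavior (these calls are not part of the original pipeline):
#   format_number(12345.678)  -> '1.2e+04'
#   format_number(0.56789)    -> '0.568'
#   format_number('model_a')  -> 'model_a'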

def norm_sNavie(df):
    """Normalize every metric column by the seasonal_naive row."""
    df_normalized = df.copy()
    seasonal_naive_row = df[df['model'] == 'seasonal_naive'].iloc[0]
    for column in df.columns:
        if column != 'model':  # skip the non-numeric 'model' column
            df_normalized[column] = df[column] / seasonal_naive_row[column]
    return df_normalized
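# Toy example (hypothetical data): with
#   df = pd.DataFrame({'model': ['seasonal_naive', 'model_a'], 'MAPE': [0.2, 0.1]})
# norm_sNavie(df) rescales model_a's MAPE to 0.1 / 0.2 = 0.5.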

def pivot_df(file_name, tab_name):
    """Read a results CSV and pivot it to one row per model, one column per
    (tab_name, metric) pair."""
    df = pd.read_csv(file_name)
    if tab_name == 'univariate':
        # Replace the boolean flag with a readable category name
        df['univariate'] = df['univariate'].replace({True: 'univariate', False: 'multivariate'})
        df.rename(columns={'univariate': 'variate_type'}, inplace=True)
        tab_name = 'variate_type'
    df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
    df_melted['metric'] = df_melted['metric'].replace({
        'eval_metrics/MAPE[0.5]': 'MAPE',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS'
    })
    df_pivot = df_melted.pivot_table(index='model', columns=[tab_name, 'metric'], values='value')
    df_pivot.columns = [f'{tab_name} ({metric})' for tab_name, metric in df_pivot.columns]
    df_pivot = df_pivot.reset_index()
    df_pivot = df_pivot.round(3)
    return df_pivot
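# Usage sketch (assuming a hypothetical 'results.csv' with columns such as
# [domain, model, eval_metrics/MAPE[0.5], ...]):
#   wide = pivot_df('results.csv', 'domain')
# yields one row per model with columns like 'Energy (MAPE)', 'Energy (CRPS)'.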


def rename_metrics(df):
    """Rename raw eval-metric columns to their display names."""
    df = df.rename(columns={
        'eval_metrics/MAPE[0.5]': 'MAPE',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
        'rank': 'Rank'
    })
    return df

def format_df(df):
    """Round metric values via format_number, then cast them back to float."""
    df = df.applymap(format_number)
    # format_number returns strings, so cast every column except 'model' back to float
    df.iloc[:, 1:] = df.iloc[:, 1:].astype(float)
    return df

def unify_freq(df):
    """Map pandas-style frequency codes (e.g. '15T', 'W-SUN') to readable names."""
    # Remove all numeric characters from the 'frequency' column
    df['frequency'] = df['frequency'].str.replace(r'\d+', '', regex=True)
    # Remove everything after '-' if present (e.g. 'W-SUN' -> 'W')
    df['frequency'] = df['frequency'].str.split('-').str[0]

    # Frequency-code conversion table
    freq_conversion = {
        'T': 'Minutely',
        'H': 'Hourly',
        'D': 'Daily',
        'W': 'Weekly',
        'M': 'Monthly',
        'Q': 'Quarterly',
        'Y': 'Yearly',
        'A': 'Yearly',
        'S': 'Secondly'
    }

    # Map the cleaned 'frequency' values using the table
    df['frequency'] = df['frequency'].replace(freq_conversion)
    return df
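
# Example mapping (illustrative): '15T' -> 'Minutely', 'W-SUN' -> 'Weekly',
# 'A-DEC' -> 'Yearly', as in
#   unify_freq(pd.DataFrame({'frequency': ['15T', 'W-SUN', 'A-DEC']}))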

def pivot_existed_df(df, tab_name):
    """Pivot an already-grouped results frame (indexed by [tab_name, 'model'])
    into one row per model, one column per (tab_name, metric) pair."""
    df = df.reset_index()
    if tab_name == 'univariate':
        # Replace the boolean flag with a readable category name
        df['univariate'] = df['univariate'].replace({True: 'univariate', False: 'multivariate'})
        df.rename(columns={'univariate': 'variate_type'}, inplace=True)
        tab_name = 'variate_type'
    df_melted = pd.melt(df, id_vars=[tab_name, 'model'], var_name='metric', value_name='value')
    df_melted['metric'] = df_melted['metric'].replace({
        'eval_metrics/MAPE[0.5]': 'MAPE',
        'eval_metrics/mean_weighted_sum_quantile_loss': 'CRPS',
        'rank': 'Rank',
    })
    df_pivot = df_melted.pivot_table(index='model', columns=[tab_name, 'metric'], values='value')
    df_pivot.columns = [f'{tab_name} ({metric})' for tab_name, metric in df_pivot.columns]
    df_pivot = df_pivot.reset_index()
    df_pivot = format_df(df_pivot)
    return df_pivot


def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_properties.csv'):
    """Collect all 'all_results.csv' files under root_dir, join them with the
    dataset properties, standardize by Seasonal_Naive, and return one grouped
    dataframe per grouping column plus an 'overall' summary."""
    df_list = []

    # Walk through all folders and subfolders in the root directory
    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if file == 'all_results.csv':
                file_path = os.path.join(subdir, file)
                df = pd.read_csv(file_path)
                df_list.append(df)
    # Concatenate all dataframes into one
    all_results_df = pd.concat(df_list, ignore_index=True)
    all_results_df = all_results_df.sort_values(by=['model', 'dataset']).reset_index(drop=True)
    # The 'dataset' column encodes 'dataset/frequency/term_length'
    all_results_df[['dataset', 'frequency', 'term_length']] = all_results_df['dataset'].str.split('/', expand=True)

    dataset_properties = pd.read_csv(ds_properties)
    # Normalize the dataset names so they match the results:
    # 1. lowercase all characters
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.lower())
    # 2. replace all spaces with underscores
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace(' ', '_'))
    # 3. replace all dashes with underscores
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.replace('-', '_'))
    # 4. collapse runs of consecutive underscores into a single underscore
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: re.sub('_+', '_', x))
    # 5. strip leading and trailing underscores
    dataset_properties['dataset'] = dataset_properties['dataset'].apply(lambda x: x.strip('_'))

    df = all_results_df

    # Convert the properties to a nested dict: {dataset: {column: value}}
    dataset_properties_dict = dataset_properties.set_index('dataset').T.to_dict('dict')

    # For each dataset, copy every property value onto the matching result rows
    for dataset in dataset_properties_dict.keys():
        for key in dataset_properties_dict[dataset].keys():
            if key == 'frequency':
                # Only fill the frequency if it is missing for every row of the dataset
                if all(df[df['dataset'] == dataset]['frequency'].isna()):
                    df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
            else:
                df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]

    # Unify the frequency names
    df = unify_freq(df)
    # Standardize all metrics by the Seasonal_Naive baseline
    df = standardize_df(df)
    # Rank models within each (dataset, term_length, frequency) slice by CRPS
    RANKING_METRIC = "eval_metrics/mean_weighted_sum_quantile_loss"
    df['rank'] = df.groupby(['dataset', 'term_length', 'frequency'])[RANKING_METRIC].rank(method='first',
                                                                                          ascending=True)
    # Flag a dataset as univariate when it has exactly one variate
    df['univariate'] = df['num_variates'] == 1

    # Metrics to aggregate when grouping
    METRIC_CHOICES = ["eval_metrics/MAPE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss", "rank"]

    grouped_results_overall = df.groupby(['model'])[METRIC_CHOICES].mean()
    grouped_dfs = {}
    for col_name in ["domain", 'term_length', 'frequency', 'univariate']:
        grouped_dfs[col_name] = group_by(df, col_name)
    grouped_dfs['overall'] = grouped_results_overall
    return grouped_dfs

def standardize_df(df):
    # All raw evaluation-metric columns produced by the benchmark
    metric_columns = ['eval_metrics/MSE[mean]', 'eval_metrics/MSE[0.5]', 'eval_metrics/MAE[0.5]',
                      'eval_metrics/MASE[0.5]', 'eval_metrics/MAPE[0.5]', 'eval_metrics/sMAPE[0.5]',
                      'eval_metrics/MSIS', 'eval_metrics/RMSE[mean]', 'eval_metrics/NRMSE[mean]',
                      'eval_metrics/ND[0.5]', 'eval_metrics/mean_weighted_sum_quantile_loss']
    # Convert the metric columns to float and replace NA values with the column mean
    for metric in metric_columns:
        df[metric] = df[metric].astype(float)
        df[metric] = df[metric].fillna(df[metric].mean())

    # Standardize the metric columns by dividing each model's results by the
    # Seasonal_Naive results for the same dataset, frequency, and term length:
    # 1. Get all the unique dataset names
    # 2. For each dataset, get all the unique frequencies and term lengths
    # 3. For each (dataset, frequency, term_length) slice, get the Seasonal_Naive results
    # 4. Divide every model's results in that slice by the Seasonal_Naive results
    # Correct dataset names that differ between results and properties
    dataset_corrections = {
        "saugeenday": "saugeen",
        "temperature_rain_with_missing": "temperature_rain",
        "kdd_cup_2018_with_missing": "kdd_cup_2018",
        "car_parts_with_missing": "car_parts",
    }
    df['dataset'] = df['dataset'].replace(dataset_corrections)
    dataset_names = df['dataset'].unique()
    for dataset in dataset_names:
        term_lengths = df[df['dataset'] == dataset]['term_length'].unique()
        for term_length in term_lengths:
            frequencies = df[(df['dataset'] == dataset) & (df['term_length'] == term_length)]['frequency'].unique()
            for frequency in frequencies:
                # Seasonal_Naive results for this (dataset, frequency, term_length) slice
                seasonal_naive_results = df[
                    (df['dataset'] == dataset) & (df['frequency'] == frequency) & (df['term_length'] == term_length) & (
                                df['model'] == 'Seasonal_Naive')]
                for metric in metric_columns:
                    try:
                        # Divide every model's value in the slice by the baseline value
                        df.loc[(df['dataset'] == dataset) & (df['frequency'] == frequency) & (
                                    df['term_length'] == term_length), metric] = df[(df['dataset'] == dataset) & (
                                    df['frequency'] == frequency) & (df['term_length'] == term_length)][metric] / \
                                                                                 seasonal_naive_results[metric].values[0]
                    except Exception:
                        # Missing baseline row (or other failure): report and drop into the debugger
                        print(f"Error: {dataset} {term_length} {frequency} {metric}")
                        ipdb.set_trace()
    return df
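# Worked example (hypothetical numbers): if model_a scores MAE 5.0 and
# Seasonal_Naive scores MAE 10.0 on the same (dataset, frequency, term_length)
# slice, model_a's standardized MAE becomes 5.0 / 10.0 = 0.5; values below 1.0
# beat the baseline.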

def group_by(df, col_name):
    """Average the chosen metrics per (col_name, model) pair."""
    METRIC_CHOICES = ["eval_metrics/MAPE[0.5]", "eval_metrics/mean_weighted_sum_quantile_loss", "rank"]
    grouped_results = df.groupby([col_name, 'model'])[METRIC_CHOICES].mean()
    return grouped_results
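

if __name__ == '__main__':
    # Minimal usage sketch, assuming the default layout: a 'results/' tree with
    # per-model 'all_results.csv' files and 'results/dataset_properties.csv'
    # (these are the function defaults, not guaranteed to exist here).
    grouped_dfs = get_grouped_dfs()
    for name, grouped in grouped_dfs.items():
        if name == 'overall':
            print(grouped.head())
        else:
            print(pivot_existed_df(grouped, name).head())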