File size: 2,769 Bytes
d4c62b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
from itertools import combinations
import matplotlib.pyplot as plt

def load_energy_data(file_path:str="../data/raw/energy_dataset.csv", date:str="time") -> pd.DataFrame:
    """
    Read the energy CSV file into a dataframe
    :param file_path: path of the CSV file to read
    :param date: name of the column handed to pandas for date parsing
    :return: dataframe with the file contents
    """
    return pd.read_csv(file_path, parse_dates=[date])

def load_weather_data(file_path:str="../data/raw/weather_features.csv", date:str="dt_iso") -> pd.DataFrame:
    """
    Read the weather CSV file into a dataframe
    :param file_path: path of the CSV file to read
    :param date: name of the column handed to pandas for date parsing
    :return: dataframe with the file contents
    """
    return pd.read_csv(file_path, parse_dates=[date])

def load_preprocessed_data(file_path:str="../data/processed/processed_merged_data.pkl") -> pd.DataFrame:
    """
    Read a pickled object (expected to be a dataframe) from disk
    :param file_path: path of the pickle file to read
    :return: the unpickled object
    """
    # NOTE(review): unpickling can execute arbitrary code - only point this
    # at pickle files produced by this project.
    return pd.read_pickle(file_path)

def id_outliers(df:pd.DataFrame)-> pd.DataFrame:
    """
    Count the outliers of every column using the 1.5 * IQR rule
    :param df: dataframe whose columns are inspected one by one
    :return: dataframe indexed by the column names of df, holding the lower
        bound, the upper bound and the number of values outside those bounds
    """
    rows = []
    for name in df.columns:
        series = df[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile
        low = first_quartile - 1.5 * spread
        high = third_quartile + 1.5 * spread
        # Values strictly outside [low, high] count as outliers.
        outside = (series < low) | (series > high)
        rows.append([low, high, int(outside.sum())])
    return pd.DataFrame(
        rows,
        columns=['lower_bound', 'upper_bound', 'n_outliers'],
        index=df.columns,
    )

def filtered_heatmap(df:pd.DataFrame, absthreshold:int=0) -> pd.DataFrame:
    """
    Keep only the labels of a correlation matrix that take part in at least
    one pair whose absolute value reaches the threshold
    :param df: correlation matrix (row labels must match the column labels)
    :param absthreshold: absolute value threshold
    :return: sub-matrix restricted to the kept labels, in sorted label order
    """
    keep = set()
    for row_label, col_label in combinations(df.columns, 2):
        strong_enough = abs(df.loc[row_label, col_label]) >= absthreshold
        if strong_enough and row_label != col_label:
            keep.update((row_label, col_label))
    ordered = sorted(keep)
    return df.loc[ordered, ordered]

def compare_metrics(eval_df:pd.DataFrame, stat:str, metrics:list) -> None:
    """
    Compare the mean or standard deviation of each metric for each model

    Draws one bar chart per metric side by side, saves the figure to
    '../plots/metrics_{stat}.png' and then shows it.

    :param eval_df: dataframe with a 'Model' column and one column per metric;
        every metric cell is reduced with the numpy function named by stat
        (presumably a sequence of per-run scores - verify against caller)
    :param stat: name of the numpy reduction to apply, 'mean' or 'std'
    :param metrics: list of metric column names to compare
    :return: None
    :raises ValueError: if metrics is empty or stat does not name a callable
        numpy function
    """
    # Imported locally because the module header does not import numpy.
    import numpy as np

    if not metrics:
        raise ValueError("metrics must contain at least one column name")

    # Resolve the reduction by name instead of eval()-ing a code string, so
    # an arbitrary expression can never be executed through `stat`.
    reducer = getattr(np, stat, None) if isinstance(stat, str) else None
    if not callable(reducer):
        raise ValueError(
            f"stat must name a numpy function such as 'mean' or 'std', got {stat!r}"
        )

    # squeeze=False always returns a 2-D array of axes, so a single metric
    # is handled the same way as several.
    fig, axes = plt.subplots(1, len(metrics), figsize=(15, 4), squeeze=False)

    for ax, metric in zip(axes[0], metrics):
        heights = eval_df[metric].apply(lambda values: reducer(values))
        ax.bar(eval_df['Model'], heights, color='skyblue')
        ax.set_title(metric)
        ax.set_xlabel('Model')
        ax.set_ylabel(metric)
        # Rotate the tick labels created by the bar plot rather than
        # overwriting them, which keeps labels and tick positions in sync.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.yaxis.grid(True)

    # Adjust the layout to avoid overlapping titles
    fig.tight_layout()

    # Save the plot
    fig.savefig(f'../plots/metrics_{stat}.png')

    # Show the plot
    plt.show()