André Catarino
wip
d4c62b5
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def load_energy_data(file_path:str="../data/raw/energy_dataset.csv", date:str="time") -> pd.DataFrame:
    """
    Read the energy dataset from a CSV file
    :param file_path: location of the CSV file
    :param date: name of the column handed to pandas for date parsing
    :return: dataframe with the contents of the CSV file
    """
    date_columns = [date]
    return pd.read_csv(file_path, parse_dates=date_columns)
def load_weather_data(file_path:str="../data/raw/weather_features.csv", date:str="dt_iso") -> pd.DataFrame:
    """
    Read the weather features dataset from a CSV file
    :param file_path: location of the CSV file
    :param date: name of the column handed to pandas for date parsing
    :return: dataframe with the contents of the CSV file
    """
    date_columns = [date]
    return pd.read_csv(file_path, parse_dates=date_columns)
def load_preprocessed_data(file_path:str="../data/processed/processed_merged_data.pkl") -> pd.DataFrame:
    """
    Read a previously pickled dataframe back from disk
    :param file_path: location of the pickle file
    :return: the unpickled object (expected to be a dataframe)
    """
    # NOTE: unpickling executes code embedded in the file; only load pickles from a trusted source.
    return pd.read_pickle(file_path)
def id_outliers(df:pd.DataFrame)-> pd.DataFrame:
    """
    Summarise IQR-based outliers column by column
    :param df: dataframe whose columns are inspected one at a time
    :return: dataframe indexed by the input columns, holding the lower bound,
        the upper bound and the number of values falling outside those bounds
    """
    summary_rows = []
    for name in df.columns:
        series = df[name]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile
        # Tukey fences: 1.5 times the interquartile range beyond each quartile
        low = first_quartile - 1.5 * spread
        high = third_quartile + 1.5 * spread
        outside = (series < low) | (series > high)
        summary_rows.append([low, high, int(outside.sum())])
    return pd.DataFrame(
        summary_rows,
        columns=['lower_bound', 'upper_bound', 'n_outliers'],
        index=df.columns,
    )
def filtered_heatmap(df: pd.DataFrame, absthreshold: float = 0) -> pd.DataFrame:
    """
    Filter a correlation matrix by absolute value threshold
    :param df: correlation matrix (row labels and column labels are expected to match)
    :param absthreshold: absolute value threshold; a variable is kept when its
        correlation with at least one other variable reaches this value
    :return: filtered correlation matrix restricted to the kept variables,
        with rows and columns in sorted label order
    """
    passed = set()
    # combinations() yields each unordered pair of distinct columns exactly once,
    # so the diagonal (self-correlation) is never inspected.
    for r, c in combinations(df.columns, 2):
        if abs(df.loc[r, c]) >= absthreshold:
            passed.update((r, c))
    passed = sorted(passed)
    return df.loc[passed, passed]
def compare_metrics(eval_df: pd.DataFrame, stat: str, metrics: list) -> None:
    """
    Compare the mean or standard deviation of each metric for each model
    :param eval_df: dataframe with a 'Model' column and one column per metric; every
        metric cell is passed to the chosen NumPy reduction (presumably a list or
        array of scores per model - confirm against the caller)
    :param stat: name of the NumPy reduction to apply, e.g. 'mean' or 'std'
    :param metrics: list of metrics to compare (one subplot per metric)
    :return: None; the figure is saved to '../plots/metrics_{stat}.png' and shown
    :raises ValueError: if metrics is empty or stat does not name a NumPy function
    """
    if not metrics:
        raise ValueError("metrics must contain at least one metric name")

    # Look the reduction up by name instead of eval()-ing a code string built from `stat`.
    reducer = getattr(np, stat, None)
    if not callable(reducer):
        raise ValueError(f"stat must name a NumPy function such as 'mean' or 'std', got {stat!r}")

    # squeeze=False keeps `axes` two-dimensional even for a single metric,
    # so the same indexing works for any number of metrics.
    fig, axes = plt.subplots(1, len(metrics), figsize=(15, 4), squeeze=False)

    for ax, metric in zip(axes[0], metrics):
        heights = [reducer(scores) for scores in eval_df[metric]]
        ax.bar(eval_df['Model'], heights, color='skyblue')
        ax.set_title(metric)
        ax.set_xlabel('Model')
        ax.set_ylabel(metric)
        # Style the tick labels matplotlib already created; overwriting them with
        # set_xticklabels on a non-fixed locator triggers a warning and can mislabel ticks.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.yaxis.grid()

    # Adjust the layout to avoid overlapping titles
    plt.tight_layout()

    # Save the plot
    plt.savefig(f'../plots/metrics_{stat}.png')

    # Show the plot
    plt.show()